From: Tim Northover
Date: Sat, 24 May 2014 12:50:23 +0000 (+0000)
Subject: AArch64/ARM64: move ARM64 into AArch64's place
X-Git-Url: http://plrg.eecs.uci.edu/git/?p=oota-llvm.git;a=commitdiff_plain;h=29f94c72014eaa5d0d3b920686e689e79759cacb

AArch64/ARM64: move ARM64 into AArch64's place

This commit starts with a "git mv ARM64 AArch64" and continues out from there,
renaming the C++ classes, intrinsics, and other target-local objects for
consistency.

"ARM64" test directories are also moved, and tests that began their life in
ARM64 use an arm64 triple, while those from AArch64 use an aarch64 triple.
Both should be equivalent though.

This finishes the AArch64 merge, and everyone should feel free to continue
committing as normal now.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@209577 91177308-0d34-0410-b5e6-96231b3b80d8
---

diff --git a/CMakeLists.txt b/CMakeLists.txt
index b19ab0271ab..0d6eead42f6 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -127,7 +127,7 @@ set(LLVM_INCLUDE_DIR ${CMAKE_CURRENT_BINARY_DIR}/include)
 set(LLVM_LIBDIR_SUFFIX "" CACHE STRING "Define suffix of library directory name (32/64)" )
 
 set(LLVM_ALL_TARGETS
-  ARM64
+  AArch64
   ARM
   CppBackend
   Hexagon
@@ -143,7 +143,7 @@ set(LLVM_ALL_TARGETS
   )
 
 # List of targets with JIT support:
-set(LLVM_TARGETS_WITH_JIT X86 PowerPC ARM64 ARM Mips SystemZ)
+set(LLVM_TARGETS_WITH_JIT X86 PowerPC AArch64 ARM Mips SystemZ)
 
 set(LLVM_TARGETS_TO_BUILD "all"
     CACHE STRING "Semicolon-separated list of targets to build, or \"all\".")
diff --git a/autoconf/configure.ac b/autoconf/configure.ac
index 344e66af65d..08f756c9214 100644
--- a/autoconf/configure.ac
+++ b/autoconf/configure.ac
@@ -419,9 +419,9 @@ AC_CACHE_CHECK([target architecture],[llvm_cv_target_arch],
   amd64-* | x86_64-*)     llvm_cv_target_arch="x86_64" ;;
   sparc*-*)               llvm_cv_target_arch="Sparc" ;;
   powerpc*-*)             llvm_cv_target_arch="PowerPC" ;;
-  arm64*-*)               llvm_cv_target_arch="ARM64" ;;
+  arm64*-*)               llvm_cv_target_arch="AArch64" ;;
   arm*-*)                 llvm_cv_target_arch="ARM" ;;
-  aarch64*-*)             llvm_cv_target_arch="ARM64" ;;
+  aarch64*-*)             llvm_cv_target_arch="AArch64" ;;
   mips-* | mips64-*)      llvm_cv_target_arch="Mips" ;;
   mipsel-* | mips64el-*)  llvm_cv_target_arch="Mips" ;;
   xcore-*)                llvm_cv_target_arch="XCore" ;;
@@ -455,9 +455,9 @@ case $host in
   amd64-* | x86_64-*)     host_arch="x86_64" ;;
   sparc*-*)               host_arch="Sparc" ;;
   powerpc*-*)             host_arch="PowerPC" ;;
-  arm64*-*)               host_arch="ARM64" ;;
+  arm64*-*)               host_arch="AArch64" ;;
   arm*-*)                 host_arch="ARM" ;;
-  aarch64*-*)             host_arch="ARM64" ;;
+  aarch64*-*)             host_arch="AArch64" ;;
   mips-* | mips64-*)      host_arch="Mips" ;;
   mipsel-* | mips64el-*)  host_arch="Mips" ;;
   xcore-*)                host_arch="XCore" ;;
@@ -796,7 +796,7 @@ else
 esac
 fi
 
-TARGETS_WITH_JIT="ARM ARM64 Mips PowerPC SystemZ X86"
+TARGETS_WITH_JIT="ARM AArch64 Mips PowerPC SystemZ X86"
 AC_SUBST(TARGETS_WITH_JIT,$TARGETS_WITH_JIT)
 
 dnl Allow enablement of building and installing docs
@@ -949,7 +949,7 @@ if test "$llvm_cv_enable_crash_overrides" = "yes" ; then
 fi
 
 dnl List all possible targets
-ALL_TARGETS="X86 Sparc PowerPC ARM ARM64 Mips XCore MSP430 CppBackend NVPTX Hexagon SystemZ R600"
+ALL_TARGETS="X86 Sparc PowerPC ARM AArch64 Mips XCore MSP430 CppBackend NVPTX Hexagon SystemZ R600"
 AC_SUBST(ALL_TARGETS,$ALL_TARGETS)
 
 dnl Allow specific targets to be specified for building (or not)
@@ -970,8 +970,8 @@ case "$enableval" in
     x86_64)   TARGETS_TO_BUILD="X86 $TARGETS_TO_BUILD" ;;
    sparc)    TARGETS_TO_BUILD="Sparc $TARGETS_TO_BUILD" ;;
    powerpc)  TARGETS_TO_BUILD="PowerPC $TARGETS_TO_BUILD" ;;
-    aarch64)  TARGETS_TO_BUILD="ARM64
$TARGETS_TO_BUILD" ;; - arm64) TARGETS_TO_BUILD="ARM64 $TARGETS_TO_BUILD" ;; + aarch64) TARGETS_TO_BUILD="AArch64 $TARGETS_TO_BUILD" ;; + arm64) TARGETS_TO_BUILD="AArch64 $TARGETS_TO_BUILD" ;; arm) TARGETS_TO_BUILD="ARM $TARGETS_TO_BUILD" ;; mips) TARGETS_TO_BUILD="Mips $TARGETS_TO_BUILD" ;; mipsel) TARGETS_TO_BUILD="Mips $TARGETS_TO_BUILD" ;; @@ -989,7 +989,7 @@ case "$enableval" in x86_64) TARGETS_TO_BUILD="X86 $TARGETS_TO_BUILD" ;; Sparc) TARGETS_TO_BUILD="Sparc $TARGETS_TO_BUILD" ;; PowerPC) TARGETS_TO_BUILD="PowerPC $TARGETS_TO_BUILD" ;; - AArch64) TARGETS_TO_BUILD="ARM64 $TARGETS_TO_BUILD" ;; + AArch64) TARGETS_TO_BUILD="AArch64 $TARGETS_TO_BUILD" ;; ARM) TARGETS_TO_BUILD="ARM $TARGETS_TO_BUILD" ;; Mips) TARGETS_TO_BUILD="Mips $TARGETS_TO_BUILD" ;; XCore) TARGETS_TO_BUILD="XCore $TARGETS_TO_BUILD" ;; diff --git a/cmake/config-ix.cmake b/cmake/config-ix.cmake index ca4af73d92c..1325e790c80 100755 --- a/cmake/config-ix.cmake +++ b/cmake/config-ix.cmake @@ -372,7 +372,7 @@ elseif (LLVM_NATIVE_ARCH MATCHES "powerpc") elseif (LLVM_NATIVE_ARCH MATCHES "aarch64") set(LLVM_NATIVE_ARCH AArch64) elseif (LLVM_NATIVE_ARCH MATCHES "arm64") - set(LLVM_NATIVE_ARCH ARM64) + set(LLVM_NATIVE_ARCH AArch64) elseif (LLVM_NATIVE_ARCH MATCHES "arm") set(LLVM_NATIVE_ARCH ARM) elseif (LLVM_NATIVE_ARCH MATCHES "mips") diff --git a/configure b/configure index a5babe9c230..e1959dfee6c 100755 --- a/configure +++ b/configure @@ -4151,9 +4151,9 @@ else amd64-* | x86_64-*) llvm_cv_target_arch="x86_64" ;; sparc*-*) llvm_cv_target_arch="Sparc" ;; powerpc*-*) llvm_cv_target_arch="PowerPC" ;; - arm64*-*) llvm_cv_target_arch="ARM64" ;; + arm64*-*) llvm_cv_target_arch="AArch64" ;; arm*-*) llvm_cv_target_arch="ARM" ;; - aarch64*-*) llvm_cv_target_arch="ARM64" ;; + aarch64*-*) llvm_cv_target_arch="AArch64" ;; mips-* | mips64-*) llvm_cv_target_arch="Mips" ;; mipsel-* | mips64el-*) llvm_cv_target_arch="Mips" ;; xcore-*) llvm_cv_target_arch="XCore" ;; @@ -4188,9 +4188,9 @@ case $host in amd64-* | x86_64-*) host_arch="x86_64" ;; sparc*-*) host_arch="Sparc" ;; powerpc*-*) host_arch="PowerPC" ;; - arm64*-*) host_arch="ARM64" ;; + arm64*-*) host_arch="AArch64" ;; arm*-*) host_arch="ARM" ;; - aarch64*-*) host_arch="ARM64" ;; + aarch64*-*) host_arch="AArch64" ;; mips-* | mips64-*) host_arch="Mips" ;; mipsel-* | mips64el-*) host_arch="Mips" ;; xcore-*) host_arch="XCore" ;; @@ -5120,7 +5120,7 @@ else esac fi -TARGETS_WITH_JIT="ARM ARM64 Mips PowerPC SystemZ X86" +TARGETS_WITH_JIT="ARM AArch64 Mips PowerPC SystemZ X86" TARGETS_WITH_JIT=$TARGETS_WITH_JIT @@ -5357,7 +5357,7 @@ _ACEOF fi -ALL_TARGETS="X86 Sparc PowerPC ARM ARM64 Mips XCore MSP430 CppBackend NVPTX Hexagon SystemZ R600" +ALL_TARGETS="X86 Sparc PowerPC ARM AArch64 Mips XCore MSP430 CppBackend NVPTX Hexagon SystemZ R600" ALL_TARGETS=$ALL_TARGETS @@ -5380,8 +5380,8 @@ case "$enableval" in x86_64) TARGETS_TO_BUILD="X86 $TARGETS_TO_BUILD" ;; sparc) TARGETS_TO_BUILD="Sparc $TARGETS_TO_BUILD" ;; powerpc) TARGETS_TO_BUILD="PowerPC $TARGETS_TO_BUILD" ;; - aarch64) TARGETS_TO_BUILD="ARM64 $TARGETS_TO_BUILD" ;; - arm64) TARGETS_TO_BUILD="ARM64 $TARGETS_TO_BUILD" ;; + aarch64) TARGETS_TO_BUILD="AArch64 $TARGETS_TO_BUILD" ;; + arm64) TARGETS_TO_BUILD="AArch64 $TARGETS_TO_BUILD" ;; arm) TARGETS_TO_BUILD="ARM $TARGETS_TO_BUILD" ;; mips) TARGETS_TO_BUILD="Mips $TARGETS_TO_BUILD" ;; mipsel) TARGETS_TO_BUILD="Mips $TARGETS_TO_BUILD" ;; @@ -5399,7 +5399,7 @@ case "$enableval" in x86_64) TARGETS_TO_BUILD="X86 $TARGETS_TO_BUILD" ;; Sparc) TARGETS_TO_BUILD="Sparc $TARGETS_TO_BUILD" ;; PowerPC) 
TARGETS_TO_BUILD="PowerPC $TARGETS_TO_BUILD" ;; - AArch64) TARGETS_TO_BUILD="ARM64 $TARGETS_TO_BUILD" ;; + AArch64) TARGETS_TO_BUILD="AArch64 $TARGETS_TO_BUILD" ;; ARM) TARGETS_TO_BUILD="ARM $TARGETS_TO_BUILD" ;; Mips) TARGETS_TO_BUILD="Mips $TARGETS_TO_BUILD" ;; XCore) TARGETS_TO_BUILD="XCore $TARGETS_TO_BUILD" ;; diff --git a/docs/LangRef.rst b/docs/LangRef.rst index fa8d3c0b75f..9b72eca7de5 100644 --- a/docs/LangRef.rst +++ b/docs/LangRef.rst @@ -6877,7 +6877,7 @@ register in surrounding code, including inline assembly. Because of that, allocatable registers are not supported. Warning: So far it only works with the stack pointer on selected -architectures (ARM, ARM64, AArch64, PowerPC and x86_64). Significant amount of +architectures (ARM, AArch64, PowerPC and x86_64). Significant amount of work is needed to support other registers and even more so, allocatable registers. diff --git a/include/llvm/IR/Intrinsics.td b/include/llvm/IR/Intrinsics.td index b133b4e4096..edd1621ef25 100644 --- a/include/llvm/IR/Intrinsics.td +++ b/include/llvm/IR/Intrinsics.td @@ -533,7 +533,7 @@ def int_clear_cache : Intrinsic<[], [llvm_ptr_ty, llvm_ptr_ty], include "llvm/IR/IntrinsicsPowerPC.td" include "llvm/IR/IntrinsicsX86.td" include "llvm/IR/IntrinsicsARM.td" -include "llvm/IR/IntrinsicsARM64.td" +include "llvm/IR/IntrinsicsAArch64.td" include "llvm/IR/IntrinsicsXCore.td" include "llvm/IR/IntrinsicsHexagon.td" include "llvm/IR/IntrinsicsNVVM.td" diff --git a/include/llvm/IR/IntrinsicsAArch64.td b/include/llvm/IR/IntrinsicsAArch64.td new file mode 100644 index 00000000000..23757aaef5c --- /dev/null +++ b/include/llvm/IR/IntrinsicsAArch64.td @@ -0,0 +1,636 @@ +//===- IntrinsicsAARCH64.td - Defines AARCH64 intrinsics ---*- tablegen -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file defines all of the AARCH64-specific intrinsics. +// +//===----------------------------------------------------------------------===// + +let TargetPrefix = "aarch64" in { + +def int_aarch64_ldxr : Intrinsic<[llvm_i64_ty], [llvm_anyptr_ty]>; +def int_aarch64_ldaxr : Intrinsic<[llvm_i64_ty], [llvm_anyptr_ty]>; +def int_aarch64_stxr : Intrinsic<[llvm_i32_ty], [llvm_i64_ty, llvm_anyptr_ty]>; +def int_aarch64_stlxr : Intrinsic<[llvm_i32_ty], [llvm_i64_ty, llvm_anyptr_ty]>; + +def int_aarch64_ldxp : Intrinsic<[llvm_i64_ty, llvm_i64_ty], [llvm_ptr_ty]>; +def int_aarch64_ldaxp : Intrinsic<[llvm_i64_ty, llvm_i64_ty], [llvm_ptr_ty]>; +def int_aarch64_stxp : Intrinsic<[llvm_i32_ty], + [llvm_i64_ty, llvm_i64_ty, llvm_ptr_ty]>; +def int_aarch64_stlxp : Intrinsic<[llvm_i32_ty], + [llvm_i64_ty, llvm_i64_ty, llvm_ptr_ty]>; + +def int_aarch64_clrex : Intrinsic<[]>; + +def int_aarch64_sdiv : Intrinsic<[llvm_anyint_ty], [LLVMMatchType<0>, + LLVMMatchType<0>], [IntrNoMem]>; +def int_aarch64_udiv : Intrinsic<[llvm_anyint_ty], [LLVMMatchType<0>, + LLVMMatchType<0>], [IntrNoMem]>; +} + +//===----------------------------------------------------------------------===// +// Advanced SIMD (NEON) + +let TargetPrefix = "aarch64" in { // All intrinsics start with "llvm.aarch64.". 
+ class AdvSIMD_2Scalar_Float_Intrinsic + : Intrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>, LLVMMatchType<0>], + [IntrNoMem]>; + + class AdvSIMD_FPToIntRounding_Intrinsic + : Intrinsic<[llvm_anyint_ty], [llvm_anyfloat_ty], [IntrNoMem]>; + + class AdvSIMD_1IntArg_Intrinsic + : Intrinsic<[llvm_anyint_ty], [LLVMMatchType<0>], [IntrNoMem]>; + class AdvSIMD_1FloatArg_Intrinsic + : Intrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>], [IntrNoMem]>; + class AdvSIMD_1VectorArg_Intrinsic + : Intrinsic<[llvm_anyvector_ty], [LLVMMatchType<0>], [IntrNoMem]>; + class AdvSIMD_1VectorArg_Expand_Intrinsic + : Intrinsic<[llvm_anyvector_ty], [llvm_anyvector_ty], [IntrNoMem]>; + class AdvSIMD_1VectorArg_Long_Intrinsic + : Intrinsic<[llvm_anyvector_ty], [LLVMTruncatedType<0>], [IntrNoMem]>; + class AdvSIMD_1IntArg_Narrow_Intrinsic + : Intrinsic<[llvm_anyint_ty], [llvm_anyint_ty], [IntrNoMem]>; + class AdvSIMD_1VectorArg_Narrow_Intrinsic + : Intrinsic<[llvm_anyint_ty], [LLVMExtendedType<0>], [IntrNoMem]>; + class AdvSIMD_1VectorArg_Int_Across_Intrinsic + : Intrinsic<[llvm_anyint_ty], [llvm_anyvector_ty], [IntrNoMem]>; + class AdvSIMD_1VectorArg_Float_Across_Intrinsic + : Intrinsic<[llvm_anyfloat_ty], [llvm_anyvector_ty], [IntrNoMem]>; + + class AdvSIMD_2IntArg_Intrinsic + : Intrinsic<[llvm_anyint_ty], [LLVMMatchType<0>, LLVMMatchType<0>], + [IntrNoMem]>; + class AdvSIMD_2FloatArg_Intrinsic + : Intrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>, LLVMMatchType<0>], + [IntrNoMem]>; + class AdvSIMD_2VectorArg_Intrinsic + : Intrinsic<[llvm_anyvector_ty], [LLVMMatchType<0>, LLVMMatchType<0>], + [IntrNoMem]>; + class AdvSIMD_2VectorArg_Compare_Intrinsic + : Intrinsic<[llvm_anyvector_ty], [llvm_anyvector_ty, LLVMMatchType<1>], + [IntrNoMem]>; + class AdvSIMD_2Arg_FloatCompare_Intrinsic + : Intrinsic<[llvm_anyint_ty], [llvm_anyfloat_ty, LLVMMatchType<1>], + [IntrNoMem]>; + class AdvSIMD_2VectorArg_Long_Intrinsic + : Intrinsic<[llvm_anyvector_ty], + [LLVMTruncatedType<0>, LLVMTruncatedType<0>], + [IntrNoMem]>; + class AdvSIMD_2VectorArg_Wide_Intrinsic + : Intrinsic<[llvm_anyvector_ty], + [LLVMMatchType<0>, LLVMTruncatedType<0>], + [IntrNoMem]>; + class AdvSIMD_2VectorArg_Narrow_Intrinsic + : Intrinsic<[llvm_anyvector_ty], + [LLVMExtendedType<0>, LLVMExtendedType<0>], + [IntrNoMem]>; + class AdvSIMD_2Arg_Scalar_Narrow_Intrinsic + : Intrinsic<[llvm_anyint_ty], + [LLVMExtendedType<0>, llvm_i32_ty], + [IntrNoMem]>; + class AdvSIMD_2VectorArg_Scalar_Expand_BySize_Intrinsic + : Intrinsic<[llvm_anyvector_ty], + [llvm_anyvector_ty], + [IntrNoMem]>; + class AdvSIMD_2VectorArg_Scalar_Wide_BySize_Intrinsic + : Intrinsic<[llvm_anyvector_ty], + [LLVMTruncatedType<0>], + [IntrNoMem]>; + class AdvSIMD_2VectorArg_Scalar_Wide_Intrinsic + : Intrinsic<[llvm_anyvector_ty], + [LLVMTruncatedType<0>, llvm_i32_ty], + [IntrNoMem]>; + class AdvSIMD_2VectorArg_Tied_Narrow_Intrinsic + : Intrinsic<[llvm_anyvector_ty], + [LLVMHalfElementsVectorType<0>, llvm_anyvector_ty], + [IntrNoMem]>; + + class AdvSIMD_3VectorArg_Intrinsic + : Intrinsic<[llvm_anyvector_ty], + [LLVMMatchType<0>, LLVMMatchType<0>, LLVMMatchType<0>], + [IntrNoMem]>; + class AdvSIMD_3VectorArg_Scalar_Intrinsic + : Intrinsic<[llvm_anyvector_ty], + [LLVMMatchType<0>, LLVMMatchType<0>, llvm_i32_ty], + [IntrNoMem]>; + class AdvSIMD_3VectorArg_Tied_Narrow_Intrinsic + : Intrinsic<[llvm_anyvector_ty], + [LLVMHalfElementsVectorType<0>, llvm_anyvector_ty, + LLVMMatchType<1>], [IntrNoMem]>; + class AdvSIMD_3VectorArg_Scalar_Tied_Narrow_Intrinsic + : Intrinsic<[llvm_anyvector_ty], + 
[LLVMHalfElementsVectorType<0>, llvm_anyvector_ty, llvm_i32_ty], + [IntrNoMem]>; + class AdvSIMD_CvtFxToFP_Intrinsic + : Intrinsic<[llvm_anyfloat_ty], [llvm_anyint_ty, llvm_i32_ty], + [IntrNoMem]>; + class AdvSIMD_CvtFPToFx_Intrinsic + : Intrinsic<[llvm_anyint_ty], [llvm_anyfloat_ty, llvm_i32_ty], + [IntrNoMem]>; +} + +// Arithmetic ops + +let Properties = [IntrNoMem] in { + // Vector Add Across Lanes + def int_aarch64_neon_saddv : AdvSIMD_1VectorArg_Int_Across_Intrinsic; + def int_aarch64_neon_uaddv : AdvSIMD_1VectorArg_Int_Across_Intrinsic; + def int_aarch64_neon_faddv : AdvSIMD_1VectorArg_Float_Across_Intrinsic; + + // Vector Long Add Across Lanes + def int_aarch64_neon_saddlv : AdvSIMD_1VectorArg_Int_Across_Intrinsic; + def int_aarch64_neon_uaddlv : AdvSIMD_1VectorArg_Int_Across_Intrinsic; + + // Vector Halving Add + def int_aarch64_neon_shadd : AdvSIMD_2VectorArg_Intrinsic; + def int_aarch64_neon_uhadd : AdvSIMD_2VectorArg_Intrinsic; + + // Vector Rounding Halving Add + def int_aarch64_neon_srhadd : AdvSIMD_2VectorArg_Intrinsic; + def int_aarch64_neon_urhadd : AdvSIMD_2VectorArg_Intrinsic; + + // Vector Saturating Add + def int_aarch64_neon_sqadd : AdvSIMD_2IntArg_Intrinsic; + def int_aarch64_neon_suqadd : AdvSIMD_2IntArg_Intrinsic; + def int_aarch64_neon_usqadd : AdvSIMD_2IntArg_Intrinsic; + def int_aarch64_neon_uqadd : AdvSIMD_2IntArg_Intrinsic; + + // Vector Add High-Half + // FIXME: this is a legacy intrinsic for aarch64_simd.h. Remove it when that + // header is no longer supported. + def int_aarch64_neon_addhn : AdvSIMD_2VectorArg_Narrow_Intrinsic; + + // Vector Rounding Add High-Half + def int_aarch64_neon_raddhn : AdvSIMD_2VectorArg_Narrow_Intrinsic; + + // Vector Saturating Doubling Multiply High + def int_aarch64_neon_sqdmulh : AdvSIMD_2IntArg_Intrinsic; + + // Vector Saturating Rounding Doubling Multiply High + def int_aarch64_neon_sqrdmulh : AdvSIMD_2IntArg_Intrinsic; + + // Vector Polynominal Multiply + def int_aarch64_neon_pmul : AdvSIMD_2VectorArg_Intrinsic; + + // Vector Long Multiply + def int_aarch64_neon_smull : AdvSIMD_2VectorArg_Long_Intrinsic; + def int_aarch64_neon_umull : AdvSIMD_2VectorArg_Long_Intrinsic; + def int_aarch64_neon_pmull : AdvSIMD_2VectorArg_Long_Intrinsic; + + // 64-bit polynomial multiply really returns an i128, which is not legal. Fake + // it with a v16i8. + def int_aarch64_neon_pmull64 : + Intrinsic<[llvm_v16i8_ty], [llvm_i64_ty, llvm_i64_ty], [IntrNoMem]>; + + // Vector Extending Multiply + def int_aarch64_neon_fmulx : AdvSIMD_2FloatArg_Intrinsic { + let Properties = [IntrNoMem, Commutative]; + } + + // Vector Saturating Doubling Long Multiply + def int_aarch64_neon_sqdmull : AdvSIMD_2VectorArg_Long_Intrinsic; + def int_aarch64_neon_sqdmulls_scalar + : Intrinsic<[llvm_i64_ty], [llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>; + + // Vector Halving Subtract + def int_aarch64_neon_shsub : AdvSIMD_2VectorArg_Intrinsic; + def int_aarch64_neon_uhsub : AdvSIMD_2VectorArg_Intrinsic; + + // Vector Saturating Subtract + def int_aarch64_neon_sqsub : AdvSIMD_2IntArg_Intrinsic; + def int_aarch64_neon_uqsub : AdvSIMD_2IntArg_Intrinsic; + + // Vector Subtract High-Half + // FIXME: this is a legacy intrinsic for aarch64_simd.h. Remove it when that + // header is no longer supported. 
+ def int_aarch64_neon_subhn : AdvSIMD_2VectorArg_Narrow_Intrinsic; + + // Vector Rounding Subtract High-Half + def int_aarch64_neon_rsubhn : AdvSIMD_2VectorArg_Narrow_Intrinsic; + + // Vector Compare Absolute Greater-than-or-equal + def int_aarch64_neon_facge : AdvSIMD_2Arg_FloatCompare_Intrinsic; + + // Vector Compare Absolute Greater-than + def int_aarch64_neon_facgt : AdvSIMD_2Arg_FloatCompare_Intrinsic; + + // Vector Absolute Difference + def int_aarch64_neon_sabd : AdvSIMD_2VectorArg_Intrinsic; + def int_aarch64_neon_uabd : AdvSIMD_2VectorArg_Intrinsic; + def int_aarch64_neon_fabd : AdvSIMD_2VectorArg_Intrinsic; + + // Scalar Absolute Difference + def int_aarch64_sisd_fabd : AdvSIMD_2Scalar_Float_Intrinsic; + + // Vector Max + def int_aarch64_neon_smax : AdvSIMD_2VectorArg_Intrinsic; + def int_aarch64_neon_umax : AdvSIMD_2VectorArg_Intrinsic; + def int_aarch64_neon_fmax : AdvSIMD_2VectorArg_Intrinsic; + def int_aarch64_neon_fmaxnmp : AdvSIMD_2VectorArg_Intrinsic; + + // Vector Max Across Lanes + def int_aarch64_neon_smaxv : AdvSIMD_1VectorArg_Int_Across_Intrinsic; + def int_aarch64_neon_umaxv : AdvSIMD_1VectorArg_Int_Across_Intrinsic; + def int_aarch64_neon_fmaxv : AdvSIMD_1VectorArg_Float_Across_Intrinsic; + def int_aarch64_neon_fmaxnmv : AdvSIMD_1VectorArg_Float_Across_Intrinsic; + + // Vector Min + def int_aarch64_neon_smin : AdvSIMD_2VectorArg_Intrinsic; + def int_aarch64_neon_umin : AdvSIMD_2VectorArg_Intrinsic; + def int_aarch64_neon_fmin : AdvSIMD_2VectorArg_Intrinsic; + def int_aarch64_neon_fminnmp : AdvSIMD_2VectorArg_Intrinsic; + + // Vector Min/Max Number + def int_aarch64_neon_fminnm : AdvSIMD_2FloatArg_Intrinsic; + def int_aarch64_neon_fmaxnm : AdvSIMD_2FloatArg_Intrinsic; + + // Vector Min Across Lanes + def int_aarch64_neon_sminv : AdvSIMD_1VectorArg_Int_Across_Intrinsic; + def int_aarch64_neon_uminv : AdvSIMD_1VectorArg_Int_Across_Intrinsic; + def int_aarch64_neon_fminv : AdvSIMD_1VectorArg_Float_Across_Intrinsic; + def int_aarch64_neon_fminnmv : AdvSIMD_1VectorArg_Float_Across_Intrinsic; + + // Pairwise Add + def int_aarch64_neon_addp : AdvSIMD_2VectorArg_Intrinsic; + + // Long Pairwise Add + // FIXME: In theory, we shouldn't need intrinsics for saddlp or + // uaddlp, but tblgen's type inference currently can't handle the + // pattern fragments this ends up generating. 
+ def int_aarch64_neon_saddlp : AdvSIMD_1VectorArg_Expand_Intrinsic; + def int_aarch64_neon_uaddlp : AdvSIMD_1VectorArg_Expand_Intrinsic; + + // Folding Maximum + def int_aarch64_neon_smaxp : AdvSIMD_2VectorArg_Intrinsic; + def int_aarch64_neon_umaxp : AdvSIMD_2VectorArg_Intrinsic; + def int_aarch64_neon_fmaxp : AdvSIMD_2VectorArg_Intrinsic; + + // Folding Minimum + def int_aarch64_neon_sminp : AdvSIMD_2VectorArg_Intrinsic; + def int_aarch64_neon_uminp : AdvSIMD_2VectorArg_Intrinsic; + def int_aarch64_neon_fminp : AdvSIMD_2VectorArg_Intrinsic; + + // Reciprocal Estimate/Step + def int_aarch64_neon_frecps : AdvSIMD_2FloatArg_Intrinsic; + def int_aarch64_neon_frsqrts : AdvSIMD_2FloatArg_Intrinsic; + + // Reciprocal Exponent + def int_aarch64_neon_frecpx : AdvSIMD_1FloatArg_Intrinsic; + + // Vector Saturating Shift Left + def int_aarch64_neon_sqshl : AdvSIMD_2IntArg_Intrinsic; + def int_aarch64_neon_uqshl : AdvSIMD_2IntArg_Intrinsic; + + // Vector Rounding Shift Left + def int_aarch64_neon_srshl : AdvSIMD_2IntArg_Intrinsic; + def int_aarch64_neon_urshl : AdvSIMD_2IntArg_Intrinsic; + + // Vector Saturating Rounding Shift Left + def int_aarch64_neon_sqrshl : AdvSIMD_2IntArg_Intrinsic; + def int_aarch64_neon_uqrshl : AdvSIMD_2IntArg_Intrinsic; + + // Vector Signed->Unsigned Shift Left by Constant + def int_aarch64_neon_sqshlu : AdvSIMD_2IntArg_Intrinsic; + + // Vector Signed->Unsigned Narrowing Saturating Shift Right by Constant + def int_aarch64_neon_sqshrun : AdvSIMD_2Arg_Scalar_Narrow_Intrinsic; + + // Vector Signed->Unsigned Rounding Narrowing Saturating Shift Right by Const + def int_aarch64_neon_sqrshrun : AdvSIMD_2Arg_Scalar_Narrow_Intrinsic; + + // Vector Narrowing Shift Right by Constant + def int_aarch64_neon_sqshrn : AdvSIMD_2Arg_Scalar_Narrow_Intrinsic; + def int_aarch64_neon_uqshrn : AdvSIMD_2Arg_Scalar_Narrow_Intrinsic; + + // Vector Rounding Narrowing Shift Right by Constant + def int_aarch64_neon_rshrn : AdvSIMD_2Arg_Scalar_Narrow_Intrinsic; + + // Vector Rounding Narrowing Saturating Shift Right by Constant + def int_aarch64_neon_sqrshrn : AdvSIMD_2Arg_Scalar_Narrow_Intrinsic; + def int_aarch64_neon_uqrshrn : AdvSIMD_2Arg_Scalar_Narrow_Intrinsic; + + // Vector Shift Left + def int_aarch64_neon_sshl : AdvSIMD_2IntArg_Intrinsic; + def int_aarch64_neon_ushl : AdvSIMD_2IntArg_Intrinsic; + + // Vector Widening Shift Left by Constant + def int_aarch64_neon_shll : AdvSIMD_2VectorArg_Scalar_Wide_BySize_Intrinsic; + def int_aarch64_neon_sshll : AdvSIMD_2VectorArg_Scalar_Wide_Intrinsic; + def int_aarch64_neon_ushll : AdvSIMD_2VectorArg_Scalar_Wide_Intrinsic; + + // Vector Shift Right by Constant and Insert + def int_aarch64_neon_vsri : AdvSIMD_3VectorArg_Scalar_Intrinsic; + + // Vector Shift Left by Constant and Insert + def int_aarch64_neon_vsli : AdvSIMD_3VectorArg_Scalar_Intrinsic; + + // Vector Saturating Narrow + def int_aarch64_neon_scalar_sqxtn: AdvSIMD_1IntArg_Narrow_Intrinsic; + def int_aarch64_neon_scalar_uqxtn : AdvSIMD_1IntArg_Narrow_Intrinsic; + def int_aarch64_neon_sqxtn : AdvSIMD_1VectorArg_Narrow_Intrinsic; + def int_aarch64_neon_uqxtn : AdvSIMD_1VectorArg_Narrow_Intrinsic; + + // Vector Saturating Extract and Unsigned Narrow + def int_aarch64_neon_scalar_sqxtun : AdvSIMD_1IntArg_Narrow_Intrinsic; + def int_aarch64_neon_sqxtun : AdvSIMD_1VectorArg_Narrow_Intrinsic; + + // Vector Absolute Value + def int_aarch64_neon_abs : AdvSIMD_1IntArg_Intrinsic; + + // Vector Saturating Absolute Value + def int_aarch64_neon_sqabs : AdvSIMD_1IntArg_Intrinsic; + + // Vector Saturating 
Negation + def int_aarch64_neon_sqneg : AdvSIMD_1IntArg_Intrinsic; + + // Vector Count Leading Sign Bits + def int_aarch64_neon_cls : AdvSIMD_1VectorArg_Intrinsic; + + // Vector Reciprocal Estimate + def int_aarch64_neon_urecpe : AdvSIMD_1VectorArg_Intrinsic; + def int_aarch64_neon_frecpe : AdvSIMD_1FloatArg_Intrinsic; + + // Vector Square Root Estimate + def int_aarch64_neon_ursqrte : AdvSIMD_1VectorArg_Intrinsic; + def int_aarch64_neon_frsqrte : AdvSIMD_1FloatArg_Intrinsic; + + // Vector Bitwise Reverse + def int_aarch64_neon_rbit : AdvSIMD_1VectorArg_Intrinsic; + + // Vector Conversions Between Half-Precision and Single-Precision. + def int_aarch64_neon_vcvtfp2hf + : Intrinsic<[llvm_v4i16_ty], [llvm_v4f32_ty], [IntrNoMem]>; + def int_aarch64_neon_vcvthf2fp + : Intrinsic<[llvm_v4f32_ty], [llvm_v4i16_ty], [IntrNoMem]>; + + // Vector Conversions Between Floating-point and Fixed-point. + def int_aarch64_neon_vcvtfp2fxs : AdvSIMD_CvtFPToFx_Intrinsic; + def int_aarch64_neon_vcvtfp2fxu : AdvSIMD_CvtFPToFx_Intrinsic; + def int_aarch64_neon_vcvtfxs2fp : AdvSIMD_CvtFxToFP_Intrinsic; + def int_aarch64_neon_vcvtfxu2fp : AdvSIMD_CvtFxToFP_Intrinsic; + + // Vector FP->Int Conversions + def int_aarch64_neon_fcvtas : AdvSIMD_FPToIntRounding_Intrinsic; + def int_aarch64_neon_fcvtau : AdvSIMD_FPToIntRounding_Intrinsic; + def int_aarch64_neon_fcvtms : AdvSIMD_FPToIntRounding_Intrinsic; + def int_aarch64_neon_fcvtmu : AdvSIMD_FPToIntRounding_Intrinsic; + def int_aarch64_neon_fcvtns : AdvSIMD_FPToIntRounding_Intrinsic; + def int_aarch64_neon_fcvtnu : AdvSIMD_FPToIntRounding_Intrinsic; + def int_aarch64_neon_fcvtps : AdvSIMD_FPToIntRounding_Intrinsic; + def int_aarch64_neon_fcvtpu : AdvSIMD_FPToIntRounding_Intrinsic; + def int_aarch64_neon_fcvtzs : AdvSIMD_FPToIntRounding_Intrinsic; + def int_aarch64_neon_fcvtzu : AdvSIMD_FPToIntRounding_Intrinsic; + + // Vector FP Rounding: only ties to even is unrepresented by a normal + // intrinsic. + def int_aarch64_neon_frintn : AdvSIMD_1FloatArg_Intrinsic; + + // Scalar FP->Int conversions + + // Vector FP Inexact Narrowing + def int_aarch64_neon_fcvtxn : AdvSIMD_1VectorArg_Expand_Intrinsic; + + // Scalar FP Inexact Narrowing + def int_aarch64_sisd_fcvtxn : Intrinsic<[llvm_float_ty], [llvm_double_ty], + [IntrNoMem]>; +} + +let TargetPrefix = "aarch64" in { // All intrinsics start with "llvm.aarch64.". + class AdvSIMD_2Vector2Index_Intrinsic + : Intrinsic<[llvm_anyvector_ty], + [llvm_anyvector_ty, llvm_i64_ty, LLVMMatchType<0>, llvm_i64_ty], + [IntrNoMem]>; +} + +// Vector element to element moves +def int_aarch64_neon_vcopy_lane: AdvSIMD_2Vector2Index_Intrinsic; + +let TargetPrefix = "aarch64" in { // All intrinsics start with "llvm.aarch64.". 
+  class AdvSIMD_1Vec_Load_Intrinsic
+    : Intrinsic<[llvm_anyvector_ty], [LLVMAnyPointerType<LLVMMatchType<0>>],
+                [IntrReadArgMem]>;
+  class AdvSIMD_1Vec_Store_Lane_Intrinsic
+    : Intrinsic<[], [llvm_anyvector_ty, llvm_i64_ty, llvm_anyptr_ty],
+                [IntrReadWriteArgMem, NoCapture<2>]>;
+
+  class AdvSIMD_2Vec_Load_Intrinsic
+    : Intrinsic<[llvm_anyvector_ty, LLVMMatchType<0>],
+                [LLVMAnyPointerType<LLVMMatchType<0>>],
+                [IntrReadArgMem]>;
+  class AdvSIMD_2Vec_Load_Lane_Intrinsic
+    : Intrinsic<[llvm_anyvector_ty, LLVMMatchType<0>],
+                [LLVMMatchType<0>, LLVMMatchType<0>,
+                 llvm_i64_ty, llvm_anyptr_ty],
+                [IntrReadArgMem]>;
+  class AdvSIMD_2Vec_Store_Intrinsic
+    : Intrinsic<[], [llvm_anyvector_ty, LLVMMatchType<0>,
+                     LLVMAnyPointerType<LLVMMatchType<0>>],
+                [IntrReadWriteArgMem, NoCapture<2>]>;
+  class AdvSIMD_2Vec_Store_Lane_Intrinsic
+    : Intrinsic<[], [llvm_anyvector_ty, LLVMMatchType<0>,
+                     llvm_i64_ty, llvm_anyptr_ty],
+                [IntrReadWriteArgMem, NoCapture<3>]>;
+
+  class AdvSIMD_3Vec_Load_Intrinsic
+    : Intrinsic<[llvm_anyvector_ty, LLVMMatchType<0>, LLVMMatchType<0>],
+                [LLVMAnyPointerType<LLVMMatchType<0>>],
+                [IntrReadArgMem]>;
+  class AdvSIMD_3Vec_Load_Lane_Intrinsic
+    : Intrinsic<[llvm_anyvector_ty, LLVMMatchType<0>, LLVMMatchType<0>],
+                [LLVMMatchType<0>, LLVMMatchType<0>, LLVMMatchType<0>,
+                 llvm_i64_ty, llvm_anyptr_ty],
+                [IntrReadArgMem]>;
+  class AdvSIMD_3Vec_Store_Intrinsic
+    : Intrinsic<[], [llvm_anyvector_ty, LLVMMatchType<0>,
+                     LLVMMatchType<0>, LLVMAnyPointerType<LLVMMatchType<0>>],
+                [IntrReadWriteArgMem, NoCapture<3>]>;
+  class AdvSIMD_3Vec_Store_Lane_Intrinsic
+    : Intrinsic<[], [llvm_anyvector_ty,
+                     LLVMMatchType<0>, LLVMMatchType<0>,
+                     llvm_i64_ty, llvm_anyptr_ty],
+                [IntrReadWriteArgMem, NoCapture<4>]>;
+
+  class AdvSIMD_4Vec_Load_Intrinsic
+    : Intrinsic<[llvm_anyvector_ty, LLVMMatchType<0>,
+                 LLVMMatchType<0>, LLVMMatchType<0>],
+                [LLVMAnyPointerType<LLVMMatchType<0>>],
+                [IntrReadArgMem]>;
+  class AdvSIMD_4Vec_Load_Lane_Intrinsic
+    : Intrinsic<[llvm_anyvector_ty, LLVMMatchType<0>,
+                 LLVMMatchType<0>, LLVMMatchType<0>],
+                [LLVMMatchType<0>, LLVMMatchType<0>,
+                 LLVMMatchType<0>, LLVMMatchType<0>,
+                 llvm_i64_ty, llvm_anyptr_ty],
+                [IntrReadArgMem]>;
+  class AdvSIMD_4Vec_Store_Intrinsic
+    : Intrinsic<[], [llvm_anyvector_ty, LLVMMatchType<0>,
+                     LLVMMatchType<0>, LLVMMatchType<0>,
+                     LLVMAnyPointerType<LLVMMatchType<0>>],
+                [IntrReadWriteArgMem, NoCapture<4>]>;
+  class AdvSIMD_4Vec_Store_Lane_Intrinsic
+    : Intrinsic<[], [llvm_anyvector_ty, LLVMMatchType<0>,
+                     LLVMMatchType<0>, LLVMMatchType<0>,
+                     llvm_i64_ty, llvm_anyptr_ty],
+                [IntrReadWriteArgMem, NoCapture<5>]>;
+}
+
+// Memory ops
+
+def int_aarch64_neon_ld1x2 : AdvSIMD_2Vec_Load_Intrinsic;
+def int_aarch64_neon_ld1x3 : AdvSIMD_3Vec_Load_Intrinsic;
+def int_aarch64_neon_ld1x4 : AdvSIMD_4Vec_Load_Intrinsic;
+
+def int_aarch64_neon_st1x2 : AdvSIMD_2Vec_Store_Intrinsic;
+def int_aarch64_neon_st1x3 : AdvSIMD_3Vec_Store_Intrinsic;
+def int_aarch64_neon_st1x4 : AdvSIMD_4Vec_Store_Intrinsic;
+
+def int_aarch64_neon_ld2 : AdvSIMD_2Vec_Load_Intrinsic;
+def int_aarch64_neon_ld3 : AdvSIMD_3Vec_Load_Intrinsic;
+def int_aarch64_neon_ld4 : AdvSIMD_4Vec_Load_Intrinsic;
+
+def int_aarch64_neon_ld2lane : AdvSIMD_2Vec_Load_Lane_Intrinsic;
+def int_aarch64_neon_ld3lane : AdvSIMD_3Vec_Load_Lane_Intrinsic;
+def int_aarch64_neon_ld4lane : AdvSIMD_4Vec_Load_Lane_Intrinsic;
+
+def int_aarch64_neon_ld2r : AdvSIMD_2Vec_Load_Intrinsic;
+def int_aarch64_neon_ld3r : AdvSIMD_3Vec_Load_Intrinsic;
+def int_aarch64_neon_ld4r : AdvSIMD_4Vec_Load_Intrinsic;
+
+def int_aarch64_neon_st2 : AdvSIMD_2Vec_Store_Intrinsic;
+def int_aarch64_neon_st3 : AdvSIMD_3Vec_Store_Intrinsic;
+def
int_aarch64_neon_st4 : AdvSIMD_4Vec_Store_Intrinsic; + +def int_aarch64_neon_st2lane : AdvSIMD_2Vec_Store_Lane_Intrinsic; +def int_aarch64_neon_st3lane : AdvSIMD_3Vec_Store_Lane_Intrinsic; +def int_aarch64_neon_st4lane : AdvSIMD_4Vec_Store_Lane_Intrinsic; + +let TargetPrefix = "aarch64" in { // All intrinsics start with "llvm.aarch64.". + class AdvSIMD_Tbl1_Intrinsic + : Intrinsic<[llvm_anyvector_ty], [llvm_v16i8_ty, LLVMMatchType<0>], + [IntrNoMem]>; + class AdvSIMD_Tbl2_Intrinsic + : Intrinsic<[llvm_anyvector_ty], + [llvm_v16i8_ty, llvm_v16i8_ty, LLVMMatchType<0>], [IntrNoMem]>; + class AdvSIMD_Tbl3_Intrinsic + : Intrinsic<[llvm_anyvector_ty], + [llvm_v16i8_ty, llvm_v16i8_ty, llvm_v16i8_ty, + LLVMMatchType<0>], + [IntrNoMem]>; + class AdvSIMD_Tbl4_Intrinsic + : Intrinsic<[llvm_anyvector_ty], + [llvm_v16i8_ty, llvm_v16i8_ty, llvm_v16i8_ty, llvm_v16i8_ty, + LLVMMatchType<0>], + [IntrNoMem]>; + + class AdvSIMD_Tbx1_Intrinsic + : Intrinsic<[llvm_anyvector_ty], + [LLVMMatchType<0>, llvm_v16i8_ty, LLVMMatchType<0>], + [IntrNoMem]>; + class AdvSIMD_Tbx2_Intrinsic + : Intrinsic<[llvm_anyvector_ty], + [LLVMMatchType<0>, llvm_v16i8_ty, llvm_v16i8_ty, + LLVMMatchType<0>], + [IntrNoMem]>; + class AdvSIMD_Tbx3_Intrinsic + : Intrinsic<[llvm_anyvector_ty], + [LLVMMatchType<0>, llvm_v16i8_ty, llvm_v16i8_ty, + llvm_v16i8_ty, LLVMMatchType<0>], + [IntrNoMem]>; + class AdvSIMD_Tbx4_Intrinsic + : Intrinsic<[llvm_anyvector_ty], + [LLVMMatchType<0>, llvm_v16i8_ty, llvm_v16i8_ty, + llvm_v16i8_ty, llvm_v16i8_ty, LLVMMatchType<0>], + [IntrNoMem]>; +} +def int_aarch64_neon_tbl1 : AdvSIMD_Tbl1_Intrinsic; +def int_aarch64_neon_tbl2 : AdvSIMD_Tbl2_Intrinsic; +def int_aarch64_neon_tbl3 : AdvSIMD_Tbl3_Intrinsic; +def int_aarch64_neon_tbl4 : AdvSIMD_Tbl4_Intrinsic; + +def int_aarch64_neon_tbx1 : AdvSIMD_Tbx1_Intrinsic; +def int_aarch64_neon_tbx2 : AdvSIMD_Tbx2_Intrinsic; +def int_aarch64_neon_tbx3 : AdvSIMD_Tbx3_Intrinsic; +def int_aarch64_neon_tbx4 : AdvSIMD_Tbx4_Intrinsic; + +let TargetPrefix = "aarch64" in { + class Crypto_AES_DataKey_Intrinsic + : Intrinsic<[llvm_v16i8_ty], [llvm_v16i8_ty, llvm_v16i8_ty], [IntrNoMem]>; + + class Crypto_AES_Data_Intrinsic + : Intrinsic<[llvm_v16i8_ty], [llvm_v16i8_ty], [IntrNoMem]>; + + // SHA intrinsic taking 5 words of the hash (v4i32, i32) and 4 of the schedule + // (v4i32). + class Crypto_SHA_5Hash4Schedule_Intrinsic + : Intrinsic<[llvm_v4i32_ty], [llvm_v4i32_ty, llvm_i32_ty, llvm_v4i32_ty], + [IntrNoMem]>; + + // SHA intrinsic taking 5 words of the hash (v4i32, i32) and 4 of the schedule + // (v4i32). + class Crypto_SHA_1Hash_Intrinsic + : Intrinsic<[llvm_i32_ty], [llvm_i32_ty], [IntrNoMem]>; + + // SHA intrinsic taking 8 words of the schedule + class Crypto_SHA_8Schedule_Intrinsic + : Intrinsic<[llvm_v4i32_ty], [llvm_v4i32_ty, llvm_v4i32_ty], [IntrNoMem]>; + + // SHA intrinsic taking 12 words of the schedule + class Crypto_SHA_12Schedule_Intrinsic + : Intrinsic<[llvm_v4i32_ty], [llvm_v4i32_ty, llvm_v4i32_ty, llvm_v4i32_ty], + [IntrNoMem]>; + + // SHA intrinsic taking 8 words of the hash and 4 of the schedule. 
+ class Crypto_SHA_8Hash4Schedule_Intrinsic + : Intrinsic<[llvm_v4i32_ty], [llvm_v4i32_ty, llvm_v4i32_ty, llvm_v4i32_ty], + [IntrNoMem]>; +} + +// AES +def int_aarch64_crypto_aese : Crypto_AES_DataKey_Intrinsic; +def int_aarch64_crypto_aesd : Crypto_AES_DataKey_Intrinsic; +def int_aarch64_crypto_aesmc : Crypto_AES_Data_Intrinsic; +def int_aarch64_crypto_aesimc : Crypto_AES_Data_Intrinsic; + +// SHA1 +def int_aarch64_crypto_sha1c : Crypto_SHA_5Hash4Schedule_Intrinsic; +def int_aarch64_crypto_sha1p : Crypto_SHA_5Hash4Schedule_Intrinsic; +def int_aarch64_crypto_sha1m : Crypto_SHA_5Hash4Schedule_Intrinsic; +def int_aarch64_crypto_sha1h : Crypto_SHA_1Hash_Intrinsic; + +def int_aarch64_crypto_sha1su0 : Crypto_SHA_12Schedule_Intrinsic; +def int_aarch64_crypto_sha1su1 : Crypto_SHA_8Schedule_Intrinsic; + +// SHA256 +def int_aarch64_crypto_sha256h : Crypto_SHA_8Hash4Schedule_Intrinsic; +def int_aarch64_crypto_sha256h2 : Crypto_SHA_8Hash4Schedule_Intrinsic; +def int_aarch64_crypto_sha256su0 : Crypto_SHA_8Schedule_Intrinsic; +def int_aarch64_crypto_sha256su1 : Crypto_SHA_12Schedule_Intrinsic; + +//===----------------------------------------------------------------------===// +// CRC32 + +let TargetPrefix = "aarch64" in { + +def int_aarch64_crc32b : Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], + [IntrNoMem]>; +def int_aarch64_crc32cb : Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], + [IntrNoMem]>; +def int_aarch64_crc32h : Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], + [IntrNoMem]>; +def int_aarch64_crc32ch : Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], + [IntrNoMem]>; +def int_aarch64_crc32w : Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], + [IntrNoMem]>; +def int_aarch64_crc32cw : Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], + [IntrNoMem]>; +def int_aarch64_crc32x : Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i64_ty], + [IntrNoMem]>; +def int_aarch64_crc32cx : Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i64_ty], + [IntrNoMem]>; +} diff --git a/include/llvm/IR/IntrinsicsARM64.td b/include/llvm/IR/IntrinsicsARM64.td deleted file mode 100644 index 146ea5d970c..00000000000 --- a/include/llvm/IR/IntrinsicsARM64.td +++ /dev/null @@ -1,636 +0,0 @@ -//===- IntrinsicsARM64.td - Defines ARM64 intrinsics -------*- tablegen -*-===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -// This file defines all of the ARM64-specific intrinsics. 
-// -//===----------------------------------------------------------------------===// - -let TargetPrefix = "arm64" in { - -def int_arm64_ldxr : Intrinsic<[llvm_i64_ty], [llvm_anyptr_ty]>; -def int_arm64_ldaxr : Intrinsic<[llvm_i64_ty], [llvm_anyptr_ty]>; -def int_arm64_stxr : Intrinsic<[llvm_i32_ty], [llvm_i64_ty, llvm_anyptr_ty]>; -def int_arm64_stlxr : Intrinsic<[llvm_i32_ty], [llvm_i64_ty, llvm_anyptr_ty]>; - -def int_arm64_ldxp : Intrinsic<[llvm_i64_ty, llvm_i64_ty], [llvm_ptr_ty]>; -def int_arm64_ldaxp : Intrinsic<[llvm_i64_ty, llvm_i64_ty], [llvm_ptr_ty]>; -def int_arm64_stxp : Intrinsic<[llvm_i32_ty], - [llvm_i64_ty, llvm_i64_ty, llvm_ptr_ty]>; -def int_arm64_stlxp : Intrinsic<[llvm_i32_ty], - [llvm_i64_ty, llvm_i64_ty, llvm_ptr_ty]>; - -def int_arm64_clrex : Intrinsic<[]>; - -def int_arm64_sdiv : Intrinsic<[llvm_anyint_ty], [LLVMMatchType<0>, - LLVMMatchType<0>], [IntrNoMem]>; -def int_arm64_udiv : Intrinsic<[llvm_anyint_ty], [LLVMMatchType<0>, - LLVMMatchType<0>], [IntrNoMem]>; -} - -//===----------------------------------------------------------------------===// -// Advanced SIMD (NEON) - -let TargetPrefix = "arm64" in { // All intrinsics start with "llvm.arm64.". - class AdvSIMD_2Scalar_Float_Intrinsic - : Intrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>, LLVMMatchType<0>], - [IntrNoMem]>; - - class AdvSIMD_FPToIntRounding_Intrinsic - : Intrinsic<[llvm_anyint_ty], [llvm_anyfloat_ty], [IntrNoMem]>; - - class AdvSIMD_1IntArg_Intrinsic - : Intrinsic<[llvm_anyint_ty], [LLVMMatchType<0>], [IntrNoMem]>; - class AdvSIMD_1FloatArg_Intrinsic - : Intrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>], [IntrNoMem]>; - class AdvSIMD_1VectorArg_Intrinsic - : Intrinsic<[llvm_anyvector_ty], [LLVMMatchType<0>], [IntrNoMem]>; - class AdvSIMD_1VectorArg_Expand_Intrinsic - : Intrinsic<[llvm_anyvector_ty], [llvm_anyvector_ty], [IntrNoMem]>; - class AdvSIMD_1VectorArg_Long_Intrinsic - : Intrinsic<[llvm_anyvector_ty], [LLVMTruncatedType<0>], [IntrNoMem]>; - class AdvSIMD_1IntArg_Narrow_Intrinsic - : Intrinsic<[llvm_anyint_ty], [llvm_anyint_ty], [IntrNoMem]>; - class AdvSIMD_1VectorArg_Narrow_Intrinsic - : Intrinsic<[llvm_anyint_ty], [LLVMExtendedType<0>], [IntrNoMem]>; - class AdvSIMD_1VectorArg_Int_Across_Intrinsic - : Intrinsic<[llvm_anyint_ty], [llvm_anyvector_ty], [IntrNoMem]>; - class AdvSIMD_1VectorArg_Float_Across_Intrinsic - : Intrinsic<[llvm_anyfloat_ty], [llvm_anyvector_ty], [IntrNoMem]>; - - class AdvSIMD_2IntArg_Intrinsic - : Intrinsic<[llvm_anyint_ty], [LLVMMatchType<0>, LLVMMatchType<0>], - [IntrNoMem]>; - class AdvSIMD_2FloatArg_Intrinsic - : Intrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>, LLVMMatchType<0>], - [IntrNoMem]>; - class AdvSIMD_2VectorArg_Intrinsic - : Intrinsic<[llvm_anyvector_ty], [LLVMMatchType<0>, LLVMMatchType<0>], - [IntrNoMem]>; - class AdvSIMD_2VectorArg_Compare_Intrinsic - : Intrinsic<[llvm_anyvector_ty], [llvm_anyvector_ty, LLVMMatchType<1>], - [IntrNoMem]>; - class AdvSIMD_2Arg_FloatCompare_Intrinsic - : Intrinsic<[llvm_anyint_ty], [llvm_anyfloat_ty, LLVMMatchType<1>], - [IntrNoMem]>; - class AdvSIMD_2VectorArg_Long_Intrinsic - : Intrinsic<[llvm_anyvector_ty], - [LLVMTruncatedType<0>, LLVMTruncatedType<0>], - [IntrNoMem]>; - class AdvSIMD_2VectorArg_Wide_Intrinsic - : Intrinsic<[llvm_anyvector_ty], - [LLVMMatchType<0>, LLVMTruncatedType<0>], - [IntrNoMem]>; - class AdvSIMD_2VectorArg_Narrow_Intrinsic - : Intrinsic<[llvm_anyvector_ty], - [LLVMExtendedType<0>, LLVMExtendedType<0>], - [IntrNoMem]>; - class AdvSIMD_2Arg_Scalar_Narrow_Intrinsic - : 
Intrinsic<[llvm_anyint_ty], - [LLVMExtendedType<0>, llvm_i32_ty], - [IntrNoMem]>; - class AdvSIMD_2VectorArg_Scalar_Expand_BySize_Intrinsic - : Intrinsic<[llvm_anyvector_ty], - [llvm_anyvector_ty], - [IntrNoMem]>; - class AdvSIMD_2VectorArg_Scalar_Wide_BySize_Intrinsic - : Intrinsic<[llvm_anyvector_ty], - [LLVMTruncatedType<0>], - [IntrNoMem]>; - class AdvSIMD_2VectorArg_Scalar_Wide_Intrinsic - : Intrinsic<[llvm_anyvector_ty], - [LLVMTruncatedType<0>, llvm_i32_ty], - [IntrNoMem]>; - class AdvSIMD_2VectorArg_Tied_Narrow_Intrinsic - : Intrinsic<[llvm_anyvector_ty], - [LLVMHalfElementsVectorType<0>, llvm_anyvector_ty], - [IntrNoMem]>; - - class AdvSIMD_3VectorArg_Intrinsic - : Intrinsic<[llvm_anyvector_ty], - [LLVMMatchType<0>, LLVMMatchType<0>, LLVMMatchType<0>], - [IntrNoMem]>; - class AdvSIMD_3VectorArg_Scalar_Intrinsic - : Intrinsic<[llvm_anyvector_ty], - [LLVMMatchType<0>, LLVMMatchType<0>, llvm_i32_ty], - [IntrNoMem]>; - class AdvSIMD_3VectorArg_Tied_Narrow_Intrinsic - : Intrinsic<[llvm_anyvector_ty], - [LLVMHalfElementsVectorType<0>, llvm_anyvector_ty, - LLVMMatchType<1>], [IntrNoMem]>; - class AdvSIMD_3VectorArg_Scalar_Tied_Narrow_Intrinsic - : Intrinsic<[llvm_anyvector_ty], - [LLVMHalfElementsVectorType<0>, llvm_anyvector_ty, llvm_i32_ty], - [IntrNoMem]>; - class AdvSIMD_CvtFxToFP_Intrinsic - : Intrinsic<[llvm_anyfloat_ty], [llvm_anyint_ty, llvm_i32_ty], - [IntrNoMem]>; - class AdvSIMD_CvtFPToFx_Intrinsic - : Intrinsic<[llvm_anyint_ty], [llvm_anyfloat_ty, llvm_i32_ty], - [IntrNoMem]>; -} - -// Arithmetic ops - -let Properties = [IntrNoMem] in { - // Vector Add Across Lanes - def int_arm64_neon_saddv : AdvSIMD_1VectorArg_Int_Across_Intrinsic; - def int_arm64_neon_uaddv : AdvSIMD_1VectorArg_Int_Across_Intrinsic; - def int_arm64_neon_faddv : AdvSIMD_1VectorArg_Float_Across_Intrinsic; - - // Vector Long Add Across Lanes - def int_arm64_neon_saddlv : AdvSIMD_1VectorArg_Int_Across_Intrinsic; - def int_arm64_neon_uaddlv : AdvSIMD_1VectorArg_Int_Across_Intrinsic; - - // Vector Halving Add - def int_arm64_neon_shadd : AdvSIMD_2VectorArg_Intrinsic; - def int_arm64_neon_uhadd : AdvSIMD_2VectorArg_Intrinsic; - - // Vector Rounding Halving Add - def int_arm64_neon_srhadd : AdvSIMD_2VectorArg_Intrinsic; - def int_arm64_neon_urhadd : AdvSIMD_2VectorArg_Intrinsic; - - // Vector Saturating Add - def int_arm64_neon_sqadd : AdvSIMD_2IntArg_Intrinsic; - def int_arm64_neon_suqadd : AdvSIMD_2IntArg_Intrinsic; - def int_arm64_neon_usqadd : AdvSIMD_2IntArg_Intrinsic; - def int_arm64_neon_uqadd : AdvSIMD_2IntArg_Intrinsic; - - // Vector Add High-Half - // FIXME: this is a legacy intrinsic for aarch64_simd.h. Remove it when that - // header is no longer supported. - def int_arm64_neon_addhn : AdvSIMD_2VectorArg_Narrow_Intrinsic; - - // Vector Rounding Add High-Half - def int_arm64_neon_raddhn : AdvSIMD_2VectorArg_Narrow_Intrinsic; - - // Vector Saturating Doubling Multiply High - def int_arm64_neon_sqdmulh : AdvSIMD_2IntArg_Intrinsic; - - // Vector Saturating Rounding Doubling Multiply High - def int_arm64_neon_sqrdmulh : AdvSIMD_2IntArg_Intrinsic; - - // Vector Polynominal Multiply - def int_arm64_neon_pmul : AdvSIMD_2VectorArg_Intrinsic; - - // Vector Long Multiply - def int_arm64_neon_smull : AdvSIMD_2VectorArg_Long_Intrinsic; - def int_arm64_neon_umull : AdvSIMD_2VectorArg_Long_Intrinsic; - def int_arm64_neon_pmull : AdvSIMD_2VectorArg_Long_Intrinsic; - - // 64-bit polynomial multiply really returns an i128, which is not legal. Fake - // it with a v16i8. 
- def int_arm64_neon_pmull64 : - Intrinsic<[llvm_v16i8_ty], [llvm_i64_ty, llvm_i64_ty], [IntrNoMem]>; - - // Vector Extending Multiply - def int_arm64_neon_fmulx : AdvSIMD_2FloatArg_Intrinsic { - let Properties = [IntrNoMem, Commutative]; - } - - // Vector Saturating Doubling Long Multiply - def int_arm64_neon_sqdmull : AdvSIMD_2VectorArg_Long_Intrinsic; - def int_arm64_neon_sqdmulls_scalar - : Intrinsic<[llvm_i64_ty], [llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>; - - // Vector Halving Subtract - def int_arm64_neon_shsub : AdvSIMD_2VectorArg_Intrinsic; - def int_arm64_neon_uhsub : AdvSIMD_2VectorArg_Intrinsic; - - // Vector Saturating Subtract - def int_arm64_neon_sqsub : AdvSIMD_2IntArg_Intrinsic; - def int_arm64_neon_uqsub : AdvSIMD_2IntArg_Intrinsic; - - // Vector Subtract High-Half - // FIXME: this is a legacy intrinsic for aarch64_simd.h. Remove it when that - // header is no longer supported. - def int_arm64_neon_subhn : AdvSIMD_2VectorArg_Narrow_Intrinsic; - - // Vector Rounding Subtract High-Half - def int_arm64_neon_rsubhn : AdvSIMD_2VectorArg_Narrow_Intrinsic; - - // Vector Compare Absolute Greater-than-or-equal - def int_arm64_neon_facge : AdvSIMD_2Arg_FloatCompare_Intrinsic; - - // Vector Compare Absolute Greater-than - def int_arm64_neon_facgt : AdvSIMD_2Arg_FloatCompare_Intrinsic; - - // Vector Absolute Difference - def int_arm64_neon_sabd : AdvSIMD_2VectorArg_Intrinsic; - def int_arm64_neon_uabd : AdvSIMD_2VectorArg_Intrinsic; - def int_arm64_neon_fabd : AdvSIMD_2VectorArg_Intrinsic; - - // Scalar Absolute Difference - def int_arm64_sisd_fabd : AdvSIMD_2Scalar_Float_Intrinsic; - - // Vector Max - def int_arm64_neon_smax : AdvSIMD_2VectorArg_Intrinsic; - def int_arm64_neon_umax : AdvSIMD_2VectorArg_Intrinsic; - def int_arm64_neon_fmax : AdvSIMD_2VectorArg_Intrinsic; - def int_arm64_neon_fmaxnmp : AdvSIMD_2VectorArg_Intrinsic; - - // Vector Max Across Lanes - def int_arm64_neon_smaxv : AdvSIMD_1VectorArg_Int_Across_Intrinsic; - def int_arm64_neon_umaxv : AdvSIMD_1VectorArg_Int_Across_Intrinsic; - def int_arm64_neon_fmaxv : AdvSIMD_1VectorArg_Float_Across_Intrinsic; - def int_arm64_neon_fmaxnmv : AdvSIMD_1VectorArg_Float_Across_Intrinsic; - - // Vector Min - def int_arm64_neon_smin : AdvSIMD_2VectorArg_Intrinsic; - def int_arm64_neon_umin : AdvSIMD_2VectorArg_Intrinsic; - def int_arm64_neon_fmin : AdvSIMD_2VectorArg_Intrinsic; - def int_arm64_neon_fminnmp : AdvSIMD_2VectorArg_Intrinsic; - - // Vector Min/Max Number - def int_arm64_neon_fminnm : AdvSIMD_2FloatArg_Intrinsic; - def int_arm64_neon_fmaxnm : AdvSIMD_2FloatArg_Intrinsic; - - // Vector Min Across Lanes - def int_arm64_neon_sminv : AdvSIMD_1VectorArg_Int_Across_Intrinsic; - def int_arm64_neon_uminv : AdvSIMD_1VectorArg_Int_Across_Intrinsic; - def int_arm64_neon_fminv : AdvSIMD_1VectorArg_Float_Across_Intrinsic; - def int_arm64_neon_fminnmv : AdvSIMD_1VectorArg_Float_Across_Intrinsic; - - // Pairwise Add - def int_arm64_neon_addp : AdvSIMD_2VectorArg_Intrinsic; - - // Long Pairwise Add - // FIXME: In theory, we shouldn't need intrinsics for saddlp or - // uaddlp, but tblgen's type inference currently can't handle the - // pattern fragments this ends up generating. 
- def int_arm64_neon_saddlp : AdvSIMD_1VectorArg_Expand_Intrinsic; - def int_arm64_neon_uaddlp : AdvSIMD_1VectorArg_Expand_Intrinsic; - - // Folding Maximum - def int_arm64_neon_smaxp : AdvSIMD_2VectorArg_Intrinsic; - def int_arm64_neon_umaxp : AdvSIMD_2VectorArg_Intrinsic; - def int_arm64_neon_fmaxp : AdvSIMD_2VectorArg_Intrinsic; - - // Folding Minimum - def int_arm64_neon_sminp : AdvSIMD_2VectorArg_Intrinsic; - def int_arm64_neon_uminp : AdvSIMD_2VectorArg_Intrinsic; - def int_arm64_neon_fminp : AdvSIMD_2VectorArg_Intrinsic; - - // Reciprocal Estimate/Step - def int_arm64_neon_frecps : AdvSIMD_2FloatArg_Intrinsic; - def int_arm64_neon_frsqrts : AdvSIMD_2FloatArg_Intrinsic; - - // Reciprocal Exponent - def int_arm64_neon_frecpx : AdvSIMD_1FloatArg_Intrinsic; - - // Vector Saturating Shift Left - def int_arm64_neon_sqshl : AdvSIMD_2IntArg_Intrinsic; - def int_arm64_neon_uqshl : AdvSIMD_2IntArg_Intrinsic; - - // Vector Rounding Shift Left - def int_arm64_neon_srshl : AdvSIMD_2IntArg_Intrinsic; - def int_arm64_neon_urshl : AdvSIMD_2IntArg_Intrinsic; - - // Vector Saturating Rounding Shift Left - def int_arm64_neon_sqrshl : AdvSIMD_2IntArg_Intrinsic; - def int_arm64_neon_uqrshl : AdvSIMD_2IntArg_Intrinsic; - - // Vector Signed->Unsigned Shift Left by Constant - def int_arm64_neon_sqshlu : AdvSIMD_2IntArg_Intrinsic; - - // Vector Signed->Unsigned Narrowing Saturating Shift Right by Constant - def int_arm64_neon_sqshrun : AdvSIMD_2Arg_Scalar_Narrow_Intrinsic; - - // Vector Signed->Unsigned Rounding Narrowing Saturating Shift Right by Const - def int_arm64_neon_sqrshrun : AdvSIMD_2Arg_Scalar_Narrow_Intrinsic; - - // Vector Narrowing Shift Right by Constant - def int_arm64_neon_sqshrn : AdvSIMD_2Arg_Scalar_Narrow_Intrinsic; - def int_arm64_neon_uqshrn : AdvSIMD_2Arg_Scalar_Narrow_Intrinsic; - - // Vector Rounding Narrowing Shift Right by Constant - def int_arm64_neon_rshrn : AdvSIMD_2Arg_Scalar_Narrow_Intrinsic; - - // Vector Rounding Narrowing Saturating Shift Right by Constant - def int_arm64_neon_sqrshrn : AdvSIMD_2Arg_Scalar_Narrow_Intrinsic; - def int_arm64_neon_uqrshrn : AdvSIMD_2Arg_Scalar_Narrow_Intrinsic; - - // Vector Shift Left - def int_arm64_neon_sshl : AdvSIMD_2IntArg_Intrinsic; - def int_arm64_neon_ushl : AdvSIMD_2IntArg_Intrinsic; - - // Vector Widening Shift Left by Constant - def int_arm64_neon_shll : AdvSIMD_2VectorArg_Scalar_Wide_BySize_Intrinsic; - def int_arm64_neon_sshll : AdvSIMD_2VectorArg_Scalar_Wide_Intrinsic; - def int_arm64_neon_ushll : AdvSIMD_2VectorArg_Scalar_Wide_Intrinsic; - - // Vector Shift Right by Constant and Insert - def int_arm64_neon_vsri : AdvSIMD_3VectorArg_Scalar_Intrinsic; - - // Vector Shift Left by Constant and Insert - def int_arm64_neon_vsli : AdvSIMD_3VectorArg_Scalar_Intrinsic; - - // Vector Saturating Narrow - def int_arm64_neon_scalar_sqxtn: AdvSIMD_1IntArg_Narrow_Intrinsic; - def int_arm64_neon_scalar_uqxtn : AdvSIMD_1IntArg_Narrow_Intrinsic; - def int_arm64_neon_sqxtn : AdvSIMD_1VectorArg_Narrow_Intrinsic; - def int_arm64_neon_uqxtn : AdvSIMD_1VectorArg_Narrow_Intrinsic; - - // Vector Saturating Extract and Unsigned Narrow - def int_arm64_neon_scalar_sqxtun : AdvSIMD_1IntArg_Narrow_Intrinsic; - def int_arm64_neon_sqxtun : AdvSIMD_1VectorArg_Narrow_Intrinsic; - - // Vector Absolute Value - def int_arm64_neon_abs : AdvSIMD_1IntArg_Intrinsic; - - // Vector Saturating Absolute Value - def int_arm64_neon_sqabs : AdvSIMD_1IntArg_Intrinsic; - - // Vector Saturating Negation - def int_arm64_neon_sqneg : AdvSIMD_1IntArg_Intrinsic; - - // Vector 
Count Leading Sign Bits - def int_arm64_neon_cls : AdvSIMD_1VectorArg_Intrinsic; - - // Vector Reciprocal Estimate - def int_arm64_neon_urecpe : AdvSIMD_1VectorArg_Intrinsic; - def int_arm64_neon_frecpe : AdvSIMD_1FloatArg_Intrinsic; - - // Vector Square Root Estimate - def int_arm64_neon_ursqrte : AdvSIMD_1VectorArg_Intrinsic; - def int_arm64_neon_frsqrte : AdvSIMD_1FloatArg_Intrinsic; - - // Vector Bitwise Reverse - def int_arm64_neon_rbit : AdvSIMD_1VectorArg_Intrinsic; - - // Vector Conversions Between Half-Precision and Single-Precision. - def int_arm64_neon_vcvtfp2hf - : Intrinsic<[llvm_v4i16_ty], [llvm_v4f32_ty], [IntrNoMem]>; - def int_arm64_neon_vcvthf2fp - : Intrinsic<[llvm_v4f32_ty], [llvm_v4i16_ty], [IntrNoMem]>; - - // Vector Conversions Between Floating-point and Fixed-point. - def int_arm64_neon_vcvtfp2fxs : AdvSIMD_CvtFPToFx_Intrinsic; - def int_arm64_neon_vcvtfp2fxu : AdvSIMD_CvtFPToFx_Intrinsic; - def int_arm64_neon_vcvtfxs2fp : AdvSIMD_CvtFxToFP_Intrinsic; - def int_arm64_neon_vcvtfxu2fp : AdvSIMD_CvtFxToFP_Intrinsic; - - // Vector FP->Int Conversions - def int_arm64_neon_fcvtas : AdvSIMD_FPToIntRounding_Intrinsic; - def int_arm64_neon_fcvtau : AdvSIMD_FPToIntRounding_Intrinsic; - def int_arm64_neon_fcvtms : AdvSIMD_FPToIntRounding_Intrinsic; - def int_arm64_neon_fcvtmu : AdvSIMD_FPToIntRounding_Intrinsic; - def int_arm64_neon_fcvtns : AdvSIMD_FPToIntRounding_Intrinsic; - def int_arm64_neon_fcvtnu : AdvSIMD_FPToIntRounding_Intrinsic; - def int_arm64_neon_fcvtps : AdvSIMD_FPToIntRounding_Intrinsic; - def int_arm64_neon_fcvtpu : AdvSIMD_FPToIntRounding_Intrinsic; - def int_arm64_neon_fcvtzs : AdvSIMD_FPToIntRounding_Intrinsic; - def int_arm64_neon_fcvtzu : AdvSIMD_FPToIntRounding_Intrinsic; - - // Vector FP Rounding: only ties to even is unrepresented by a normal - // intrinsic. - def int_arm64_neon_frintn : AdvSIMD_1FloatArg_Intrinsic; - - // Scalar FP->Int conversions - - // Vector FP Inexact Narrowing - def int_arm64_neon_fcvtxn : AdvSIMD_1VectorArg_Expand_Intrinsic; - - // Scalar FP Inexact Narrowing - def int_arm64_sisd_fcvtxn : Intrinsic<[llvm_float_ty], [llvm_double_ty], - [IntrNoMem]>; -} - -let TargetPrefix = "arm64" in { // All intrinsics start with "llvm.arm64.". - class AdvSIMD_2Vector2Index_Intrinsic - : Intrinsic<[llvm_anyvector_ty], - [llvm_anyvector_ty, llvm_i64_ty, LLVMMatchType<0>, llvm_i64_ty], - [IntrNoMem]>; -} - -// Vector element to element moves -def int_arm64_neon_vcopy_lane: AdvSIMD_2Vector2Index_Intrinsic; - -let TargetPrefix = "arm64" in { // All intrinsics start with "llvm.arm64.". 
-  class AdvSIMD_1Vec_Load_Intrinsic
-    : Intrinsic<[llvm_anyvector_ty], [LLVMAnyPointerType<LLVMMatchType<0>>],
-                [IntrReadArgMem]>;
-  class AdvSIMD_1Vec_Store_Lane_Intrinsic
-    : Intrinsic<[], [llvm_anyvector_ty, llvm_i64_ty, llvm_anyptr_ty],
-                [IntrReadWriteArgMem, NoCapture<2>]>;
-
-  class AdvSIMD_2Vec_Load_Intrinsic
-    : Intrinsic<[llvm_anyvector_ty, LLVMMatchType<0>],
-                [LLVMAnyPointerType<LLVMMatchType<0>>],
-                [IntrReadArgMem]>;
-  class AdvSIMD_2Vec_Load_Lane_Intrinsic
-    : Intrinsic<[llvm_anyvector_ty, LLVMMatchType<0>],
-                [LLVMMatchType<0>, LLVMMatchType<0>,
-                 llvm_i64_ty, llvm_anyptr_ty],
-                [IntrReadArgMem]>;
-  class AdvSIMD_2Vec_Store_Intrinsic
-    : Intrinsic<[], [llvm_anyvector_ty, LLVMMatchType<0>,
-                     LLVMAnyPointerType<LLVMMatchType<0>>],
-                [IntrReadWriteArgMem, NoCapture<2>]>;
-  class AdvSIMD_2Vec_Store_Lane_Intrinsic
-    : Intrinsic<[], [llvm_anyvector_ty, LLVMMatchType<0>,
-                     llvm_i64_ty, llvm_anyptr_ty],
-                [IntrReadWriteArgMem, NoCapture<3>]>;
-
-  class AdvSIMD_3Vec_Load_Intrinsic
-    : Intrinsic<[llvm_anyvector_ty, LLVMMatchType<0>, LLVMMatchType<0>],
-                [LLVMAnyPointerType<LLVMMatchType<0>>],
-                [IntrReadArgMem]>;
-  class AdvSIMD_3Vec_Load_Lane_Intrinsic
-    : Intrinsic<[llvm_anyvector_ty, LLVMMatchType<0>, LLVMMatchType<0>],
-                [LLVMMatchType<0>, LLVMMatchType<0>, LLVMMatchType<0>,
-                 llvm_i64_ty, llvm_anyptr_ty],
-                [IntrReadArgMem]>;
-  class AdvSIMD_3Vec_Store_Intrinsic
-    : Intrinsic<[], [llvm_anyvector_ty, LLVMMatchType<0>,
-                     LLVMMatchType<0>, LLVMAnyPointerType<LLVMMatchType<0>>],
-                [IntrReadWriteArgMem, NoCapture<3>]>;
-  class AdvSIMD_3Vec_Store_Lane_Intrinsic
-    : Intrinsic<[], [llvm_anyvector_ty,
-                     LLVMMatchType<0>, LLVMMatchType<0>,
-                     llvm_i64_ty, llvm_anyptr_ty],
-                [IntrReadWriteArgMem, NoCapture<4>]>;
-
-  class AdvSIMD_4Vec_Load_Intrinsic
-    : Intrinsic<[llvm_anyvector_ty, LLVMMatchType<0>,
-                 LLVMMatchType<0>, LLVMMatchType<0>],
-                [LLVMAnyPointerType<LLVMMatchType<0>>],
-                [IntrReadArgMem]>;
-  class AdvSIMD_4Vec_Load_Lane_Intrinsic
-    : Intrinsic<[llvm_anyvector_ty, LLVMMatchType<0>,
-                 LLVMMatchType<0>, LLVMMatchType<0>],
-                [LLVMMatchType<0>, LLVMMatchType<0>,
-                 LLVMMatchType<0>, LLVMMatchType<0>,
-                 llvm_i64_ty, llvm_anyptr_ty],
-                [IntrReadArgMem]>;
-  class AdvSIMD_4Vec_Store_Intrinsic
-    : Intrinsic<[], [llvm_anyvector_ty, LLVMMatchType<0>,
-                     LLVMMatchType<0>, LLVMMatchType<0>,
-                     LLVMAnyPointerType<LLVMMatchType<0>>],
-                [IntrReadWriteArgMem, NoCapture<4>]>;
-  class AdvSIMD_4Vec_Store_Lane_Intrinsic
-    : Intrinsic<[], [llvm_anyvector_ty, LLVMMatchType<0>,
-                     LLVMMatchType<0>, LLVMMatchType<0>,
-                     llvm_i64_ty, llvm_anyptr_ty],
-                [IntrReadWriteArgMem, NoCapture<5>]>;
-}
-
-// Memory ops
-
-def int_arm64_neon_ld1x2 : AdvSIMD_2Vec_Load_Intrinsic;
-def int_arm64_neon_ld1x3 : AdvSIMD_3Vec_Load_Intrinsic;
-def int_arm64_neon_ld1x4 : AdvSIMD_4Vec_Load_Intrinsic;
-
-def int_arm64_neon_st1x2 : AdvSIMD_2Vec_Store_Intrinsic;
-def int_arm64_neon_st1x3 : AdvSIMD_3Vec_Store_Intrinsic;
-def int_arm64_neon_st1x4 : AdvSIMD_4Vec_Store_Intrinsic;
-
-def int_arm64_neon_ld2 : AdvSIMD_2Vec_Load_Intrinsic;
-def int_arm64_neon_ld3 : AdvSIMD_3Vec_Load_Intrinsic;
-def int_arm64_neon_ld4 : AdvSIMD_4Vec_Load_Intrinsic;
-
-def int_arm64_neon_ld2lane : AdvSIMD_2Vec_Load_Lane_Intrinsic;
-def int_arm64_neon_ld3lane : AdvSIMD_3Vec_Load_Lane_Intrinsic;
-def int_arm64_neon_ld4lane : AdvSIMD_4Vec_Load_Lane_Intrinsic;
-
-def int_arm64_neon_ld2r : AdvSIMD_2Vec_Load_Intrinsic;
-def int_arm64_neon_ld3r : AdvSIMD_3Vec_Load_Intrinsic;
-def int_arm64_neon_ld4r : AdvSIMD_4Vec_Load_Intrinsic;
-
-def int_arm64_neon_st2 : AdvSIMD_2Vec_Store_Intrinsic;
-def int_arm64_neon_st3 : AdvSIMD_3Vec_Store_Intrinsic;
-def int_arm64_neon_st4 :
AdvSIMD_4Vec_Store_Intrinsic; - -def int_arm64_neon_st2lane : AdvSIMD_2Vec_Store_Lane_Intrinsic; -def int_arm64_neon_st3lane : AdvSIMD_3Vec_Store_Lane_Intrinsic; -def int_arm64_neon_st4lane : AdvSIMD_4Vec_Store_Lane_Intrinsic; - -let TargetPrefix = "arm64" in { // All intrinsics start with "llvm.arm64.". - class AdvSIMD_Tbl1_Intrinsic - : Intrinsic<[llvm_anyvector_ty], [llvm_v16i8_ty, LLVMMatchType<0>], - [IntrNoMem]>; - class AdvSIMD_Tbl2_Intrinsic - : Intrinsic<[llvm_anyvector_ty], - [llvm_v16i8_ty, llvm_v16i8_ty, LLVMMatchType<0>], [IntrNoMem]>; - class AdvSIMD_Tbl3_Intrinsic - : Intrinsic<[llvm_anyvector_ty], - [llvm_v16i8_ty, llvm_v16i8_ty, llvm_v16i8_ty, - LLVMMatchType<0>], - [IntrNoMem]>; - class AdvSIMD_Tbl4_Intrinsic - : Intrinsic<[llvm_anyvector_ty], - [llvm_v16i8_ty, llvm_v16i8_ty, llvm_v16i8_ty, llvm_v16i8_ty, - LLVMMatchType<0>], - [IntrNoMem]>; - - class AdvSIMD_Tbx1_Intrinsic - : Intrinsic<[llvm_anyvector_ty], - [LLVMMatchType<0>, llvm_v16i8_ty, LLVMMatchType<0>], - [IntrNoMem]>; - class AdvSIMD_Tbx2_Intrinsic - : Intrinsic<[llvm_anyvector_ty], - [LLVMMatchType<0>, llvm_v16i8_ty, llvm_v16i8_ty, - LLVMMatchType<0>], - [IntrNoMem]>; - class AdvSIMD_Tbx3_Intrinsic - : Intrinsic<[llvm_anyvector_ty], - [LLVMMatchType<0>, llvm_v16i8_ty, llvm_v16i8_ty, - llvm_v16i8_ty, LLVMMatchType<0>], - [IntrNoMem]>; - class AdvSIMD_Tbx4_Intrinsic - : Intrinsic<[llvm_anyvector_ty], - [LLVMMatchType<0>, llvm_v16i8_ty, llvm_v16i8_ty, - llvm_v16i8_ty, llvm_v16i8_ty, LLVMMatchType<0>], - [IntrNoMem]>; -} -def int_arm64_neon_tbl1 : AdvSIMD_Tbl1_Intrinsic; -def int_arm64_neon_tbl2 : AdvSIMD_Tbl2_Intrinsic; -def int_arm64_neon_tbl3 : AdvSIMD_Tbl3_Intrinsic; -def int_arm64_neon_tbl4 : AdvSIMD_Tbl4_Intrinsic; - -def int_arm64_neon_tbx1 : AdvSIMD_Tbx1_Intrinsic; -def int_arm64_neon_tbx2 : AdvSIMD_Tbx2_Intrinsic; -def int_arm64_neon_tbx3 : AdvSIMD_Tbx3_Intrinsic; -def int_arm64_neon_tbx4 : AdvSIMD_Tbx4_Intrinsic; - -let TargetPrefix = "arm64" in { - class Crypto_AES_DataKey_Intrinsic - : Intrinsic<[llvm_v16i8_ty], [llvm_v16i8_ty, llvm_v16i8_ty], [IntrNoMem]>; - - class Crypto_AES_Data_Intrinsic - : Intrinsic<[llvm_v16i8_ty], [llvm_v16i8_ty], [IntrNoMem]>; - - // SHA intrinsic taking 5 words of the hash (v4i32, i32) and 4 of the schedule - // (v4i32). - class Crypto_SHA_5Hash4Schedule_Intrinsic - : Intrinsic<[llvm_v4i32_ty], [llvm_v4i32_ty, llvm_i32_ty, llvm_v4i32_ty], - [IntrNoMem]>; - - // SHA intrinsic taking 5 words of the hash (v4i32, i32) and 4 of the schedule - // (v4i32). - class Crypto_SHA_1Hash_Intrinsic - : Intrinsic<[llvm_i32_ty], [llvm_i32_ty], [IntrNoMem]>; - - // SHA intrinsic taking 8 words of the schedule - class Crypto_SHA_8Schedule_Intrinsic - : Intrinsic<[llvm_v4i32_ty], [llvm_v4i32_ty, llvm_v4i32_ty], [IntrNoMem]>; - - // SHA intrinsic taking 12 words of the schedule - class Crypto_SHA_12Schedule_Intrinsic - : Intrinsic<[llvm_v4i32_ty], [llvm_v4i32_ty, llvm_v4i32_ty, llvm_v4i32_ty], - [IntrNoMem]>; - - // SHA intrinsic taking 8 words of the hash and 4 of the schedule. 
- class Crypto_SHA_8Hash4Schedule_Intrinsic - : Intrinsic<[llvm_v4i32_ty], [llvm_v4i32_ty, llvm_v4i32_ty, llvm_v4i32_ty], - [IntrNoMem]>; -} - -// AES -def int_arm64_crypto_aese : Crypto_AES_DataKey_Intrinsic; -def int_arm64_crypto_aesd : Crypto_AES_DataKey_Intrinsic; -def int_arm64_crypto_aesmc : Crypto_AES_Data_Intrinsic; -def int_arm64_crypto_aesimc : Crypto_AES_Data_Intrinsic; - -// SHA1 -def int_arm64_crypto_sha1c : Crypto_SHA_5Hash4Schedule_Intrinsic; -def int_arm64_crypto_sha1p : Crypto_SHA_5Hash4Schedule_Intrinsic; -def int_arm64_crypto_sha1m : Crypto_SHA_5Hash4Schedule_Intrinsic; -def int_arm64_crypto_sha1h : Crypto_SHA_1Hash_Intrinsic; - -def int_arm64_crypto_sha1su0 : Crypto_SHA_12Schedule_Intrinsic; -def int_arm64_crypto_sha1su1 : Crypto_SHA_8Schedule_Intrinsic; - -// SHA256 -def int_arm64_crypto_sha256h : Crypto_SHA_8Hash4Schedule_Intrinsic; -def int_arm64_crypto_sha256h2 : Crypto_SHA_8Hash4Schedule_Intrinsic; -def int_arm64_crypto_sha256su0 : Crypto_SHA_8Schedule_Intrinsic; -def int_arm64_crypto_sha256su1 : Crypto_SHA_12Schedule_Intrinsic; - -//===----------------------------------------------------------------------===// -// CRC32 - -let TargetPrefix = "arm64" in { - -def int_arm64_crc32b : Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], - [IntrNoMem]>; -def int_arm64_crc32cb : Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], - [IntrNoMem]>; -def int_arm64_crc32h : Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], - [IntrNoMem]>; -def int_arm64_crc32ch : Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], - [IntrNoMem]>; -def int_arm64_crc32w : Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], - [IntrNoMem]>; -def int_arm64_crc32cw : Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], - [IntrNoMem]>; -def int_arm64_crc32x : Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i64_ty], - [IntrNoMem]>; -def int_arm64_crc32cx : Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i64_ty], - [IntrNoMem]>; -} diff --git a/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldMachO.cpp b/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldMachO.cpp index a70b03d95cf..2b425fbdd33 100644 --- a/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldMachO.cpp +++ b/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldMachO.cpp @@ -168,8 +168,9 @@ void RuntimeDyldMachO::resolveRelocation(const RelocationEntry &RE, case Triple::thumb: resolveARMRelocation(RE, Value); break; + case Triple::aarch64: case Triple::arm64: - resolveARM64Relocation(RE, Value); + resolveAArch64Relocation(RE, Value); break; } } @@ -289,8 +290,8 @@ bool RuntimeDyldMachO::resolveARMRelocation(const RelocationEntry &RE, return false; } -bool RuntimeDyldMachO::resolveARM64Relocation(const RelocationEntry &RE, - uint64_t Value) { +bool RuntimeDyldMachO::resolveAArch64Relocation(const RelocationEntry &RE, + uint64_t Value) { const SectionEntry &Section = Sections[RE.SectionID]; uint8_t* LocalAddress = Section.Address + RE.Offset; diff --git a/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldMachO.h b/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldMachO.h index 08573eed5c8..060eb8c29a2 100644 --- a/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldMachO.h +++ b/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldMachO.h @@ -41,7 +41,7 @@ private: bool resolveI386Relocation(const RelocationEntry &RE, uint64_t Value); bool resolveX86_64Relocation(const RelocationEntry &RE, uint64_t Value); bool resolveARMRelocation(const RelocationEntry &RE, uint64_t Value); - bool resolveARM64Relocation(const RelocationEntry &RE, uint64_t Value); + bool resolveAArch64Relocation(const RelocationEntry &RE, 
uint64_t Value); // Populate stubs in __jump_table section. void populateJumpTable(MachOObjectFile &Obj, const SectionRef &JTSection, diff --git a/lib/LTO/LTOCodeGenerator.cpp b/lib/LTO/LTOCodeGenerator.cpp index 028c1912717..99236bd24ea 100644 --- a/lib/LTO/LTOCodeGenerator.cpp +++ b/lib/LTO/LTOCodeGenerator.cpp @@ -312,7 +312,8 @@ bool LTOCodeGenerator::determineTarget(std::string &errMsg) { MCpu = "core2"; else if (Triple.getArch() == llvm::Triple::x86) MCpu = "yonah"; - else if (Triple.getArch() == llvm::Triple::arm64) + else if (Triple.getArch() == llvm::Triple::arm64 || + Triple.getArch() == llvm::Triple::aarch64) MCpu = "cyclone"; } diff --git a/lib/LTO/LTOModule.cpp b/lib/LTO/LTOModule.cpp index 255951a7070..d1175142651 100644 --- a/lib/LTO/LTOModule.cpp +++ b/lib/LTO/LTOModule.cpp @@ -168,7 +168,8 @@ LTOModule *LTOModule::makeLTOModule(MemoryBuffer *buffer, CPU = "core2"; else if (Triple.getArch() == llvm::Triple::x86) CPU = "yonah"; - else if (Triple.getArch() == llvm::Triple::arm64) + else if (Triple.getArch() == llvm::Triple::arm64 || + Triple.getArch() == llvm::Triple::aarch64) CPU = "cyclone"; } diff --git a/lib/MC/MCObjectFileInfo.cpp b/lib/MC/MCObjectFileInfo.cpp index bb132799504..9d413afe5db 100644 --- a/lib/MC/MCObjectFileInfo.cpp +++ b/lib/MC/MCObjectFileInfo.cpp @@ -23,7 +23,8 @@ void MCObjectFileInfo::InitMachOMCObjectFileInfo(Triple T) { IsFunctionEHFrameSymbolPrivate = false; SupportsWeakOmittedEHFrame = false; - if (T.isOSDarwin() && T.getArch() == Triple::arm64) + if (T.isOSDarwin() && + (T.getArch() == Triple::arm64 || T.getArch() == Triple::aarch64)) SupportsCompactUnwindWithoutEHFrame = true; PersonalityEncoding = dwarf::DW_EH_PE_indirect | dwarf::DW_EH_PE_pcrel @@ -151,7 +152,8 @@ void MCObjectFileInfo::InitMachOMCObjectFileInfo(Triple T) { COFFDebugSymbolsSection = nullptr; if ((T.isMacOSX() && !T.isMacOSXVersionLT(10, 6)) || - (T.isOSDarwin() && T.getArch() == Triple::arm64)) { + (T.isOSDarwin() && + (T.getArch() == Triple::arm64 || T.getArch() == Triple::aarch64))) { CompactUnwindSection = Ctx->getMachOSection("__LD", "__compact_unwind", MachO::S_ATTR_DEBUG, @@ -159,7 +161,7 @@ void MCObjectFileInfo::InitMachOMCObjectFileInfo(Triple T) { if (T.getArch() == Triple::x86_64 || T.getArch() == Triple::x86) CompactUnwindDwarfEHFrameOnly = 0x04000000; - else if (T.getArch() == Triple::arm64) + else if (T.getArch() == Triple::arm64 || T.getArch() == Triple::aarch64) CompactUnwindDwarfEHFrameOnly = 0x03000000; } @@ -785,7 +787,7 @@ void MCObjectFileInfo::InitMCObjectFileInfo(StringRef TT, Reloc::Model relocm, // cellspu-apple-darwin. Perhaps we should fix in Triple? if ((Arch == Triple::x86 || Arch == Triple::x86_64 || Arch == Triple::arm || Arch == Triple::thumb || - Arch == Triple::arm64 || + Arch == Triple::arm64 || Arch == Triple::aarch64 || Arch == Triple::ppc || Arch == Triple::ppc64 || Arch == Triple::UnknownArch) && (T.isOSDarwin() || T.isOSBinFormatMachO())) { diff --git a/lib/Target/AArch64/AArch64.h b/lib/Target/AArch64/AArch64.h new file mode 100644 index 00000000000..1c022aaf86b --- /dev/null +++ b/lib/Target/AArch64/AArch64.h @@ -0,0 +1,49 @@ +//==-- AArch64.h - Top-level interface for AArch64 --------------*- C++ -*-==// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
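(Illustrative check, not part of the patch: Triple::arm64 and Triple::aarch64 remain distinct ArchType values, which is why the RuntimeDyld, LTO and Mach-O checks above now accept either spelling.)

#include "llvm/ADT/Triple.h"
#include <cassert>

// Both triples now select the unified AArch64 backend, but they still parse
// to different ArchType enumerators, so target-independent code tests for both.
static bool isAArch64Arch(const llvm::Triple &T) {
  return T.getArch() == llvm::Triple::aarch64 ||
         T.getArch() == llvm::Triple::arm64;
}

int main() {
  assert(isAArch64Arch(llvm::Triple("arm64-apple-ios")));        // Darwin spelling
  assert(isAArch64Arch(llvm::Triple("aarch64-none-linux-gnu"))); // ELF spelling
  return 0;
}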
+// +//===----------------------------------------------------------------------===// +// +// This file contains the entry points for global functions defined in the LLVM +// AArch64 back-end. +// +//===----------------------------------------------------------------------===// + +#ifndef TARGET_AArch64_H +#define TARGET_AArch64_H + +#include "Utils/AArch64BaseInfo.h" +#include "MCTargetDesc/AArch64MCTargetDesc.h" +#include "llvm/Target/TargetMachine.h" +#include "llvm/Support/DataTypes.h" + +namespace llvm { + +class AArch64TargetMachine; +class FunctionPass; +class MachineFunctionPass; + +FunctionPass *createAArch64DeadRegisterDefinitions(); +FunctionPass *createAArch64ConditionalCompares(); +FunctionPass *createAArch64AdvSIMDScalar(); +FunctionPass *createAArch64BranchRelaxation(); +FunctionPass *createAArch64ISelDag(AArch64TargetMachine &TM, + CodeGenOpt::Level OptLevel); +FunctionPass *createAArch64StorePairSuppressPass(); +FunctionPass *createAArch64ExpandPseudoPass(); +FunctionPass *createAArch64LoadStoreOptimizationPass(); +ModulePass *createAArch64PromoteConstantPass(); +FunctionPass *createAArch64AddressTypePromotionPass(); +/// \brief Creates an ARM-specific Target Transformation Info pass. +ImmutablePass * +createAArch64TargetTransformInfoPass(const AArch64TargetMachine *TM); + +FunctionPass *createAArch64CleanupLocalDynamicTLSPass(); + +FunctionPass *createAArch64CollectLOHPass(); +} // end namespace llvm + +#endif diff --git a/lib/Target/AArch64/AArch64.td b/lib/Target/AArch64/AArch64.td new file mode 100644 index 00000000000..1ad5ac8c6f3 --- /dev/null +++ b/lib/Target/AArch64/AArch64.td @@ -0,0 +1,134 @@ +//=- AArch64.td - Describe the AArch64 Target Machine --------*- tablegen -*-=// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// +//===----------------------------------------------------------------------===// + +//===----------------------------------------------------------------------===// +// Target-independent interfaces which we are implementing +//===----------------------------------------------------------------------===// + +include "llvm/Target/Target.td" + +//===----------------------------------------------------------------------===// +// AArch64 Subtarget features. +// + +def FeatureFPARMv8 : SubtargetFeature<"fp-armv8", "HasFPARMv8", "true", + "Enable ARMv8 FP">; + +def FeatureNEON : SubtargetFeature<"neon", "HasNEON", "true", + "Enable Advanced SIMD instructions", [FeatureFPARMv8]>; + +def FeatureCrypto : SubtargetFeature<"crypto", "HasCrypto", "true", + "Enable cryptographic instructions">; + +def FeatureCRC : SubtargetFeature<"crc", "HasCRC", "true", + "Enable ARMv8 CRC-32 checksum instructions">; + +/// Cyclone has register move instructions which are "free". +def FeatureZCRegMove : SubtargetFeature<"zcm", "HasZeroCycleRegMove", "true", + "Has zero-cycle register moves">; + +/// Cyclone has instructions which zero registers for "free". 
+def FeatureZCZeroing : SubtargetFeature<"zcz", "HasZeroCycleZeroing", "true", + "Has zero-cycle zeroing instructions">; + +//===----------------------------------------------------------------------===// +// Register File Description +//===----------------------------------------------------------------------===// + +include "AArch64RegisterInfo.td" +include "AArch64CallingConvention.td" + +//===----------------------------------------------------------------------===// +// Instruction Descriptions +//===----------------------------------------------------------------------===// + +include "AArch64Schedule.td" +include "AArch64InstrInfo.td" + +def AArch64InstrInfo : InstrInfo; + +//===----------------------------------------------------------------------===// +// AArch64 Processors supported. +// +include "AArch64SchedA53.td" +include "AArch64SchedCyclone.td" + +def ProcA53 : SubtargetFeature<"a53", "ARMProcFamily", "CortexA53", + "Cortex-A53 ARM processors", + [FeatureFPARMv8, + FeatureNEON, + FeatureCrypto, + FeatureCRC]>; + +def ProcA57 : SubtargetFeature<"a57", "ARMProcFamily", "CortexA57", + "Cortex-A57 ARM processors", + [FeatureFPARMv8, + FeatureNEON, + FeatureCrypto, + FeatureCRC]>; + +def ProcCyclone : SubtargetFeature<"cyclone", "ARMProcFamily", "Cyclone", + "Cyclone", + [FeatureFPARMv8, + FeatureNEON, + FeatureCrypto, + FeatureCRC, + FeatureZCRegMove, FeatureZCZeroing]>; + +def : ProcessorModel<"generic", NoSchedModel, [FeatureFPARMv8, + FeatureNEON, + FeatureCRC]>; + +def : ProcessorModel<"cortex-a53", CortexA53Model, [ProcA53]>; +def : ProcessorModel<"cortex-a57", NoSchedModel, [ProcA57]>; +def : ProcessorModel<"cyclone", CycloneModel, [ProcCyclone]>; + +//===----------------------------------------------------------------------===// +// Assembly parser +//===----------------------------------------------------------------------===// + +def GenericAsmParserVariant : AsmParserVariant { + int Variant = 0; + string Name = "generic"; +} + +def AppleAsmParserVariant : AsmParserVariant { + int Variant = 1; + string Name = "apple-neon"; +} + +//===----------------------------------------------------------------------===// +// Assembly printer +//===----------------------------------------------------------------------===// +// AArch64 Uses the MC printer for asm output, so make sure the TableGen +// AsmWriter bits get associated with the correct class. +def GenericAsmWriter : AsmWriter { + string AsmWriterClassName = "InstPrinter"; + int Variant = 0; + bit isMCAsmWriter = 1; +} + +def AppleAsmWriter : AsmWriter { + let AsmWriterClassName = "AppleInstPrinter"; + int Variant = 1; + int isMCAsmWriter = 1; +} + +//===----------------------------------------------------------------------===// +// Target Declaration +//===----------------------------------------------------------------------===// + +def AArch64 : Target { + let InstructionSet = AArch64InstrInfo; + let AssemblyParserVariants = [GenericAsmParserVariant, AppleAsmParserVariant]; + let AssemblyWriters = [GenericAsmWriter, AppleAsmWriter]; +} diff --git a/lib/Target/AArch64/AArch64AddressTypePromotion.cpp b/lib/Target/AArch64/AArch64AddressTypePromotion.cpp new file mode 100644 index 00000000000..04906f6078f --- /dev/null +++ b/lib/Target/AArch64/AArch64AddressTypePromotion.cpp @@ -0,0 +1,492 @@ +//===-- AArch64AddressTypePromotion.cpp --- Promote type for addr accesses -==// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. 
See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This pass tries to promote the computations use to obtained a sign extended +// value used into memory accesses. +// E.g. +// a = add nsw i32 b, 3 +// d = sext i32 a to i64 +// e = getelementptr ..., i64 d +// +// => +// f = sext i32 b to i64 +// a = add nsw i64 f, 3 +// e = getelementptr ..., i64 a +// +// This is legal to do so if the computations are markers with either nsw or nuw +// markers. +// Moreover, the current heuristic is simple: it does not create new sext +// operations, i.e., it gives up when a sext would have forked (e.g., if +// a = add i32 b, c, two sexts are required to promote the computation). +// +// FIXME: This pass may be useful for other targets too. +// ===---------------------------------------------------------------------===// + +#include "AArch64.h" +#include "llvm/ADT/DenseMap.h" +#include "llvm/ADT/SmallPtrSet.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/IR/Constants.h" +#include "llvm/IR/Dominators.h" +#include "llvm/IR/Function.h" +#include "llvm/IR/Instructions.h" +#include "llvm/IR/Module.h" +#include "llvm/IR/Operator.h" +#include "llvm/Pass.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Support/Debug.h" + +using namespace llvm; + +#define DEBUG_TYPE "aarch64-type-promotion" + +static cl::opt +EnableAddressTypePromotion("aarch64-type-promotion", cl::Hidden, + cl::desc("Enable the type promotion pass"), + cl::init(true)); +static cl::opt +EnableMerge("aarch64-type-promotion-merge", cl::Hidden, + cl::desc("Enable merging of redundant sexts when one is dominating" + " the other."), + cl::init(true)); + +//===----------------------------------------------------------------------===// +// AArch64AddressTypePromotion +//===----------------------------------------------------------------------===// + +namespace llvm { +void initializeAArch64AddressTypePromotionPass(PassRegistry &); +} + +namespace { +class AArch64AddressTypePromotion : public FunctionPass { + +public: + static char ID; + AArch64AddressTypePromotion() + : FunctionPass(ID), Func(nullptr), ConsideredSExtType(nullptr) { + initializeAArch64AddressTypePromotionPass(*PassRegistry::getPassRegistry()); + } + + const char *getPassName() const override { + return "AArch64 Address Type Promotion"; + } + + /// Iterate over the functions and promote the computation of interesting + // sext instructions. + bool runOnFunction(Function &F) override; + +private: + /// The current function. + Function *Func; + /// Filter out all sexts that does not have this type. + /// Currently initialized with Int64Ty. + Type *ConsideredSExtType; + + // This transformation requires dominator info. + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.setPreservesCFG(); + AU.addRequired(); + AU.addPreserved(); + FunctionPass::getAnalysisUsage(AU); + } + + typedef SmallPtrSet SetOfInstructions; + typedef SmallVector Instructions; + typedef DenseMap ValueToInsts; + + /// Check if it is profitable to move a sext through this instruction. + /// Currently, we consider it is profitable if: + /// - Inst is used only once (no need to insert truncate). + /// - Inst has only one operand that will require a sext operation (we do + /// do not create new sext operation). + bool shouldGetThrough(const Instruction *Inst); + + /// Check if it is possible and legal to move a sext through this + /// instruction. 
+ /// Current heuristic considers that we can get through: + /// - Arithmetic operation marked with the nsw or nuw flag. + /// - Other sext operation. + /// - Truncate operation if it was just dropping sign extended bits. + bool canGetThrough(const Instruction *Inst); + + /// Move sext operations through safe to sext instructions. + bool propagateSignExtension(Instructions &SExtInsts); + + /// Is this sext should be considered for code motion. + /// We look for sext with ConsideredSExtType and uses in at least one + // GetElementPtrInst. + bool shouldConsiderSExt(const Instruction *SExt) const; + + /// Collect all interesting sext operations, i.e., the ones with the right + /// type and used in memory accesses. + /// More precisely, a sext instruction is considered as interesting if it + /// is used in a "complex" getelementptr or it exits at least another + /// sext instruction that sign extended the same initial value. + /// A getelementptr is considered as "complex" if it has more than 2 + // operands. + void analyzeSExtension(Instructions &SExtInsts); + + /// Merge redundant sign extension operations in common dominator. + void mergeSExts(ValueToInsts &ValToSExtendedUses, + SetOfInstructions &ToRemove); +}; +} // end anonymous namespace. + +char AArch64AddressTypePromotion::ID = 0; + +INITIALIZE_PASS_BEGIN(AArch64AddressTypePromotion, "aarch64-type-promotion", + "AArch64 Type Promotion Pass", false, false) +INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) +INITIALIZE_PASS_END(AArch64AddressTypePromotion, "aarch64-type-promotion", + "AArch64 Type Promotion Pass", false, false) + +FunctionPass *llvm::createAArch64AddressTypePromotionPass() { + return new AArch64AddressTypePromotion(); +} + +bool AArch64AddressTypePromotion::canGetThrough(const Instruction *Inst) { + if (isa(Inst)) + return true; + + const BinaryOperator *BinOp = dyn_cast(Inst); + if (BinOp && isa(BinOp) && + (BinOp->hasNoUnsignedWrap() || BinOp->hasNoSignedWrap())) + return true; + + // sext(trunc(sext)) --> sext + if (isa(Inst) && isa(Inst->getOperand(0))) { + const Instruction *Opnd = cast(Inst->getOperand(0)); + // Check that the truncate just drop sign extended bits. + if (Inst->getType()->getIntegerBitWidth() >= + Opnd->getOperand(0)->getType()->getIntegerBitWidth() && + Inst->getOperand(0)->getType()->getIntegerBitWidth() <= + ConsideredSExtType->getIntegerBitWidth()) + return true; + } + + return false; +} + +bool AArch64AddressTypePromotion::shouldGetThrough(const Instruction *Inst) { + // If the type of the sext is the same as the considered one, this sext + // will become useless. + // Otherwise, we will have to do something to preserve the original value, + // unless it is used once. + if (isa(Inst) && + (Inst->getType() == ConsideredSExtType || Inst->hasOneUse())) + return true; + + // If the Inst is used more that once, we may need to insert truncate + // operations and we don't do that at the moment. + if (!Inst->hasOneUse()) + return false; + + // This truncate is used only once, thus if we can get thourgh, it will become + // useless. + if (isa(Inst)) + return true; + + // If both operands are not constant, a new sext will be created here. + // Current heuristic is: each step should be profitable. + // Therefore we don't allow to increase the number of sext even if it may + // be profitable later on. 
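(A small numeric aside, not part of the patch, showing why the nsw/nuw requirement above matters: if the narrow add may wrap, hoisting the sext changes the result.)

#include <cassert>
#include <cstdint>
#include <limits>

int main() {
  int32_t a = std::numeric_limits<int32_t>::max();
  int32_t b = 1;

  // sext(add i32 a, b): the 32-bit add wraps to INT32_MIN before extension.
  int32_t wrapped = static_cast<int32_t>(static_cast<uint32_t>(a) +
                                         static_cast<uint32_t>(b));
  int64_t sextOfAdd = static_cast<int64_t>(wrapped);

  // add i64 (sext a), (sext b): the form produced when the sext is hoisted.
  int64_t addOfSext = static_cast<int64_t>(a) + static_cast<int64_t>(b);

  assert(sextOfAdd == std::numeric_limits<int32_t>::min());
  assert(addOfSext == 2147483648LL);
  assert(sextOfAdd != addOfSext); // hence only nsw/nuw adds may be promoted
  return 0;
}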
+ if (isa(Inst) && isa(Inst->getOperand(1))) + return true; + + return false; +} + +static bool shouldSExtOperand(const Instruction *Inst, int OpIdx) { + if (isa(Inst) && OpIdx == 0) + return false; + return true; +} + +bool +AArch64AddressTypePromotion::shouldConsiderSExt(const Instruction *SExt) const { + if (SExt->getType() != ConsideredSExtType) + return false; + + for (const Use &U : SExt->uses()) { + if (isa(*U)) + return true; + } + + return false; +} + +// Input: +// - SExtInsts contains all the sext instructions that are use direclty in +// GetElementPtrInst, i.e., access to memory. +// Algorithm: +// - For each sext operation in SExtInsts: +// Let var be the operand of sext. +// while it is profitable (see shouldGetThrough), legal, and safe +// (see canGetThrough) to move sext through var's definition: +// * promote the type of var's definition. +// * fold var into sext uses. +// * move sext above var's definition. +// * update sext operand to use the operand of var that should be sign +// extended (by construction there is only one). +// +// E.g., +// a = ... i32 c, 3 +// b = sext i32 a to i64 <- is it legal/safe/profitable to get through 'a' +// ... +// = b +// => Yes, update the code +// b = sext i32 c to i64 +// a = ... i64 b, 3 +// ... +// = a +// Iterate on 'c'. +bool +AArch64AddressTypePromotion::propagateSignExtension(Instructions &SExtInsts) { + DEBUG(dbgs() << "*** Propagate Sign Extension ***\n"); + + bool LocalChange = false; + SetOfInstructions ToRemove; + ValueToInsts ValToSExtendedUses; + while (!SExtInsts.empty()) { + // Get through simple chain. + Instruction *SExt = SExtInsts.pop_back_val(); + + DEBUG(dbgs() << "Consider:\n" << *SExt << '\n'); + + // If this SExt has already been merged continue. + if (SExt->use_empty() && ToRemove.count(SExt)) { + DEBUG(dbgs() << "No uses => marked as delete\n"); + continue; + } + + // Now try to get through the chain of definitions. + while (isa(SExt->getOperand(0))) { + Instruction *Inst = dyn_cast(SExt->getOperand(0)); + DEBUG(dbgs() << "Try to get through:\n" << *Inst << '\n'); + if (!canGetThrough(Inst) || !shouldGetThrough(Inst)) { + // We cannot get through something that is not an Instruction + // or not safe to SExt. + DEBUG(dbgs() << "Cannot get through\n"); + break; + } + + LocalChange = true; + // If this is a sign extend, it becomes useless. + if (isa(Inst) || isa(Inst)) { + DEBUG(dbgs() << "SExt or trunc, mark it as to remove\n"); + // We cannot use replaceAllUsesWith here because we may trigger some + // assertion on the type as all involved sext operation may have not + // been moved yet. + while (!Inst->use_empty()) { + Value::use_iterator UseIt = Inst->use_begin(); + Instruction *UseInst = dyn_cast(*UseIt); + assert(UseInst && "Use of sext is not an Instruction!"); + UseInst->setOperand(UseIt->getOperandNo(), SExt); + } + ToRemove.insert(Inst); + SExt->setOperand(0, Inst->getOperand(0)); + SExt->moveBefore(Inst); + continue; + } + + // Get through the Instruction: + // 1. Update its type. + // 2. Replace the uses of SExt by Inst. + // 3. Sign extend each operand that needs to be sign extended. + + // Step #1. + Inst->mutateType(SExt->getType()); + // Step #2. + SExt->replaceAllUsesWith(Inst); + // Step #3. 
+ Instruction *SExtForOpnd = SExt; + + DEBUG(dbgs() << "Propagate SExt to operands\n"); + for (int OpIdx = 0, EndOpIdx = Inst->getNumOperands(); OpIdx != EndOpIdx; + ++OpIdx) { + DEBUG(dbgs() << "Operand:\n" << *(Inst->getOperand(OpIdx)) << '\n'); + if (Inst->getOperand(OpIdx)->getType() == SExt->getType() || + !shouldSExtOperand(Inst, OpIdx)) { + DEBUG(dbgs() << "No need to propagate\n"); + continue; + } + // Check if we can statically sign extend the operand. + Value *Opnd = Inst->getOperand(OpIdx); + if (const ConstantInt *Cst = dyn_cast(Opnd)) { + DEBUG(dbgs() << "Statically sign extend\n"); + Inst->setOperand(OpIdx, ConstantInt::getSigned(SExt->getType(), + Cst->getSExtValue())); + continue; + } + // UndefValue are typed, so we have to statically sign extend them. + if (isa(Opnd)) { + DEBUG(dbgs() << "Statically sign extend\n"); + Inst->setOperand(OpIdx, UndefValue::get(SExt->getType())); + continue; + } + + // Otherwise we have to explicity sign extend it. + assert(SExtForOpnd && + "Only one operand should have been sign extended"); + + SExtForOpnd->setOperand(0, Opnd); + + DEBUG(dbgs() << "Move before:\n" << *Inst << "\nSign extend\n"); + // Move the sign extension before the insertion point. + SExtForOpnd->moveBefore(Inst); + Inst->setOperand(OpIdx, SExtForOpnd); + // If more sext are required, new instructions will have to be created. + SExtForOpnd = nullptr; + } + if (SExtForOpnd == SExt) { + DEBUG(dbgs() << "Sign extension is useless now\n"); + ToRemove.insert(SExt); + break; + } + } + + // If the use is already of the right type, connect its uses to its argument + // and delete it. + // This can happen for an Instruction which all uses are sign extended. + if (!ToRemove.count(SExt) && + SExt->getType() == SExt->getOperand(0)->getType()) { + DEBUG(dbgs() << "Sign extension is useless, attach its use to " + "its argument\n"); + SExt->replaceAllUsesWith(SExt->getOperand(0)); + ToRemove.insert(SExt); + } else + ValToSExtendedUses[SExt->getOperand(0)].push_back(SExt); + } + + if (EnableMerge) + mergeSExts(ValToSExtendedUses, ToRemove); + + // Remove all instructions marked as ToRemove. + for (Instruction *I: ToRemove) + I->eraseFromParent(); + return LocalChange; +} + +void AArch64AddressTypePromotion::mergeSExts(ValueToInsts &ValToSExtendedUses, + SetOfInstructions &ToRemove) { + DominatorTree &DT = getAnalysis().getDomTree(); + + for (auto &Entry : ValToSExtendedUses) { + Instructions &Insts = Entry.second; + Instructions CurPts; + for (Instruction *Inst : Insts) { + if (ToRemove.count(Inst)) + continue; + bool inserted = false; + for (auto Pt : CurPts) { + if (DT.dominates(Inst, Pt)) { + DEBUG(dbgs() << "Replace all uses of:\n" << *Pt << "\nwith:\n" + << *Inst << '\n'); + (Pt)->replaceAllUsesWith(Inst); + ToRemove.insert(Pt); + Pt = Inst; + inserted = true; + break; + } + if (!DT.dominates(Pt, Inst)) + // Give up if we need to merge in a common dominator as the + // expermients show it is not profitable. + continue; + + DEBUG(dbgs() << "Replace all uses of:\n" << *Inst << "\nwith:\n" + << *Pt << '\n'); + Inst->replaceAllUsesWith(Pt); + ToRemove.insert(Inst); + inserted = true; + break; + } + if (!inserted) + CurPts.push_back(Inst); + } + } +} + +void AArch64AddressTypePromotion::analyzeSExtension(Instructions &SExtInsts) { + DEBUG(dbgs() << "*** Analyze Sign Extensions ***\n"); + + DenseMap SeenChains; + + for (auto &BB : *Func) { + for (auto &II : BB) { + Instruction *SExt = &II; + + // Collect all sext operation per type. 
+ if (!isa(SExt) || !shouldConsiderSExt(SExt)) + continue; + + DEBUG(dbgs() << "Found:\n" << (*SExt) << '\n'); + + // Cases where we actually perform the optimization: + // 1. SExt is used in a getelementptr with more than 2 operand => + // likely we can merge some computation if they are done on 64 bits. + // 2. The beginning of the SExt chain is SExt several time. => + // code sharing is possible. + + bool insert = false; + // #1. + for (const Use &U : SExt->uses()) { + const Instruction *Inst = dyn_cast(U); + if (Inst && Inst->getNumOperands() > 2) { + DEBUG(dbgs() << "Interesting use in GetElementPtrInst\n" << *Inst + << '\n'); + insert = true; + break; + } + } + + // #2. + // Check the head of the chain. + Instruction *Inst = SExt; + Value *Last; + do { + int OpdIdx = 0; + const BinaryOperator *BinOp = dyn_cast(Inst); + if (BinOp && isa(BinOp->getOperand(0))) + OpdIdx = 1; + Last = Inst->getOperand(OpdIdx); + Inst = dyn_cast(Last); + } while (Inst && canGetThrough(Inst) && shouldGetThrough(Inst)); + + DEBUG(dbgs() << "Head of the chain:\n" << *Last << '\n'); + DenseMap::iterator AlreadySeen = + SeenChains.find(Last); + if (insert || AlreadySeen != SeenChains.end()) { + DEBUG(dbgs() << "Insert\n"); + SExtInsts.push_back(SExt); + if (AlreadySeen != SeenChains.end() && AlreadySeen->second != nullptr) { + DEBUG(dbgs() << "Insert chain member\n"); + SExtInsts.push_back(AlreadySeen->second); + SeenChains[Last] = nullptr; + } + } else { + DEBUG(dbgs() << "Record its chain membership\n"); + SeenChains[Last] = SExt; + } + } + } +} + +bool AArch64AddressTypePromotion::runOnFunction(Function &F) { + if (!EnableAddressTypePromotion || F.isDeclaration()) + return false; + Func = &F; + ConsideredSExtType = Type::getInt64Ty(Func->getContext()); + + DEBUG(dbgs() << "*** " << getPassName() << ": " << Func->getName() << '\n'); + + Instructions SExtInsts; + analyzeSExtension(SExtInsts); + return propagateSignExtension(SExtInsts); +} diff --git a/lib/Target/AArch64/AArch64AdvSIMDScalarPass.cpp b/lib/Target/AArch64/AArch64AdvSIMDScalarPass.cpp new file mode 100644 index 00000000000..734fb215e6e --- /dev/null +++ b/lib/Target/AArch64/AArch64AdvSIMDScalarPass.cpp @@ -0,0 +1,387 @@ +//===-- AArch64AdvSIMDScalar.cpp - Replace dead defs w/ zero reg --===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// When profitable, replace GPR targeting i64 instructions with their +// AdvSIMD scalar equivalents. Generally speaking, "profitable" is defined +// as minimizing the number of cross-class register copies. +//===----------------------------------------------------------------------===// + +//===----------------------------------------------------------------------===// +// TODO: Graph based predicate heuristics. +// Walking the instruction list linearly will get many, perhaps most, of +// the cases, but to do a truly thorough job of this, we need a more +// wholistic approach. +// +// This optimization is very similar in spirit to the register allocator's +// spill placement, only here we're determining where to place cross-class +// register copies rather than spills. As such, a similar approach is +// called for. +// +// We want to build up a set of graphs of all instructions which are candidates +// for transformation along with instructions which generate their inputs and +// consume their outputs. 
For each edge in the graph, we assign a weight +// based on whether there is a copy required there (weight zero if not) and +// the block frequency of the block containing the defining or using +// instruction, whichever is less. Our optimization is then a graph problem +// to minimize the total weight of all the graphs, then transform instructions +// and add or remove copy instructions as called for to implement the +// solution. +//===----------------------------------------------------------------------===// + +#include "AArch64.h" +#include "AArch64InstrInfo.h" +#include "AArch64RegisterInfo.h" +#include "llvm/ADT/Statistic.h" +#include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/CodeGen/MachineFunction.h" +#include "llvm/CodeGen/MachineInstr.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/raw_ostream.h" +using namespace llvm; + +#define DEBUG_TYPE "aarch64-simd-scalar" + +// Allow forcing all i64 operations with equivalent SIMD instructions to use +// them. For stress-testing the transformation function. +static cl::opt +TransformAll("aarch64-simd-scalar-force-all", + cl::desc("Force use of AdvSIMD scalar instructions everywhere"), + cl::init(false), cl::Hidden); + +STATISTIC(NumScalarInsnsUsed, "Number of scalar instructions used"); +STATISTIC(NumCopiesDeleted, "Number of cross-class copies deleted"); +STATISTIC(NumCopiesInserted, "Number of cross-class copies inserted"); + +namespace { +class AArch64AdvSIMDScalar : public MachineFunctionPass { + MachineRegisterInfo *MRI; + const AArch64InstrInfo *TII; + +private: + // isProfitableToTransform - Predicate function to determine whether an + // instruction should be transformed to its equivalent AdvSIMD scalar + // instruction. "add Xd, Xn, Xm" ==> "add Dd, Da, Db", for example. + bool isProfitableToTransform(const MachineInstr *MI) const; + + // transformInstruction - Perform the transformation of an instruction + // to its equivalant AdvSIMD scalar instruction. Update inputs and outputs + // to be the correct register class, minimizing cross-class copies. + void transformInstruction(MachineInstr *MI); + + // processMachineBasicBlock - Main optimzation loop. + bool processMachineBasicBlock(MachineBasicBlock *MBB); + +public: + static char ID; // Pass identification, replacement for typeid. 
+ explicit AArch64AdvSIMDScalar() : MachineFunctionPass(ID) {} + + bool runOnMachineFunction(MachineFunction &F) override; + + const char *getPassName() const override { + return "AdvSIMD Scalar Operation Optimization"; + } + + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.setPreservesCFG(); + MachineFunctionPass::getAnalysisUsage(AU); + } +}; +char AArch64AdvSIMDScalar::ID = 0; +} // end anonymous namespace + +static bool isGPR64(unsigned Reg, unsigned SubReg, + const MachineRegisterInfo *MRI) { + if (SubReg) + return false; + if (TargetRegisterInfo::isVirtualRegister(Reg)) + return MRI->getRegClass(Reg)->hasSuperClassEq(&AArch64::GPR64RegClass); + return AArch64::GPR64RegClass.contains(Reg); +} + +static bool isFPR64(unsigned Reg, unsigned SubReg, + const MachineRegisterInfo *MRI) { + if (TargetRegisterInfo::isVirtualRegister(Reg)) + return (MRI->getRegClass(Reg)->hasSuperClassEq(&AArch64::FPR64RegClass) && + SubReg == 0) || + (MRI->getRegClass(Reg)->hasSuperClassEq(&AArch64::FPR128RegClass) && + SubReg == AArch64::dsub); + // Physical register references just check the register class directly. + return (AArch64::FPR64RegClass.contains(Reg) && SubReg == 0) || + (AArch64::FPR128RegClass.contains(Reg) && SubReg == AArch64::dsub); +} + +// getSrcFromCopy - Get the original source register for a GPR64 <--> FPR64 +// copy instruction. Return zero_reg if the instruction is not a copy. +static unsigned getSrcFromCopy(const MachineInstr *MI, + const MachineRegisterInfo *MRI, + unsigned &SubReg) { + SubReg = 0; + // The "FMOV Xd, Dn" instruction is the typical form. + if (MI->getOpcode() == AArch64::FMOVDXr || + MI->getOpcode() == AArch64::FMOVXDr) + return MI->getOperand(1).getReg(); + // A lane zero extract "UMOV.d Xd, Vn[0]" is equivalent. We shouldn't see + // these at this stage, but it's easy to check for. + if (MI->getOpcode() == AArch64::UMOVvi64 && MI->getOperand(2).getImm() == 0) { + SubReg = AArch64::dsub; + return MI->getOperand(1).getReg(); + } + // Or just a plain COPY instruction. This can be directly to/from FPR64, + // or it can be a dsub subreg reference to an FPR128. + if (MI->getOpcode() == AArch64::COPY) { + if (isFPR64(MI->getOperand(0).getReg(), MI->getOperand(0).getSubReg(), + MRI) && + isGPR64(MI->getOperand(1).getReg(), MI->getOperand(1).getSubReg(), MRI)) + return MI->getOperand(1).getReg(); + if (isGPR64(MI->getOperand(0).getReg(), MI->getOperand(0).getSubReg(), + MRI) && + isFPR64(MI->getOperand(1).getReg(), MI->getOperand(1).getSubReg(), + MRI)) { + SubReg = MI->getOperand(1).getSubReg(); + return MI->getOperand(1).getReg(); + } + } + + // Otherwise, this is some other kind of instruction. + return 0; +} + +// getTransformOpcode - For any opcode for which there is an AdvSIMD equivalent +// that we're considering transforming to, return that AdvSIMD opcode. For all +// others, return the original opcode. +static int getTransformOpcode(unsigned Opc) { + switch (Opc) { + default: + break; + // FIXME: Lots more possibilities. + case AArch64::ADDXrr: + return AArch64::ADDv1i64; + case AArch64::SUBXrr: + return AArch64::SUBv1i64; + } + // No AdvSIMD equivalent, so just return the original opcode. + return Opc; +} + +static bool isTransformable(const MachineInstr *MI) { + int Opc = MI->getOpcode(); + return Opc != getTransformOpcode(Opc); +} + +// isProfitableToTransform - Predicate function to determine whether an +// instruction should be transformed to its equivalent AdvSIMD scalar +// instruction. "add Xd, Xn, Xm" ==> "add Dd, Da, Db", for example. 
+bool +AArch64AdvSIMDScalar::isProfitableToTransform(const MachineInstr *MI) const { + // If this instruction isn't eligible to be transformed (no SIMD equivalent), + // early exit since that's the common case. + if (!isTransformable(MI)) + return false; + + // Count the number of copies we'll need to add and approximate the number + // of copies that a transform will enable us to remove. + unsigned NumNewCopies = 3; + unsigned NumRemovableCopies = 0; + + unsigned OrigSrc0 = MI->getOperand(1).getReg(); + unsigned OrigSrc1 = MI->getOperand(2).getReg(); + unsigned Src0 = 0, SubReg0; + unsigned Src1 = 0, SubReg1; + if (!MRI->def_empty(OrigSrc0)) { + MachineRegisterInfo::def_instr_iterator Def = + MRI->def_instr_begin(OrigSrc0); + assert(std::next(Def) == MRI->def_instr_end() && "Multiple def in SSA!"); + Src0 = getSrcFromCopy(&*Def, MRI, SubReg0); + // If the source was from a copy, we don't need to insert a new copy. + if (Src0) + --NumNewCopies; + // If there are no other users of the original source, we can delete + // that instruction. + if (Src0 && MRI->hasOneNonDBGUse(OrigSrc0)) + ++NumRemovableCopies; + } + if (!MRI->def_empty(OrigSrc1)) { + MachineRegisterInfo::def_instr_iterator Def = + MRI->def_instr_begin(OrigSrc1); + assert(std::next(Def) == MRI->def_instr_end() && "Multiple def in SSA!"); + Src1 = getSrcFromCopy(&*Def, MRI, SubReg1); + if (Src1) + --NumNewCopies; + // If there are no other users of the original source, we can delete + // that instruction. + if (Src1 && MRI->hasOneNonDBGUse(OrigSrc1)) + ++NumRemovableCopies; + } + + // If any of the uses of the original instructions is a cross class copy, + // that's a copy that will be removable if we transform. Likewise, if + // any of the uses is a transformable instruction, it's likely the tranforms + // will chain, enabling us to save a copy there, too. This is an aggressive + // heuristic that approximates the graph based cost analysis described above. + unsigned Dst = MI->getOperand(0).getReg(); + bool AllUsesAreCopies = true; + for (MachineRegisterInfo::use_instr_nodbg_iterator + Use = MRI->use_instr_nodbg_begin(Dst), + E = MRI->use_instr_nodbg_end(); + Use != E; ++Use) { + unsigned SubReg; + if (getSrcFromCopy(&*Use, MRI, SubReg) || isTransformable(&*Use)) + ++NumRemovableCopies; + // If the use is an INSERT_SUBREG, that's still something that can + // directly use the FPR64, so we don't invalidate AllUsesAreCopies. It's + // preferable to have it use the FPR64 in most cases, as if the source + // vector is an IMPLICIT_DEF, the INSERT_SUBREG just goes away entirely. + // Ditto for a lane insert. + else if (Use->getOpcode() == AArch64::INSERT_SUBREG || + Use->getOpcode() == AArch64::INSvi64gpr) + ; + else + AllUsesAreCopies = false; + } + // If all of the uses of the original destination register are copies to + // FPR64, then we won't end up having a new copy back to GPR64 either. + if (AllUsesAreCopies) + --NumNewCopies; + + // If a transform will not increase the number of cross-class copies required, + // return true. + if (NumNewCopies <= NumRemovableCopies) + return true; + + // Finally, even if we otherwise wouldn't transform, check if we're forcing + // transformation of everything. 
+ return TransformAll; +} + +static MachineInstr *insertCopy(const AArch64InstrInfo *TII, MachineInstr *MI, + unsigned Dst, unsigned Src, bool IsKill) { + MachineInstrBuilder MIB = + BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), TII->get(AArch64::COPY), + Dst) + .addReg(Src, getKillRegState(IsKill)); + DEBUG(dbgs() << " adding copy: " << *MIB); + ++NumCopiesInserted; + return MIB; +} + +// transformInstruction - Perform the transformation of an instruction +// to its equivalant AdvSIMD scalar instruction. Update inputs and outputs +// to be the correct register class, minimizing cross-class copies. +void AArch64AdvSIMDScalar::transformInstruction(MachineInstr *MI) { + DEBUG(dbgs() << "Scalar transform: " << *MI); + + MachineBasicBlock *MBB = MI->getParent(); + int OldOpc = MI->getOpcode(); + int NewOpc = getTransformOpcode(OldOpc); + assert(OldOpc != NewOpc && "transform an instruction to itself?!"); + + // Check if we need a copy for the source registers. + unsigned OrigSrc0 = MI->getOperand(1).getReg(); + unsigned OrigSrc1 = MI->getOperand(2).getReg(); + unsigned Src0 = 0, SubReg0; + unsigned Src1 = 0, SubReg1; + if (!MRI->def_empty(OrigSrc0)) { + MachineRegisterInfo::def_instr_iterator Def = + MRI->def_instr_begin(OrigSrc0); + assert(std::next(Def) == MRI->def_instr_end() && "Multiple def in SSA!"); + Src0 = getSrcFromCopy(&*Def, MRI, SubReg0); + // If there are no other users of the original source, we can delete + // that instruction. + if (Src0 && MRI->hasOneNonDBGUse(OrigSrc0)) { + assert(Src0 && "Can't delete copy w/o a valid original source!"); + Def->eraseFromParent(); + ++NumCopiesDeleted; + } + } + if (!MRI->def_empty(OrigSrc1)) { + MachineRegisterInfo::def_instr_iterator Def = + MRI->def_instr_begin(OrigSrc1); + assert(std::next(Def) == MRI->def_instr_end() && "Multiple def in SSA!"); + Src1 = getSrcFromCopy(&*Def, MRI, SubReg1); + // If there are no other users of the original source, we can delete + // that instruction. + if (Src1 && MRI->hasOneNonDBGUse(OrigSrc1)) { + assert(Src1 && "Can't delete copy w/o a valid original source!"); + Def->eraseFromParent(); + ++NumCopiesDeleted; + } + } + // If we weren't able to reference the original source directly, create a + // copy. + if (!Src0) { + SubReg0 = 0; + Src0 = MRI->createVirtualRegister(&AArch64::FPR64RegClass); + insertCopy(TII, MI, Src0, OrigSrc0, true); + } + if (!Src1) { + SubReg1 = 0; + Src1 = MRI->createVirtualRegister(&AArch64::FPR64RegClass); + insertCopy(TII, MI, Src1, OrigSrc1, true); + } + + // Create a vreg for the destination. + // FIXME: No need to do this if the ultimate user expects an FPR64. + // Check for that and avoid the copy if possible. + unsigned Dst = MRI->createVirtualRegister(&AArch64::FPR64RegClass); + + // For now, all of the new instructions have the same simple three-register + // form, so no need to special case based on what instruction we're + // building. + BuildMI(*MBB, MI, MI->getDebugLoc(), TII->get(NewOpc), Dst) + .addReg(Src0, getKillRegState(true), SubReg0) + .addReg(Src1, getKillRegState(true), SubReg1); + + // Now copy the result back out to a GPR. + // FIXME: Try to avoid this if all uses could actually just use the FPR64 + // directly. + insertCopy(TII, MI, MI->getOperand(0).getReg(), Dst, true); + + // Erase the old instruction. + MI->eraseFromParent(); + + ++NumScalarInsnsUsed; +} + +// processMachineBasicBlock - Main optimzation loop. 
+bool AArch64AdvSIMDScalar::processMachineBasicBlock(MachineBasicBlock *MBB) { + bool Changed = false; + for (MachineBasicBlock::iterator I = MBB->begin(), E = MBB->end(); I != E;) { + MachineInstr *MI = I; + ++I; + if (isProfitableToTransform(MI)) { + transformInstruction(MI); + Changed = true; + } + } + return Changed; +} + +// runOnMachineFunction - Pass entry point from PassManager. +bool AArch64AdvSIMDScalar::runOnMachineFunction(MachineFunction &mf) { + bool Changed = false; + DEBUG(dbgs() << "***** AArch64AdvSIMDScalar *****\n"); + + const TargetMachine &TM = mf.getTarget(); + MRI = &mf.getRegInfo(); + TII = static_cast(TM.getInstrInfo()); + + // Just check things on a one-block-at-a-time basis. + for (MachineFunction::iterator I = mf.begin(), E = mf.end(); I != E; ++I) + if (processMachineBasicBlock(I)) + Changed = true; + return Changed; +} + +// createAArch64AdvSIMDScalar - Factory function used by AArch64TargetMachine +// to add the pass to the PassManager. +FunctionPass *llvm::createAArch64AdvSIMDScalar() { + return new AArch64AdvSIMDScalar(); +} diff --git a/lib/Target/AArch64/AArch64AsmPrinter.cpp b/lib/Target/AArch64/AArch64AsmPrinter.cpp new file mode 100644 index 00000000000..8553a591fee --- /dev/null +++ b/lib/Target/AArch64/AArch64AsmPrinter.cpp @@ -0,0 +1,519 @@ +//===-- AArch64AsmPrinter.cpp - AArch64 LLVM assembly writer --------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file contains a printer that converts from our internal representation +// of machine-dependent LLVM code to the AArch64 assembly language. +// +//===----------------------------------------------------------------------===// + +#include "AArch64.h" +#include "AArch64MachineFunctionInfo.h" +#include "AArch64MCInstLower.h" +#include "AArch64RegisterInfo.h" +#include "AArch64Subtarget.h" +#include "InstPrinter/AArch64InstPrinter.h" +#include "llvm/ADT/SmallString.h" +#include "llvm/ADT/StringSwitch.h" +#include "llvm/ADT/Twine.h" +#include "llvm/CodeGen/AsmPrinter.h" +#include "llvm/CodeGen/MachineInstr.h" +#include "llvm/CodeGen/StackMaps.h" +#include "llvm/CodeGen/MachineModuleInfoImpls.h" +#include "llvm/CodeGen/TargetLoweringObjectFileImpl.h" +#include "llvm/IR/DataLayout.h" +#include "llvm/IR/DebugInfo.h" +#include "llvm/MC/MCAsmInfo.h" +#include "llvm/MC/MCContext.h" +#include "llvm/MC/MCInst.h" +#include "llvm/MC/MCInstBuilder.h" +#include "llvm/MC/MCLinkerOptimizationHint.h" +#include "llvm/MC/MCStreamer.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/TargetRegistry.h" +using namespace llvm; + +#define DEBUG_TYPE "asm-printer" + +namespace { + +class AArch64AsmPrinter : public AsmPrinter { + /// Subtarget - Keep a pointer to the AArch64Subtarget around so that we can + /// make the right decision when printing asm code for different targets. + const AArch64Subtarget *Subtarget; + + AArch64MCInstLower MCInstLowering; + StackMaps SM; + +public: + AArch64AsmPrinter(TargetMachine &TM, MCStreamer &Streamer) + : AsmPrinter(TM, Streamer), + Subtarget(&TM.getSubtarget()), + MCInstLowering(OutContext, *Mang, *this), SM(*this), AArch64FI(nullptr), + LOHLabelCounter(0) {} + + const char *getPassName() const override { + return "AArch64 Assembly Printer"; + } + + /// \brief Wrapper for MCInstLowering.lowerOperand() for the + /// tblgen'erated pseudo lowering. 
+ bool lowerOperand(const MachineOperand &MO, MCOperand &MCOp) const { + return MCInstLowering.lowerOperand(MO, MCOp); + } + + void LowerSTACKMAP(MCStreamer &OutStreamer, StackMaps &SM, + const MachineInstr &MI); + void LowerPATCHPOINT(MCStreamer &OutStreamer, StackMaps &SM, + const MachineInstr &MI); + /// \brief tblgen'erated driver function for lowering simple MI->MC + /// pseudo instructions. + bool emitPseudoExpansionLowering(MCStreamer &OutStreamer, + const MachineInstr *MI); + + void EmitInstruction(const MachineInstr *MI) override; + + void getAnalysisUsage(AnalysisUsage &AU) const override { + AsmPrinter::getAnalysisUsage(AU); + AU.setPreservesAll(); + } + + bool runOnMachineFunction(MachineFunction &F) override { + AArch64FI = F.getInfo(); + return AsmPrinter::runOnMachineFunction(F); + } + +private: + MachineLocation getDebugValueLocation(const MachineInstr *MI) const; + void printOperand(const MachineInstr *MI, unsigned OpNum, raw_ostream &O); + bool printAsmMRegister(const MachineOperand &MO, char Mode, raw_ostream &O); + bool printAsmRegInClass(const MachineOperand &MO, + const TargetRegisterClass *RC, bool isVector, + raw_ostream &O); + + bool PrintAsmOperand(const MachineInstr *MI, unsigned OpNum, + unsigned AsmVariant, const char *ExtraCode, + raw_ostream &O) override; + bool PrintAsmMemoryOperand(const MachineInstr *MI, unsigned OpNum, + unsigned AsmVariant, const char *ExtraCode, + raw_ostream &O) override; + + void PrintDebugValueComment(const MachineInstr *MI, raw_ostream &OS); + + void EmitFunctionBodyEnd() override; + + MCSymbol *GetCPISymbol(unsigned CPID) const override; + void EmitEndOfAsmFile(Module &M) override; + AArch64FunctionInfo *AArch64FI; + + /// \brief Emit the LOHs contained in AArch64FI. + void EmitLOHs(); + + typedef std::map MInstToMCSymbol; + MInstToMCSymbol LOHInstToLabel; + unsigned LOHLabelCounter; +}; + +} // end of anonymous namespace + +//===----------------------------------------------------------------------===// + +void AArch64AsmPrinter::EmitEndOfAsmFile(Module &M) { + if (Subtarget->isTargetMachO()) { + // Funny Darwin hack: This flag tells the linker that no global symbols + // contain code that falls through to other global symbols (e.g. the obvious + // implementation of multiple entry points). If this doesn't occur, the + // linker can safely perform dead code stripping. Since LLVM never + // generates code that does this, it is always safe to set. + OutStreamer.EmitAssemblerFlag(MCAF_SubsectionsViaSymbols); + SM.serializeToStackMapSection(); + } + + // Emit a .data.rel section containing any stubs that were created. + if (Subtarget->isTargetELF()) { + const TargetLoweringObjectFileELF &TLOFELF = + static_cast(getObjFileLowering()); + + MachineModuleInfoELF &MMIELF = MMI->getObjFileInfo(); + + // Output stubs for external and common global variables. + MachineModuleInfoELF::SymbolListTy Stubs = MMIELF.GetGVStubList(); + if (!Stubs.empty()) { + OutStreamer.SwitchSection(TLOFELF.getDataRelSection()); + const DataLayout *TD = TM.getDataLayout(); + + for (unsigned i = 0, e = Stubs.size(); i != e; ++i) { + OutStreamer.EmitLabel(Stubs[i].first); + OutStreamer.EmitSymbolValue(Stubs[i].second.getPointer(), + TD->getPointerSize(0)); + } + Stubs.clear(); + } + } + +} + +MachineLocation +AArch64AsmPrinter::getDebugValueLocation(const MachineInstr *MI) const { + MachineLocation Location; + assert(MI->getNumOperands() == 4 && "Invalid no. of machine operands!"); + // Frame address. Currently handles register +- offset only. 
+ if (MI->getOperand(0).isReg() && MI->getOperand(1).isImm()) + Location.set(MI->getOperand(0).getReg(), MI->getOperand(1).getImm()); + else { + DEBUG(dbgs() << "DBG_VALUE instruction ignored! " << *MI << "\n"); + } + return Location; +} + +void AArch64AsmPrinter::EmitLOHs() { + SmallVector MCArgs; + + for (const auto &D : AArch64FI->getLOHContainer()) { + for (const MachineInstr *MI : D.getArgs()) { + MInstToMCSymbol::iterator LabelIt = LOHInstToLabel.find(MI); + assert(LabelIt != LOHInstToLabel.end() && + "Label hasn't been inserted for LOH related instruction"); + MCArgs.push_back(LabelIt->second); + } + OutStreamer.EmitLOHDirective(D.getKind(), MCArgs); + MCArgs.clear(); + } +} + +void AArch64AsmPrinter::EmitFunctionBodyEnd() { + if (!AArch64FI->getLOHRelated().empty()) + EmitLOHs(); +} + +/// GetCPISymbol - Return the symbol for the specified constant pool entry. +MCSymbol *AArch64AsmPrinter::GetCPISymbol(unsigned CPID) const { + // Darwin uses a linker-private symbol name for constant-pools (to + // avoid addends on the relocation?), ELF has no such concept and + // uses a normal private symbol. + if (getDataLayout().getLinkerPrivateGlobalPrefix()[0]) + return OutContext.GetOrCreateSymbol( + Twine(getDataLayout().getLinkerPrivateGlobalPrefix()) + "CPI" + + Twine(getFunctionNumber()) + "_" + Twine(CPID)); + + return OutContext.GetOrCreateSymbol( + Twine(getDataLayout().getPrivateGlobalPrefix()) + "CPI" + + Twine(getFunctionNumber()) + "_" + Twine(CPID)); +} + +void AArch64AsmPrinter::printOperand(const MachineInstr *MI, unsigned OpNum, + raw_ostream &O) { + const MachineOperand &MO = MI->getOperand(OpNum); + switch (MO.getType()) { + default: + assert(0 && ""); + case MachineOperand::MO_Register: { + unsigned Reg = MO.getReg(); + assert(TargetRegisterInfo::isPhysicalRegister(Reg)); + assert(!MO.getSubReg() && "Subregs should be eliminated!"); + O << AArch64InstPrinter::getRegisterName(Reg); + break; + } + case MachineOperand::MO_Immediate: { + int64_t Imm = MO.getImm(); + O << '#' << Imm; + break; + } + } +} + +bool AArch64AsmPrinter::printAsmMRegister(const MachineOperand &MO, char Mode, + raw_ostream &O) { + unsigned Reg = MO.getReg(); + switch (Mode) { + default: + return true; // Unknown mode. + case 'w': + Reg = getWRegFromXReg(Reg); + break; + case 'x': + Reg = getXRegFromWReg(Reg); + break; + } + + O << AArch64InstPrinter::getRegisterName(Reg); + return false; +} + +// Prints the register in MO using class RC using the offset in the +// new register class. This should not be used for cross class +// printing. +bool AArch64AsmPrinter::printAsmRegInClass(const MachineOperand &MO, + const TargetRegisterClass *RC, + bool isVector, raw_ostream &O) { + assert(MO.isReg() && "Should only get here with a register!"); + const AArch64RegisterInfo *RI = + static_cast(TM.getRegisterInfo()); + unsigned Reg = MO.getReg(); + unsigned RegToPrint = RC->getRegister(RI->getEncodingValue(Reg)); + assert(RI->regsOverlap(RegToPrint, Reg)); + O << AArch64InstPrinter::getRegisterName( + RegToPrint, isVector ? AArch64::vreg : AArch64::NoRegAltName); + return false; +} + +bool AArch64AsmPrinter::PrintAsmOperand(const MachineInstr *MI, unsigned OpNum, + unsigned AsmVariant, + const char *ExtraCode, raw_ostream &O) { + const MachineOperand &MO = MI->getOperand(OpNum); + // Does this asm operand have a single letter operand modifier? + if (ExtraCode && ExtraCode[0]) { + if (ExtraCode[1] != 0) + return true; // Unknown modifier. + + switch (ExtraCode[0]) { + default: + return true; // Unknown modifier. 
+ case 'w': // Print W register + case 'x': // Print X register + if (MO.isReg()) + return printAsmMRegister(MO, ExtraCode[0], O); + if (MO.isImm() && MO.getImm() == 0) { + unsigned Reg = ExtraCode[0] == 'w' ? AArch64::WZR : AArch64::XZR; + O << AArch64InstPrinter::getRegisterName(Reg); + return false; + } + printOperand(MI, OpNum, O); + return false; + case 'b': // Print B register. + case 'h': // Print H register. + case 's': // Print S register. + case 'd': // Print D register. + case 'q': // Print Q register. + if (MO.isReg()) { + const TargetRegisterClass *RC; + switch (ExtraCode[0]) { + case 'b': + RC = &AArch64::FPR8RegClass; + break; + case 'h': + RC = &AArch64::FPR16RegClass; + break; + case 's': + RC = &AArch64::FPR32RegClass; + break; + case 'd': + RC = &AArch64::FPR64RegClass; + break; + case 'q': + RC = &AArch64::FPR128RegClass; + break; + default: + return true; + } + return printAsmRegInClass(MO, RC, false /* vector */, O); + } + printOperand(MI, OpNum, O); + return false; + } + } + + // According to ARM, we should emit x and v registers unless we have a + // modifier. + if (MO.isReg()) { + unsigned Reg = MO.getReg(); + + // If this is a w or x register, print an x register. + if (AArch64::GPR32allRegClass.contains(Reg) || + AArch64::GPR64allRegClass.contains(Reg)) + return printAsmMRegister(MO, 'x', O); + + // If this is a b, h, s, d, or q register, print it as a v register. + return printAsmRegInClass(MO, &AArch64::FPR128RegClass, true /* vector */, + O); + } + + printOperand(MI, OpNum, O); + return false; +} + +bool AArch64AsmPrinter::PrintAsmMemoryOperand(const MachineInstr *MI, + unsigned OpNum, + unsigned AsmVariant, + const char *ExtraCode, + raw_ostream &O) { + if (ExtraCode && ExtraCode[0]) + return true; // Unknown modifier. + + const MachineOperand &MO = MI->getOperand(OpNum); + assert(MO.isReg() && "unexpected inline asm memory operand"); + O << "[" << AArch64InstPrinter::getRegisterName(MO.getReg()) << "]"; + return false; +} + +void AArch64AsmPrinter::PrintDebugValueComment(const MachineInstr *MI, + raw_ostream &OS) { + unsigned NOps = MI->getNumOperands(); + assert(NOps == 4); + OS << '\t' << MAI->getCommentString() << "DEBUG_VALUE: "; + // cast away const; DIetc do not take const operands for some reason. + DIVariable V(const_cast(MI->getOperand(NOps - 1).getMetadata())); + OS << V.getName(); + OS << " <- "; + // Frame address. Currently handles register +- offset only. + assert(MI->getOperand(0).isReg() && MI->getOperand(1).isImm()); + OS << '['; + printOperand(MI, 0, OS); + OS << '+'; + printOperand(MI, 1, OS); + OS << ']'; + OS << "+"; + printOperand(MI, NOps - 2, OS); +} + +void AArch64AsmPrinter::LowerSTACKMAP(MCStreamer &OutStreamer, StackMaps &SM, + const MachineInstr &MI) { + unsigned NumNOPBytes = MI.getOperand(1).getImm(); + + SM.recordStackMap(MI); + // Emit padding. 
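(For context, an illustrative use, not part of the patch: the 'w' and 'x' cases above implement the operand modifiers exposed by GNU-style inline asm on AArch64, e.g. %w0 for the 32-bit register name.)

// Only meaningful when compiled for an AArch64 target.
extern "C" int add_using_w_regs(int a, int b) {
  int result;
  // '%w0'/'%w1'/'%w2' request the W (32-bit) names of the allocated
  // registers, the path handled by printAsmMRegister via getWRegFromXReg.
  asm("add %w0, %w1, %w2" : "=r"(result) : "r"(a), "r"(b));
  return result;
}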
+ assert(NumNOPBytes % 4 == 0 && "Invalid number of NOP bytes requested!"); + for (unsigned i = 0; i < NumNOPBytes; i += 4) + EmitToStreamer(OutStreamer, MCInstBuilder(AArch64::HINT).addImm(0)); +} + +// Lower a patchpoint of the form: +// [], , , , +void AArch64AsmPrinter::LowerPATCHPOINT(MCStreamer &OutStreamer, StackMaps &SM, + const MachineInstr &MI) { + SM.recordPatchPoint(MI); + + PatchPointOpers Opers(&MI); + + int64_t CallTarget = Opers.getMetaOper(PatchPointOpers::TargetPos).getImm(); + unsigned EncodedBytes = 0; + if (CallTarget) { + assert((CallTarget & 0xFFFFFFFFFFFF) == CallTarget && + "High 16 bits of call target should be zero."); + unsigned ScratchReg = MI.getOperand(Opers.getNextScratchIdx()).getReg(); + EncodedBytes = 16; + // Materialize the jump address: + EmitToStreamer(OutStreamer, MCInstBuilder(AArch64::MOVZWi) + .addReg(ScratchReg) + .addImm((CallTarget >> 32) & 0xFFFF) + .addImm(32)); + EmitToStreamer(OutStreamer, MCInstBuilder(AArch64::MOVKWi) + .addReg(ScratchReg) + .addReg(ScratchReg) + .addImm((CallTarget >> 16) & 0xFFFF) + .addImm(16)); + EmitToStreamer(OutStreamer, MCInstBuilder(AArch64::MOVKWi) + .addReg(ScratchReg) + .addReg(ScratchReg) + .addImm(CallTarget & 0xFFFF) + .addImm(0)); + EmitToStreamer(OutStreamer, MCInstBuilder(AArch64::BLR).addReg(ScratchReg)); + } + // Emit padding. + unsigned NumBytes = Opers.getMetaOper(PatchPointOpers::NBytesPos).getImm(); + assert(NumBytes >= EncodedBytes && + "Patchpoint can't request size less than the length of a call."); + assert((NumBytes - EncodedBytes) % 4 == 0 && + "Invalid number of NOP bytes requested!"); + for (unsigned i = EncodedBytes; i < NumBytes; i += 4) + EmitToStreamer(OutStreamer, MCInstBuilder(AArch64::HINT).addImm(0)); +} + +// Simple pseudo-instructions have their lowering (with expansion to real +// instructions) auto-generated. +#include "AArch64GenMCPseudoLowering.inc" + +void AArch64AsmPrinter::EmitInstruction(const MachineInstr *MI) { + // Do any auto-generated pseudo lowerings. + if (emitPseudoExpansionLowering(OutStreamer, MI)) + return; + + if (AArch64FI->getLOHRelated().count(MI)) { + // Generate a label for LOH related instruction + MCSymbol *LOHLabel = GetTempSymbol("loh", LOHLabelCounter++); + // Associate the instruction with the label + LOHInstToLabel[MI] = LOHLabel; + OutStreamer.EmitLabel(LOHLabel); + } + + // Do any manual lowerings. + switch (MI->getOpcode()) { + default: + break; + case AArch64::DBG_VALUE: { + if (isVerbose() && OutStreamer.hasRawTextSupport()) { + SmallString<128> TmpStr; + raw_svector_ostream OS(TmpStr); + PrintDebugValueComment(MI, OS); + OutStreamer.EmitRawText(StringRef(OS.str())); + } + return; + } + + // Tail calls use pseudo instructions so they have the proper code-gen + // attributes (isCall, isReturn, etc.). We lower them to the real + // instruction here. + case AArch64::TCRETURNri: { + MCInst TmpInst; + TmpInst.setOpcode(AArch64::BR); + TmpInst.addOperand(MCOperand::CreateReg(MI->getOperand(0).getReg())); + EmitToStreamer(OutStreamer, TmpInst); + return; + } + case AArch64::TCRETURNdi: { + MCOperand Dest; + MCInstLowering.lowerOperand(MI->getOperand(0), Dest); + MCInst TmpInst; + TmpInst.setOpcode(AArch64::B); + TmpInst.addOperand(Dest); + EmitToStreamer(OutStreamer, TmpInst); + return; + } + case AArch64::TLSDESC_BLR: { + MCOperand Callee, Sym; + MCInstLowering.lowerOperand(MI->getOperand(0), Callee); + MCInstLowering.lowerOperand(MI->getOperand(1), Sym); + + // First emit a relocation-annotation. 
This expands to no code, but requests + // the following instruction gets an R_AARCH64_TLSDESC_CALL. + MCInst TLSDescCall; + TLSDescCall.setOpcode(AArch64::TLSDESCCALL); + TLSDescCall.addOperand(Sym); + EmitToStreamer(OutStreamer, TLSDescCall); + + // Other than that it's just a normal indirect call to the function loaded + // from the descriptor. + MCInst BLR; + BLR.setOpcode(AArch64::BLR); + BLR.addOperand(Callee); + EmitToStreamer(OutStreamer, BLR); + + return; + } + + case TargetOpcode::STACKMAP: + return LowerSTACKMAP(OutStreamer, SM, *MI); + + case TargetOpcode::PATCHPOINT: + return LowerPATCHPOINT(OutStreamer, SM, *MI); + } + + // Finally, do the automated lowerings for everything else. + MCInst TmpInst; + MCInstLowering.Lower(MI, TmpInst); + EmitToStreamer(OutStreamer, TmpInst); +} + +// Force static initialization. +extern "C" void LLVMInitializeAArch64AsmPrinter() { + RegisterAsmPrinter X(TheAArch64leTarget); + RegisterAsmPrinter Y(TheAArch64beTarget); + + RegisterAsmPrinter Z(TheARM64leTarget); + RegisterAsmPrinter W(TheARM64beTarget); +} diff --git a/lib/Target/AArch64/AArch64BranchRelaxation.cpp b/lib/Target/AArch64/AArch64BranchRelaxation.cpp new file mode 100644 index 00000000000..52094526727 --- /dev/null +++ b/lib/Target/AArch64/AArch64BranchRelaxation.cpp @@ -0,0 +1,510 @@ +//===-- AArch64BranchRelaxation.cpp - AArch64 branch relaxation -----------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +//===----------------------------------------------------------------------===// + +#include "AArch64.h" +#include "AArch64InstrInfo.h" +#include "AArch64MachineFunctionInfo.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/ErrorHandling.h" +#include "llvm/Support/Format.h" +#include "llvm/Support/raw_ostream.h" +#include "llvm/ADT/Statistic.h" +#include "llvm/Support/CommandLine.h" +using namespace llvm; + +#define DEBUG_TYPE "aarch64-branch-relax" + +static cl::opt +BranchRelaxation("aarch64-branch-relax", cl::Hidden, cl::init(true), + cl::desc("Relax out of range conditional branches")); + +static cl::opt +TBZDisplacementBits("aarch64-tbz-offset-bits", cl::Hidden, cl::init(14), + cl::desc("Restrict range of TB[N]Z instructions (DEBUG)")); + +static cl::opt +CBZDisplacementBits("aarch64-cbz-offset-bits", cl::Hidden, cl::init(19), + cl::desc("Restrict range of CB[N]Z instructions (DEBUG)")); + +static cl::opt +BCCDisplacementBits("aarch64-bcc-offset-bits", cl::Hidden, cl::init(19), + cl::desc("Restrict range of Bcc instructions (DEBUG)")); + +STATISTIC(NumSplit, "Number of basic blocks split"); +STATISTIC(NumRelaxed, "Number of conditional branches relaxed"); + +namespace { +class AArch64BranchRelaxation : public MachineFunctionPass { + /// BasicBlockInfo - Information about the offset and size of a single + /// basic block. + struct BasicBlockInfo { + /// Offset - Distance from the beginning of the function to the beginning + /// of this basic block. + /// + /// The offset is always aligned as required by the basic block. + unsigned Offset; + + /// Size - Size of the basic block in bytes. If the block contains + /// inline assembly, this is a worst case estimate. 
+ /// + /// The size does not include any alignment padding whether from the + /// beginning of the block, or from an aligned jump table at the end. + unsigned Size; + + BasicBlockInfo() : Offset(0), Size(0) {} + + /// Compute the offset immediately following this block. If LogAlign is + /// specified, return the offset the successor block will get if it has + /// this alignment. + unsigned postOffset(unsigned LogAlign = 0) const { + unsigned PO = Offset + Size; + unsigned Align = 1 << LogAlign; + return (PO + Align - 1) / Align * Align; + } + }; + + SmallVector BlockInfo; + + MachineFunction *MF; + const AArch64InstrInfo *TII; + + bool relaxBranchInstructions(); + void scanFunction(); + MachineBasicBlock *splitBlockBeforeInstr(MachineInstr *MI); + void adjustBlockOffsets(MachineBasicBlock &MBB); + bool isBlockInRange(MachineInstr *MI, MachineBasicBlock *BB, unsigned Disp); + bool fixupConditionalBranch(MachineInstr *MI); + void computeBlockSize(const MachineBasicBlock &MBB); + unsigned getInstrOffset(MachineInstr *MI) const; + void dumpBBs(); + void verify(); + +public: + static char ID; + AArch64BranchRelaxation() : MachineFunctionPass(ID) {} + + bool runOnMachineFunction(MachineFunction &MF) override; + + const char *getPassName() const override { + return "AArch64 branch relaxation pass"; + } +}; +char AArch64BranchRelaxation::ID = 0; +} + +/// verify - check BBOffsets, BBSizes, alignment of islands +void AArch64BranchRelaxation::verify() { +#ifndef NDEBUG + unsigned PrevNum = MF->begin()->getNumber(); + for (MachineBasicBlock &MBB : *MF) { + unsigned Align = MBB.getAlignment(); + unsigned Num = MBB.getNumber(); + assert(BlockInfo[Num].Offset % (1u << Align) == 0); + assert(!Num || BlockInfo[PrevNum].postOffset() <= BlockInfo[Num].Offset); + PrevNum = Num; + } +#endif +} + +/// print block size and offset information - debugging +void AArch64BranchRelaxation::dumpBBs() { + for (auto &MBB : *MF) { + const BasicBlockInfo &BBI = BlockInfo[MBB.getNumber()]; + dbgs() << format("BB#%u\toffset=%08x\t", MBB.getNumber(), BBI.Offset) + << format("size=%#x\n", BBI.Size); + } +} + +/// BBHasFallthrough - Return true if the specified basic block can fallthrough +/// into the block immediately after it. +static bool BBHasFallthrough(MachineBasicBlock *MBB) { + // Get the next machine basic block in the function. + MachineFunction::iterator MBBI = MBB; + // Can't fall off end of function. + MachineBasicBlock *NextBB = std::next(MBBI); + if (NextBB == MBB->getParent()->end()) + return false; + + for (MachineBasicBlock *S : MBB->successors()) + if (S == NextBB) + return true; + + return false; +} + +/// scanFunction - Do the initial scan of the function, building up +/// information about each block. +void AArch64BranchRelaxation::scanFunction() { + BlockInfo.clear(); + BlockInfo.resize(MF->getNumBlockIDs()); + + // First thing, compute the size of all basic blocks, and see if the function + // has any inline assembly in it. If so, we have to be conservative about + // alignment assumptions, as we don't know for sure the size of any + // instructions in the inline assembly. + for (MachineBasicBlock &MBB : *MF) + computeBlockSize(MBB); + + // Compute block offsets and known bits. + adjustBlockOffsets(*MF->begin()); +} + +/// computeBlockSize - Compute the size for MBB. +/// This function updates BlockInfo directly. 
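The postOffset() helper above rounds a block's end offset (Offset + Size) up to the next block's alignment. A minimal standalone sketch of that rounding rule, using a hypothetical nextBlockOffset() name rather than the pass's own API:

#include <cassert>
#include <cstdint>

// Round Offset + Size up to a 2^LogAlign boundary, mirroring the shape of
// BasicBlockInfo::postOffset() above.
static uint64_t nextBlockOffset(uint64_t Offset, uint64_t Size,
                                unsigned LogAlign) {
  uint64_t End = Offset + Size;
  uint64_t Align = uint64_t(1) << LogAlign;
  return (End + Align - 1) / Align * Align;
}

int main() {
  assert(nextBlockOffset(0, 20, 2) == 20); // already 4-byte aligned
  assert(nextBlockOffset(0, 22, 4) == 32); // round 22 up to a 16-byte boundary
  return 0;
}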
+void AArch64BranchRelaxation::computeBlockSize(const MachineBasicBlock &MBB) { + unsigned Size = 0; + for (const MachineInstr &MI : MBB) + Size += TII->GetInstSizeInBytes(&MI); + BlockInfo[MBB.getNumber()].Size = Size; +} + +/// getInstrOffset - Return the current offset of the specified machine +/// instruction from the start of the function. This offset changes as stuff is +/// moved around inside the function. +unsigned AArch64BranchRelaxation::getInstrOffset(MachineInstr *MI) const { + MachineBasicBlock *MBB = MI->getParent(); + + // The offset is composed of two things: the sum of the sizes of all MBB's + // before this instruction's block, and the offset from the start of the block + // it is in. + unsigned Offset = BlockInfo[MBB->getNumber()].Offset; + + // Sum instructions before MI in MBB. + for (MachineBasicBlock::iterator I = MBB->begin(); &*I != MI; ++I) { + assert(I != MBB->end() && "Didn't find MI in its own basic block?"); + Offset += TII->GetInstSizeInBytes(I); + } + return Offset; +} + +void AArch64BranchRelaxation::adjustBlockOffsets(MachineBasicBlock &Start) { + unsigned PrevNum = Start.getNumber(); + for (auto &MBB : make_range(MachineFunction::iterator(Start), MF->end())) { + unsigned Num = MBB.getNumber(); + if (!Num) // block zero is never changed from offset zero. + continue; + // Get the offset and known bits at the end of the layout predecessor. + // Include the alignment of the current block. + unsigned LogAlign = MBB.getAlignment(); + BlockInfo[Num].Offset = BlockInfo[PrevNum].postOffset(LogAlign); + PrevNum = Num; + } +} + +/// Split the basic block containing MI into two blocks, which are joined by +/// an unconditional branch. Update data structures and renumber blocks to +/// account for this change and returns the newly created block. +/// NOTE: Successor list of the original BB is out of date after this function, +/// and must be updated by the caller! Other transforms follow using this +/// utility function, so no point updating now rather than waiting. +MachineBasicBlock * +AArch64BranchRelaxation::splitBlockBeforeInstr(MachineInstr *MI) { + MachineBasicBlock *OrigBB = MI->getParent(); + + // Create a new MBB for the code after the OrigBB. + MachineBasicBlock *NewBB = + MF->CreateMachineBasicBlock(OrigBB->getBasicBlock()); + MachineFunction::iterator MBBI = OrigBB; + ++MBBI; + MF->insert(MBBI, NewBB); + + // Splice the instructions starting with MI over to NewBB. + NewBB->splice(NewBB->end(), OrigBB, MI, OrigBB->end()); + + // Add an unconditional branch from OrigBB to NewBB. + // Note the new unconditional branch is not being recorded. + // There doesn't seem to be meaningful DebugInfo available; this doesn't + // correspond to anything in the source. + BuildMI(OrigBB, DebugLoc(), TII->get(AArch64::B)).addMBB(NewBB); + + // Insert an entry into BlockInfo to align it properly with the block numbers. + BlockInfo.insert(BlockInfo.begin() + NewBB->getNumber(), BasicBlockInfo()); + + // Figure out how large the OrigBB is. As the first half of the original + // block, it cannot contain a tablejump. The size includes + // the new jump we added. (It should be possible to do this without + // recounting everything, but it's very confusing, and this is rarely + // executed.) + computeBlockSize(*OrigBB); + + // Figure out how large the NewMBB is. As the second half of the original + // block, it may contain a tablejump. + computeBlockSize(*NewBB); + + // All BBOffsets following these blocks must be modified. 
+ adjustBlockOffsets(*OrigBB); + + ++NumSplit; + + return NewBB; +} + +/// isBlockInRange - Returns true if the distance between specific MI and +/// specific BB can fit in MI's displacement field. +bool AArch64BranchRelaxation::isBlockInRange(MachineInstr *MI, + MachineBasicBlock *DestBB, + unsigned Bits) { + unsigned MaxOffs = ((1 << (Bits - 1)) - 1) << 2; + unsigned BrOffset = getInstrOffset(MI); + unsigned DestOffset = BlockInfo[DestBB->getNumber()].Offset; + + DEBUG(dbgs() << "Branch of destination BB#" << DestBB->getNumber() + << " from BB#" << MI->getParent()->getNumber() + << " max delta=" << MaxOffs << " from " << getInstrOffset(MI) + << " to " << DestOffset << " offset " + << int(DestOffset - BrOffset) << "\t" << *MI); + + // Branch before the Dest. + if (BrOffset <= DestOffset) + return (DestOffset - BrOffset <= MaxOffs); + return (BrOffset - DestOffset <= MaxOffs); +} + +static bool isConditionalBranch(unsigned Opc) { + switch (Opc) { + default: + return false; + case AArch64::TBZW: + case AArch64::TBNZW: + case AArch64::TBZX: + case AArch64::TBNZX: + case AArch64::CBZW: + case AArch64::CBNZW: + case AArch64::CBZX: + case AArch64::CBNZX: + case AArch64::Bcc: + return true; + } +} + +static MachineBasicBlock *getDestBlock(MachineInstr *MI) { + switch (MI->getOpcode()) { + default: + assert(0 && "unexpected opcode!"); + case AArch64::TBZW: + case AArch64::TBNZW: + case AArch64::TBZX: + case AArch64::TBNZX: + return MI->getOperand(2).getMBB(); + case AArch64::CBZW: + case AArch64::CBNZW: + case AArch64::CBZX: + case AArch64::CBNZX: + case AArch64::Bcc: + return MI->getOperand(1).getMBB(); + } +} + +static unsigned getOppositeConditionOpcode(unsigned Opc) { + switch (Opc) { + default: + assert(0 && "unexpected opcode!"); + case AArch64::TBNZW: return AArch64::TBZW; + case AArch64::TBNZX: return AArch64::TBZX; + case AArch64::TBZW: return AArch64::TBNZW; + case AArch64::TBZX: return AArch64::TBNZX; + case AArch64::CBNZW: return AArch64::CBZW; + case AArch64::CBNZX: return AArch64::CBZX; + case AArch64::CBZW: return AArch64::CBNZW; + case AArch64::CBZX: return AArch64::CBNZX; + case AArch64::Bcc: return AArch64::Bcc; // Condition is an operand for Bcc. + } +} + +static unsigned getBranchDisplacementBits(unsigned Opc) { + switch (Opc) { + default: + assert(0 && "unexpected opcode!"); + case AArch64::TBNZW: + case AArch64::TBZW: + case AArch64::TBNZX: + case AArch64::TBZX: + return TBZDisplacementBits; + case AArch64::CBNZW: + case AArch64::CBZW: + case AArch64::CBNZX: + case AArch64::CBZX: + return CBZDisplacementBits; + case AArch64::Bcc: + return BCCDisplacementBits; + } +} + +static inline void invertBccCondition(MachineInstr *MI) { + assert(MI->getOpcode() == AArch64::Bcc && "Unexpected opcode!"); + AArch64CC::CondCode CC = (AArch64CC::CondCode)MI->getOperand(0).getImm(); + CC = AArch64CC::getInvertedCondCode(CC); + MI->getOperand(0).setImm((int64_t)CC); +} + +/// fixupConditionalBranch - Fix up a conditional branch whose destination is +/// too far away to fit in its displacement field. It is converted to an inverse +/// conditional branch + an unconditional branch to the destination. 
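isBlockInRange() above converts a displacement width into a byte distance: Bits of signed immediate, scaled by four because branch offsets are encoded in words. A self-contained sketch of that range test follows; branchInRange() is an illustrative helper, not part of the pass, and it mirrors the pass's conservative symmetric limit:

#include <cassert>
#include <cstdint>

// A Bits-bit signed, word-scaled displacement reaches at most
// ((2^(Bits-1)) - 1) * 4 bytes forward (and slightly further backward,
// which the check above conservatively ignores).
static bool branchInRange(uint64_t BrOffset, uint64_t DestOffset,
                          unsigned Bits) {
  uint64_t MaxOffs = ((uint64_t(1) << (Bits - 1)) - 1) << 2;
  uint64_t Delta = BrOffset <= DestOffset ? DestOffset - BrOffset
                                          : BrOffset - DestOffset;
  return Delta <= MaxOffs;
}

int main() {
  // TB[N]Z has a 14-bit displacement: roughly +/- 32KiB.
  assert(branchInRange(0, 32764, 14));
  assert(!branchInRange(0, 32768, 14));
  // Bcc and CB[N]Z use 19 bits: roughly +/- 1MiB.
  assert(branchInRange(0, (1u << 20) - 4, 19));
  return 0;
}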
+bool AArch64BranchRelaxation::fixupConditionalBranch(MachineInstr *MI) { + MachineBasicBlock *DestBB = getDestBlock(MI); + + // Add an unconditional branch to the destination and invert the branch + // condition to jump over it: + // tbz L1 + // => + // tbnz L2 + // b L1 + // L2: + + // If the branch is at the end of its MBB and that has a fall-through block, + // direct the updated conditional branch to the fall-through block. Otherwise, + // split the MBB before the next instruction. + MachineBasicBlock *MBB = MI->getParent(); + MachineInstr *BMI = &MBB->back(); + bool NeedSplit = (BMI != MI) || !BBHasFallthrough(MBB); + + if (BMI != MI) { + if (std::next(MachineBasicBlock::iterator(MI)) == + std::prev(MBB->getLastNonDebugInstr()) && + BMI->getOpcode() == AArch64::B) { + // Last MI in the BB is an unconditional branch. Can we simply invert the + // condition and swap destinations: + // beq L1 + // b L2 + // => + // bne L2 + // b L1 + MachineBasicBlock *NewDest = BMI->getOperand(0).getMBB(); + if (isBlockInRange(MI, NewDest, + getBranchDisplacementBits(MI->getOpcode()))) { + DEBUG(dbgs() << " Invert condition and swap its destination with " + << *BMI); + BMI->getOperand(0).setMBB(DestBB); + unsigned OpNum = (MI->getOpcode() == AArch64::TBZW || + MI->getOpcode() == AArch64::TBNZW || + MI->getOpcode() == AArch64::TBZX || + MI->getOpcode() == AArch64::TBNZX) + ? 2 + : 1; + MI->getOperand(OpNum).setMBB(NewDest); + MI->setDesc(TII->get(getOppositeConditionOpcode(MI->getOpcode()))); + if (MI->getOpcode() == AArch64::Bcc) + invertBccCondition(MI); + return true; + } + } + } + + if (NeedSplit) { + // Analyze the branch so we know how to update the successor lists. + MachineBasicBlock *TBB, *FBB; + SmallVector Cond; + TII->AnalyzeBranch(*MBB, TBB, FBB, Cond, false); + + MachineBasicBlock *NewBB = splitBlockBeforeInstr(MI); + // No need for the branch to the next block. We're adding an unconditional + // branch to the destination. + int delta = TII->GetInstSizeInBytes(&MBB->back()); + BlockInfo[MBB->getNumber()].Size -= delta; + MBB->back().eraseFromParent(); + // BlockInfo[SplitBB].Offset is wrong temporarily, fixed below + + // Update the successor lists according to the transformation to follow. + // Do it here since if there's no split, no update is needed. + MBB->replaceSuccessor(FBB, NewBB); + NewBB->addSuccessor(FBB); + } + MachineBasicBlock *NextBB = std::next(MachineFunction::iterator(MBB)); + + DEBUG(dbgs() << " Insert B to BB#" << DestBB->getNumber() + << ", invert condition and change dest. to BB#" + << NextBB->getNumber() << "\n"); + + // Insert a new conditional branch and a new unconditional branch. + MachineInstrBuilder MIB = BuildMI( + MBB, DebugLoc(), TII->get(getOppositeConditionOpcode(MI->getOpcode()))) + .addOperand(MI->getOperand(0)); + if (MI->getOpcode() == AArch64::TBZW || MI->getOpcode() == AArch64::TBNZW || + MI->getOpcode() == AArch64::TBZX || MI->getOpcode() == AArch64::TBNZX) + MIB.addOperand(MI->getOperand(1)); + if (MI->getOpcode() == AArch64::Bcc) + invertBccCondition(MIB); + MIB.addMBB(NextBB); + BlockInfo[MBB->getNumber()].Size += TII->GetInstSizeInBytes(&MBB->back()); + BuildMI(MBB, DebugLoc(), TII->get(AArch64::B)).addMBB(DestBB); + BlockInfo[MBB->getNumber()].Size += TII->GetInstSizeInBytes(&MBB->back()); + + // Remove the old conditional branch. It may or may not still be in MBB. + BlockInfo[MI->getParent()->getNumber()].Size -= TII->GetInstSizeInBytes(MI); + MI->eraseFromParent(); + + // Finally, keep the block offsets up to date. 
+ adjustBlockOffsets(*MBB); + return true; +} + +bool AArch64BranchRelaxation::relaxBranchInstructions() { + bool Changed = false; + // Relaxing branches involves creating new basic blocks, so re-eval + // end() for termination. + for (auto &MBB : *MF) { + MachineInstr *MI = MBB.getFirstTerminator(); + if (isConditionalBranch(MI->getOpcode()) && + !isBlockInRange(MI, getDestBlock(MI), + getBranchDisplacementBits(MI->getOpcode()))) { + fixupConditionalBranch(MI); + ++NumRelaxed; + Changed = true; + } + } + return Changed; +} + +bool AArch64BranchRelaxation::runOnMachineFunction(MachineFunction &mf) { + MF = &mf; + + // If the pass is disabled, just bail early. + if (!BranchRelaxation) + return false; + + DEBUG(dbgs() << "***** AArch64BranchRelaxation *****\n"); + + TII = (const AArch64InstrInfo *)MF->getTarget().getInstrInfo(); + + // Renumber all of the machine basic blocks in the function, guaranteeing that + // the numbers agree with the position of the block in the function. + MF->RenumberBlocks(); + + // Do the initial scan of the function, building up information about the + // sizes of each block. + scanFunction(); + + DEBUG(dbgs() << " Basic blocks before relaxation\n"); + DEBUG(dumpBBs()); + + bool MadeChange = false; + while (relaxBranchInstructions()) + MadeChange = true; + + // After a while, this might be made debug-only, but it is not expensive. + verify(); + + DEBUG(dbgs() << " Basic blocks after relaxation\n"); + DEBUG(dbgs() << '\n'; dumpBBs()); + + BlockInfo.clear(); + + return MadeChange; +} + +/// createAArch64BranchRelaxation - returns an instance of the constpool +/// island pass. +FunctionPass *llvm::createAArch64BranchRelaxation() { + return new AArch64BranchRelaxation(); +} diff --git a/lib/Target/AArch64/AArch64CallingConv.h b/lib/Target/AArch64/AArch64CallingConv.h new file mode 100644 index 00000000000..1fe426ed686 --- /dev/null +++ b/lib/Target/AArch64/AArch64CallingConv.h @@ -0,0 +1,94 @@ +//=== AArch64CallingConv.h - Custom Calling Convention Routines -*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file contains the custom routines for the AArch64 Calling Convention that +// aren't done by tablegen. +// +//===----------------------------------------------------------------------===// + +#ifndef AArch64CALLINGCONV_H +#define AArch64CALLINGCONV_H + +#include "AArch64InstrInfo.h" +#include "llvm/IR/CallingConv.h" +#include "llvm/CodeGen/CallingConvLower.h" +#include "llvm/Target/TargetInstrInfo.h" + +namespace llvm { + +/// CC_AArch64_Custom_i1i8i16_Reg - customized handling of passing i1/i8/i16 via +/// register. Here, ValVT can be i1/i8/i16 or i32 depending on whether the +/// argument is already promoted and LocVT is i1/i8/i16. We only promote the +/// argument to i32 if we are sure this argument will be passed in register. 
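The promotion mentioned in the comment above means the i1/i8/i16 value is widened to the 32 bits that actually travel in a W register, with the extension kind chosen from the argument's sign/zero-extension flags. A simplified, hypothetical illustration of the three LocInfo choices (SExt, ZExt, AExt), not the CCState machinery itself:

#include <cstdint>
#include <iostream>

enum class Ext { Sign, Zero, Any };

// Widen an 8-bit payload to the 32 bits that will live in Wn, matching the
// SExt/ZExt/AExt decision made by the handler that follows.
static uint32_t promoteToI32(uint8_t V, Ext Kind) {
  switch (Kind) {
  case Ext::Sign:
    return static_cast<uint32_t>(static_cast<int32_t>(static_cast<int8_t>(V)));
  case Ext::Zero:
    return static_cast<uint32_t>(V);
  case Ext::Any:
    return static_cast<uint32_t>(V); // upper bits unspecified; zero here
  }
  return 0;
}

int main() {
  std::cout << std::hex << promoteToI32(0x80, Ext::Sign) << "\n"; // ffffff80
  std::cout << std::hex << promoteToI32(0x80, Ext::Zero) << "\n"; // 80
  return 0;
}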
+static bool CC_AArch64_Custom_i1i8i16_Reg(unsigned ValNo, MVT ValVT, MVT LocVT, + CCValAssign::LocInfo LocInfo, + ISD::ArgFlagsTy ArgFlags, + CCState &State, + bool IsWebKitJS = false) { + static const MCPhysReg RegList1[] = { AArch64::W0, AArch64::W1, AArch64::W2, + AArch64::W3, AArch64::W4, AArch64::W5, + AArch64::W6, AArch64::W7 }; + static const MCPhysReg RegList2[] = { AArch64::X0, AArch64::X1, AArch64::X2, + AArch64::X3, AArch64::X4, AArch64::X5, + AArch64::X6, AArch64::X7 }; + static const MCPhysReg WebKitRegList1[] = { AArch64::W0 }; + static const MCPhysReg WebKitRegList2[] = { AArch64::X0 }; + + const MCPhysReg *List1 = IsWebKitJS ? WebKitRegList1 : RegList1; + const MCPhysReg *List2 = IsWebKitJS ? WebKitRegList2 : RegList2; + + if (unsigned Reg = State.AllocateReg(List1, List2, 8)) { + // Customized extra section for handling i1/i8/i16: + // We need to promote the argument to i32 if it is not done already. + if (ValVT != MVT::i32) { + if (ArgFlags.isSExt()) + LocInfo = CCValAssign::SExt; + else if (ArgFlags.isZExt()) + LocInfo = CCValAssign::ZExt; + else + LocInfo = CCValAssign::AExt; + ValVT = MVT::i32; + } + // Set LocVT to i32 as well if passing via register. + LocVT = MVT::i32; + State.addLoc(CCValAssign::getReg(ValNo, ValVT, Reg, LocVT, LocInfo)); + return true; + } + return false; +} + +/// CC_AArch64_WebKit_JS_i1i8i16_Reg - customized handling of passing i1/i8/i16 +/// via register. This behaves the same as CC_AArch64_Custom_i1i8i16_Reg, but only +/// uses the first register. +static bool CC_AArch64_WebKit_JS_i1i8i16_Reg(unsigned ValNo, MVT ValVT, MVT LocVT, + CCValAssign::LocInfo LocInfo, + ISD::ArgFlagsTy ArgFlags, + CCState &State) { + return CC_AArch64_Custom_i1i8i16_Reg(ValNo, ValVT, LocVT, LocInfo, ArgFlags, + State, true); +} + +/// CC_AArch64_Custom_i1i8i16_Stack: customized handling of passing i1/i8/i16 on +/// stack. Here, ValVT can be i1/i8/i16 or i32 depending on whether the argument +/// is already promoted and LocVT is i1/i8/i16. If ValVT is already promoted, +/// it will be truncated back to i1/i8/i16. +static bool CC_AArch64_Custom_i1i8i16_Stack(unsigned ValNo, MVT ValVT, MVT LocVT, + CCValAssign::LocInfo LocInfo, + ISD::ArgFlagsTy ArgFlags, + CCState &State) { + unsigned Space = ((LocVT == MVT::i1 || LocVT == MVT::i8) ? 1 : 2); + unsigned Offset12 = State.AllocateStack(Space, Space); + ValVT = LocVT; + State.addLoc(CCValAssign::getMem(ValNo, ValVT, Offset12, LocVT, LocInfo)); + return true; +} + +} // End llvm namespace + +#endif diff --git a/lib/Target/AArch64/AArch64CallingConvention.td b/lib/Target/AArch64/AArch64CallingConvention.td new file mode 100644 index 00000000000..c263d14dcc3 --- /dev/null +++ b/lib/Target/AArch64/AArch64CallingConvention.td @@ -0,0 +1,236 @@ +//=- AArch64CallingConv.td - Calling Conventions for AArch64 -*- tablegen -*-=// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This describes the calling conventions for AArch64 architecture. +// +//===----------------------------------------------------------------------===// + +/// CCIfAlign - Match of the original alignment of the arg +class CCIfAlign : + CCIf; +/// CCIfBigEndian - Match only if we're in big endian mode. 
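The handler above walks a fixed list of registers and only falls back to the stack rules once the list is exhausted. A generic sketch of that allocate-from-list-or-spill decision, with a toy TinyCCState standing in for CCState::AllocateReg (names and output are illustrative only):

#include <array>
#include <cstdio>
#include <optional>

// Hand out registers from a fixed list in order; once the list is exhausted
// the caller assigns a stack slot instead. This mirrors the shape of the
// custom handler above, not its exact API.
class TinyCCState {
  std::array<const char *, 8> Regs{"w0", "w1", "w2", "w3",
                                   "w4", "w5", "w6", "w7"};
  unsigned Next = 0;

public:
  std::optional<const char *> allocateReg() {
    if (Next < Regs.size())
      return Regs[Next++];
    return std::nullopt; // no register left: spill to the stack
  }
};

int main() {
  TinyCCState State;
  for (int Arg = 0; Arg < 10; ++Arg) {
    if (auto R = State.allocateReg())
      std::printf("arg %d -> %s\n", Arg, *R);
    else
      std::printf("arg %d -> stack\n", Arg);
  }
  return 0;
}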
+class CCIfBigEndian : + CCIf<"State.getTarget().getDataLayout()->isBigEndian()", A>; + +//===----------------------------------------------------------------------===// +// ARM AAPCS64 Calling Convention +//===----------------------------------------------------------------------===// + +def CC_AArch64_AAPCS : CallingConv<[ + CCIfType<[v2f32], CCBitConvertToType>, + CCIfType<[v2f64, v4f32], CCBitConvertToType>, + + // Big endian vectors must be passed as if they were 1-element vectors so that + // their lanes are in a consistent order. + CCIfBigEndian>>, + CCIfBigEndian>>, + + // An SRet is passed in X8, not X0 like a normal pointer parameter. + CCIfSRet>>, + + // Put ByVal arguments directly on the stack. Minimum size and alignment of a + // slot is 64-bit. + CCIfByVal>, + + // Handle i1, i8, i16, i32, i64, f32, f64 and v2f64 by passing in registers, + // up to eight each of GPR and FPR. + CCIfType<[i1, i8, i16], CCCustom<"CC_AArch64_Custom_i1i8i16_Reg">>, + CCIfType<[i32], CCAssignToRegWithShadow<[W0, W1, W2, W3, W4, W5, W6, W7], + [X0, X1, X2, X3, X4, X5, X6, X7]>>, + // i128 is split to two i64s, we can't fit half to register X7. + CCIfType<[i64], CCIfSplit>>, + + // i128 is split to two i64s, and its stack alignment is 16 bytes. + CCIfType<[i64], CCIfSplit>>, + + CCIfType<[i64], CCAssignToRegWithShadow<[X0, X1, X2, X3, X4, X5, X6, X7], + [W0, W1, W2, W3, W4, W5, W6, W7]>>, + CCIfType<[f32], CCAssignToRegWithShadow<[S0, S1, S2, S3, S4, S5, S6, S7], + [Q0, Q1, Q2, Q3, Q4, Q5, Q6, Q7]>>, + CCIfType<[f64], CCAssignToRegWithShadow<[D0, D1, D2, D3, D4, D5, D6, D7], + [Q0, Q1, Q2, Q3, Q4, Q5, Q6, Q7]>>, + CCIfType<[v1i64, v2i32, v4i16, v8i8, v1f64, v2f32], + CCAssignToRegWithShadow<[D0, D1, D2, D3, D4, D5, D6, D7], + [Q0, Q1, Q2, Q3, Q4, Q5, Q6, Q7]>>, + CCIfType<[f128, v2i64, v4i32, v8i16, v16i8, v4f32, v2f64], + CCAssignToReg<[Q0, Q1, Q2, Q3, Q4, Q5, Q6, Q7]>>, + + // If more than will fit in registers, pass them on the stack instead. + CCIfType<[i1, i8, i16], CCAssignToStack<8, 8>>, + CCIfType<[i32, f32], CCAssignToStack<8, 8>>, + CCIfType<[i64, f64, v1f64, v2f32, v1i64, v2i32, v4i16, v8i8], + CCAssignToStack<8, 8>>, + CCIfType<[f128, v2i64, v4i32, v8i16, v16i8, v4f32, v2f64], + CCAssignToStack<16, 16>> +]>; + +def RetCC_AArch64_AAPCS : CallingConv<[ + CCIfType<[v2f32], CCBitConvertToType>, + CCIfType<[v2f64, v4f32], CCBitConvertToType>, + + // Big endian vectors must be passed as if they were 1-element vectors so that + // their lanes are in a consistent order. + CCIfBigEndian>>, + CCIfBigEndian>>, + + CCIfType<[i32], CCAssignToRegWithShadow<[W0, W1, W2, W3, W4, W5, W6, W7], + [X0, X1, X2, X3, X4, X5, X6, X7]>>, + CCIfType<[i64], CCAssignToRegWithShadow<[X0, X1, X2, X3, X4, X5, X6, X7], + [W0, W1, W2, W3, W4, W5, W6, W7]>>, + CCIfType<[f32], CCAssignToRegWithShadow<[S0, S1, S2, S3, S4, S5, S6, S7], + [Q0, Q1, Q2, Q3, Q4, Q5, Q6, Q7]>>, + CCIfType<[f64], CCAssignToRegWithShadow<[D0, D1, D2, D3, D4, D5, D6, D7], + [Q0, Q1, Q2, Q3, Q4, Q5, Q6, Q7]>>, + CCIfType<[v1i64, v2i32, v4i16, v8i8, v1f64, v2f32], + CCAssignToRegWithShadow<[D0, D1, D2, D3, D4, D5, D6, D7], + [Q0, Q1, Q2, Q3, Q4, Q5, Q6, Q7]>>, + CCIfType<[f128, v2i64, v4i32, v8i16, v16i8, v4f32, v2f64], + CCAssignToReg<[Q0, Q1, Q2, Q3, Q4, Q5, Q6, Q7]>> +]>; + + +// Darwin uses a calling convention which differs in only two ways +// from the standard one at this level: +// + i128s (i.e. split i64s) don't need even registers. +// + Stack slots are sized as needed rather than being at least 64-bit. 
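One visible consequence of the second difference: once the eight GPRs are used up, an i8 argument gets a full 8-byte, 8-byte-aligned slot under the AAPCS rules above but only a 1-byte slot under the Darwin variant defined next. A sketch of the two stack-offset computations, with a hypothetical StackBuilder standing in for CCState::AllocateStack:

#include <cstdio>

// Allocate a stack slot of the given size/alignment and return its offset,
// mimicking how CCAssignToStack / AllocateStack advance the frame offset.
struct StackBuilder {
  unsigned NextOffset = 0;
  unsigned allocate(unsigned Size, unsigned Align) {
    NextOffset = (NextOffset + Align - 1) / Align * Align;
    unsigned Offset = NextOffset;
    NextOffset += Size;
    return Offset;
  }
};

int main() {
  // Three i8 arguments that did not fit in registers.
  StackBuilder AAPCS, Darwin;
  for (int I = 0; I < 3; ++I)
    std::printf("arg %d: AAPCS offset %u, Darwin offset %u\n", I,
                AAPCS.allocate(8, 8),   // CCAssignToStack<8, 8>
                Darwin.allocate(1, 1)); // i1/i8: 1-byte slot (custom handler)
  return 0;
}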
+def CC_AArch64_DarwinPCS : CallingConv<[ + CCIfType<[v2f32], CCBitConvertToType>, + CCIfType<[v2f64, v4f32, f128], CCBitConvertToType>, + + // An SRet is passed in X8, not X0 like a normal pointer parameter. + CCIfSRet>>, + + // Put ByVal arguments directly on the stack. Minimum size and alignment of a + // slot is 64-bit. + CCIfByVal>, + + // Handle i1, i8, i16, i32, i64, f32, f64 and v2f64 by passing in registers, + // up to eight each of GPR and FPR. + CCIfType<[i1, i8, i16], CCCustom<"CC_AArch64_Custom_i1i8i16_Reg">>, + CCIfType<[i32], CCAssignToRegWithShadow<[W0, W1, W2, W3, W4, W5, W6, W7], + [X0, X1, X2, X3, X4, X5, X6, X7]>>, + // i128 is split to two i64s, we can't fit half to register X7. + CCIfType<[i64], + CCIfSplit>>, + // i128 is split to two i64s, and its stack alignment is 16 bytes. + CCIfType<[i64], CCIfSplit>>, + + CCIfType<[i64], CCAssignToRegWithShadow<[X0, X1, X2, X3, X4, X5, X6, X7], + [W0, W1, W2, W3, W4, W5, W6, W7]>>, + CCIfType<[f32], CCAssignToRegWithShadow<[S0, S1, S2, S3, S4, S5, S6, S7], + [Q0, Q1, Q2, Q3, Q4, Q5, Q6, Q7]>>, + CCIfType<[f64], CCAssignToRegWithShadow<[D0, D1, D2, D3, D4, D5, D6, D7], + [Q0, Q1, Q2, Q3, Q4, Q5, Q6, Q7]>>, + CCIfType<[v1i64, v2i32, v4i16, v8i8, v1f64, v2f32], + CCAssignToRegWithShadow<[D0, D1, D2, D3, D4, D5, D6, D7], + [Q0, Q1, Q2, Q3, Q4, Q5, Q6, Q7]>>, + CCIfType<[v2i64, v4i32, v8i16, v16i8, v4f32, v2f64], + CCAssignToReg<[Q0, Q1, Q2, Q3, Q4, Q5, Q6, Q7]>>, + + // If more than will fit in registers, pass them on the stack instead. + CCIfType<[i1, i8, i16], CCCustom<"CC_AArch64_Custom_i1i8i16_Stack">>, + CCIfType<[i32, f32], CCAssignToStack<4, 4>>, + CCIfType<[i64, f64, v1f64, v2f32, v1i64, v2i32, v4i16, v8i8], + CCAssignToStack<8, 8>>, + CCIfType<[v2i64, v4i32, v8i16, v16i8, v4f32, v2f64], CCAssignToStack<16, 16>> +]>; + +def CC_AArch64_DarwinPCS_VarArg : CallingConv<[ + CCIfType<[v2f32], CCBitConvertToType>, + CCIfType<[v2f64, v4f32, f128], CCBitConvertToType>, + + // Handle all scalar types as either i64 or f64. + CCIfType<[i8, i16, i32], CCPromoteToType>, + CCIfType<[f32], CCPromoteToType>, + + // Everything is on the stack. + // i128 is split to two i64s, and its stack alignment is 16 bytes. + CCIfType<[i64], CCIfSplit>>, + CCIfType<[i64, f64, v1i64, v2i32, v4i16, v8i8, v1f64, v2f32], CCAssignToStack<8, 8>>, + CCIfType<[v2i64, v4i32, v8i16, v16i8, v4f32, v2f64], CCAssignToStack<16, 16>> +]>; + +// The WebKit_JS calling convention only passes the first argument (the callee) +// in register and the remaining arguments on stack. We allow 32bit stack slots, +// so that WebKit can write partial values in the stack and define the other +// 32bit quantity as undef. +def CC_AArch64_WebKit_JS : CallingConv<[ + // Handle i1, i8, i16, i32, and i64 passing in register X0 (W0). + CCIfType<[i1, i8, i16], CCCustom<"CC_AArch64_WebKit_JS_i1i8i16_Reg">>, + CCIfType<[i32], CCAssignToRegWithShadow<[W0], [X0]>>, + CCIfType<[i64], CCAssignToRegWithShadow<[X0], [W0]>>, + + // Pass the remaining arguments on the stack instead. 
+ CCIfType<[i1, i8, i16], CCAssignToStack<4, 4>>, + CCIfType<[i32, f32], CCAssignToStack<4, 4>>, + CCIfType<[i64, f64], CCAssignToStack<8, 8>> +]>; + +def RetCC_AArch64_WebKit_JS : CallingConv<[ + CCIfType<[i32], CCAssignToRegWithShadow<[W0, W1, W2, W3, W4, W5, W6, W7], + [X0, X1, X2, X3, X4, X5, X6, X7]>>, + CCIfType<[i64], CCAssignToRegWithShadow<[X0, X1, X2, X3, X4, X5, X6, X7], + [W0, W1, W2, W3, W4, W5, W6, W7]>>, + CCIfType<[f32], CCAssignToRegWithShadow<[S0, S1, S2, S3, S4, S5, S6, S7], + [Q0, Q1, Q2, Q3, Q4, Q5, Q6, Q7]>>, + CCIfType<[f64], CCAssignToRegWithShadow<[D0, D1, D2, D3, D4, D5, D6, D7], + [Q0, Q1, Q2, Q3, Q4, Q5, Q6, Q7]>> +]>; + +// FIXME: LR is only callee-saved in the sense that *we* preserve it and are +// presumably a callee to someone. External functions may not do so, but this +// is currently safe since BL has LR as an implicit-def and what happens after a +// tail call doesn't matter. +// +// It would be better to model its preservation semantics properly (create a +// vreg on entry, use it in RET & tail call generation; make that vreg def if we +// end up saving LR as part of a call frame). Watch this space... +def CSR_AArch64_AAPCS : CalleeSavedRegs<(add LR, FP, X19, X20, X21, X22, + X23, X24, X25, X26, X27, X28, + D8, D9, D10, D11, + D12, D13, D14, D15)>; + +// Constructors and destructors return 'this' in the iOS 64-bit C++ ABI; since +// 'this' and the pointer return value are both passed in X0 in these cases, +// this can be partially modelled by treating X0 as a callee-saved register; +// only the resulting RegMask is used; the SaveList is ignored +// +// (For generic ARM 64-bit ABI code, clang will not generate constructors or +// destructors with 'this' returns, so this RegMask will not be used in that +// case) +def CSR_AArch64_AAPCS_ThisReturn : CalleeSavedRegs<(add CSR_AArch64_AAPCS, X0)>; + +// The function used by Darwin to obtain the address of a thread-local variable +// guarantees more than a normal AAPCS function. x16 and x17 are used on the +// fast path for calculation, but other registers except X0 (argument/return) +// and LR (it is a call, after all) are preserved. +def CSR_AArch64_TLS_Darwin + : CalleeSavedRegs<(add (sub (sequence "X%u", 1, 28), X16, X17), + FP, + (sequence "Q%u", 0, 31))>; + +// The ELF stub used for TLS-descriptor access saves every feasible +// register. Only X0 and LR are clobbered. +def CSR_AArch64_TLS_ELF + : CalleeSavedRegs<(add (sequence "X%u", 1, 28), FP, + (sequence "Q%u", 0, 31))>; + +def CSR_AArch64_AllRegs + : CalleeSavedRegs<(add (sequence "W%u", 0, 30), WSP, + (sequence "X%u", 0, 28), FP, LR, SP, + (sequence "B%u", 0, 31), (sequence "H%u", 0, 31), + (sequence "S%u", 0, 31), (sequence "D%u", 0, 31), + (sequence "Q%u", 0, 31))>; + diff --git a/lib/Target/AArch64/AArch64CleanupLocalDynamicTLSPass.cpp b/lib/Target/AArch64/AArch64CleanupLocalDynamicTLSPass.cpp new file mode 100644 index 00000000000..4d23dc59d7a --- /dev/null +++ b/lib/Target/AArch64/AArch64CleanupLocalDynamicTLSPass.cpp @@ -0,0 +1,147 @@ +//===-- AArch64CleanupLocalDynamicTLSPass.cpp ---------------------*- C++ -*-=// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// Local-dynamic access to thread-local variables proceeds in three stages. +// +// 1. 
The offset of this Module's thread-local area from TPIDR_EL0 is calculated +// in much the same way as a general-dynamic TLS-descriptor access against +// the special symbol _TLS_MODULE_BASE. +// 2. The variable's offset from _TLS_MODULE_BASE_ is calculated using +// instructions with "dtprel" modifiers. +// 3. These two are added, together with TPIDR_EL0, to obtain the variable's +// true address. +// +// This is only better than general-dynamic access to the variable if two or +// more of the first stage TLS-descriptor calculations can be combined. This +// pass looks through a function and performs such combinations. +// +//===----------------------------------------------------------------------===// +#include "AArch64.h" +#include "AArch64InstrInfo.h" +#include "AArch64MachineFunctionInfo.h" +#include "AArch64TargetMachine.h" +#include "llvm/CodeGen/MachineDominators.h" +#include "llvm/CodeGen/MachineFunction.h" +#include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" +using namespace llvm; + +namespace { +struct LDTLSCleanup : public MachineFunctionPass { + static char ID; + LDTLSCleanup() : MachineFunctionPass(ID) {} + + bool runOnMachineFunction(MachineFunction &MF) override { + AArch64FunctionInfo *AFI = MF.getInfo(); + if (AFI->getNumLocalDynamicTLSAccesses() < 2) { + // No point folding accesses if there isn't at least two. + return false; + } + + MachineDominatorTree *DT = &getAnalysis(); + return VisitNode(DT->getRootNode(), 0); + } + + // Visit the dominator subtree rooted at Node in pre-order. + // If TLSBaseAddrReg is non-null, then use that to replace any + // TLS_base_addr instructions. Otherwise, create the register + // when the first such instruction is seen, and then use it + // as we encounter more instructions. + bool VisitNode(MachineDomTreeNode *Node, unsigned TLSBaseAddrReg) { + MachineBasicBlock *BB = Node->getBlock(); + bool Changed = false; + + // Traverse the current block. + for (MachineBasicBlock::iterator I = BB->begin(), E = BB->end(); I != E; + ++I) { + switch (I->getOpcode()) { + case AArch64::TLSDESC_BLR: + // Make sure it's a local dynamic access. + if (!I->getOperand(1).isSymbol() || + strcmp(I->getOperand(1).getSymbolName(), "_TLS_MODULE_BASE_")) + break; + + if (TLSBaseAddrReg) + I = replaceTLSBaseAddrCall(I, TLSBaseAddrReg); + else + I = setRegister(I, &TLSBaseAddrReg); + Changed = true; + break; + default: + break; + } + } + + // Visit the children of this block in the dominator tree. + for (MachineDomTreeNode *N : *Node) { + Changed |= VisitNode(N, TLSBaseAddrReg); + } + + return Changed; + } + + // Replace the TLS_base_addr instruction I with a copy from + // TLSBaseAddrReg, returning the new instruction. + MachineInstr *replaceTLSBaseAddrCall(MachineInstr *I, + unsigned TLSBaseAddrReg) { + MachineFunction *MF = I->getParent()->getParent(); + const AArch64TargetMachine *TM = + static_cast(&MF->getTarget()); + const AArch64InstrInfo *TII = TM->getInstrInfo(); + + // Insert a Copy from TLSBaseAddrReg to x0, which is where the rest of the + // code sequence assumes the address will be. + MachineInstr *Copy = BuildMI(*I->getParent(), I, I->getDebugLoc(), + TII->get(TargetOpcode::COPY), + AArch64::X0).addReg(TLSBaseAddrReg); + + // Erase the TLS_base_addr instruction. + I->eraseFromParent(); + + return Copy; + } + + // Create a virtal register in *TLSBaseAddrReg, and populate it by + // inserting a copy instruction after I. Returns the new instruction. 
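The pass above walks the dominator tree and, once it has seen the first _TLS_MODULE_BASE_ call on a path, turns every dominated repeat into a copy of the cached register. The same walk-and-cache shape on a generic tree, with a toy Node type instead of MachineDomTreeNode (counts are printed only to show the effect):

#include <iostream>
#include <vector>

// A node either computes the TLS base (cost: one call) or reuses a value
// cached by a dominating ancestor (cost: one copy).
struct Node {
  bool NeedsTLSBase = false;
  std::vector<Node *> Children;
};

static void visit(Node *N, bool HaveCachedBase, unsigned &Calls,
                  unsigned &Copies) {
  if (N->NeedsTLSBase) {
    if (HaveCachedBase) {
      ++Copies; // analogue of replaceTLSBaseAddrCall()
    } else {
      ++Calls;  // first access: keep the call and cache its result
      HaveCachedBase = true;
    }
  }
  for (Node *C : N->Children)
    visit(C, HaveCachedBase, Calls, Copies);
}

int main() {
  Node Leaf1, Leaf2, Root;
  Leaf1.NeedsTLSBase = Leaf2.NeedsTLSBase = Root.NeedsTLSBase = true;
  Root.Children = {&Leaf1, &Leaf2};
  unsigned Calls = 0, Copies = 0;
  visit(&Root, false, Calls, Copies);
  std::cout << Calls << " call(s), " << Copies << " copy(ies)\n"; // 1, 2
  return 0;
}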
+ MachineInstr *setRegister(MachineInstr *I, unsigned *TLSBaseAddrReg) { + MachineFunction *MF = I->getParent()->getParent(); + const AArch64TargetMachine *TM = + static_cast(&MF->getTarget()); + const AArch64InstrInfo *TII = TM->getInstrInfo(); + + // Create a virtual register for the TLS base address. + MachineRegisterInfo &RegInfo = MF->getRegInfo(); + *TLSBaseAddrReg = RegInfo.createVirtualRegister(&AArch64::GPR64RegClass); + + // Insert a copy from X0 to TLSBaseAddrReg for later. + MachineInstr *Next = I->getNextNode(); + MachineInstr *Copy = BuildMI(*I->getParent(), Next, I->getDebugLoc(), + TII->get(TargetOpcode::COPY), + *TLSBaseAddrReg).addReg(AArch64::X0); + + return Copy; + } + + const char *getPassName() const override { + return "Local Dynamic TLS Access Clean-up"; + } + + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.setPreservesCFG(); + AU.addRequired(); + MachineFunctionPass::getAnalysisUsage(AU); + } +}; +} + +char LDTLSCleanup::ID = 0; +FunctionPass *llvm::createAArch64CleanupLocalDynamicTLSPass() { + return new LDTLSCleanup(); +} diff --git a/lib/Target/AArch64/AArch64CollectLOH.cpp b/lib/Target/AArch64/AArch64CollectLOH.cpp new file mode 100644 index 00000000000..6b1f09678e9 --- /dev/null +++ b/lib/Target/AArch64/AArch64CollectLOH.cpp @@ -0,0 +1,1117 @@ +//===---------- AArch64CollectLOH.cpp - AArch64 collect LOH pass --*- C++ -*-=// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file contains a pass that collect the Linker Optimization Hint (LOH). +// This pass should be run at the very end of the compilation flow, just before +// assembly printer. +// To be useful for the linker, the LOH must be printed into the assembly file. +// +// A LOH describes a sequence of instructions that may be optimized by the +// linker. +// This same sequence cannot be optimized by the compiler because some of +// the information will be known at link time. +// For instance, consider the following sequence: +// L1: adrp xA, sym@PAGE +// L2: add xB, xA, sym@PAGEOFF +// L3: ldr xC, [xB, #imm] +// This sequence can be turned into: +// A literal load if sym@PAGE + sym@PAGEOFF + #imm - address(L3) is < 1MB: +// L3: ldr xC, sym+#imm +// It may also be turned into either the following more efficient +// code sequences: +// - If sym@PAGEOFF + #imm fits the encoding space of L3. +// L1: adrp xA, sym@PAGE +// L3: ldr xC, [xB, sym@PAGEOFF + #imm] +// - If sym@PAGE + sym@PAGEOFF - address(L1) < 1MB: +// L1: adr xA, sym +// L3: ldr xC, [xB, #imm] +// +// To be valid a LOH must meet all the requirements needed by all the related +// possible linker transformations. +// For instance, using the running example, the constraints to emit +// ".loh AdrpAddLdr" are: +// - L1, L2, and L3 instructions are of the expected type, i.e., +// respectively ADRP, ADD (immediate), and LD. +// - The result of L1 is used only by L2. +// - The register argument (xA) used in the ADD instruction is defined +// only by L1. +// - The result of L2 is used only by L3. +// - The base address (xB) in L3 is defined only L2. 
+// - The ADRP in L1 and the ADD in L2 must reference the same symbol using
+//   @PAGE/@PAGEOFF with no additional constants.
+//
+// Currently supported LOHs are:
+// * So-called non-ADRP-related:
+//   - .loh AdrpAddLdr L1, L2, L3:
+//     L1: adrp xA, sym@PAGE
+//     L2: add xB, xA, sym@PAGEOFF
+//     L3: ldr xC, [xB, #imm]
+//   - .loh AdrpLdrGotLdr L1, L2, L3:
+//     L1: adrp xA, sym@GOTPAGE
+//     L2: ldr xB, [xA, sym@GOTPAGEOFF]
+//     L3: ldr xC, [xB, #imm]
+//   - .loh AdrpLdr L1, L3:
+//     L1: adrp xA, sym@PAGE
+//     L3: ldr xC, [xA, sym@PAGEOFF]
+//   - .loh AdrpAddStr L1, L2, L3:
+//     L1: adrp xA, sym@PAGE
+//     L2: add xB, xA, sym@PAGEOFF
+//     L3: str xC, [xB, #imm]
+//   - .loh AdrpLdrGotStr L1, L2, L3:
+//     L1: adrp xA, sym@GOTPAGE
+//     L2: ldr xB, [xA, sym@GOTPAGEOFF]
+//     L3: str xC, [xB, #imm]
+//   - .loh AdrpAdd L1, L2:
+//     L1: adrp xA, sym@PAGE
+//     L2: add xB, xA, sym@PAGEOFF
+//   For all these LOHs, L1, L2, L3 form a simple chain:
+//   L1's result is used only by L2, and L2's result only by L3.
+//   L3's LOH-related argument is defined only by L2, and L2's LOH-related
+//   argument only by L1.
+//   All these LOHs aim at using more efficient load/store patterns by folding
+//   some instructions used to compute the address directly into the
+//   load/store.
+//
+// * So-called ADRP-related:
+//   - .loh AdrpAdrp L2, L1:
+//     L2: ADRP xA, sym1@PAGE
+//     L1: ADRP xA, sym2@PAGE
+//     L2 dominates L1 and xA is not redefined between L2 and L1.
+//   This LOH aims at getting rid of redundant ADRP instructions.
+//
+// The overall design for emitting the LOHs is:
+// 1. AArch64CollectLOH (this pass) records the LOHs in the
+//    AArch64FunctionInfo.
+// 2. AArch64AsmPrinter reads the LOHs from AArch64FunctionInfo and then:
+//    1. Associates a label with each of them.
+//    2. Emits them in a MCStreamer (EmitLOHDirective).
+//       - The MCMachOStreamer records them into the MCAssembler.
+//       - The MCAsmStreamer prints them.
+//       - Other MCStreamers ignore them.
+//    3. Closes the MCStreamer:
+//       - The MachObjectWriter gets them from the MCAssembler and writes
+//         them in the object file.
+//       - Other ObjectWriters ignore them.
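The constraints listed above reduce to a simple-chain property: each intermediate value has exactly one user and every definition dominates its user. A toy version of that check, where plain strings stand in for instructions and dominance is stubbed out:

#include <iostream>
#include <map>
#include <set>
#include <string>
#include <vector>

using Inst = std::string;

static bool dominates(const Inst &, const Inst &) { return true; } // stub

// Every link's value must have exactly one user, namely the next link,
// and each definition must dominate its user.
static bool isSimpleChain(const std::vector<Inst> &Chain,
                          const std::map<Inst, std::set<Inst>> &UsersOf) {
  for (size_t I = 0; I + 1 < Chain.size(); ++I) {
    auto It = UsersOf.find(Chain[I]);
    if (It == UsersOf.end() || It->second.size() != 1 ||
        *It->second.begin() != Chain[I + 1])
      return false; // value escapes the chain
    if (!dominates(Chain[I], Chain[I + 1]))
      return false;
  }
  return true;
}

int main() {
  std::map<Inst, std::set<Inst>> UsersOf = {{"adrp", {"add"}},
                                            {"add", {"ldr"}}};
  std::cout << isSimpleChain({"adrp", "add", "ldr"}, UsersOf) << "\n"; // 1
  return 0;
}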
+//===----------------------------------------------------------------------===// + +#include "AArch64.h" +#include "AArch64InstrInfo.h" +#include "AArch64MachineFunctionInfo.h" +#include "MCTargetDesc/AArch64AddressingModes.h" +#include "llvm/ADT/BitVector.h" +#include "llvm/ADT/DenseMap.h" +#include "llvm/ADT/MapVector.h" +#include "llvm/ADT/SetVector.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/CodeGen/MachineBasicBlock.h" +#include "llvm/CodeGen/MachineDominators.h" +#include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/CodeGen/MachineInstr.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/Target/TargetInstrInfo.h" +#include "llvm/Target/TargetMachine.h" +#include "llvm/Target/TargetRegisterInfo.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/ErrorHandling.h" +#include "llvm/Support/raw_ostream.h" +#include "llvm/ADT/Statistic.h" +using namespace llvm; + +#define DEBUG_TYPE "aarch64-collect-loh" + +static cl::opt +PreCollectRegister("aarch64-collect-loh-pre-collect-register", cl::Hidden, + cl::desc("Restrict analysis to registers invovled" + " in LOHs"), + cl::init(true)); + +static cl::opt +BasicBlockScopeOnly("aarch64-collect-loh-bb-only", cl::Hidden, + cl::desc("Restrict analysis at basic block scope"), + cl::init(true)); + +STATISTIC(NumADRPSimpleCandidate, + "Number of simplifiable ADRP dominate by another"); +STATISTIC(NumADRPComplexCandidate2, + "Number of simplifiable ADRP reachable by 2 defs"); +STATISTIC(NumADRPComplexCandidate3, + "Number of simplifiable ADRP reachable by 3 defs"); +STATISTIC(NumADRPComplexCandidateOther, + "Number of simplifiable ADRP reachable by 4 or more defs"); +STATISTIC(NumADDToSTRWithImm, + "Number of simplifiable STR with imm reachable by ADD"); +STATISTIC(NumLDRToSTRWithImm, + "Number of simplifiable STR with imm reachable by LDR"); +STATISTIC(NumADDToSTR, "Number of simplifiable STR reachable by ADD"); +STATISTIC(NumLDRToSTR, "Number of simplifiable STR reachable by LDR"); +STATISTIC(NumADDToLDRWithImm, + "Number of simplifiable LDR with imm reachable by ADD"); +STATISTIC(NumLDRToLDRWithImm, + "Number of simplifiable LDR with imm reachable by LDR"); +STATISTIC(NumADDToLDR, "Number of simplifiable LDR reachable by ADD"); +STATISTIC(NumLDRToLDR, "Number of simplifiable LDR reachable by LDR"); +STATISTIC(NumADRPToLDR, "Number of simplifiable LDR reachable by ADRP"); +STATISTIC(NumCplxLvl1, "Number of complex case of level 1"); +STATISTIC(NumTooCplxLvl1, "Number of too complex case of level 1"); +STATISTIC(NumCplxLvl2, "Number of complex case of level 2"); +STATISTIC(NumTooCplxLvl2, "Number of too complex case of level 2"); +STATISTIC(NumADRSimpleCandidate, "Number of simplifiable ADRP + ADD"); +STATISTIC(NumADRComplexCandidate, "Number of too complex ADRP + ADD"); + +namespace llvm { +void initializeAArch64CollectLOHPass(PassRegistry &); +} + +namespace { +struct AArch64CollectLOH : public MachineFunctionPass { + static char ID; + AArch64CollectLOH() : MachineFunctionPass(ID) { + initializeAArch64CollectLOHPass(*PassRegistry::getPassRegistry()); + } + + bool runOnMachineFunction(MachineFunction &MF) override; + + const char *getPassName() const override { + return "AArch64 Collect Linker Optimization Hint (LOH)"; + } + + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.setPreservesAll(); + MachineFunctionPass::getAnalysisUsage(AU); + AU.addRequired(); + } + +private: +}; + +/// A set of MachineInstruction. 
+typedef SetVector SetOfMachineInstr; +/// Map a basic block to a set of instructions per register. +/// This is used to represent the exposed uses of a basic block +/// per register. +typedef MapVector +BlockToSetOfInstrsPerColor; +/// Map a basic block to an instruction per register. +/// This is used to represent the live-out definitions of a basic block +/// per register. +typedef MapVector +BlockToInstrPerColor; +/// Map an instruction to a set of instructions. Used to represent the +/// mapping def to reachable uses or use to definitions. +typedef MapVector InstrToInstrs; +/// Map a basic block to a BitVector. +/// This is used to record the kill registers per basic block. +typedef MapVector BlockToRegSet; + +/// Map a register to a dense id. +typedef DenseMap MapRegToId; +/// Map a dense id to a register. Used for debug purposes. +typedef SmallVector MapIdToReg; +} // end anonymous namespace. + +char AArch64CollectLOH::ID = 0; + +INITIALIZE_PASS_BEGIN(AArch64CollectLOH, "aarch64-collect-loh", + "AArch64 Collect Linker Optimization Hint (LOH)", false, + false) +INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree) +INITIALIZE_PASS_END(AArch64CollectLOH, "aarch64-collect-loh", + "AArch64 Collect Linker Optimization Hint (LOH)", false, + false) + +/// Given a couple (MBB, reg) get the corresponding set of instruction from +/// the given "sets". +/// If this couple does not reference any set, an empty set is added to "sets" +/// for this couple and returned. +/// \param nbRegs is used internally allocate some memory. It must be consistent +/// with the way sets is used. +static SetOfMachineInstr &getSet(BlockToSetOfInstrsPerColor &sets, + const MachineBasicBlock &MBB, unsigned reg, + unsigned nbRegs) { + SetOfMachineInstr *result; + BlockToSetOfInstrsPerColor::iterator it = sets.find(&MBB); + if (it != sets.end()) + result = it->second; + else + result = sets[&MBB] = new SetOfMachineInstr[nbRegs]; + + return result[reg]; +} + +/// Given a couple (reg, MI) get the corresponding set of instructions from the +/// the given "sets". +/// This is used to get the uses record in sets of a definition identified by +/// MI and reg, i.e., MI defines reg. +/// If the couple does not reference anything, an empty set is added to +/// "sets[reg]". +/// \pre set[reg] is valid. +static SetOfMachineInstr &getUses(InstrToInstrs *sets, unsigned reg, + const MachineInstr &MI) { + return sets[reg][&MI]; +} + +/// Same as getUses but does not modify the input map: sets. +/// \return NULL if the couple (reg, MI) is not in sets. +static const SetOfMachineInstr *getUses(const InstrToInstrs *sets, unsigned reg, + const MachineInstr &MI) { + InstrToInstrs::const_iterator Res = sets[reg].find(&MI); + if (Res != sets[reg].end()) + return &(Res->second); + return nullptr; +} + +/// Initialize the reaching definition algorithm: +/// For each basic block BB in MF, record: +/// - its kill set. +/// - its reachable uses (uses that are exposed to BB's predecessors). +/// - its the generated definitions. +/// \param DummyOp if not NULL, specifies a Dummy Operation to be added to +/// the list of uses of exposed defintions. +/// \param ADRPMode specifies to only consider ADRP instructions for generated +/// definition. It also consider definitions of ADRP instructions as uses and +/// ignore other uses. The ADRPMode is used to collect the information for LHO +/// that involve ADRP operation only. 
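initReachingDef(), defined next, seeds the Gen, Kill and ReachableUses sets; reachingDefAlgorithm() later iterates the usual forward dataflow equations to a fixed point. A self-contained sketch of those equations on plain bit-sets (the per-register, per-block structure of the real pass is collapsed into one bitset per block here):

#include <bitset>
#include <iostream>
#include <vector>

// Classic forward dataflow: In = union of predecessors' Out,
// Out = Gen | (In & ~Kill), iterated until nothing changes.
struct Block {
  std::vector<int> Preds;
  std::bitset<8> Gen, Kill, In, Out;
};

static void solve(std::vector<Block> &Blocks) {
  bool Changed = true;
  while (Changed) {
    Changed = false;
    for (Block &B : Blocks) {
      std::bitset<8> In;
      for (int P : B.Preds)
        In |= Blocks[P].Out;
      std::bitset<8> Out = B.Gen | (In & ~B.Kill);
      if (In != B.In || Out != B.Out) {
        B.In = In;
        B.Out = Out;
        Changed = true;
      }
    }
  }
}

int main() {
  // Block 0 defines value 0; block 1 kills it and defines value 1.
  std::vector<Block> Blocks(2);
  Blocks[0].Gen.set(0);
  Blocks[1].Preds = {0};
  Blocks[1].Kill.set(0);
  Blocks[1].Gen.set(1);
  solve(Blocks);
  std::cout << Blocks[1].In << " -> " << Blocks[1].Out << "\n"; // 00000001 -> 00000010
  return 0;
}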
+static void initReachingDef(MachineFunction &MF, + InstrToInstrs *ColorOpToReachedUses, + BlockToInstrPerColor &Gen, BlockToRegSet &Kill, + BlockToSetOfInstrsPerColor &ReachableUses, + const MapRegToId &RegToId, + const MachineInstr *DummyOp, bool ADRPMode) { + const TargetMachine &TM = MF.getTarget(); + const TargetRegisterInfo *TRI = TM.getRegisterInfo(); + + unsigned NbReg = RegToId.size(); + + for (MachineBasicBlock &MBB : MF) { + const MachineInstr **&BBGen = Gen[&MBB]; + BBGen = new const MachineInstr *[NbReg]; + memset(BBGen, 0, sizeof(const MachineInstr *) * NbReg); + + BitVector &BBKillSet = Kill[&MBB]; + BBKillSet.resize(NbReg); + for (const MachineInstr &MI : MBB) { + bool IsADRP = MI.getOpcode() == AArch64::ADRP; + + // Process uses first. + if (IsADRP || !ADRPMode) + for (const MachineOperand &MO : MI.operands()) { + // Treat ADRP def as use, as the goal of the analysis is to find + // ADRP defs reached by other ADRP defs. + if (!MO.isReg() || (!ADRPMode && !MO.isUse()) || + (ADRPMode && (!IsADRP || !MO.isDef()))) + continue; + unsigned CurReg = MO.getReg(); + MapRegToId::const_iterator ItCurRegId = RegToId.find(CurReg); + if (ItCurRegId == RegToId.end()) + continue; + CurReg = ItCurRegId->second; + + // if CurReg has not been defined, this use is reachable. + if (!BBGen[CurReg] && !BBKillSet.test(CurReg)) + getSet(ReachableUses, MBB, CurReg, NbReg).insert(&MI); + // current basic block definition for this color, if any, is in Gen. + if (BBGen[CurReg]) + getUses(ColorOpToReachedUses, CurReg, *BBGen[CurReg]).insert(&MI); + } + + // Process clobbers. + for (const MachineOperand &MO : MI.operands()) { + if (!MO.isRegMask()) + continue; + // Clobbers kill the related colors. + const uint32_t *PreservedRegs = MO.getRegMask(); + + // Set generated regs. + for (const auto Entry : RegToId) { + unsigned Reg = Entry.second; + // Use the global register ID when querying APIs external to this + // pass. + if (MachineOperand::clobbersPhysReg(PreservedRegs, Entry.first)) { + // Do not register clobbered definition for no ADRP. + // This definition is not used anyway (otherwise register + // allocation is wrong). + BBGen[Reg] = ADRPMode ? &MI : nullptr; + BBKillSet.set(Reg); + } + } + } + + // Process register defs. + for (const MachineOperand &MO : MI.operands()) { + if (!MO.isReg() || !MO.isDef()) + continue; + unsigned CurReg = MO.getReg(); + MapRegToId::const_iterator ItCurRegId = RegToId.find(CurReg); + if (ItCurRegId == RegToId.end()) + continue; + + for (MCRegAliasIterator AI(CurReg, TRI, true); AI.isValid(); ++AI) { + MapRegToId::const_iterator ItRegId = RegToId.find(*AI); + assert(ItRegId != RegToId.end() && + "Sub-register of an " + "involved register, not recorded as involved!"); + BBKillSet.set(ItRegId->second); + BBGen[ItRegId->second] = &MI; + } + BBGen[ItCurRegId->second] = &MI; + } + } + + // If we restrict our analysis to basic block scope, conservatively add a + // dummy + // use for each generated value. 
+ if (!ADRPMode && DummyOp && !MBB.succ_empty()) + for (unsigned CurReg = 0; CurReg < NbReg; ++CurReg) + if (BBGen[CurReg]) + getUses(ColorOpToReachedUses, CurReg, *BBGen[CurReg]).insert(DummyOp); + } +} + +/// Reaching def core algorithm: +/// while an Out has changed +/// for each bb +/// for each color +/// In[bb][color] = U Out[bb.predecessors][color] +/// insert reachableUses[bb][color] in each in[bb][color] +/// op.reachedUses +/// +/// Out[bb] = Gen[bb] U (In[bb] - Kill[bb]) +static void reachingDefAlgorithm(MachineFunction &MF, + InstrToInstrs *ColorOpToReachedUses, + BlockToSetOfInstrsPerColor &In, + BlockToSetOfInstrsPerColor &Out, + BlockToInstrPerColor &Gen, BlockToRegSet &Kill, + BlockToSetOfInstrsPerColor &ReachableUses, + unsigned NbReg) { + bool HasChanged; + do { + HasChanged = false; + for (MachineBasicBlock &MBB : MF) { + unsigned CurReg; + for (CurReg = 0; CurReg < NbReg; ++CurReg) { + SetOfMachineInstr &BBInSet = getSet(In, MBB, CurReg, NbReg); + SetOfMachineInstr &BBReachableUses = + getSet(ReachableUses, MBB, CurReg, NbReg); + SetOfMachineInstr &BBOutSet = getSet(Out, MBB, CurReg, NbReg); + unsigned Size = BBOutSet.size(); + // In[bb][color] = U Out[bb.predecessors][color] + for (MachineBasicBlock *PredMBB : MBB.predecessors()) { + SetOfMachineInstr &PredOutSet = getSet(Out, *PredMBB, CurReg, NbReg); + BBInSet.insert(PredOutSet.begin(), PredOutSet.end()); + } + // insert reachableUses[bb][color] in each in[bb][color] op.reachedses + for (const MachineInstr *MI : BBInSet) { + SetOfMachineInstr &OpReachedUses = + getUses(ColorOpToReachedUses, CurReg, *MI); + OpReachedUses.insert(BBReachableUses.begin(), BBReachableUses.end()); + } + // Out[bb] = Gen[bb] U (In[bb] - Kill[bb]) + if (!Kill[&MBB].test(CurReg)) + BBOutSet.insert(BBInSet.begin(), BBInSet.end()); + if (Gen[&MBB][CurReg]) + BBOutSet.insert(Gen[&MBB][CurReg]); + HasChanged |= BBOutSet.size() != Size; + } + } + } while (HasChanged); +} + +/// Release all memory dynamically allocated during the reaching +/// definition algorithm. +static void finitReachingDef(BlockToSetOfInstrsPerColor &In, + BlockToSetOfInstrsPerColor &Out, + BlockToInstrPerColor &Gen, + BlockToSetOfInstrsPerColor &ReachableUses) { + for (auto &IT : Out) + delete[] IT.second; + for (auto &IT : In) + delete[] IT.second; + for (auto &IT : ReachableUses) + delete[] IT.second; + for (auto &IT : Gen) + delete[] IT.second; +} + +/// Reaching definition algorithm. +/// \param MF function on which the algorithm will operate. +/// \param[out] ColorOpToReachedUses will contain the result of the reaching +/// def algorithm. +/// \param ADRPMode specify whether the reaching def algorithm should be tuned +/// for ADRP optimization. \see initReachingDef for more details. +/// \param DummyOp if not NULL, the algorithm will work at +/// basic block scope and will set for every exposed definition a use to +/// @p DummyOp. +/// \pre ColorOpToReachedUses is an array of at least number of registers of +/// InstrToInstrs. +static void reachingDef(MachineFunction &MF, + InstrToInstrs *ColorOpToReachedUses, + const MapRegToId &RegToId, bool ADRPMode = false, + const MachineInstr *DummyOp = nullptr) { + // structures: + // For each basic block. + // Out: a set per color of definitions that reach the + // out boundary of this block. + // In: Same as Out but for in boundary. + // Gen: generated color in this block (one operation per color). + // Kill: register set of killed color in this block. 
+ // ReachableUses: a set per color of uses (operation) reachable + // for "In" definitions. + BlockToSetOfInstrsPerColor Out, In, ReachableUses; + BlockToInstrPerColor Gen; + BlockToRegSet Kill; + + // Initialize Gen, kill and reachableUses. + initReachingDef(MF, ColorOpToReachedUses, Gen, Kill, ReachableUses, RegToId, + DummyOp, ADRPMode); + + // Algo. + if (!DummyOp) + reachingDefAlgorithm(MF, ColorOpToReachedUses, In, Out, Gen, Kill, + ReachableUses, RegToId.size()); + + // finit. + finitReachingDef(In, Out, Gen, ReachableUses); +} + +#ifndef NDEBUG +/// print the result of the reaching definition algorithm. +static void printReachingDef(const InstrToInstrs *ColorOpToReachedUses, + unsigned NbReg, const TargetRegisterInfo *TRI, + const MapIdToReg &IdToReg) { + unsigned CurReg; + for (CurReg = 0; CurReg < NbReg; ++CurReg) { + if (ColorOpToReachedUses[CurReg].empty()) + continue; + DEBUG(dbgs() << "*** Reg " << PrintReg(IdToReg[CurReg], TRI) << " ***\n"); + + for (const auto &DefsIt : ColorOpToReachedUses[CurReg]) { + DEBUG(dbgs() << "Def:\n"); + DEBUG(DefsIt.first->print(dbgs())); + DEBUG(dbgs() << "Reachable uses:\n"); + for (const MachineInstr *MI : DefsIt.second) { + DEBUG(MI->print(dbgs())); + } + } + } +} +#endif // NDEBUG + +/// Answer the following question: Can Def be one of the definition +/// involved in a part of a LOH? +static bool canDefBePartOfLOH(const MachineInstr *Def) { + unsigned Opc = Def->getOpcode(); + // Accept ADRP, ADDLow and LOADGot. + switch (Opc) { + default: + return false; + case AArch64::ADRP: + return true; + case AArch64::ADDXri: + // Check immediate to see if the immediate is an address. + switch (Def->getOperand(2).getType()) { + default: + return false; + case MachineOperand::MO_GlobalAddress: + case MachineOperand::MO_JumpTableIndex: + case MachineOperand::MO_ConstantPoolIndex: + case MachineOperand::MO_BlockAddress: + return true; + } + case AArch64::LDRXui: + // Check immediate to see if the immediate is an address. + switch (Def->getOperand(2).getType()) { + default: + return false; + case MachineOperand::MO_GlobalAddress: + return true; + } + } + // Unreachable. + return false; +} + +/// Check whether the given instruction can the end of a LOH chain involving a +/// store. +static bool isCandidateStore(const MachineInstr *Instr) { + switch (Instr->getOpcode()) { + default: + return false; + case AArch64::STRBui: + case AArch64::STRHui: + case AArch64::STRWui: + case AArch64::STRXui: + case AArch64::STRSui: + case AArch64::STRDui: + case AArch64::STRQui: + // In case we have str xA, [xA, #imm], this is two different uses + // of xA and we cannot fold, otherwise the xA stored may be wrong, + // even if #imm == 0. + if (Instr->getOperand(0).getReg() != Instr->getOperand(1).getReg()) + return true; + } + return false; +} + +/// Given the result of a reaching definition algorithm in ColorOpToReachedUses, +/// Build the Use to Defs information and filter out obvious non-LOH candidates. +/// In ADRPMode, non-LOH candidates are "uses" with non-ADRP definitions. +/// In non-ADRPMode, non-LOH candidates are "uses" with several definition, +/// i.e., no simple chain. +/// \param ADRPMode -- \see initReachingDef. 
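As its comment says, the function defined next inverts the def-to-reached-uses results into a use-to-defs map and discards any use reached by more than one definition. The same inversion on a toy map, with strings in place of MachineInstr pointers:

#include <iostream>
#include <map>
#include <set>
#include <string>

using DefToUses = std::map<std::string, std::set<std::string>>;
using UseToDefs = std::map<std::string, std::set<std::string>>;

// Invert def -> reached-uses into use -> defs, then drop any use reached by
// more than one definition, since it cannot be part of a simple LOH chain.
static UseToDefs invertAndFilter(const DefToUses &Defs) {
  UseToDefs Uses;
  for (const auto &Entry : Defs)
    for (const std::string &U : Entry.second)
      Uses[U].insert(Entry.first);
  for (auto &Entry : Uses)
    if (Entry.second.size() > 1)
      Entry.second.clear(); // like the pass: keep the key, drop the defs
  return Uses;
}

int main() {
  DefToUses Defs = {{"adrp1", {"ldr"}}, {"adrp2", {"ldr"}}, {"add", {"str"}}};
  UseToDefs Uses = invertAndFilter(Defs);
  std::cout << "ldr has " << Uses["ldr"].size() << " candidate def(s)\n"; // 0
  std::cout << "str has " << Uses["str"].size() << " candidate def(s)\n"; // 1
  return 0;
}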
+static void reachedUsesToDefs(InstrToInstrs &UseToReachingDefs, + const InstrToInstrs *ColorOpToReachedUses, + const MapRegToId &RegToId, + bool ADRPMode = false) { + + SetOfMachineInstr NotCandidate; + unsigned NbReg = RegToId.size(); + MapRegToId::const_iterator EndIt = RegToId.end(); + for (unsigned CurReg = 0; CurReg < NbReg; ++CurReg) { + // If this color is never defined, continue. + if (ColorOpToReachedUses[CurReg].empty()) + continue; + + for (const auto &DefsIt : ColorOpToReachedUses[CurReg]) { + for (const MachineInstr *MI : DefsIt.second) { + const MachineInstr *Def = DefsIt.first; + MapRegToId::const_iterator It; + // if all the reaching defs are not adrp, this use will not be + // simplifiable. + if ((ADRPMode && Def->getOpcode() != AArch64::ADRP) || + (!ADRPMode && !canDefBePartOfLOH(Def)) || + (!ADRPMode && isCandidateStore(MI) && + // store are LOH candidate iff the end of the chain is used as + // base. + ((It = RegToId.find((MI)->getOperand(1).getReg())) == EndIt || + It->second != CurReg))) { + NotCandidate.insert(MI); + continue; + } + // Do not consider self reaching as a simplifiable case for ADRP. + if (!ADRPMode || MI != DefsIt.first) { + UseToReachingDefs[MI].insert(DefsIt.first); + // If UsesIt has several reaching definitions, it is not + // candidate for simplificaton in non-ADRPMode. + if (!ADRPMode && UseToReachingDefs[MI].size() > 1) + NotCandidate.insert(MI); + } + } + } + } + for (const MachineInstr *Elem : NotCandidate) { + DEBUG(dbgs() << "Too many reaching defs: " << *Elem << "\n"); + // It would have been better if we could just remove the entry + // from the map. Because of that, we have to filter the garbage + // (second.empty) in the subsequence analysis. + UseToReachingDefs[Elem].clear(); + } +} + +/// Based on the use to defs information (in ADRPMode), compute the +/// opportunities of LOH ADRP-related. +static void computeADRP(const InstrToInstrs &UseToDefs, + AArch64FunctionInfo &AArch64FI, + const MachineDominatorTree *MDT) { + DEBUG(dbgs() << "*** Compute LOH for ADRP\n"); + for (const auto &Entry : UseToDefs) { + unsigned Size = Entry.second.size(); + if (Size == 0) + continue; + if (Size == 1) { + const MachineInstr *L2 = *Entry.second.begin(); + const MachineInstr *L1 = Entry.first; + if (!MDT->dominates(L2, L1)) { + DEBUG(dbgs() << "Dominance check failed:\n" << *L2 << '\n' << *L1 + << '\n'); + continue; + } + DEBUG(dbgs() << "Record AdrpAdrp:\n" << *L2 << '\n' << *L1 << '\n'); + SmallVector Args; + Args.push_back(L2); + Args.push_back(L1); + AArch64FI.addLOHDirective(MCLOH_AdrpAdrp, Args); + ++NumADRPSimpleCandidate; + } +#ifdef DEBUG + else if (Size == 2) + ++NumADRPComplexCandidate2; + else if (Size == 3) + ++NumADRPComplexCandidate3; + else + ++NumADRPComplexCandidateOther; +#endif + // if Size < 1, the use should have been removed from the candidates + assert(Size >= 1 && "No reaching defs for that use!"); + } +} + +/// Check whether the given instruction can be the end of a LOH chain +/// involving a load. +static bool isCandidateLoad(const MachineInstr *Instr) { + switch (Instr->getOpcode()) { + default: + return false; + case AArch64::LDRSBWui: + case AArch64::LDRSBXui: + case AArch64::LDRSHWui: + case AArch64::LDRSHXui: + case AArch64::LDRSWui: + case AArch64::LDRBui: + case AArch64::LDRHui: + case AArch64::LDRWui: + case AArch64::LDRXui: + case AArch64::LDRSui: + case AArch64::LDRDui: + case AArch64::LDRQui: + if (Instr->getOperand(2).getTargetFlags() & AArch64II::MO_GOT) + return false; + return true; + } + // Unreachable. 
+ return false; +} + +/// Check whether the given instruction can load a litteral. +static bool supportLoadFromLiteral(const MachineInstr *Instr) { + switch (Instr->getOpcode()) { + default: + return false; + case AArch64::LDRSWui: + case AArch64::LDRWui: + case AArch64::LDRXui: + case AArch64::LDRSui: + case AArch64::LDRDui: + case AArch64::LDRQui: + return true; + } + // Unreachable. + return false; +} + +/// Check whether the given instruction is a LOH candidate. +/// \param UseToDefs is used to check that Instr is at the end of LOH supported +/// chain. +/// \pre UseToDefs contains only on def per use, i.e., obvious non candidate are +/// already been filtered out. +static bool isCandidate(const MachineInstr *Instr, + const InstrToInstrs &UseToDefs, + const MachineDominatorTree *MDT) { + if (!isCandidateLoad(Instr) && !isCandidateStore(Instr)) + return false; + + const MachineInstr *Def = *UseToDefs.find(Instr)->second.begin(); + if (Def->getOpcode() != AArch64::ADRP) { + // At this point, Def is ADDXri or LDRXui of the right type of + // symbol, because we filtered out the uses that were not defined + // by these kind of instructions (+ ADRP). + + // Check if this forms a simple chain: each intermediate node must + // dominates the next one. + if (!MDT->dominates(Def, Instr)) + return false; + // Move one node up in the simple chain. + if (UseToDefs.find(Def) == + UseToDefs.end() + // The map may contain garbage we have to ignore. + || + UseToDefs.find(Def)->second.empty()) + return false; + Instr = Def; + Def = *UseToDefs.find(Def)->second.begin(); + } + // Check if we reached the top of the simple chain: + // - top is ADRP. + // - check the simple chain property: each intermediate node must + // dominates the next one. + if (Def->getOpcode() == AArch64::ADRP) + return MDT->dominates(Def, Instr); + return false; +} + +static bool registerADRCandidate(const MachineInstr &Use, + const InstrToInstrs &UseToDefs, + const InstrToInstrs *DefsPerColorToUses, + AArch64FunctionInfo &AArch64FI, + SetOfMachineInstr *InvolvedInLOHs, + const MapRegToId &RegToId) { + // Look for opportunities to turn ADRP -> ADD or + // ADRP -> LDR GOTPAGEOFF into ADR. + // If ADRP has more than one use. Give up. + if (Use.getOpcode() != AArch64::ADDXri && + (Use.getOpcode() != AArch64::LDRXui || + !(Use.getOperand(2).getTargetFlags() & AArch64II::MO_GOT))) + return false; + InstrToInstrs::const_iterator It = UseToDefs.find(&Use); + // The map may contain garbage that we need to ignore. + if (It == UseToDefs.end() || It->second.empty()) + return false; + const MachineInstr &Def = **It->second.begin(); + if (Def.getOpcode() != AArch64::ADRP) + return false; + // Check the number of users of ADRP. + const SetOfMachineInstr *Users = + getUses(DefsPerColorToUses, + RegToId.find(Def.getOperand(0).getReg())->second, Def); + if (Users->size() > 1) { + ++NumADRComplexCandidate; + return false; + } + ++NumADRSimpleCandidate; + assert((!InvolvedInLOHs || InvolvedInLOHs->insert(&Def)) && + "ADRP already involved in LOH."); + assert((!InvolvedInLOHs || InvolvedInLOHs->insert(&Use)) && + "ADD already involved in LOH."); + DEBUG(dbgs() << "Record AdrpAdd\n" << Def << '\n' << Use << '\n'); + + SmallVector Args; + Args.push_back(&Def); + Args.push_back(&Use); + + AArch64FI.addLOHDirective(Use.getOpcode() == AArch64::ADDXri ? 
MCLOH_AdrpAdd + : MCLOH_AdrpLdrGot, + Args); + return true; +} + +/// Based on the use to defs information (in non-ADRPMode), compute the +/// opportunities of LOH non-ADRP-related +static void computeOthers(const InstrToInstrs &UseToDefs, + const InstrToInstrs *DefsPerColorToUses, + AArch64FunctionInfo &AArch64FI, const MapRegToId &RegToId, + const MachineDominatorTree *MDT) { + SetOfMachineInstr *InvolvedInLOHs = nullptr; +#ifdef DEBUG + SetOfMachineInstr InvolvedInLOHsStorage; + InvolvedInLOHs = &InvolvedInLOHsStorage; +#endif // DEBUG + DEBUG(dbgs() << "*** Compute LOH for Others\n"); + // ADRP -> ADD/LDR -> LDR/STR pattern. + // Fall back to ADRP -> ADD pattern if we fail to catch the bigger pattern. + + // FIXME: When the statistics are not important, + // This initial filtering loop can be merged into the next loop. + // Currently, we didn't do it to have the same code for both DEBUG and + // NDEBUG builds. Indeed, the iterator of the second loop would need + // to be changed. + SetOfMachineInstr PotentialCandidates; + SetOfMachineInstr PotentialADROpportunities; + for (auto &Use : UseToDefs) { + // If no definition is available, this is a non candidate. + if (Use.second.empty()) + continue; + // Keep only instructions that are load or store and at the end of + // a ADRP -> ADD/LDR/Nothing chain. + // We already filtered out the no-chain cases. + if (!isCandidate(Use.first, UseToDefs, MDT)) { + PotentialADROpportunities.insert(Use.first); + continue; + } + PotentialCandidates.insert(Use.first); + } + + // Make the following distinctions for statistics as the linker does + // know how to decode instructions: + // - ADD/LDR/Nothing make there different patterns. + // - LDR/STR make two different patterns. + // Hence, 6 - 1 base patterns. + // (because ADRP-> Nothing -> STR is not simplifiable) + + // The linker is only able to have a simple semantic, i.e., if pattern A + // do B. + // However, we want to see the opportunity we may miss if we were able to + // catch more complex cases. + + // PotentialCandidates are result of a chain ADRP -> ADD/LDR -> + // A potential candidate becomes a candidate, if its current immediate + // operand is zero and all nodes of the chain have respectively only one user +#ifdef DEBUG + SetOfMachineInstr DefsOfPotentialCandidates; +#endif + for (const MachineInstr *Candidate : PotentialCandidates) { + // Get the definition of the candidate i.e., ADD or LDR. + const MachineInstr *Def = *UseToDefs.find(Candidate)->second.begin(); + // Record the elements of the chain. + const MachineInstr *L1 = Def; + const MachineInstr *L2 = nullptr; + unsigned ImmediateDefOpc = Def->getOpcode(); + if (Def->getOpcode() != AArch64::ADRP) { + // Check the number of users of this node. + const SetOfMachineInstr *Users = + getUses(DefsPerColorToUses, + RegToId.find(Def->getOperand(0).getReg())->second, *Def); + if (Users->size() > 1) { +#ifdef DEBUG + // if all the uses of this def are in potential candidate, this is + // a complex candidate of level 2. + bool IsLevel2 = true; + for (const MachineInstr *MI : *Users) { + if (!PotentialCandidates.count(MI)) { + ++NumTooCplxLvl2; + IsLevel2 = false; + break; + } + } + if (IsLevel2) + ++NumCplxLvl2; +#endif // DEBUG + PotentialADROpportunities.insert(Def); + continue; + } + L2 = Def; + Def = *UseToDefs.find(Def)->second.begin(); + L1 = Def; + } // else the element in the middle of the chain is nothing, thus + // Def already contains the first element of the chain. 
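For orientation, the directive-kind selection that follows boils down to the shape of the chain: what sits between the ADRP and the final memory access, and whether that access is a load or a store. A small standalone sketch of that mapping (the Mid/Access enums and pickLOH are invented for illustration; the real code uses MCLOHType values and also enforces the literal-load and GOT-flag restrictions shown below):

    // Illustrative sketch (not part of this patch): chain shape -> LOH kind.
    #include <cassert>
    #include <cstdio>

    enum class Mid { None, Add, LdrGot };  // what sits between ADRP and the access
    enum class Access { Load, Store };

    const char *pickLOH(Mid M, Access A) {
      if (A == Access::Load) {
        if (M == Mid::None) return "AdrpLdr";     // ADRP -> LDR
        if (M == Mid::Add)  return "AdrpAddLdr";  // ADRP -> ADD -> LDR
        return "AdrpLdrGotLdr";                   // ADRP -> LDR(GOT) -> LDR
      }
      // A bare ADRP -> STR chain is not simplifiable, so stores need a middle link.
      assert(M != Mid::None && "no LOH for ADRP -> STR");
      return M == Mid::Add ? "AdrpAddStr" : "AdrpLdrGotStr";
    }

    int main() {
      std::printf("%s\n", pickLOH(Mid::Add, Access::Load));     // AdrpAddLdr
      std::printf("%s\n", pickLOH(Mid::LdrGot, Access::Store)); // AdrpLdrGotStr
    }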
+ + // Check the number of users of the first node in the chain, i.e., ADRP + const SetOfMachineInstr *Users = + getUses(DefsPerColorToUses, + RegToId.find(Def->getOperand(0).getReg())->second, *Def); + if (Users->size() > 1) { +#ifdef DEBUG + // if all the uses of this def are in the defs of the potential candidate, + // this is a complex candidate of level 1 + if (DefsOfPotentialCandidates.empty()) { + // lazy init + DefsOfPotentialCandidates = PotentialCandidates; + for (const MachineInstr *Candidate : PotentialCandidates) { + if (!UseToDefs.find(Candidate)->second.empty()) + DefsOfPotentialCandidates.insert( + *UseToDefs.find(Candidate)->second.begin()); + } + } + bool Found = false; + for (auto &Use : *Users) { + if (!DefsOfPotentialCandidates.count(Use)) { + ++NumTooCplxLvl1; + Found = true; + break; + } + } + if (!Found) + ++NumCplxLvl1; +#endif // DEBUG + continue; + } + + bool IsL2Add = (ImmediateDefOpc == AArch64::ADDXri); + // If the chain is three instructions long and ldr is the second element, + // then this ldr must load form GOT, otherwise this is not a correct chain. + if (L2 && !IsL2Add && L2->getOperand(2).getTargetFlags() != AArch64II::MO_GOT) + continue; + SmallVector Args; + MCLOHType Kind; + if (isCandidateLoad(Candidate)) { + if (!L2) { + // At this point, the candidate LOH indicates that the ldr instruction + // may use a direct access to the symbol. There is not such encoding + // for loads of byte and half. + if (!supportLoadFromLiteral(Candidate)) + continue; + + DEBUG(dbgs() << "Record AdrpLdr:\n" << *L1 << '\n' << *Candidate + << '\n'); + Kind = MCLOH_AdrpLdr; + Args.push_back(L1); + Args.push_back(Candidate); + assert((!InvolvedInLOHs || InvolvedInLOHs->insert(L1)) && + "L1 already involved in LOH."); + assert((!InvolvedInLOHs || InvolvedInLOHs->insert(Candidate)) && + "Candidate already involved in LOH."); + ++NumADRPToLDR; + } else { + DEBUG(dbgs() << "Record Adrp" << (IsL2Add ? "Add" : "LdrGot") + << "Ldr:\n" << *L1 << '\n' << *L2 << '\n' << *Candidate + << '\n'); + + Kind = IsL2Add ? MCLOH_AdrpAddLdr : MCLOH_AdrpLdrGotLdr; + Args.push_back(L1); + Args.push_back(L2); + Args.push_back(Candidate); + + PotentialADROpportunities.remove(L2); + assert((!InvolvedInLOHs || InvolvedInLOHs->insert(L1)) && + "L1 already involved in LOH."); + assert((!InvolvedInLOHs || InvolvedInLOHs->insert(L2)) && + "L2 already involved in LOH."); + assert((!InvolvedInLOHs || InvolvedInLOHs->insert(Candidate)) && + "Candidate already involved in LOH."); +#ifdef DEBUG + // get the immediate of the load + if (Candidate->getOperand(2).getImm() == 0) + if (ImmediateDefOpc == AArch64::ADDXri) + ++NumADDToLDR; + else + ++NumLDRToLDR; + else if (ImmediateDefOpc == AArch64::ADDXri) + ++NumADDToLDRWithImm; + else + ++NumLDRToLDRWithImm; +#endif // DEBUG + } + } else { + if (ImmediateDefOpc == AArch64::ADRP) + continue; + else { + + DEBUG(dbgs() << "Record Adrp" << (IsL2Add ? "Add" : "LdrGot") + << "Str:\n" << *L1 << '\n' << *L2 << '\n' << *Candidate + << '\n'); + + Kind = IsL2Add ? 
MCLOH_AdrpAddStr : MCLOH_AdrpLdrGotStr; + Args.push_back(L1); + Args.push_back(L2); + Args.push_back(Candidate); + + PotentialADROpportunities.remove(L2); + assert((!InvolvedInLOHs || InvolvedInLOHs->insert(L1)) && + "L1 already involved in LOH."); + assert((!InvolvedInLOHs || InvolvedInLOHs->insert(L2)) && + "L2 already involved in LOH."); + assert((!InvolvedInLOHs || InvolvedInLOHs->insert(Candidate)) && + "Candidate already involved in LOH."); +#ifdef DEBUG + // get the immediate of the store + if (Candidate->getOperand(2).getImm() == 0) + if (ImmediateDefOpc == AArch64::ADDXri) + ++NumADDToSTR; + else + ++NumLDRToSTR; + else if (ImmediateDefOpc == AArch64::ADDXri) + ++NumADDToSTRWithImm; + else + ++NumLDRToSTRWithImm; +#endif // DEBUG + } + } + AArch64FI.addLOHDirective(Kind, Args); + } + + // Now, we grabbed all the big patterns, check ADR opportunities. + for (const MachineInstr *Candidate : PotentialADROpportunities) + registerADRCandidate(*Candidate, UseToDefs, DefsPerColorToUses, AArch64FI, + InvolvedInLOHs, RegToId); +} + +/// Look for every register defined by potential LOHs candidates. +/// Map these registers with dense id in @p RegToId and vice-versa in +/// @p IdToReg. @p IdToReg is populated only in DEBUG mode. +static void collectInvolvedReg(MachineFunction &MF, MapRegToId &RegToId, + MapIdToReg &IdToReg, + const TargetRegisterInfo *TRI) { + unsigned CurRegId = 0; + if (!PreCollectRegister) { + unsigned NbReg = TRI->getNumRegs(); + for (; CurRegId < NbReg; ++CurRegId) { + RegToId[CurRegId] = CurRegId; + DEBUG(IdToReg.push_back(CurRegId)); + DEBUG(assert(IdToReg[CurRegId] == CurRegId && "Reg index mismatches")); + } + return; + } + + DEBUG(dbgs() << "** Collect Involved Register\n"); + for (const auto &MBB : MF) { + for (const MachineInstr &MI : MBB) { + if (!canDefBePartOfLOH(&MI)) + continue; + + // Process defs + for (MachineInstr::const_mop_iterator IO = MI.operands_begin(), + IOEnd = MI.operands_end(); + IO != IOEnd; ++IO) { + if (!IO->isReg() || !IO->isDef()) + continue; + unsigned CurReg = IO->getReg(); + for (MCRegAliasIterator AI(CurReg, TRI, true); AI.isValid(); ++AI) + if (RegToId.find(*AI) == RegToId.end()) { + DEBUG(IdToReg.push_back(*AI); + assert(IdToReg[CurRegId] == *AI && + "Reg index mismatches insertion index.")); + RegToId[*AI] = CurRegId++; + DEBUG(dbgs() << "Register: " << PrintReg(*AI, TRI) << '\n'); + } + } + } + } +} + +bool AArch64CollectLOH::runOnMachineFunction(MachineFunction &MF) { + const TargetMachine &TM = MF.getTarget(); + const TargetRegisterInfo *TRI = TM.getRegisterInfo(); + const MachineDominatorTree *MDT = &getAnalysis(); + + MapRegToId RegToId; + MapIdToReg IdToReg; + AArch64FunctionInfo *AArch64FI = MF.getInfo(); + assert(AArch64FI && "No MachineFunctionInfo for this function!"); + + DEBUG(dbgs() << "Looking for LOH in " << MF.getName() << '\n'); + + collectInvolvedReg(MF, RegToId, IdToReg, TRI); + if (RegToId.empty()) + return false; + + MachineInstr *DummyOp = nullptr; + if (BasicBlockScopeOnly) { + const AArch64InstrInfo *TII = + static_cast(TM.getInstrInfo()); + // For local analysis, create a dummy operation to record uses that are not + // local. + DummyOp = MF.CreateMachineInstr(TII->get(AArch64::COPY), DebugLoc()); + } + + unsigned NbReg = RegToId.size(); + bool Modified = false; + + // Start with ADRP. + InstrToInstrs *ColorOpToReachedUses = new InstrToInstrs[NbReg]; + + // Compute the reaching def in ADRP mode, meaning ADRP definitions + // are first considered as uses. 
+ reachingDef(MF, ColorOpToReachedUses, RegToId, true, DummyOp); + DEBUG(dbgs() << "ADRP reaching defs\n"); + DEBUG(printReachingDef(ColorOpToReachedUses, NbReg, TRI, IdToReg)); + + // Translate the definition to uses map into a use to definitions map to ease + // statistic computation. + InstrToInstrs ADRPToReachingDefs; + reachedUsesToDefs(ADRPToReachingDefs, ColorOpToReachedUses, RegToId, true); + + // Compute LOH for ADRP. + computeADRP(ADRPToReachingDefs, *AArch64FI, MDT); + delete[] ColorOpToReachedUses; + + // Continue with general ADRP -> ADD/LDR -> LDR/STR pattern. + ColorOpToReachedUses = new InstrToInstrs[NbReg]; + + // first perform a regular reaching def analysis. + reachingDef(MF, ColorOpToReachedUses, RegToId, false, DummyOp); + DEBUG(dbgs() << "All reaching defs\n"); + DEBUG(printReachingDef(ColorOpToReachedUses, NbReg, TRI, IdToReg)); + + // Turn that into a use to defs to ease statistic computation. + InstrToInstrs UsesToReachingDefs; + reachedUsesToDefs(UsesToReachingDefs, ColorOpToReachedUses, RegToId, false); + + // Compute other than AdrpAdrp LOH. + computeOthers(UsesToReachingDefs, ColorOpToReachedUses, *AArch64FI, RegToId, + MDT); + delete[] ColorOpToReachedUses; + + if (BasicBlockScopeOnly) + MF.DeleteMachineInstr(DummyOp); + + return Modified; +} + +/// createAArch64CollectLOHPass - returns an instance of the Statistic for +/// linker optimization pass. +FunctionPass *llvm::createAArch64CollectLOHPass() { + return new AArch64CollectLOH(); +} diff --git a/lib/Target/AArch64/AArch64ConditionalCompares.cpp b/lib/Target/AArch64/AArch64ConditionalCompares.cpp new file mode 100644 index 00000000000..452cdecf8a0 --- /dev/null +++ b/lib/Target/AArch64/AArch64ConditionalCompares.cpp @@ -0,0 +1,919 @@ +//===-- AArch64ConditionalCompares.cpp --- CCMP formation for AArch64 -----===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file implements the AArch64ConditionalCompares pass which reduces +// branching and code size by using the conditional compare instructions CCMP, +// CCMN, and FCMP. +// +// The CFG transformations for forming conditional compares are very similar to +// if-conversion, and this pass should run immediately before the early +// if-conversion pass. +// +//===----------------------------------------------------------------------===// + +#include "AArch64.h" +#include "llvm/ADT/BitVector.h" +#include "llvm/ADT/DepthFirstIterator.h" +#include "llvm/ADT/SetVector.h" +#include "llvm/ADT/SmallPtrSet.h" +#include "llvm/ADT/SparseSet.h" +#include "llvm/ADT/Statistic.h" +#include "llvm/CodeGen/MachineBranchProbabilityInfo.h" +#include "llvm/CodeGen/MachineDominators.h" +#include "llvm/CodeGen/MachineFunction.h" +#include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/MachineLoopInfo.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/CodeGen/MachineTraceMetrics.h" +#include "llvm/CodeGen/Passes.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/raw_ostream.h" +#include "llvm/Target/TargetInstrInfo.h" +#include "llvm/Target/TargetRegisterInfo.h" +#include "llvm/Target/TargetSubtargetInfo.h" + +using namespace llvm; + +#define DEBUG_TYPE "aarch64-ccmp" + +// Absolute maximum number of instructions allowed per speculated block. 
+// This bypasses all other heuristics, so it should be set fairly high. +static cl::opt BlockInstrLimit( + "aarch64-ccmp-limit", cl::init(30), cl::Hidden, + cl::desc("Maximum number of instructions per speculated block.")); + +// Stress testing mode - disable heuristics. +static cl::opt Stress("aarch64-stress-ccmp", cl::Hidden, + cl::desc("Turn all knobs to 11")); + +STATISTIC(NumConsidered, "Number of ccmps considered"); +STATISTIC(NumPhiRejs, "Number of ccmps rejected (PHI)"); +STATISTIC(NumPhysRejs, "Number of ccmps rejected (Physregs)"); +STATISTIC(NumPhi2Rejs, "Number of ccmps rejected (PHI2)"); +STATISTIC(NumHeadBranchRejs, "Number of ccmps rejected (Head branch)"); +STATISTIC(NumCmpBranchRejs, "Number of ccmps rejected (CmpBB branch)"); +STATISTIC(NumCmpTermRejs, "Number of ccmps rejected (CmpBB is cbz...)"); +STATISTIC(NumImmRangeRejs, "Number of ccmps rejected (Imm out of range)"); +STATISTIC(NumLiveDstRejs, "Number of ccmps rejected (Cmp dest live)"); +STATISTIC(NumMultNZCVUses, "Number of ccmps rejected (NZCV used)"); +STATISTIC(NumUnknNZCVDefs, "Number of ccmps rejected (NZCV def unknown)"); + +STATISTIC(NumSpeculateRejs, "Number of ccmps rejected (Can't speculate)"); + +STATISTIC(NumConverted, "Number of ccmp instructions created"); +STATISTIC(NumCompBranches, "Number of cbz/cbnz branches converted"); + +//===----------------------------------------------------------------------===// +// SSACCmpConv +//===----------------------------------------------------------------------===// +// +// The SSACCmpConv class performs ccmp-conversion on SSA form machine code +// after determining if it is possible. The class contains no heuristics; +// external code should be used to determine when ccmp-conversion is a good +// idea. +// +// CCmp-formation works on a CFG representing chained conditions, typically +// from C's short-circuit || and && operators: +// +// From: Head To: Head +// / | CmpBB +// / | / | +// | CmpBB / | +// | / | Tail | +// | / | | | +// Tail | | | +// | | | | +// ... ... ... ... +// +// The Head block is terminated by a br.cond instruction, and the CmpBB block +// contains compare + br.cond. Tail must be a successor of both. +// +// The cmp-conversion turns the compare instruction in CmpBB into a conditional +// compare, and merges CmpBB into Head, speculatively executing its +// instructions. The AArch64 conditional compare instructions have an immediate +// operand that specifies the NZCV flag values when the condition is false and +// the compare isn't executed. This makes it possible to chain compares with +// different condition codes. +// +// Example: +// +// if (a == 5 || b == 17) +// foo(); +// +// Head: +// cmp w0, #5 +// b.eq Tail +// CmpBB: +// cmp w1, #17 +// b.eq Tail +// ... +// Tail: +// bl _foo +// +// Becomes: +// +// Head: +// cmp w0, #5 +// ccmp w1, #17, 4, ne ; 4 = nZcv +// b.eq Tail +// ... +// Tail: +// bl _foo +// +// The ccmp condition code is the one that would cause the Head terminator to +// branch to CmpBB. +// +// FIXME: It should also be possible to speculate a block on the critical edge +// between Head and Tail, just like if-converting a diamond. +// +// FIXME: Handle PHIs in Tail by turning them into selects (if-conversion). + +namespace { +class SSACCmpConv { + MachineFunction *MF; + const TargetInstrInfo *TII; + const TargetRegisterInfo *TRI; + MachineRegisterInfo *MRI; + +public: + /// The first block containing a conditional branch, dominating everything + /// else. 
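To make the worked example above concrete: ccmp performs its comparison only when the condition attached to it held, and otherwise installs the immediate NZCV value (#4 sets only Z, hence "nZcv"). A tiny standalone model of just that behaviour, reduced to the Z flag (Flags, cmp and ccmp here are illustrative names, not LLVM or ISA definitions):

    // Illustrative sketch (not part of this patch): semantics of
    // "cmp w0, #5; ccmp w1, #17, 4, ne; b.eq" for "if (a == 5 || b == 17)".
    #include <cassert>

    struct Flags { bool Z; };                        // only Z matters for b.eq

    Flags cmp(long LHS, long RHS) { return {LHS == RHS}; }

    // ccmp: perform the compare only if the attached condition held, otherwise
    // install the immediate NZCV value (here #4, i.e. Z set).
    Flags ccmp(Flags Prev, bool CondIsNE, long LHS, long RHS, Flags ImmFlags) {
      bool CondHeld = CondIsNE ? !Prev.Z : Prev.Z;
      return CondHeld ? cmp(LHS, RHS) : ImmFlags;
    }

    bool takesBranchEQ(long a, long b) {
      Flags F = cmp(a, 5);                                    // cmp  w0, #5
      F = ccmp(F, /*ne*/ true, b, 17, /*#4 = nZcv*/ {true});  // ccmp w1, #17, 4, ne
      return F.Z;                                             // b.eq Tail
    }

    int main() {
      assert(takesBranchEQ(5, 0));    // a == 5: second compare skipped, Z forced
      assert(takesBranchEQ(1, 17));   // a != 5: second compare decides
      assert(!takesBranchEQ(1, 0));   // neither condition holds
    }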
+ MachineBasicBlock *Head; + + /// The block containing cmp+br.cond with a successor shared with Head. + MachineBasicBlock *CmpBB; + + /// The common successor for Head and CmpBB. + MachineBasicBlock *Tail; + + /// The compare instruction in CmpBB that can be converted to a ccmp. + MachineInstr *CmpMI; + +private: + /// The branch condition in Head as determined by AnalyzeBranch. + SmallVector HeadCond; + + /// The condition code that makes Head branch to CmpBB. + AArch64CC::CondCode HeadCmpBBCC; + + /// The branch condition in CmpBB. + SmallVector CmpBBCond; + + /// The condition code that makes CmpBB branch to Tail. + AArch64CC::CondCode CmpBBTailCC; + + /// Check if the Tail PHIs are trivially convertible. + bool trivialTailPHIs(); + + /// Remove CmpBB from the Tail PHIs. + void updateTailPHIs(); + + /// Check if an operand defining DstReg is dead. + bool isDeadDef(unsigned DstReg); + + /// Find the compare instruction in MBB that controls the conditional branch. + /// Return NULL if a convertible instruction can't be found. + MachineInstr *findConvertibleCompare(MachineBasicBlock *MBB); + + /// Return true if all non-terminator instructions in MBB can be safely + /// speculated. + bool canSpeculateInstrs(MachineBasicBlock *MBB, const MachineInstr *CmpMI); + +public: + /// runOnMachineFunction - Initialize per-function data structures. + void runOnMachineFunction(MachineFunction &MF) { + this->MF = &MF; + TII = MF.getTarget().getInstrInfo(); + TRI = MF.getTarget().getRegisterInfo(); + MRI = &MF.getRegInfo(); + } + + /// If the sub-CFG headed by MBB can be cmp-converted, initialize the + /// internal state, and return true. + bool canConvert(MachineBasicBlock *MBB); + + /// Cmo-convert the last block passed to canConvertCmp(), assuming + /// it is possible. Add any erased blocks to RemovedBlocks. + void convert(SmallVectorImpl &RemovedBlocks); + + /// Return the expected code size delta if the conversion into a + /// conditional compare is performed. + int expectedCodeSizeDelta() const; +}; +} // end anonymous namespace + +// Check that all PHIs in Tail are selecting the same value from Head and CmpBB. +// This means that no if-conversion is required when merging CmpBB into Head. +bool SSACCmpConv::trivialTailPHIs() { + for (auto &I : *Tail) { + if (!I.isPHI()) + break; + unsigned HeadReg = 0, CmpBBReg = 0; + // PHI operands come in (VReg, MBB) pairs. + for (unsigned oi = 1, oe = I.getNumOperands(); oi != oe; oi += 2) { + MachineBasicBlock *MBB = I.getOperand(oi + 1).getMBB(); + unsigned Reg = I.getOperand(oi).getReg(); + if (MBB == Head) { + assert((!HeadReg || HeadReg == Reg) && "Inconsistent PHI operands"); + HeadReg = Reg; + } + if (MBB == CmpBB) { + assert((!CmpBBReg || CmpBBReg == Reg) && "Inconsistent PHI operands"); + CmpBBReg = Reg; + } + } + if (HeadReg != CmpBBReg) + return false; + } + return true; +} + +// Assuming that trivialTailPHIs() is true, update the Tail PHIs by simply +// removing the CmpBB operands. The Head operands will be identical. +void SSACCmpConv::updateTailPHIs() { + for (auto &I : *Tail) { + if (!I.isPHI()) + break; + // I is a PHI. It can have multiple entries for CmpBB. + for (unsigned oi = I.getNumOperands(); oi > 2; oi -= 2) { + // PHI operands are (Reg, MBB) at (oi-2, oi-1). + if (I.getOperand(oi - 1).getMBB() == CmpBB) { + I.RemoveOperand(oi - 1); + I.RemoveOperand(oi - 2); + } + } + } +} + +// This pass runs before the AArch64DeadRegisterDefinitions pass, so compares +// are still writing virtual registers without any uses. 
+bool SSACCmpConv::isDeadDef(unsigned DstReg) { + // Writes to the zero register are dead. + if (DstReg == AArch64::WZR || DstReg == AArch64::XZR) + return true; + if (!TargetRegisterInfo::isVirtualRegister(DstReg)) + return false; + // A virtual register def without any uses will be marked dead later, and + // eventually replaced by the zero register. + return MRI->use_nodbg_empty(DstReg); +} + +// Parse a condition code returned by AnalyzeBranch, and compute the CondCode +// corresponding to TBB. +// Return +static bool parseCond(ArrayRef Cond, AArch64CC::CondCode &CC) { + // A normal br.cond simply has the condition code. + if (Cond[0].getImm() != -1) { + assert(Cond.size() == 1 && "Unknown Cond array format"); + CC = (AArch64CC::CondCode)(int)Cond[0].getImm(); + return true; + } + // For tbz and cbz instruction, the opcode is next. + switch (Cond[1].getImm()) { + default: + // This includes tbz / tbnz branches which can't be converted to + // ccmp + br.cond. + return false; + case AArch64::CBZW: + case AArch64::CBZX: + assert(Cond.size() == 3 && "Unknown Cond array format"); + CC = AArch64CC::EQ; + return true; + case AArch64::CBNZW: + case AArch64::CBNZX: + assert(Cond.size() == 3 && "Unknown Cond array format"); + CC = AArch64CC::NE; + return true; + } +} + +MachineInstr *SSACCmpConv::findConvertibleCompare(MachineBasicBlock *MBB) { + MachineBasicBlock::iterator I = MBB->getFirstTerminator(); + if (I == MBB->end()) + return nullptr; + // The terminator must be controlled by the flags. + if (!I->readsRegister(AArch64::NZCV)) { + switch (I->getOpcode()) { + case AArch64::CBZW: + case AArch64::CBZX: + case AArch64::CBNZW: + case AArch64::CBNZX: + // These can be converted into a ccmp against #0. + return I; + } + ++NumCmpTermRejs; + DEBUG(dbgs() << "Flags not used by terminator: " << *I); + return nullptr; + } + + // Now find the instruction controlling the terminator. + for (MachineBasicBlock::iterator B = MBB->begin(); I != B;) { + --I; + assert(!I->isTerminator() && "Spurious terminator"); + switch (I->getOpcode()) { + // cmp is an alias for subs with a dead destination register. + case AArch64::SUBSWri: + case AArch64::SUBSXri: + // cmn is an alias for adds with a dead destination register. + case AArch64::ADDSWri: + case AArch64::ADDSXri: + // Check that the immediate operand is within range, ccmp wants a uimm5. + // Rd = SUBSri Rn, imm, shift + if (I->getOperand(3).getImm() || !isUInt<5>(I->getOperand(2).getImm())) { + DEBUG(dbgs() << "Immediate out of range for ccmp: " << *I); + ++NumImmRangeRejs; + return nullptr; + } + // Fall through. + case AArch64::SUBSWrr: + case AArch64::SUBSXrr: + case AArch64::ADDSWrr: + case AArch64::ADDSXrr: + if (isDeadDef(I->getOperand(0).getReg())) + return I; + DEBUG(dbgs() << "Can't convert compare with live destination: " << *I); + ++NumLiveDstRejs; + return nullptr; + case AArch64::FCMPSrr: + case AArch64::FCMPDrr: + case AArch64::FCMPESrr: + case AArch64::FCMPEDrr: + return I; + } + + // Check for flag reads and clobbers. + MIOperands::PhysRegInfo PRI = + MIOperands(I).analyzePhysReg(AArch64::NZCV, TRI); + + if (PRI.Reads) { + // The ccmp doesn't produce exactly the same flags as the original + // compare, so reject the transform if there are uses of the flags + // besides the terminators. 
+ DEBUG(dbgs() << "Can't create ccmp with multiple uses: " << *I); + ++NumMultNZCVUses; + return nullptr; + } + + if (PRI.Clobbers) { + DEBUG(dbgs() << "Not convertible compare: " << *I); + ++NumUnknNZCVDefs; + return nullptr; + } + } + DEBUG(dbgs() << "Flags not defined in BB#" << MBB->getNumber() << '\n'); + return nullptr; +} + +/// Determine if all the instructions in MBB can safely +/// be speculated. The terminators are not considered. +/// +/// Only CmpMI is allowed to clobber the flags. +/// +bool SSACCmpConv::canSpeculateInstrs(MachineBasicBlock *MBB, + const MachineInstr *CmpMI) { + // Reject any live-in physregs. It's probably NZCV/EFLAGS, and very hard to + // get right. + if (!MBB->livein_empty()) { + DEBUG(dbgs() << "BB#" << MBB->getNumber() << " has live-ins.\n"); + return false; + } + + unsigned InstrCount = 0; + + // Check all instructions, except the terminators. It is assumed that + // terminators never have side effects or define any used register values. + for (auto &I : make_range(MBB->begin(), MBB->getFirstTerminator())) { + if (I.isDebugValue()) + continue; + + if (++InstrCount > BlockInstrLimit && !Stress) { + DEBUG(dbgs() << "BB#" << MBB->getNumber() << " has more than " + << BlockInstrLimit << " instructions.\n"); + return false; + } + + // There shouldn't normally be any phis in a single-predecessor block. + if (I.isPHI()) { + DEBUG(dbgs() << "Can't hoist: " << I); + return false; + } + + // Don't speculate loads. Note that it may be possible and desirable to + // speculate GOT or constant pool loads that are guaranteed not to trap, + // but we don't support that for now. + if (I.mayLoad()) { + DEBUG(dbgs() << "Won't speculate load: " << I); + return false; + } + + // We never speculate stores, so an AA pointer isn't necessary. + bool DontMoveAcrossStore = true; + if (!I.isSafeToMove(TII, nullptr, DontMoveAcrossStore)) { + DEBUG(dbgs() << "Can't speculate: " << I); + return false; + } + + // Only CmpMI is allowed to clobber the flags. + if (&I != CmpMI && I.modifiesRegister(AArch64::NZCV, TRI)) { + DEBUG(dbgs() << "Clobbers flags: " << I); + return false; + } + } + return true; +} + +/// Analyze the sub-cfg rooted in MBB, and return true if it is a potential +/// candidate for cmp-conversion. Fill out the internal state. +/// +bool SSACCmpConv::canConvert(MachineBasicBlock *MBB) { + Head = MBB; + Tail = CmpBB = nullptr; + + if (Head->succ_size() != 2) + return false; + MachineBasicBlock *Succ0 = Head->succ_begin()[0]; + MachineBasicBlock *Succ1 = Head->succ_begin()[1]; + + // CmpBB can only have a single predecessor. Tail is allowed many. + if (Succ0->pred_size() != 1) + std::swap(Succ0, Succ1); + + // Succ0 is our candidate for CmpBB. + if (Succ0->pred_size() != 1 || Succ0->succ_size() != 2) + return false; + + CmpBB = Succ0; + Tail = Succ1; + + if (!CmpBB->isSuccessor(Tail)) + return false; + + // The CFG topology checks out. + DEBUG(dbgs() << "\nTriangle: BB#" << Head->getNumber() << " -> BB#" + << CmpBB->getNumber() << " -> BB#" << Tail->getNumber() << '\n'); + ++NumConsidered; + + // Tail is allowed to have many predecessors, but we can't handle PHIs yet. + // + // FIXME: Real PHIs could be if-converted as long as the CmpBB values are + // defined before The CmpBB cmp clobbers the flags. Alternatively, it should + // always be safe to sink the ccmp down to immediately before the CmpBB + // terminators. 
+ if (!trivialTailPHIs()) { + DEBUG(dbgs() << "Can't handle phis in Tail.\n"); + ++NumPhiRejs; + return false; + } + + if (!Tail->livein_empty()) { + DEBUG(dbgs() << "Can't handle live-in physregs in Tail.\n"); + ++NumPhysRejs; + return false; + } + + // CmpBB should never have PHIs since Head is its only predecessor. + // FIXME: Clean them up if it happens. + if (!CmpBB->empty() && CmpBB->front().isPHI()) { + DEBUG(dbgs() << "Can't handle phis in CmpBB.\n"); + ++NumPhi2Rejs; + return false; + } + + if (!CmpBB->livein_empty()) { + DEBUG(dbgs() << "Can't handle live-in physregs in CmpBB.\n"); + ++NumPhysRejs; + return false; + } + + // The branch we're looking to eliminate must be analyzable. + HeadCond.clear(); + MachineBasicBlock *TBB = nullptr, *FBB = nullptr; + if (TII->AnalyzeBranch(*Head, TBB, FBB, HeadCond)) { + DEBUG(dbgs() << "Head branch not analyzable.\n"); + ++NumHeadBranchRejs; + return false; + } + + // This is weird, probably some sort of degenerate CFG, or an edge to a + // landing pad. + if (!TBB || HeadCond.empty()) { + DEBUG(dbgs() << "AnalyzeBranch didn't find conditional branch in Head.\n"); + ++NumHeadBranchRejs; + return false; + } + + if (!parseCond(HeadCond, HeadCmpBBCC)) { + DEBUG(dbgs() << "Unsupported branch type on Head\n"); + ++NumHeadBranchRejs; + return false; + } + + // Make sure the branch direction is right. + if (TBB != CmpBB) { + assert(TBB == Tail && "Unexpected TBB"); + HeadCmpBBCC = AArch64CC::getInvertedCondCode(HeadCmpBBCC); + } + + CmpBBCond.clear(); + TBB = FBB = nullptr; + if (TII->AnalyzeBranch(*CmpBB, TBB, FBB, CmpBBCond)) { + DEBUG(dbgs() << "CmpBB branch not analyzable.\n"); + ++NumCmpBranchRejs; + return false; + } + + if (!TBB || CmpBBCond.empty()) { + DEBUG(dbgs() << "AnalyzeBranch didn't find conditional branch in CmpBB.\n"); + ++NumCmpBranchRejs; + return false; + } + + if (!parseCond(CmpBBCond, CmpBBTailCC)) { + DEBUG(dbgs() << "Unsupported branch type on CmpBB\n"); + ++NumCmpBranchRejs; + return false; + } + + if (TBB != Tail) + CmpBBTailCC = AArch64CC::getInvertedCondCode(CmpBBTailCC); + + DEBUG(dbgs() << "Head->CmpBB on " << AArch64CC::getCondCodeName(HeadCmpBBCC) + << ", CmpBB->Tail on " << AArch64CC::getCondCodeName(CmpBBTailCC) + << '\n'); + + CmpMI = findConvertibleCompare(CmpBB); + if (!CmpMI) + return false; + + if (!canSpeculateInstrs(CmpBB, CmpMI)) { + ++NumSpeculateRejs; + return false; + } + return true; +} + +void SSACCmpConv::convert(SmallVectorImpl &RemovedBlocks) { + DEBUG(dbgs() << "Merging BB#" << CmpBB->getNumber() << " into BB#" + << Head->getNumber() << ":\n" << *CmpBB); + + // All CmpBB instructions are moved into Head, and CmpBB is deleted. + // Update the CFG first. + updateTailPHIs(); + Head->removeSuccessor(CmpBB); + CmpBB->removeSuccessor(Tail); + Head->transferSuccessorsAndUpdatePHIs(CmpBB); + DebugLoc TermDL = Head->getFirstTerminator()->getDebugLoc(); + TII->RemoveBranch(*Head); + + // If the Head terminator was one of the cbz / tbz branches with built-in + // compare, we need to insert an explicit compare instruction in its place. + if (HeadCond[0].getImm() == -1) { + ++NumCompBranches; + unsigned Opc = 0; + switch (HeadCond[1].getImm()) { + case AArch64::CBZW: + case AArch64::CBNZW: + Opc = AArch64::SUBSWri; + break; + case AArch64::CBZX: + case AArch64::CBNZX: + Opc = AArch64::SUBSXri; + break; + default: + llvm_unreachable("Cannot convert Head branch"); + } + const MCInstrDesc &MCID = TII->get(Opc); + // Create a dummy virtual register for the SUBS def. 
+ unsigned DestReg = + MRI->createVirtualRegister(TII->getRegClass(MCID, 0, TRI, *MF)); + // Insert a SUBS Rn, #0 instruction instead of the cbz / cbnz. + BuildMI(*Head, Head->end(), TermDL, MCID) + .addReg(DestReg, RegState::Define | RegState::Dead) + .addOperand(HeadCond[2]) + .addImm(0) + .addImm(0); + // SUBS uses the GPR*sp register classes. + MRI->constrainRegClass(HeadCond[2].getReg(), + TII->getRegClass(MCID, 1, TRI, *MF)); + } + + Head->splice(Head->end(), CmpBB, CmpBB->begin(), CmpBB->end()); + + // Now replace CmpMI with a ccmp instruction that also considers the incoming + // flags. + unsigned Opc = 0; + unsigned FirstOp = 1; // First CmpMI operand to copy. + bool isZBranch = false; // CmpMI is a cbz/cbnz instruction. + switch (CmpMI->getOpcode()) { + default: + llvm_unreachable("Unknown compare opcode"); + case AArch64::SUBSWri: Opc = AArch64::CCMPWi; break; + case AArch64::SUBSWrr: Opc = AArch64::CCMPWr; break; + case AArch64::SUBSXri: Opc = AArch64::CCMPXi; break; + case AArch64::SUBSXrr: Opc = AArch64::CCMPXr; break; + case AArch64::ADDSWri: Opc = AArch64::CCMNWi; break; + case AArch64::ADDSWrr: Opc = AArch64::CCMNWr; break; + case AArch64::ADDSXri: Opc = AArch64::CCMNXi; break; + case AArch64::ADDSXrr: Opc = AArch64::CCMNXr; break; + case AArch64::FCMPSrr: Opc = AArch64::FCCMPSrr; FirstOp = 0; break; + case AArch64::FCMPDrr: Opc = AArch64::FCCMPDrr; FirstOp = 0; break; + case AArch64::FCMPESrr: Opc = AArch64::FCCMPESrr; FirstOp = 0; break; + case AArch64::FCMPEDrr: Opc = AArch64::FCCMPEDrr; FirstOp = 0; break; + case AArch64::CBZW: + case AArch64::CBNZW: + Opc = AArch64::CCMPWi; + FirstOp = 0; + isZBranch = true; + break; + case AArch64::CBZX: + case AArch64::CBNZX: + Opc = AArch64::CCMPXi; + FirstOp = 0; + isZBranch = true; + break; + } + + // The ccmp instruction should set the flags according to the comparison when + // Head would have branched to CmpBB. + // The NZCV immediate operand should provide flags for the case where Head + // would have branched to Tail. These flags should cause the new Head + // terminator to branch to tail. + unsigned NZCV = AArch64CC::getNZCVToSatisfyCondCode(CmpBBTailCC); + const MCInstrDesc &MCID = TII->get(Opc); + MRI->constrainRegClass(CmpMI->getOperand(FirstOp).getReg(), + TII->getRegClass(MCID, 0, TRI, *MF)); + if (CmpMI->getOperand(FirstOp + 1).isReg()) + MRI->constrainRegClass(CmpMI->getOperand(FirstOp + 1).getReg(), + TII->getRegClass(MCID, 1, TRI, *MF)); + MachineInstrBuilder MIB = + BuildMI(*Head, CmpMI, CmpMI->getDebugLoc(), MCID) + .addOperand(CmpMI->getOperand(FirstOp)); // Register Rn + if (isZBranch) + MIB.addImm(0); // cbz/cbnz Rn -> ccmp Rn, #0 + else + MIB.addOperand(CmpMI->getOperand(FirstOp + 1)); // Register Rm / Immediate + MIB.addImm(NZCV).addImm(HeadCmpBBCC); + + // If CmpMI was a terminator, we need a new conditional branch to replace it. + // This now becomes a Head terminator. + if (isZBranch) { + bool isNZ = CmpMI->getOpcode() == AArch64::CBNZW || + CmpMI->getOpcode() == AArch64::CBNZX; + BuildMI(*Head, CmpMI, CmpMI->getDebugLoc(), TII->get(AArch64::Bcc)) + .addImm(isNZ ? AArch64CC::NE : AArch64CC::EQ) + .addOperand(CmpMI->getOperand(1)); // Branch target. 
+ } + CmpMI->eraseFromParent(); + Head->updateTerminator(); + + RemovedBlocks.push_back(CmpBB); + CmpBB->eraseFromParent(); + DEBUG(dbgs() << "Result:\n" << *Head); + ++NumConverted; +} + +int SSACCmpConv::expectedCodeSizeDelta() const { + int delta = 0; + // If the Head terminator was one of the cbz / tbz branches with built-in + // compare, we need to insert an explicit compare instruction in its place + // plus a branch instruction. + if (HeadCond[0].getImm() == -1) { + switch (HeadCond[1].getImm()) { + case AArch64::CBZW: + case AArch64::CBNZW: + case AArch64::CBZX: + case AArch64::CBNZX: + // Therefore delta += 1 + delta = 1; + break; + default: + llvm_unreachable("Cannot convert Head branch"); + } + } + // If the Cmp terminator was one of the cbz / tbz branches with + // built-in compare, it will be turned into a compare instruction + // into Head, but we do not save any instruction. + // Otherwise, we save the branch instruction. + switch (CmpMI->getOpcode()) { + default: + --delta; + break; + case AArch64::CBZW: + case AArch64::CBNZW: + case AArch64::CBZX: + case AArch64::CBNZX: + break; + } + return delta; +} + +//===----------------------------------------------------------------------===// +// AArch64ConditionalCompares Pass +//===----------------------------------------------------------------------===// + +namespace { +class AArch64ConditionalCompares : public MachineFunctionPass { + const TargetInstrInfo *TII; + const TargetRegisterInfo *TRI; + const MCSchedModel *SchedModel; + // Does the proceeded function has Oz attribute. + bool MinSize; + MachineRegisterInfo *MRI; + MachineDominatorTree *DomTree; + MachineLoopInfo *Loops; + MachineTraceMetrics *Traces; + MachineTraceMetrics::Ensemble *MinInstr; + SSACCmpConv CmpConv; + +public: + static char ID; + AArch64ConditionalCompares() : MachineFunctionPass(ID) {} + void getAnalysisUsage(AnalysisUsage &AU) const override; + bool runOnMachineFunction(MachineFunction &MF) override; + const char *getPassName() const override { + return "AArch64 Conditional Compares"; + } + +private: + bool tryConvert(MachineBasicBlock *); + void updateDomTree(ArrayRef Removed); + void updateLoops(ArrayRef Removed); + void invalidateTraces(); + bool shouldConvert(); +}; +} // end anonymous namespace + +char AArch64ConditionalCompares::ID = 0; + +namespace llvm { +void initializeAArch64ConditionalComparesPass(PassRegistry &); +} + +INITIALIZE_PASS_BEGIN(AArch64ConditionalCompares, "aarch64-ccmp", + "AArch64 CCMP Pass", false, false) +INITIALIZE_PASS_DEPENDENCY(MachineBranchProbabilityInfo) +INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree) +INITIALIZE_PASS_DEPENDENCY(MachineTraceMetrics) +INITIALIZE_PASS_END(AArch64ConditionalCompares, "aarch64-ccmp", + "AArch64 CCMP Pass", false, false) + +FunctionPass *llvm::createAArch64ConditionalCompares() { + return new AArch64ConditionalCompares(); +} + +void AArch64ConditionalCompares::getAnalysisUsage(AnalysisUsage &AU) const { + AU.addRequired(); + AU.addRequired(); + AU.addPreserved(); + AU.addRequired(); + AU.addPreserved(); + AU.addRequired(); + AU.addPreserved(); + MachineFunctionPass::getAnalysisUsage(AU); +} + +/// Update the dominator tree after if-conversion erased some blocks. +void AArch64ConditionalCompares::updateDomTree( + ArrayRef Removed) { + // convert() removes CmpBB which was previously dominated by Head. + // CmpBB children should be transferred to Head. 
+ MachineDomTreeNode *HeadNode = DomTree->getNode(CmpConv.Head); + for (unsigned i = 0, e = Removed.size(); i != e; ++i) { + MachineDomTreeNode *Node = DomTree->getNode(Removed[i]); + assert(Node != HeadNode && "Cannot erase the head node"); + assert(Node->getIDom() == HeadNode && "CmpBB should be dominated by Head"); + while (Node->getNumChildren()) + DomTree->changeImmediateDominator(Node->getChildren().back(), HeadNode); + DomTree->eraseNode(Removed[i]); + } +} + +/// Update LoopInfo after if-conversion. +void +AArch64ConditionalCompares::updateLoops(ArrayRef Removed) { + if (!Loops) + return; + for (unsigned i = 0, e = Removed.size(); i != e; ++i) + Loops->removeBlock(Removed[i]); +} + +/// Invalidate MachineTraceMetrics before if-conversion. +void AArch64ConditionalCompares::invalidateTraces() { + Traces->invalidate(CmpConv.Head); + Traces->invalidate(CmpConv.CmpBB); +} + +/// Apply cost model and heuristics to the if-conversion in IfConv. +/// Return true if the conversion is a good idea. +/// +bool AArch64ConditionalCompares::shouldConvert() { + // Stress testing mode disables all cost considerations. + if (Stress) + return true; + if (!MinInstr) + MinInstr = Traces->getEnsemble(MachineTraceMetrics::TS_MinInstrCount); + + // Head dominates CmpBB, so it is always included in its trace. + MachineTraceMetrics::Trace Trace = MinInstr->getTrace(CmpConv.CmpBB); + + // If code size is the main concern + if (MinSize) { + int CodeSizeDelta = CmpConv.expectedCodeSizeDelta(); + DEBUG(dbgs() << "Code size delta: " << CodeSizeDelta << '\n'); + // If we are minimizing the code size, do the conversion whatever + // the cost is. + if (CodeSizeDelta < 0) + return true; + if (CodeSizeDelta > 0) { + DEBUG(dbgs() << "Code size is increasing, give up on this one.\n"); + return false; + } + // CodeSizeDelta == 0, continue with the regular heuristics + } + + // Heuristic: The compare conversion delays the execution of the branch + // instruction because we must wait for the inputs to the second compare as + // well. The branch has no dependent instructions, but delaying it increases + // the cost of a misprediction. + // + // Set a limit on the delay we will accept. + unsigned DelayLimit = SchedModel->MispredictPenalty * 3 / 4; + + // Instruction depths can be computed for all trace instructions above CmpBB. + unsigned HeadDepth = + Trace.getInstrCycles(CmpConv.Head->getFirstTerminator()).Depth; + unsigned CmpBBDepth = + Trace.getInstrCycles(CmpConv.CmpBB->getFirstTerminator()).Depth; + DEBUG(dbgs() << "Head depth: " << HeadDepth + << "\nCmpBB depth: " << CmpBBDepth << '\n'); + if (CmpBBDepth > HeadDepth + DelayLimit) { + DEBUG(dbgs() << "Branch delay would be larger than " << DelayLimit + << " cycles.\n"); + return false; + } + + // Check the resource depth at the bottom of CmpBB - these instructions will + // be speculated. + unsigned ResDepth = Trace.getResourceDepth(true); + DEBUG(dbgs() << "Resources: " << ResDepth << '\n'); + + // Heuristic: The speculatively executed instructions must all be able to + // merge into the Head block. The Head critical path should dominate the + // resource cost of the speculated instructions. 
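The cost model around this point reduces to two comparisons on trace metrics: the second compare must not delay the branch by more than roughly three quarters of a misprediction penalty, and the speculated work must fit under Head's critical path. A pure-function sketch of that decision, with made-up cycle counts rather than LLVM's trace-metrics API:

    // Illustrative sketch (not part of this patch): the two throughput heuristics.
    #include <cassert>

    bool profitableCCmp(unsigned HeadDepth, unsigned CmpBBDepth,
                        unsigned ResourceDepth, unsigned MispredictPenalty) {
      // Tolerate delaying the branch by at most ~3/4 of a misprediction penalty.
      unsigned DelayLimit = MispredictPenalty * 3 / 4;
      if (CmpBBDepth > HeadDepth + DelayLimit)
        return false;                        // second compare arrives too late
      // The speculated instructions must fit under Head's critical path.
      return ResourceDepth <= HeadDepth;
    }

    int main() {
      assert(profitableCCmp(/*Head*/ 10, /*CmpBB*/ 12, /*Res*/ 8, /*Penalty*/ 8));
      assert(!profitableCCmp(10, 20, 8, 8));   // branch delayed by more than 6 cycles
      assert(!profitableCCmp(10, 12, 14, 8));  // too much speculated work
    }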
+ if (ResDepth > HeadDepth) { + DEBUG(dbgs() << "Too many instructions to speculate.\n"); + return false; + } + return true; +} + +bool AArch64ConditionalCompares::tryConvert(MachineBasicBlock *MBB) { + bool Changed = false; + while (CmpConv.canConvert(MBB) && shouldConvert()) { + invalidateTraces(); + SmallVector RemovedBlocks; + CmpConv.convert(RemovedBlocks); + Changed = true; + updateDomTree(RemovedBlocks); + updateLoops(RemovedBlocks); + } + return Changed; +} + +bool AArch64ConditionalCompares::runOnMachineFunction(MachineFunction &MF) { + DEBUG(dbgs() << "********** AArch64 Conditional Compares **********\n" + << "********** Function: " << MF.getName() << '\n'); + TII = MF.getTarget().getInstrInfo(); + TRI = MF.getTarget().getRegisterInfo(); + SchedModel = + MF.getTarget().getSubtarget().getSchedModel(); + MRI = &MF.getRegInfo(); + DomTree = &getAnalysis(); + Loops = getAnalysisIfAvailable(); + Traces = &getAnalysis(); + MinInstr = nullptr; + MinSize = MF.getFunction()->getAttributes().hasAttribute( + AttributeSet::FunctionIndex, Attribute::MinSize); + + bool Changed = false; + CmpConv.runOnMachineFunction(MF); + + // Visit blocks in dominator tree pre-order. The pre-order enables multiple + // cmp-conversions from the same head block. + // Note that updateDomTree() modifies the children of the DomTree node + // currently being visited. The df_iterator supports that; it doesn't look at + // child_begin() / child_end() until after a node has been visited. + for (auto *I : depth_first(DomTree)) + if (tryConvert(I->getBlock())) + Changed = true; + + return Changed; +} diff --git a/lib/Target/AArch64/AArch64DeadRegisterDefinitionsPass.cpp b/lib/Target/AArch64/AArch64DeadRegisterDefinitionsPass.cpp new file mode 100644 index 00000000000..a2d853c85fe --- /dev/null +++ b/lib/Target/AArch64/AArch64DeadRegisterDefinitionsPass.cpp @@ -0,0 +1,134 @@ +//==-- AArch64DeadRegisterDefinitions.cpp - Replace dead defs w/ zero reg --==// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// When allowed by the instruction, replace a dead definition of a GPR with +// the zero register. This makes the code a bit friendlier towards the +// hardware's register renamer. +//===----------------------------------------------------------------------===// + +#include "AArch64.h" +#include "AArch64RegisterInfo.h" +#include "llvm/ADT/Statistic.h" +#include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/CodeGen/MachineFunction.h" +#include "llvm/CodeGen/MachineInstr.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/raw_ostream.h" +using namespace llvm; + +#define DEBUG_TYPE "aarch64-dead-defs" + +STATISTIC(NumDeadDefsReplaced, "Number of dead definitions replaced"); + +namespace { +class AArch64DeadRegisterDefinitions : public MachineFunctionPass { +private: + const TargetRegisterInfo *TRI; + bool implicitlyDefinesOverlappingReg(unsigned Reg, const MachineInstr &MI); + bool processMachineBasicBlock(MachineBasicBlock &MBB); + bool usesFrameIndex(const MachineInstr &MI); +public: + static char ID; // Pass identification, replacement for typeid. 
+ explicit AArch64DeadRegisterDefinitions() : MachineFunctionPass(ID) {} + + virtual bool runOnMachineFunction(MachineFunction &F) override; + + const char *getPassName() const override { return "Dead register definitions"; } + + virtual void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.setPreservesCFG(); + MachineFunctionPass::getAnalysisUsage(AU); + } +}; +char AArch64DeadRegisterDefinitions::ID = 0; +} // end anonymous namespace + +bool AArch64DeadRegisterDefinitions::implicitlyDefinesOverlappingReg( + unsigned Reg, const MachineInstr &MI) { + for (const MachineOperand &MO : MI.implicit_operands()) + if (MO.isReg() && MO.isDef()) + if (TRI->regsOverlap(Reg, MO.getReg())) + return true; + return false; +} + +bool AArch64DeadRegisterDefinitions::usesFrameIndex(const MachineInstr &MI) { + for (const MachineOperand &Op : MI.uses()) + if (Op.isFI()) + return true; + return false; +} + +bool AArch64DeadRegisterDefinitions::processMachineBasicBlock( + MachineBasicBlock &MBB) { + bool Changed = false; + for (MachineInstr &MI : MBB) { + if (usesFrameIndex(MI)) { + // We need to skip this instruction because while it appears to have a + // dead def it uses a frame index which might expand into a multi + // instruction sequence during EPI. + DEBUG(dbgs() << " Ignoring, operand is frame index\n"); + continue; + } + for (int i = 0, e = MI.getDesc().getNumDefs(); i != e; ++i) { + MachineOperand &MO = MI.getOperand(i); + if (MO.isReg() && MO.isDead() && MO.isDef()) { + assert(!MO.isImplicit() && "Unexpected implicit def!"); + DEBUG(dbgs() << " Dead def operand #" << i << " in:\n "; + MI.print(dbgs())); + // Be careful not to change the register if it's a tied operand. + if (MI.isRegTiedToUseOperand(i)) { + DEBUG(dbgs() << " Ignoring, def is tied operand.\n"); + continue; + } + // Don't change the register if there's an implicit def of a subreg or + // supperreg. + if (implicitlyDefinesOverlappingReg(MO.getReg(), MI)) { + DEBUG(dbgs() << " Ignoring, implicitly defines overlap reg.\n"); + continue; + } + // Make sure the instruction take a register class that contains + // the zero register and replace it if so. + unsigned NewReg; + switch (MI.getDesc().OpInfo[i].RegClass) { + default: + DEBUG(dbgs() << " Ignoring, register is not a GPR.\n"); + continue; + case AArch64::GPR32RegClassID: + NewReg = AArch64::WZR; + break; + case AArch64::GPR64RegClassID: + NewReg = AArch64::XZR; + break; + } + DEBUG(dbgs() << " Replacing with zero register. New:\n "); + MO.setReg(NewReg); + DEBUG(MI.print(dbgs())); + ++NumDeadDefsReplaced; + } + } + } + return Changed; +} + +// Scan the function for instructions that have a dead definition of a +// register. Replace that register with the zero register when possible. 
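The rewrite performed above is small: a dead, untied GPR32/GPR64 definition becomes WZR or XZR and everything else is left alone. A toy model of that decision (Op and RC are invented stand-ins for MachineOperand and the register-class IDs; the frame-index and implicit-def-overlap checks performed by the pass are omitted here):

    // Illustrative sketch (not part of this patch): dead-def rewrite on toy operands.
    #include <cassert>
    #include <string>
    #include <vector>

    enum RC { RC32, RC64, RCOther };
    struct Op {
      std::string Reg;
      RC Cls;
      bool IsDef, IsDead, IsTied;
    };

    // Returns how many dead GPR defs were redirected to the zero register.
    unsigned replaceDeadDefs(std::vector<Op> &Ops) {
      unsigned N = 0;
      for (Op &O : Ops) {
        if (!O.IsDef || !O.IsDead || O.IsTied)
          continue;                          // only free-standing dead defs
        if (O.Cls == RC32)
          O.Reg = "WZR";
        else if (O.Cls == RC64)
          O.Reg = "XZR";
        else
          continue;                          // not a GPR: leave it alone
        ++N;
      }
      return N;
    }

    int main() {
      std::vector<Op> Ops = {{"w8", RC32, true, true, false},     // rewritten
                             {"x1", RC64, true, false, false},    // still live: kept
                             {"q0", RCOther, true, true, false}}; // not a GPR: kept
      assert(replaceDeadDefs(Ops) == 1 && Ops[0].Reg == "WZR");
    }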
+bool AArch64DeadRegisterDefinitions::runOnMachineFunction(MachineFunction &MF) { + TRI = MF.getTarget().getRegisterInfo(); + bool Changed = false; + DEBUG(dbgs() << "***** AArch64DeadRegisterDefinitions *****\n"); + + for (auto &MBB : MF) + if (processMachineBasicBlock(MBB)) + Changed = true; + return Changed; +} + +FunctionPass *llvm::createAArch64DeadRegisterDefinitions() { + return new AArch64DeadRegisterDefinitions(); +} diff --git a/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp b/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp new file mode 100644 index 00000000000..a76fd76e5ed --- /dev/null +++ b/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp @@ -0,0 +1,749 @@ +//==-- AArch64ExpandPseudoInsts.cpp - Expand pseudo instructions --*- C++ -*-=// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file contains a pass that expands pseudo instructions into target +// instructions to allow proper scheduling and other late optimizations. This +// pass should be run after register allocation but before the post-regalloc +// scheduling pass. +// +//===----------------------------------------------------------------------===// + +#include "MCTargetDesc/AArch64AddressingModes.h" +#include "AArch64InstrInfo.h" +#include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/Support/MathExtras.h" +using namespace llvm; + +namespace { +class AArch64ExpandPseudo : public MachineFunctionPass { +public: + static char ID; + AArch64ExpandPseudo() : MachineFunctionPass(ID) {} + + const AArch64InstrInfo *TII; + + bool runOnMachineFunction(MachineFunction &Fn) override; + + const char *getPassName() const override { + return "AArch64 pseudo instruction expansion pass"; + } + +private: + bool expandMBB(MachineBasicBlock &MBB); + bool expandMI(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI); + bool expandMOVImm(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, + unsigned BitSize); +}; +char AArch64ExpandPseudo::ID = 0; +} + +/// \brief Transfer implicit operands on the pseudo instruction to the +/// instructions created from the expansion. +static void transferImpOps(MachineInstr &OldMI, MachineInstrBuilder &UseMI, + MachineInstrBuilder &DefMI) { + const MCInstrDesc &Desc = OldMI.getDesc(); + for (unsigned i = Desc.getNumOperands(), e = OldMI.getNumOperands(); i != e; + ++i) { + const MachineOperand &MO = OldMI.getOperand(i); + assert(MO.isReg() && MO.getReg()); + if (MO.isUse()) + UseMI.addOperand(MO); + else + DefMI.addOperand(MO); + } +} + +/// \brief Helper function which extracts the specified 16-bit chunk from a +/// 64-bit value. +static uint64_t getChunk(uint64_t Imm, unsigned ChunkIdx) { + assert(ChunkIdx < 4 && "Out of range chunk index specified!"); + + return (Imm >> (ChunkIdx * 16)) & 0xFFFF; +} + +/// \brief Helper function which replicates a 16-bit chunk within a 64-bit +/// value. Indices correspond to element numbers in a v4i16. +static uint64_t replicateChunk(uint64_t Imm, unsigned FromIdx, unsigned ToIdx) { + assert((FromIdx < 4) && (ToIdx < 4) && "Out of range chunk index specified!"); + const unsigned ShiftAmt = ToIdx * 16; + + // Replicate the source chunk to the destination position. + const uint64_t Chunk = getChunk(Imm, FromIdx) << ShiftAmt; + // Clear the destination chunk. 
+ Imm &= ~(0xFFFFLL << ShiftAmt); + // Insert the replicated chunk. + return Imm | Chunk; +} + +/// \brief Helper function which tries to materialize a 64-bit value with an +/// ORR + MOVK instruction sequence. +static bool tryOrrMovk(uint64_t UImm, uint64_t OrrImm, MachineInstr &MI, + MachineBasicBlock &MBB, + MachineBasicBlock::iterator &MBBI, + const AArch64InstrInfo *TII, unsigned ChunkIdx) { + assert(ChunkIdx < 4 && "Out of range chunk index specified!"); + const unsigned ShiftAmt = ChunkIdx * 16; + + uint64_t Encoding; + if (AArch64_AM::processLogicalImmediate(OrrImm, 64, Encoding)) { + // Create the ORR-immediate instruction. + MachineInstrBuilder MIB = + BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(AArch64::ORRXri)) + .addOperand(MI.getOperand(0)) + .addReg(AArch64::XZR) + .addImm(Encoding); + + // Create the MOVK instruction. + const unsigned Imm16 = getChunk(UImm, ChunkIdx); + const unsigned DstReg = MI.getOperand(0).getReg(); + const bool DstIsDead = MI.getOperand(0).isDead(); + MachineInstrBuilder MIB1 = + BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(AArch64::MOVKXi)) + .addReg(DstReg, RegState::Define | getDeadRegState(DstIsDead)) + .addReg(DstReg) + .addImm(Imm16) + .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, ShiftAmt)); + + transferImpOps(MI, MIB, MIB1); + MI.eraseFromParent(); + return true; + } + + return false; +} + +/// \brief Check whether the given 16-bit chunk replicated to full 64-bit width +/// can be materialized with an ORR instruction. +static bool canUseOrr(uint64_t Chunk, uint64_t &Encoding) { + Chunk = (Chunk << 48) | (Chunk << 32) | (Chunk << 16) | Chunk; + + return AArch64_AM::processLogicalImmediate(Chunk, 64, Encoding); +} + +/// \brief Check for identical 16-bit chunks within the constant and if so +/// materialize them with a single ORR instruction. The remaining one or two +/// 16-bit chunks will be materialized with MOVK instructions. +/// +/// This allows us to materialize constants like |A|B|A|A| or |A|B|C|A| (order +/// of the chunks doesn't matter), assuming |A|A|A|A| can be materialized with +/// an ORR instruction. +/// +static bool tryToreplicateChunks(uint64_t UImm, MachineInstr &MI, + MachineBasicBlock &MBB, + MachineBasicBlock::iterator &MBBI, + const AArch64InstrInfo *TII) { + typedef DenseMap CountMap; + CountMap Counts; + + // Scan the constant and count how often every chunk occurs. + for (unsigned Idx = 0; Idx < 4; ++Idx) + ++Counts[getChunk(UImm, Idx)]; + + // Traverse the chunks to find one which occurs more than once. + for (CountMap::const_iterator Chunk = Counts.begin(), End = Counts.end(); + Chunk != End; ++Chunk) { + const uint64_t ChunkVal = Chunk->first; + const unsigned Count = Chunk->second; + + uint64_t Encoding = 0; + + // We are looking for chunks which have two or three instances and can be + // materialized with an ORR instruction. + if ((Count != 2 && Count != 3) || !canUseOrr(ChunkVal, Encoding)) + continue; + + const bool CountThree = Count == 3; + // Create the ORR-immediate instruction. + MachineInstrBuilder MIB = + BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(AArch64::ORRXri)) + .addOperand(MI.getOperand(0)) + .addReg(AArch64::XZR) + .addImm(Encoding); + + const unsigned DstReg = MI.getOperand(0).getReg(); + const bool DstIsDead = MI.getOperand(0).isDead(); + + unsigned ShiftAmt = 0; + uint64_t Imm16 = 0; + // Find the first chunk not materialized with the ORR instruction. 
+ for (; ShiftAmt < 64; ShiftAmt += 16) { + Imm16 = (UImm >> ShiftAmt) & 0xFFFF; + + if (Imm16 != ChunkVal) + break; + } + + // Create the first MOVK instruction. + MachineInstrBuilder MIB1 = + BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(AArch64::MOVKXi)) + .addReg(DstReg, + RegState::Define | getDeadRegState(DstIsDead && CountThree)) + .addReg(DstReg) + .addImm(Imm16) + .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, ShiftAmt)); + + // In case we have three instances the whole constant is now materialized + // and we can exit. + if (CountThree) { + transferImpOps(MI, MIB, MIB1); + MI.eraseFromParent(); + return true; + } + + // Find the remaining chunk which needs to be materialized. + for (ShiftAmt += 16; ShiftAmt < 64; ShiftAmt += 16) { + Imm16 = (UImm >> ShiftAmt) & 0xFFFF; + + if (Imm16 != ChunkVal) + break; + } + + // Create the second MOVK instruction. + MachineInstrBuilder MIB2 = + BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(AArch64::MOVKXi)) + .addReg(DstReg, RegState::Define | getDeadRegState(DstIsDead)) + .addReg(DstReg) + .addImm(Imm16) + .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, ShiftAmt)); + + transferImpOps(MI, MIB, MIB2); + MI.eraseFromParent(); + return true; + } + + return false; +} + +/// \brief Check whether this chunk matches the pattern '1...0...'. This pattern +/// starts a contiguous sequence of ones if we look at the bits from the LSB +/// towards the MSB. +static bool isStartChunk(uint64_t Chunk) { + if (Chunk == 0 || Chunk == UINT64_MAX) + return false; + + return (CountLeadingOnes_64(Chunk) + countTrailingZeros(Chunk)) == 64; +} + +/// \brief Check whether this chunk matches the pattern '0...1...' This pattern +/// ends a contiguous sequence of ones if we look at the bits from the LSB +/// towards the MSB. +static bool isEndChunk(uint64_t Chunk) { + if (Chunk == 0 || Chunk == UINT64_MAX) + return false; + + return (countLeadingZeros(Chunk) + CountTrailingOnes_64(Chunk)) == 64; +} + +/// \brief Clear or set all bits in the chunk at the given index. +static uint64_t updateImm(uint64_t Imm, unsigned Idx, bool Clear) { + const uint64_t Mask = 0xFFFF; + + if (Clear) + // Clear chunk in the immediate. + Imm &= ~(Mask << (Idx * 16)); + else + // Set all bits in the immediate for the particular chunk. + Imm |= Mask << (Idx * 16); + + return Imm; +} + +/// \brief Check whether the constant contains a sequence of contiguous ones, +/// which might be interrupted by one or two chunks. If so, materialize the +/// sequence of contiguous ones with an ORR instruction. +/// Materialize the chunks which are either interrupting the sequence or outside +/// of the sequence with a MOVK instruction. +/// +/// Assuming S is a chunk which starts the sequence (1...0...), E is a chunk +/// which ends the sequence (0...1...). Then we are looking for constants which +/// contain at least one S and E chunk. +/// E.g. |E|A|B|S|, |A|E|B|S| or |A|B|E|S|. +/// +/// We are also looking for constants like |S|A|B|E| where the contiguous +/// sequence of ones wraps around the MSB into the LSB. +/// +static bool trySequenceOfOnes(uint64_t UImm, MachineInstr &MI, + MachineBasicBlock &MBB, + MachineBasicBlock::iterator &MBBI, + const AArch64InstrInfo *TII) { + const int NotSet = -1; + const uint64_t Mask = 0xFFFF; + + int StartIdx = NotSet; + int EndIdx = NotSet; + // Try to find the chunks which start/end a contiguous sequence of ones. + for (int Idx = 0; Idx < 4; ++Idx) { + int64_t Chunk = getChunk(UImm, Idx); + // Sign extend the 16-bit chunk to 64-bit. 
+ Chunk = (Chunk << 48) >> 48; + + if (isStartChunk(Chunk)) + StartIdx = Idx; + else if (isEndChunk(Chunk)) + EndIdx = Idx; + } + + // Early exit in case we can't find a start/end chunk. + if (StartIdx == NotSet || EndIdx == NotSet) + return false; + + // Outside of the contiguous sequence of ones everything needs to be zero. + uint64_t Outside = 0; + // Chunks between the start and end chunk need to have all their bits set. + uint64_t Inside = Mask; + + // If our contiguous sequence of ones wraps around from the MSB into the LSB, + // just swap indices and pretend we are materializing a contiguous sequence + // of zeros surrounded by a contiguous sequence of ones. + if (StartIdx > EndIdx) { + std::swap(StartIdx, EndIdx); + std::swap(Outside, Inside); + } + + uint64_t OrrImm = UImm; + int FirstMovkIdx = NotSet; + int SecondMovkIdx = NotSet; + + // Find out which chunks we need to patch up to obtain a contiguous sequence + // of ones. + for (int Idx = 0; Idx < 4; ++Idx) { + const uint64_t Chunk = getChunk(UImm, Idx); + + // Check whether we are looking at a chunk which is not part of the + // contiguous sequence of ones. + if ((Idx < StartIdx || EndIdx < Idx) && Chunk != Outside) { + OrrImm = updateImm(OrrImm, Idx, Outside == 0); + + // Remember the index we need to patch. + if (FirstMovkIdx == NotSet) + FirstMovkIdx = Idx; + else + SecondMovkIdx = Idx; + + // Check whether we are looking a chunk which is part of the contiguous + // sequence of ones. + } else if (Idx > StartIdx && Idx < EndIdx && Chunk != Inside) { + OrrImm = updateImm(OrrImm, Idx, Inside != Mask); + + // Remember the index we need to patch. + if (FirstMovkIdx == NotSet) + FirstMovkIdx = Idx; + else + SecondMovkIdx = Idx; + } + } + assert(FirstMovkIdx != NotSet && "Constant materializable with single ORR!"); + + // Create the ORR-immediate instruction. + uint64_t Encoding = 0; + AArch64_AM::processLogicalImmediate(OrrImm, 64, Encoding); + MachineInstrBuilder MIB = + BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(AArch64::ORRXri)) + .addOperand(MI.getOperand(0)) + .addReg(AArch64::XZR) + .addImm(Encoding); + + const unsigned DstReg = MI.getOperand(0).getReg(); + const bool DstIsDead = MI.getOperand(0).isDead(); + + const bool SingleMovk = SecondMovkIdx == NotSet; + // Create the first MOVK instruction. + MachineInstrBuilder MIB1 = + BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(AArch64::MOVKXi)) + .addReg(DstReg, + RegState::Define | getDeadRegState(DstIsDead && SingleMovk)) + .addReg(DstReg) + .addImm(getChunk(UImm, FirstMovkIdx)) + .addImm( + AArch64_AM::getShifterImm(AArch64_AM::LSL, FirstMovkIdx * 16)); + + // Early exit in case we only need to emit a single MOVK instruction. + if (SingleMovk) { + transferImpOps(MI, MIB, MIB1); + MI.eraseFromParent(); + return true; + } + + // Create the second MOVK instruction. + MachineInstrBuilder MIB2 = + BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(AArch64::MOVKXi)) + .addReg(DstReg, RegState::Define | getDeadRegState(DstIsDead)) + .addReg(DstReg) + .addImm(getChunk(UImm, SecondMovkIdx)) + .addImm( + AArch64_AM::getShifterImm(AArch64_AM::LSL, SecondMovkIdx * 16)); + + transferImpOps(MI, MIB, MIB2); + MI.eraseFromParent(); + return true; +} + +/// \brief Expand a MOVi32imm or MOVi64imm pseudo instruction to one or more +/// real move-immediate instructions to synthesize the immediate. 
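Before the expansion routine itself, the chunk bookkeeping above can be modelled outside of LLVM. The sketch below re-implements the getChunk/replicateChunk helpers as free functions and walks one |A|B|A|A|-shaped constant; the sample constant and main() are illustrative only, and checking that the replicated value really encodes as a logical immediate is left to processLogicalImmediate in the real code.

    // Standalone model of the 16-bit "chunk" view of a 64-bit immediate.
    // Chunk 0 is the least significant 16 bits, chunk 3 the most significant.
    #include <cstdint>
    #include <cstdio>

    static uint64_t getChunk(uint64_t Imm, unsigned Idx) {
      return (Imm >> (Idx * 16)) & 0xFFFF;
    }

    static uint64_t replicateChunk(uint64_t Imm, unsigned From, unsigned To) {
      uint64_t Chunk = getChunk(Imm, From) << (To * 16);
      Imm &= ~(0xFFFFULL << (To * 16)); // clear the destination chunk
      return Imm | Chunk;               // insert the replicated chunk
    }

    int main() {
      // An |A|B|A|A| constant with A = 0x00ff and B = 0x1234. Chunks 0, 2 and 3
      // are identical, so replicating chunk 3 into chunk 1 yields
      // 0x00ff00ff00ff00ff, a run of eight contiguous ones repeated in every
      // 16-bit element and therefore a valid logical (ORR) immediate.
      uint64_t Imm = 0x00ff00ff123400ffULL;
      uint64_t OrrImm = replicateChunk(Imm, 3, 1);
      std::printf("ORR-able base: %016llx\n", (unsigned long long)OrrImm);
      // The expansion would then be roughly:
      //   orr  x0, xzr, #0x00ff00ff00ff00ff
      //   movk x0, #0x1234, lsl #16   ; patch chunk 1 back to B
      std::printf("movk chunk 1:  %04llx\n",
                  (unsigned long long)getChunk(Imm, 1));
      return 0;
    }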
+bool AArch64ExpandPseudo::expandMOVImm(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MBBI, + unsigned BitSize) { + MachineInstr &MI = *MBBI; + uint64_t Imm = MI.getOperand(1).getImm(); + const unsigned Mask = 0xFFFF; + + // Try a MOVI instruction (aka ORR-immediate with the zero register). + uint64_t UImm = Imm << (64 - BitSize) >> (64 - BitSize); + uint64_t Encoding; + if (AArch64_AM::processLogicalImmediate(UImm, BitSize, Encoding)) { + unsigned Opc = (BitSize == 32 ? AArch64::ORRWri : AArch64::ORRXri); + MachineInstrBuilder MIB = + BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(Opc)) + .addOperand(MI.getOperand(0)) + .addReg(BitSize == 32 ? AArch64::WZR : AArch64::XZR) + .addImm(Encoding); + transferImpOps(MI, MIB, MIB); + MI.eraseFromParent(); + return true; + } + + // Scan the immediate and count the number of 16-bit chunks which are either + // all ones or all zeros. + unsigned OneChunks = 0; + unsigned ZeroChunks = 0; + for (unsigned Shift = 0; Shift < BitSize; Shift += 16) { + const unsigned Chunk = (Imm >> Shift) & Mask; + if (Chunk == Mask) + OneChunks++; + else if (Chunk == 0) + ZeroChunks++; + } + + // Since we can't materialize the constant with a single ORR instruction, + // let's see whether we can materialize 3/4 of the constant with an ORR + // instruction and use an additional MOVK instruction to materialize the + // remaining 1/4. + // + // We are looking for constants with a pattern like: |A|X|B|X| or |X|A|X|B|. + // + // E.g. assuming |A|X|A|X| is a pattern which can be materialized with ORR, + // we would create the following instruction sequence: + // + // ORR x0, xzr, |A|X|A|X| + // MOVK x0, |B|, LSL #16 + // + // Only look at 64-bit constants which can't be materialized with a single + // instruction e.g. which have less than either three all zero or all one + // chunks. + // + // Ignore 32-bit constants here, they always can be materialized with a + // MOVZ/MOVN + MOVK pair. Since the 32-bit constant can't be materialized + // with a single ORR, the best sequence we can achieve is a ORR + MOVK pair. + // Thus we fall back to the default code below which in the best case creates + // a single MOVZ/MOVN instruction (in case one chunk is all zero or all one). + // + if (BitSize == 64 && OneChunks < 3 && ZeroChunks < 3) { + // If we interpret the 64-bit constant as a v4i16, are elements 0 and 2 + // identical? + if (getChunk(UImm, 0) == getChunk(UImm, 2)) { + // See if we can come up with a constant which can be materialized with + // ORR-immediate by replicating element 3 into element 1. + uint64_t OrrImm = replicateChunk(UImm, 3, 1); + if (tryOrrMovk(UImm, OrrImm, MI, MBB, MBBI, TII, 1)) + return true; + + // See if we can come up with a constant which can be materialized with + // ORR-immediate by replicating element 1 into element 3. + OrrImm = replicateChunk(UImm, 1, 3); + if (tryOrrMovk(UImm, OrrImm, MI, MBB, MBBI, TII, 3)) + return true; + + // If we interpret the 64-bit constant as a v4i16, are elements 1 and 3 + // identical? + } else if (getChunk(UImm, 1) == getChunk(UImm, 3)) { + // See if we can come up with a constant which can be materialized with + // ORR-immediate by replicating element 2 into element 0. + uint64_t OrrImm = replicateChunk(UImm, 2, 0); + if (tryOrrMovk(UImm, OrrImm, MI, MBB, MBBI, TII, 0)) + return true; + + // See if we can come up with a constant which can be materialized with + // ORR-immediate by replicating element 1 into element 3. 
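+      // (This second attempt actually replicates element 0 into element 2,
+      // mirroring the 2-into-0 case above; the MOVK then patches chunk 2.)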
+ OrrImm = replicateChunk(UImm, 0, 2); + if (tryOrrMovk(UImm, OrrImm, MI, MBB, MBBI, TII, 2)) + return true; + } + } + + // Check for identical 16-bit chunks within the constant and if so materialize + // them with a single ORR instruction. The remaining one or two 16-bit chunks + // will be materialized with MOVK instructions. + if (BitSize == 64 && tryToreplicateChunks(UImm, MI, MBB, MBBI, TII)) + return true; + + // Check whether the constant contains a sequence of contiguous ones, which + // might be interrupted by one or two chunks. If so, materialize the sequence + // of contiguous ones with an ORR instruction. Materialize the chunks which + // are either interrupting the sequence or outside of the sequence with a + // MOVK instruction. + if (BitSize == 64 && trySequenceOfOnes(UImm, MI, MBB, MBBI, TII)) + return true; + + // Use a MOVZ or MOVN instruction to set the high bits, followed by one or + // more MOVK instructions to insert additional 16-bit portions into the + // lower bits. + bool isNeg = false; + + // Use MOVN to materialize the high bits if we have more all one chunks + // than all zero chunks. + if (OneChunks > ZeroChunks) { + isNeg = true; + Imm = ~Imm; + } + + unsigned FirstOpc; + if (BitSize == 32) { + Imm &= (1LL << 32) - 1; + FirstOpc = (isNeg ? AArch64::MOVNWi : AArch64::MOVZWi); + } else { + FirstOpc = (isNeg ? AArch64::MOVNXi : AArch64::MOVZXi); + } + unsigned Shift = 0; // LSL amount for high bits with MOVZ/MOVN + unsigned LastShift = 0; // LSL amount for last MOVK + if (Imm != 0) { + unsigned LZ = countLeadingZeros(Imm); + unsigned TZ = countTrailingZeros(Imm); + Shift = ((63 - LZ) / 16) * 16; + LastShift = (TZ / 16) * 16; + } + unsigned Imm16 = (Imm >> Shift) & Mask; + unsigned DstReg = MI.getOperand(0).getReg(); + bool DstIsDead = MI.getOperand(0).isDead(); + MachineInstrBuilder MIB1 = + BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(FirstOpc)) + .addReg(DstReg, RegState::Define | + getDeadRegState(DstIsDead && Shift == LastShift)) + .addImm(Imm16) + .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, Shift)); + + // If a MOVN was used for the high bits of a negative value, flip the rest + // of the bits back for use with MOVK. + if (isNeg) + Imm = ~Imm; + + if (Shift == LastShift) { + transferImpOps(MI, MIB1, MIB1); + MI.eraseFromParent(); + return true; + } + + MachineInstrBuilder MIB2; + unsigned Opc = (BitSize == 32 ? AArch64::MOVKWi : AArch64::MOVKXi); + while (Shift != LastShift) { + Shift -= 16; + Imm16 = (Imm >> Shift) & Mask; + if (Imm16 == (isNeg ? Mask : 0)) + continue; // This 16-bit portion is already set correctly. + MIB2 = BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(Opc)) + .addReg(DstReg, + RegState::Define | + getDeadRegState(DstIsDead && Shift == LastShift)) + .addReg(DstReg) + .addImm(Imm16) + .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, Shift)); + } + + transferImpOps(MI, MIB1, MIB2); + MI.eraseFromParent(); + return true; +} + +/// \brief If MBBI references a pseudo instruction that should be expanded here, +/// do the expansion and return true. Otherwise return false. 
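Before the per-opcode expansion below, the generic MOVZ/MOVN + MOVK fallback of expandMOVImm is worth seeing in isolation. The sketch that follows is a deliberately simplified model in plain C++: it only handles 64-bit immediates, skips the ORR special cases, and always seeds the low 16 bits first, whereas the pass above starts from the highest interesting chunk; the function name and the sample constants are made up.

    // Simplified model of the MOVZ/MOVN + MOVK fallback for a 64-bit immediate.
    #include <cstdint>
    #include <cstdio>

    static void expandMovImm64(uint64_t Imm) {
      unsigned OneChunks = 0, ZeroChunks = 0;
      for (unsigned Shift = 0; Shift < 64; Shift += 16) {
        uint64_t Chunk = (Imm >> Shift) & 0xFFFF;
        OneChunks  += (Chunk == 0xFFFF);
        ZeroChunks += (Chunk == 0);
      }

      // Prefer MOVN when more chunks are all-ones: the first instruction then
      // leaves every untouched chunk as 0xFFFF instead of 0.
      bool UseMovn = OneChunks > ZeroChunks;
      uint64_t Seed = UseMovn ? ~Imm : Imm;
      uint64_t Skip = UseMovn ? 0xFFFF : 0; // chunks already correct afterwards

      std::printf("%s x0, #0x%llx\n", UseMovn ? "movn" : "movz",
                  (unsigned long long)(Seed & 0xFFFF));
      for (unsigned Shift = 16; Shift < 64; Shift += 16) {
        uint64_t Chunk = (Imm >> Shift) & 0xFFFF;
        if (Chunk == Skip)
          continue; // this 16-bit portion is already correct
        std::printf("movk x0, #0x%llx, lsl #%u\n", (unsigned long long)Chunk,
                    Shift);
      }
    }

    int main() {
      expandMovImm64(0x000000001234abcdULL); // movz + one movk
      expandMovImm64(0xffffffffffff0123ULL); // a single movn suffices
      return 0;
    }

The OneChunks/ZeroChunks counts play the same role as in the pass: they decide whether seeding with all-ones (MOVN) or all-zeros (MOVZ) leaves fewer chunks for MOVK to patch.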
+bool AArch64ExpandPseudo::expandMI(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MBBI) { + MachineInstr &MI = *MBBI; + unsigned Opcode = MI.getOpcode(); + switch (Opcode) { + default: + break; + + case AArch64::ADDWrr: + case AArch64::SUBWrr: + case AArch64::ADDXrr: + case AArch64::SUBXrr: + case AArch64::ADDSWrr: + case AArch64::SUBSWrr: + case AArch64::ADDSXrr: + case AArch64::SUBSXrr: + case AArch64::ANDWrr: + case AArch64::ANDXrr: + case AArch64::BICWrr: + case AArch64::BICXrr: + case AArch64::ANDSWrr: + case AArch64::ANDSXrr: + case AArch64::BICSWrr: + case AArch64::BICSXrr: + case AArch64::EONWrr: + case AArch64::EONXrr: + case AArch64::EORWrr: + case AArch64::EORXrr: + case AArch64::ORNWrr: + case AArch64::ORNXrr: + case AArch64::ORRWrr: + case AArch64::ORRXrr: { + unsigned Opcode; + switch (MI.getOpcode()) { + default: + return false; + case AArch64::ADDWrr: Opcode = AArch64::ADDWrs; break; + case AArch64::SUBWrr: Opcode = AArch64::SUBWrs; break; + case AArch64::ADDXrr: Opcode = AArch64::ADDXrs; break; + case AArch64::SUBXrr: Opcode = AArch64::SUBXrs; break; + case AArch64::ADDSWrr: Opcode = AArch64::ADDSWrs; break; + case AArch64::SUBSWrr: Opcode = AArch64::SUBSWrs; break; + case AArch64::ADDSXrr: Opcode = AArch64::ADDSXrs; break; + case AArch64::SUBSXrr: Opcode = AArch64::SUBSXrs; break; + case AArch64::ANDWrr: Opcode = AArch64::ANDWrs; break; + case AArch64::ANDXrr: Opcode = AArch64::ANDXrs; break; + case AArch64::BICWrr: Opcode = AArch64::BICWrs; break; + case AArch64::BICXrr: Opcode = AArch64::BICXrs; break; + case AArch64::ANDSWrr: Opcode = AArch64::ANDSWrs; break; + case AArch64::ANDSXrr: Opcode = AArch64::ANDSXrs; break; + case AArch64::BICSWrr: Opcode = AArch64::BICSWrs; break; + case AArch64::BICSXrr: Opcode = AArch64::BICSXrs; break; + case AArch64::EONWrr: Opcode = AArch64::EONWrs; break; + case AArch64::EONXrr: Opcode = AArch64::EONXrs; break; + case AArch64::EORWrr: Opcode = AArch64::EORWrs; break; + case AArch64::EORXrr: Opcode = AArch64::EORXrs; break; + case AArch64::ORNWrr: Opcode = AArch64::ORNWrs; break; + case AArch64::ORNXrr: Opcode = AArch64::ORNXrs; break; + case AArch64::ORRWrr: Opcode = AArch64::ORRWrs; break; + case AArch64::ORRXrr: Opcode = AArch64::ORRXrs; break; + } + MachineInstrBuilder MIB1 = + BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(Opcode), + MI.getOperand(0).getReg()) + .addOperand(MI.getOperand(1)) + .addOperand(MI.getOperand(2)) + .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, 0)); + transferImpOps(MI, MIB1, MIB1); + MI.eraseFromParent(); + return true; + } + + case AArch64::FCVTSHpseudo: { + MachineOperand Src = MI.getOperand(1); + Src.setImplicit(); + unsigned SrcH = + TII->getRegisterInfo().getSubReg(Src.getReg(), AArch64::hsub); + auto MIB = BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(AArch64::FCVTSHr)) + .addOperand(MI.getOperand(0)) + .addReg(SrcH, RegState::Undef) + .addOperand(Src); + transferImpOps(MI, MIB, MIB); + MI.eraseFromParent(); + return true; + } + case AArch64::LOADgot: { + // Expand into ADRP + LDR. 
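+    // Roughly, the pair built below is
+    //   adrp x0, :got:sym              ; x0 = 4 KiB page of sym's GOT slot
+    //   ldr  x0, [x0, :got_lo12:sym]   ; load the pointer from that slot
+    // MO_PAGE selects the page-relative high part of the address and
+    // MO_PAGEOFF | MO_NC the low 12 bits, with no overflow check on them.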
+ unsigned DstReg = MI.getOperand(0).getReg(); + const MachineOperand &MO1 = MI.getOperand(1); + unsigned Flags = MO1.getTargetFlags(); + MachineInstrBuilder MIB1 = + BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(AArch64::ADRP), DstReg); + MachineInstrBuilder MIB2 = + BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(AArch64::LDRXui)) + .addOperand(MI.getOperand(0)) + .addReg(DstReg); + + if (MO1.isGlobal()) { + MIB1.addGlobalAddress(MO1.getGlobal(), 0, Flags | AArch64II::MO_PAGE); + MIB2.addGlobalAddress(MO1.getGlobal(), 0, + Flags | AArch64II::MO_PAGEOFF | AArch64II::MO_NC); + } else if (MO1.isSymbol()) { + MIB1.addExternalSymbol(MO1.getSymbolName(), Flags | AArch64II::MO_PAGE); + MIB2.addExternalSymbol(MO1.getSymbolName(), + Flags | AArch64II::MO_PAGEOFF | AArch64II::MO_NC); + } else { + assert(MO1.isCPI() && + "Only expect globals, externalsymbols, or constant pools"); + MIB1.addConstantPoolIndex(MO1.getIndex(), MO1.getOffset(), + Flags | AArch64II::MO_PAGE); + MIB2.addConstantPoolIndex(MO1.getIndex(), MO1.getOffset(), + Flags | AArch64II::MO_PAGEOFF | + AArch64II::MO_NC); + } + + transferImpOps(MI, MIB1, MIB2); + MI.eraseFromParent(); + return true; + } + + case AArch64::MOVaddr: + case AArch64::MOVaddrJT: + case AArch64::MOVaddrCP: + case AArch64::MOVaddrBA: + case AArch64::MOVaddrTLS: + case AArch64::MOVaddrEXT: { + // Expand into ADRP + ADD. + unsigned DstReg = MI.getOperand(0).getReg(); + MachineInstrBuilder MIB1 = + BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(AArch64::ADRP), DstReg) + .addOperand(MI.getOperand(1)); + + MachineInstrBuilder MIB2 = + BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(AArch64::ADDXri)) + .addOperand(MI.getOperand(0)) + .addReg(DstReg) + .addOperand(MI.getOperand(2)) + .addImm(0); + + transferImpOps(MI, MIB1, MIB2); + MI.eraseFromParent(); + return true; + } + + case AArch64::MOVi32imm: + return expandMOVImm(MBB, MBBI, 32); + case AArch64::MOVi64imm: + return expandMOVImm(MBB, MBBI, 64); + case AArch64::RET_ReallyLR: + BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(AArch64::RET)) + .addReg(AArch64::LR); + MI.eraseFromParent(); + return true; + } + return false; +} + +/// \brief Iterate over the instructions in basic block MBB and expand any +/// pseudo instructions. Return true if anything was modified. +bool AArch64ExpandPseudo::expandMBB(MachineBasicBlock &MBB) { + bool Modified = false; + + MachineBasicBlock::iterator MBBI = MBB.begin(), E = MBB.end(); + while (MBBI != E) { + MachineBasicBlock::iterator NMBBI = std::next(MBBI); + Modified |= expandMI(MBB, MBBI); + MBBI = NMBBI; + } + + return Modified; +} + +bool AArch64ExpandPseudo::runOnMachineFunction(MachineFunction &MF) { + TII = static_cast(MF.getTarget().getInstrInfo()); + + bool Modified = false; + for (auto &MBB : MF) + Modified |= expandMBB(MBB); + return Modified; +} + +/// \brief Returns an instance of the pseudo instruction expansion pass. +FunctionPass *llvm::createAArch64ExpandPseudoPass() { + return new AArch64ExpandPseudo(); +} diff --git a/lib/Target/AArch64/AArch64FastISel.cpp b/lib/Target/AArch64/AArch64FastISel.cpp new file mode 100644 index 00000000000..58178b1a48b --- /dev/null +++ b/lib/Target/AArch64/AArch64FastISel.cpp @@ -0,0 +1,1977 @@ +//===-- AArch6464FastISel.cpp - AArch64 FastISel implementation -----------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+// +//===----------------------------------------------------------------------===// +// +// This file defines the AArch64-specific support for the FastISel class. Some +// of the target-specific code is generated by tablegen in the file +// AArch64GenFastISel.inc, which is #included here. +// +//===----------------------------------------------------------------------===// + +#include "AArch64.h" +#include "AArch64TargetMachine.h" +#include "AArch64Subtarget.h" +#include "AArch64CallingConv.h" +#include "MCTargetDesc/AArch64AddressingModes.h" +#include "llvm/CodeGen/CallingConvLower.h" +#include "llvm/CodeGen/FastISel.h" +#include "llvm/CodeGen/FunctionLoweringInfo.h" +#include "llvm/CodeGen/MachineConstantPool.h" +#include "llvm/CodeGen/MachineFrameInfo.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/IR/CallingConv.h" +#include "llvm/IR/DataLayout.h" +#include "llvm/IR/DerivedTypes.h" +#include "llvm/IR/Function.h" +#include "llvm/IR/GetElementPtrTypeIterator.h" +#include "llvm/IR/GlobalAlias.h" +#include "llvm/IR/GlobalVariable.h" +#include "llvm/IR/Instructions.h" +#include "llvm/IR/IntrinsicInst.h" +#include "llvm/IR/Operator.h" +#include "llvm/Support/CommandLine.h" +using namespace llvm; + +namespace { + +class AArch64FastISel : public FastISel { + + class Address { + public: + typedef enum { + RegBase, + FrameIndexBase + } BaseKind; + + private: + BaseKind Kind; + union { + unsigned Reg; + int FI; + } Base; + int64_t Offset; + + public: + Address() : Kind(RegBase), Offset(0) { Base.Reg = 0; } + void setKind(BaseKind K) { Kind = K; } + BaseKind getKind() const { return Kind; } + bool isRegBase() const { return Kind == RegBase; } + bool isFIBase() const { return Kind == FrameIndexBase; } + void setReg(unsigned Reg) { + assert(isRegBase() && "Invalid base register access!"); + Base.Reg = Reg; + } + unsigned getReg() const { + assert(isRegBase() && "Invalid base register access!"); + return Base.Reg; + } + void setFI(unsigned FI) { + assert(isFIBase() && "Invalid base frame index access!"); + Base.FI = FI; + } + unsigned getFI() const { + assert(isFIBase() && "Invalid base frame index access!"); + return Base.FI; + } + void setOffset(int64_t O) { Offset = O; } + int64_t getOffset() { return Offset; } + + bool isValid() { return isFIBase() || (isRegBase() && getReg() != 0); } + }; + + /// Subtarget - Keep a pointer to the AArch64Subtarget around so that we can + /// make the right decision when generating code for different targets. + const AArch64Subtarget *Subtarget; + LLVMContext *Context; + +private: + // Selection routines. + bool SelectLoad(const Instruction *I); + bool SelectStore(const Instruction *I); + bool SelectBranch(const Instruction *I); + bool SelectIndirectBr(const Instruction *I); + bool SelectCmp(const Instruction *I); + bool SelectSelect(const Instruction *I); + bool SelectFPExt(const Instruction *I); + bool SelectFPTrunc(const Instruction *I); + bool SelectFPToInt(const Instruction *I, bool Signed); + bool SelectIntToFP(const Instruction *I, bool Signed); + bool SelectRem(const Instruction *I, unsigned ISDOpcode); + bool SelectCall(const Instruction *I, const char *IntrMemName); + bool SelectIntrinsicCall(const IntrinsicInst &I); + bool SelectRet(const Instruction *I); + bool SelectTrunc(const Instruction *I); + bool SelectIntExt(const Instruction *I); + bool SelectMul(const Instruction *I); + + // Utility helper routines. 
+ bool isTypeLegal(Type *Ty, MVT &VT); + bool isLoadStoreTypeLegal(Type *Ty, MVT &VT); + bool ComputeAddress(const Value *Obj, Address &Addr); + bool SimplifyAddress(Address &Addr, MVT VT, int64_t ScaleFactor, + bool UseUnscaled); + void AddLoadStoreOperands(Address &Addr, const MachineInstrBuilder &MIB, + unsigned Flags, bool UseUnscaled); + bool IsMemCpySmall(uint64_t Len, unsigned Alignment); + bool TryEmitSmallMemCpy(Address Dest, Address Src, uint64_t Len, + unsigned Alignment); + // Emit functions. + bool EmitCmp(Value *Src1Value, Value *Src2Value, bool isZExt); + bool EmitLoad(MVT VT, unsigned &ResultReg, Address Addr, + bool UseUnscaled = false); + bool EmitStore(MVT VT, unsigned SrcReg, Address Addr, + bool UseUnscaled = false); + unsigned EmitIntExt(MVT SrcVT, unsigned SrcReg, MVT DestVT, bool isZExt); + unsigned Emiti1Ext(unsigned SrcReg, MVT DestVT, bool isZExt); + + unsigned AArch64MaterializeFP(const ConstantFP *CFP, MVT VT); + unsigned AArch64MaterializeGV(const GlobalValue *GV); + + // Call handling routines. +private: + CCAssignFn *CCAssignFnForCall(CallingConv::ID CC) const; + bool ProcessCallArgs(SmallVectorImpl &Args, + SmallVectorImpl &ArgRegs, + SmallVectorImpl &ArgVTs, + SmallVectorImpl &ArgFlags, + SmallVectorImpl &RegArgs, CallingConv::ID CC, + unsigned &NumBytes); + bool FinishCall(MVT RetVT, SmallVectorImpl &UsedRegs, + const Instruction *I, CallingConv::ID CC, unsigned &NumBytes); + +public: + // Backend specific FastISel code. + unsigned TargetMaterializeAlloca(const AllocaInst *AI) override; + unsigned TargetMaterializeConstant(const Constant *C) override; + + explicit AArch64FastISel(FunctionLoweringInfo &funcInfo, + const TargetLibraryInfo *libInfo) + : FastISel(funcInfo, libInfo) { + Subtarget = &TM.getSubtarget(); + Context = &funcInfo.Fn->getContext(); + } + + bool TargetSelectInstruction(const Instruction *I) override; + +#include "AArch64GenFastISel.inc" +}; + +} // end anonymous namespace + +#include "AArch64GenCallingConv.inc" + +CCAssignFn *AArch64FastISel::CCAssignFnForCall(CallingConv::ID CC) const { + if (CC == CallingConv::WebKit_JS) + return CC_AArch64_WebKit_JS; + return Subtarget->isTargetDarwin() ? CC_AArch64_DarwinPCS : CC_AArch64_AAPCS; +} + +unsigned AArch64FastISel::TargetMaterializeAlloca(const AllocaInst *AI) { + assert(TLI.getValueType(AI->getType(), true) == MVT::i64 && + "Alloca should always return a pointer."); + + // Don't handle dynamic allocas. + if (!FuncInfo.StaticAllocaMap.count(AI)) + return 0; + + DenseMap::iterator SI = + FuncInfo.StaticAllocaMap.find(AI); + + if (SI != FuncInfo.StaticAllocaMap.end()) { + unsigned ResultReg = createResultReg(&AArch64::GPR64RegClass); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AArch64::ADDXri), + ResultReg) + .addFrameIndex(SI->second) + .addImm(0) + .addImm(0); + return ResultReg; + } + + return 0; +} + +unsigned AArch64FastISel::AArch64MaterializeFP(const ConstantFP *CFP, MVT VT) { + if (VT != MVT::f32 && VT != MVT::f64) + return 0; + + const APFloat Val = CFP->getValueAPF(); + bool is64bit = (VT == MVT::f64); + + // This checks to see if we can use FMOV instructions to materialize + // a constant, otherwise we have to materialize via the constant pool. 
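+  // FMOV (immediate) can only encode an 8-bit floating-point immediate: a sign
+  // bit, a 3-bit exponent and a 4-bit fraction, i.e. values of the form
+  // +/-(16 + fraction)/16 * 2^exp with exp in [-3, 4]. Constants like 1.0, 0.5
+  // or -2.0 qualify; everything else takes the ADRP + load path below.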
+ if (TLI.isFPImmLegal(Val, VT)) { + int Imm; + unsigned Opc; + if (is64bit) { + Imm = AArch64_AM::getFP64Imm(Val); + Opc = AArch64::FMOVDi; + } else { + Imm = AArch64_AM::getFP32Imm(Val); + Opc = AArch64::FMOVSi; + } + unsigned ResultReg = createResultReg(TLI.getRegClassFor(VT)); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc), ResultReg) + .addImm(Imm); + return ResultReg; + } + + // Materialize via constant pool. MachineConstantPool wants an explicit + // alignment. + unsigned Align = DL.getPrefTypeAlignment(CFP->getType()); + if (Align == 0) + Align = DL.getTypeAllocSize(CFP->getType()); + + unsigned Idx = MCP.getConstantPoolIndex(cast(CFP), Align); + unsigned ADRPReg = createResultReg(&AArch64::GPR64commonRegClass); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AArch64::ADRP), + ADRPReg).addConstantPoolIndex(Idx, 0, AArch64II::MO_PAGE); + + unsigned Opc = is64bit ? AArch64::LDRDui : AArch64::LDRSui; + unsigned ResultReg = createResultReg(TLI.getRegClassFor(VT)); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc), ResultReg) + .addReg(ADRPReg) + .addConstantPoolIndex(Idx, 0, AArch64II::MO_PAGEOFF | AArch64II::MO_NC); + return ResultReg; +} + +unsigned AArch64FastISel::AArch64MaterializeGV(const GlobalValue *GV) { + // We can't handle thread-local variables quickly yet. Unfortunately we have + // to peer through any aliases to find out if that rule applies. + const GlobalValue *TLSGV = GV; + if (const GlobalAlias *GA = dyn_cast(GV)) + TLSGV = GA->getAliasee(); + + if (const GlobalVariable *GVar = dyn_cast(TLSGV)) + if (GVar->isThreadLocal()) + return 0; + + unsigned char OpFlags = Subtarget->ClassifyGlobalReference(GV, TM); + + EVT DestEVT = TLI.getValueType(GV->getType(), true); + if (!DestEVT.isSimple()) + return 0; + + unsigned ADRPReg = createResultReg(&AArch64::GPR64commonRegClass); + unsigned ResultReg; + + if (OpFlags & AArch64II::MO_GOT) { + // ADRP + LDRX + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AArch64::ADRP), + ADRPReg) + .addGlobalAddress(GV, 0, AArch64II::MO_GOT | AArch64II::MO_PAGE); + + ResultReg = createResultReg(&AArch64::GPR64RegClass); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AArch64::LDRXui), + ResultReg) + .addReg(ADRPReg) + .addGlobalAddress(GV, 0, AArch64II::MO_GOT | AArch64II::MO_PAGEOFF | + AArch64II::MO_NC); + } else { + // ADRP + ADDX + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AArch64::ADRP), + ADRPReg).addGlobalAddress(GV, 0, AArch64II::MO_PAGE); + + ResultReg = createResultReg(&AArch64::GPR64spRegClass); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AArch64::ADDXri), + ResultReg) + .addReg(ADRPReg) + .addGlobalAddress(GV, 0, AArch64II::MO_PAGEOFF | AArch64II::MO_NC) + .addImm(0); + } + return ResultReg; +} + +unsigned AArch64FastISel::TargetMaterializeConstant(const Constant *C) { + EVT CEVT = TLI.getValueType(C->getType(), true); + + // Only handle simple types. + if (!CEVT.isSimple()) + return 0; + MVT VT = CEVT.getSimpleVT(); + + // FIXME: Handle ConstantInt. + if (const ConstantFP *CFP = dyn_cast(C)) + return AArch64MaterializeFP(CFP, VT); + else if (const GlobalValue *GV = dyn_cast(C)) + return AArch64MaterializeGV(GV); + + return 0; +} + +// Computes the address to get to an object. 
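The next routine folds as much of an address computation as it can into a base plus a constant byte offset. A toy model of just the constant-folding part, using a made-up index/size description instead of LLVM's DataLayout and StructLayout, is sketched here; the real code below also walks bitcasts, no-op int/ptr conversions and allocas.

    // Toy model of folding constant GEP-style indices into a byte offset.
    #include <cstdint>
    #include <cstdio>
    #include <vector>

    struct IndexStep {
      uint64_t ElementSize; // size in bytes of the indexed element
      int64_t  Index;       // constant index value
    };

    static int64_t foldConstantOffset(const std::vector<IndexStep> &Steps) {
      int64_t Offset = 0;
      for (const IndexStep &S : Steps)
        Offset += S.Index * (int64_t)S.ElementSize;
      return Offset;
    }

    int main() {
      // Roughly "&base[2].field" for a 24-byte element with the field at +8.
      std::vector<IndexStep> Steps = {{24, 2}, {1, 8}};
      std::printf("folded offset = %lld bytes\n",
                  (long long)foldConstantOffset(Steps));
      return 0;
    }

When an index cannot be reduced to a constant, ComputeAddress gives up on folding (the unsupported_gep path) and instead leaves the address to be materialized in a register.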
+bool AArch64FastISel::ComputeAddress(const Value *Obj, Address &Addr) { + const User *U = nullptr; + unsigned Opcode = Instruction::UserOp1; + if (const Instruction *I = dyn_cast(Obj)) { + // Don't walk into other basic blocks unless the object is an alloca from + // another block, otherwise it may not have a virtual register assigned. + if (FuncInfo.StaticAllocaMap.count(static_cast(Obj)) || + FuncInfo.MBBMap[I->getParent()] == FuncInfo.MBB) { + Opcode = I->getOpcode(); + U = I; + } + } else if (const ConstantExpr *C = dyn_cast(Obj)) { + Opcode = C->getOpcode(); + U = C; + } + + if (const PointerType *Ty = dyn_cast(Obj->getType())) + if (Ty->getAddressSpace() > 255) + // Fast instruction selection doesn't support the special + // address spaces. + return false; + + switch (Opcode) { + default: + break; + case Instruction::BitCast: { + // Look through bitcasts. + return ComputeAddress(U->getOperand(0), Addr); + } + case Instruction::IntToPtr: { + // Look past no-op inttoptrs. + if (TLI.getValueType(U->getOperand(0)->getType()) == TLI.getPointerTy()) + return ComputeAddress(U->getOperand(0), Addr); + break; + } + case Instruction::PtrToInt: { + // Look past no-op ptrtoints. + if (TLI.getValueType(U->getType()) == TLI.getPointerTy()) + return ComputeAddress(U->getOperand(0), Addr); + break; + } + case Instruction::GetElementPtr: { + Address SavedAddr = Addr; + uint64_t TmpOffset = Addr.getOffset(); + + // Iterate through the GEP folding the constants into offsets where + // we can. + gep_type_iterator GTI = gep_type_begin(U); + for (User::const_op_iterator i = U->op_begin() + 1, e = U->op_end(); i != e; + ++i, ++GTI) { + const Value *Op = *i; + if (StructType *STy = dyn_cast(*GTI)) { + const StructLayout *SL = DL.getStructLayout(STy); + unsigned Idx = cast(Op)->getZExtValue(); + TmpOffset += SL->getElementOffset(Idx); + } else { + uint64_t S = DL.getTypeAllocSize(GTI.getIndexedType()); + for (;;) { + if (const ConstantInt *CI = dyn_cast(Op)) { + // Constant-offset addressing. + TmpOffset += CI->getSExtValue() * S; + break; + } + if (canFoldAddIntoGEP(U, Op)) { + // A compatible add with a constant operand. Fold the constant. + ConstantInt *CI = + cast(cast(Op)->getOperand(1)); + TmpOffset += CI->getSExtValue() * S; + // Iterate on the other operand. + Op = cast(Op)->getOperand(0); + continue; + } + // Unsupported + goto unsupported_gep; + } + } + } + + // Try to grab the base operand now. + Addr.setOffset(TmpOffset); + if (ComputeAddress(U->getOperand(0), Addr)) + return true; + + // We failed, restore everything and try the other options. + Addr = SavedAddr; + + unsupported_gep: + break; + } + case Instruction::Alloca: { + const AllocaInst *AI = cast(Obj); + DenseMap::iterator SI = + FuncInfo.StaticAllocaMap.find(AI); + if (SI != FuncInfo.StaticAllocaMap.end()) { + Addr.setKind(Address::FrameIndexBase); + Addr.setFI(SI->second); + return true; + } + break; + } + } + + // Try to get this in a register if nothing else has worked. + if (!Addr.isValid()) + Addr.setReg(getRegForValue(Obj)); + return Addr.isValid(); +} + +bool AArch64FastISel::isTypeLegal(Type *Ty, MVT &VT) { + EVT evt = TLI.getValueType(Ty, true); + + // Only handle simple types. + if (evt == MVT::Other || !evt.isSimple()) + return false; + VT = evt.getSimpleVT(); + + // This is a legal type, but it's not something we handle in fast-isel. + if (VT == MVT::f128) + return false; + + // Handle all other legal types, i.e. a register that will directly hold this + // value. 
+ return TLI.isTypeLegal(VT); +} + +bool AArch64FastISel::isLoadStoreTypeLegal(Type *Ty, MVT &VT) { + if (isTypeLegal(Ty, VT)) + return true; + + // If this is a type than can be sign or zero-extended to a basic operation + // go ahead and accept it now. For stores, this reflects truncation. + if (VT == MVT::i1 || VT == MVT::i8 || VT == MVT::i16) + return true; + + return false; +} + +bool AArch64FastISel::SimplifyAddress(Address &Addr, MVT VT, + int64_t ScaleFactor, bool UseUnscaled) { + bool needsLowering = false; + int64_t Offset = Addr.getOffset(); + switch (VT.SimpleTy) { + default: + return false; + case MVT::i1: + case MVT::i8: + case MVT::i16: + case MVT::i32: + case MVT::i64: + case MVT::f32: + case MVT::f64: + if (!UseUnscaled) + // Using scaled, 12-bit, unsigned immediate offsets. + needsLowering = ((Offset & 0xfff) != Offset); + else + // Using unscaled, 9-bit, signed immediate offsets. + needsLowering = (Offset > 256 || Offset < -256); + break; + } + + // FIXME: If this is a stack pointer and the offset needs to be simplified + // then put the alloca address into a register, set the base type back to + // register and continue. This should almost never happen. + if (needsLowering && Addr.getKind() == Address::FrameIndexBase) { + return false; + } + + // Since the offset is too large for the load/store instruction get the + // reg+offset into a register. + if (needsLowering) { + uint64_t UnscaledOffset = Addr.getOffset() * ScaleFactor; + unsigned ResultReg = FastEmit_ri_(MVT::i64, ISD::ADD, Addr.getReg(), false, + UnscaledOffset, MVT::i64); + if (ResultReg == 0) + return false; + Addr.setReg(ResultReg); + Addr.setOffset(0); + } + return true; +} + +void AArch64FastISel::AddLoadStoreOperands(Address &Addr, + const MachineInstrBuilder &MIB, + unsigned Flags, bool UseUnscaled) { + int64_t Offset = Addr.getOffset(); + // Frame base works a bit differently. Handle it separately. + if (Addr.getKind() == Address::FrameIndexBase) { + int FI = Addr.getFI(); + // FIXME: We shouldn't be using getObjectSize/getObjectAlignment. The size + // and alignment should be based on the VT. + MachineMemOperand *MMO = FuncInfo.MF->getMachineMemOperand( + MachinePointerInfo::getFixedStack(FI, Offset), Flags, + MFI.getObjectSize(FI), MFI.getObjectAlignment(FI)); + // Now add the rest of the operands. + MIB.addFrameIndex(FI).addImm(Offset).addMemOperand(MMO); + } else { + // Now add the rest of the operands. + MIB.addReg(Addr.getReg()); + MIB.addImm(Offset); + } +} + +bool AArch64FastISel::EmitLoad(MVT VT, unsigned &ResultReg, Address Addr, + bool UseUnscaled) { + // Negative offsets require unscaled, 9-bit, signed immediate offsets. + // Otherwise, we try using scaled, 12-bit, unsigned immediate offsets. + if (!UseUnscaled && Addr.getOffset() < 0) + UseUnscaled = true; + + unsigned Opc; + const TargetRegisterClass *RC; + bool VTIsi1 = false; + int64_t ScaleFactor = 0; + switch (VT.SimpleTy) { + default: + return false; + case MVT::i1: + VTIsi1 = true; + // Intentional fall-through. + case MVT::i8: + Opc = UseUnscaled ? AArch64::LDURBBi : AArch64::LDRBBui; + RC = &AArch64::GPR32RegClass; + ScaleFactor = 1; + break; + case MVT::i16: + Opc = UseUnscaled ? AArch64::LDURHHi : AArch64::LDRHHui; + RC = &AArch64::GPR32RegClass; + ScaleFactor = 2; + break; + case MVT::i32: + Opc = UseUnscaled ? AArch64::LDURWi : AArch64::LDRWui; + RC = &AArch64::GPR32RegClass; + ScaleFactor = 4; + break; + case MVT::i64: + Opc = UseUnscaled ? 
AArch64::LDURXi : AArch64::LDRXui; + RC = &AArch64::GPR64RegClass; + ScaleFactor = 8; + break; + case MVT::f32: + Opc = UseUnscaled ? AArch64::LDURSi : AArch64::LDRSui; + RC = TLI.getRegClassFor(VT); + ScaleFactor = 4; + break; + case MVT::f64: + Opc = UseUnscaled ? AArch64::LDURDi : AArch64::LDRDui; + RC = TLI.getRegClassFor(VT); + ScaleFactor = 8; + break; + } + // Scale the offset. + if (!UseUnscaled) { + int64_t Offset = Addr.getOffset(); + if (Offset & (ScaleFactor - 1)) + // Retry using an unscaled, 9-bit, signed immediate offset. + return EmitLoad(VT, ResultReg, Addr, /*UseUnscaled*/ true); + + Addr.setOffset(Offset / ScaleFactor); + } + + // Simplify this down to something we can handle. + if (!SimplifyAddress(Addr, VT, UseUnscaled ? 1 : ScaleFactor, UseUnscaled)) + return false; + + // Create the base instruction, then add the operands. + ResultReg = createResultReg(RC); + MachineInstrBuilder MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, + TII.get(Opc), ResultReg); + AddLoadStoreOperands(Addr, MIB, MachineMemOperand::MOLoad, UseUnscaled); + + // Loading an i1 requires special handling. + if (VTIsi1) { + MRI.constrainRegClass(ResultReg, &AArch64::GPR32RegClass); + unsigned ANDReg = createResultReg(&AArch64::GPR32spRegClass); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AArch64::ANDWri), + ANDReg) + .addReg(ResultReg) + .addImm(AArch64_AM::encodeLogicalImmediate(1, 32)); + ResultReg = ANDReg; + } + return true; +} + +bool AArch64FastISel::SelectLoad(const Instruction *I) { + MVT VT; + // Verify we have a legal type before going any further. Currently, we handle + // simple types that will directly fit in a register (i32/f32/i64/f64) or + // those that can be sign or zero-extended to a basic operation (i1/i8/i16). + if (!isLoadStoreTypeLegal(I->getType(), VT) || cast(I)->isAtomic()) + return false; + + // See if we can handle this address. + Address Addr; + if (!ComputeAddress(I->getOperand(0), Addr)) + return false; + + unsigned ResultReg; + if (!EmitLoad(VT, ResultReg, Addr)) + return false; + + UpdateValueMap(I, ResultReg); + return true; +} + +bool AArch64FastISel::EmitStore(MVT VT, unsigned SrcReg, Address Addr, + bool UseUnscaled) { + // Negative offsets require unscaled, 9-bit, signed immediate offsets. + // Otherwise, we try using scaled, 12-bit, unsigned immediate offsets. + if (!UseUnscaled && Addr.getOffset() < 0) + UseUnscaled = true; + + unsigned StrOpc; + bool VTIsi1 = false; + int64_t ScaleFactor = 0; + // Using scaled, 12-bit, unsigned immediate offsets. + switch (VT.SimpleTy) { + default: + return false; + case MVT::i1: + VTIsi1 = true; + case MVT::i8: + StrOpc = UseUnscaled ? AArch64::STURBBi : AArch64::STRBBui; + ScaleFactor = 1; + break; + case MVT::i16: + StrOpc = UseUnscaled ? AArch64::STURHHi : AArch64::STRHHui; + ScaleFactor = 2; + break; + case MVT::i32: + StrOpc = UseUnscaled ? AArch64::STURWi : AArch64::STRWui; + ScaleFactor = 4; + break; + case MVT::i64: + StrOpc = UseUnscaled ? AArch64::STURXi : AArch64::STRXui; + ScaleFactor = 8; + break; + case MVT::f32: + StrOpc = UseUnscaled ? AArch64::STURSi : AArch64::STRSui; + ScaleFactor = 4; + break; + case MVT::f64: + StrOpc = UseUnscaled ? AArch64::STURDi : AArch64::STRDui; + ScaleFactor = 8; + break; + } + // Scale the offset. + if (!UseUnscaled) { + int64_t Offset = Addr.getOffset(); + if (Offset & (ScaleFactor - 1)) + // Retry using an unscaled, 9-bit, signed immediate offset. 
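+      // (The scaled STR forms encode an unsigned 12-bit offset in units of the
+      // access size, so an offset that is negative or not a multiple of the
+      // size can only use the unscaled STUR forms, whose immediate is a signed
+      // 9-bit byte offset in the range -256 to +255.)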
+ return EmitStore(VT, SrcReg, Addr, /*UseUnscaled*/ true); + + Addr.setOffset(Offset / ScaleFactor); + } + + // Simplify this down to something we can handle. + if (!SimplifyAddress(Addr, VT, UseUnscaled ? 1 : ScaleFactor, UseUnscaled)) + return false; + + // Storing an i1 requires special handling. + if (VTIsi1) { + MRI.constrainRegClass(SrcReg, &AArch64::GPR32RegClass); + unsigned ANDReg = createResultReg(&AArch64::GPR32spRegClass); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AArch64::ANDWri), + ANDReg) + .addReg(SrcReg) + .addImm(AArch64_AM::encodeLogicalImmediate(1, 32)); + SrcReg = ANDReg; + } + // Create the base instruction, then add the operands. + MachineInstrBuilder MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, + TII.get(StrOpc)).addReg(SrcReg); + AddLoadStoreOperands(Addr, MIB, MachineMemOperand::MOStore, UseUnscaled); + return true; +} + +bool AArch64FastISel::SelectStore(const Instruction *I) { + MVT VT; + Value *Op0 = I->getOperand(0); + // Verify we have a legal type before going any further. Currently, we handle + // simple types that will directly fit in a register (i32/f32/i64/f64) or + // those that can be sign or zero-extended to a basic operation (i1/i8/i16). + if (!isLoadStoreTypeLegal(Op0->getType(), VT) || + cast(I)->isAtomic()) + return false; + + // Get the value to be stored into a register. + unsigned SrcReg = getRegForValue(Op0); + if (SrcReg == 0) + return false; + + // See if we can handle this address. + Address Addr; + if (!ComputeAddress(I->getOperand(1), Addr)) + return false; + + if (!EmitStore(VT, SrcReg, Addr)) + return false; + return true; +} + +static AArch64CC::CondCode getCompareCC(CmpInst::Predicate Pred) { + switch (Pred) { + case CmpInst::FCMP_ONE: + case CmpInst::FCMP_UEQ: + default: + // AL is our "false" for now. The other two need more compares. + return AArch64CC::AL; + case CmpInst::ICMP_EQ: + case CmpInst::FCMP_OEQ: + return AArch64CC::EQ; + case CmpInst::ICMP_SGT: + case CmpInst::FCMP_OGT: + return AArch64CC::GT; + case CmpInst::ICMP_SGE: + case CmpInst::FCMP_OGE: + return AArch64CC::GE; + case CmpInst::ICMP_UGT: + case CmpInst::FCMP_UGT: + return AArch64CC::HI; + case CmpInst::FCMP_OLT: + return AArch64CC::MI; + case CmpInst::ICMP_ULE: + case CmpInst::FCMP_OLE: + return AArch64CC::LS; + case CmpInst::FCMP_ORD: + return AArch64CC::VC; + case CmpInst::FCMP_UNO: + return AArch64CC::VS; + case CmpInst::FCMP_UGE: + return AArch64CC::PL; + case CmpInst::ICMP_SLT: + case CmpInst::FCMP_ULT: + return AArch64CC::LT; + case CmpInst::ICMP_SLE: + case CmpInst::FCMP_ULE: + return AArch64CC::LE; + case CmpInst::FCMP_UNE: + case CmpInst::ICMP_NE: + return AArch64CC::NE; + case CmpInst::ICMP_UGE: + return AArch64CC::HS; + case CmpInst::ICMP_ULT: + return AArch64CC::LO; + } +} + +bool AArch64FastISel::SelectBranch(const Instruction *I) { + const BranchInst *BI = cast(I); + MachineBasicBlock *TBB = FuncInfo.MBBMap[BI->getSuccessor(0)]; + MachineBasicBlock *FBB = FuncInfo.MBBMap[BI->getSuccessor(1)]; + + if (const CmpInst *CI = dyn_cast(BI->getCondition())) { + if (CI->hasOneUse() && (CI->getParent() == I->getParent())) { + // We may not handle every CC for now. + AArch64CC::CondCode CC = getCompareCC(CI->getPredicate()); + if (CC == AArch64CC::AL) + return false; + + // Emit the cmp. + if (!EmitCmp(CI->getOperand(0), CI->getOperand(1), CI->isUnsigned())) + return false; + + // Emit the branch. 
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AArch64::Bcc)) + .addImm(CC) + .addMBB(TBB); + FuncInfo.MBB->addSuccessor(TBB); + + FastEmitBranch(FBB, DbgLoc); + return true; + } + } else if (TruncInst *TI = dyn_cast(BI->getCondition())) { + MVT SrcVT; + if (TI->hasOneUse() && TI->getParent() == I->getParent() && + (isLoadStoreTypeLegal(TI->getOperand(0)->getType(), SrcVT))) { + unsigned CondReg = getRegForValue(TI->getOperand(0)); + if (CondReg == 0) + return false; + + // Issue an extract_subreg to get the lower 32-bits. + if (SrcVT == MVT::i64) + CondReg = FastEmitInst_extractsubreg(MVT::i32, CondReg, /*Kill=*/true, + AArch64::sub_32); + + MRI.constrainRegClass(CondReg, &AArch64::GPR32RegClass); + unsigned ANDReg = createResultReg(&AArch64::GPR32spRegClass); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, + TII.get(AArch64::ANDWri), ANDReg) + .addReg(CondReg) + .addImm(AArch64_AM::encodeLogicalImmediate(1, 32)); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, + TII.get(AArch64::SUBSWri)) + .addReg(ANDReg) + .addReg(ANDReg) + .addImm(0) + .addImm(0); + + unsigned CC = AArch64CC::NE; + if (FuncInfo.MBB->isLayoutSuccessor(TBB)) { + std::swap(TBB, FBB); + CC = AArch64CC::EQ; + } + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AArch64::Bcc)) + .addImm(CC) + .addMBB(TBB); + FuncInfo.MBB->addSuccessor(TBB); + FastEmitBranch(FBB, DbgLoc); + return true; + } + } else if (const ConstantInt *CI = + dyn_cast(BI->getCondition())) { + uint64_t Imm = CI->getZExtValue(); + MachineBasicBlock *Target = (Imm == 0) ? FBB : TBB; + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AArch64::B)) + .addMBB(Target); + FuncInfo.MBB->addSuccessor(Target); + return true; + } + + unsigned CondReg = getRegForValue(BI->getCondition()); + if (CondReg == 0) + return false; + + // We've been divorced from our compare! Our block was split, and + // now our compare lives in a predecessor block. We musn't + // re-compare here, as the children of the compare aren't guaranteed + // live across the block boundary (we *could* check for this). + // Regardless, the compare has been done in the predecessor block, + // and it left a value for us in a virtual register. Ergo, we test + // the one-bit value left in the virtual register. + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AArch64::SUBSWri), + AArch64::WZR) + .addReg(CondReg) + .addImm(0) + .addImm(0); + + unsigned CC = AArch64CC::NE; + if (FuncInfo.MBB->isLayoutSuccessor(TBB)) { + std::swap(TBB, FBB); + CC = AArch64CC::EQ; + } + + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AArch64::Bcc)) + .addImm(CC) + .addMBB(TBB); + FuncInfo.MBB->addSuccessor(TBB); + FastEmitBranch(FBB, DbgLoc); + return true; +} + +bool AArch64FastISel::SelectIndirectBr(const Instruction *I) { + const IndirectBrInst *BI = cast(I); + unsigned AddrReg = getRegForValue(BI->getOperand(0)); + if (AddrReg == 0) + return false; + + // Emit the indirect branch. + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AArch64::BR)) + .addReg(AddrReg); + + // Make sure the CFG is up-to-date. 
+ for (unsigned i = 0, e = BI->getNumSuccessors(); i != e; ++i) + FuncInfo.MBB->addSuccessor(FuncInfo.MBBMap[BI->getSuccessor(i)]); + + return true; +} + +bool AArch64FastISel::EmitCmp(Value *Src1Value, Value *Src2Value, bool isZExt) { + Type *Ty = Src1Value->getType(); + EVT SrcEVT = TLI.getValueType(Ty, true); + if (!SrcEVT.isSimple()) + return false; + MVT SrcVT = SrcEVT.getSimpleVT(); + + // Check to see if the 2nd operand is a constant that we can encode directly + // in the compare. + uint64_t Imm; + bool UseImm = false; + bool isNegativeImm = false; + if (const ConstantInt *ConstInt = dyn_cast(Src2Value)) { + if (SrcVT == MVT::i64 || SrcVT == MVT::i32 || SrcVT == MVT::i16 || + SrcVT == MVT::i8 || SrcVT == MVT::i1) { + const APInt &CIVal = ConstInt->getValue(); + + Imm = (isZExt) ? CIVal.getZExtValue() : CIVal.getSExtValue(); + if (CIVal.isNegative()) { + isNegativeImm = true; + Imm = -Imm; + } + // FIXME: We can handle more immediates using shifts. + UseImm = ((Imm & 0xfff) == Imm); + } + } else if (const ConstantFP *ConstFP = dyn_cast(Src2Value)) { + if (SrcVT == MVT::f32 || SrcVT == MVT::f64) + if (ConstFP->isZero() && !ConstFP->isNegative()) + UseImm = true; + } + + unsigned ZReg; + unsigned CmpOpc; + bool isICmp = true; + bool needsExt = false; + switch (SrcVT.SimpleTy) { + default: + return false; + case MVT::i1: + case MVT::i8: + case MVT::i16: + needsExt = true; + // Intentional fall-through. + case MVT::i32: + ZReg = AArch64::WZR; + if (UseImm) + CmpOpc = isNegativeImm ? AArch64::ADDSWri : AArch64::SUBSWri; + else + CmpOpc = AArch64::SUBSWrr; + break; + case MVT::i64: + ZReg = AArch64::XZR; + if (UseImm) + CmpOpc = isNegativeImm ? AArch64::ADDSXri : AArch64::SUBSXri; + else + CmpOpc = AArch64::SUBSXrr; + break; + case MVT::f32: + isICmp = false; + CmpOpc = UseImm ? AArch64::FCMPSri : AArch64::FCMPSrr; + break; + case MVT::f64: + isICmp = false; + CmpOpc = UseImm ? AArch64::FCMPDri : AArch64::FCMPDrr; + break; + } + + unsigned SrcReg1 = getRegForValue(Src1Value); + if (SrcReg1 == 0) + return false; + + unsigned SrcReg2; + if (!UseImm) { + SrcReg2 = getRegForValue(Src2Value); + if (SrcReg2 == 0) + return false; + } + + // We have i1, i8, or i16, we need to either zero extend or sign extend. + if (needsExt) { + SrcReg1 = EmitIntExt(SrcVT, SrcReg1, MVT::i32, isZExt); + if (SrcReg1 == 0) + return false; + if (!UseImm) { + SrcReg2 = EmitIntExt(SrcVT, SrcReg2, MVT::i32, isZExt); + if (SrcReg2 == 0) + return false; + } + } + + if (isICmp) { + if (UseImm) + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(CmpOpc)) + .addReg(ZReg) + .addReg(SrcReg1) + .addImm(Imm) + .addImm(0); + else + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(CmpOpc)) + .addReg(ZReg) + .addReg(SrcReg1) + .addReg(SrcReg2); + } else { + if (UseImm) + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(CmpOpc)) + .addReg(SrcReg1); + else + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(CmpOpc)) + .addReg(SrcReg1) + .addReg(SrcReg2); + } + return true; +} + +bool AArch64FastISel::SelectCmp(const Instruction *I) { + const CmpInst *CI = cast(I); + + // We may not handle every CC for now. + AArch64CC::CondCode CC = getCompareCC(CI->getPredicate()); + if (CC == AArch64CC::AL) + return false; + + // Emit the cmp. + if (!EmitCmp(CI->getOperand(0), CI->getOperand(1), CI->isUnsigned())) + return false; + + // Now set a register based on the comparison. 
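+  // CSINC Wd, WZR, WZR, cond yields 0 when cond holds and 1 otherwise, so
+  // passing the inverted condition below is the expansion of CSET Wd, CC:
+  // the result is 1 exactly when the comparison succeeded.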
+ AArch64CC::CondCode invertedCC = getInvertedCondCode(CC); + unsigned ResultReg = createResultReg(&AArch64::GPR32RegClass); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AArch64::CSINCWr), + ResultReg) + .addReg(AArch64::WZR) + .addReg(AArch64::WZR) + .addImm(invertedCC); + + UpdateValueMap(I, ResultReg); + return true; +} + +bool AArch64FastISel::SelectSelect(const Instruction *I) { + const SelectInst *SI = cast(I); + + EVT DestEVT = TLI.getValueType(SI->getType(), true); + if (!DestEVT.isSimple()) + return false; + + MVT DestVT = DestEVT.getSimpleVT(); + if (DestVT != MVT::i32 && DestVT != MVT::i64 && DestVT != MVT::f32 && + DestVT != MVT::f64) + return false; + + unsigned CondReg = getRegForValue(SI->getCondition()); + if (CondReg == 0) + return false; + unsigned TrueReg = getRegForValue(SI->getTrueValue()); + if (TrueReg == 0) + return false; + unsigned FalseReg = getRegForValue(SI->getFalseValue()); + if (FalseReg == 0) + return false; + + + MRI.constrainRegClass(CondReg, &AArch64::GPR32RegClass); + unsigned ANDReg = createResultReg(&AArch64::GPR32spRegClass); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AArch64::ANDWri), + ANDReg) + .addReg(CondReg) + .addImm(AArch64_AM::encodeLogicalImmediate(1, 32)); + + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AArch64::SUBSWri)) + .addReg(ANDReg) + .addReg(ANDReg) + .addImm(0) + .addImm(0); + + unsigned SelectOpc; + switch (DestVT.SimpleTy) { + default: + return false; + case MVT::i32: + SelectOpc = AArch64::CSELWr; + break; + case MVT::i64: + SelectOpc = AArch64::CSELXr; + break; + case MVT::f32: + SelectOpc = AArch64::FCSELSrrr; + break; + case MVT::f64: + SelectOpc = AArch64::FCSELDrrr; + break; + } + + unsigned ResultReg = createResultReg(TLI.getRegClassFor(DestVT)); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(SelectOpc), + ResultReg) + .addReg(TrueReg) + .addReg(FalseReg) + .addImm(AArch64CC::NE); + + UpdateValueMap(I, ResultReg); + return true; +} + +bool AArch64FastISel::SelectFPExt(const Instruction *I) { + Value *V = I->getOperand(0); + if (!I->getType()->isDoubleTy() || !V->getType()->isFloatTy()) + return false; + + unsigned Op = getRegForValue(V); + if (Op == 0) + return false; + + unsigned ResultReg = createResultReg(&AArch64::FPR64RegClass); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AArch64::FCVTDSr), + ResultReg).addReg(Op); + UpdateValueMap(I, ResultReg); + return true; +} + +bool AArch64FastISel::SelectFPTrunc(const Instruction *I) { + Value *V = I->getOperand(0); + if (!I->getType()->isFloatTy() || !V->getType()->isDoubleTy()) + return false; + + unsigned Op = getRegForValue(V); + if (Op == 0) + return false; + + unsigned ResultReg = createResultReg(&AArch64::FPR32RegClass); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AArch64::FCVTSDr), + ResultReg).addReg(Op); + UpdateValueMap(I, ResultReg); + return true; +} + +// FPToUI and FPToSI +bool AArch64FastISel::SelectFPToInt(const Instruction *I, bool Signed) { + MVT DestVT; + if (!isTypeLegal(I->getType(), DestVT) || DestVT.isVector()) + return false; + + unsigned SrcReg = getRegForValue(I->getOperand(0)); + if (SrcReg == 0) + return false; + + EVT SrcVT = TLI.getValueType(I->getOperand(0)->getType(), true); + if (SrcVT == MVT::f128) + return false; + + unsigned Opc; + if (SrcVT == MVT::f64) { + if (Signed) + Opc = (DestVT == MVT::i32) ? AArch64::FCVTZSUWDr : AArch64::FCVTZSUXDr; + else + Opc = (DestVT == MVT::i32) ? 
AArch64::FCVTZUUWDr : AArch64::FCVTZUUXDr; + } else { + if (Signed) + Opc = (DestVT == MVT::i32) ? AArch64::FCVTZSUWSr : AArch64::FCVTZSUXSr; + else + Opc = (DestVT == MVT::i32) ? AArch64::FCVTZUUWSr : AArch64::FCVTZUUXSr; + } + unsigned ResultReg = createResultReg( + DestVT == MVT::i32 ? &AArch64::GPR32RegClass : &AArch64::GPR64RegClass); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc), ResultReg) + .addReg(SrcReg); + UpdateValueMap(I, ResultReg); + return true; +} + +bool AArch64FastISel::SelectIntToFP(const Instruction *I, bool Signed) { + MVT DestVT; + if (!isTypeLegal(I->getType(), DestVT) || DestVT.isVector()) + return false; + assert ((DestVT == MVT::f32 || DestVT == MVT::f64) && + "Unexpected value type."); + + unsigned SrcReg = getRegForValue(I->getOperand(0)); + if (SrcReg == 0) + return false; + + EVT SrcVT = TLI.getValueType(I->getOperand(0)->getType(), true); + + // Handle sign-extension. + if (SrcVT == MVT::i16 || SrcVT == MVT::i8 || SrcVT == MVT::i1) { + SrcReg = + EmitIntExt(SrcVT.getSimpleVT(), SrcReg, MVT::i32, /*isZExt*/ !Signed); + if (SrcReg == 0) + return false; + } + + MRI.constrainRegClass(SrcReg, SrcVT == MVT::i64 ? &AArch64::GPR64RegClass + : &AArch64::GPR32RegClass); + + unsigned Opc; + if (SrcVT == MVT::i64) { + if (Signed) + Opc = (DestVT == MVT::f32) ? AArch64::SCVTFUXSri : AArch64::SCVTFUXDri; + else + Opc = (DestVT == MVT::f32) ? AArch64::UCVTFUXSri : AArch64::UCVTFUXDri; + } else { + if (Signed) + Opc = (DestVT == MVT::f32) ? AArch64::SCVTFUWSri : AArch64::SCVTFUWDri; + else + Opc = (DestVT == MVT::f32) ? AArch64::UCVTFUWSri : AArch64::UCVTFUWDri; + } + + unsigned ResultReg = createResultReg(TLI.getRegClassFor(DestVT)); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc), ResultReg) + .addReg(SrcReg); + UpdateValueMap(I, ResultReg); + return true; +} + +bool AArch64FastISel::ProcessCallArgs( + SmallVectorImpl &Args, SmallVectorImpl &ArgRegs, + SmallVectorImpl &ArgVTs, SmallVectorImpl &ArgFlags, + SmallVectorImpl &RegArgs, CallingConv::ID CC, + unsigned &NumBytes) { + SmallVector ArgLocs; + CCState CCInfo(CC, false, *FuncInfo.MF, TM, ArgLocs, *Context); + CCInfo.AnalyzeCallOperands(ArgVTs, ArgFlags, CCAssignFnForCall(CC)); + + // Get a count of how many bytes are to be pushed on the stack. + NumBytes = CCInfo.getNextStackOffset(); + + // Issue CALLSEQ_START + unsigned AdjStackDown = TII.getCallFrameSetupOpcode(); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AdjStackDown)) + .addImm(NumBytes); + + // Process the args. + for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) { + CCValAssign &VA = ArgLocs[i]; + unsigned Arg = ArgRegs[VA.getValNo()]; + MVT ArgVT = ArgVTs[VA.getValNo()]; + + // Handle arg promotion: SExt, ZExt, AExt. + switch (VA.getLocInfo()) { + case CCValAssign::Full: + break; + case CCValAssign::SExt: { + MVT DestVT = VA.getLocVT(); + MVT SrcVT = ArgVT; + Arg = EmitIntExt(SrcVT, Arg, DestVT, /*isZExt*/ false); + if (Arg == 0) + return false; + ArgVT = DestVT; + break; + } + case CCValAssign::AExt: + // Intentional fall-through. + case CCValAssign::ZExt: { + MVT DestVT = VA.getLocVT(); + MVT SrcVT = ArgVT; + Arg = EmitIntExt(SrcVT, Arg, DestVT, /*isZExt*/ true); + if (Arg == 0) + return false; + ArgVT = DestVT; + break; + } + default: + llvm_unreachable("Unknown arg promotion!"); + } + + // Now copy/store arg to correct locations. 
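+    // Register arguments become plain COPYs into the physreg chosen by the
+    // calling convention; everything else is stored to the outgoing argument
+    // area at [SP + offset].  On big-endian targets a small argument is
+    // shifted to the far end of its 8-byte slot, e.g. a 2-byte value gets
+    // BEAlign = 8 - 2 = 6 added to its offset.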
+    if (VA.isRegLoc() && !VA.needsCustom()) {
+      BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+              TII.get(TargetOpcode::COPY), VA.getLocReg()).addReg(Arg);
+      RegArgs.push_back(VA.getLocReg());
+    } else if (VA.needsCustom()) {
+      // FIXME: Handle custom args.
+      return false;
+    } else {
+      assert(VA.isMemLoc() && "Assuming store on stack.");
+
+      // Need to store on the stack.
+      unsigned ArgSize = VA.getLocVT().getSizeInBits() / 8;
+
+      unsigned BEAlign = 0;
+      if (ArgSize < 8 && !Subtarget->isLittleEndian())
+        BEAlign = 8 - ArgSize;
+
+      Address Addr;
+      Addr.setKind(Address::RegBase);
+      Addr.setReg(AArch64::SP);
+      Addr.setOffset(VA.getLocMemOffset() + BEAlign);
+
+      if (!EmitStore(ArgVT, Arg, Addr))
+        return false;
+    }
+  }
+  return true;
+}
+
+bool AArch64FastISel::FinishCall(MVT RetVT, SmallVectorImpl<unsigned> &UsedRegs,
+                                 const Instruction *I, CallingConv::ID CC,
+                                 unsigned &NumBytes) {
+  // Issue CALLSEQ_END
+  unsigned AdjStackUp = TII.getCallFrameDestroyOpcode();
+  BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AdjStackUp))
+      .addImm(NumBytes)
+      .addImm(0);
+
+  // Now the return value.
+  if (RetVT != MVT::isVoid) {
+    SmallVector<CCValAssign, 16> RVLocs;
+    CCState CCInfo(CC, false, *FuncInfo.MF, TM, RVLocs, *Context);
+    CCInfo.AnalyzeCallResult(RetVT, CCAssignFnForCall(CC));
+
+    // Only handle a single return value.
+    if (RVLocs.size() != 1)
+      return false;
+
+    // Copy all of the result registers out of their specified physreg.
+    MVT CopyVT = RVLocs[0].getValVT();
+    unsigned ResultReg = createResultReg(TLI.getRegClassFor(CopyVT));
+    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+            TII.get(TargetOpcode::COPY),
+            ResultReg).addReg(RVLocs[0].getLocReg());
+    UsedRegs.push_back(RVLocs[0].getLocReg());
+
+    // Finally update the result.
+    UpdateValueMap(I, ResultReg);
+  }
+
+  return true;
+}
+
+bool AArch64FastISel::SelectCall(const Instruction *I,
+                                 const char *IntrMemName = nullptr) {
+  const CallInst *CI = cast<CallInst>(I);
+  const Value *Callee = CI->getCalledValue();
+
+  // Don't handle inline asm or intrinsics.
+  if (isa<InlineAsm>(Callee))
+    return false;
+
+  // Only handle global variable Callees.
+  const GlobalValue *GV = dyn_cast<GlobalValue>(Callee);
+  if (!GV)
+    return false;
+
+  // Check the calling convention.
+  ImmutableCallSite CS(CI);
+  CallingConv::ID CC = CS.getCallingConv();
+
+  // Let SDISel handle vararg functions.
+  PointerType *PT = cast<PointerType>(CS.getCalledValue()->getType());
+  FunctionType *FTy = cast<FunctionType>(PT->getElementType());
+  if (FTy->isVarArg())
+    return false;
+
+  // Handle *simple* calls for now.
+  MVT RetVT;
+  Type *RetTy = I->getType();
+  if (RetTy->isVoidTy())
+    RetVT = MVT::isVoid;
+  else if (!isTypeLegal(RetTy, RetVT))
+    return false;
+
+  // Set up the argument vectors.
+  SmallVector<Value *, 8> Args;
+  SmallVector<unsigned, 8> ArgRegs;
+  SmallVector<MVT, 8> ArgVTs;
+  SmallVector<ISD::ArgFlagsTy, 8> ArgFlags;
+  Args.reserve(CS.arg_size());
+  ArgRegs.reserve(CS.arg_size());
+  ArgVTs.reserve(CS.arg_size());
+  ArgFlags.reserve(CS.arg_size());
+
+  for (ImmutableCallSite::arg_iterator i = CS.arg_begin(), e = CS.arg_end();
+       i != e; ++i) {
+    // If we're lowering a memory intrinsic instead of a regular call, skip the
+    // last two arguments, which shouldn't be passed to the underlying function.
+    if (IntrMemName && e - i <= 2)
+      break;
+
+    unsigned Arg = getRegForValue(*i);
+    if (Arg == 0)
+      return false;
+
+    ISD::ArgFlagsTy Flags;
+    unsigned AttrInd = i - CS.arg_begin() + 1;
+    if (CS.paramHasAttr(AttrInd, Attribute::SExt))
+      Flags.setSExt();
+    if (CS.paramHasAttr(AttrInd, Attribute::ZExt))
+      Flags.setZExt();
+
+    // FIXME: Only handle *easy* calls for now.
+ if (CS.paramHasAttr(AttrInd, Attribute::InReg) || + CS.paramHasAttr(AttrInd, Attribute::StructRet) || + CS.paramHasAttr(AttrInd, Attribute::Nest) || + CS.paramHasAttr(AttrInd, Attribute::ByVal)) + return false; + + MVT ArgVT; + Type *ArgTy = (*i)->getType(); + if (!isTypeLegal(ArgTy, ArgVT) && + !(ArgVT == MVT::i1 || ArgVT == MVT::i8 || ArgVT == MVT::i16)) + return false; + + // We don't handle vector parameters yet. + if (ArgVT.isVector() || ArgVT.getSizeInBits() > 64) + return false; + + unsigned OriginalAlignment = DL.getABITypeAlignment(ArgTy); + Flags.setOrigAlign(OriginalAlignment); + + Args.push_back(*i); + ArgRegs.push_back(Arg); + ArgVTs.push_back(ArgVT); + ArgFlags.push_back(Flags); + } + + // Handle the arguments now that we've gotten them. + SmallVector RegArgs; + unsigned NumBytes; + if (!ProcessCallArgs(Args, ArgRegs, ArgVTs, ArgFlags, RegArgs, CC, NumBytes)) + return false; + + // Issue the call. + MachineInstrBuilder MIB; + MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AArch64::BL)); + if (!IntrMemName) + MIB.addGlobalAddress(GV, 0, 0); + else + MIB.addExternalSymbol(IntrMemName, 0); + + // Add implicit physical register uses to the call. + for (unsigned i = 0, e = RegArgs.size(); i != e; ++i) + MIB.addReg(RegArgs[i], RegState::Implicit); + + // Add a register mask with the call-preserved registers. + // Proper defs for return values will be added by setPhysRegsDeadExcept(). + MIB.addRegMask(TRI.getCallPreservedMask(CS.getCallingConv())); + + // Finish off the call including any return values. + SmallVector UsedRegs; + if (!FinishCall(RetVT, UsedRegs, I, CC, NumBytes)) + return false; + + // Set all unused physreg defs as dead. + static_cast(MIB)->setPhysRegsDeadExcept(UsedRegs, TRI); + + return true; +} + +bool AArch64FastISel::IsMemCpySmall(uint64_t Len, unsigned Alignment) { + if (Alignment) + return Len / Alignment <= 4; + else + return Len < 32; +} + +bool AArch64FastISel::TryEmitSmallMemCpy(Address Dest, Address Src, + uint64_t Len, unsigned Alignment) { + // Make sure we don't bloat code by inlining very large memcpy's. + if (!IsMemCpySmall(Len, Alignment)) + return false; + + int64_t UnscaledOffset = 0; + Address OrigDest = Dest; + Address OrigSrc = Src; + + while (Len) { + MVT VT; + if (!Alignment || Alignment >= 8) { + if (Len >= 8) + VT = MVT::i64; + else if (Len >= 4) + VT = MVT::i32; + else if (Len >= 2) + VT = MVT::i16; + else { + VT = MVT::i8; + } + } else { + // Bound based on alignment. + if (Len >= 4 && Alignment == 4) + VT = MVT::i32; + else if (Len >= 2 && Alignment == 2) + VT = MVT::i16; + else { + VT = MVT::i8; + } + } + + bool RV; + unsigned ResultReg; + RV = EmitLoad(VT, ResultReg, Src); + assert(RV == true && "Should be able to handle this load."); + RV = EmitStore(VT, ResultReg, Dest); + assert(RV == true && "Should be able to handle this store."); + (void)RV; + + int64_t Size = VT.getSizeInBits() / 8; + Len -= Size; + UnscaledOffset += Size; + + // We need to recompute the unscaled offset for each iteration. + Dest.setOffset(OrigDest.getOffset() + UnscaledOffset); + Src.setOffset(OrigSrc.getOffset() + UnscaledOffset); + } + + return true; +} + +bool AArch64FastISel::SelectIntrinsicCall(const IntrinsicInst &I) { + // FIXME: Handle more intrinsics. + switch (I.getIntrinsicID()) { + default: + return false; + case Intrinsic::memcpy: + case Intrinsic::memmove: { + const MemTransferInst &MTI = cast(I); + // Don't handle volatile. 
+    if (MTI.isVolatile())
+      return false;
+
+    // Disable inlining for memmove before calls to ComputeAddress. Otherwise,
+    // we would emit dead code because we don't currently handle memmoves.
+    bool isMemCpy = (I.getIntrinsicID() == Intrinsic::memcpy);
+    if (isa<ConstantInt>(MTI.getLength()) && isMemCpy) {
+      // Small memcpy's are common enough that we want to do them without a call
+      // if possible.
+      uint64_t Len = cast<ConstantInt>(MTI.getLength())->getZExtValue();
+      unsigned Alignment = MTI.getAlignment();
+      if (IsMemCpySmall(Len, Alignment)) {
+        Address Dest, Src;
+        if (!ComputeAddress(MTI.getRawDest(), Dest) ||
+            !ComputeAddress(MTI.getRawSource(), Src))
+          return false;
+        if (TryEmitSmallMemCpy(Dest, Src, Len, Alignment))
+          return true;
+      }
+    }
+
+    if (!MTI.getLength()->getType()->isIntegerTy(64))
+      return false;
+
+    if (MTI.getSourceAddressSpace() > 255 || MTI.getDestAddressSpace() > 255)
+      // Fast instruction selection doesn't support the special
+      // address spaces.
+      return false;
+
+    const char *IntrMemName = isa<MemCpyInst>(I) ? "memcpy" : "memmove";
+    return SelectCall(&I, IntrMemName);
+  }
+  case Intrinsic::memset: {
+    const MemSetInst &MSI = cast<MemSetInst>(I);
+    // Don't handle volatile.
+    if (MSI.isVolatile())
+      return false;
+
+    if (!MSI.getLength()->getType()->isIntegerTy(64))
+      return false;
+
+    if (MSI.getDestAddressSpace() > 255)
+      // Fast instruction selection doesn't support the special
+      // address spaces.
+      return false;
+
+    return SelectCall(&I, "memset");
+  }
+  case Intrinsic::trap: {
+    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AArch64::BRK))
+        .addImm(1);
+    return true;
+  }
+  }
+  return false;
+}
+
+bool AArch64FastISel::SelectRet(const Instruction *I) {
+  const ReturnInst *Ret = cast<ReturnInst>(I);
+  const Function &F = *I->getParent()->getParent();
+
+  if (!FuncInfo.CanLowerReturn)
+    return false;
+
+  if (F.isVarArg())
+    return false;
+
+  // Build a list of return value registers.
+  SmallVector<unsigned, 4> RetRegs;
+
+  if (Ret->getNumOperands() > 0) {
+    CallingConv::ID CC = F.getCallingConv();
+    SmallVector<ISD::OutputArg, 4> Outs;
+    GetReturnInfo(F.getReturnType(), F.getAttributes(), Outs, TLI);
+
+    // Analyze operands of the call, assigning locations to each operand.
+    SmallVector<CCValAssign, 16> ValLocs;
+    CCState CCInfo(CC, F.isVarArg(), *FuncInfo.MF, TM, ValLocs,
+                   I->getContext());
+    CCAssignFn *RetCC = CC == CallingConv::WebKit_JS ? RetCC_AArch64_WebKit_JS
+                                                     : RetCC_AArch64_AAPCS;
+    CCInfo.AnalyzeReturn(Outs, RetCC);
+
+    // Only handle a single return value for now.
+    if (ValLocs.size() != 1)
+      return false;
+
+    CCValAssign &VA = ValLocs[0];
+    const Value *RV = Ret->getOperand(0);
+
+    // Don't bother handling odd stuff for now.
+    if (VA.getLocInfo() != CCValAssign::Full)
+      return false;
+    // Only handle register returns for now.
+    if (!VA.isRegLoc())
+      return false;
+    unsigned Reg = getRegForValue(RV);
+    if (Reg == 0)
+      return false;
+
+    unsigned SrcReg = Reg + VA.getValNo();
+    unsigned DestReg = VA.getLocReg();
+    // Avoid a cross-class copy. This is very unlikely.
+    if (!MRI.getRegClass(SrcReg)->contains(DestReg))
+      return false;
+
+    EVT RVEVT = TLI.getValueType(RV->getType());
+    if (!RVEVT.isSimple())
+      return false;
+
+    // Vectors (of > 1 lane) in big endian need tricky handling.
+    if (RVEVT.isVector() && RVEVT.getVectorNumElements() > 1)
+      return false;
+
+    MVT RVVT = RVEVT.getSimpleVT();
+    if (RVVT == MVT::f128)
+      return false;
+    MVT DestVT = VA.getValVT();
+    // Special handling for extended integers.
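+    // For example, a function declared to return a zeroext i8 has RVVT ==
+    // MVT::i8 but DestVT == MVT::i32, so the result register is widened with
+    // a zero-extend (a UBFM of bits 0..7) before the copy into the return
+    // register below.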
+ if (RVVT != DestVT) { + if (RVVT != MVT::i1 && RVVT != MVT::i8 && RVVT != MVT::i16) + return false; + + if (!Outs[0].Flags.isZExt() && !Outs[0].Flags.isSExt()) + return false; + + bool isZExt = Outs[0].Flags.isZExt(); + SrcReg = EmitIntExt(RVVT, SrcReg, DestVT, isZExt); + if (SrcReg == 0) + return false; + } + + // Make the copy. + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, + TII.get(TargetOpcode::COPY), DestReg).addReg(SrcReg); + + // Add register to return instruction. + RetRegs.push_back(VA.getLocReg()); + } + + MachineInstrBuilder MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, + TII.get(AArch64::RET_ReallyLR)); + for (unsigned i = 0, e = RetRegs.size(); i != e; ++i) + MIB.addReg(RetRegs[i], RegState::Implicit); + return true; +} + +bool AArch64FastISel::SelectTrunc(const Instruction *I) { + Type *DestTy = I->getType(); + Value *Op = I->getOperand(0); + Type *SrcTy = Op->getType(); + + EVT SrcEVT = TLI.getValueType(SrcTy, true); + EVT DestEVT = TLI.getValueType(DestTy, true); + if (!SrcEVT.isSimple()) + return false; + if (!DestEVT.isSimple()) + return false; + + MVT SrcVT = SrcEVT.getSimpleVT(); + MVT DestVT = DestEVT.getSimpleVT(); + + if (SrcVT != MVT::i64 && SrcVT != MVT::i32 && SrcVT != MVT::i16 && + SrcVT != MVT::i8) + return false; + if (DestVT != MVT::i32 && DestVT != MVT::i16 && DestVT != MVT::i8 && + DestVT != MVT::i1) + return false; + + unsigned SrcReg = getRegForValue(Op); + if (!SrcReg) + return false; + + // If we're truncating from i64 to a smaller non-legal type then generate an + // AND. Otherwise, we know the high bits are undefined and a truncate doesn't + // generate any code. + if (SrcVT == MVT::i64) { + uint64_t Mask = 0; + switch (DestVT.SimpleTy) { + default: + // Trunc i64 to i32 is handled by the target-independent fast-isel. + return false; + case MVT::i1: + Mask = 0x1; + break; + case MVT::i8: + Mask = 0xff; + break; + case MVT::i16: + Mask = 0xffff; + break; + } + // Issue an extract_subreg to get the lower 32-bits. + unsigned Reg32 = FastEmitInst_extractsubreg(MVT::i32, SrcReg, /*Kill=*/true, + AArch64::sub_32); + MRI.constrainRegClass(Reg32, &AArch64::GPR32RegClass); + // Create the AND instruction which performs the actual truncation. + unsigned ANDReg = createResultReg(&AArch64::GPR32spRegClass); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AArch64::ANDWri), + ANDReg) + .addReg(Reg32) + .addImm(AArch64_AM::encodeLogicalImmediate(Mask, 32)); + SrcReg = ANDReg; + } + + UpdateValueMap(I, SrcReg); + return true; +} + +unsigned AArch64FastISel::Emiti1Ext(unsigned SrcReg, MVT DestVT, bool isZExt) { + assert((DestVT == MVT::i8 || DestVT == MVT::i16 || DestVT == MVT::i32 || + DestVT == MVT::i64) && + "Unexpected value type."); + // Handle i8 and i16 as i32. + if (DestVT == MVT::i8 || DestVT == MVT::i16) + DestVT = MVT::i32; + + if (isZExt) { + MRI.constrainRegClass(SrcReg, &AArch64::GPR32RegClass); + unsigned ResultReg = createResultReg(&AArch64::GPR32spRegClass); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AArch64::ANDWri), + ResultReg) + .addReg(SrcReg) + .addImm(AArch64_AM::encodeLogicalImmediate(1, 32)); + + if (DestVT == MVT::i64) { + // We're ZExt i1 to i64. The ANDWri Wd, Ws, #1 implicitly clears the + // upper 32 bits. Emit a SUBREG_TO_REG to extend from Wd to Xd. 
+ unsigned Reg64 = MRI.createVirtualRegister(&AArch64::GPR64RegClass); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, + TII.get(AArch64::SUBREG_TO_REG), Reg64) + .addImm(0) + .addReg(ResultReg) + .addImm(AArch64::sub_32); + ResultReg = Reg64; + } + return ResultReg; + } else { + if (DestVT == MVT::i64) { + // FIXME: We're SExt i1 to i64. + return 0; + } + unsigned ResultReg = createResultReg(&AArch64::GPR32RegClass); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AArch64::SBFMWri), + ResultReg) + .addReg(SrcReg) + .addImm(0) + .addImm(0); + return ResultReg; + } +} + +unsigned AArch64FastISel::EmitIntExt(MVT SrcVT, unsigned SrcReg, MVT DestVT, + bool isZExt) { + assert(DestVT != MVT::i1 && "ZeroExt/SignExt an i1?"); + unsigned Opc; + unsigned Imm = 0; + + switch (SrcVT.SimpleTy) { + default: + return 0; + case MVT::i1: + return Emiti1Ext(SrcReg, DestVT, isZExt); + case MVT::i8: + if (DestVT == MVT::i64) + Opc = isZExt ? AArch64::UBFMXri : AArch64::SBFMXri; + else + Opc = isZExt ? AArch64::UBFMWri : AArch64::SBFMWri; + Imm = 7; + break; + case MVT::i16: + if (DestVT == MVT::i64) + Opc = isZExt ? AArch64::UBFMXri : AArch64::SBFMXri; + else + Opc = isZExt ? AArch64::UBFMWri : AArch64::SBFMWri; + Imm = 15; + break; + case MVT::i32: + assert(DestVT == MVT::i64 && "IntExt i32 to i32?!?"); + Opc = isZExt ? AArch64::UBFMXri : AArch64::SBFMXri; + Imm = 31; + break; + } + + // Handle i8 and i16 as i32. + if (DestVT == MVT::i8 || DestVT == MVT::i16) + DestVT = MVT::i32; + else if (DestVT == MVT::i64) { + unsigned Src64 = MRI.createVirtualRegister(&AArch64::GPR64RegClass); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, + TII.get(AArch64::SUBREG_TO_REG), Src64) + .addImm(0) + .addReg(SrcReg) + .addImm(AArch64::sub_32); + SrcReg = Src64; + } + + unsigned ResultReg = createResultReg(TLI.getRegClassFor(DestVT)); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc), ResultReg) + .addReg(SrcReg) + .addImm(0) + .addImm(Imm); + + return ResultReg; +} + +bool AArch64FastISel::SelectIntExt(const Instruction *I) { + // On ARM, in general, integer casts don't involve legal types; this code + // handles promotable integers. The high bits for a type smaller than + // the register size are assumed to be undefined. + Type *DestTy = I->getType(); + Value *Src = I->getOperand(0); + Type *SrcTy = Src->getType(); + + bool isZExt = isa(I); + unsigned SrcReg = getRegForValue(Src); + if (!SrcReg) + return false; + + EVT SrcEVT = TLI.getValueType(SrcTy, true); + EVT DestEVT = TLI.getValueType(DestTy, true); + if (!SrcEVT.isSimple()) + return false; + if (!DestEVT.isSimple()) + return false; + + MVT SrcVT = SrcEVT.getSimpleVT(); + MVT DestVT = DestEVT.getSimpleVT(); + unsigned ResultReg = EmitIntExt(SrcVT, SrcReg, DestVT, isZExt); + if (ResultReg == 0) + return false; + UpdateValueMap(I, ResultReg); + return true; +} + +bool AArch64FastISel::SelectRem(const Instruction *I, unsigned ISDOpcode) { + EVT DestEVT = TLI.getValueType(I->getType(), true); + if (!DestEVT.isSimple()) + return false; + + MVT DestVT = DestEVT.getSimpleVT(); + if (DestVT != MVT::i64 && DestVT != MVT::i32) + return false; + + unsigned DivOpc; + bool is64bit = (DestVT == MVT::i64); + switch (ISDOpcode) { + default: + return false; + case ISD::SREM: + DivOpc = is64bit ? AArch64::SDIVXr : AArch64::SDIVWr; + break; + case ISD::UREM: + DivOpc = is64bit ? AArch64::UDIVXr : AArch64::UDIVWr; + break; + } + unsigned MSubOpc = is64bit ? 
AArch64::MSUBXrrr : AArch64::MSUBWrrr; + unsigned Src0Reg = getRegForValue(I->getOperand(0)); + if (!Src0Reg) + return false; + + unsigned Src1Reg = getRegForValue(I->getOperand(1)); + if (!Src1Reg) + return false; + + unsigned QuotReg = createResultReg(TLI.getRegClassFor(DestVT)); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(DivOpc), QuotReg) + .addReg(Src0Reg) + .addReg(Src1Reg); + // The remainder is computed as numerator - (quotient * denominator) using the + // MSUB instruction. + unsigned ResultReg = createResultReg(TLI.getRegClassFor(DestVT)); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(MSubOpc), ResultReg) + .addReg(QuotReg) + .addReg(Src1Reg) + .addReg(Src0Reg); + UpdateValueMap(I, ResultReg); + return true; +} + +bool AArch64FastISel::SelectMul(const Instruction *I) { + EVT SrcEVT = TLI.getValueType(I->getOperand(0)->getType(), true); + if (!SrcEVT.isSimple()) + return false; + MVT SrcVT = SrcEVT.getSimpleVT(); + + // Must be simple value type. Don't handle vectors. + if (SrcVT != MVT::i64 && SrcVT != MVT::i32 && SrcVT != MVT::i16 && + SrcVT != MVT::i8) + return false; + + unsigned Opc; + unsigned ZReg; + switch (SrcVT.SimpleTy) { + default: + return false; + case MVT::i8: + case MVT::i16: + case MVT::i32: + ZReg = AArch64::WZR; + Opc = AArch64::MADDWrrr; + break; + case MVT::i64: + ZReg = AArch64::XZR; + Opc = AArch64::MADDXrrr; + break; + } + + unsigned Src0Reg = getRegForValue(I->getOperand(0)); + if (!Src0Reg) + return false; + + unsigned Src1Reg = getRegForValue(I->getOperand(1)); + if (!Src1Reg) + return false; + + // Create the base instruction, then add the operands. + unsigned ResultReg = createResultReg(TLI.getRegClassFor(SrcVT)); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc), ResultReg) + .addReg(Src0Reg) + .addReg(Src1Reg) + .addReg(ZReg); + UpdateValueMap(I, ResultReg); + return true; +} + +bool AArch64FastISel::TargetSelectInstruction(const Instruction *I) { + switch (I->getOpcode()) { + default: + break; + case Instruction::Load: + return SelectLoad(I); + case Instruction::Store: + return SelectStore(I); + case Instruction::Br: + return SelectBranch(I); + case Instruction::IndirectBr: + return SelectIndirectBr(I); + case Instruction::FCmp: + case Instruction::ICmp: + return SelectCmp(I); + case Instruction::Select: + return SelectSelect(I); + case Instruction::FPExt: + return SelectFPExt(I); + case Instruction::FPTrunc: + return SelectFPTrunc(I); + case Instruction::FPToSI: + return SelectFPToInt(I, /*Signed=*/true); + case Instruction::FPToUI: + return SelectFPToInt(I, /*Signed=*/false); + case Instruction::SIToFP: + return SelectIntToFP(I, /*Signed=*/true); + case Instruction::UIToFP: + return SelectIntToFP(I, /*Signed=*/false); + case Instruction::SRem: + return SelectRem(I, ISD::SREM); + case Instruction::URem: + return SelectRem(I, ISD::UREM); + case Instruction::Call: + if (const IntrinsicInst *II = dyn_cast(I)) + return SelectIntrinsicCall(*II); + return SelectCall(I); + case Instruction::Ret: + return SelectRet(I); + case Instruction::Trunc: + return SelectTrunc(I); + case Instruction::ZExt: + case Instruction::SExt: + return SelectIntExt(I); + case Instruction::Mul: + // FIXME: This really should be handled by the target-independent selector. + return SelectMul(I); + } + return false; + // Silence warnings. 
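+  // The statement below is never executed; taking the address of the
+  // TableGen-generated Darwin vararg calling-convention function merely
+  // keeps the compiler from emitting an unused-function warning, since it is
+  // not otherwise referenced here.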
+ (void)&CC_AArch64_DarwinPCS_VarArg; +} + +namespace llvm { +llvm::FastISel *AArch64::createFastISel(FunctionLoweringInfo &funcInfo, + const TargetLibraryInfo *libInfo) { + return new AArch64FastISel(funcInfo, libInfo); +} +} diff --git a/lib/Target/AArch64/AArch64FrameLowering.cpp b/lib/Target/AArch64/AArch64FrameLowering.cpp new file mode 100644 index 00000000000..deb306a506d --- /dev/null +++ b/lib/Target/AArch64/AArch64FrameLowering.cpp @@ -0,0 +1,891 @@ +//===- AArch64FrameLowering.cpp - AArch64 Frame Lowering -------*- C++ -*-====// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file contains the AArch64 implementation of TargetFrameLowering class. +// +//===----------------------------------------------------------------------===// + +#include "AArch64FrameLowering.h" +#include "AArch64InstrInfo.h" +#include "AArch64MachineFunctionInfo.h" +#include "AArch64Subtarget.h" +#include "AArch64TargetMachine.h" +#include "llvm/ADT/Statistic.h" +#include "llvm/IR/DataLayout.h" +#include "llvm/IR/Function.h" +#include "llvm/CodeGen/MachineFrameInfo.h" +#include "llvm/CodeGen/MachineFunction.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/MachineModuleInfo.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/CodeGen/RegisterScavenging.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Support/raw_ostream.h" + +using namespace llvm; + +#define DEBUG_TYPE "frame-info" + +static cl::opt EnableRedZone("aarch64-redzone", + cl::desc("enable use of redzone on AArch64"), + cl::init(false), cl::Hidden); + +STATISTIC(NumRedZoneFunctions, "Number of functions using red zone"); + +static unsigned estimateStackSize(MachineFunction &MF) { + const MachineFrameInfo *FFI = MF.getFrameInfo(); + int Offset = 0; + for (int i = FFI->getObjectIndexBegin(); i != 0; ++i) { + int FixedOff = -FFI->getObjectOffset(i); + if (FixedOff > Offset) + Offset = FixedOff; + } + for (unsigned i = 0, e = FFI->getObjectIndexEnd(); i != e; ++i) { + if (FFI->isDeadObjectIndex(i)) + continue; + Offset += FFI->getObjectSize(i); + unsigned Align = FFI->getObjectAlignment(i); + // Adjust to alignment boundary + Offset = (Offset + Align - 1) / Align * Align; + } + // This does not include the 16 bytes used for fp and lr. + return (unsigned)Offset; +} + +bool AArch64FrameLowering::canUseRedZone(const MachineFunction &MF) const { + if (!EnableRedZone) + return false; + // Don't use the red zone if the function explicitly asks us not to. + // This is typically used for kernel code. + if (MF.getFunction()->getAttributes().hasAttribute( + AttributeSet::FunctionIndex, Attribute::NoRedZone)) + return false; + + const MachineFrameInfo *MFI = MF.getFrameInfo(); + const AArch64FunctionInfo *AFI = MF.getInfo(); + unsigned NumBytes = AFI->getLocalStackSize(); + + // Note: currently hasFP() is always true for hasCalls(), but that's an + // implementation detail of the current code, not a strict requirement, + // so stay safe here and check both. + if (MFI->hasCalls() || hasFP(MF) || NumBytes > 128) + return false; + return true; +} + +/// hasFP - Return true if the specified function should have a dedicated frame +/// pointer register. 
+bool AArch64FrameLowering::hasFP(const MachineFunction &MF) const { + const MachineFrameInfo *MFI = MF.getFrameInfo(); + +#ifndef NDEBUG + const TargetRegisterInfo *RegInfo = MF.getTarget().getRegisterInfo(); + assert(!RegInfo->needsStackRealignment(MF) && + "No stack realignment on AArch64!"); +#endif + + return (MFI->hasCalls() || MFI->hasVarSizedObjects() || + MFI->isFrameAddressTaken()); +} + +/// hasReservedCallFrame - Under normal circumstances, when a frame pointer is +/// not required, we reserve argument space for call sites in the function +/// immediately on entry to the current function. This eliminates the need for +/// add/sub sp brackets around call sites. Returns true if the call frame is +/// included as part of the stack frame. +bool +AArch64FrameLowering::hasReservedCallFrame(const MachineFunction &MF) const { + return !MF.getFrameInfo()->hasVarSizedObjects(); +} + +void AArch64FrameLowering::eliminateCallFramePseudoInstr( + MachineFunction &MF, MachineBasicBlock &MBB, + MachineBasicBlock::iterator I) const { + const AArch64InstrInfo *TII = + static_cast(MF.getTarget().getInstrInfo()); + DebugLoc DL = I->getDebugLoc(); + int Opc = I->getOpcode(); + bool IsDestroy = Opc == TII->getCallFrameDestroyOpcode(); + uint64_t CalleePopAmount = IsDestroy ? I->getOperand(1).getImm() : 0; + + const TargetFrameLowering *TFI = MF.getTarget().getFrameLowering(); + if (!TFI->hasReservedCallFrame(MF)) { + unsigned Align = getStackAlignment(); + + int64_t Amount = I->getOperand(0).getImm(); + Amount = RoundUpToAlignment(Amount, Align); + if (!IsDestroy) + Amount = -Amount; + + // N.b. if CalleePopAmount is valid but zero (i.e. callee would pop, but it + // doesn't have to pop anything), then the first operand will be zero too so + // this adjustment is a no-op. + if (CalleePopAmount == 0) { + // FIXME: in-function stack adjustment for calls is limited to 24-bits + // because there's no guaranteed temporary register available. + // + // ADD/SUB (immediate) has only LSL #0 and LSL #12 avaiable. + // 1) For offset <= 12-bit, we use LSL #0 + // 2) For 12-bit <= offset <= 24-bit, we use two instructions. One uses + // LSL #0, and the other uses LSL #12. + // + // Mostly call frames will be allocated at the start of a function so + // this is OK, but it is a limitation that needs dealing with. + assert(Amount > -0xffffff && Amount < 0xffffff && "call frame too large"); + emitFrameOffset(MBB, I, DL, AArch64::SP, AArch64::SP, Amount, TII); + } + } else if (CalleePopAmount != 0) { + // If the calling convention demands that the callee pops arguments from the + // stack, we want to add it back if we have a reserved call frame. + assert(CalleePopAmount < 0xffffff && "call frame too large"); + emitFrameOffset(MBB, I, DL, AArch64::SP, AArch64::SP, -CalleePopAmount, + TII); + } + MBB.erase(I); +} + +void AArch64FrameLowering::emitCalleeSavedFrameMoves( + MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, + unsigned FramePtr) const { + MachineFunction &MF = *MBB.getParent(); + MachineFrameInfo *MFI = MF.getFrameInfo(); + MachineModuleInfo &MMI = MF.getMMI(); + const MCRegisterInfo *MRI = MMI.getContext().getRegisterInfo(); + const AArch64InstrInfo *TII = TM.getInstrInfo(); + DebugLoc DL = MBB.findDebugLoc(MBBI); + + // Add callee saved registers to move list. + const std::vector &CSI = MFI->getCalleeSavedInfo(); + if (CSI.empty()) + return; + + const DataLayout *TD = MF.getTarget().getDataLayout(); + bool HasFP = hasFP(MF); + + // Calculate amount of bytes used for return address storing. 
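+  // With 8-byte pointers this is -8, so saveAreaOffset below is -16 when a
+  // frame pointer (FP + LR) has been saved and -8 otherwise.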
+ int stackGrowth = -TD->getPointerSize(0); + + // Calculate offsets. + int64_t saveAreaOffset = (HasFP ? 2 : 1) * stackGrowth; + unsigned TotalSkipped = 0; + for (const auto &Info : CSI) { + unsigned Reg = Info.getReg(); + int64_t Offset = MFI->getObjectOffset(Info.getFrameIdx()) - + getOffsetOfLocalArea() + saveAreaOffset; + + // Don't output a new CFI directive if we're re-saving the frame pointer or + // link register. This happens when the PrologEpilogInserter has inserted an + // extra "STP" of the frame pointer and link register -- the "emitPrologue" + // method automatically generates the directives when frame pointers are + // used. If we generate CFI directives for the extra "STP"s, the linker will + // lose track of the correct values for the frame pointer and link register. + if (HasFP && (FramePtr == Reg || Reg == AArch64::LR)) { + TotalSkipped += stackGrowth; + continue; + } + + unsigned DwarfReg = MRI->getDwarfRegNum(Reg, true); + unsigned CFIIndex = MMI.addFrameInst(MCCFIInstruction::createOffset( + nullptr, DwarfReg, Offset - TotalSkipped)); + BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::CFI_INSTRUCTION)) + .addCFIIndex(CFIIndex); + } +} + +void AArch64FrameLowering::emitPrologue(MachineFunction &MF) const { + MachineBasicBlock &MBB = MF.front(); // Prologue goes in entry BB. + MachineBasicBlock::iterator MBBI = MBB.begin(); + const MachineFrameInfo *MFI = MF.getFrameInfo(); + const Function *Fn = MF.getFunction(); + const AArch64RegisterInfo *RegInfo = TM.getRegisterInfo(); + const AArch64InstrInfo *TII = TM.getInstrInfo(); + MachineModuleInfo &MMI = MF.getMMI(); + AArch64FunctionInfo *AFI = MF.getInfo(); + bool needsFrameMoves = MMI.hasDebugInfo() || Fn->needsUnwindTableEntry(); + bool HasFP = hasFP(MF); + DebugLoc DL = MBB.findDebugLoc(MBBI); + + int NumBytes = (int)MFI->getStackSize(); + if (!AFI->hasStackFrame()) { + assert(!HasFP && "unexpected function without stack frame but with FP"); + + // All of the stack allocation is for locals. + AFI->setLocalStackSize(NumBytes); + + // Label used to tie together the PROLOG_LABEL and the MachineMoves. + MCSymbol *FrameLabel = MMI.getContext().CreateTempSymbol(); + + // REDZONE: If the stack size is less than 128 bytes, we don't need + // to actually allocate. + if (NumBytes && !canUseRedZone(MF)) { + emitFrameOffset(MBB, MBBI, DL, AArch64::SP, AArch64::SP, -NumBytes, TII, + MachineInstr::FrameSetup); + + // Encode the stack size of the leaf function. + unsigned CFIIndex = MMI.addFrameInst( + MCCFIInstruction::createDefCfaOffset(FrameLabel, -NumBytes)); + BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::CFI_INSTRUCTION)) + .addCFIIndex(CFIIndex); + } else if (NumBytes) { + ++NumRedZoneFunctions; + } + + return; + } + + // Only set up FP if we actually need to. + int FPOffset = 0; + if (HasFP) { + // First instruction must a) allocate the stack and b) have an immediate + // that is a multiple of -2. + assert((MBBI->getOpcode() == AArch64::STPXpre || + MBBI->getOpcode() == AArch64::STPDpre) && + MBBI->getOperand(3).getReg() == AArch64::SP && + MBBI->getOperand(4).getImm() < 0 && + (MBBI->getOperand(4).getImm() & 1) == 0); + + // Frame pointer is fp = sp - 16. Since the STPXpre subtracts the space + // required for the callee saved register area we get the frame pointer + // by addding that offset - 16 = -getImm()*8 - 2*8 = -(getImm() + 2) * 8. + FPOffset = -(MBBI->getOperand(4).getImm() + 2) * 8; + assert(FPOffset >= 0 && "Bad Framepointer Offset"); + } + + // Move past the saves of the callee-saved registers. 
+ while (MBBI->getOpcode() == AArch64::STPXi || + MBBI->getOpcode() == AArch64::STPDi || + MBBI->getOpcode() == AArch64::STPXpre || + MBBI->getOpcode() == AArch64::STPDpre) { + ++MBBI; + NumBytes -= 16; + } + assert(NumBytes >= 0 && "Negative stack allocation size!?"); + if (HasFP) { + // Issue sub fp, sp, FPOffset or + // mov fp,sp when FPOffset is zero. + // Note: All stores of callee-saved registers are marked as "FrameSetup". + // This code marks the instruction(s) that set the FP also. + emitFrameOffset(MBB, MBBI, DL, AArch64::FP, AArch64::SP, FPOffset, TII, + MachineInstr::FrameSetup); + } + + // All of the remaining stack allocations are for locals. + AFI->setLocalStackSize(NumBytes); + + // Allocate space for the rest of the frame. + if (NumBytes) { + // If we're a leaf function, try using the red zone. + if (!canUseRedZone(MF)) + emitFrameOffset(MBB, MBBI, DL, AArch64::SP, AArch64::SP, -NumBytes, TII, + MachineInstr::FrameSetup); + } + + // If we need a base pointer, set it up here. It's whatever the value of the + // stack pointer is at this point. Any variable size objects will be allocated + // after this, so we can still use the base pointer to reference locals. + // + // FIXME: Clarify FrameSetup flags here. + // Note: Use emitFrameOffset() like above for FP if the FrameSetup flag is + // needed. + // + if (RegInfo->hasBasePointer(MF)) + TII->copyPhysReg(MBB, MBBI, DL, AArch64::X19, AArch64::SP, false); + + if (needsFrameMoves) { + const DataLayout *TD = MF.getTarget().getDataLayout(); + const int StackGrowth = -TD->getPointerSize(0); + unsigned FramePtr = RegInfo->getFrameRegister(MF); + + // An example of the prologue: + // + // .globl __foo + // .align 2 + // __foo: + // Ltmp0: + // .cfi_startproc + // .cfi_personality 155, ___gxx_personality_v0 + // Leh_func_begin: + // .cfi_lsda 16, Lexception33 + // + // stp xa,bx, [sp, -#offset]! + // ... + // stp x28, x27, [sp, #offset-32] + // stp fp, lr, [sp, #offset-16] + // add fp, sp, #offset - 16 + // sub sp, sp, #1360 + // + // The Stack: + // +-------------------------------------------+ + // 10000 | ........ | ........ | ........ | ........ | + // 10004 | ........ | ........ | ........ | ........ | + // +-------------------------------------------+ + // 10008 | ........ | ........ | ........ | ........ | + // 1000c | ........ | ........ | ........ | ........ | + // +===========================================+ + // 10010 | X28 Register | + // 10014 | X28 Register | + // +-------------------------------------------+ + // 10018 | X27 Register | + // 1001c | X27 Register | + // +===========================================+ + // 10020 | Frame Pointer | + // 10024 | Frame Pointer | + // +-------------------------------------------+ + // 10028 | Link Register | + // 1002c | Link Register | + // +===========================================+ + // 10030 | ........ | ........ | ........ | ........ | + // 10034 | ........ | ........ | ........ | ........ | + // +-------------------------------------------+ + // 10038 | ........ | ........ | ........ | ........ | + // 1003c | ........ | ........ | ........ | ........ | + // +-------------------------------------------+ + // + // [sp] = 10030 :: >>initial value<< + // sp = 10020 :: stp fp, lr, [sp, #-16]! + // fp = sp == 10020 :: mov fp, sp + // [sp] == 10020 :: stp x28, x27, [sp, #-16]! + // sp == 10010 :: >>final value<< + // + // The frame pointer (w29) points to address 10020. 
If we use an offset of + // '16' from 'w29', we get the CFI offsets of -8 for w30, -16 for w29, -24 + // for w27, and -32 for w28: + // + // Ltmp1: + // .cfi_def_cfa w29, 16 + // Ltmp2: + // .cfi_offset w30, -8 + // Ltmp3: + // .cfi_offset w29, -16 + // Ltmp4: + // .cfi_offset w27, -24 + // Ltmp5: + // .cfi_offset w28, -32 + + if (HasFP) { + // Define the current CFA rule to use the provided FP. + unsigned Reg = RegInfo->getDwarfRegNum(FramePtr, true); + unsigned CFIIndex = MMI.addFrameInst( + MCCFIInstruction::createDefCfa(nullptr, Reg, 2 * StackGrowth)); + BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::CFI_INSTRUCTION)) + .addCFIIndex(CFIIndex); + + // Record the location of the stored LR + unsigned LR = RegInfo->getDwarfRegNum(AArch64::LR, true); + CFIIndex = MMI.addFrameInst( + MCCFIInstruction::createOffset(nullptr, LR, StackGrowth)); + BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::CFI_INSTRUCTION)) + .addCFIIndex(CFIIndex); + + // Record the location of the stored FP + CFIIndex = MMI.addFrameInst( + MCCFIInstruction::createOffset(nullptr, Reg, 2 * StackGrowth)); + BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::CFI_INSTRUCTION)) + .addCFIIndex(CFIIndex); + } else { + // Encode the stack size of the leaf function. + unsigned CFIIndex = MMI.addFrameInst( + MCCFIInstruction::createDefCfaOffset(nullptr, -MFI->getStackSize())); + BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::CFI_INSTRUCTION)) + .addCFIIndex(CFIIndex); + } + + // Now emit the moves for whatever callee saved regs we have. + emitCalleeSavedFrameMoves(MBB, MBBI, FramePtr); + } +} + +static bool isCalleeSavedRegister(unsigned Reg, const MCPhysReg *CSRegs) { + for (unsigned i = 0; CSRegs[i]; ++i) + if (Reg == CSRegs[i]) + return true; + return false; +} + +static bool isCSRestore(MachineInstr *MI, const MCPhysReg *CSRegs) { + unsigned RtIdx = 0; + if (MI->getOpcode() == AArch64::LDPXpost || + MI->getOpcode() == AArch64::LDPDpost) + RtIdx = 1; + + if (MI->getOpcode() == AArch64::LDPXpost || + MI->getOpcode() == AArch64::LDPDpost || + MI->getOpcode() == AArch64::LDPXi || MI->getOpcode() == AArch64::LDPDi) { + if (!isCalleeSavedRegister(MI->getOperand(RtIdx).getReg(), CSRegs) || + !isCalleeSavedRegister(MI->getOperand(RtIdx + 1).getReg(), CSRegs) || + MI->getOperand(RtIdx + 2).getReg() != AArch64::SP) + return false; + return true; + } + + return false; +} + +void AArch64FrameLowering::emitEpilogue(MachineFunction &MF, + MachineBasicBlock &MBB) const { + MachineBasicBlock::iterator MBBI = MBB.getLastNonDebugInstr(); + assert(MBBI->isReturn() && "Can only insert epilog into returning blocks"); + MachineFrameInfo *MFI = MF.getFrameInfo(); + const AArch64InstrInfo *TII = + static_cast(MF.getTarget().getInstrInfo()); + const AArch64RegisterInfo *RegInfo = static_cast( + MF.getTarget().getRegisterInfo()); + DebugLoc DL = MBBI->getDebugLoc(); + unsigned RetOpcode = MBBI->getOpcode(); + + int NumBytes = MFI->getStackSize(); + const AArch64FunctionInfo *AFI = MF.getInfo(); + + // Initial and residual are named for consitency with the prologue. Note that + // in the epilogue, the residual adjustment is executed first. + uint64_t ArgumentPopSize = 0; + if (RetOpcode == AArch64::TCRETURNdi || RetOpcode == AArch64::TCRETURNri) { + MachineOperand &StackAdjust = MBBI->getOperand(1); + + // For a tail-call in a callee-pops-arguments environment, some or all of + // the stack may actually be in use for the call's arguments, this is + // calculated during LowerCall and consumed here... + ArgumentPopSize = StackAdjust.getImm(); + } else { + // ... 
otherwise the amount to pop is *all* of the argument space, + // conveniently stored in the MachineFunctionInfo by + // LowerFormalArguments. This will, of course, be zero for the C calling + // convention. + ArgumentPopSize = AFI->getArgumentStackToRestore(); + } + + // The stack frame should be like below, + // + // ---------------------- --- + // | | | + // | BytesInStackArgArea| CalleeArgStackSize + // | (NumReusableBytes) | (of tail call) + // | | --- + // | | | + // ---------------------| --- | + // | | | | + // | CalleeSavedReg | | | + // | (NumRestores * 16) | | | + // | | | | + // ---------------------| | NumBytes + // | | StackSize (StackAdjustUp) + // | LocalStackSize | | | + // | (covering callee | | | + // | args) | | | + // | | | | + // ---------------------- --- --- + // + // So NumBytes = StackSize + BytesInStackArgArea - CalleeArgStackSize + // = StackSize + ArgumentPopSize + // + // AArch64TargetLowering::LowerCall figures out ArgumentPopSize and keeps + // it as the 2nd argument of AArch64ISD::TC_RETURN. + NumBytes += ArgumentPopSize; + + unsigned NumRestores = 0; + // Move past the restores of the callee-saved registers. + MachineBasicBlock::iterator LastPopI = MBBI; + const MCPhysReg *CSRegs = RegInfo->getCalleeSavedRegs(&MF); + if (LastPopI != MBB.begin()) { + do { + ++NumRestores; + --LastPopI; + } while (LastPopI != MBB.begin() && isCSRestore(LastPopI, CSRegs)); + if (!isCSRestore(LastPopI, CSRegs)) { + ++LastPopI; + --NumRestores; + } + } + NumBytes -= NumRestores * 16; + assert(NumBytes >= 0 && "Negative stack allocation size!?"); + + if (!hasFP(MF)) { + // If this was a redzone leaf function, we don't need to restore the + // stack pointer. + if (!canUseRedZone(MF)) + emitFrameOffset(MBB, LastPopI, DL, AArch64::SP, AArch64::SP, NumBytes, + TII); + return; + } + + // Restore the original stack pointer. + // FIXME: Rather than doing the math here, we should instead just use + // non-post-indexed loads for the restores if we aren't actually going to + // be able to save any instructions. + if (NumBytes || MFI->hasVarSizedObjects()) + emitFrameOffset(MBB, LastPopI, DL, AArch64::SP, AArch64::FP, + -(NumRestores - 1) * 16, TII, MachineInstr::NoFlags); +} + +/// getFrameIndexOffset - Returns the displacement from the frame register to +/// the stack frame of the specified index. +int AArch64FrameLowering::getFrameIndexOffset(const MachineFunction &MF, + int FI) const { + unsigned FrameReg; + return getFrameIndexReference(MF, FI, FrameReg); +} + +/// getFrameIndexReference - Provide a base+offset reference to an FI slot for +/// debug info. It's the same as what we use for resolving the code-gen +/// references for now. FIXME: This can go wrong when references are +/// SP-relative and simple call frames aren't used. +int AArch64FrameLowering::getFrameIndexReference(const MachineFunction &MF, + int FI, + unsigned &FrameReg) const { + return resolveFrameIndexReference(MF, FI, FrameReg); +} + +int AArch64FrameLowering::resolveFrameIndexReference(const MachineFunction &MF, + int FI, unsigned &FrameReg, + bool PreferFP) const { + const MachineFrameInfo *MFI = MF.getFrameInfo(); + const AArch64RegisterInfo *RegInfo = static_cast( + MF.getTarget().getRegisterInfo()); + const AArch64FunctionInfo *AFI = MF.getInfo(); + int FPOffset = MFI->getObjectOffset(FI) + 16; + int Offset = MFI->getObjectOffset(FI) + MFI->getStackSize(); + bool isFixed = MFI->isFixedObjectIndex(FI); + + // Use frame pointer to reference fixed objects. 
Use it for locals if + // there are VLAs (and thus the SP isn't reliable as a base). + // Make sure useFPForScavengingIndex() does the right thing for the emergency + // spill slot. + bool UseFP = false; + if (AFI->hasStackFrame()) { + // Note: Keeping the following as multiple 'if' statements rather than + // merging to a single expression for readability. + // + // Argument access should always use the FP. + if (isFixed) { + UseFP = hasFP(MF); + } else if (hasFP(MF) && !RegInfo->hasBasePointer(MF)) { + // Use SP or FP, whichever gives us the best chance of the offset + // being in range for direct access. If the FPOffset is positive, + // that'll always be best, as the SP will be even further away. + // If the FPOffset is negative, we have to keep in mind that the + // available offset range for negative offsets is smaller than for + // positive ones. If we have variable sized objects, we're stuck with + // using the FP regardless, though, as the SP offset is unknown + // and we don't have a base pointer available. If an offset is + // available via the FP and the SP, use whichever is closest. + if (PreferFP || MFI->hasVarSizedObjects() || FPOffset >= 0 || + (FPOffset >= -256 && Offset > -FPOffset)) + UseFP = true; + } + } + + if (UseFP) { + FrameReg = RegInfo->getFrameRegister(MF); + return FPOffset; + } + + // Use the base pointer if we have one. + if (RegInfo->hasBasePointer(MF)) + FrameReg = RegInfo->getBaseRegister(); + else { + FrameReg = AArch64::SP; + // If we're using the red zone for this function, the SP won't actually + // be adjusted, so the offsets will be negative. They're also all + // within range of the signed 9-bit immediate instructions. + if (canUseRedZone(MF)) + Offset -= AFI->getLocalStackSize(); + } + + return Offset; +} + +static unsigned getPrologueDeath(MachineFunction &MF, unsigned Reg) { + if (Reg != AArch64::LR) + return getKillRegState(true); + + // LR maybe referred to later by an @llvm.returnaddress intrinsic. + bool LRLiveIn = MF.getRegInfo().isLiveIn(AArch64::LR); + bool LRKill = !(LRLiveIn && MF.getFrameInfo()->isReturnAddressTaken()); + return getKillRegState(LRKill); +} + +bool AArch64FrameLowering::spillCalleeSavedRegisters( + MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, + const std::vector &CSI, + const TargetRegisterInfo *TRI) const { + MachineFunction &MF = *MBB.getParent(); + const TargetInstrInfo &TII = *MF.getTarget().getInstrInfo(); + unsigned Count = CSI.size(); + DebugLoc DL; + assert((Count & 1) == 0 && "Odd number of callee-saved regs to spill!"); + + if (MI != MBB.end()) + DL = MI->getDebugLoc(); + + for (unsigned i = 0; i < Count; i += 2) { + unsigned idx = Count - i - 2; + unsigned Reg1 = CSI[idx].getReg(); + unsigned Reg2 = CSI[idx + 1].getReg(); + // GPRs and FPRs are saved in pairs of 64-bit regs. We expect the CSI + // list to come in sorted by frame index so that we can issue the store + // pair instructions directly. Assert if we see anything otherwise. + // + // The order of the registers in the list is controlled by + // getCalleeSavedRegs(), so they will always be in-order, as well. + assert(CSI[idx].getFrameIdx() + 1 == CSI[idx + 1].getFrameIdx() && + "Out of order callee saved regs!"); + unsigned StrOpc; + assert((Count & 1) == 0 && "Odd number of callee-saved regs to spill!"); + assert((i & 1) == 0 && "Odd index for callee-saved reg spill!"); + // Issue sequence of non-sp increment and pi sp spills for cs regs. The + // first spill is a pre-increment that allocates the stack. 
+ // For example: + // stp x22, x21, [sp, #-48]! // addImm(-6) + // stp x20, x19, [sp, #16] // addImm(+2) + // stp fp, lr, [sp, #32] // addImm(+4) + // Rationale: This sequence saves uop updates compared to a sequence of + // pre-increment spills like stp xi,xj,[sp,#-16]! + // Note: Similar rational and sequence for restores in epilog. + if (AArch64::GPR64RegClass.contains(Reg1)) { + assert(AArch64::GPR64RegClass.contains(Reg2) && + "Expected GPR64 callee-saved register pair!"); + // For first spill use pre-increment store. + if (i == 0) + StrOpc = AArch64::STPXpre; + else + StrOpc = AArch64::STPXi; + } else if (AArch64::FPR64RegClass.contains(Reg1)) { + assert(AArch64::FPR64RegClass.contains(Reg2) && + "Expected FPR64 callee-saved register pair!"); + // For first spill use pre-increment store. + if (i == 0) + StrOpc = AArch64::STPDpre; + else + StrOpc = AArch64::STPDi; + } else + llvm_unreachable("Unexpected callee saved register!"); + DEBUG(dbgs() << "CSR spill: (" << TRI->getName(Reg1) << ", " + << TRI->getName(Reg2) << ") -> fi#(" << CSI[idx].getFrameIdx() + << ", " << CSI[idx + 1].getFrameIdx() << ")\n"); + // Compute offset: i = 0 => offset = -Count; + // i = 2 => offset = -(Count - 2) + Count = 2 = i; etc. + const int Offset = (i == 0) ? -Count : i; + assert((Offset >= -64 && Offset <= 63) && + "Offset out of bounds for STP immediate"); + MachineInstrBuilder MIB = BuildMI(MBB, MI, DL, TII.get(StrOpc)); + if (StrOpc == AArch64::STPDpre || StrOpc == AArch64::STPXpre) + MIB.addReg(AArch64::SP, RegState::Define); + + MIB.addReg(Reg2, getPrologueDeath(MF, Reg2)) + .addReg(Reg1, getPrologueDeath(MF, Reg1)) + .addReg(AArch64::SP) + .addImm(Offset) // [sp, #offset * 8], where factor * 8 is implicit + .setMIFlag(MachineInstr::FrameSetup); + } + return true; +} + +bool AArch64FrameLowering::restoreCalleeSavedRegisters( + MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, + const std::vector &CSI, + const TargetRegisterInfo *TRI) const { + MachineFunction &MF = *MBB.getParent(); + const TargetInstrInfo &TII = *MF.getTarget().getInstrInfo(); + unsigned Count = CSI.size(); + DebugLoc DL; + assert((Count & 1) == 0 && "Odd number of callee-saved regs to spill!"); + + if (MI != MBB.end()) + DL = MI->getDebugLoc(); + + for (unsigned i = 0; i < Count; i += 2) { + unsigned Reg1 = CSI[i].getReg(); + unsigned Reg2 = CSI[i + 1].getReg(); + // GPRs and FPRs are saved in pairs of 64-bit regs. We expect the CSI + // list to come in sorted by frame index so that we can issue the store + // pair instructions directly. Assert if we see anything otherwise. + assert(CSI[i].getFrameIdx() + 1 == CSI[i + 1].getFrameIdx() && + "Out of order callee saved regs!"); + // Issue sequence of non-sp increment and sp-pi restores for cs regs. 
Only + // the last load is sp-pi post-increment and de-allocates the stack: + // For example: + // ldp fp, lr, [sp, #32] // addImm(+4) + // ldp x20, x19, [sp, #16] // addImm(+2) + // ldp x22, x21, [sp], #48 // addImm(+6) + // Note: see comment in spillCalleeSavedRegisters() + unsigned LdrOpc; + + assert((Count & 1) == 0 && "Odd number of callee-saved regs to spill!"); + assert((i & 1) == 0 && "Odd index for callee-saved reg spill!"); + if (AArch64::GPR64RegClass.contains(Reg1)) { + assert(AArch64::GPR64RegClass.contains(Reg2) && + "Expected GPR64 callee-saved register pair!"); + if (i == Count - 2) + LdrOpc = AArch64::LDPXpost; + else + LdrOpc = AArch64::LDPXi; + } else if (AArch64::FPR64RegClass.contains(Reg1)) { + assert(AArch64::FPR64RegClass.contains(Reg2) && + "Expected FPR64 callee-saved register pair!"); + if (i == Count - 2) + LdrOpc = AArch64::LDPDpost; + else + LdrOpc = AArch64::LDPDi; + } else + llvm_unreachable("Unexpected callee saved register!"); + DEBUG(dbgs() << "CSR restore: (" << TRI->getName(Reg1) << ", " + << TRI->getName(Reg2) << ") -> fi#(" << CSI[i].getFrameIdx() + << ", " << CSI[i + 1].getFrameIdx() << ")\n"); + + // Compute offset: i = 0 => offset = Count - 2; i = 2 => offset = Count - 4; + // etc. + const int Offset = (i == Count - 2) ? Count : Count - i - 2; + assert((Offset >= -64 && Offset <= 63) && + "Offset out of bounds for LDP immediate"); + MachineInstrBuilder MIB = BuildMI(MBB, MI, DL, TII.get(LdrOpc)); + if (LdrOpc == AArch64::LDPXpost || LdrOpc == AArch64::LDPDpost) + MIB.addReg(AArch64::SP, RegState::Define); + + MIB.addReg(Reg2, getDefRegState(true)) + .addReg(Reg1, getDefRegState(true)) + .addReg(AArch64::SP) + .addImm(Offset); // [sp], #offset * 8 or [sp, #offset * 8] + // where the factor * 8 is implicit + } + return true; +} + +void AArch64FrameLowering::processFunctionBeforeCalleeSavedScan( + MachineFunction &MF, RegScavenger *RS) const { + const AArch64RegisterInfo *RegInfo = static_cast( + MF.getTarget().getRegisterInfo()); + AArch64FunctionInfo *AFI = MF.getInfo(); + MachineRegisterInfo *MRI = &MF.getRegInfo(); + SmallVector UnspilledCSGPRs; + SmallVector UnspilledCSFPRs; + + // The frame record needs to be created by saving the appropriate registers + if (hasFP(MF)) { + MRI->setPhysRegUsed(AArch64::FP); + MRI->setPhysRegUsed(AArch64::LR); + } + + // Spill the BasePtr if it's used. Do this first thing so that the + // getCalleeSavedRegs() below will get the right answer. + if (RegInfo->hasBasePointer(MF)) + MRI->setPhysRegUsed(RegInfo->getBaseRegister()); + + // If any callee-saved registers are used, the frame cannot be eliminated. + unsigned NumGPRSpilled = 0; + unsigned NumFPRSpilled = 0; + bool ExtraCSSpill = false; + bool CanEliminateFrame = true; + DEBUG(dbgs() << "*** processFunctionBeforeCalleeSavedScan\nUsed CSRs:"); + const MCPhysReg *CSRegs = RegInfo->getCalleeSavedRegs(&MF); + + // Check pairs of consecutive callee-saved registers. 
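+  // The list returned by getCalleeSavedRegs() is laid out as STP-able pairs,
+  // e.g. (LR, FP) followed by adjacent GPR pairs such as (X19, X20) and FPR
+  // pairs such as (D8, D9); the loop below therefore walks it two registers
+  // at a time.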
+ for (unsigned i = 0; CSRegs[i]; i += 2) { + assert(CSRegs[i + 1] && "Odd number of callee-saved registers!"); + + const unsigned OddReg = CSRegs[i]; + const unsigned EvenReg = CSRegs[i + 1]; + assert((AArch64::GPR64RegClass.contains(OddReg) && + AArch64::GPR64RegClass.contains(EvenReg)) ^ + (AArch64::FPR64RegClass.contains(OddReg) && + AArch64::FPR64RegClass.contains(EvenReg)) && + "Register class mismatch!"); + + const bool OddRegUsed = MRI->isPhysRegUsed(OddReg); + const bool EvenRegUsed = MRI->isPhysRegUsed(EvenReg); + + // Early exit if none of the registers in the register pair is actually + // used. + if (!OddRegUsed && !EvenRegUsed) { + if (AArch64::GPR64RegClass.contains(OddReg)) { + UnspilledCSGPRs.push_back(OddReg); + UnspilledCSGPRs.push_back(EvenReg); + } else { + UnspilledCSFPRs.push_back(OddReg); + UnspilledCSFPRs.push_back(EvenReg); + } + continue; + } + + unsigned Reg = AArch64::NoRegister; + // If only one of the registers of the register pair is used, make sure to + // mark the other one as used as well. + if (OddRegUsed ^ EvenRegUsed) { + // Find out which register is the additional spill. + Reg = OddRegUsed ? EvenReg : OddReg; + MRI->setPhysRegUsed(Reg); + } + + DEBUG(dbgs() << ' ' << PrintReg(OddReg, RegInfo)); + DEBUG(dbgs() << ' ' << PrintReg(EvenReg, RegInfo)); + + assert(((OddReg == AArch64::LR && EvenReg == AArch64::FP) || + (RegInfo->getEncodingValue(OddReg) + 1 == + RegInfo->getEncodingValue(EvenReg))) && + "Register pair of non-adjacent registers!"); + if (AArch64::GPR64RegClass.contains(OddReg)) { + NumGPRSpilled += 2; + // If it's not a reserved register, we can use it in lieu of an + // emergency spill slot for the register scavenger. + // FIXME: It would be better to instead keep looking and choose another + // unspilled register that isn't reserved, if there is one. + if (Reg != AArch64::NoRegister && !RegInfo->isReservedReg(MF, Reg)) + ExtraCSSpill = true; + } else + NumFPRSpilled += 2; + + CanEliminateFrame = false; + } + + // FIXME: Set BigStack if any stack slot references may be out of range. + // For now, just conservatively guestimate based on unscaled indexing + // range. We'll end up allocating an unnecessary spill slot a lot, but + // realistically that's not a big deal at this stage of the game. + // The CSR spill slots have not been allocated yet, so estimateStackSize + // won't include them. + MachineFrameInfo *MFI = MF.getFrameInfo(); + unsigned CFSize = estimateStackSize(MF) + 8 * (NumGPRSpilled + NumFPRSpilled); + DEBUG(dbgs() << "Estimated stack frame size: " << CFSize << " bytes.\n"); + bool BigStack = (CFSize >= 256); + if (BigStack || !CanEliminateFrame || RegInfo->cannotEliminateFrame(MF)) + AFI->setHasStackFrame(true); + + // Estimate if we might need to scavenge a register at some point in order + // to materialize a stack offset. If so, either spill one additional + // callee-saved register or reserve a special spill slot to facilitate + // register scavenging. If we already spilled an extra callee-saved register + // above to keep the number of spills even, we don't need to do anything else + // here. + if (BigStack && !ExtraCSSpill) { + + // If we're adding a register to spill here, we have to add two of them + // to keep the number of regs to spill even. 
+ assert(((UnspilledCSGPRs.size() & 1) == 0) && "Odd number of registers!"); + unsigned Count = 0; + while (!UnspilledCSGPRs.empty() && Count < 2) { + unsigned Reg = UnspilledCSGPRs.back(); + UnspilledCSGPRs.pop_back(); + DEBUG(dbgs() << "Spilling " << PrintReg(Reg, RegInfo) + << " to get a scratch register.\n"); + MRI->setPhysRegUsed(Reg); + ExtraCSSpill = true; + ++Count; + } + + // If we didn't find an extra callee-saved register to spill, create + // an emergency spill slot. + if (!ExtraCSSpill) { + const TargetRegisterClass *RC = &AArch64::GPR64RegClass; + int FI = MFI->CreateStackObject(RC->getSize(), RC->getAlignment(), false); + RS->addScavengingFrameIndex(FI); + DEBUG(dbgs() << "No available CS registers, allocated fi#" << FI + << " as the emergency spill slot.\n"); + } + } +} diff --git a/lib/Target/AArch64/AArch64FrameLowering.h b/lib/Target/AArch64/AArch64FrameLowering.h new file mode 100644 index 00000000000..0e00d168003 --- /dev/null +++ b/lib/Target/AArch64/AArch64FrameLowering.h @@ -0,0 +1,75 @@ +//==-- AArch64FrameLowering.h - TargetFrameLowering for AArch64 --*- C++ -*-==// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// +// +//===----------------------------------------------------------------------===// + +#ifndef AArch64_FRAMELOWERING_H +#define AArch64_FRAMELOWERING_H + +#include "llvm/Target/TargetFrameLowering.h" + +namespace llvm { + +class AArch64Subtarget; +class AArch64TargetMachine; + +class AArch64FrameLowering : public TargetFrameLowering { + const AArch64TargetMachine &TM; + +public: + explicit AArch64FrameLowering(const AArch64TargetMachine &TM, + const AArch64Subtarget &STI) + : TargetFrameLowering(StackGrowsDown, 16, 0, 16, + false /*StackRealignable*/), + TM(TM) {} + + void emitCalleeSavedFrameMoves(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MBBI, + unsigned FramePtr) const; + + void eliminateCallFramePseudoInstr(MachineFunction &MF, + MachineBasicBlock &MBB, + MachineBasicBlock::iterator I) const override; + + /// emitProlog/emitEpilog - These methods insert prolog and epilog code into + /// the function. + void emitPrologue(MachineFunction &MF) const override; + void emitEpilogue(MachineFunction &MF, MachineBasicBlock &MBB) const override; + + int getFrameIndexOffset(const MachineFunction &MF, int FI) const override; + int getFrameIndexReference(const MachineFunction &MF, int FI, + unsigned &FrameReg) const override; + int resolveFrameIndexReference(const MachineFunction &MF, int FI, + unsigned &FrameReg, + bool PreferFP = false) const; + bool spillCalleeSavedRegisters(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MI, + const std::vector &CSI, + const TargetRegisterInfo *TRI) const override; + + bool restoreCalleeSavedRegisters(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MI, + const std::vector &CSI, + const TargetRegisterInfo *TRI) const override; + + /// \brief Can this function use the red zone for local allocations. 
+ bool canUseRedZone(const MachineFunction &MF) const; + + bool hasFP(const MachineFunction &MF) const override; + bool hasReservedCallFrame(const MachineFunction &MF) const override; + + void processFunctionBeforeCalleeSavedScan(MachineFunction &MF, + RegScavenger *RS) const override; +}; + +} // End llvm namespace + +#endif diff --git a/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp b/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp new file mode 100644 index 00000000000..7007ffcce29 --- /dev/null +++ b/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp @@ -0,0 +1,3035 @@ +//===-- AArch64ISelDAGToDAG.cpp - A dag to dag inst selector for AArch64 --===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file defines an instruction selector for the AArch64 target. +// +//===----------------------------------------------------------------------===// + +#include "AArch64TargetMachine.h" +#include "MCTargetDesc/AArch64AddressingModes.h" +#include "llvm/ADT/APSInt.h" +#include "llvm/CodeGen/SelectionDAGISel.h" +#include "llvm/IR/Function.h" // To access function attributes. +#include "llvm/IR/GlobalValue.h" +#include "llvm/IR/Intrinsics.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/ErrorHandling.h" +#include "llvm/Support/MathExtras.h" +#include "llvm/Support/raw_ostream.h" + +using namespace llvm; + +#define DEBUG_TYPE "aarch64-isel" + +//===--------------------------------------------------------------------===// +/// AArch64DAGToDAGISel - AArch64 specific code to select AArch64 machine +/// instructions for SelectionDAG operations. +/// +namespace { + +class AArch64DAGToDAGISel : public SelectionDAGISel { + AArch64TargetMachine &TM; + + /// Subtarget - Keep a pointer to the AArch64Subtarget around so that we can + /// make the right decision when generating code for different targets. + const AArch64Subtarget *Subtarget; + + bool ForCodeSize; + +public: + explicit AArch64DAGToDAGISel(AArch64TargetMachine &tm, + CodeGenOpt::Level OptLevel) + : SelectionDAGISel(tm, OptLevel), TM(tm), Subtarget(nullptr), + ForCodeSize(false) {} + + const char *getPassName() const override { + return "AArch64 Instruction Selection"; + } + + bool runOnMachineFunction(MachineFunction &MF) override { + AttributeSet FnAttrs = MF.getFunction()->getAttributes(); + ForCodeSize = + FnAttrs.hasAttribute(AttributeSet::FunctionIndex, + Attribute::OptimizeForSize) || + FnAttrs.hasAttribute(AttributeSet::FunctionIndex, Attribute::MinSize); + Subtarget = &TM.getSubtarget(); + return SelectionDAGISel::runOnMachineFunction(MF); + } + + SDNode *Select(SDNode *Node) override; + + /// SelectInlineAsmMemoryOperand - Implement addressing mode selection for + /// inline asm expressions. 
+ bool SelectInlineAsmMemoryOperand(const SDValue &Op, + char ConstraintCode, + std::vector &OutOps) override; + + SDNode *SelectMLAV64LaneV128(SDNode *N); + SDNode *SelectMULLV64LaneV128(unsigned IntNo, SDNode *N); + bool SelectArithExtendedRegister(SDValue N, SDValue &Reg, SDValue &Shift); + bool SelectArithImmed(SDValue N, SDValue &Val, SDValue &Shift); + bool SelectNegArithImmed(SDValue N, SDValue &Val, SDValue &Shift); + bool SelectArithShiftedRegister(SDValue N, SDValue &Reg, SDValue &Shift) { + return SelectShiftedRegister(N, false, Reg, Shift); + } + bool SelectLogicalShiftedRegister(SDValue N, SDValue &Reg, SDValue &Shift) { + return SelectShiftedRegister(N, true, Reg, Shift); + } + bool SelectAddrModeIndexed8(SDValue N, SDValue &Base, SDValue &OffImm) { + return SelectAddrModeIndexed(N, 1, Base, OffImm); + } + bool SelectAddrModeIndexed16(SDValue N, SDValue &Base, SDValue &OffImm) { + return SelectAddrModeIndexed(N, 2, Base, OffImm); + } + bool SelectAddrModeIndexed32(SDValue N, SDValue &Base, SDValue &OffImm) { + return SelectAddrModeIndexed(N, 4, Base, OffImm); + } + bool SelectAddrModeIndexed64(SDValue N, SDValue &Base, SDValue &OffImm) { + return SelectAddrModeIndexed(N, 8, Base, OffImm); + } + bool SelectAddrModeIndexed128(SDValue N, SDValue &Base, SDValue &OffImm) { + return SelectAddrModeIndexed(N, 16, Base, OffImm); + } + bool SelectAddrModeUnscaled8(SDValue N, SDValue &Base, SDValue &OffImm) { + return SelectAddrModeUnscaled(N, 1, Base, OffImm); + } + bool SelectAddrModeUnscaled16(SDValue N, SDValue &Base, SDValue &OffImm) { + return SelectAddrModeUnscaled(N, 2, Base, OffImm); + } + bool SelectAddrModeUnscaled32(SDValue N, SDValue &Base, SDValue &OffImm) { + return SelectAddrModeUnscaled(N, 4, Base, OffImm); + } + bool SelectAddrModeUnscaled64(SDValue N, SDValue &Base, SDValue &OffImm) { + return SelectAddrModeUnscaled(N, 8, Base, OffImm); + } + bool SelectAddrModeUnscaled128(SDValue N, SDValue &Base, SDValue &OffImm) { + return SelectAddrModeUnscaled(N, 16, Base, OffImm); + } + + template + bool SelectAddrModeWRO(SDValue N, SDValue &Base, SDValue &Offset, + SDValue &SignExtend, SDValue &DoShift) { + return SelectAddrModeWRO(N, Width / 8, Base, Offset, SignExtend, DoShift); + } + + template + bool SelectAddrModeXRO(SDValue N, SDValue &Base, SDValue &Offset, + SDValue &SignExtend, SDValue &DoShift) { + return SelectAddrModeXRO(N, Width / 8, Base, Offset, SignExtend, DoShift); + } + + + /// Form sequences of consecutive 64/128-bit registers for use in NEON + /// instructions making use of a vector-list (e.g. ldN, tbl). Vecs must have + /// between 1 and 4 elements. If it contains a single element that is returned + /// unchanged; otherwise a REG_SEQUENCE value is returned. + SDValue createDTuple(ArrayRef Vecs); + SDValue createQTuple(ArrayRef Vecs); + + /// Generic helper for the createDTuple/createQTuple + /// functions. Those should almost always be called instead. 
+ SDValue createTuple(ArrayRef Vecs, unsigned RegClassIDs[], + unsigned SubRegs[]); + + SDNode *SelectTable(SDNode *N, unsigned NumVecs, unsigned Opc, bool isExt); + + SDNode *SelectIndexedLoad(SDNode *N, bool &Done); + + SDNode *SelectLoad(SDNode *N, unsigned NumVecs, unsigned Opc, + unsigned SubRegIdx); + SDNode *SelectPostLoad(SDNode *N, unsigned NumVecs, unsigned Opc, + unsigned SubRegIdx); + SDNode *SelectLoadLane(SDNode *N, unsigned NumVecs, unsigned Opc); + SDNode *SelectPostLoadLane(SDNode *N, unsigned NumVecs, unsigned Opc); + + SDNode *SelectStore(SDNode *N, unsigned NumVecs, unsigned Opc); + SDNode *SelectPostStore(SDNode *N, unsigned NumVecs, unsigned Opc); + SDNode *SelectStoreLane(SDNode *N, unsigned NumVecs, unsigned Opc); + SDNode *SelectPostStoreLane(SDNode *N, unsigned NumVecs, unsigned Opc); + + SDNode *SelectSIMDAddSubNarrowing(unsigned IntNo, SDNode *Node); + SDNode *SelectSIMDXtnNarrowing(unsigned IntNo, SDNode *Node); + + SDNode *SelectBitfieldExtractOp(SDNode *N); + SDNode *SelectBitfieldInsertOp(SDNode *N); + + SDNode *SelectLIBM(SDNode *N); + +// Include the pieces autogenerated from the target description. +#include "AArch64GenDAGISel.inc" + +private: + bool SelectShiftedRegister(SDValue N, bool AllowROR, SDValue &Reg, + SDValue &Shift); + bool SelectAddrModeIndexed(SDValue N, unsigned Size, SDValue &Base, + SDValue &OffImm); + bool SelectAddrModeUnscaled(SDValue N, unsigned Size, SDValue &Base, + SDValue &OffImm); + bool SelectAddrModeWRO(SDValue N, unsigned Size, SDValue &Base, + SDValue &Offset, SDValue &SignExtend, + SDValue &DoShift); + bool SelectAddrModeXRO(SDValue N, unsigned Size, SDValue &Base, + SDValue &Offset, SDValue &SignExtend, + SDValue &DoShift); + bool isWorthFolding(SDValue V) const; + bool SelectExtendedSHL(SDValue N, unsigned Size, bool WantExtend, + SDValue &Offset, SDValue &SignExtend); + + template + bool SelectCVTFixedPosOperand(SDValue N, SDValue &FixedPos) { + return SelectCVTFixedPosOperand(N, FixedPos, RegWidth); + } + + bool SelectCVTFixedPosOperand(SDValue N, SDValue &FixedPos, unsigned Width); +}; +} // end anonymous namespace + +/// isIntImmediate - This method tests to see if the node is a constant +/// operand. If so Imm will receive the 32-bit value. +static bool isIntImmediate(const SDNode *N, uint64_t &Imm) { + if (const ConstantSDNode *C = dyn_cast(N)) { + Imm = C->getZExtValue(); + return true; + } + return false; +} + +// isIntImmediate - This method tests to see if a constant operand. +// If so Imm will receive the value. +static bool isIntImmediate(SDValue N, uint64_t &Imm) { + return isIntImmediate(N.getNode(), Imm); +} + +// isOpcWithIntImmediate - This method tests to see if the node is a specific +// opcode and that it has a immediate integer right operand. +// If so Imm will receive the 32 bit value. +static bool isOpcWithIntImmediate(const SDNode *N, unsigned Opc, + uint64_t &Imm) { + return N->getOpcode() == Opc && + isIntImmediate(N->getOperand(1).getNode(), Imm); +} + +bool AArch64DAGToDAGISel::SelectInlineAsmMemoryOperand( + const SDValue &Op, char ConstraintCode, std::vector &OutOps) { + assert(ConstraintCode == 'm' && "unexpected asm memory constraint"); + // Require the address to be in a register. That is safe for all AArch64 + // variants and it is hard to do anything much smarter without knowing + // how the operand is used. + OutOps.push_back(Op); + return false; +} + +/// SelectArithImmed - Select an immediate value that can be represented as +/// a 12-bit value shifted left by either 0 or 12. 
If so, return true with +/// Val set to the 12-bit value and Shift set to the shifter operand. +bool AArch64DAGToDAGISel::SelectArithImmed(SDValue N, SDValue &Val, + SDValue &Shift) { + // This function is called from the addsub_shifted_imm ComplexPattern, + // which lists [imm] as the list of opcodes it's interested in; however, + // we still need to check whether the operand is actually an immediate + // here because the ComplexPattern opcode list is only used in + // root-level opcode matching. + if (!isa<ConstantSDNode>(N.getNode())) + return false; + + uint64_t Immed = cast<ConstantSDNode>(N.getNode())->getZExtValue(); + unsigned ShiftAmt; + + if (Immed >> 12 == 0) { + ShiftAmt = 0; + } else if ((Immed & 0xfff) == 0 && Immed >> 24 == 0) { + ShiftAmt = 12; + Immed = Immed >> 12; + } else + return false; + + unsigned ShVal = AArch64_AM::getShifterImm(AArch64_AM::LSL, ShiftAmt); + Val = CurDAG->getTargetConstant(Immed, MVT::i32); + Shift = CurDAG->getTargetConstant(ShVal, MVT::i32); + return true; +} + +/// SelectNegArithImmed - As above, but negates the value before trying to +/// select it. +bool AArch64DAGToDAGISel::SelectNegArithImmed(SDValue N, SDValue &Val, + SDValue &Shift) { + // This function is called from the addsub_shifted_imm ComplexPattern, + // which lists [imm] as the list of opcodes it's interested in; however, + // we still need to check whether the operand is actually an immediate + // here because the ComplexPattern opcode list is only used in + // root-level opcode matching. + if (!isa<ConstantSDNode>(N.getNode())) + return false; + + // The immediate operand must be a 24-bit zero-extended immediate. + uint64_t Immed = cast<ConstantSDNode>(N.getNode())->getZExtValue(); + + // This negation is almost always valid, but "cmp wN, #0" and "cmn wN, #0" + // have the opposite effect on the C flag, so this pattern mustn't match under + // those circumstances. + if (Immed == 0) + return false; + + if (N.getValueType() == MVT::i32) + Immed = ~((uint32_t)Immed) + 1; + else + Immed = ~Immed + 1ULL; + if (Immed & 0xFFFFFFFFFF000000ULL) + return false; + + Immed &= 0xFFFFFFULL; + return SelectArithImmed(CurDAG->getConstant(Immed, MVT::i32), Val, Shift); +} + +/// getShiftTypeForNode - Translate a shift node to the corresponding +/// ShiftType value. +static AArch64_AM::ShiftExtendType getShiftTypeForNode(SDValue N) { + switch (N.getOpcode()) { + default: + return AArch64_AM::InvalidShiftExtend; + case ISD::SHL: + return AArch64_AM::LSL; + case ISD::SRL: + return AArch64_AM::LSR; + case ISD::SRA: + return AArch64_AM::ASR; + case ISD::ROTR: + return AArch64_AM::ROR; + } +} + +/// \brief Determine whether it is worth folding V into an extended register. +bool AArch64DAGToDAGISel::isWorthFolding(SDValue V) const { + // It hurts if a value is used at least twice, unless we are optimizing + // for code size. + if (ForCodeSize || V.hasOneUse()) + return true; + return false; +} + +/// SelectShiftedRegister - Select a "shifted register" operand. If the value +/// is not shifted, set the Shift operand to the default of "LSL 0". The logical +/// instructions allow the shifted register to be rotated, but the arithmetic +/// instructions do not. The AllowROR parameter specifies whether ROR is +/// supported.
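// Illustrative aside (not from the patch): a typical fold is
//   (add w1, (shl w2, 2))  ->  add w0, w1, w2, lsl #2
// where the shl below becomes the Shift immediate operand. AllowROR is only
// true for the logical instructions (AND/ORR/EOR/...); ADD/SUB cannot encode
// a rotated register, which is why ROR is rejected for them.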
+bool AArch64DAGToDAGISel::SelectShiftedRegister(SDValue N, bool AllowROR, + SDValue &Reg, SDValue &Shift) { + AArch64_AM::ShiftExtendType ShType = getShiftTypeForNode(N); + if (ShType == AArch64_AM::InvalidShiftExtend) + return false; + if (!AllowROR && ShType == AArch64_AM::ROR) + return false; + + if (ConstantSDNode *RHS = dyn_cast(N.getOperand(1))) { + unsigned BitSize = N.getValueType().getSizeInBits(); + unsigned Val = RHS->getZExtValue() & (BitSize - 1); + unsigned ShVal = AArch64_AM::getShifterImm(ShType, Val); + + Reg = N.getOperand(0); + Shift = CurDAG->getTargetConstant(ShVal, MVT::i32); + return isWorthFolding(N); + } + + return false; +} + +/// getExtendTypeForNode - Translate an extend node to the corresponding +/// ExtendType value. +static AArch64_AM::ShiftExtendType +getExtendTypeForNode(SDValue N, bool IsLoadStore = false) { + if (N.getOpcode() == ISD::SIGN_EXTEND || + N.getOpcode() == ISD::SIGN_EXTEND_INREG) { + EVT SrcVT; + if (N.getOpcode() == ISD::SIGN_EXTEND_INREG) + SrcVT = cast(N.getOperand(1))->getVT(); + else + SrcVT = N.getOperand(0).getValueType(); + + if (!IsLoadStore && SrcVT == MVT::i8) + return AArch64_AM::SXTB; + else if (!IsLoadStore && SrcVT == MVT::i16) + return AArch64_AM::SXTH; + else if (SrcVT == MVT::i32) + return AArch64_AM::SXTW; + assert(SrcVT != MVT::i64 && "extend from 64-bits?"); + + return AArch64_AM::InvalidShiftExtend; + } else if (N.getOpcode() == ISD::ZERO_EXTEND || + N.getOpcode() == ISD::ANY_EXTEND) { + EVT SrcVT = N.getOperand(0).getValueType(); + if (!IsLoadStore && SrcVT == MVT::i8) + return AArch64_AM::UXTB; + else if (!IsLoadStore && SrcVT == MVT::i16) + return AArch64_AM::UXTH; + else if (SrcVT == MVT::i32) + return AArch64_AM::UXTW; + assert(SrcVT != MVT::i64 && "extend from 64-bits?"); + + return AArch64_AM::InvalidShiftExtend; + } else if (N.getOpcode() == ISD::AND) { + ConstantSDNode *CSD = dyn_cast(N.getOperand(1)); + if (!CSD) + return AArch64_AM::InvalidShiftExtend; + uint64_t AndMask = CSD->getZExtValue(); + + switch (AndMask) { + default: + return AArch64_AM::InvalidShiftExtend; + case 0xFF: + return !IsLoadStore ? AArch64_AM::UXTB : AArch64_AM::InvalidShiftExtend; + case 0xFFFF: + return !IsLoadStore ? AArch64_AM::UXTH : AArch64_AM::InvalidShiftExtend; + case 0xFFFFFFFF: + return AArch64_AM::UXTW; + } + } + + return AArch64_AM::InvalidShiftExtend; +} + +// Helper for SelectMLAV64LaneV128 - Recognize high lane extracts. +static bool checkHighLaneIndex(SDNode *DL, SDValue &LaneOp, int &LaneIdx) { + if (DL->getOpcode() != AArch64ISD::DUPLANE16 && + DL->getOpcode() != AArch64ISD::DUPLANE32) + return false; + + SDValue SV = DL->getOperand(0); + if (SV.getOpcode() != ISD::INSERT_SUBVECTOR) + return false; + + SDValue EV = SV.getOperand(1); + if (EV.getOpcode() != ISD::EXTRACT_SUBVECTOR) + return false; + + ConstantSDNode *DLidx = cast(DL->getOperand(1).getNode()); + ConstantSDNode *EVidx = cast(EV.getOperand(1).getNode()); + LaneIdx = DLidx->getSExtValue() + EVidx->getSExtValue(); + LaneOp = EV.getOperand(0); + + return true; +} + +// Helper for SelectOpcV64LaneV128 - Recogzine operatinos where one operand is a +// high lane extract. 
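// Illustrative aside (not from the patch): these helpers exist so that e.g.
//   mla v0.4h, v1.4h, v2.h[6]
// can be selected directly when the DAG multiplies a 64-bit vector by a lane
// taken from the high half of a 128-bit vector; the original lane number is
// recovered as DLidx + EVidx above (roughly 2 + 4 for lane 6), so no separate
// lane-extract instruction is needed.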
+static bool checkV64LaneV128(SDValue Op0, SDValue Op1, SDValue &StdOp, + SDValue &LaneOp, int &LaneIdx) { + + if (!checkHighLaneIndex(Op0.getNode(), LaneOp, LaneIdx)) { + std::swap(Op0, Op1); + if (!checkHighLaneIndex(Op0.getNode(), LaneOp, LaneIdx)) + return false; + } + StdOp = Op1; + return true; +} + +/// SelectMLAV64LaneV128 - AArch64 supports vector MLAs where one multiplicand +/// is a lane in the upper half of a 128-bit vector. Recognize and select this +/// so that we don't emit unnecessary lane extracts. +SDNode *AArch64DAGToDAGISel::SelectMLAV64LaneV128(SDNode *N) { + SDValue Op0 = N->getOperand(0); + SDValue Op1 = N->getOperand(1); + SDValue MLAOp1; // Will hold ordinary multiplicand for MLA. + SDValue MLAOp2; // Will hold lane-accessed multiplicand for MLA. + int LaneIdx = -1; // Will hold the lane index. + + if (Op1.getOpcode() != ISD::MUL || + !checkV64LaneV128(Op1.getOperand(0), Op1.getOperand(1), MLAOp1, MLAOp2, + LaneIdx)) { + std::swap(Op0, Op1); + if (Op1.getOpcode() != ISD::MUL || + !checkV64LaneV128(Op1.getOperand(0), Op1.getOperand(1), MLAOp1, MLAOp2, + LaneIdx)) + return nullptr; + } + + SDValue LaneIdxVal = CurDAG->getTargetConstant(LaneIdx, MVT::i64); + + SDValue Ops[] = { Op0, MLAOp1, MLAOp2, LaneIdxVal }; + + unsigned MLAOpc = ~0U; + + switch (N->getSimpleValueType(0).SimpleTy) { + default: + llvm_unreachable("Unrecognized MLA."); + case MVT::v4i16: + MLAOpc = AArch64::MLAv4i16_indexed; + break; + case MVT::v8i16: + MLAOpc = AArch64::MLAv8i16_indexed; + break; + case MVT::v2i32: + MLAOpc = AArch64::MLAv2i32_indexed; + break; + case MVT::v4i32: + MLAOpc = AArch64::MLAv4i32_indexed; + break; + } + + return CurDAG->getMachineNode(MLAOpc, SDLoc(N), N->getValueType(0), Ops); +} + +SDNode *AArch64DAGToDAGISel::SelectMULLV64LaneV128(unsigned IntNo, SDNode *N) { + SDValue SMULLOp0; + SDValue SMULLOp1; + int LaneIdx; + + if (!checkV64LaneV128(N->getOperand(1), N->getOperand(2), SMULLOp0, SMULLOp1, + LaneIdx)) + return nullptr; + + SDValue LaneIdxVal = CurDAG->getTargetConstant(LaneIdx, MVT::i64); + + SDValue Ops[] = { SMULLOp0, SMULLOp1, LaneIdxVal }; + + unsigned SMULLOpc = ~0U; + + if (IntNo == Intrinsic::aarch64_neon_smull) { + switch (N->getSimpleValueType(0).SimpleTy) { + default: + llvm_unreachable("Unrecognized SMULL."); + case MVT::v4i32: + SMULLOpc = AArch64::SMULLv4i16_indexed; + break; + case MVT::v2i64: + SMULLOpc = AArch64::SMULLv2i32_indexed; + break; + } + } else if (IntNo == Intrinsic::aarch64_neon_umull) { + switch (N->getSimpleValueType(0).SimpleTy) { + default: + llvm_unreachable("Unrecognized SMULL."); + case MVT::v4i32: + SMULLOpc = AArch64::UMULLv4i16_indexed; + break; + case MVT::v2i64: + SMULLOpc = AArch64::UMULLv2i32_indexed; + break; + } + } else + llvm_unreachable("Unrecognized intrinsic."); + + return CurDAG->getMachineNode(SMULLOpc, SDLoc(N), N->getValueType(0), Ops); +} + +/// Instructions that accept extend modifiers like UXTW expect the register +/// being extended to be a GPR32, but the incoming DAG might be acting on a +/// GPR64 (either via SEXT_INREG or AND). Extract the appropriate low bits if +/// this is the case. +static SDValue narrowIfNeeded(SelectionDAG *CurDAG, SDValue N) { + if (N.getValueType() == MVT::i32) + return N; + + SDValue SubReg = CurDAG->getTargetConstant(AArch64::sub_32, MVT::i32); + MachineSDNode *Node = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG, + SDLoc(N), MVT::i32, N, SubReg); + return SDValue(Node, 0); +} + + +/// SelectArithExtendedRegister - Select a "extended register" operand. 
This +/// operand folds in an extend followed by an optional left shift. +bool AArch64DAGToDAGISel::SelectArithExtendedRegister(SDValue N, SDValue &Reg, + SDValue &Shift) { + unsigned ShiftVal = 0; + AArch64_AM::ShiftExtendType Ext; + + if (N.getOpcode() == ISD::SHL) { + ConstantSDNode *CSD = dyn_cast(N.getOperand(1)); + if (!CSD) + return false; + ShiftVal = CSD->getZExtValue(); + if (ShiftVal > 4) + return false; + + Ext = getExtendTypeForNode(N.getOperand(0)); + if (Ext == AArch64_AM::InvalidShiftExtend) + return false; + + Reg = N.getOperand(0).getOperand(0); + } else { + Ext = getExtendTypeForNode(N); + if (Ext == AArch64_AM::InvalidShiftExtend) + return false; + + Reg = N.getOperand(0); + } + + // AArch64 mandates that the RHS of the operation must use the smallest + // register classs that could contain the size being extended from. Thus, + // if we're folding a (sext i8), we need the RHS to be a GPR32, even though + // there might not be an actual 32-bit value in the program. We can + // (harmlessly) synthesize one by injected an EXTRACT_SUBREG here. + assert(Ext != AArch64_AM::UXTX && Ext != AArch64_AM::SXTX); + Reg = narrowIfNeeded(CurDAG, Reg); + Shift = CurDAG->getTargetConstant(getArithExtendImm(Ext, ShiftVal), MVT::i32); + return isWorthFolding(N); +} + +/// SelectAddrModeIndexed - Select a "register plus scaled unsigned 12-bit +/// immediate" address. The "Size" argument is the size in bytes of the memory +/// reference, which determines the scale. +bool AArch64DAGToDAGISel::SelectAddrModeIndexed(SDValue N, unsigned Size, + SDValue &Base, SDValue &OffImm) { + const TargetLowering *TLI = getTargetLowering(); + if (N.getOpcode() == ISD::FrameIndex) { + int FI = cast(N)->getIndex(); + Base = CurDAG->getTargetFrameIndex(FI, TLI->getPointerTy()); + OffImm = CurDAG->getTargetConstant(0, MVT::i64); + return true; + } + + if (N.getOpcode() == AArch64ISD::ADDlow) { + GlobalAddressSDNode *GAN = + dyn_cast(N.getOperand(1).getNode()); + Base = N.getOperand(0); + OffImm = N.getOperand(1); + if (!GAN) + return true; + + const GlobalValue *GV = GAN->getGlobal(); + unsigned Alignment = GV->getAlignment(); + const DataLayout *DL = TLI->getDataLayout(); + if (Alignment == 0 && !Subtarget->isTargetDarwin()) + Alignment = DL->getABITypeAlignment(GV->getType()->getElementType()); + + if (Alignment >= Size) + return true; + } + + if (CurDAG->isBaseWithConstantOffset(N)) { + if (ConstantSDNode *RHS = dyn_cast(N.getOperand(1))) { + int64_t RHSC = (int64_t)RHS->getZExtValue(); + unsigned Scale = Log2_32(Size); + if ((RHSC & (Size - 1)) == 0 && RHSC >= 0 && RHSC < (0x1000 << Scale)) { + Base = N.getOperand(0); + if (Base.getOpcode() == ISD::FrameIndex) { + int FI = cast(Base)->getIndex(); + Base = CurDAG->getTargetFrameIndex(FI, TLI->getPointerTy()); + } + OffImm = CurDAG->getTargetConstant(RHSC >> Scale, MVT::i64); + return true; + } + } + } + + // Before falling back to our general case, check if the unscaled + // instructions can handle this. If so, that's preferable. + if (SelectAddrModeUnscaled(N, Size, Base, OffImm)) + return false; + + // Base only. The address will be materialized into a register before + // the memory is accessed. + // add x0, Xbase, #offset + // ldr x0, [x0] + Base = N; + OffImm = CurDAG->getTargetConstant(0, MVT::i64); + return true; +} + +/// SelectAddrModeUnscaled - Select a "register plus unscaled signed 9-bit +/// immediate" address. This should only match when there is an offset that +/// is not valid for a scaled immediate addressing mode. 
The "Size" argument +/// is the size in bytes of the memory reference, which is needed here to know +/// what is valid for a scaled immediate. +bool AArch64DAGToDAGISel::SelectAddrModeUnscaled(SDValue N, unsigned Size, + SDValue &Base, + SDValue &OffImm) { + if (!CurDAG->isBaseWithConstantOffset(N)) + return false; + if (ConstantSDNode *RHS = dyn_cast(N.getOperand(1))) { + int64_t RHSC = RHS->getSExtValue(); + // If the offset is valid as a scaled immediate, don't match here. + if ((RHSC & (Size - 1)) == 0 && RHSC >= 0 && + RHSC < (0x1000 << Log2_32(Size))) + return false; + if (RHSC >= -256 && RHSC < 256) { + Base = N.getOperand(0); + if (Base.getOpcode() == ISD::FrameIndex) { + int FI = cast(Base)->getIndex(); + const TargetLowering *TLI = getTargetLowering(); + Base = CurDAG->getTargetFrameIndex(FI, TLI->getPointerTy()); + } + OffImm = CurDAG->getTargetConstant(RHSC, MVT::i64); + return true; + } + } + return false; +} + +static SDValue Widen(SelectionDAG *CurDAG, SDValue N) { + SDValue SubReg = CurDAG->getTargetConstant(AArch64::sub_32, MVT::i32); + SDValue ImpDef = SDValue( + CurDAG->getMachineNode(TargetOpcode::IMPLICIT_DEF, SDLoc(N), MVT::i64), + 0); + MachineSDNode *Node = CurDAG->getMachineNode( + TargetOpcode::INSERT_SUBREG, SDLoc(N), MVT::i64, ImpDef, N, SubReg); + return SDValue(Node, 0); +} + +/// \brief Check if the given SHL node (\p N), can be used to form an +/// extended register for an addressing mode. +bool AArch64DAGToDAGISel::SelectExtendedSHL(SDValue N, unsigned Size, + bool WantExtend, SDValue &Offset, + SDValue &SignExtend) { + assert(N.getOpcode() == ISD::SHL && "Invalid opcode."); + ConstantSDNode *CSD = dyn_cast(N.getOperand(1)); + if (!CSD || (CSD->getZExtValue() & 0x7) != CSD->getZExtValue()) + return false; + + if (WantExtend) { + AArch64_AM::ShiftExtendType Ext = + getExtendTypeForNode(N.getOperand(0), true); + if (Ext == AArch64_AM::InvalidShiftExtend) + return false; + + Offset = narrowIfNeeded(CurDAG, N.getOperand(0).getOperand(0)); + SignExtend = CurDAG->getTargetConstant(Ext == AArch64_AM::SXTW, MVT::i32); + } else { + Offset = N.getOperand(0); + SignExtend = CurDAG->getTargetConstant(0, MVT::i32); + } + + unsigned LegalShiftVal = Log2_32(Size); + unsigned ShiftVal = CSD->getZExtValue(); + + if (ShiftVal != 0 && ShiftVal != LegalShiftVal) + return false; + + if (isWorthFolding(N)) + return true; + + return false; +} + +bool AArch64DAGToDAGISel::SelectAddrModeWRO(SDValue N, unsigned Size, + SDValue &Base, SDValue &Offset, + SDValue &SignExtend, + SDValue &DoShift) { + if (N.getOpcode() != ISD::ADD) + return false; + SDValue LHS = N.getOperand(0); + SDValue RHS = N.getOperand(1); + + // We don't want to match immediate adds here, because they are better lowered + // to the register-immediate addressing modes. + if (isa(LHS) || isa(RHS)) + return false; + + // Check if this particular node is reused in any non-memory related + // operation. If yes, do not try to fold this node into the address + // computation, since the computation will be kept. + const SDNode *Node = N.getNode(); + for (SDNode *UI : Node->uses()) { + if (!isa(*UI)) + return false; + } + + // Remember if it is worth folding N when it produces extended register. + bool IsExtendedRegisterWorthFolding = isWorthFolding(N); + + // Try to match a shifted extend on the RHS. 
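// Illustrative aside (not from the patch): "WRO" is the W-register-offset
// addressing mode, e.g.
//   ldr w0, [x1, w2, sxtw #2]
// i.e. base plus a sign- or zero-extended 32-bit index, optionally shifted by
// log2 of the access size. The code below tries a shifted extend on either
// operand of the add first, then falls back to an unshifted extend.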
+ if (IsExtendedRegisterWorthFolding && RHS.getOpcode() == ISD::SHL && + SelectExtendedSHL(RHS, Size, true, Offset, SignExtend)) { + Base = LHS; + DoShift = CurDAG->getTargetConstant(true, MVT::i32); + return true; + } + + // Try to match a shifted extend on the LHS. + if (IsExtendedRegisterWorthFolding && LHS.getOpcode() == ISD::SHL && + SelectExtendedSHL(LHS, Size, true, Offset, SignExtend)) { + Base = RHS; + DoShift = CurDAG->getTargetConstant(true, MVT::i32); + return true; + } + + // There was no shift, whatever else we find. + DoShift = CurDAG->getTargetConstant(false, MVT::i32); + + AArch64_AM::ShiftExtendType Ext = AArch64_AM::InvalidShiftExtend; + // Try to match an unshifted extend on the LHS. + if (IsExtendedRegisterWorthFolding && + (Ext = getExtendTypeForNode(LHS, true)) != + AArch64_AM::InvalidShiftExtend) { + Base = RHS; + Offset = narrowIfNeeded(CurDAG, LHS.getOperand(0)); + SignExtend = CurDAG->getTargetConstant(Ext == AArch64_AM::SXTW, MVT::i32); + if (isWorthFolding(LHS)) + return true; + } + + // Try to match an unshifted extend on the RHS. + if (IsExtendedRegisterWorthFolding && + (Ext = getExtendTypeForNode(RHS, true)) != + AArch64_AM::InvalidShiftExtend) { + Base = LHS; + Offset = narrowIfNeeded(CurDAG, RHS.getOperand(0)); + SignExtend = CurDAG->getTargetConstant(Ext == AArch64_AM::SXTW, MVT::i32); + if (isWorthFolding(RHS)) + return true; + } + + return false; +} + +bool AArch64DAGToDAGISel::SelectAddrModeXRO(SDValue N, unsigned Size, + SDValue &Base, SDValue &Offset, + SDValue &SignExtend, + SDValue &DoShift) { + if (N.getOpcode() != ISD::ADD) + return false; + SDValue LHS = N.getOperand(0); + SDValue RHS = N.getOperand(1); + + // We don't want to match immediate adds here, because they are better lowered + // to the register-immediate addressing modes. + if (isa(LHS) || isa(RHS)) + return false; + + // Check if this particular node is reused in any non-memory related + // operation. If yes, do not try to fold this node into the address + // computation, since the computation will be kept. + const SDNode *Node = N.getNode(); + for (SDNode *UI : Node->uses()) { + if (!isa(*UI)) + return false; + } + + // Remember if it is worth folding N when it produces extended register. + bool IsExtendedRegisterWorthFolding = isWorthFolding(N); + + // Try to match a shifted extend on the RHS. + if (IsExtendedRegisterWorthFolding && RHS.getOpcode() == ISD::SHL && + SelectExtendedSHL(RHS, Size, false, Offset, SignExtend)) { + Base = LHS; + DoShift = CurDAG->getTargetConstant(true, MVT::i32); + return true; + } + + // Try to match a shifted extend on the LHS. + if (IsExtendedRegisterWorthFolding && LHS.getOpcode() == ISD::SHL && + SelectExtendedSHL(LHS, Size, false, Offset, SignExtend)) { + Base = RHS; + DoShift = CurDAG->getTargetConstant(true, MVT::i32); + return true; + } + + // Match any non-shifted, non-extend, non-immediate add expression. + Base = LHS; + Offset = RHS; + SignExtend = CurDAG->getTargetConstant(false, MVT::i32); + DoShift = CurDAG->getTargetConstant(false, MVT::i32); + // Reg1 + Reg2 is free: no check needed. 
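// Illustrative aside (not from the patch): this is the plain
//   ldr x0, [x1, x2]
// form -- a 64-bit index with no extend and no shift -- which costs nothing
// extra, so it is accepted unconditionally.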
+ return true; +} + +SDValue AArch64DAGToDAGISel::createDTuple(ArrayRef Regs) { + static unsigned RegClassIDs[] = { + AArch64::DDRegClassID, AArch64::DDDRegClassID, AArch64::DDDDRegClassID}; + static unsigned SubRegs[] = { AArch64::dsub0, AArch64::dsub1, + AArch64::dsub2, AArch64::dsub3 }; + + return createTuple(Regs, RegClassIDs, SubRegs); +} + +SDValue AArch64DAGToDAGISel::createQTuple(ArrayRef Regs) { + static unsigned RegClassIDs[] = { + AArch64::QQRegClassID, AArch64::QQQRegClassID, AArch64::QQQQRegClassID}; + static unsigned SubRegs[] = { AArch64::qsub0, AArch64::qsub1, + AArch64::qsub2, AArch64::qsub3 }; + + return createTuple(Regs, RegClassIDs, SubRegs); +} + +SDValue AArch64DAGToDAGISel::createTuple(ArrayRef Regs, + unsigned RegClassIDs[], + unsigned SubRegs[]) { + // There's no special register-class for a vector-list of 1 element: it's just + // a vector. + if (Regs.size() == 1) + return Regs[0]; + + assert(Regs.size() >= 2 && Regs.size() <= 4); + + SDLoc DL(Regs[0].getNode()); + + SmallVector Ops; + + // First operand of REG_SEQUENCE is the desired RegClass. + Ops.push_back( + CurDAG->getTargetConstant(RegClassIDs[Regs.size() - 2], MVT::i32)); + + // Then we get pairs of source & subregister-position for the components. + for (unsigned i = 0; i < Regs.size(); ++i) { + Ops.push_back(Regs[i]); + Ops.push_back(CurDAG->getTargetConstant(SubRegs[i], MVT::i32)); + } + + SDNode *N = + CurDAG->getMachineNode(TargetOpcode::REG_SEQUENCE, DL, MVT::Untyped, Ops); + return SDValue(N, 0); +} + +SDNode *AArch64DAGToDAGISel::SelectTable(SDNode *N, unsigned NumVecs, + unsigned Opc, bool isExt) { + SDLoc dl(N); + EVT VT = N->getValueType(0); + + unsigned ExtOff = isExt; + + // Form a REG_SEQUENCE to force register allocation. + unsigned Vec0Off = ExtOff + 1; + SmallVector Regs(N->op_begin() + Vec0Off, + N->op_begin() + Vec0Off + NumVecs); + SDValue RegSeq = createQTuple(Regs); + + SmallVector Ops; + if (isExt) + Ops.push_back(N->getOperand(1)); + Ops.push_back(RegSeq); + Ops.push_back(N->getOperand(NumVecs + ExtOff + 1)); + return CurDAG->getMachineNode(Opc, dl, VT, Ops); +} + +SDNode *AArch64DAGToDAGISel::SelectIndexedLoad(SDNode *N, bool &Done) { + LoadSDNode *LD = cast(N); + if (LD->isUnindexed()) + return nullptr; + EVT VT = LD->getMemoryVT(); + EVT DstVT = N->getValueType(0); + ISD::MemIndexedMode AM = LD->getAddressingMode(); + bool IsPre = AM == ISD::PRE_INC || AM == ISD::PRE_DEC; + + // We're not doing validity checking here. That was done when checking + // if we should mark the load as indexed or not. We're just selecting + // the right instruction. + unsigned Opcode = 0; + + ISD::LoadExtType ExtType = LD->getExtensionType(); + bool InsertTo64 = false; + if (VT == MVT::i64) + Opcode = IsPre ? AArch64::LDRXpre : AArch64::LDRXpost; + else if (VT == MVT::i32) { + if (ExtType == ISD::NON_EXTLOAD) + Opcode = IsPre ? AArch64::LDRWpre : AArch64::LDRWpost; + else if (ExtType == ISD::SEXTLOAD) + Opcode = IsPre ? AArch64::LDRSWpre : AArch64::LDRSWpost; + else { + Opcode = IsPre ? AArch64::LDRWpre : AArch64::LDRWpost; + InsertTo64 = true; + // The result of the load is only i32. It's the subreg_to_reg that makes + // it into an i64. + DstVT = MVT::i32; + } + } else if (VT == MVT::i16) { + if (ExtType == ISD::SEXTLOAD) { + if (DstVT == MVT::i64) + Opcode = IsPre ? AArch64::LDRSHXpre : AArch64::LDRSHXpost; + else + Opcode = IsPre ? AArch64::LDRSHWpre : AArch64::LDRSHWpost; + } else { + Opcode = IsPre ? 
AArch64::LDRHHpre : AArch64::LDRHHpost; + InsertTo64 = DstVT == MVT::i64; + // The result of the load is only i32. It's the subreg_to_reg that makes + // it into an i64. + DstVT = MVT::i32; + } + } else if (VT == MVT::i8) { + if (ExtType == ISD::SEXTLOAD) { + if (DstVT == MVT::i64) + Opcode = IsPre ? AArch64::LDRSBXpre : AArch64::LDRSBXpost; + else + Opcode = IsPre ? AArch64::LDRSBWpre : AArch64::LDRSBWpost; + } else { + Opcode = IsPre ? AArch64::LDRBBpre : AArch64::LDRBBpost; + InsertTo64 = DstVT == MVT::i64; + // The result of the load is only i32. It's the subreg_to_reg that makes + // it into an i64. + DstVT = MVT::i32; + } + } else if (VT == MVT::f32) { + Opcode = IsPre ? AArch64::LDRSpre : AArch64::LDRSpost; + } else if (VT == MVT::f64 || VT.is64BitVector()) { + Opcode = IsPre ? AArch64::LDRDpre : AArch64::LDRDpost; + } else if (VT.is128BitVector()) { + Opcode = IsPre ? AArch64::LDRQpre : AArch64::LDRQpost; + } else + return nullptr; + SDValue Chain = LD->getChain(); + SDValue Base = LD->getBasePtr(); + ConstantSDNode *OffsetOp = cast(LD->getOffset()); + int OffsetVal = (int)OffsetOp->getZExtValue(); + SDValue Offset = CurDAG->getTargetConstant(OffsetVal, MVT::i64); + SDValue Ops[] = { Base, Offset, Chain }; + SDNode *Res = CurDAG->getMachineNode(Opcode, SDLoc(N), MVT::i64, DstVT, + MVT::Other, Ops); + // Either way, we're replacing the node, so tell the caller that. + Done = true; + SDValue LoadedVal = SDValue(Res, 1); + if (InsertTo64) { + SDValue SubReg = CurDAG->getTargetConstant(AArch64::sub_32, MVT::i32); + LoadedVal = + SDValue(CurDAG->getMachineNode( + AArch64::SUBREG_TO_REG, SDLoc(N), MVT::i64, + CurDAG->getTargetConstant(0, MVT::i64), LoadedVal, SubReg), + 0); + } + + ReplaceUses(SDValue(N, 0), LoadedVal); + ReplaceUses(SDValue(N, 1), SDValue(Res, 0)); + ReplaceUses(SDValue(N, 2), SDValue(Res, 2)); + + return nullptr; +} + +SDNode *AArch64DAGToDAGISel::SelectLoad(SDNode *N, unsigned NumVecs, + unsigned Opc, unsigned SubRegIdx) { + SDLoc dl(N); + EVT VT = N->getValueType(0); + SDValue Chain = N->getOperand(0); + + SmallVector Ops; + Ops.push_back(N->getOperand(2)); // Mem operand; + Ops.push_back(Chain); + + std::vector ResTys; + ResTys.push_back(MVT::Untyped); + ResTys.push_back(MVT::Other); + + SDNode *Ld = CurDAG->getMachineNode(Opc, dl, ResTys, Ops); + SDValue SuperReg = SDValue(Ld, 0); + for (unsigned i = 0; i < NumVecs; ++i) + ReplaceUses(SDValue(N, i), + CurDAG->getTargetExtractSubreg(SubRegIdx + i, dl, VT, SuperReg)); + + ReplaceUses(SDValue(N, NumVecs), SDValue(Ld, 1)); + return nullptr; +} + +SDNode *AArch64DAGToDAGISel::SelectPostLoad(SDNode *N, unsigned NumVecs, + unsigned Opc, unsigned SubRegIdx) { + SDLoc dl(N); + EVT VT = N->getValueType(0); + SDValue Chain = N->getOperand(0); + + SmallVector Ops; + Ops.push_back(N->getOperand(1)); // Mem operand + Ops.push_back(N->getOperand(2)); // Incremental + Ops.push_back(Chain); + + std::vector ResTys; + ResTys.push_back(MVT::i64); // Type of the write back register + ResTys.push_back(MVT::Untyped); + ResTys.push_back(MVT::Other); + + SDNode *Ld = CurDAG->getMachineNode(Opc, dl, ResTys, Ops); + + // Update uses of write back register + ReplaceUses(SDValue(N, NumVecs), SDValue(Ld, 0)); + + // Update uses of vector list + SDValue SuperReg = SDValue(Ld, 1); + if (NumVecs == 1) + ReplaceUses(SDValue(N, 0), SuperReg); + else + for (unsigned i = 0; i < NumVecs; ++i) + ReplaceUses(SDValue(N, i), + CurDAG->getTargetExtractSubreg(SubRegIdx + i, dl, VT, SuperReg)); + + // Update the chain + ReplaceUses(SDValue(N, NumVecs + 1), 
SDValue(Ld, 2)); + return nullptr; +} + +SDNode *AArch64DAGToDAGISel::SelectStore(SDNode *N, unsigned NumVecs, + unsigned Opc) { + SDLoc dl(N); + EVT VT = N->getOperand(2)->getValueType(0); + + // Form a REG_SEQUENCE to force register allocation. + bool Is128Bit = VT.getSizeInBits() == 128; + SmallVector Regs(N->op_begin() + 2, N->op_begin() + 2 + NumVecs); + SDValue RegSeq = Is128Bit ? createQTuple(Regs) : createDTuple(Regs); + + SmallVector Ops; + Ops.push_back(RegSeq); + Ops.push_back(N->getOperand(NumVecs + 2)); + Ops.push_back(N->getOperand(0)); + SDNode *St = CurDAG->getMachineNode(Opc, dl, N->getValueType(0), Ops); + + return St; +} + +SDNode *AArch64DAGToDAGISel::SelectPostStore(SDNode *N, unsigned NumVecs, + unsigned Opc) { + SDLoc dl(N); + EVT VT = N->getOperand(2)->getValueType(0); + SmallVector ResTys; + ResTys.push_back(MVT::i64); // Type of the write back register + ResTys.push_back(MVT::Other); // Type for the Chain + + // Form a REG_SEQUENCE to force register allocation. + bool Is128Bit = VT.getSizeInBits() == 128; + SmallVector Regs(N->op_begin() + 1, N->op_begin() + 1 + NumVecs); + SDValue RegSeq = Is128Bit ? createQTuple(Regs) : createDTuple(Regs); + + SmallVector Ops; + Ops.push_back(RegSeq); + Ops.push_back(N->getOperand(NumVecs + 1)); // base register + Ops.push_back(N->getOperand(NumVecs + 2)); // Incremental + Ops.push_back(N->getOperand(0)); // Chain + SDNode *St = CurDAG->getMachineNode(Opc, dl, ResTys, Ops); + + return St; +} + +/// WidenVector - Given a value in the V64 register class, produce the +/// equivalent value in the V128 register class. +class WidenVector { + SelectionDAG &DAG; + +public: + WidenVector(SelectionDAG &DAG) : DAG(DAG) {} + + SDValue operator()(SDValue V64Reg) { + EVT VT = V64Reg.getValueType(); + unsigned NarrowSize = VT.getVectorNumElements(); + MVT EltTy = VT.getVectorElementType().getSimpleVT(); + MVT WideTy = MVT::getVectorVT(EltTy, 2 * NarrowSize); + SDLoc DL(V64Reg); + + SDValue Undef = + SDValue(DAG.getMachineNode(TargetOpcode::IMPLICIT_DEF, DL, WideTy), 0); + return DAG.getTargetInsertSubreg(AArch64::dsub, DL, WideTy, Undef, V64Reg); + } +}; + +/// NarrowVector - Given a value in the V128 register class, produce the +/// equivalent value in the V64 register class. +static SDValue NarrowVector(SDValue V128Reg, SelectionDAG &DAG) { + EVT VT = V128Reg.getValueType(); + unsigned WideSize = VT.getVectorNumElements(); + MVT EltTy = VT.getVectorElementType().getSimpleVT(); + MVT NarrowTy = MVT::getVectorVT(EltTy, WideSize / 2); + + return DAG.getTargetExtractSubreg(AArch64::dsub, SDLoc(V128Reg), NarrowTy, + V128Reg); +} + +SDNode *AArch64DAGToDAGISel::SelectLoadLane(SDNode *N, unsigned NumVecs, + unsigned Opc) { + SDLoc dl(N); + EVT VT = N->getValueType(0); + bool Narrow = VT.getSizeInBits() == 64; + + // Form a REG_SEQUENCE to force register allocation. 
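// Illustrative aside (not from the patch): the single-lane forms of LD1..LD4
// (e.g. ld2 {v0.h, v1.h}[5], [x0]) index into full 128-bit registers, so any
// 64-bit vector operands are first widened into the low half of a Q register
// with WidenVector (defined above) and narrowed back via NarrowVector once
// the per-register results are extracted below.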
+ SmallVector Regs(N->op_begin() + 2, N->op_begin() + 2 + NumVecs); + + if (Narrow) + std::transform(Regs.begin(), Regs.end(), Regs.begin(), + WidenVector(*CurDAG)); + + SDValue RegSeq = createQTuple(Regs); + + std::vector ResTys; + ResTys.push_back(MVT::Untyped); + ResTys.push_back(MVT::Other); + + unsigned LaneNo = + cast(N->getOperand(NumVecs + 2))->getZExtValue(); + + SmallVector Ops; + Ops.push_back(RegSeq); + Ops.push_back(CurDAG->getTargetConstant(LaneNo, MVT::i64)); + Ops.push_back(N->getOperand(NumVecs + 3)); + Ops.push_back(N->getOperand(0)); + SDNode *Ld = CurDAG->getMachineNode(Opc, dl, ResTys, Ops); + SDValue SuperReg = SDValue(Ld, 0); + + EVT WideVT = RegSeq.getOperand(1)->getValueType(0); + static unsigned QSubs[] = { AArch64::qsub0, AArch64::qsub1, AArch64::qsub2, + AArch64::qsub3 }; + for (unsigned i = 0; i < NumVecs; ++i) { + SDValue NV = CurDAG->getTargetExtractSubreg(QSubs[i], dl, WideVT, SuperReg); + if (Narrow) + NV = NarrowVector(NV, *CurDAG); + ReplaceUses(SDValue(N, i), NV); + } + + ReplaceUses(SDValue(N, NumVecs), SDValue(Ld, 1)); + + return Ld; +} + +SDNode *AArch64DAGToDAGISel::SelectPostLoadLane(SDNode *N, unsigned NumVecs, + unsigned Opc) { + SDLoc dl(N); + EVT VT = N->getValueType(0); + bool Narrow = VT.getSizeInBits() == 64; + + // Form a REG_SEQUENCE to force register allocation. + SmallVector Regs(N->op_begin() + 1, N->op_begin() + 1 + NumVecs); + + if (Narrow) + std::transform(Regs.begin(), Regs.end(), Regs.begin(), + WidenVector(*CurDAG)); + + SDValue RegSeq = createQTuple(Regs); + + std::vector ResTys; + ResTys.push_back(MVT::i64); // Type of the write back register + ResTys.push_back(MVT::Untyped); + ResTys.push_back(MVT::Other); + + unsigned LaneNo = + cast(N->getOperand(NumVecs + 1))->getZExtValue(); + + SmallVector Ops; + Ops.push_back(RegSeq); + Ops.push_back(CurDAG->getTargetConstant(LaneNo, MVT::i64)); // Lane Number + Ops.push_back(N->getOperand(NumVecs + 2)); // Base register + Ops.push_back(N->getOperand(NumVecs + 3)); // Incremental + Ops.push_back(N->getOperand(0)); + SDNode *Ld = CurDAG->getMachineNode(Opc, dl, ResTys, Ops); + + // Update uses of the write back register + ReplaceUses(SDValue(N, NumVecs), SDValue(Ld, 0)); + + // Update uses of the vector list + SDValue SuperReg = SDValue(Ld, 1); + if (NumVecs == 1) { + ReplaceUses(SDValue(N, 0), + Narrow ? NarrowVector(SuperReg, *CurDAG) : SuperReg); + } else { + EVT WideVT = RegSeq.getOperand(1)->getValueType(0); + static unsigned QSubs[] = { AArch64::qsub0, AArch64::qsub1, AArch64::qsub2, + AArch64::qsub3 }; + for (unsigned i = 0; i < NumVecs; ++i) { + SDValue NV = CurDAG->getTargetExtractSubreg(QSubs[i], dl, WideVT, + SuperReg); + if (Narrow) + NV = NarrowVector(NV, *CurDAG); + ReplaceUses(SDValue(N, i), NV); + } + } + + // Update the Chain + ReplaceUses(SDValue(N, NumVecs + 1), SDValue(Ld, 2)); + + return Ld; +} + +SDNode *AArch64DAGToDAGISel::SelectStoreLane(SDNode *N, unsigned NumVecs, + unsigned Opc) { + SDLoc dl(N); + EVT VT = N->getOperand(2)->getValueType(0); + bool Narrow = VT.getSizeInBits() == 64; + + // Form a REG_SEQUENCE to force register allocation. 
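// Illustrative aside (not from the patch): the register list of st2/st3/st4
// (e.g. st2 {v4.s, v5.s}[1], [x0]) must live in consecutive registers, which
// is what building a single QQ/QQQ/QQQQ REG_SEQUENCE value enforces: the
// register allocator is forced to assign an adjacent block.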
+ SmallVector Regs(N->op_begin() + 2, N->op_begin() + 2 + NumVecs); + + if (Narrow) + std::transform(Regs.begin(), Regs.end(), Regs.begin(), + WidenVector(*CurDAG)); + + SDValue RegSeq = createQTuple(Regs); + + unsigned LaneNo = + cast(N->getOperand(NumVecs + 2))->getZExtValue(); + + SmallVector Ops; + Ops.push_back(RegSeq); + Ops.push_back(CurDAG->getTargetConstant(LaneNo, MVT::i64)); + Ops.push_back(N->getOperand(NumVecs + 3)); + Ops.push_back(N->getOperand(0)); + SDNode *St = CurDAG->getMachineNode(Opc, dl, MVT::Other, Ops); + + // Transfer memoperands. + MachineSDNode::mmo_iterator MemOp = MF->allocateMemRefsArray(1); + MemOp[0] = cast(N)->getMemOperand(); + cast(St)->setMemRefs(MemOp, MemOp + 1); + + return St; +} + +SDNode *AArch64DAGToDAGISel::SelectPostStoreLane(SDNode *N, unsigned NumVecs, + unsigned Opc) { + SDLoc dl(N); + EVT VT = N->getOperand(2)->getValueType(0); + bool Narrow = VT.getSizeInBits() == 64; + + // Form a REG_SEQUENCE to force register allocation. + SmallVector Regs(N->op_begin() + 1, N->op_begin() + 1 + NumVecs); + + if (Narrow) + std::transform(Regs.begin(), Regs.end(), Regs.begin(), + WidenVector(*CurDAG)); + + SDValue RegSeq = createQTuple(Regs); + + SmallVector ResTys; + ResTys.push_back(MVT::i64); // Type of the write back register + ResTys.push_back(MVT::Other); + + unsigned LaneNo = + cast(N->getOperand(NumVecs + 1))->getZExtValue(); + + SmallVector Ops; + Ops.push_back(RegSeq); + Ops.push_back(CurDAG->getTargetConstant(LaneNo, MVT::i64)); + Ops.push_back(N->getOperand(NumVecs + 2)); // Base Register + Ops.push_back(N->getOperand(NumVecs + 3)); // Incremental + Ops.push_back(N->getOperand(0)); + SDNode *St = CurDAG->getMachineNode(Opc, dl, ResTys, Ops); + + // Transfer memoperands. + MachineSDNode::mmo_iterator MemOp = MF->allocateMemRefsArray(1); + MemOp[0] = cast(N)->getMemOperand(); + cast(St)->setMemRefs(MemOp, MemOp + 1); + + return St; +} + +static bool isBitfieldExtractOpFromAnd(SelectionDAG *CurDAG, SDNode *N, + unsigned &Opc, SDValue &Opd0, + unsigned &LSB, unsigned &MSB, + unsigned NumberOfIgnoredLowBits, + bool BiggerPattern) { + assert(N->getOpcode() == ISD::AND && + "N must be a AND operation to call this function"); + + EVT VT = N->getValueType(0); + + // Here we can test the type of VT and return false when the type does not + // match, but since it is done prior to that call in the current context + // we turned that into an assert to avoid redundant code. + assert((VT == MVT::i32 || VT == MVT::i64) && + "Type checking must have been done before calling this function"); + + // FIXME: simplify-demanded-bits in DAGCombine will probably have + // changed the AND node to a 32-bit mask operation. We'll have to + // undo that as part of the transform here if we want to catch all + // the opportunities. + // Currently the NumberOfIgnoredLowBits argument helps to recover + // form these situations when matching bigger pattern (bitfield insert). + + // For unsigned extracts, check for a shift right and mask + uint64_t And_imm = 0; + if (!isOpcWithIntImmediate(N, ISD::AND, And_imm)) + return false; + + const SDNode *Op0 = N->getOperand(0).getNode(); + + // Because of simplify-demanded-bits in DAGCombine, the mask may have been + // simplified. Try to undo that + And_imm |= (1 << NumberOfIgnoredLowBits) - 1; + + // The immediate is a mask of the low bits iff imm & (imm+1) == 0 + if (And_imm & (And_imm + 1)) + return false; + + bool ClampMSB = false; + uint64_t Srl_imm = 0; + // Handle the SRL + ANY_EXTEND case. 
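// Illustrative aside (not from the patch): the baseline pattern handled by
// this function, before the extend/truncate special cases below, is e.g.
//   (and (srl x, 4), 0xff)
// where Srl_imm == 4 and CountTrailingOnes(0xff) == 8, giving LSB = 4 and
// MSB = 4 + 8 - 1 = 11, i.e. "ubfx w0, w0, #4, #8".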
+ if (VT == MVT::i64 && Op0->getOpcode() == ISD::ANY_EXTEND && + isOpcWithIntImmediate(Op0->getOperand(0).getNode(), ISD::SRL, Srl_imm)) { + // Extend the incoming operand of the SRL to 64-bit. + Opd0 = Widen(CurDAG, Op0->getOperand(0).getOperand(0)); + // Make sure to clamp the MSB so that we preserve the semantics of the + // original operations. + ClampMSB = true; + } else if (VT == MVT::i32 && Op0->getOpcode() == ISD::TRUNCATE && + isOpcWithIntImmediate(Op0->getOperand(0).getNode(), ISD::SRL, + Srl_imm)) { + // If the shift result was truncated, we can still combine them. + Opd0 = Op0->getOperand(0).getOperand(0); + + // Use the type of SRL node. + VT = Opd0->getValueType(0); + } else if (isOpcWithIntImmediate(Op0, ISD::SRL, Srl_imm)) { + Opd0 = Op0->getOperand(0); + } else if (BiggerPattern) { + // Let's pretend a 0 shift right has been performed. + // The resulting code will be at least as good as the original one + // plus it may expose more opportunities for bitfield insert pattern. + // FIXME: Currently we limit this to the bigger pattern, because + // some optimizations expect AND and not UBFM + Opd0 = N->getOperand(0); + } else + return false; + + assert((BiggerPattern || (Srl_imm > 0 && Srl_imm < VT.getSizeInBits())) && + "bad amount in shift node!"); + + LSB = Srl_imm; + MSB = Srl_imm + (VT == MVT::i32 ? CountTrailingOnes_32(And_imm) + : CountTrailingOnes_64(And_imm)) - + 1; + if (ClampMSB) + // Since we're moving the extend before the right shift operation, we need + // to clamp the MSB to make sure we don't shift in undefined bits instead of + // the zeros which would get shifted in with the original right shift + // operation. + MSB = MSB > 31 ? 31 : MSB; + + Opc = VT == MVT::i32 ? AArch64::UBFMWri : AArch64::UBFMXri; + return true; +} + +static bool isOneBitExtractOpFromShr(SDNode *N, unsigned &Opc, SDValue &Opd0, + unsigned &LSB, unsigned &MSB) { + // We are looking for the following pattern which basically extracts a single + // bit from the source value and places it in the LSB of the destination + // value, all other bits of the destination value or set to zero: + // + // Value2 = AND Value, MaskImm + // SRL Value2, ShiftImm + // + // with MaskImm >> ShiftImm == 1. + // + // This gets selected into a single UBFM: + // + // UBFM Value, ShiftImm, ShiftImm + // + + if (N->getOpcode() != ISD::SRL) + return false; + + uint64_t And_mask = 0; + if (!isOpcWithIntImmediate(N->getOperand(0).getNode(), ISD::AND, And_mask)) + return false; + + Opd0 = N->getOperand(0).getOperand(0); + + uint64_t Srl_imm = 0; + if (!isIntImmediate(N->getOperand(1), Srl_imm)) + return false; + + // Check whether we really have a one bit extract here. + if (And_mask >> Srl_imm == 0x1) { + if (N->getValueType(0) == MVT::i32) + Opc = AArch64::UBFMWri; + else + Opc = AArch64::UBFMXri; + + LSB = MSB = Srl_imm; + + return true; + } + + return false; +} + +static bool isBitfieldExtractOpFromShr(SDNode *N, unsigned &Opc, SDValue &Opd0, + unsigned &LSB, unsigned &MSB, + bool BiggerPattern) { + assert((N->getOpcode() == ISD::SRA || N->getOpcode() == ISD::SRL) && + "N must be a SHR/SRA operation to call this function"); + + EVT VT = N->getValueType(0); + + // Here we can test the type of VT and return false when the type does not + // match, but since it is done prior to that call in the current context + // we turned that into an assert to avoid redundant code. 
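// Illustrative aside (not from the patch): the shift-of-shift shape this
// function targets is e.g.
//   (srl (shl x, 8), 20)   ; i32
// which extracts bits [23:12] of x; the computation further down gives
// sLSB = 20 - 8 = 12, Width = 32 - 20 - 1 = 11, MSB = 12 + 11 = 23, selected
// as "ubfx w0, w0, #12, #12" (SBFM instead of UBFM when the shift is an SRA).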
+ assert((VT == MVT::i32 || VT == MVT::i64) && + "Type checking must have been done before calling this function"); + + // Check for AND + SRL doing a one bit extract. + if (isOneBitExtractOpFromShr(N, Opc, Opd0, LSB, MSB)) + return true; + + // we're looking for a shift of a shift + uint64_t Shl_imm = 0; + uint64_t Trunc_bits = 0; + if (isOpcWithIntImmediate(N->getOperand(0).getNode(), ISD::SHL, Shl_imm)) { + Opd0 = N->getOperand(0).getOperand(0); + } else if (VT == MVT::i32 && N->getOpcode() == ISD::SRL && + N->getOperand(0).getNode()->getOpcode() == ISD::TRUNCATE) { + // We are looking for a shift of truncate. Truncate from i64 to i32 could + // be considered as setting high 32 bits as zero. Our strategy here is to + // always generate 64bit UBFM. This consistency will help the CSE pass + // later find more redundancy. + Opd0 = N->getOperand(0).getOperand(0); + Trunc_bits = Opd0->getValueType(0).getSizeInBits() - VT.getSizeInBits(); + VT = Opd0->getValueType(0); + assert(VT == MVT::i64 && "the promoted type should be i64"); + } else if (BiggerPattern) { + // Let's pretend a 0 shift left has been performed. + // FIXME: Currently we limit this to the bigger pattern case, + // because some optimizations expect AND and not UBFM + Opd0 = N->getOperand(0); + } else + return false; + + assert(Shl_imm < VT.getSizeInBits() && "bad amount in shift node!"); + uint64_t Srl_imm = 0; + if (!isIntImmediate(N->getOperand(1), Srl_imm)) + return false; + + assert(Srl_imm > 0 && Srl_imm < VT.getSizeInBits() && + "bad amount in shift node!"); + // Note: The width operand is encoded as width-1. + unsigned Width = VT.getSizeInBits() - Trunc_bits - Srl_imm - 1; + int sLSB = Srl_imm - Shl_imm; + if (sLSB < 0) + return false; + LSB = sLSB; + MSB = LSB + Width; + // SRA requires a signed extraction + if (VT == MVT::i32) + Opc = N->getOpcode() == ISD::SRA ? AArch64::SBFMWri : AArch64::UBFMWri; + else + Opc = N->getOpcode() == ISD::SRA ? AArch64::SBFMXri : AArch64::UBFMXri; + return true; +} + +static bool isBitfieldExtractOp(SelectionDAG *CurDAG, SDNode *N, unsigned &Opc, + SDValue &Opd0, unsigned &LSB, unsigned &MSB, + unsigned NumberOfIgnoredLowBits = 0, + bool BiggerPattern = false) { + if (N->getValueType(0) != MVT::i32 && N->getValueType(0) != MVT::i64) + return false; + + switch (N->getOpcode()) { + default: + if (!N->isMachineOpcode()) + return false; + break; + case ISD::AND: + return isBitfieldExtractOpFromAnd(CurDAG, N, Opc, Opd0, LSB, MSB, + NumberOfIgnoredLowBits, BiggerPattern); + case ISD::SRL: + case ISD::SRA: + return isBitfieldExtractOpFromShr(N, Opc, Opd0, LSB, MSB, BiggerPattern); + } + + unsigned NOpc = N->getMachineOpcode(); + switch (NOpc) { + default: + return false; + case AArch64::SBFMWri: + case AArch64::UBFMWri: + case AArch64::SBFMXri: + case AArch64::UBFMXri: + Opc = NOpc; + Opd0 = N->getOperand(0); + LSB = cast(N->getOperand(1).getNode())->getZExtValue(); + MSB = cast(N->getOperand(2).getNode())->getZExtValue(); + return true; + } + // Unreachable + return false; +} + +SDNode *AArch64DAGToDAGISel::SelectBitfieldExtractOp(SDNode *N) { + unsigned Opc, LSB, MSB; + SDValue Opd0; + if (!isBitfieldExtractOp(CurDAG, N, Opc, Opd0, LSB, MSB)) + return nullptr; + + EVT VT = N->getValueType(0); + + // If the bit extract operation is 64bit but the original type is 32bit, we + // need to add one EXTRACT_SUBREG. 
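// Illustrative aside (not from the patch): this covers the truncate-of-shift
// cases above, which deliberately keep working on 64 bits; the UBFMXri that
// gets built defines an X register, so the i32 result is recovered as its
// sub_32 subregister, roughly
//   ubfx x8, x9, #2, #12    ; users then read w8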
+ if ((Opc == AArch64::SBFMXri || Opc == AArch64::UBFMXri) && VT == MVT::i32) { + SDValue Ops64[] = {Opd0, CurDAG->getTargetConstant(LSB, MVT::i64), + CurDAG->getTargetConstant(MSB, MVT::i64)}; + + SDNode *BFM = CurDAG->getMachineNode(Opc, SDLoc(N), MVT::i64, Ops64); + SDValue SubReg = CurDAG->getTargetConstant(AArch64::sub_32, MVT::i32); + MachineSDNode *Node = + CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG, SDLoc(N), MVT::i32, + SDValue(BFM, 0), SubReg); + return Node; + } + + SDValue Ops[] = {Opd0, CurDAG->getTargetConstant(LSB, VT), + CurDAG->getTargetConstant(MSB, VT)}; + return CurDAG->SelectNodeTo(N, Opc, VT, Ops); +} + +/// Does DstMask form a complementary pair with the mask provided by +/// BitsToBeInserted, suitable for use in a BFI instruction. Roughly speaking, +/// this asks whether DstMask zeroes precisely those bits that will be set by +/// the other half. +static bool isBitfieldDstMask(uint64_t DstMask, APInt BitsToBeInserted, + unsigned NumberOfIgnoredHighBits, EVT VT) { + assert((VT == MVT::i32 || VT == MVT::i64) && + "i32 or i64 mask type expected!"); + unsigned BitWidth = VT.getSizeInBits() - NumberOfIgnoredHighBits; + + APInt SignificantDstMask = APInt(BitWidth, DstMask); + APInt SignificantBitsToBeInserted = BitsToBeInserted.zextOrTrunc(BitWidth); + + return (SignificantDstMask & SignificantBitsToBeInserted) == 0 && + (SignificantDstMask | SignificantBitsToBeInserted).isAllOnesValue(); +} + +// Look for bits that will be useful for later uses. +// A bit is consider useless as soon as it is dropped and never used +// before it as been dropped. +// E.g., looking for useful bit of x +// 1. y = x & 0x7 +// 2. z = y >> 2 +// After #1, x useful bits are 0x7, then the useful bits of x, live through +// y. +// After #2, the useful bits of x are 0x4. +// However, if x is used on an unpredicatable instruction, then all its bits +// are useful. +// E.g. +// 1. y = x & 0x7 +// 2. z = y >> 2 +// 3. 
str x, [@x] +static void getUsefulBits(SDValue Op, APInt &UsefulBits, unsigned Depth = 0); + +static void getUsefulBitsFromAndWithImmediate(SDValue Op, APInt &UsefulBits, + unsigned Depth) { + uint64_t Imm = + cast(Op.getOperand(1).getNode())->getZExtValue(); + Imm = AArch64_AM::decodeLogicalImmediate(Imm, UsefulBits.getBitWidth()); + UsefulBits &= APInt(UsefulBits.getBitWidth(), Imm); + getUsefulBits(Op, UsefulBits, Depth + 1); +} + +static void getUsefulBitsFromBitfieldMoveOpd(SDValue Op, APInt &UsefulBits, + uint64_t Imm, uint64_t MSB, + unsigned Depth) { + // inherit the bitwidth value + APInt OpUsefulBits(UsefulBits); + OpUsefulBits = 1; + + if (MSB >= Imm) { + OpUsefulBits = OpUsefulBits.shl(MSB - Imm + 1); + --OpUsefulBits; + // The interesting part will be in the lower part of the result + getUsefulBits(Op, OpUsefulBits, Depth + 1); + // The interesting part was starting at Imm in the argument + OpUsefulBits = OpUsefulBits.shl(Imm); + } else { + OpUsefulBits = OpUsefulBits.shl(MSB + 1); + --OpUsefulBits; + // The interesting part will be shifted in the result + OpUsefulBits = OpUsefulBits.shl(OpUsefulBits.getBitWidth() - Imm); + getUsefulBits(Op, OpUsefulBits, Depth + 1); + // The interesting part was at zero in the argument + OpUsefulBits = OpUsefulBits.lshr(OpUsefulBits.getBitWidth() - Imm); + } + + UsefulBits &= OpUsefulBits; +} + +static void getUsefulBitsFromUBFM(SDValue Op, APInt &UsefulBits, + unsigned Depth) { + uint64_t Imm = + cast(Op.getOperand(1).getNode())->getZExtValue(); + uint64_t MSB = + cast(Op.getOperand(2).getNode())->getZExtValue(); + + getUsefulBitsFromBitfieldMoveOpd(Op, UsefulBits, Imm, MSB, Depth); +} + +static void getUsefulBitsFromOrWithShiftedReg(SDValue Op, APInt &UsefulBits, + unsigned Depth) { + uint64_t ShiftTypeAndValue = + cast(Op.getOperand(2).getNode())->getZExtValue(); + APInt Mask(UsefulBits); + Mask.clearAllBits(); + Mask.flipAllBits(); + + if (AArch64_AM::getShiftType(ShiftTypeAndValue) == AArch64_AM::LSL) { + // Shift Left + uint64_t ShiftAmt = AArch64_AM::getShiftValue(ShiftTypeAndValue); + Mask = Mask.shl(ShiftAmt); + getUsefulBits(Op, Mask, Depth + 1); + Mask = Mask.lshr(ShiftAmt); + } else if (AArch64_AM::getShiftType(ShiftTypeAndValue) == AArch64_AM::LSR) { + // Shift Right + // We do not handle AArch64_AM::ASR, because the sign will change the + // number of useful bits + uint64_t ShiftAmt = AArch64_AM::getShiftValue(ShiftTypeAndValue); + Mask = Mask.lshr(ShiftAmt); + getUsefulBits(Op, Mask, Depth + 1); + Mask = Mask.shl(ShiftAmt); + } else + return; + + UsefulBits &= Mask; +} + +static void getUsefulBitsFromBFM(SDValue Op, SDValue Orig, APInt &UsefulBits, + unsigned Depth) { + uint64_t Imm = + cast(Op.getOperand(2).getNode())->getZExtValue(); + uint64_t MSB = + cast(Op.getOperand(3).getNode())->getZExtValue(); + + if (Op.getOperand(1) == Orig) + return getUsefulBitsFromBitfieldMoveOpd(Op, UsefulBits, Imm, MSB, Depth); + + APInt OpUsefulBits(UsefulBits); + OpUsefulBits = 1; + + if (MSB >= Imm) { + OpUsefulBits = OpUsefulBits.shl(MSB - Imm + 1); + --OpUsefulBits; + UsefulBits &= ~OpUsefulBits; + getUsefulBits(Op, UsefulBits, Depth + 1); + } else { + OpUsefulBits = OpUsefulBits.shl(MSB + 1); + --OpUsefulBits; + UsefulBits = ~(OpUsefulBits.shl(OpUsefulBits.getBitWidth() - Imm)); + getUsefulBits(Op, UsefulBits, Depth + 1); + } +} + +static void getUsefulBitsForUse(SDNode *UserNode, APInt &UsefulBits, + SDValue Orig, unsigned Depth) { + + // Users of this node should have already been instruction selected + // FIXME: Can we turn that into 
an assert? + if (!UserNode->isMachineOpcode()) + return; + + switch (UserNode->getMachineOpcode()) { + default: + return; + case AArch64::ANDSWri: + case AArch64::ANDSXri: + case AArch64::ANDWri: + case AArch64::ANDXri: + // We increment Depth only when we call the getUsefulBits + return getUsefulBitsFromAndWithImmediate(SDValue(UserNode, 0), UsefulBits, + Depth); + case AArch64::UBFMWri: + case AArch64::UBFMXri: + return getUsefulBitsFromUBFM(SDValue(UserNode, 0), UsefulBits, Depth); + + case AArch64::ORRWrs: + case AArch64::ORRXrs: + if (UserNode->getOperand(1) != Orig) + return; + return getUsefulBitsFromOrWithShiftedReg(SDValue(UserNode, 0), UsefulBits, + Depth); + case AArch64::BFMWri: + case AArch64::BFMXri: + return getUsefulBitsFromBFM(SDValue(UserNode, 0), Orig, UsefulBits, Depth); + } +} + +static void getUsefulBits(SDValue Op, APInt &UsefulBits, unsigned Depth) { + if (Depth >= 6) + return; + // Initialize UsefulBits + if (!Depth) { + unsigned Bitwidth = Op.getValueType().getScalarType().getSizeInBits(); + // At the beginning, assume every produced bits is useful + UsefulBits = APInt(Bitwidth, 0); + UsefulBits.flipAllBits(); + } + APInt UsersUsefulBits(UsefulBits.getBitWidth(), 0); + + for (SDNode *Node : Op.getNode()->uses()) { + // A use cannot produce useful bits + APInt UsefulBitsForUse = APInt(UsefulBits); + getUsefulBitsForUse(Node, UsefulBitsForUse, Op, Depth); + UsersUsefulBits |= UsefulBitsForUse; + } + // UsefulBits contains the produced bits that are meaningful for the + // current definition, thus a user cannot make a bit meaningful at + // this point + UsefulBits &= UsersUsefulBits; +} + +/// Create a machine node performing a notional SHL of Op by ShlAmount. If +/// ShlAmount is negative, do a (logical) right-shift instead. If ShlAmount is +/// 0, return Op unchanged. +static SDValue getLeftShift(SelectionDAG *CurDAG, SDValue Op, int ShlAmount) { + if (ShlAmount == 0) + return Op; + + EVT VT = Op.getValueType(); + unsigned BitWidth = VT.getSizeInBits(); + unsigned UBFMOpc = BitWidth == 32 ? AArch64::UBFMWri : AArch64::UBFMXri; + + SDNode *ShiftNode; + if (ShlAmount > 0) { + // LSL wD, wN, #Amt == UBFM wD, wN, #32-Amt, #31-Amt + ShiftNode = CurDAG->getMachineNode( + UBFMOpc, SDLoc(Op), VT, Op, + CurDAG->getTargetConstant(BitWidth - ShlAmount, VT), + CurDAG->getTargetConstant(BitWidth - 1 - ShlAmount, VT)); + } else { + // LSR wD, wN, #Amt == UBFM wD, wN, #Amt, #32-1 + assert(ShlAmount < 0 && "expected right shift"); + int ShrAmount = -ShlAmount; + ShiftNode = CurDAG->getMachineNode( + UBFMOpc, SDLoc(Op), VT, Op, CurDAG->getTargetConstant(ShrAmount, VT), + CurDAG->getTargetConstant(BitWidth - 1, VT)); + } + + return SDValue(ShiftNode, 0); +} + +/// Does this tree qualify as an attempt to move a bitfield into position, +/// essentially "(and (shl VAL, N), Mask)". +static bool isBitfieldPositioningOp(SelectionDAG *CurDAG, SDValue Op, + SDValue &Src, int &ShiftAmount, + int &MaskWidth) { + EVT VT = Op.getValueType(); + unsigned BitWidth = VT.getSizeInBits(); + (void)BitWidth; + assert(BitWidth == 32 || BitWidth == 64); + + APInt KnownZero, KnownOne; + CurDAG->computeKnownBits(Op, KnownZero, KnownOne); + + // Non-zero in the sense that they're not provably zero, which is the key + // point if we want to use this value + uint64_t NonZeroBits = (~KnownZero).getZExtValue(); + + // Discard a constant AND mask if present. It's safe because the node will + // already have been factored into the computeKnownBits calculation above. 
+ uint64_t AndImm; + if (isOpcWithIntImmediate(Op.getNode(), ISD::AND, AndImm)) { + assert((~APInt(BitWidth, AndImm) & ~KnownZero) == 0); + Op = Op.getOperand(0); + } + + uint64_t ShlImm; + if (!isOpcWithIntImmediate(Op.getNode(), ISD::SHL, ShlImm)) + return false; + Op = Op.getOperand(0); + + if (!isShiftedMask_64(NonZeroBits)) + return false; + + ShiftAmount = countTrailingZeros(NonZeroBits); + MaskWidth = CountTrailingOnes_64(NonZeroBits >> ShiftAmount); + + // BFI encompasses sufficiently many nodes that it's worth inserting an extra + // LSL/LSR if the mask in NonZeroBits doesn't quite match up with the ISD::SHL + // amount. + Src = getLeftShift(CurDAG, Op, ShlImm - ShiftAmount); + + return true; +} + +// Given a OR operation, check if we have the following pattern +// ubfm c, b, imm, imm2 (or something that does the same jobs, see +// isBitfieldExtractOp) +// d = e & mask2 ; where mask is a binary sequence of 1..10..0 and +// countTrailingZeros(mask2) == imm2 - imm + 1 +// f = d | c +// if yes, given reference arguments will be update so that one can replace +// the OR instruction with: +// f = Opc Opd0, Opd1, LSB, MSB ; where Opc is a BFM, LSB = imm, and MSB = imm2 +static bool isBitfieldInsertOpFromOr(SDNode *N, unsigned &Opc, SDValue &Dst, + SDValue &Src, unsigned &ImmR, + unsigned &ImmS, SelectionDAG *CurDAG) { + assert(N->getOpcode() == ISD::OR && "Expect a OR operation"); + + // Set Opc + EVT VT = N->getValueType(0); + if (VT == MVT::i32) + Opc = AArch64::BFMWri; + else if (VT == MVT::i64) + Opc = AArch64::BFMXri; + else + return false; + + // Because of simplify-demanded-bits in DAGCombine, involved masks may not + // have the expected shape. Try to undo that. + APInt UsefulBits; + getUsefulBits(SDValue(N, 0), UsefulBits); + + unsigned NumberOfIgnoredLowBits = UsefulBits.countTrailingZeros(); + unsigned NumberOfIgnoredHighBits = UsefulBits.countLeadingZeros(); + + // OR is commutative, check both possibilities (does llvm provide a + // way to do that directely, e.g., via code matcher?) + SDValue OrOpd1Val = N->getOperand(1); + SDNode *OrOpd0 = N->getOperand(0).getNode(); + SDNode *OrOpd1 = N->getOperand(1).getNode(); + for (int i = 0; i < 2; + ++i, std::swap(OrOpd0, OrOpd1), OrOpd1Val = N->getOperand(0)) { + unsigned BFXOpc; + int DstLSB, Width; + if (isBitfieldExtractOp(CurDAG, OrOpd0, BFXOpc, Src, ImmR, ImmS, + NumberOfIgnoredLowBits, true)) { + // Check that the returned opcode is compatible with the pattern, + // i.e., same type and zero extended (U and not S) + if ((BFXOpc != AArch64::UBFMXri && VT == MVT::i64) || + (BFXOpc != AArch64::UBFMWri && VT == MVT::i32)) + continue; + + // Compute the width of the bitfield insertion + DstLSB = 0; + Width = ImmS - ImmR + 1; + // FIXME: This constraint is to catch bitfield insertion we may + // want to widen the pattern if we want to grab general bitfied + // move case + if (Width <= 0) + continue; + + // If the mask on the insertee is correct, we have a BFXIL operation. We + // can share the ImmR and ImmS values from the already-computed UBFM. + } else if (isBitfieldPositioningOp(CurDAG, SDValue(OrOpd0, 0), Src, + DstLSB, Width)) { + ImmR = (VT.getSizeInBits() - DstLSB) % VT.getSizeInBits(); + ImmS = Width - 1; + } else + continue; + + // Check the second part of the pattern + EVT VT = OrOpd1->getValueType(0); + assert((VT == MVT::i32 || VT == MVT::i64) && "unexpected OR operand"); + + // Compute the Known Zero for the candidate of the first operand. 
+ // This allows to catch more general case than just looking for + // AND with imm. Indeed, simplify-demanded-bits may have removed + // the AND instruction because it proves it was useless. + APInt KnownZero, KnownOne; + CurDAG->computeKnownBits(OrOpd1Val, KnownZero, KnownOne); + + // Check if there is enough room for the second operand to appear + // in the first one + APInt BitsToBeInserted = + APInt::getBitsSet(KnownZero.getBitWidth(), DstLSB, DstLSB + Width); + + if ((BitsToBeInserted & ~KnownZero) != 0) + continue; + + // Set the first operand + uint64_t Imm; + if (isOpcWithIntImmediate(OrOpd1, ISD::AND, Imm) && + isBitfieldDstMask(Imm, BitsToBeInserted, NumberOfIgnoredHighBits, VT)) + // In that case, we can eliminate the AND + Dst = OrOpd1->getOperand(0); + else + // Maybe the AND has been removed by simplify-demanded-bits + // or is useful because it discards more bits + Dst = OrOpd1Val; + + // both parts match + return true; + } + + return false; +} + +SDNode *AArch64DAGToDAGISel::SelectBitfieldInsertOp(SDNode *N) { + if (N->getOpcode() != ISD::OR) + return nullptr; + + unsigned Opc; + unsigned LSB, MSB; + SDValue Opd0, Opd1; + + if (!isBitfieldInsertOpFromOr(N, Opc, Opd0, Opd1, LSB, MSB, CurDAG)) + return nullptr; + + EVT VT = N->getValueType(0); + SDValue Ops[] = { Opd0, + Opd1, + CurDAG->getTargetConstant(LSB, VT), + CurDAG->getTargetConstant(MSB, VT) }; + return CurDAG->SelectNodeTo(N, Opc, VT, Ops); +} + +SDNode *AArch64DAGToDAGISel::SelectLIBM(SDNode *N) { + EVT VT = N->getValueType(0); + unsigned Variant; + unsigned Opc; + unsigned FRINTXOpcs[] = { AArch64::FRINTXSr, AArch64::FRINTXDr }; + + if (VT == MVT::f32) { + Variant = 0; + } else if (VT == MVT::f64) { + Variant = 1; + } else + return nullptr; // Unrecognized argument type. Fall back on default codegen. + + // Pick the FRINTX variant needed to set the flags. + unsigned FRINTXOpc = FRINTXOpcs[Variant]; + + switch (N->getOpcode()) { + default: + return nullptr; // Unrecognized libm ISD node. Fall back on default codegen. + case ISD::FCEIL: { + unsigned FRINTPOpcs[] = { AArch64::FRINTPSr, AArch64::FRINTPDr }; + Opc = FRINTPOpcs[Variant]; + break; + } + case ISD::FFLOOR: { + unsigned FRINTMOpcs[] = { AArch64::FRINTMSr, AArch64::FRINTMDr }; + Opc = FRINTMOpcs[Variant]; + break; + } + case ISD::FTRUNC: { + unsigned FRINTZOpcs[] = { AArch64::FRINTZSr, AArch64::FRINTZDr }; + Opc = FRINTZOpcs[Variant]; + break; + } + case ISD::FROUND: { + unsigned FRINTAOpcs[] = { AArch64::FRINTASr, AArch64::FRINTADr }; + Opc = FRINTAOpcs[Variant]; + break; + } + } + + SDLoc dl(N); + SDValue In = N->getOperand(0); + SmallVector Ops; + Ops.push_back(In); + + if (!TM.Options.UnsafeFPMath) { + SDNode *FRINTX = CurDAG->getMachineNode(FRINTXOpc, dl, VT, MVT::Glue, In); + Ops.push_back(SDValue(FRINTX, 1)); + } + + return CurDAG->getMachineNode(Opc, dl, VT, Ops); +} + +bool +AArch64DAGToDAGISel::SelectCVTFixedPosOperand(SDValue N, SDValue &FixedPos, + unsigned RegWidth) { + APFloat FVal(0.0); + if (ConstantFPSDNode *CN = dyn_cast(N)) + FVal = CN->getValueAPF(); + else if (LoadSDNode *LN = dyn_cast(N)) { + // Some otherwise illegal constants are allowed in this case. 
+    if (LN->getOperand(1).getOpcode() != AArch64ISD::ADDlow ||
+        !isa<ConstantPoolSDNode>(LN->getOperand(1)->getOperand(1)))
+      return false;
+
+    ConstantPoolSDNode *CN =
+        dyn_cast<ConstantPoolSDNode>(LN->getOperand(1)->getOperand(1));
+    FVal = cast<ConstantFP>(CN->getConstVal())->getValueAPF();
+  } else
+    return false;
+
+  // An FCVT[SU] instruction performs: convertToInt(Val * 2^fbits) where fbits
+  // is between 1 and 32 for a destination w-register, or 1 and 64 for an
+  // x-register.
+  //
+  // By this stage, we've detected (fp_to_[su]int (fmul Val, THIS_NODE)) so we
+  // want THIS_NODE to be 2^fbits. This is much easier to deal with using
+  // integers.
+  bool IsExact;
+
+  // fbits is between 1 and 64 in the worst-case, which means the fmul
+  // could have 2^64 as an actual operand. Need 65 bits of precision.
+  APSInt IntVal(65, true);
+  FVal.convertToInteger(IntVal, APFloat::rmTowardZero, &IsExact);
+
+  // N.b. isPowerOf2 also checks for > 0.
+  if (!IsExact || !IntVal.isPowerOf2()) return false;
+  unsigned FBits = IntVal.logBase2();
+
+  // Checks above should have guaranteed that we haven't lost information in
+  // finding FBits, but it must still be in range.
+  if (FBits == 0 || FBits > RegWidth) return false;
+
+  FixedPos = CurDAG->getTargetConstant(FBits, MVT::i32);
+  return true;
+}
+
+SDNode *AArch64DAGToDAGISel::Select(SDNode *Node) {
+  // Dump information about the Node being selected
+  DEBUG(errs() << "Selecting: ");
+  DEBUG(Node->dump(CurDAG));
+  DEBUG(errs() << "\n");
+
+  // If we have a custom node, we already have selected!
+  if (Node->isMachineOpcode()) {
+    DEBUG(errs() << "== "; Node->dump(CurDAG); errs() << "\n");
+    Node->setNodeId(-1);
+    return nullptr;
+  }
+
+  // Few custom selection stuff.
+  SDNode *ResNode = nullptr;
+  EVT VT = Node->getValueType(0);
+
+  switch (Node->getOpcode()) {
+  default:
+    break;
+
+  case ISD::ADD:
+    if (SDNode *I = SelectMLAV64LaneV128(Node))
+      return I;
+    break;
+
+  case ISD::LOAD: {
+    // Try to select as an indexed load. Fall through to normal processing
+    // if we can't.
+    bool Done = false;
+    SDNode *I = SelectIndexedLoad(Node, Done);
+    if (Done)
+      return I;
+    break;
+  }
+
+  case ISD::SRL:
+  case ISD::AND:
+  case ISD::SRA:
+    if (SDNode *I = SelectBitfieldExtractOp(Node))
+      return I;
+    break;
+
+  case ISD::OR:
+    if (SDNode *I = SelectBitfieldInsertOp(Node))
+      return I;
+    break;
+
+  case ISD::EXTRACT_VECTOR_ELT: {
+    // Extracting lane zero is a special case where we can just use a plain
+    // EXTRACT_SUBREG instruction, which will become FMOV. This is easier for
+    // the rest of the compiler, especially the register allocator and copy
+    // propagation, to reason about, so is preferred when it's possible to
+    // use it.
+    ConstantSDNode *LaneNode = cast<ConstantSDNode>(Node->getOperand(1));
+    // Bail and use the default Select() for non-zero lanes.
+    if (LaneNode->getZExtValue() != 0)
+      break;
+    // If the element type is not the same as the result type, likewise
+    // bail and use the default Select(), as there's more to do than just
+    // a cross-class COPY. This catches extracts of i8 and i16 elements
+    // since they will need an explicit zext.
+ if (VT != Node->getOperand(0).getValueType().getVectorElementType()) + break; + unsigned SubReg; + switch (Node->getOperand(0) + .getValueType() + .getVectorElementType() + .getSizeInBits()) { + default: + assert(0 && "Unexpected vector element type!"); + case 64: + SubReg = AArch64::dsub; + break; + case 32: + SubReg = AArch64::ssub; + break; + case 16: // FALLTHROUGH + case 8: + llvm_unreachable("unexpected zext-requiring extract element!"); + } + SDValue Extract = CurDAG->getTargetExtractSubreg(SubReg, SDLoc(Node), VT, + Node->getOperand(0)); + DEBUG(dbgs() << "ISEL: Custom selection!\n=> "); + DEBUG(Extract->dumpr(CurDAG)); + DEBUG(dbgs() << "\n"); + return Extract.getNode(); + } + case ISD::Constant: { + // Materialize zero constants as copies from WZR/XZR. This allows + // the coalescer to propagate these into other instructions. + ConstantSDNode *ConstNode = cast(Node); + if (ConstNode->isNullValue()) { + if (VT == MVT::i32) + return CurDAG->getCopyFromReg(CurDAG->getEntryNode(), SDLoc(Node), + AArch64::WZR, MVT::i32).getNode(); + else if (VT == MVT::i64) + return CurDAG->getCopyFromReg(CurDAG->getEntryNode(), SDLoc(Node), + AArch64::XZR, MVT::i64).getNode(); + } + break; + } + + case ISD::FrameIndex: { + // Selects to ADDXri FI, 0 which in turn will become ADDXri SP, imm. + int FI = cast(Node)->getIndex(); + unsigned Shifter = AArch64_AM::getShifterImm(AArch64_AM::LSL, 0); + const TargetLowering *TLI = getTargetLowering(); + SDValue TFI = CurDAG->getTargetFrameIndex(FI, TLI->getPointerTy()); + SDValue Ops[] = { TFI, CurDAG->getTargetConstant(0, MVT::i32), + CurDAG->getTargetConstant(Shifter, MVT::i32) }; + return CurDAG->SelectNodeTo(Node, AArch64::ADDXri, MVT::i64, Ops); + } + case ISD::INTRINSIC_W_CHAIN: { + unsigned IntNo = cast(Node->getOperand(1))->getZExtValue(); + switch (IntNo) { + default: + break; + case Intrinsic::aarch64_ldaxp: + case Intrinsic::aarch64_ldxp: { + unsigned Op = + IntNo == Intrinsic::aarch64_ldaxp ? AArch64::LDAXPX : AArch64::LDXPX; + SDValue MemAddr = Node->getOperand(2); + SDLoc DL(Node); + SDValue Chain = Node->getOperand(0); + + SDNode *Ld = CurDAG->getMachineNode(Op, DL, MVT::i64, MVT::i64, + MVT::Other, MemAddr, Chain); + + // Transfer memoperands. + MachineSDNode::mmo_iterator MemOp = MF->allocateMemRefsArray(1); + MemOp[0] = cast(Node)->getMemOperand(); + cast(Ld)->setMemRefs(MemOp, MemOp + 1); + return Ld; + } + case Intrinsic::aarch64_stlxp: + case Intrinsic::aarch64_stxp: { + unsigned Op = + IntNo == Intrinsic::aarch64_stlxp ? AArch64::STLXPX : AArch64::STXPX; + SDLoc DL(Node); + SDValue Chain = Node->getOperand(0); + SDValue ValLo = Node->getOperand(2); + SDValue ValHi = Node->getOperand(3); + SDValue MemAddr = Node->getOperand(4); + + // Place arguments in the right order. + SmallVector Ops; + Ops.push_back(ValLo); + Ops.push_back(ValHi); + Ops.push_back(MemAddr); + Ops.push_back(Chain); + + SDNode *St = CurDAG->getMachineNode(Op, DL, MVT::i32, MVT::Other, Ops); + // Transfer memoperands. 
+ MachineSDNode::mmo_iterator MemOp = MF->allocateMemRefsArray(1); + MemOp[0] = cast(Node)->getMemOperand(); + cast(St)->setMemRefs(MemOp, MemOp + 1); + + return St; + } + case Intrinsic::aarch64_neon_ld1x2: + if (VT == MVT::v8i8) + return SelectLoad(Node, 2, AArch64::LD1Twov8b, AArch64::dsub0); + else if (VT == MVT::v16i8) + return SelectLoad(Node, 2, AArch64::LD1Twov16b, AArch64::qsub0); + else if (VT == MVT::v4i16) + return SelectLoad(Node, 2, AArch64::LD1Twov4h, AArch64::dsub0); + else if (VT == MVT::v8i16) + return SelectLoad(Node, 2, AArch64::LD1Twov8h, AArch64::qsub0); + else if (VT == MVT::v2i32 || VT == MVT::v2f32) + return SelectLoad(Node, 2, AArch64::LD1Twov2s, AArch64::dsub0); + else if (VT == MVT::v4i32 || VT == MVT::v4f32) + return SelectLoad(Node, 2, AArch64::LD1Twov4s, AArch64::qsub0); + else if (VT == MVT::v1i64 || VT == MVT::v1f64) + return SelectLoad(Node, 2, AArch64::LD1Twov1d, AArch64::dsub0); + else if (VT == MVT::v2i64 || VT == MVT::v2f64) + return SelectLoad(Node, 2, AArch64::LD1Twov2d, AArch64::qsub0); + break; + case Intrinsic::aarch64_neon_ld1x3: + if (VT == MVT::v8i8) + return SelectLoad(Node, 3, AArch64::LD1Threev8b, AArch64::dsub0); + else if (VT == MVT::v16i8) + return SelectLoad(Node, 3, AArch64::LD1Threev16b, AArch64::qsub0); + else if (VT == MVT::v4i16) + return SelectLoad(Node, 3, AArch64::LD1Threev4h, AArch64::dsub0); + else if (VT == MVT::v8i16) + return SelectLoad(Node, 3, AArch64::LD1Threev8h, AArch64::qsub0); + else if (VT == MVT::v2i32 || VT == MVT::v2f32) + return SelectLoad(Node, 3, AArch64::LD1Threev2s, AArch64::dsub0); + else if (VT == MVT::v4i32 || VT == MVT::v4f32) + return SelectLoad(Node, 3, AArch64::LD1Threev4s, AArch64::qsub0); + else if (VT == MVT::v1i64 || VT == MVT::v1f64) + return SelectLoad(Node, 3, AArch64::LD1Threev1d, AArch64::dsub0); + else if (VT == MVT::v2i64 || VT == MVT::v2f64) + return SelectLoad(Node, 3, AArch64::LD1Threev2d, AArch64::qsub0); + break; + case Intrinsic::aarch64_neon_ld1x4: + if (VT == MVT::v8i8) + return SelectLoad(Node, 4, AArch64::LD1Fourv8b, AArch64::dsub0); + else if (VT == MVT::v16i8) + return SelectLoad(Node, 4, AArch64::LD1Fourv16b, AArch64::qsub0); + else if (VT == MVT::v4i16) + return SelectLoad(Node, 4, AArch64::LD1Fourv4h, AArch64::dsub0); + else if (VT == MVT::v8i16) + return SelectLoad(Node, 4, AArch64::LD1Fourv8h, AArch64::qsub0); + else if (VT == MVT::v2i32 || VT == MVT::v2f32) + return SelectLoad(Node, 4, AArch64::LD1Fourv2s, AArch64::dsub0); + else if (VT == MVT::v4i32 || VT == MVT::v4f32) + return SelectLoad(Node, 4, AArch64::LD1Fourv4s, AArch64::qsub0); + else if (VT == MVT::v1i64 || VT == MVT::v1f64) + return SelectLoad(Node, 4, AArch64::LD1Fourv1d, AArch64::dsub0); + else if (VT == MVT::v2i64 || VT == MVT::v2f64) + return SelectLoad(Node, 4, AArch64::LD1Fourv2d, AArch64::qsub0); + break; + case Intrinsic::aarch64_neon_ld2: + if (VT == MVT::v8i8) + return SelectLoad(Node, 2, AArch64::LD2Twov8b, AArch64::dsub0); + else if (VT == MVT::v16i8) + return SelectLoad(Node, 2, AArch64::LD2Twov16b, AArch64::qsub0); + else if (VT == MVT::v4i16) + return SelectLoad(Node, 2, AArch64::LD2Twov4h, AArch64::dsub0); + else if (VT == MVT::v8i16) + return SelectLoad(Node, 2, AArch64::LD2Twov8h, AArch64::qsub0); + else if (VT == MVT::v2i32 || VT == MVT::v2f32) + return SelectLoad(Node, 2, AArch64::LD2Twov2s, AArch64::dsub0); + else if (VT == MVT::v4i32 || VT == MVT::v4f32) + return SelectLoad(Node, 2, AArch64::LD2Twov4s, AArch64::qsub0); + else if (VT == MVT::v1i64 || VT == MVT::v1f64) + return 
SelectLoad(Node, 2, AArch64::LD1Twov1d, AArch64::dsub0); + else if (VT == MVT::v2i64 || VT == MVT::v2f64) + return SelectLoad(Node, 2, AArch64::LD2Twov2d, AArch64::qsub0); + break; + case Intrinsic::aarch64_neon_ld3: + if (VT == MVT::v8i8) + return SelectLoad(Node, 3, AArch64::LD3Threev8b, AArch64::dsub0); + else if (VT == MVT::v16i8) + return SelectLoad(Node, 3, AArch64::LD3Threev16b, AArch64::qsub0); + else if (VT == MVT::v4i16) + return SelectLoad(Node, 3, AArch64::LD3Threev4h, AArch64::dsub0); + else if (VT == MVT::v8i16) + return SelectLoad(Node, 3, AArch64::LD3Threev8h, AArch64::qsub0); + else if (VT == MVT::v2i32 || VT == MVT::v2f32) + return SelectLoad(Node, 3, AArch64::LD3Threev2s, AArch64::dsub0); + else if (VT == MVT::v4i32 || VT == MVT::v4f32) + return SelectLoad(Node, 3, AArch64::LD3Threev4s, AArch64::qsub0); + else if (VT == MVT::v1i64 || VT == MVT::v1f64) + return SelectLoad(Node, 3, AArch64::LD1Threev1d, AArch64::dsub0); + else if (VT == MVT::v2i64 || VT == MVT::v2f64) + return SelectLoad(Node, 3, AArch64::LD3Threev2d, AArch64::qsub0); + break; + case Intrinsic::aarch64_neon_ld4: + if (VT == MVT::v8i8) + return SelectLoad(Node, 4, AArch64::LD4Fourv8b, AArch64::dsub0); + else if (VT == MVT::v16i8) + return SelectLoad(Node, 4, AArch64::LD4Fourv16b, AArch64::qsub0); + else if (VT == MVT::v4i16) + return SelectLoad(Node, 4, AArch64::LD4Fourv4h, AArch64::dsub0); + else if (VT == MVT::v8i16) + return SelectLoad(Node, 4, AArch64::LD4Fourv8h, AArch64::qsub0); + else if (VT == MVT::v2i32 || VT == MVT::v2f32) + return SelectLoad(Node, 4, AArch64::LD4Fourv2s, AArch64::dsub0); + else if (VT == MVT::v4i32 || VT == MVT::v4f32) + return SelectLoad(Node, 4, AArch64::LD4Fourv4s, AArch64::qsub0); + else if (VT == MVT::v1i64 || VT == MVT::v1f64) + return SelectLoad(Node, 4, AArch64::LD1Fourv1d, AArch64::dsub0); + else if (VT == MVT::v2i64 || VT == MVT::v2f64) + return SelectLoad(Node, 4, AArch64::LD4Fourv2d, AArch64::qsub0); + break; + case Intrinsic::aarch64_neon_ld2r: + if (VT == MVT::v8i8) + return SelectLoad(Node, 2, AArch64::LD2Rv8b, AArch64::dsub0); + else if (VT == MVT::v16i8) + return SelectLoad(Node, 2, AArch64::LD2Rv16b, AArch64::qsub0); + else if (VT == MVT::v4i16) + return SelectLoad(Node, 2, AArch64::LD2Rv4h, AArch64::dsub0); + else if (VT == MVT::v8i16) + return SelectLoad(Node, 2, AArch64::LD2Rv8h, AArch64::qsub0); + else if (VT == MVT::v2i32 || VT == MVT::v2f32) + return SelectLoad(Node, 2, AArch64::LD2Rv2s, AArch64::dsub0); + else if (VT == MVT::v4i32 || VT == MVT::v4f32) + return SelectLoad(Node, 2, AArch64::LD2Rv4s, AArch64::qsub0); + else if (VT == MVT::v1i64 || VT == MVT::v1f64) + return SelectLoad(Node, 2, AArch64::LD2Rv1d, AArch64::dsub0); + else if (VT == MVT::v2i64 || VT == MVT::v2f64) + return SelectLoad(Node, 2, AArch64::LD2Rv2d, AArch64::qsub0); + break; + case Intrinsic::aarch64_neon_ld3r: + if (VT == MVT::v8i8) + return SelectLoad(Node, 3, AArch64::LD3Rv8b, AArch64::dsub0); + else if (VT == MVT::v16i8) + return SelectLoad(Node, 3, AArch64::LD3Rv16b, AArch64::qsub0); + else if (VT == MVT::v4i16) + return SelectLoad(Node, 3, AArch64::LD3Rv4h, AArch64::dsub0); + else if (VT == MVT::v8i16) + return SelectLoad(Node, 3, AArch64::LD3Rv8h, AArch64::qsub0); + else if (VT == MVT::v2i32 || VT == MVT::v2f32) + return SelectLoad(Node, 3, AArch64::LD3Rv2s, AArch64::dsub0); + else if (VT == MVT::v4i32 || VT == MVT::v4f32) + return SelectLoad(Node, 3, AArch64::LD3Rv4s, AArch64::qsub0); + else if (VT == MVT::v1i64 || VT == MVT::v1f64) + return SelectLoad(Node, 3, 
AArch64::LD3Rv1d, AArch64::dsub0); + else if (VT == MVT::v2i64 || VT == MVT::v2f64) + return SelectLoad(Node, 3, AArch64::LD3Rv2d, AArch64::qsub0); + break; + case Intrinsic::aarch64_neon_ld4r: + if (VT == MVT::v8i8) + return SelectLoad(Node, 4, AArch64::LD4Rv8b, AArch64::dsub0); + else if (VT == MVT::v16i8) + return SelectLoad(Node, 4, AArch64::LD4Rv16b, AArch64::qsub0); + else if (VT == MVT::v4i16) + return SelectLoad(Node, 4, AArch64::LD4Rv4h, AArch64::dsub0); + else if (VT == MVT::v8i16) + return SelectLoad(Node, 4, AArch64::LD4Rv8h, AArch64::qsub0); + else if (VT == MVT::v2i32 || VT == MVT::v2f32) + return SelectLoad(Node, 4, AArch64::LD4Rv2s, AArch64::dsub0); + else if (VT == MVT::v4i32 || VT == MVT::v4f32) + return SelectLoad(Node, 4, AArch64::LD4Rv4s, AArch64::qsub0); + else if (VT == MVT::v1i64 || VT == MVT::v1f64) + return SelectLoad(Node, 4, AArch64::LD4Rv1d, AArch64::dsub0); + else if (VT == MVT::v2i64 || VT == MVT::v2f64) + return SelectLoad(Node, 4, AArch64::LD4Rv2d, AArch64::qsub0); + break; + case Intrinsic::aarch64_neon_ld2lane: + if (VT == MVT::v16i8 || VT == MVT::v8i8) + return SelectLoadLane(Node, 2, AArch64::LD2i8); + else if (VT == MVT::v8i16 || VT == MVT::v4i16) + return SelectLoadLane(Node, 2, AArch64::LD2i16); + else if (VT == MVT::v4i32 || VT == MVT::v2i32 || VT == MVT::v4f32 || + VT == MVT::v2f32) + return SelectLoadLane(Node, 2, AArch64::LD2i32); + else if (VT == MVT::v2i64 || VT == MVT::v1i64 || VT == MVT::v2f64 || + VT == MVT::v1f64) + return SelectLoadLane(Node, 2, AArch64::LD2i64); + break; + case Intrinsic::aarch64_neon_ld3lane: + if (VT == MVT::v16i8 || VT == MVT::v8i8) + return SelectLoadLane(Node, 3, AArch64::LD3i8); + else if (VT == MVT::v8i16 || VT == MVT::v4i16) + return SelectLoadLane(Node, 3, AArch64::LD3i16); + else if (VT == MVT::v4i32 || VT == MVT::v2i32 || VT == MVT::v4f32 || + VT == MVT::v2f32) + return SelectLoadLane(Node, 3, AArch64::LD3i32); + else if (VT == MVT::v2i64 || VT == MVT::v1i64 || VT == MVT::v2f64 || + VT == MVT::v1f64) + return SelectLoadLane(Node, 3, AArch64::LD3i64); + break; + case Intrinsic::aarch64_neon_ld4lane: + if (VT == MVT::v16i8 || VT == MVT::v8i8) + return SelectLoadLane(Node, 4, AArch64::LD4i8); + else if (VT == MVT::v8i16 || VT == MVT::v4i16) + return SelectLoadLane(Node, 4, AArch64::LD4i16); + else if (VT == MVT::v4i32 || VT == MVT::v2i32 || VT == MVT::v4f32 || + VT == MVT::v2f32) + return SelectLoadLane(Node, 4, AArch64::LD4i32); + else if (VT == MVT::v2i64 || VT == MVT::v1i64 || VT == MVT::v2f64 || + VT == MVT::v1f64) + return SelectLoadLane(Node, 4, AArch64::LD4i64); + break; + } + } break; + case ISD::INTRINSIC_WO_CHAIN: { + unsigned IntNo = cast(Node->getOperand(0))->getZExtValue(); + switch (IntNo) { + default: + break; + case Intrinsic::aarch64_neon_tbl2: + return SelectTable(Node, 2, VT == MVT::v8i8 ? AArch64::TBLv8i8Two + : AArch64::TBLv16i8Two, + false); + case Intrinsic::aarch64_neon_tbl3: + return SelectTable(Node, 3, VT == MVT::v8i8 ? AArch64::TBLv8i8Three + : AArch64::TBLv16i8Three, + false); + case Intrinsic::aarch64_neon_tbl4: + return SelectTable(Node, 4, VT == MVT::v8i8 ? AArch64::TBLv8i8Four + : AArch64::TBLv16i8Four, + false); + case Intrinsic::aarch64_neon_tbx2: + return SelectTable(Node, 2, VT == MVT::v8i8 ? AArch64::TBXv8i8Two + : AArch64::TBXv16i8Two, + true); + case Intrinsic::aarch64_neon_tbx3: + return SelectTable(Node, 3, VT == MVT::v8i8 ? AArch64::TBXv8i8Three + : AArch64::TBXv16i8Three, + true); + case Intrinsic::aarch64_neon_tbx4: + return SelectTable(Node, 4, VT == MVT::v8i8 ? 
AArch64::TBXv8i8Four + : AArch64::TBXv16i8Four, + true); + case Intrinsic::aarch64_neon_smull: + case Intrinsic::aarch64_neon_umull: + if (SDNode *N = SelectMULLV64LaneV128(IntNo, Node)) + return N; + break; + } + break; + } + case ISD::INTRINSIC_VOID: { + unsigned IntNo = cast(Node->getOperand(1))->getZExtValue(); + if (Node->getNumOperands() >= 3) + VT = Node->getOperand(2)->getValueType(0); + switch (IntNo) { + default: + break; + case Intrinsic::aarch64_neon_st1x2: { + if (VT == MVT::v8i8) + return SelectStore(Node, 2, AArch64::ST1Twov8b); + else if (VT == MVT::v16i8) + return SelectStore(Node, 2, AArch64::ST1Twov16b); + else if (VT == MVT::v4i16) + return SelectStore(Node, 2, AArch64::ST1Twov4h); + else if (VT == MVT::v8i16) + return SelectStore(Node, 2, AArch64::ST1Twov8h); + else if (VT == MVT::v2i32 || VT == MVT::v2f32) + return SelectStore(Node, 2, AArch64::ST1Twov2s); + else if (VT == MVT::v4i32 || VT == MVT::v4f32) + return SelectStore(Node, 2, AArch64::ST1Twov4s); + else if (VT == MVT::v2i64 || VT == MVT::v2f64) + return SelectStore(Node, 2, AArch64::ST1Twov2d); + else if (VT == MVT::v1i64 || VT == MVT::v1f64) + return SelectStore(Node, 2, AArch64::ST1Twov1d); + break; + } + case Intrinsic::aarch64_neon_st1x3: { + if (VT == MVT::v8i8) + return SelectStore(Node, 3, AArch64::ST1Threev8b); + else if (VT == MVT::v16i8) + return SelectStore(Node, 3, AArch64::ST1Threev16b); + else if (VT == MVT::v4i16) + return SelectStore(Node, 3, AArch64::ST1Threev4h); + else if (VT == MVT::v8i16) + return SelectStore(Node, 3, AArch64::ST1Threev8h); + else if (VT == MVT::v2i32 || VT == MVT::v2f32) + return SelectStore(Node, 3, AArch64::ST1Threev2s); + else if (VT == MVT::v4i32 || VT == MVT::v4f32) + return SelectStore(Node, 3, AArch64::ST1Threev4s); + else if (VT == MVT::v2i64 || VT == MVT::v2f64) + return SelectStore(Node, 3, AArch64::ST1Threev2d); + else if (VT == MVT::v1i64 || VT == MVT::v1f64) + return SelectStore(Node, 3, AArch64::ST1Threev1d); + break; + } + case Intrinsic::aarch64_neon_st1x4: { + if (VT == MVT::v8i8) + return SelectStore(Node, 4, AArch64::ST1Fourv8b); + else if (VT == MVT::v16i8) + return SelectStore(Node, 4, AArch64::ST1Fourv16b); + else if (VT == MVT::v4i16) + return SelectStore(Node, 4, AArch64::ST1Fourv4h); + else if (VT == MVT::v8i16) + return SelectStore(Node, 4, AArch64::ST1Fourv8h); + else if (VT == MVT::v2i32 || VT == MVT::v2f32) + return SelectStore(Node, 4, AArch64::ST1Fourv2s); + else if (VT == MVT::v4i32 || VT == MVT::v4f32) + return SelectStore(Node, 4, AArch64::ST1Fourv4s); + else if (VT == MVT::v2i64 || VT == MVT::v2f64) + return SelectStore(Node, 4, AArch64::ST1Fourv2d); + else if (VT == MVT::v1i64 || VT == MVT::v1f64) + return SelectStore(Node, 4, AArch64::ST1Fourv1d); + break; + } + case Intrinsic::aarch64_neon_st2: { + if (VT == MVT::v8i8) + return SelectStore(Node, 2, AArch64::ST2Twov8b); + else if (VT == MVT::v16i8) + return SelectStore(Node, 2, AArch64::ST2Twov16b); + else if (VT == MVT::v4i16) + return SelectStore(Node, 2, AArch64::ST2Twov4h); + else if (VT == MVT::v8i16) + return SelectStore(Node, 2, AArch64::ST2Twov8h); + else if (VT == MVT::v2i32 || VT == MVT::v2f32) + return SelectStore(Node, 2, AArch64::ST2Twov2s); + else if (VT == MVT::v4i32 || VT == MVT::v4f32) + return SelectStore(Node, 2, AArch64::ST2Twov4s); + else if (VT == MVT::v2i64 || VT == MVT::v2f64) + return SelectStore(Node, 2, AArch64::ST2Twov2d); + else if (VT == MVT::v1i64 || VT == MVT::v1f64) + return SelectStore(Node, 2, AArch64::ST1Twov1d); + break; + } + case 
Intrinsic::aarch64_neon_st3: { + if (VT == MVT::v8i8) + return SelectStore(Node, 3, AArch64::ST3Threev8b); + else if (VT == MVT::v16i8) + return SelectStore(Node, 3, AArch64::ST3Threev16b); + else if (VT == MVT::v4i16) + return SelectStore(Node, 3, AArch64::ST3Threev4h); + else if (VT == MVT::v8i16) + return SelectStore(Node, 3, AArch64::ST3Threev8h); + else if (VT == MVT::v2i32 || VT == MVT::v2f32) + return SelectStore(Node, 3, AArch64::ST3Threev2s); + else if (VT == MVT::v4i32 || VT == MVT::v4f32) + return SelectStore(Node, 3, AArch64::ST3Threev4s); + else if (VT == MVT::v2i64 || VT == MVT::v2f64) + return SelectStore(Node, 3, AArch64::ST3Threev2d); + else if (VT == MVT::v1i64 || VT == MVT::v1f64) + return SelectStore(Node, 3, AArch64::ST1Threev1d); + break; + } + case Intrinsic::aarch64_neon_st4: { + if (VT == MVT::v8i8) + return SelectStore(Node, 4, AArch64::ST4Fourv8b); + else if (VT == MVT::v16i8) + return SelectStore(Node, 4, AArch64::ST4Fourv16b); + else if (VT == MVT::v4i16) + return SelectStore(Node, 4, AArch64::ST4Fourv4h); + else if (VT == MVT::v8i16) + return SelectStore(Node, 4, AArch64::ST4Fourv8h); + else if (VT == MVT::v2i32 || VT == MVT::v2f32) + return SelectStore(Node, 4, AArch64::ST4Fourv2s); + else if (VT == MVT::v4i32 || VT == MVT::v4f32) + return SelectStore(Node, 4, AArch64::ST4Fourv4s); + else if (VT == MVT::v2i64 || VT == MVT::v2f64) + return SelectStore(Node, 4, AArch64::ST4Fourv2d); + else if (VT == MVT::v1i64 || VT == MVT::v1f64) + return SelectStore(Node, 4, AArch64::ST1Fourv1d); + break; + } + case Intrinsic::aarch64_neon_st2lane: { + if (VT == MVT::v16i8 || VT == MVT::v8i8) + return SelectStoreLane(Node, 2, AArch64::ST2i8); + else if (VT == MVT::v8i16 || VT == MVT::v4i16) + return SelectStoreLane(Node, 2, AArch64::ST2i16); + else if (VT == MVT::v4i32 || VT == MVT::v2i32 || VT == MVT::v4f32 || + VT == MVT::v2f32) + return SelectStoreLane(Node, 2, AArch64::ST2i32); + else if (VT == MVT::v2i64 || VT == MVT::v1i64 || VT == MVT::v2f64 || + VT == MVT::v1f64) + return SelectStoreLane(Node, 2, AArch64::ST2i64); + break; + } + case Intrinsic::aarch64_neon_st3lane: { + if (VT == MVT::v16i8 || VT == MVT::v8i8) + return SelectStoreLane(Node, 3, AArch64::ST3i8); + else if (VT == MVT::v8i16 || VT == MVT::v4i16) + return SelectStoreLane(Node, 3, AArch64::ST3i16); + else if (VT == MVT::v4i32 || VT == MVT::v2i32 || VT == MVT::v4f32 || + VT == MVT::v2f32) + return SelectStoreLane(Node, 3, AArch64::ST3i32); + else if (VT == MVT::v2i64 || VT == MVT::v1i64 || VT == MVT::v2f64 || + VT == MVT::v1f64) + return SelectStoreLane(Node, 3, AArch64::ST3i64); + break; + } + case Intrinsic::aarch64_neon_st4lane: { + if (VT == MVT::v16i8 || VT == MVT::v8i8) + return SelectStoreLane(Node, 4, AArch64::ST4i8); + else if (VT == MVT::v8i16 || VT == MVT::v4i16) + return SelectStoreLane(Node, 4, AArch64::ST4i16); + else if (VT == MVT::v4i32 || VT == MVT::v2i32 || VT == MVT::v4f32 || + VT == MVT::v2f32) + return SelectStoreLane(Node, 4, AArch64::ST4i32); + else if (VT == MVT::v2i64 || VT == MVT::v1i64 || VT == MVT::v2f64 || + VT == MVT::v1f64) + return SelectStoreLane(Node, 4, AArch64::ST4i64); + break; + } + } + } + case AArch64ISD::LD2post: { + if (VT == MVT::v8i8) + return SelectPostLoad(Node, 2, AArch64::LD2Twov8b_POST, AArch64::dsub0); + else if (VT == MVT::v16i8) + return SelectPostLoad(Node, 2, AArch64::LD2Twov16b_POST, AArch64::qsub0); + else if (VT == MVT::v4i16) + return SelectPostLoad(Node, 2, AArch64::LD2Twov4h_POST, AArch64::dsub0); + else if (VT == MVT::v8i16) + return 
SelectPostLoad(Node, 2, AArch64::LD2Twov8h_POST, AArch64::qsub0); + else if (VT == MVT::v2i32 || VT == MVT::v2f32) + return SelectPostLoad(Node, 2, AArch64::LD2Twov2s_POST, AArch64::dsub0); + else if (VT == MVT::v4i32 || VT == MVT::v4f32) + return SelectPostLoad(Node, 2, AArch64::LD2Twov4s_POST, AArch64::qsub0); + else if (VT == MVT::v1i64 || VT == MVT::v1f64) + return SelectPostLoad(Node, 2, AArch64::LD1Twov1d_POST, AArch64::dsub0); + else if (VT == MVT::v2i64 || VT == MVT::v2f64) + return SelectPostLoad(Node, 2, AArch64::LD2Twov2d_POST, AArch64::qsub0); + break; + } + case AArch64ISD::LD3post: { + if (VT == MVT::v8i8) + return SelectPostLoad(Node, 3, AArch64::LD3Threev8b_POST, AArch64::dsub0); + else if (VT == MVT::v16i8) + return SelectPostLoad(Node, 3, AArch64::LD3Threev16b_POST, AArch64::qsub0); + else if (VT == MVT::v4i16) + return SelectPostLoad(Node, 3, AArch64::LD3Threev4h_POST, AArch64::dsub0); + else if (VT == MVT::v8i16) + return SelectPostLoad(Node, 3, AArch64::LD3Threev8h_POST, AArch64::qsub0); + else if (VT == MVT::v2i32 || VT == MVT::v2f32) + return SelectPostLoad(Node, 3, AArch64::LD3Threev2s_POST, AArch64::dsub0); + else if (VT == MVT::v4i32 || VT == MVT::v4f32) + return SelectPostLoad(Node, 3, AArch64::LD3Threev4s_POST, AArch64::qsub0); + else if (VT == MVT::v1i64 || VT == MVT::v1f64) + return SelectPostLoad(Node, 3, AArch64::LD1Threev1d_POST, AArch64::dsub0); + else if (VT == MVT::v2i64 || VT == MVT::v2f64) + return SelectPostLoad(Node, 3, AArch64::LD3Threev2d_POST, AArch64::qsub0); + break; + } + case AArch64ISD::LD4post: { + if (VT == MVT::v8i8) + return SelectPostLoad(Node, 4, AArch64::LD4Fourv8b_POST, AArch64::dsub0); + else if (VT == MVT::v16i8) + return SelectPostLoad(Node, 4, AArch64::LD4Fourv16b_POST, AArch64::qsub0); + else if (VT == MVT::v4i16) + return SelectPostLoad(Node, 4, AArch64::LD4Fourv4h_POST, AArch64::dsub0); + else if (VT == MVT::v8i16) + return SelectPostLoad(Node, 4, AArch64::LD4Fourv8h_POST, AArch64::qsub0); + else if (VT == MVT::v2i32 || VT == MVT::v2f32) + return SelectPostLoad(Node, 4, AArch64::LD4Fourv2s_POST, AArch64::dsub0); + else if (VT == MVT::v4i32 || VT == MVT::v4f32) + return SelectPostLoad(Node, 4, AArch64::LD4Fourv4s_POST, AArch64::qsub0); + else if (VT == MVT::v1i64 || VT == MVT::v1f64) + return SelectPostLoad(Node, 4, AArch64::LD1Fourv1d_POST, AArch64::dsub0); + else if (VT == MVT::v2i64 || VT == MVT::v2f64) + return SelectPostLoad(Node, 4, AArch64::LD4Fourv2d_POST, AArch64::qsub0); + break; + } + case AArch64ISD::LD1x2post: { + if (VT == MVT::v8i8) + return SelectPostLoad(Node, 2, AArch64::LD1Twov8b_POST, AArch64::dsub0); + else if (VT == MVT::v16i8) + return SelectPostLoad(Node, 2, AArch64::LD1Twov16b_POST, AArch64::qsub0); + else if (VT == MVT::v4i16) + return SelectPostLoad(Node, 2, AArch64::LD1Twov4h_POST, AArch64::dsub0); + else if (VT == MVT::v8i16) + return SelectPostLoad(Node, 2, AArch64::LD1Twov8h_POST, AArch64::qsub0); + else if (VT == MVT::v2i32 || VT == MVT::v2f32) + return SelectPostLoad(Node, 2, AArch64::LD1Twov2s_POST, AArch64::dsub0); + else if (VT == MVT::v4i32 || VT == MVT::v4f32) + return SelectPostLoad(Node, 2, AArch64::LD1Twov4s_POST, AArch64::qsub0); + else if (VT == MVT::v1i64 || VT == MVT::v1f64) + return SelectPostLoad(Node, 2, AArch64::LD1Twov1d_POST, AArch64::dsub0); + else if (VT == MVT::v2i64 || VT == MVT::v2f64) + return SelectPostLoad(Node, 2, AArch64::LD1Twov2d_POST, AArch64::qsub0); + break; + } + case AArch64ISD::LD1x3post: { + if (VT == MVT::v8i8) + return SelectPostLoad(Node, 3, 
AArch64::LD1Threev8b_POST, AArch64::dsub0); + else if (VT == MVT::v16i8) + return SelectPostLoad(Node, 3, AArch64::LD1Threev16b_POST, AArch64::qsub0); + else if (VT == MVT::v4i16) + return SelectPostLoad(Node, 3, AArch64::LD1Threev4h_POST, AArch64::dsub0); + else if (VT == MVT::v8i16) + return SelectPostLoad(Node, 3, AArch64::LD1Threev8h_POST, AArch64::qsub0); + else if (VT == MVT::v2i32 || VT == MVT::v2f32) + return SelectPostLoad(Node, 3, AArch64::LD1Threev2s_POST, AArch64::dsub0); + else if (VT == MVT::v4i32 || VT == MVT::v4f32) + return SelectPostLoad(Node, 3, AArch64::LD1Threev4s_POST, AArch64::qsub0); + else if (VT == MVT::v1i64 || VT == MVT::v1f64) + return SelectPostLoad(Node, 3, AArch64::LD1Threev1d_POST, AArch64::dsub0); + else if (VT == MVT::v2i64 || VT == MVT::v2f64) + return SelectPostLoad(Node, 3, AArch64::LD1Threev2d_POST, AArch64::qsub0); + break; + } + case AArch64ISD::LD1x4post: { + if (VT == MVT::v8i8) + return SelectPostLoad(Node, 4, AArch64::LD1Fourv8b_POST, AArch64::dsub0); + else if (VT == MVT::v16i8) + return SelectPostLoad(Node, 4, AArch64::LD1Fourv16b_POST, AArch64::qsub0); + else if (VT == MVT::v4i16) + return SelectPostLoad(Node, 4, AArch64::LD1Fourv4h_POST, AArch64::dsub0); + else if (VT == MVT::v8i16) + return SelectPostLoad(Node, 4, AArch64::LD1Fourv8h_POST, AArch64::qsub0); + else if (VT == MVT::v2i32 || VT == MVT::v2f32) + return SelectPostLoad(Node, 4, AArch64::LD1Fourv2s_POST, AArch64::dsub0); + else if (VT == MVT::v4i32 || VT == MVT::v4f32) + return SelectPostLoad(Node, 4, AArch64::LD1Fourv4s_POST, AArch64::qsub0); + else if (VT == MVT::v1i64 || VT == MVT::v1f64) + return SelectPostLoad(Node, 4, AArch64::LD1Fourv1d_POST, AArch64::dsub0); + else if (VT == MVT::v2i64 || VT == MVT::v2f64) + return SelectPostLoad(Node, 4, AArch64::LD1Fourv2d_POST, AArch64::qsub0); + break; + } + case AArch64ISD::LD1DUPpost: { + if (VT == MVT::v8i8) + return SelectPostLoad(Node, 1, AArch64::LD1Rv8b_POST, AArch64::dsub0); + else if (VT == MVT::v16i8) + return SelectPostLoad(Node, 1, AArch64::LD1Rv16b_POST, AArch64::qsub0); + else if (VT == MVT::v4i16) + return SelectPostLoad(Node, 1, AArch64::LD1Rv4h_POST, AArch64::dsub0); + else if (VT == MVT::v8i16) + return SelectPostLoad(Node, 1, AArch64::LD1Rv8h_POST, AArch64::qsub0); + else if (VT == MVT::v2i32 || VT == MVT::v2f32) + return SelectPostLoad(Node, 1, AArch64::LD1Rv2s_POST, AArch64::dsub0); + else if (VT == MVT::v4i32 || VT == MVT::v4f32) + return SelectPostLoad(Node, 1, AArch64::LD1Rv4s_POST, AArch64::qsub0); + else if (VT == MVT::v1i64 || VT == MVT::v1f64) + return SelectPostLoad(Node, 1, AArch64::LD1Rv1d_POST, AArch64::dsub0); + else if (VT == MVT::v2i64 || VT == MVT::v2f64) + return SelectPostLoad(Node, 1, AArch64::LD1Rv2d_POST, AArch64::qsub0); + break; + } + case AArch64ISD::LD2DUPpost: { + if (VT == MVT::v8i8) + return SelectPostLoad(Node, 2, AArch64::LD2Rv8b_POST, AArch64::dsub0); + else if (VT == MVT::v16i8) + return SelectPostLoad(Node, 2, AArch64::LD2Rv16b_POST, AArch64::qsub0); + else if (VT == MVT::v4i16) + return SelectPostLoad(Node, 2, AArch64::LD2Rv4h_POST, AArch64::dsub0); + else if (VT == MVT::v8i16) + return SelectPostLoad(Node, 2, AArch64::LD2Rv8h_POST, AArch64::qsub0); + else if (VT == MVT::v2i32 || VT == MVT::v2f32) + return SelectPostLoad(Node, 2, AArch64::LD2Rv2s_POST, AArch64::dsub0); + else if (VT == MVT::v4i32 || VT == MVT::v4f32) + return SelectPostLoad(Node, 2, AArch64::LD2Rv4s_POST, AArch64::qsub0); + else if (VT == MVT::v1i64 || VT == MVT::v1f64) + return SelectPostLoad(Node, 2, 
AArch64::LD2Rv1d_POST, AArch64::dsub0); + else if (VT == MVT::v2i64 || VT == MVT::v2f64) + return SelectPostLoad(Node, 2, AArch64::LD2Rv2d_POST, AArch64::qsub0); + break; + } + case AArch64ISD::LD3DUPpost: { + if (VT == MVT::v8i8) + return SelectPostLoad(Node, 3, AArch64::LD3Rv8b_POST, AArch64::dsub0); + else if (VT == MVT::v16i8) + return SelectPostLoad(Node, 3, AArch64::LD3Rv16b_POST, AArch64::qsub0); + else if (VT == MVT::v4i16) + return SelectPostLoad(Node, 3, AArch64::LD3Rv4h_POST, AArch64::dsub0); + else if (VT == MVT::v8i16) + return SelectPostLoad(Node, 3, AArch64::LD3Rv8h_POST, AArch64::qsub0); + else if (VT == MVT::v2i32 || VT == MVT::v2f32) + return SelectPostLoad(Node, 3, AArch64::LD3Rv2s_POST, AArch64::dsub0); + else if (VT == MVT::v4i32 || VT == MVT::v4f32) + return SelectPostLoad(Node, 3, AArch64::LD3Rv4s_POST, AArch64::qsub0); + else if (VT == MVT::v1i64 || VT == MVT::v1f64) + return SelectPostLoad(Node, 3, AArch64::LD3Rv1d_POST, AArch64::dsub0); + else if (VT == MVT::v2i64 || VT == MVT::v2f64) + return SelectPostLoad(Node, 3, AArch64::LD3Rv2d_POST, AArch64::qsub0); + break; + } + case AArch64ISD::LD4DUPpost: { + if (VT == MVT::v8i8) + return SelectPostLoad(Node, 4, AArch64::LD4Rv8b_POST, AArch64::dsub0); + else if (VT == MVT::v16i8) + return SelectPostLoad(Node, 4, AArch64::LD4Rv16b_POST, AArch64::qsub0); + else if (VT == MVT::v4i16) + return SelectPostLoad(Node, 4, AArch64::LD4Rv4h_POST, AArch64::dsub0); + else if (VT == MVT::v8i16) + return SelectPostLoad(Node, 4, AArch64::LD4Rv8h_POST, AArch64::qsub0); + else if (VT == MVT::v2i32 || VT == MVT::v2f32) + return SelectPostLoad(Node, 4, AArch64::LD4Rv2s_POST, AArch64::dsub0); + else if (VT == MVT::v4i32 || VT == MVT::v4f32) + return SelectPostLoad(Node, 4, AArch64::LD4Rv4s_POST, AArch64::qsub0); + else if (VT == MVT::v1i64 || VT == MVT::v1f64) + return SelectPostLoad(Node, 4, AArch64::LD4Rv1d_POST, AArch64::dsub0); + else if (VT == MVT::v2i64 || VT == MVT::v2f64) + return SelectPostLoad(Node, 4, AArch64::LD4Rv2d_POST, AArch64::qsub0); + break; + } + case AArch64ISD::LD1LANEpost: { + if (VT == MVT::v16i8 || VT == MVT::v8i8) + return SelectPostLoadLane(Node, 1, AArch64::LD1i8_POST); + else if (VT == MVT::v8i16 || VT == MVT::v4i16) + return SelectPostLoadLane(Node, 1, AArch64::LD1i16_POST); + else if (VT == MVT::v4i32 || VT == MVT::v2i32 || VT == MVT::v4f32 || + VT == MVT::v2f32) + return SelectPostLoadLane(Node, 1, AArch64::LD1i32_POST); + else if (VT == MVT::v2i64 || VT == MVT::v1i64 || VT == MVT::v2f64 || + VT == MVT::v1f64) + return SelectPostLoadLane(Node, 1, AArch64::LD1i64_POST); + break; + } + case AArch64ISD::LD2LANEpost: { + if (VT == MVT::v16i8 || VT == MVT::v8i8) + return SelectPostLoadLane(Node, 2, AArch64::LD2i8_POST); + else if (VT == MVT::v8i16 || VT == MVT::v4i16) + return SelectPostLoadLane(Node, 2, AArch64::LD2i16_POST); + else if (VT == MVT::v4i32 || VT == MVT::v2i32 || VT == MVT::v4f32 || + VT == MVT::v2f32) + return SelectPostLoadLane(Node, 2, AArch64::LD2i32_POST); + else if (VT == MVT::v2i64 || VT == MVT::v1i64 || VT == MVT::v2f64 || + VT == MVT::v1f64) + return SelectPostLoadLane(Node, 2, AArch64::LD2i64_POST); + break; + } + case AArch64ISD::LD3LANEpost: { + if (VT == MVT::v16i8 || VT == MVT::v8i8) + return SelectPostLoadLane(Node, 3, AArch64::LD3i8_POST); + else if (VT == MVT::v8i16 || VT == MVT::v4i16) + return SelectPostLoadLane(Node, 3, AArch64::LD3i16_POST); + else if (VT == MVT::v4i32 || VT == MVT::v2i32 || VT == MVT::v4f32 || + VT == MVT::v2f32) + return SelectPostLoadLane(Node, 3, 
AArch64::LD3i32_POST); + else if (VT == MVT::v2i64 || VT == MVT::v1i64 || VT == MVT::v2f64 || + VT == MVT::v1f64) + return SelectPostLoadLane(Node, 3, AArch64::LD3i64_POST); + break; + } + case AArch64ISD::LD4LANEpost: { + if (VT == MVT::v16i8 || VT == MVT::v8i8) + return SelectPostLoadLane(Node, 4, AArch64::LD4i8_POST); + else if (VT == MVT::v8i16 || VT == MVT::v4i16) + return SelectPostLoadLane(Node, 4, AArch64::LD4i16_POST); + else if (VT == MVT::v4i32 || VT == MVT::v2i32 || VT == MVT::v4f32 || + VT == MVT::v2f32) + return SelectPostLoadLane(Node, 4, AArch64::LD4i32_POST); + else if (VT == MVT::v2i64 || VT == MVT::v1i64 || VT == MVT::v2f64 || + VT == MVT::v1f64) + return SelectPostLoadLane(Node, 4, AArch64::LD4i64_POST); + break; + } + case AArch64ISD::ST2post: { + VT = Node->getOperand(1).getValueType(); + if (VT == MVT::v8i8) + return SelectPostStore(Node, 2, AArch64::ST2Twov8b_POST); + else if (VT == MVT::v16i8) + return SelectPostStore(Node, 2, AArch64::ST2Twov16b_POST); + else if (VT == MVT::v4i16) + return SelectPostStore(Node, 2, AArch64::ST2Twov4h_POST); + else if (VT == MVT::v8i16) + return SelectPostStore(Node, 2, AArch64::ST2Twov8h_POST); + else if (VT == MVT::v2i32 || VT == MVT::v2f32) + return SelectPostStore(Node, 2, AArch64::ST2Twov2s_POST); + else if (VT == MVT::v4i32 || VT == MVT::v4f32) + return SelectPostStore(Node, 2, AArch64::ST2Twov4s_POST); + else if (VT == MVT::v2i64 || VT == MVT::v2f64) + return SelectPostStore(Node, 2, AArch64::ST2Twov2d_POST); + else if (VT == MVT::v1i64 || VT == MVT::v1f64) + return SelectPostStore(Node, 2, AArch64::ST1Twov1d_POST); + break; + } + case AArch64ISD::ST3post: { + VT = Node->getOperand(1).getValueType(); + if (VT == MVT::v8i8) + return SelectPostStore(Node, 3, AArch64::ST3Threev8b_POST); + else if (VT == MVT::v16i8) + return SelectPostStore(Node, 3, AArch64::ST3Threev16b_POST); + else if (VT == MVT::v4i16) + return SelectPostStore(Node, 3, AArch64::ST3Threev4h_POST); + else if (VT == MVT::v8i16) + return SelectPostStore(Node, 3, AArch64::ST3Threev8h_POST); + else if (VT == MVT::v2i32 || VT == MVT::v2f32) + return SelectPostStore(Node, 3, AArch64::ST3Threev2s_POST); + else if (VT == MVT::v4i32 || VT == MVT::v4f32) + return SelectPostStore(Node, 3, AArch64::ST3Threev4s_POST); + else if (VT == MVT::v2i64 || VT == MVT::v2f64) + return SelectPostStore(Node, 3, AArch64::ST3Threev2d_POST); + else if (VT == MVT::v1i64 || VT == MVT::v1f64) + return SelectPostStore(Node, 3, AArch64::ST1Threev1d_POST); + break; + } + case AArch64ISD::ST4post: { + VT = Node->getOperand(1).getValueType(); + if (VT == MVT::v8i8) + return SelectPostStore(Node, 4, AArch64::ST4Fourv8b_POST); + else if (VT == MVT::v16i8) + return SelectPostStore(Node, 4, AArch64::ST4Fourv16b_POST); + else if (VT == MVT::v4i16) + return SelectPostStore(Node, 4, AArch64::ST4Fourv4h_POST); + else if (VT == MVT::v8i16) + return SelectPostStore(Node, 4, AArch64::ST4Fourv8h_POST); + else if (VT == MVT::v2i32 || VT == MVT::v2f32) + return SelectPostStore(Node, 4, AArch64::ST4Fourv2s_POST); + else if (VT == MVT::v4i32 || VT == MVT::v4f32) + return SelectPostStore(Node, 4, AArch64::ST4Fourv4s_POST); + else if (VT == MVT::v2i64 || VT == MVT::v2f64) + return SelectPostStore(Node, 4, AArch64::ST4Fourv2d_POST); + else if (VT == MVT::v1i64 || VT == MVT::v1f64) + return SelectPostStore(Node, 4, AArch64::ST1Fourv1d_POST); + break; + } + case AArch64ISD::ST1x2post: { + VT = Node->getOperand(1).getValueType(); + if (VT == MVT::v8i8) + return SelectPostStore(Node, 2, AArch64::ST1Twov8b_POST); + else 
if (VT == MVT::v16i8) + return SelectPostStore(Node, 2, AArch64::ST1Twov16b_POST); + else if (VT == MVT::v4i16) + return SelectPostStore(Node, 2, AArch64::ST1Twov4h_POST); + else if (VT == MVT::v8i16) + return SelectPostStore(Node, 2, AArch64::ST1Twov8h_POST); + else if (VT == MVT::v2i32 || VT == MVT::v2f32) + return SelectPostStore(Node, 2, AArch64::ST1Twov2s_POST); + else if (VT == MVT::v4i32 || VT == MVT::v4f32) + return SelectPostStore(Node, 2, AArch64::ST1Twov4s_POST); + else if (VT == MVT::v1i64 || VT == MVT::v1f64) + return SelectPostStore(Node, 2, AArch64::ST1Twov1d_POST); + else if (VT == MVT::v2i64 || VT == MVT::v2f64) + return SelectPostStore(Node, 2, AArch64::ST1Twov2d_POST); + break; + } + case AArch64ISD::ST1x3post: { + VT = Node->getOperand(1).getValueType(); + if (VT == MVT::v8i8) + return SelectPostStore(Node, 3, AArch64::ST1Threev8b_POST); + else if (VT == MVT::v16i8) + return SelectPostStore(Node, 3, AArch64::ST1Threev16b_POST); + else if (VT == MVT::v4i16) + return SelectPostStore(Node, 3, AArch64::ST1Threev4h_POST); + else if (VT == MVT::v8i16) + return SelectPostStore(Node, 3, AArch64::ST1Threev8h_POST); + else if (VT == MVT::v2i32 || VT == MVT::v2f32) + return SelectPostStore(Node, 3, AArch64::ST1Threev2s_POST); + else if (VT == MVT::v4i32 || VT == MVT::v4f32) + return SelectPostStore(Node, 3, AArch64::ST1Threev4s_POST); + else if (VT == MVT::v1i64 || VT == MVT::v1f64) + return SelectPostStore(Node, 3, AArch64::ST1Threev1d_POST); + else if (VT == MVT::v2i64 || VT == MVT::v2f64) + return SelectPostStore(Node, 3, AArch64::ST1Threev2d_POST); + break; + } + case AArch64ISD::ST1x4post: { + VT = Node->getOperand(1).getValueType(); + if (VT == MVT::v8i8) + return SelectPostStore(Node, 4, AArch64::ST1Fourv8b_POST); + else if (VT == MVT::v16i8) + return SelectPostStore(Node, 4, AArch64::ST1Fourv16b_POST); + else if (VT == MVT::v4i16) + return SelectPostStore(Node, 4, AArch64::ST1Fourv4h_POST); + else if (VT == MVT::v8i16) + return SelectPostStore(Node, 4, AArch64::ST1Fourv8h_POST); + else if (VT == MVT::v2i32 || VT == MVT::v2f32) + return SelectPostStore(Node, 4, AArch64::ST1Fourv2s_POST); + else if (VT == MVT::v4i32 || VT == MVT::v4f32) + return SelectPostStore(Node, 4, AArch64::ST1Fourv4s_POST); + else if (VT == MVT::v1i64 || VT == MVT::v1f64) + return SelectPostStore(Node, 4, AArch64::ST1Fourv1d_POST); + else if (VT == MVT::v2i64 || VT == MVT::v2f64) + return SelectPostStore(Node, 4, AArch64::ST1Fourv2d_POST); + break; + } + case AArch64ISD::ST2LANEpost: { + VT = Node->getOperand(1).getValueType(); + if (VT == MVT::v16i8 || VT == MVT::v8i8) + return SelectPostStoreLane(Node, 2, AArch64::ST2i8_POST); + else if (VT == MVT::v8i16 || VT == MVT::v4i16) + return SelectPostStoreLane(Node, 2, AArch64::ST2i16_POST); + else if (VT == MVT::v4i32 || VT == MVT::v2i32 || VT == MVT::v4f32 || + VT == MVT::v2f32) + return SelectPostStoreLane(Node, 2, AArch64::ST2i32_POST); + else if (VT == MVT::v2i64 || VT == MVT::v1i64 || VT == MVT::v2f64 || + VT == MVT::v1f64) + return SelectPostStoreLane(Node, 2, AArch64::ST2i64_POST); + break; + } + case AArch64ISD::ST3LANEpost: { + VT = Node->getOperand(1).getValueType(); + if (VT == MVT::v16i8 || VT == MVT::v8i8) + return SelectPostStoreLane(Node, 3, AArch64::ST3i8_POST); + else if (VT == MVT::v8i16 || VT == MVT::v4i16) + return SelectPostStoreLane(Node, 3, AArch64::ST3i16_POST); + else if (VT == MVT::v4i32 || VT == MVT::v2i32 || VT == MVT::v4f32 || + VT == MVT::v2f32) + return SelectPostStoreLane(Node, 3, AArch64::ST3i32_POST); + else if (VT == 
MVT::v2i64 || VT == MVT::v1i64 || VT == MVT::v2f64 || + VT == MVT::v1f64) + return SelectPostStoreLane(Node, 3, AArch64::ST3i64_POST); + break; + } + case AArch64ISD::ST4LANEpost: { + VT = Node->getOperand(1).getValueType(); + if (VT == MVT::v16i8 || VT == MVT::v8i8) + return SelectPostStoreLane(Node, 4, AArch64::ST4i8_POST); + else if (VT == MVT::v8i16 || VT == MVT::v4i16) + return SelectPostStoreLane(Node, 4, AArch64::ST4i16_POST); + else if (VT == MVT::v4i32 || VT == MVT::v2i32 || VT == MVT::v4f32 || + VT == MVT::v2f32) + return SelectPostStoreLane(Node, 4, AArch64::ST4i32_POST); + else if (VT == MVT::v2i64 || VT == MVT::v1i64 || VT == MVT::v2f64 || + VT == MVT::v1f64) + return SelectPostStoreLane(Node, 4, AArch64::ST4i64_POST); + break; + } + + case ISD::FCEIL: + case ISD::FFLOOR: + case ISD::FTRUNC: + case ISD::FROUND: + if (SDNode *I = SelectLIBM(Node)) + return I; + break; + } + + // Select the default instruction + ResNode = SelectCode(Node); + + DEBUG(errs() << "=> "); + if (ResNode == nullptr || ResNode == Node) + DEBUG(Node->dump(CurDAG)); + else + DEBUG(ResNode->dump(CurDAG)); + DEBUG(errs() << "\n"); + + return ResNode; +} + +/// createAArch64ISelDag - This pass converts a legalized DAG into a +/// AArch64-specific DAG, ready for instruction scheduling. +FunctionPass *llvm::createAArch64ISelDag(AArch64TargetMachine &TM, + CodeGenOpt::Level OptLevel) { + return new AArch64DAGToDAGISel(TM, OptLevel); +} diff --git a/lib/Target/AArch64/AArch64ISelLowering.cpp b/lib/Target/AArch64/AArch64ISelLowering.cpp new file mode 100644 index 00000000000..4ddba007339 --- /dev/null +++ b/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -0,0 +1,7926 @@ +//===-- AArch64ISelLowering.cpp - AArch64 DAG Lowering Implementation ----===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file implements the AArch64TargetLowering class. 
+// +//===----------------------------------------------------------------------===// + +#include "AArch64ISelLowering.h" +#include "AArch64PerfectShuffle.h" +#include "AArch64Subtarget.h" +#include "AArch64CallingConv.h" +#include "AArch64MachineFunctionInfo.h" +#include "AArch64TargetMachine.h" +#include "AArch64TargetObjectFile.h" +#include "MCTargetDesc/AArch64AddressingModes.h" +#include "llvm/ADT/Statistic.h" +#include "llvm/CodeGen/CallingConvLower.h" +#include "llvm/CodeGen/MachineFrameInfo.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/IR/Function.h" +#include "llvm/IR/Intrinsics.h" +#include "llvm/IR/Type.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/ErrorHandling.h" +#include "llvm/Support/raw_ostream.h" +#include "llvm/Target/TargetOptions.h" +using namespace llvm; + +#define DEBUG_TYPE "aarch64-lower" + +STATISTIC(NumTailCalls, "Number of tail calls"); +STATISTIC(NumShiftInserts, "Number of vector shift inserts"); + +enum AlignMode { + StrictAlign, + NoStrictAlign +}; + +static cl::opt +Align(cl::desc("Load/store alignment support"), + cl::Hidden, cl::init(NoStrictAlign), + cl::values( + clEnumValN(StrictAlign, "aarch64-strict-align", + "Disallow all unaligned memory accesses"), + clEnumValN(NoStrictAlign, "aarch64-no-strict-align", + "Allow unaligned memory accesses"), + clEnumValEnd)); + +// Place holder until extr generation is tested fully. +static cl::opt +EnableAArch64ExtrGeneration("aarch64-extr-generation", cl::Hidden, + cl::desc("Allow AArch64 (or (shift)(shift))->extract"), + cl::init(true)); + +static cl::opt +EnableAArch64SlrGeneration("aarch64-shift-insert-generation", cl::Hidden, + cl::desc("Allow AArch64 SLI/SRI formation"), + cl::init(false)); + +//===----------------------------------------------------------------------===// +// AArch64 Lowering public interface. +//===----------------------------------------------------------------------===// +static TargetLoweringObjectFile *createTLOF(TargetMachine &TM) { + if (TM.getSubtarget().isTargetDarwin()) + return new AArch64_MachoTargetObjectFile(); + + return new AArch64_ELFTargetObjectFile(); +} + +AArch64TargetLowering::AArch64TargetLowering(AArch64TargetMachine &TM) + : TargetLowering(TM, createTLOF(TM)) { + Subtarget = &TM.getSubtarget(); + + // AArch64 doesn't have comparisons which set GPRs or setcc instructions, so + // we have to make something up. Arbitrarily, choose ZeroOrOne. + setBooleanContents(ZeroOrOneBooleanContent); + // When comparing vectors the result sets the different elements in the + // vector to all-one or all-zero. + setBooleanVectorContents(ZeroOrNegativeOneBooleanContent); + + // Set up the register classes. + addRegisterClass(MVT::i32, &AArch64::GPR32allRegClass); + addRegisterClass(MVT::i64, &AArch64::GPR64allRegClass); + + if (Subtarget->hasFPARMv8()) { + addRegisterClass(MVT::f16, &AArch64::FPR16RegClass); + addRegisterClass(MVT::f32, &AArch64::FPR32RegClass); + addRegisterClass(MVT::f64, &AArch64::FPR64RegClass); + addRegisterClass(MVT::f128, &AArch64::FPR128RegClass); + } + + if (Subtarget->hasNEON()) { + addRegisterClass(MVT::v16i8, &AArch64::FPR8RegClass); + addRegisterClass(MVT::v8i16, &AArch64::FPR16RegClass); + // Someone set us up the NEON. 
+ addDRTypeForNEON(MVT::v2f32); + addDRTypeForNEON(MVT::v8i8); + addDRTypeForNEON(MVT::v4i16); + addDRTypeForNEON(MVT::v2i32); + addDRTypeForNEON(MVT::v1i64); + addDRTypeForNEON(MVT::v1f64); + + addQRTypeForNEON(MVT::v4f32); + addQRTypeForNEON(MVT::v2f64); + addQRTypeForNEON(MVT::v16i8); + addQRTypeForNEON(MVT::v8i16); + addQRTypeForNEON(MVT::v4i32); + addQRTypeForNEON(MVT::v2i64); + } + + // Compute derived properties from the register classes + computeRegisterProperties(); + + // Provide all sorts of operation actions + setOperationAction(ISD::GlobalAddress, MVT::i64, Custom); + setOperationAction(ISD::GlobalTLSAddress, MVT::i64, Custom); + setOperationAction(ISD::SETCC, MVT::i32, Custom); + setOperationAction(ISD::SETCC, MVT::i64, Custom); + setOperationAction(ISD::SETCC, MVT::f32, Custom); + setOperationAction(ISD::SETCC, MVT::f64, Custom); + setOperationAction(ISD::BRCOND, MVT::Other, Expand); + setOperationAction(ISD::BR_CC, MVT::i32, Custom); + setOperationAction(ISD::BR_CC, MVT::i64, Custom); + setOperationAction(ISD::BR_CC, MVT::f32, Custom); + setOperationAction(ISD::BR_CC, MVT::f64, Custom); + setOperationAction(ISD::SELECT, MVT::i32, Custom); + setOperationAction(ISD::SELECT, MVT::i64, Custom); + setOperationAction(ISD::SELECT, MVT::f32, Custom); + setOperationAction(ISD::SELECT, MVT::f64, Custom); + setOperationAction(ISD::SELECT_CC, MVT::i32, Custom); + setOperationAction(ISD::SELECT_CC, MVT::i64, Custom); + setOperationAction(ISD::SELECT_CC, MVT::f32, Custom); + setOperationAction(ISD::SELECT_CC, MVT::f64, Custom); + setOperationAction(ISD::BR_JT, MVT::Other, Expand); + setOperationAction(ISD::JumpTable, MVT::i64, Custom); + + setOperationAction(ISD::SHL_PARTS, MVT::i64, Custom); + setOperationAction(ISD::SRA_PARTS, MVT::i64, Custom); + setOperationAction(ISD::SRL_PARTS, MVT::i64, Custom); + + setOperationAction(ISD::FREM, MVT::f32, Expand); + setOperationAction(ISD::FREM, MVT::f64, Expand); + setOperationAction(ISD::FREM, MVT::f80, Expand); + + // Custom lowering hooks are needed for XOR + // to fold it into CSINC/CSINV. + setOperationAction(ISD::XOR, MVT::i32, Custom); + setOperationAction(ISD::XOR, MVT::i64, Custom); + + // Virtually no operation on f128 is legal, but LLVM can't expand them when + // there's a valid register class, so we need custom operations in most cases. + setOperationAction(ISD::FABS, MVT::f128, Expand); + setOperationAction(ISD::FADD, MVT::f128, Custom); + setOperationAction(ISD::FCOPYSIGN, MVT::f128, Expand); + setOperationAction(ISD::FCOS, MVT::f128, Expand); + setOperationAction(ISD::FDIV, MVT::f128, Custom); + setOperationAction(ISD::FMA, MVT::f128, Expand); + setOperationAction(ISD::FMUL, MVT::f128, Custom); + setOperationAction(ISD::FNEG, MVT::f128, Expand); + setOperationAction(ISD::FPOW, MVT::f128, Expand); + setOperationAction(ISD::FREM, MVT::f128, Expand); + setOperationAction(ISD::FRINT, MVT::f128, Expand); + setOperationAction(ISD::FSIN, MVT::f128, Expand); + setOperationAction(ISD::FSINCOS, MVT::f128, Expand); + setOperationAction(ISD::FSQRT, MVT::f128, Expand); + setOperationAction(ISD::FSUB, MVT::f128, Custom); + setOperationAction(ISD::FTRUNC, MVT::f128, Expand); + setOperationAction(ISD::SETCC, MVT::f128, Custom); + setOperationAction(ISD::BR_CC, MVT::f128, Custom); + setOperationAction(ISD::SELECT, MVT::f128, Custom); + setOperationAction(ISD::SELECT_CC, MVT::f128, Custom); + setOperationAction(ISD::FP_EXTEND, MVT::f128, Custom); + + // Lowering for many of the conversions is actually specified by the non-f128 + // type. 
The LowerXXX function will be trivial when f128 isn't involved. + setOperationAction(ISD::FP_TO_SINT, MVT::i32, Custom); + setOperationAction(ISD::FP_TO_SINT, MVT::i64, Custom); + setOperationAction(ISD::FP_TO_SINT, MVT::i128, Custom); + setOperationAction(ISD::FP_TO_UINT, MVT::i32, Custom); + setOperationAction(ISD::FP_TO_UINT, MVT::i64, Custom); + setOperationAction(ISD::FP_TO_UINT, MVT::i128, Custom); + setOperationAction(ISD::SINT_TO_FP, MVT::i32, Custom); + setOperationAction(ISD::SINT_TO_FP, MVT::i64, Custom); + setOperationAction(ISD::SINT_TO_FP, MVT::i128, Custom); + setOperationAction(ISD::UINT_TO_FP, MVT::i32, Custom); + setOperationAction(ISD::UINT_TO_FP, MVT::i64, Custom); + setOperationAction(ISD::UINT_TO_FP, MVT::i128, Custom); + setOperationAction(ISD::FP_ROUND, MVT::f32, Custom); + setOperationAction(ISD::FP_ROUND, MVT::f64, Custom); + + // Variable arguments. + setOperationAction(ISD::VASTART, MVT::Other, Custom); + setOperationAction(ISD::VAARG, MVT::Other, Custom); + setOperationAction(ISD::VACOPY, MVT::Other, Custom); + setOperationAction(ISD::VAEND, MVT::Other, Expand); + + // Variable-sized objects. + setOperationAction(ISD::STACKSAVE, MVT::Other, Expand); + setOperationAction(ISD::STACKRESTORE, MVT::Other, Expand); + setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i64, Expand); + + // Exception handling. + // FIXME: These are guesses. Has this been defined yet? + setExceptionPointerRegister(AArch64::X0); + setExceptionSelectorRegister(AArch64::X1); + + // Constant pool entries + setOperationAction(ISD::ConstantPool, MVT::i64, Custom); + + // BlockAddress + setOperationAction(ISD::BlockAddress, MVT::i64, Custom); + + // Add/Sub overflow ops with MVT::Glues are lowered to NZCV dependences. + setOperationAction(ISD::ADDC, MVT::i32, Custom); + setOperationAction(ISD::ADDE, MVT::i32, Custom); + setOperationAction(ISD::SUBC, MVT::i32, Custom); + setOperationAction(ISD::SUBE, MVT::i32, Custom); + setOperationAction(ISD::ADDC, MVT::i64, Custom); + setOperationAction(ISD::ADDE, MVT::i64, Custom); + setOperationAction(ISD::SUBC, MVT::i64, Custom); + setOperationAction(ISD::SUBE, MVT::i64, Custom); + + // AArch64 lacks both left-rotate and popcount instructions. + setOperationAction(ISD::ROTL, MVT::i32, Expand); + setOperationAction(ISD::ROTL, MVT::i64, Expand); + + // AArch64 doesn't have {U|S}MUL_LOHI. + setOperationAction(ISD::UMUL_LOHI, MVT::i64, Expand); + setOperationAction(ISD::SMUL_LOHI, MVT::i64, Expand); + + + // Expand the undefined-at-zero variants to cttz/ctlz to their defined-at-zero + // counterparts, which AArch64 supports directly. + setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i32, Expand); + setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i32, Expand); + setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i64, Expand); + setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i64, Expand); + + setOperationAction(ISD::CTPOP, MVT::i32, Custom); + setOperationAction(ISD::CTPOP, MVT::i64, Custom); + + setOperationAction(ISD::SDIVREM, MVT::i32, Expand); + setOperationAction(ISD::SDIVREM, MVT::i64, Expand); + setOperationAction(ISD::SREM, MVT::i32, Expand); + setOperationAction(ISD::SREM, MVT::i64, Expand); + setOperationAction(ISD::UDIVREM, MVT::i32, Expand); + setOperationAction(ISD::UDIVREM, MVT::i64, Expand); + setOperationAction(ISD::UREM, MVT::i32, Expand); + setOperationAction(ISD::UREM, MVT::i64, Expand); + + // Custom lower Add/Sub/Mul with overflow. 
+ setOperationAction(ISD::SADDO, MVT::i32, Custom); + setOperationAction(ISD::SADDO, MVT::i64, Custom); + setOperationAction(ISD::UADDO, MVT::i32, Custom); + setOperationAction(ISD::UADDO, MVT::i64, Custom); + setOperationAction(ISD::SSUBO, MVT::i32, Custom); + setOperationAction(ISD::SSUBO, MVT::i64, Custom); + setOperationAction(ISD::USUBO, MVT::i32, Custom); + setOperationAction(ISD::USUBO, MVT::i64, Custom); + setOperationAction(ISD::SMULO, MVT::i32, Custom); + setOperationAction(ISD::SMULO, MVT::i64, Custom); + setOperationAction(ISD::UMULO, MVT::i32, Custom); + setOperationAction(ISD::UMULO, MVT::i64, Custom); + + setOperationAction(ISD::FSIN, MVT::f32, Expand); + setOperationAction(ISD::FSIN, MVT::f64, Expand); + setOperationAction(ISD::FCOS, MVT::f32, Expand); + setOperationAction(ISD::FCOS, MVT::f64, Expand); + setOperationAction(ISD::FPOW, MVT::f32, Expand); + setOperationAction(ISD::FPOW, MVT::f64, Expand); + setOperationAction(ISD::FCOPYSIGN, MVT::f64, Custom); + setOperationAction(ISD::FCOPYSIGN, MVT::f32, Custom); + + // AArch64 has implementations of a lot of rounding-like FP operations. + static MVT RoundingTypes[] = { MVT::f32, MVT::f64}; + for (unsigned I = 0; I < array_lengthof(RoundingTypes); ++I) { + MVT Ty = RoundingTypes[I]; + setOperationAction(ISD::FFLOOR, Ty, Legal); + setOperationAction(ISD::FNEARBYINT, Ty, Legal); + setOperationAction(ISD::FCEIL, Ty, Legal); + setOperationAction(ISD::FRINT, Ty, Legal); + setOperationAction(ISD::FTRUNC, Ty, Legal); + setOperationAction(ISD::FROUND, Ty, Legal); + } + + setOperationAction(ISD::PREFETCH, MVT::Other, Custom); + + if (Subtarget->isTargetMachO()) { + // For iOS, we don't want to the normal expansion of a libcall to + // sincos. We want to issue a libcall to __sincos_stret to avoid memory + // traffic. + setOperationAction(ISD::FSINCOS, MVT::f64, Custom); + setOperationAction(ISD::FSINCOS, MVT::f32, Custom); + } else { + setOperationAction(ISD::FSINCOS, MVT::f64, Expand); + setOperationAction(ISD::FSINCOS, MVT::f32, Expand); + } + + // AArch64 does not have floating-point extending loads, i1 sign-extending + // load, floating-point truncating stores, or v2i32->v2i16 truncating store. + setLoadExtAction(ISD::EXTLOAD, MVT::f32, Expand); + setLoadExtAction(ISD::EXTLOAD, MVT::f64, Expand); + setLoadExtAction(ISD::EXTLOAD, MVT::f80, Expand); + setLoadExtAction(ISD::SEXTLOAD, MVT::i1, Expand); + setTruncStoreAction(MVT::f32, MVT::f16, Expand); + setTruncStoreAction(MVT::f64, MVT::f32, Expand); + setTruncStoreAction(MVT::f64, MVT::f16, Expand); + setTruncStoreAction(MVT::f128, MVT::f80, Expand); + setTruncStoreAction(MVT::f128, MVT::f64, Expand); + setTruncStoreAction(MVT::f128, MVT::f32, Expand); + setTruncStoreAction(MVT::f128, MVT::f16, Expand); + // Indexed loads and stores are supported. + for (unsigned im = (unsigned)ISD::PRE_INC; + im != (unsigned)ISD::LAST_INDEXED_MODE; ++im) { + setIndexedLoadAction(im, MVT::i8, Legal); + setIndexedLoadAction(im, MVT::i16, Legal); + setIndexedLoadAction(im, MVT::i32, Legal); + setIndexedLoadAction(im, MVT::i64, Legal); + setIndexedLoadAction(im, MVT::f64, Legal); + setIndexedLoadAction(im, MVT::f32, Legal); + setIndexedStoreAction(im, MVT::i8, Legal); + setIndexedStoreAction(im, MVT::i16, Legal); + setIndexedStoreAction(im, MVT::i32, Legal); + setIndexedStoreAction(im, MVT::i64, Legal); + setIndexedStoreAction(im, MVT::f64, Legal); + setIndexedStoreAction(im, MVT::f32, Legal); + } + + // Trap. 
+ setOperationAction(ISD::TRAP, MVT::Other, Legal); + + // We combine OR nodes for bitfield operations. + setTargetDAGCombine(ISD::OR); + + // Vector add and sub nodes may conceal a high-half opportunity. + // Also, try to fold ADD into CSINC/CSINV.. + setTargetDAGCombine(ISD::ADD); + setTargetDAGCombine(ISD::SUB); + + setTargetDAGCombine(ISD::XOR); + setTargetDAGCombine(ISD::SINT_TO_FP); + setTargetDAGCombine(ISD::UINT_TO_FP); + + setTargetDAGCombine(ISD::INTRINSIC_WO_CHAIN); + + setTargetDAGCombine(ISD::ANY_EXTEND); + setTargetDAGCombine(ISD::ZERO_EXTEND); + setTargetDAGCombine(ISD::SIGN_EXTEND); + setTargetDAGCombine(ISD::BITCAST); + setTargetDAGCombine(ISD::CONCAT_VECTORS); + setTargetDAGCombine(ISD::STORE); + + setTargetDAGCombine(ISD::MUL); + + setTargetDAGCombine(ISD::SELECT); + setTargetDAGCombine(ISD::VSELECT); + + setTargetDAGCombine(ISD::INTRINSIC_VOID); + setTargetDAGCombine(ISD::INTRINSIC_W_CHAIN); + setTargetDAGCombine(ISD::INSERT_VECTOR_ELT); + + MaxStoresPerMemset = MaxStoresPerMemsetOptSize = 8; + MaxStoresPerMemcpy = MaxStoresPerMemcpyOptSize = 4; + MaxStoresPerMemmove = MaxStoresPerMemmoveOptSize = 4; + + setStackPointerRegisterToSaveRestore(AArch64::SP); + + setSchedulingPreference(Sched::Hybrid); + + // Enable TBZ/TBNZ + MaskAndBranchFoldingIsLegal = true; + + setMinFunctionAlignment(2); + + RequireStrictAlign = (Align == StrictAlign); + + setHasExtractBitsInsn(true); + + if (Subtarget->hasNEON()) { + // FIXME: v1f64 shouldn't be legal if we can avoid it, because it leads to + // silliness like this: + setOperationAction(ISD::FABS, MVT::v1f64, Expand); + setOperationAction(ISD::FADD, MVT::v1f64, Expand); + setOperationAction(ISD::FCEIL, MVT::v1f64, Expand); + setOperationAction(ISD::FCOPYSIGN, MVT::v1f64, Expand); + setOperationAction(ISD::FCOS, MVT::v1f64, Expand); + setOperationAction(ISD::FDIV, MVT::v1f64, Expand); + setOperationAction(ISD::FFLOOR, MVT::v1f64, Expand); + setOperationAction(ISD::FMA, MVT::v1f64, Expand); + setOperationAction(ISD::FMUL, MVT::v1f64, Expand); + setOperationAction(ISD::FNEARBYINT, MVT::v1f64, Expand); + setOperationAction(ISD::FNEG, MVT::v1f64, Expand); + setOperationAction(ISD::FPOW, MVT::v1f64, Expand); + setOperationAction(ISD::FREM, MVT::v1f64, Expand); + setOperationAction(ISD::FROUND, MVT::v1f64, Expand); + setOperationAction(ISD::FRINT, MVT::v1f64, Expand); + setOperationAction(ISD::FSIN, MVT::v1f64, Expand); + setOperationAction(ISD::FSINCOS, MVT::v1f64, Expand); + setOperationAction(ISD::FSQRT, MVT::v1f64, Expand); + setOperationAction(ISD::FSUB, MVT::v1f64, Expand); + setOperationAction(ISD::FTRUNC, MVT::v1f64, Expand); + setOperationAction(ISD::SETCC, MVT::v1f64, Expand); + setOperationAction(ISD::BR_CC, MVT::v1f64, Expand); + setOperationAction(ISD::SELECT, MVT::v1f64, Expand); + setOperationAction(ISD::SELECT_CC, MVT::v1f64, Expand); + setOperationAction(ISD::FP_EXTEND, MVT::v1f64, Expand); + + setOperationAction(ISD::FP_TO_SINT, MVT::v1i64, Expand); + setOperationAction(ISD::FP_TO_UINT, MVT::v1i64, Expand); + setOperationAction(ISD::SINT_TO_FP, MVT::v1i64, Expand); + setOperationAction(ISD::UINT_TO_FP, MVT::v1i64, Expand); + setOperationAction(ISD::FP_ROUND, MVT::v1f64, Expand); + + setOperationAction(ISD::MUL, MVT::v1i64, Expand); + + // AArch64 doesn't have a direct vector ->f32 conversion instructions for + // elements smaller than i32, so promote the input to i32 first. 
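// As a scalar analogy for the promotion above (a sketch of the intent, not of
// the exact code the backend emits): each 8- or 16-bit lane is first widened
// to i32 (USHLL/SSHLL in practice) and only then converted with the ordinary
// i32 -> f32 instruction (UCVTF/SCVTF).
#include <cstdint>
static void u8_lanes_to_f32(const uint8_t in[4], float out[4]) {
  for (int i = 0; i != 4; ++i) {
    uint32_t widened = in[i];   // the "Promote" step: zero-extend the lane
    out[i] = (float)widened;    // legal i32 -> f32 conversion
  }
}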
+ setOperationAction(ISD::UINT_TO_FP, MVT::v4i8, Promote); + setOperationAction(ISD::SINT_TO_FP, MVT::v4i8, Promote); + setOperationAction(ISD::UINT_TO_FP, MVT::v4i16, Promote); + setOperationAction(ISD::SINT_TO_FP, MVT::v4i16, Promote); + // Similarly, there is no direct i32 -> f64 vector conversion instruction. + setOperationAction(ISD::SINT_TO_FP, MVT::v2i32, Custom); + setOperationAction(ISD::UINT_TO_FP, MVT::v2i32, Custom); + setOperationAction(ISD::SINT_TO_FP, MVT::v2i64, Custom); + setOperationAction(ISD::UINT_TO_FP, MVT::v2i64, Custom); + + // AArch64 doesn't have MUL.2d: + setOperationAction(ISD::MUL, MVT::v2i64, Expand); + setOperationAction(ISD::ANY_EXTEND, MVT::v4i32, Legal); + setTruncStoreAction(MVT::v2i32, MVT::v2i16, Expand); + // Likewise, narrowing and extending vector loads/stores aren't handled + // directly. + for (unsigned VT = (unsigned)MVT::FIRST_VECTOR_VALUETYPE; + VT <= (unsigned)MVT::LAST_VECTOR_VALUETYPE; ++VT) { + + setOperationAction(ISD::SIGN_EXTEND_INREG, (MVT::SimpleValueType)VT, + Expand); + + setOperationAction(ISD::MULHS, (MVT::SimpleValueType)VT, Expand); + setOperationAction(ISD::SMUL_LOHI, (MVT::SimpleValueType)VT, Expand); + setOperationAction(ISD::MULHU, (MVT::SimpleValueType)VT, Expand); + setOperationAction(ISD::UMUL_LOHI, (MVT::SimpleValueType)VT, Expand); + + setOperationAction(ISD::BSWAP, (MVT::SimpleValueType)VT, Expand); + + for (unsigned InnerVT = (unsigned)MVT::FIRST_VECTOR_VALUETYPE; + InnerVT <= (unsigned)MVT::LAST_VECTOR_VALUETYPE; ++InnerVT) + setTruncStoreAction((MVT::SimpleValueType)VT, + (MVT::SimpleValueType)InnerVT, Expand); + setLoadExtAction(ISD::SEXTLOAD, (MVT::SimpleValueType)VT, Expand); + setLoadExtAction(ISD::ZEXTLOAD, (MVT::SimpleValueType)VT, Expand); + setLoadExtAction(ISD::EXTLOAD, (MVT::SimpleValueType)VT, Expand); + } + + // AArch64 has implementations of a lot of rounding-like FP operations. + static MVT RoundingVecTypes[] = {MVT::v2f32, MVT::v4f32, MVT::v2f64 }; + for (unsigned I = 0; I < array_lengthof(RoundingVecTypes); ++I) { + MVT Ty = RoundingVecTypes[I]; + setOperationAction(ISD::FFLOOR, Ty, Legal); + setOperationAction(ISD::FNEARBYINT, Ty, Legal); + setOperationAction(ISD::FCEIL, Ty, Legal); + setOperationAction(ISD::FRINT, Ty, Legal); + setOperationAction(ISD::FTRUNC, Ty, Legal); + setOperationAction(ISD::FROUND, Ty, Legal); + } + } +} + +void AArch64TargetLowering::addTypeForNEON(EVT VT, EVT PromotedBitwiseVT) { + if (VT == MVT::v2f32) { + setOperationAction(ISD::LOAD, VT.getSimpleVT(), Promote); + AddPromotedToType(ISD::LOAD, VT.getSimpleVT(), MVT::v2i32); + + setOperationAction(ISD::STORE, VT.getSimpleVT(), Promote); + AddPromotedToType(ISD::STORE, VT.getSimpleVT(), MVT::v2i32); + } else if (VT == MVT::v2f64 || VT == MVT::v4f32) { + setOperationAction(ISD::LOAD, VT.getSimpleVT(), Promote); + AddPromotedToType(ISD::LOAD, VT.getSimpleVT(), MVT::v2i64); + + setOperationAction(ISD::STORE, VT.getSimpleVT(), Promote); + AddPromotedToType(ISD::STORE, VT.getSimpleVT(), MVT::v2i64); + } + + // Mark vector float intrinsics as expand. 
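// Rough model of what "Expand" means for the vector FP intrinsics marked just
// below: there is no vector sin/cos/log/exp instruction, so the legalizer
// unrolls the operation and each lane ends up as an ordinary scalar libm call
// (sketch, assuming a v4f32 llvm.sin).
#include <cmath>
static void sin_v4f32(const float in[4], float out[4]) {
  for (int i = 0; i != 4; ++i)
    out[i] = sinf(in[i]);       // one scalar libcall per lane
}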
+ if (VT == MVT::v2f32 || VT == MVT::v4f32 || VT == MVT::v2f64) { + setOperationAction(ISD::FSIN, VT.getSimpleVT(), Expand); + setOperationAction(ISD::FCOS, VT.getSimpleVT(), Expand); + setOperationAction(ISD::FPOWI, VT.getSimpleVT(), Expand); + setOperationAction(ISD::FPOW, VT.getSimpleVT(), Expand); + setOperationAction(ISD::FLOG, VT.getSimpleVT(), Expand); + setOperationAction(ISD::FLOG2, VT.getSimpleVT(), Expand); + setOperationAction(ISD::FLOG10, VT.getSimpleVT(), Expand); + setOperationAction(ISD::FEXP, VT.getSimpleVT(), Expand); + setOperationAction(ISD::FEXP2, VT.getSimpleVT(), Expand); + } + + setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT.getSimpleVT(), Custom); + setOperationAction(ISD::INSERT_VECTOR_ELT, VT.getSimpleVT(), Custom); + setOperationAction(ISD::BUILD_VECTOR, VT.getSimpleVT(), Custom); + setOperationAction(ISD::VECTOR_SHUFFLE, VT.getSimpleVT(), Custom); + setOperationAction(ISD::EXTRACT_SUBVECTOR, VT.getSimpleVT(), Custom); + setOperationAction(ISD::SRA, VT.getSimpleVT(), Custom); + setOperationAction(ISD::SRL, VT.getSimpleVT(), Custom); + setOperationAction(ISD::SHL, VT.getSimpleVT(), Custom); + setOperationAction(ISD::AND, VT.getSimpleVT(), Custom); + setOperationAction(ISD::OR, VT.getSimpleVT(), Custom); + setOperationAction(ISD::SETCC, VT.getSimpleVT(), Custom); + setOperationAction(ISD::CONCAT_VECTORS, VT.getSimpleVT(), Legal); + + setOperationAction(ISD::SELECT, VT.getSimpleVT(), Expand); + setOperationAction(ISD::SELECT_CC, VT.getSimpleVT(), Expand); + setOperationAction(ISD::VSELECT, VT.getSimpleVT(), Expand); + setLoadExtAction(ISD::EXTLOAD, VT.getSimpleVT(), Expand); + + // CNT supports only B element sizes. + if (VT != MVT::v8i8 && VT != MVT::v16i8) + setOperationAction(ISD::CTPOP, VT.getSimpleVT(), Expand); + + setOperationAction(ISD::UDIV, VT.getSimpleVT(), Expand); + setOperationAction(ISD::SDIV, VT.getSimpleVT(), Expand); + setOperationAction(ISD::UREM, VT.getSimpleVT(), Expand); + setOperationAction(ISD::SREM, VT.getSimpleVT(), Expand); + setOperationAction(ISD::FREM, VT.getSimpleVT(), Expand); + + setOperationAction(ISD::FP_TO_SINT, VT.getSimpleVT(), Custom); + setOperationAction(ISD::FP_TO_UINT, VT.getSimpleVT(), Custom); + + if (Subtarget->isLittleEndian()) { + for (unsigned im = (unsigned)ISD::PRE_INC; + im != (unsigned)ISD::LAST_INDEXED_MODE; ++im) { + setIndexedLoadAction(im, VT.getSimpleVT(), Legal); + setIndexedStoreAction(im, VT.getSimpleVT(), Legal); + } + } +} + +void AArch64TargetLowering::addDRTypeForNEON(MVT VT) { + addRegisterClass(VT, &AArch64::FPR64RegClass); + addTypeForNEON(VT, MVT::v2i32); +} + +void AArch64TargetLowering::addQRTypeForNEON(MVT VT) { + addRegisterClass(VT, &AArch64::FPR128RegClass); + addTypeForNEON(VT, MVT::v4i32); +} + +EVT AArch64TargetLowering::getSetCCResultType(LLVMContext &, EVT VT) const { + if (!VT.isVector()) + return MVT::i32; + return VT.changeVectorElementTypeToInteger(); +} + +/// computeKnownBitsForTargetNode - Determine which of the bits specified in +/// Mask are known to be either zero or one and return them in the +/// KnownZero/KnownOne bitsets. 
+void AArch64TargetLowering::computeKnownBitsForTargetNode(
+    const SDValue Op, APInt &KnownZero, APInt &KnownOne,
+    const SelectionDAG &DAG, unsigned Depth) const {
+  switch (Op.getOpcode()) {
+  default:
+    break;
+  case AArch64ISD::CSEL: {
+    APInt KnownZero2, KnownOne2;
+    DAG.computeKnownBits(Op->getOperand(0), KnownZero, KnownOne, Depth + 1);
+    DAG.computeKnownBits(Op->getOperand(1), KnownZero2, KnownOne2, Depth + 1);
+    KnownZero &= KnownZero2;
+    KnownOne &= KnownOne2;
+    break;
+  }
+  case ISD::INTRINSIC_W_CHAIN: {
+    ConstantSDNode *CN = cast<ConstantSDNode>(Op->getOperand(1));
+    Intrinsic::ID IntID = static_cast<Intrinsic::ID>(CN->getZExtValue());
+    switch (IntID) {
+    default: return;
+    case Intrinsic::aarch64_ldaxr:
+    case Intrinsic::aarch64_ldxr: {
+      unsigned BitWidth = KnownOne.getBitWidth();
+      EVT VT = cast<MemIntrinsicSDNode>(Op)->getMemoryVT();
+      unsigned MemBits = VT.getScalarType().getSizeInBits();
+      KnownZero |= APInt::getHighBitsSet(BitWidth, BitWidth - MemBits);
+      return;
+    }
+    }
+    break;
+  }
+  case ISD::INTRINSIC_WO_CHAIN:
+  case ISD::INTRINSIC_VOID: {
+    unsigned IntNo = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
+    switch (IntNo) {
+    default:
+      break;
+    case Intrinsic::aarch64_neon_umaxv:
+    case Intrinsic::aarch64_neon_uminv: {
+      // Figure out the datatype of the vector operand. The UMINV instruction
+      // will zero extend the result, so we can mark as known zero all the
+      // bits larger than the element datatype. 32-bit or larger doesn't need
+      // this as those are legal types and will be handled by isel directly.
+      MVT VT = Op.getOperand(1).getValueType().getSimpleVT();
+      unsigned BitWidth = KnownZero.getBitWidth();
+      if (VT == MVT::v8i8 || VT == MVT::v16i8) {
+        assert(BitWidth >= 8 && "Unexpected width!");
+        APInt Mask = APInt::getHighBitsSet(BitWidth, BitWidth - 8);
+        KnownZero |= Mask;
+      } else if (VT == MVT::v4i16 || VT == MVT::v8i16) {
+        assert(BitWidth >= 16 && "Unexpected width!");
+        APInt Mask = APInt::getHighBitsSet(BitWidth, BitWidth - 16);
+        KnownZero |= Mask;
+      }
+      break;
+    } break;
+    }
+  }
+  }
+}
+
+MVT AArch64TargetLowering::getScalarShiftAmountTy(EVT LHSTy) const {
+  return MVT::i64;
+}
+
+unsigned AArch64TargetLowering::getMaximalGlobalOffset() const {
+  // FIXME: On AArch64, this depends on the type.
+  // Basically, the addressable offsets are 0 to 4095 * Ty.getSizeInBytes(),
+  // and the offset has to be a multiple of the related size in bytes.
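// Worked example, assuming an i64 access (Ty.getSizeInBytes() == 8): the
// scaled 12-bit immediate reaches byte offsets 0, 8, 16, ..., 4095 * 8, so
// anything beyond 32760, or not a multiple of 8, has to be materialised in a
// register instead.
static_assert(4095 * 8 == 32760, "largest scaled offset for an 8-byte access");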
+ return 4095; +} + +FastISel * +AArch64TargetLowering::createFastISel(FunctionLoweringInfo &funcInfo, + const TargetLibraryInfo *libInfo) const { + return AArch64::createFastISel(funcInfo, libInfo); +} + +const char *AArch64TargetLowering::getTargetNodeName(unsigned Opcode) const { + switch (Opcode) { + default: + return nullptr; + case AArch64ISD::CALL: return "AArch64ISD::CALL"; + case AArch64ISD::ADRP: return "AArch64ISD::ADRP"; + case AArch64ISD::ADDlow: return "AArch64ISD::ADDlow"; + case AArch64ISD::LOADgot: return "AArch64ISD::LOADgot"; + case AArch64ISD::RET_FLAG: return "AArch64ISD::RET_FLAG"; + case AArch64ISD::BRCOND: return "AArch64ISD::BRCOND"; + case AArch64ISD::CSEL: return "AArch64ISD::CSEL"; + case AArch64ISD::FCSEL: return "AArch64ISD::FCSEL"; + case AArch64ISD::CSINV: return "AArch64ISD::CSINV"; + case AArch64ISD::CSNEG: return "AArch64ISD::CSNEG"; + case AArch64ISD::CSINC: return "AArch64ISD::CSINC"; + case AArch64ISD::THREAD_POINTER: return "AArch64ISD::THREAD_POINTER"; + case AArch64ISD::TLSDESC_CALL: return "AArch64ISD::TLSDESC_CALL"; + case AArch64ISD::ADC: return "AArch64ISD::ADC"; + case AArch64ISD::SBC: return "AArch64ISD::SBC"; + case AArch64ISD::ADDS: return "AArch64ISD::ADDS"; + case AArch64ISD::SUBS: return "AArch64ISD::SUBS"; + case AArch64ISD::ADCS: return "AArch64ISD::ADCS"; + case AArch64ISD::SBCS: return "AArch64ISD::SBCS"; + case AArch64ISD::ANDS: return "AArch64ISD::ANDS"; + case AArch64ISD::FCMP: return "AArch64ISD::FCMP"; + case AArch64ISD::FMIN: return "AArch64ISD::FMIN"; + case AArch64ISD::FMAX: return "AArch64ISD::FMAX"; + case AArch64ISD::DUP: return "AArch64ISD::DUP"; + case AArch64ISD::DUPLANE8: return "AArch64ISD::DUPLANE8"; + case AArch64ISD::DUPLANE16: return "AArch64ISD::DUPLANE16"; + case AArch64ISD::DUPLANE32: return "AArch64ISD::DUPLANE32"; + case AArch64ISD::DUPLANE64: return "AArch64ISD::DUPLANE64"; + case AArch64ISD::MOVI: return "AArch64ISD::MOVI"; + case AArch64ISD::MOVIshift: return "AArch64ISD::MOVIshift"; + case AArch64ISD::MOVIedit: return "AArch64ISD::MOVIedit"; + case AArch64ISD::MOVImsl: return "AArch64ISD::MOVImsl"; + case AArch64ISD::FMOV: return "AArch64ISD::FMOV"; + case AArch64ISD::MVNIshift: return "AArch64ISD::MVNIshift"; + case AArch64ISD::MVNImsl: return "AArch64ISD::MVNImsl"; + case AArch64ISD::BICi: return "AArch64ISD::BICi"; + case AArch64ISD::ORRi: return "AArch64ISD::ORRi"; + case AArch64ISD::BSL: return "AArch64ISD::BSL"; + case AArch64ISD::NEG: return "AArch64ISD::NEG"; + case AArch64ISD::EXTR: return "AArch64ISD::EXTR"; + case AArch64ISD::ZIP1: return "AArch64ISD::ZIP1"; + case AArch64ISD::ZIP2: return "AArch64ISD::ZIP2"; + case AArch64ISD::UZP1: return "AArch64ISD::UZP1"; + case AArch64ISD::UZP2: return "AArch64ISD::UZP2"; + case AArch64ISD::TRN1: return "AArch64ISD::TRN1"; + case AArch64ISD::TRN2: return "AArch64ISD::TRN2"; + case AArch64ISD::REV16: return "AArch64ISD::REV16"; + case AArch64ISD::REV32: return "AArch64ISD::REV32"; + case AArch64ISD::REV64: return "AArch64ISD::REV64"; + case AArch64ISD::EXT: return "AArch64ISD::EXT"; + case AArch64ISD::VSHL: return "AArch64ISD::VSHL"; + case AArch64ISD::VLSHR: return "AArch64ISD::VLSHR"; + case AArch64ISD::VASHR: return "AArch64ISD::VASHR"; + case AArch64ISD::CMEQ: return "AArch64ISD::CMEQ"; + case AArch64ISD::CMGE: return "AArch64ISD::CMGE"; + case AArch64ISD::CMGT: return "AArch64ISD::CMGT"; + case AArch64ISD::CMHI: return "AArch64ISD::CMHI"; + case AArch64ISD::CMHS: return "AArch64ISD::CMHS"; + case AArch64ISD::FCMEQ: return "AArch64ISD::FCMEQ"; + case 
AArch64ISD::FCMGE: return "AArch64ISD::FCMGE"; + case AArch64ISD::FCMGT: return "AArch64ISD::FCMGT"; + case AArch64ISD::CMEQz: return "AArch64ISD::CMEQz"; + case AArch64ISD::CMGEz: return "AArch64ISD::CMGEz"; + case AArch64ISD::CMGTz: return "AArch64ISD::CMGTz"; + case AArch64ISD::CMLEz: return "AArch64ISD::CMLEz"; + case AArch64ISD::CMLTz: return "AArch64ISD::CMLTz"; + case AArch64ISD::FCMEQz: return "AArch64ISD::FCMEQz"; + case AArch64ISD::FCMGEz: return "AArch64ISD::FCMGEz"; + case AArch64ISD::FCMGTz: return "AArch64ISD::FCMGTz"; + case AArch64ISD::FCMLEz: return "AArch64ISD::FCMLEz"; + case AArch64ISD::FCMLTz: return "AArch64ISD::FCMLTz"; + case AArch64ISD::NOT: return "AArch64ISD::NOT"; + case AArch64ISD::BIT: return "AArch64ISD::BIT"; + case AArch64ISD::CBZ: return "AArch64ISD::CBZ"; + case AArch64ISD::CBNZ: return "AArch64ISD::CBNZ"; + case AArch64ISD::TBZ: return "AArch64ISD::TBZ"; + case AArch64ISD::TBNZ: return "AArch64ISD::TBNZ"; + case AArch64ISD::TC_RETURN: return "AArch64ISD::TC_RETURN"; + case AArch64ISD::SITOF: return "AArch64ISD::SITOF"; + case AArch64ISD::UITOF: return "AArch64ISD::UITOF"; + case AArch64ISD::SQSHL_I: return "AArch64ISD::SQSHL_I"; + case AArch64ISD::UQSHL_I: return "AArch64ISD::UQSHL_I"; + case AArch64ISD::SRSHR_I: return "AArch64ISD::SRSHR_I"; + case AArch64ISD::URSHR_I: return "AArch64ISD::URSHR_I"; + case AArch64ISD::SQSHLU_I: return "AArch64ISD::SQSHLU_I"; + case AArch64ISD::WrapperLarge: return "AArch64ISD::WrapperLarge"; + case AArch64ISD::LD2post: return "AArch64ISD::LD2post"; + case AArch64ISD::LD3post: return "AArch64ISD::LD3post"; + case AArch64ISD::LD4post: return "AArch64ISD::LD4post"; + case AArch64ISD::ST2post: return "AArch64ISD::ST2post"; + case AArch64ISD::ST3post: return "AArch64ISD::ST3post"; + case AArch64ISD::ST4post: return "AArch64ISD::ST4post"; + case AArch64ISD::LD1x2post: return "AArch64ISD::LD1x2post"; + case AArch64ISD::LD1x3post: return "AArch64ISD::LD1x3post"; + case AArch64ISD::LD1x4post: return "AArch64ISD::LD1x4post"; + case AArch64ISD::ST1x2post: return "AArch64ISD::ST1x2post"; + case AArch64ISD::ST1x3post: return "AArch64ISD::ST1x3post"; + case AArch64ISD::ST1x4post: return "AArch64ISD::ST1x4post"; + case AArch64ISD::LD1DUPpost: return "AArch64ISD::LD1DUPpost"; + case AArch64ISD::LD2DUPpost: return "AArch64ISD::LD2DUPpost"; + case AArch64ISD::LD3DUPpost: return "AArch64ISD::LD3DUPpost"; + case AArch64ISD::LD4DUPpost: return "AArch64ISD::LD4DUPpost"; + case AArch64ISD::LD1LANEpost: return "AArch64ISD::LD1LANEpost"; + case AArch64ISD::LD2LANEpost: return "AArch64ISD::LD2LANEpost"; + case AArch64ISD::LD3LANEpost: return "AArch64ISD::LD3LANEpost"; + case AArch64ISD::LD4LANEpost: return "AArch64ISD::LD4LANEpost"; + case AArch64ISD::ST2LANEpost: return "AArch64ISD::ST2LANEpost"; + case AArch64ISD::ST3LANEpost: return "AArch64ISD::ST3LANEpost"; + case AArch64ISD::ST4LANEpost: return "AArch64ISD::ST4LANEpost"; + } +} + +MachineBasicBlock * +AArch64TargetLowering::EmitF128CSEL(MachineInstr *MI, + MachineBasicBlock *MBB) const { + // We materialise the F128CSEL pseudo-instruction as some control flow and a + // phi node: + + // OrigBB: + // [... previous instrs leading to comparison ...] 
+ // b.ne TrueBB + // b EndBB + // TrueBB: + // ; Fallthrough + // EndBB: + // Dest = PHI [IfTrue, TrueBB], [IfFalse, OrigBB] + + const TargetInstrInfo *TII = getTargetMachine().getInstrInfo(); + MachineFunction *MF = MBB->getParent(); + const BasicBlock *LLVM_BB = MBB->getBasicBlock(); + DebugLoc DL = MI->getDebugLoc(); + MachineFunction::iterator It = MBB; + ++It; + + unsigned DestReg = MI->getOperand(0).getReg(); + unsigned IfTrueReg = MI->getOperand(1).getReg(); + unsigned IfFalseReg = MI->getOperand(2).getReg(); + unsigned CondCode = MI->getOperand(3).getImm(); + bool NZCVKilled = MI->getOperand(4).isKill(); + + MachineBasicBlock *TrueBB = MF->CreateMachineBasicBlock(LLVM_BB); + MachineBasicBlock *EndBB = MF->CreateMachineBasicBlock(LLVM_BB); + MF->insert(It, TrueBB); + MF->insert(It, EndBB); + + // Transfer rest of current basic-block to EndBB + EndBB->splice(EndBB->begin(), MBB, std::next(MachineBasicBlock::iterator(MI)), + MBB->end()); + EndBB->transferSuccessorsAndUpdatePHIs(MBB); + + BuildMI(MBB, DL, TII->get(AArch64::Bcc)).addImm(CondCode).addMBB(TrueBB); + BuildMI(MBB, DL, TII->get(AArch64::B)).addMBB(EndBB); + MBB->addSuccessor(TrueBB); + MBB->addSuccessor(EndBB); + + // TrueBB falls through to the end. + TrueBB->addSuccessor(EndBB); + + if (!NZCVKilled) { + TrueBB->addLiveIn(AArch64::NZCV); + EndBB->addLiveIn(AArch64::NZCV); + } + + BuildMI(*EndBB, EndBB->begin(), DL, TII->get(AArch64::PHI), DestReg) + .addReg(IfTrueReg) + .addMBB(TrueBB) + .addReg(IfFalseReg) + .addMBB(MBB); + + MI->eraseFromParent(); + return EndBB; +} + +MachineBasicBlock * +AArch64TargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI, + MachineBasicBlock *BB) const { + switch (MI->getOpcode()) { + default: +#ifndef NDEBUG + MI->dump(); +#endif + assert(0 && "Unexpected instruction for custom inserter!"); + break; + + case AArch64::F128CSEL: + return EmitF128CSEL(MI, BB); + + case TargetOpcode::STACKMAP: + case TargetOpcode::PATCHPOINT: + return emitPatchPoint(MI, BB); + } + llvm_unreachable("Unexpected instruction for custom inserter!"); +} + +//===----------------------------------------------------------------------===// +// AArch64 Lowering private implementation. +//===----------------------------------------------------------------------===// + +//===----------------------------------------------------------------------===// +// Lowering Code +//===----------------------------------------------------------------------===// + +/// changeIntCCToAArch64CC - Convert a DAG integer condition code to an AArch64 +/// CC +static AArch64CC::CondCode changeIntCCToAArch64CC(ISD::CondCode CC) { + switch (CC) { + default: + llvm_unreachable("Unknown condition code!"); + case ISD::SETNE: + return AArch64CC::NE; + case ISD::SETEQ: + return AArch64CC::EQ; + case ISD::SETGT: + return AArch64CC::GT; + case ISD::SETGE: + return AArch64CC::GE; + case ISD::SETLT: + return AArch64CC::LT; + case ISD::SETLE: + return AArch64CC::LE; + case ISD::SETUGT: + return AArch64CC::HI; + case ISD::SETUGE: + return AArch64CC::HS; + case ISD::SETULT: + return AArch64CC::LO; + case ISD::SETULE: + return AArch64CC::LS; + } +} + +/// changeFPCCToAArch64CC - Convert a DAG fp condition code to an AArch64 CC. 
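/// For example, SETONE ("ordered and not equal") has no single NZCV predicate
/// after an FCMP, so it is returned as two codes the caller tests in turn:
/// MI (LHS < RHS) and then GT (LHS > RHS); unordered inputs fail both.
/// SETUEQ similarly becomes EQ plus VS. A rough scalar model of those two
/// decompositions (illustrative helpers only):
///   bool one(double a, double b) { return (a < b) || (a > b); }
///   bool ueq(double a, double b) { return (a == b) || a != a || b != b; }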
+static void changeFPCCToAArch64CC(ISD::CondCode CC, + AArch64CC::CondCode &CondCode, + AArch64CC::CondCode &CondCode2) { + CondCode2 = AArch64CC::AL; + switch (CC) { + default: + llvm_unreachable("Unknown FP condition!"); + case ISD::SETEQ: + case ISD::SETOEQ: + CondCode = AArch64CC::EQ; + break; + case ISD::SETGT: + case ISD::SETOGT: + CondCode = AArch64CC::GT; + break; + case ISD::SETGE: + case ISD::SETOGE: + CondCode = AArch64CC::GE; + break; + case ISD::SETOLT: + CondCode = AArch64CC::MI; + break; + case ISD::SETOLE: + CondCode = AArch64CC::LS; + break; + case ISD::SETONE: + CondCode = AArch64CC::MI; + CondCode2 = AArch64CC::GT; + break; + case ISD::SETO: + CondCode = AArch64CC::VC; + break; + case ISD::SETUO: + CondCode = AArch64CC::VS; + break; + case ISD::SETUEQ: + CondCode = AArch64CC::EQ; + CondCode2 = AArch64CC::VS; + break; + case ISD::SETUGT: + CondCode = AArch64CC::HI; + break; + case ISD::SETUGE: + CondCode = AArch64CC::PL; + break; + case ISD::SETLT: + case ISD::SETULT: + CondCode = AArch64CC::LT; + break; + case ISD::SETLE: + case ISD::SETULE: + CondCode = AArch64CC::LE; + break; + case ISD::SETNE: + case ISD::SETUNE: + CondCode = AArch64CC::NE; + break; + } +} + +/// changeVectorFPCCToAArch64CC - Convert a DAG fp condition code to an AArch64 +/// CC usable with the vector instructions. Fewer operations are available +/// without a real NZCV register, so we have to use less efficient combinations +/// to get the same effect. +static void changeVectorFPCCToAArch64CC(ISD::CondCode CC, + AArch64CC::CondCode &CondCode, + AArch64CC::CondCode &CondCode2, + bool &Invert) { + Invert = false; + switch (CC) { + default: + // Mostly the scalar mappings work fine. + changeFPCCToAArch64CC(CC, CondCode, CondCode2); + break; + case ISD::SETUO: + Invert = true; // Fallthrough + case ISD::SETO: + CondCode = AArch64CC::MI; + CondCode2 = AArch64CC::GE; + break; + case ISD::SETUEQ: + case ISD::SETULT: + case ISD::SETULE: + case ISD::SETUGT: + case ISD::SETUGE: + // All of the compare-mask comparisons are ordered, but we can switch + // between the two by a double inversion. E.g. ULE == !OGT. + Invert = true; + changeFPCCToAArch64CC(getSetCCInverse(CC, false), CondCode, CondCode2); + break; + } +} + +static bool isLegalArithImmed(uint64_t C) { + // Matches AArch64DAGToDAGISel::SelectArithImmed(). + return (C >> 12 == 0) || ((C & 0xFFFULL) == 0 && C >> 24 == 0); +} + +static SDValue emitComparison(SDValue LHS, SDValue RHS, ISD::CondCode CC, + SDLoc dl, SelectionDAG &DAG) { + EVT VT = LHS.getValueType(); + + if (VT.isFloatingPoint()) + return DAG.getNode(AArch64ISD::FCMP, dl, VT, LHS, RHS); + + // The CMP instruction is just an alias for SUBS, and representing it as + // SUBS means that it's possible to get CSE with subtract operations. + // A later phase can perform the optimization of setting the destination + // register to WZR/XZR if it ends up being unused. + unsigned Opcode = AArch64ISD::SUBS; + + if (RHS.getOpcode() == ISD::SUB && isa(RHS.getOperand(0)) && + cast(RHS.getOperand(0))->getZExtValue() == 0 && + (CC == ISD::SETEQ || CC == ISD::SETNE)) { + // We'd like to combine a (CMP op1, (sub 0, op2) into a CMN instruction on + // the grounds that "op1 - (-op2) == op1 + op2". However, the C and V flags + // can be set differently by this operation. It comes down to whether + // "SInt(~op2)+1 == SInt(~op2+1)" (and the same for UInt). If they are then + // everything is fine. If not then the optimization is wrong. Thus general + // comparisons are only valid if op2 != 0. 
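// A standalone sketch of why the fold is restricted this way: model the NZCV
// flags of SUBS (CMP) and ADDS (CMN) and note that with op2 == 0 the two agree
// on Z (so EQ/NE survive) but disagree on C, so an unsigned predicate such as
// HS would change meaning. The helper names below are made up for the sketch.
#include <cstdint>
struct NZCV { bool N, Z, C, V; };
static NZCV subs32(uint32_t a, uint32_t b) {              // CMP a, b
  uint32_t r = a - b;
  return { (int32_t)r < 0, r == 0, a >= b,                // C = no borrow
           (int32_t)((a ^ b) & (a ^ r)) < 0 };
}
static NZCV adds32(uint32_t a, uint32_t b) {              // CMN a, b
  uint32_t r = a + b;
  return { (int32_t)r < 0, r == 0, r < a,                 // C = carry out
           (int32_t)(~(a ^ b) & (a ^ r)) < 0 };
}
// With a = 5, op2 = 0: subs32(5, 0 - 0).C == 1 but adds32(5, 0).C == 0, while
// both report Z == 0, which is exactly the restriction described above.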
+ + // So, finally, the only LLVM-native comparisons that don't mention C and V + // are SETEQ and SETNE. They're the only ones we can safely use CMN for in + // the absence of information about op2. + Opcode = AArch64ISD::ADDS; + RHS = RHS.getOperand(1); + } else if (LHS.getOpcode() == ISD::AND && isa(RHS) && + cast(RHS)->getZExtValue() == 0 && + !isUnsignedIntSetCC(CC)) { + // Similarly, (CMP (and X, Y), 0) can be implemented with a TST + // (a.k.a. ANDS) except that the flags are only guaranteed to work for one + // of the signed comparisons. + Opcode = AArch64ISD::ANDS; + RHS = LHS.getOperand(1); + LHS = LHS.getOperand(0); + } + + return DAG.getNode(Opcode, dl, DAG.getVTList(VT, MVT::i32), LHS, RHS) + .getValue(1); +} + +static SDValue getAArch64Cmp(SDValue LHS, SDValue RHS, ISD::CondCode CC, + SDValue &AArch64cc, SelectionDAG &DAG, SDLoc dl) { + if (ConstantSDNode *RHSC = dyn_cast(RHS.getNode())) { + EVT VT = RHS.getValueType(); + uint64_t C = RHSC->getZExtValue(); + if (!isLegalArithImmed(C)) { + // Constant does not fit, try adjusting it by one? + switch (CC) { + default: + break; + case ISD::SETLT: + case ISD::SETGE: + if ((VT == MVT::i32 && C != 0x80000000 && + isLegalArithImmed((uint32_t)(C - 1))) || + (VT == MVT::i64 && C != 0x80000000ULL && + isLegalArithImmed(C - 1ULL))) { + CC = (CC == ISD::SETLT) ? ISD::SETLE : ISD::SETGT; + C = (VT == MVT::i32) ? (uint32_t)(C - 1) : C - 1; + RHS = DAG.getConstant(C, VT); + } + break; + case ISD::SETULT: + case ISD::SETUGE: + if ((VT == MVT::i32 && C != 0 && + isLegalArithImmed((uint32_t)(C - 1))) || + (VT == MVT::i64 && C != 0ULL && isLegalArithImmed(C - 1ULL))) { + CC = (CC == ISD::SETULT) ? ISD::SETULE : ISD::SETUGT; + C = (VT == MVT::i32) ? (uint32_t)(C - 1) : C - 1; + RHS = DAG.getConstant(C, VT); + } + break; + case ISD::SETLE: + case ISD::SETGT: + if ((VT == MVT::i32 && C != 0x7fffffff && + isLegalArithImmed((uint32_t)(C + 1))) || + (VT == MVT::i64 && C != 0x7ffffffffffffffULL && + isLegalArithImmed(C + 1ULL))) { + CC = (CC == ISD::SETLE) ? ISD::SETLT : ISD::SETGE; + C = (VT == MVT::i32) ? (uint32_t)(C + 1) : C + 1; + RHS = DAG.getConstant(C, VT); + } + break; + case ISD::SETULE: + case ISD::SETUGT: + if ((VT == MVT::i32 && C != 0xffffffff && + isLegalArithImmed((uint32_t)(C + 1))) || + (VT == MVT::i64 && C != 0xfffffffffffffffULL && + isLegalArithImmed(C + 1ULL))) { + CC = (CC == ISD::SETULE) ? ISD::SETULT : ISD::SETUGE; + C = (VT == MVT::i32) ? (uint32_t)(C + 1) : C + 1; + RHS = DAG.getConstant(C, VT); + } + break; + } + } + } + + SDValue Cmp = emitComparison(LHS, RHS, CC, dl, DAG); + AArch64CC::CondCode AArch64CC = changeIntCCToAArch64CC(CC); + AArch64cc = DAG.getConstant(AArch64CC, MVT::i32); + return Cmp; +} + +static std::pair +getAArch64XALUOOp(AArch64CC::CondCode &CC, SDValue Op, SelectionDAG &DAG) { + assert((Op.getValueType() == MVT::i32 || Op.getValueType() == MVT::i64) && + "Unsupported value type"); + SDValue Value, Overflow; + SDLoc DL(Op); + SDValue LHS = Op.getOperand(0); + SDValue RHS = Op.getOperand(1); + unsigned Opc = 0; + switch (Op.getOpcode()) { + default: + llvm_unreachable("Unknown overflow instruction!"); + case ISD::SADDO: + Opc = AArch64ISD::ADDS; + CC = AArch64CC::VS; + break; + case ISD::UADDO: + Opc = AArch64ISD::ADDS; + CC = AArch64CC::HS; + break; + case ISD::SSUBO: + Opc = AArch64ISD::SUBS; + CC = AArch64CC::VS; + break; + case ISD::USUBO: + Opc = AArch64ISD::SUBS; + CC = AArch64CC::LO; + break; + // Multiply needs a little bit extra work. 
+ case ISD::SMULO: + case ISD::UMULO: { + CC = AArch64CC::NE; + bool IsSigned = (Op.getOpcode() == ISD::SMULO) ? true : false; + if (Op.getValueType() == MVT::i32) { + unsigned ExtendOpc = IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND; + // For a 32 bit multiply with overflow check we want the instruction + // selector to generate a widening multiply (SMADDL/UMADDL). For that we + // need to generate the following pattern: + // (i64 add 0, (i64 mul (i64 sext|zext i32 %a), (i64 sext|zext i32 %b)) + LHS = DAG.getNode(ExtendOpc, DL, MVT::i64, LHS); + RHS = DAG.getNode(ExtendOpc, DL, MVT::i64, RHS); + SDValue Mul = DAG.getNode(ISD::MUL, DL, MVT::i64, LHS, RHS); + SDValue Add = DAG.getNode(ISD::ADD, DL, MVT::i64, Mul, + DAG.getConstant(0, MVT::i64)); + // On AArch64 the upper 32 bits are always zero extended for a 32 bit + // operation. We need to clear out the upper 32 bits, because we used a + // widening multiply that wrote all 64 bits. In the end this should be a + // noop. + Value = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, Add); + if (IsSigned) { + // The signed overflow check requires more than just a simple check for + // any bit set in the upper 32 bits of the result. These bits could be + // just the sign bits of a negative number. To perform the overflow + // check we have to arithmetic shift right the 32nd bit of the result by + // 31 bits. Then we compare the result to the upper 32 bits. + SDValue UpperBits = DAG.getNode(ISD::SRL, DL, MVT::i64, Add, + DAG.getConstant(32, MVT::i64)); + UpperBits = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, UpperBits); + SDValue LowerBits = DAG.getNode(ISD::SRA, DL, MVT::i32, Value, + DAG.getConstant(31, MVT::i64)); + // It is important that LowerBits is last, otherwise the arithmetic + // shift will not be folded into the compare (SUBS). + SDVTList VTs = DAG.getVTList(MVT::i32, MVT::i32); + Overflow = DAG.getNode(AArch64ISD::SUBS, DL, VTs, UpperBits, LowerBits) + .getValue(1); + } else { + // The overflow check for unsigned multiply is easy. We only need to + // check if any of the upper 32 bits are set. This can be done with a + // CMP (shifted register). For that we need to generate the following + // pattern: + // (i64 AArch64ISD::SUBS i64 0, (i64 srl i64 %Mul, i64 32) + SDValue UpperBits = DAG.getNode(ISD::SRL, DL, MVT::i64, Mul, + DAG.getConstant(32, MVT::i64)); + SDVTList VTs = DAG.getVTList(MVT::i64, MVT::i32); + Overflow = + DAG.getNode(AArch64ISD::SUBS, DL, VTs, DAG.getConstant(0, MVT::i64), + UpperBits).getValue(1); + } + break; + } + assert(Op.getValueType() == MVT::i64 && "Expected an i64 value type"); + // For the 64 bit multiply + Value = DAG.getNode(ISD::MUL, DL, MVT::i64, LHS, RHS); + if (IsSigned) { + SDValue UpperBits = DAG.getNode(ISD::MULHS, DL, MVT::i64, LHS, RHS); + SDValue LowerBits = DAG.getNode(ISD::SRA, DL, MVT::i64, Value, + DAG.getConstant(63, MVT::i64)); + // It is important that LowerBits is last, otherwise the arithmetic + // shift will not be folded into the compare (SUBS). + SDVTList VTs = DAG.getVTList(MVT::i64, MVT::i32); + Overflow = DAG.getNode(AArch64ISD::SUBS, DL, VTs, UpperBits, LowerBits) + .getValue(1); + } else { + SDValue UpperBits = DAG.getNode(ISD::MULHU, DL, MVT::i64, LHS, RHS); + SDVTList VTs = DAG.getVTList(MVT::i64, MVT::i32); + Overflow = + DAG.getNode(AArch64ISD::SUBS, DL, VTs, DAG.getConstant(0, MVT::i64), + UpperBits).getValue(1); + } + break; + } + } // switch (...) + + if (Opc) { + SDVTList VTs = DAG.getVTList(Op->getValueType(0), MVT::i32); + + // Emit the AArch64 operation with overflow check. 
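// Standalone sketch of the i32 SMULO check described above: widen to a 64-bit
// product, then compare its high half against the sign bits of the low half
// (ASR #31); they differ exactly when the multiply overflowed i32. The helper
// name and the sample values are made up for illustration.
#include <cstdint>
static bool smulo32(int32_t a, int32_t b, int32_t &lo) {
  int64_t p = (int64_t)a * b;           // widening multiply (SMULL/SMADDL)
  lo = (int32_t)p;                      // truncated i32 result
  int32_t upper = (int32_t)(p >> 32);   // "UpperBits"
  int32_t sign  = lo >> 31;             // "LowerBits": arithmetic shift by 31
  return upper != sign;                 // folded into the SUBS + NE above
}
// smulo32(0x40000000, 2, lo)  -> true  (2^31 does not fit in i32)
// smulo32(-0x40000000, 2, lo) -> false (-2^31 does)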
+ Value = DAG.getNode(Opc, DL, VTs, LHS, RHS); + Overflow = Value.getValue(1); + } + return std::make_pair(Value, Overflow); +} + +SDValue AArch64TargetLowering::LowerF128Call(SDValue Op, SelectionDAG &DAG, + RTLIB::Libcall Call) const { + SmallVector Ops; + for (unsigned i = 0, e = Op->getNumOperands(); i != e; ++i) + Ops.push_back(Op.getOperand(i)); + + return makeLibCall(DAG, Call, MVT::f128, &Ops[0], Ops.size(), false, + SDLoc(Op)).first; +} + +static SDValue LowerXOR(SDValue Op, SelectionDAG &DAG) { + SDValue Sel = Op.getOperand(0); + SDValue Other = Op.getOperand(1); + + // If neither operand is a SELECT_CC, give up. + if (Sel.getOpcode() != ISD::SELECT_CC) + std::swap(Sel, Other); + if (Sel.getOpcode() != ISD::SELECT_CC) + return Op; + + // The folding we want to perform is: + // (xor x, (select_cc a, b, cc, 0, -1) ) + // --> + // (csel x, (xor x, -1), cc ...) + // + // The latter will get matched to a CSINV instruction. + + ISD::CondCode CC = cast(Sel.getOperand(4))->get(); + SDValue LHS = Sel.getOperand(0); + SDValue RHS = Sel.getOperand(1); + SDValue TVal = Sel.getOperand(2); + SDValue FVal = Sel.getOperand(3); + SDLoc dl(Sel); + + // FIXME: This could be generalized to non-integer comparisons. + if (LHS.getValueType() != MVT::i32 && LHS.getValueType() != MVT::i64) + return Op; + + ConstantSDNode *CFVal = dyn_cast(FVal); + ConstantSDNode *CTVal = dyn_cast(TVal); + + // The the values aren't constants, this isn't the pattern we're looking for. + if (!CFVal || !CTVal) + return Op; + + // We can commute the SELECT_CC by inverting the condition. This + // might be needed to make this fit into a CSINV pattern. + if (CTVal->isAllOnesValue() && CFVal->isNullValue()) { + std::swap(TVal, FVal); + std::swap(CTVal, CFVal); + CC = ISD::getSetCCInverse(CC, true); + } + + // If the constants line up, perform the transform! + if (CTVal->isNullValue() && CFVal->isAllOnesValue()) { + SDValue CCVal; + SDValue Cmp = getAArch64Cmp(LHS, RHS, CC, CCVal, DAG, dl); + + FVal = Other; + TVal = DAG.getNode(ISD::XOR, dl, Other.getValueType(), Other, + DAG.getConstant(-1ULL, Other.getValueType())); + + return DAG.getNode(AArch64ISD::CSEL, dl, Sel.getValueType(), FVal, TVal, + CCVal, Cmp); + } + + return Op; +} + +static SDValue LowerADDC_ADDE_SUBC_SUBE(SDValue Op, SelectionDAG &DAG) { + EVT VT = Op.getValueType(); + + // Let legalize expand this if it isn't a legal type yet. + if (!DAG.getTargetLoweringInfo().isTypeLegal(VT)) + return SDValue(); + + SDVTList VTs = DAG.getVTList(VT, MVT::i32); + + unsigned Opc; + bool ExtraOp = false; + switch (Op.getOpcode()) { + default: + assert(0 && "Invalid code"); + case ISD::ADDC: + Opc = AArch64ISD::ADDS; + break; + case ISD::SUBC: + Opc = AArch64ISD::SUBS; + break; + case ISD::ADDE: + Opc = AArch64ISD::ADCS; + ExtraOp = true; + break; + case ISD::SUBE: + Opc = AArch64ISD::SBCS; + ExtraOp = true; + break; + } + + if (!ExtraOp) + return DAG.getNode(Opc, SDLoc(Op), VTs, Op.getOperand(0), Op.getOperand(1)); + return DAG.getNode(Opc, SDLoc(Op), VTs, Op.getOperand(0), Op.getOperand(1), + Op.getOperand(2)); +} + +static SDValue LowerXALUO(SDValue Op, SelectionDAG &DAG) { + // Let legalize expand this if it isn't a legal type yet. + if (!DAG.getTargetLoweringInfo().isTypeLegal(Op.getValueType())) + return SDValue(); + + AArch64CC::CondCode CC; + // The actual operation that sets the overflow or carry flag. + SDValue Value, Overflow; + std::tie(Value, Overflow) = getAArch64XALUOOp(CC, Op, DAG); + + // We use 0 and 1 as false and true values. 
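// Sketch of the selection this sets up: CSINC Wd, WZR, WZR, cond computes
// "cond ? 0 : 0 + 1", so selecting on the inverted overflow condition leaves
// 1 in the register exactly when the operation overflowed. Helper names here
// are made up.
static unsigned csinc(unsigned a, unsigned b, bool cond) {
  return cond ? a : b + 1;                            // CSINC semantics
}
static unsigned overflowBit(bool overflowed) {
  return csinc(0, 0, /*invert(cond)=*/!overflowed);   // 1 if overflowed, else 0
}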
+ SDValue TVal = DAG.getConstant(1, MVT::i32); + SDValue FVal = DAG.getConstant(0, MVT::i32); + + // We use an inverted condition, because the conditional select is inverted + // too. This will allow it to be selected to a single instruction: + // CSINC Wd, WZR, WZR, invert(cond). + SDValue CCVal = DAG.getConstant(getInvertedCondCode(CC), MVT::i32); + Overflow = DAG.getNode(AArch64ISD::CSEL, SDLoc(Op), MVT::i32, FVal, TVal, + CCVal, Overflow); + + SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32); + return DAG.getNode(ISD::MERGE_VALUES, SDLoc(Op), VTs, Value, Overflow); +} + +// Prefetch operands are: +// 1: Address to prefetch +// 2: bool isWrite +// 3: int locality (0 = no locality ... 3 = extreme locality) +// 4: bool isDataCache +static SDValue LowerPREFETCH(SDValue Op, SelectionDAG &DAG) { + SDLoc DL(Op); + unsigned IsWrite = cast(Op.getOperand(2))->getZExtValue(); + unsigned Locality = cast(Op.getOperand(3))->getZExtValue(); + // The data thing is not used. + // unsigned isData = cast(Op.getOperand(4))->getZExtValue(); + + bool IsStream = !Locality; + // When the locality number is set + if (Locality) { + // The front-end should have filtered out the out-of-range values + assert(Locality <= 3 && "Prefetch locality out-of-range"); + // The locality degree is the opposite of the cache speed. + // Put the number the other way around. + // The encoding starts at 0 for level 1 + Locality = 3 - Locality; + } + + // built the mask value encoding the expected behavior. + unsigned PrfOp = (IsWrite << 4) | // Load/Store bit + (Locality << 1) | // Cache level bits + (unsigned)IsStream; // Stream bit + return DAG.getNode(AArch64ISD::PREFETCH, DL, MVT::Other, Op.getOperand(0), + DAG.getConstant(PrfOp, MVT::i32), Op.getOperand(1)); +} + +SDValue AArch64TargetLowering::LowerFP_EXTEND(SDValue Op, + SelectionDAG &DAG) const { + assert(Op.getValueType() == MVT::f128 && "Unexpected lowering"); + + RTLIB::Libcall LC; + LC = RTLIB::getFPEXT(Op.getOperand(0).getValueType(), Op.getValueType()); + + return LowerF128Call(Op, DAG, LC); +} + +SDValue AArch64TargetLowering::LowerFP_ROUND(SDValue Op, + SelectionDAG &DAG) const { + if (Op.getOperand(0).getValueType() != MVT::f128) { + // It's legal except when f128 is involved + return Op; + } + + RTLIB::Libcall LC; + LC = RTLIB::getFPROUND(Op.getOperand(0).getValueType(), Op.getValueType()); + + // FP_ROUND node has a second operand indicating whether it is known to be + // precise. That doesn't take part in the LibCall so we can't directly use + // LowerF128Call. + SDValue SrcVal = Op.getOperand(0); + return makeLibCall(DAG, LC, Op.getValueType(), &SrcVal, 1, + /*isSigned*/ false, SDLoc(Op)).first; +} + +static SDValue LowerVectorFP_TO_INT(SDValue Op, SelectionDAG &DAG) { + // Warning: We maintain cost tables in AArch64TargetTransformInfo.cpp. + // Any additional optimization in this function should be recorded + // in the cost tables. + EVT InVT = Op.getOperand(0).getValueType(); + EVT VT = Op.getValueType(); + + // FP_TO_XINT conversion from the same type are legal. 
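// Worked example of the PrfOp immediate assembled in LowerPREFETCH above:
// (IsWrite << 4) | (Locality << 1) | IsStream, with the locality degree
// flipped so that locality 3 ("keep it close") selects cache level 1. The
// helper below and the PRFM operand names in its comments are for
// illustration only.
static unsigned prfOp(bool isWrite, unsigned locality) {
  bool isStream = locality == 0;
  if (locality)
    locality = 3 - locality;              // 3 -> L1, 2 -> L2, 1 -> L3
  return ((unsigned)isWrite << 4) | (locality << 1) | (unsigned)isStream;
}
// prfOp(false, 3) == 0   (read,  L1, keep   -- PLDL1KEEP)
// prfOp(false, 0) == 1   (read,  L1, stream -- PLDL1STRM)
// prfOp(true,  1) == 20  (write, L3, keep   -- PSTL3KEEP)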
+ if (VT.getSizeInBits() == InVT.getSizeInBits()) + return Op; + + if (InVT == MVT::v2f64 || InVT == MVT::v4f32) { + SDLoc dl(Op); + SDValue Cv = + DAG.getNode(Op.getOpcode(), dl, InVT.changeVectorElementTypeToInteger(), + Op.getOperand(0)); + return DAG.getNode(ISD::TRUNCATE, dl, VT, Cv); + } else if (InVT == MVT::v2f32) { + SDLoc dl(Op); + SDValue Ext = DAG.getNode(ISD::FP_EXTEND, dl, MVT::v2f64, Op.getOperand(0)); + return DAG.getNode(Op.getOpcode(), dl, VT, Ext); + } + + // Type changing conversions are illegal. + return SDValue(); +} + +SDValue AArch64TargetLowering::LowerFP_TO_INT(SDValue Op, + SelectionDAG &DAG) const { + if (Op.getOperand(0).getValueType().isVector()) + return LowerVectorFP_TO_INT(Op, DAG); + + if (Op.getOperand(0).getValueType() != MVT::f128) { + // It's legal except when f128 is involved + return Op; + } + + RTLIB::Libcall LC; + if (Op.getOpcode() == ISD::FP_TO_SINT) + LC = RTLIB::getFPTOSINT(Op.getOperand(0).getValueType(), Op.getValueType()); + else + LC = RTLIB::getFPTOUINT(Op.getOperand(0).getValueType(), Op.getValueType()); + + SmallVector Ops; + for (unsigned i = 0, e = Op->getNumOperands(); i != e; ++i) + Ops.push_back(Op.getOperand(i)); + + return makeLibCall(DAG, LC, Op.getValueType(), &Ops[0], Ops.size(), false, + SDLoc(Op)).first; +} + +static SDValue LowerVectorINT_TO_FP(SDValue Op, SelectionDAG &DAG) { + // Warning: We maintain cost tables in AArch64TargetTransformInfo.cpp. + // Any additional optimization in this function should be recorded + // in the cost tables. + EVT VT = Op.getValueType(); + SDLoc dl(Op); + SDValue In = Op.getOperand(0); + EVT InVT = In.getValueType(); + + // v2i32 to v2f32 is legal. + if (VT == MVT::v2f32 && InVT == MVT::v2i32) + return Op; + + // This function only handles v2f64 outputs. + if (VT == MVT::v2f64) { + // Extend the input argument to a v2i64 that we can feed into the + // floating point conversion. Zero or sign extend based on whether + // we're doing a signed or unsigned float conversion. + unsigned Opc = + Op.getOpcode() == ISD::UINT_TO_FP ? ISD::ZERO_EXTEND : ISD::SIGN_EXTEND; + assert(Op.getNumOperands() == 1 && "FP conversions take one argument"); + SDValue Promoted = DAG.getNode(Opc, dl, MVT::v2i64, Op.getOperand(0)); + return DAG.getNode(Op.getOpcode(), dl, Op.getValueType(), Promoted); + } + + // Scalarize v2i64 to v2f32 conversions. + std::vector BuildVectorOps; + for (unsigned i = 0; i < VT.getVectorNumElements(); ++i) { + SDValue Sclr = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i64, In, + DAG.getConstant(i, MVT::i64)); + Sclr = DAG.getNode(Op->getOpcode(), dl, MVT::f32, Sclr); + BuildVectorOps.push_back(Sclr); + } + + return DAG.getNode(ISD::BUILD_VECTOR, dl, VT, BuildVectorOps); +} + +SDValue AArch64TargetLowering::LowerINT_TO_FP(SDValue Op, + SelectionDAG &DAG) const { + if (Op.getValueType().isVector()) + return LowerVectorINT_TO_FP(Op, DAG); + + // i128 conversions are libcalls. + if (Op.getOperand(0).getValueType() == MVT::i128) + return SDValue(); + + // Other conversions are legal, unless it's to the completely software-based + // fp128. 
+ if (Op.getValueType() != MVT::f128) + return Op; + + RTLIB::Libcall LC; + if (Op.getOpcode() == ISD::SINT_TO_FP) + LC = RTLIB::getSINTTOFP(Op.getOperand(0).getValueType(), Op.getValueType()); + else + LC = RTLIB::getUINTTOFP(Op.getOperand(0).getValueType(), Op.getValueType()); + + return LowerF128Call(Op, DAG, LC); +} + +SDValue AArch64TargetLowering::LowerFSINCOS(SDValue Op, + SelectionDAG &DAG) const { + // For iOS, we want to call an alternative entry point: __sincos_stret, + // which returns the values in two S / D registers. + SDLoc dl(Op); + SDValue Arg = Op.getOperand(0); + EVT ArgVT = Arg.getValueType(); + Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext()); + + ArgListTy Args; + ArgListEntry Entry; + + Entry.Node = Arg; + Entry.Ty = ArgTy; + Entry.isSExt = false; + Entry.isZExt = false; + Args.push_back(Entry); + + const char *LibcallName = + (ArgVT == MVT::f64) ? "__sincos_stret" : "__sincosf_stret"; + SDValue Callee = DAG.getExternalSymbol(LibcallName, getPointerTy()); + + StructType *RetTy = StructType::get(ArgTy, ArgTy, NULL); + TargetLowering::CallLoweringInfo CLI(DAG); + CLI.setDebugLoc(dl).setChain(DAG.getEntryNode()) + .setCallee(CallingConv::Fast, RetTy, Callee, &Args, 0); + + std::pair CallResult = LowerCallTo(CLI); + return CallResult.first; +} + +SDValue AArch64TargetLowering::LowerOperation(SDValue Op, + SelectionDAG &DAG) const { + switch (Op.getOpcode()) { + default: + llvm_unreachable("unimplemented operand"); + return SDValue(); + case ISD::GlobalAddress: + return LowerGlobalAddress(Op, DAG); + case ISD::GlobalTLSAddress: + return LowerGlobalTLSAddress(Op, DAG); + case ISD::SETCC: + return LowerSETCC(Op, DAG); + case ISD::BR_CC: + return LowerBR_CC(Op, DAG); + case ISD::SELECT: + return LowerSELECT(Op, DAG); + case ISD::SELECT_CC: + return LowerSELECT_CC(Op, DAG); + case ISD::JumpTable: + return LowerJumpTable(Op, DAG); + case ISD::ConstantPool: + return LowerConstantPool(Op, DAG); + case ISD::BlockAddress: + return LowerBlockAddress(Op, DAG); + case ISD::VASTART: + return LowerVASTART(Op, DAG); + case ISD::VACOPY: + return LowerVACOPY(Op, DAG); + case ISD::VAARG: + return LowerVAARG(Op, DAG); + case ISD::ADDC: + case ISD::ADDE: + case ISD::SUBC: + case ISD::SUBE: + return LowerADDC_ADDE_SUBC_SUBE(Op, DAG); + case ISD::SADDO: + case ISD::UADDO: + case ISD::SSUBO: + case ISD::USUBO: + case ISD::SMULO: + case ISD::UMULO: + return LowerXALUO(Op, DAG); + case ISD::FADD: + return LowerF128Call(Op, DAG, RTLIB::ADD_F128); + case ISD::FSUB: + return LowerF128Call(Op, DAG, RTLIB::SUB_F128); + case ISD::FMUL: + return LowerF128Call(Op, DAG, RTLIB::MUL_F128); + case ISD::FDIV: + return LowerF128Call(Op, DAG, RTLIB::DIV_F128); + case ISD::FP_ROUND: + return LowerFP_ROUND(Op, DAG); + case ISD::FP_EXTEND: + return LowerFP_EXTEND(Op, DAG); + case ISD::FRAMEADDR: + return LowerFRAMEADDR(Op, DAG); + case ISD::RETURNADDR: + return LowerRETURNADDR(Op, DAG); + case ISD::INSERT_VECTOR_ELT: + return LowerINSERT_VECTOR_ELT(Op, DAG); + case ISD::EXTRACT_VECTOR_ELT: + return LowerEXTRACT_VECTOR_ELT(Op, DAG); + case ISD::BUILD_VECTOR: + return LowerBUILD_VECTOR(Op, DAG); + case ISD::VECTOR_SHUFFLE: + return LowerVECTOR_SHUFFLE(Op, DAG); + case ISD::EXTRACT_SUBVECTOR: + return LowerEXTRACT_SUBVECTOR(Op, DAG); + case ISD::SRA: + case ISD::SRL: + case ISD::SHL: + return LowerVectorSRA_SRL_SHL(Op, DAG); + case ISD::SHL_PARTS: + return LowerShiftLeftParts(Op, DAG); + case ISD::SRL_PARTS: + case ISD::SRA_PARTS: + return LowerShiftRightParts(Op, DAG); + case ISD::CTPOP: + return LowerCTPOP(Op, 
DAG); + case ISD::FCOPYSIGN: + return LowerFCOPYSIGN(Op, DAG); + case ISD::AND: + return LowerVectorAND(Op, DAG); + case ISD::OR: + return LowerVectorOR(Op, DAG); + case ISD::XOR: + return LowerXOR(Op, DAG); + case ISD::PREFETCH: + return LowerPREFETCH(Op, DAG); + case ISD::SINT_TO_FP: + case ISD::UINT_TO_FP: + return LowerINT_TO_FP(Op, DAG); + case ISD::FP_TO_SINT: + case ISD::FP_TO_UINT: + return LowerFP_TO_INT(Op, DAG); + case ISD::FSINCOS: + return LowerFSINCOS(Op, DAG); + } +} + +/// getFunctionAlignment - Return the Log2 alignment of this function. +unsigned AArch64TargetLowering::getFunctionAlignment(const Function *F) const { + return 2; +} + +//===----------------------------------------------------------------------===// +// Calling Convention Implementation +//===----------------------------------------------------------------------===// + +#include "AArch64GenCallingConv.inc" + +/// Selects the correct CCAssignFn for a the given CallingConvention +/// value. +CCAssignFn *AArch64TargetLowering::CCAssignFnForCall(CallingConv::ID CC, + bool IsVarArg) const { + switch (CC) { + default: + llvm_unreachable("Unsupported calling convention."); + case CallingConv::WebKit_JS: + return CC_AArch64_WebKit_JS; + case CallingConv::C: + case CallingConv::Fast: + if (!Subtarget->isTargetDarwin()) + return CC_AArch64_AAPCS; + return IsVarArg ? CC_AArch64_DarwinPCS_VarArg : CC_AArch64_DarwinPCS; + } +} + +SDValue AArch64TargetLowering::LowerFormalArguments( + SDValue Chain, CallingConv::ID CallConv, bool isVarArg, + const SmallVectorImpl &Ins, SDLoc DL, SelectionDAG &DAG, + SmallVectorImpl &InVals) const { + MachineFunction &MF = DAG.getMachineFunction(); + MachineFrameInfo *MFI = MF.getFrameInfo(); + + // Assign locations to all of the incoming arguments. + SmallVector ArgLocs; + CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), + getTargetMachine(), ArgLocs, *DAG.getContext()); + + // At this point, Ins[].VT may already be promoted to i32. To correctly + // handle passing i8 as i8 instead of i32 on stack, we pass in both i32 and + // i8 to CC_AArch64_AAPCS with i32 being ValVT and i8 being LocVT. + // Since AnalyzeFormalArguments uses Ins[].VT for both ValVT and LocVT, here + // we use a special version of AnalyzeFormalArguments to pass in ValVT and + // LocVT. + unsigned NumArgs = Ins.size(); + Function::const_arg_iterator CurOrigArg = MF.getFunction()->arg_begin(); + unsigned CurArgIdx = 0; + for (unsigned i = 0; i != NumArgs; ++i) { + MVT ValVT = Ins[i].VT; + std::advance(CurOrigArg, Ins[i].OrigArgIndex - CurArgIdx); + CurArgIdx = Ins[i].OrigArgIndex; + + // Get type of the original argument. + EVT ActualVT = getValueType(CurOrigArg->getType(), /*AllowUnknown*/ true); + MVT ActualMVT = ActualVT.isSimple() ? ActualVT.getSimpleVT() : MVT::Other; + // If ActualMVT is i1/i8/i16, we should set LocVT to i8/i8/i16. + MVT LocVT = ValVT; + if (ActualMVT == MVT::i1 || ActualMVT == MVT::i8) + LocVT = MVT::i8; + else if (ActualMVT == MVT::i16) + LocVT = MVT::i16; + + CCAssignFn *AssignFn = CCAssignFnForCall(CallConv, /*IsVarArg=*/false); + bool Res = + AssignFn(i, ValVT, LocVT, CCValAssign::Full, Ins[i].Flags, CCInfo); + assert(!Res && "Call operand has unhandled type"); + (void)Res; + } + assert(ArgLocs.size() == Ins.size()); + SmallVector ArgValues; + for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) { + CCValAssign &VA = ArgLocs[i]; + + if (Ins[i].Flags.isByVal()) { + // Byval is used for HFAs in the PCS, but the system should work in a + // non-compliant manner for larger structs. 
+ EVT PtrTy = getPointerTy(); + int Size = Ins[i].Flags.getByValSize(); + unsigned NumRegs = (Size + 7) / 8; + + // FIXME: This works on big-endian for composite byvals, which are the common + // case. It should also work for fundamental types too. + unsigned FrameIdx = + MFI->CreateFixedObject(8 * NumRegs, VA.getLocMemOffset(), false); + SDValue FrameIdxN = DAG.getFrameIndex(FrameIdx, PtrTy); + InVals.push_back(FrameIdxN); + + continue; + } if (VA.isRegLoc()) { + // Arguments stored in registers. + EVT RegVT = VA.getLocVT(); + + SDValue ArgValue; + const TargetRegisterClass *RC; + + if (RegVT == MVT::i32) + RC = &AArch64::GPR32RegClass; + else if (RegVT == MVT::i64) + RC = &AArch64::GPR64RegClass; + else if (RegVT == MVT::f32) + RC = &AArch64::FPR32RegClass; + else if (RegVT == MVT::f64 || RegVT.is64BitVector()) + RC = &AArch64::FPR64RegClass; + else if (RegVT == MVT::f128 || RegVT.is128BitVector()) + RC = &AArch64::FPR128RegClass; + else + llvm_unreachable("RegVT not supported by FORMAL_ARGUMENTS Lowering"); + + // Transform the arguments in physical registers into virtual ones. + unsigned Reg = MF.addLiveIn(VA.getLocReg(), RC); + ArgValue = DAG.getCopyFromReg(Chain, DL, Reg, RegVT); + + // If this is an 8, 16 or 32-bit value, it is really passed promoted + // to 64 bits. Insert an assert[sz]ext to capture this, then + // truncate to the right size. + switch (VA.getLocInfo()) { + default: + llvm_unreachable("Unknown loc info!"); + case CCValAssign::Full: + break; + case CCValAssign::BCvt: + ArgValue = DAG.getNode(ISD::BITCAST, DL, VA.getValVT(), ArgValue); + break; + case CCValAssign::SExt: + ArgValue = DAG.getNode(ISD::AssertSext, DL, RegVT, ArgValue, + DAG.getValueType(VA.getValVT())); + ArgValue = DAG.getNode(ISD::TRUNCATE, DL, VA.getValVT(), ArgValue); + break; + case CCValAssign::ZExt: + ArgValue = DAG.getNode(ISD::AssertZext, DL, RegVT, ArgValue, + DAG.getValueType(VA.getValVT())); + ArgValue = DAG.getNode(ISD::TRUNCATE, DL, VA.getValVT(), ArgValue); + break; + } + + InVals.push_back(ArgValue); + + } else { // VA.isRegLoc() + assert(VA.isMemLoc() && "CCValAssign is neither reg nor mem"); + unsigned ArgOffset = VA.getLocMemOffset(); + unsigned ArgSize = VA.getLocVT().getSizeInBits() / 8; + + uint32_t BEAlign = 0; + if (ArgSize < 8 && !Subtarget->isLittleEndian()) + BEAlign = 8 - ArgSize; + + int FI = MFI->CreateFixedObject(ArgSize, ArgOffset + BEAlign, true); + + // Create load nodes to retrieve arguments from the stack. + SDValue FIN = DAG.getFrameIndex(FI, getPointerTy()); + SDValue ArgValue; + + // If the loc type and val type are not the same, create an anyext load. + if (VA.getLocVT().getSizeInBits() != VA.getValVT().getSizeInBits()) { + // We should only get here if this is a pure integer. + assert(!VA.getValVT().isVector() && VA.getValVT().isInteger() && + "Only integer extension supported!"); + ArgValue = DAG.getExtLoad(ISD::EXTLOAD, DL, VA.getValVT(), Chain, FIN, + MachinePointerInfo::getFixedStack(FI), + VA.getLocVT(), + false, false, false, 0); + } else { + ArgValue = DAG.getLoad(VA.getValVT(), DL, Chain, FIN, + MachinePointerInfo::getFixedStack(FI), false, + false, false, 0); + } + + InVals.push_back(ArgValue); + } + } + + // varargs + if (isVarArg) { + if (!Subtarget->isTargetDarwin()) { + // The AAPCS variadic function ABI is identical to the non-variadic + // one. As a result there may be more arguments in registers and we should + // save them for future reference. 
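As a source-level sketch of why this register save area exists (an illustration, not part of this commit; the function name is made up): a variadic callee cannot know which of x0-x7 / q0-q7 the caller actually used for the "..." arguments, so the saveVarArgRegisters call below spills the still-unallocated argument registers where va_arg can later find them.

    #include <cstdarg>

    // Hypothetical example: any of the "..." arguments may still be sitting
    // in x1..x7 (or q0..q7 for FP), so the prologue must store the unused
    // argument registers into the save area before va_arg walks them.
    int sum_ints(int count, ...) {
      va_list ap;
      va_start(ap, count);
      int total = 0;
      for (int i = 0; i < count; ++i)
        total += va_arg(ap, int);
      va_end(ap);
      return total;
    }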
+ saveVarArgRegisters(CCInfo, DAG, DL, Chain); + } + + AArch64FunctionInfo *AFI = MF.getInfo(); + // This will point to the next argument passed via stack. + unsigned StackOffset = CCInfo.getNextStackOffset(); + // We currently pass all varargs at 8-byte alignment. + StackOffset = ((StackOffset + 7) & ~7); + AFI->setVarArgsStackIndex(MFI->CreateFixedObject(4, StackOffset, true)); + } + + AArch64FunctionInfo *FuncInfo = MF.getInfo(); + unsigned StackArgSize = CCInfo.getNextStackOffset(); + bool TailCallOpt = MF.getTarget().Options.GuaranteedTailCallOpt; + if (DoesCalleeRestoreStack(CallConv, TailCallOpt)) { + // This is a non-standard ABI so by fiat I say we're allowed to make full + // use of the stack area to be popped, which must be aligned to 16 bytes in + // any case: + StackArgSize = RoundUpToAlignment(StackArgSize, 16); + + // If we're expected to restore the stack (e.g. fastcc) then we'll be adding + // a multiple of 16. + FuncInfo->setArgumentStackToRestore(StackArgSize); + + // This realignment carries over to the available bytes below. Our own + // callers will guarantee the space is free by giving an aligned value to + // CALLSEQ_START. + } + // Even if we're not expected to free up the space, it's useful to know how + // much is there while considering tail calls (because we can reuse it). + FuncInfo->setBytesInStackArgArea(StackArgSize); + + return Chain; +} + +void AArch64TargetLowering::saveVarArgRegisters(CCState &CCInfo, + SelectionDAG &DAG, SDLoc DL, + SDValue &Chain) const { + MachineFunction &MF = DAG.getMachineFunction(); + MachineFrameInfo *MFI = MF.getFrameInfo(); + AArch64FunctionInfo *FuncInfo = MF.getInfo(); + + SmallVector MemOps; + + static const MCPhysReg GPRArgRegs[] = { AArch64::X0, AArch64::X1, AArch64::X2, + AArch64::X3, AArch64::X4, AArch64::X5, + AArch64::X6, AArch64::X7 }; + static const unsigned NumGPRArgRegs = array_lengthof(GPRArgRegs); + unsigned FirstVariadicGPR = + CCInfo.getFirstUnallocated(GPRArgRegs, NumGPRArgRegs); + + unsigned GPRSaveSize = 8 * (NumGPRArgRegs - FirstVariadicGPR); + int GPRIdx = 0; + if (GPRSaveSize != 0) { + GPRIdx = MFI->CreateStackObject(GPRSaveSize, 8, false); + + SDValue FIN = DAG.getFrameIndex(GPRIdx, getPointerTy()); + + for (unsigned i = FirstVariadicGPR; i < NumGPRArgRegs; ++i) { + unsigned VReg = MF.addLiveIn(GPRArgRegs[i], &AArch64::GPR64RegClass); + SDValue Val = DAG.getCopyFromReg(Chain, DL, VReg, MVT::i64); + SDValue Store = + DAG.getStore(Val.getValue(1), DL, Val, FIN, + MachinePointerInfo::getStack(i * 8), false, false, 0); + MemOps.push_back(Store); + FIN = DAG.getNode(ISD::ADD, DL, getPointerTy(), FIN, + DAG.getConstant(8, getPointerTy())); + } + } + FuncInfo->setVarArgsGPRIndex(GPRIdx); + FuncInfo->setVarArgsGPRSize(GPRSaveSize); + + if (Subtarget->hasFPARMv8()) { + static const MCPhysReg FPRArgRegs[] = { + AArch64::Q0, AArch64::Q1, AArch64::Q2, AArch64::Q3, + AArch64::Q4, AArch64::Q5, AArch64::Q6, AArch64::Q7}; + static const unsigned NumFPRArgRegs = array_lengthof(FPRArgRegs); + unsigned FirstVariadicFPR = + CCInfo.getFirstUnallocated(FPRArgRegs, NumFPRArgRegs); + + unsigned FPRSaveSize = 16 * (NumFPRArgRegs - FirstVariadicFPR); + int FPRIdx = 0; + if (FPRSaveSize != 0) { + FPRIdx = MFI->CreateStackObject(FPRSaveSize, 16, false); + + SDValue FIN = DAG.getFrameIndex(FPRIdx, getPointerTy()); + + for (unsigned i = FirstVariadicFPR; i < NumFPRArgRegs; ++i) { + unsigned VReg = MF.addLiveIn(FPRArgRegs[i], &AArch64::FPR128RegClass); + SDValue Val = DAG.getCopyFromReg(Chain, DL, VReg, MVT::f128); + + SDValue Store 
= + DAG.getStore(Val.getValue(1), DL, Val, FIN, + MachinePointerInfo::getStack(i * 16), false, false, 0); + MemOps.push_back(Store); + FIN = DAG.getNode(ISD::ADD, DL, getPointerTy(), FIN, + DAG.getConstant(16, getPointerTy())); + } + } + FuncInfo->setVarArgsFPRIndex(FPRIdx); + FuncInfo->setVarArgsFPRSize(FPRSaveSize); + } + + if (!MemOps.empty()) { + Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOps); + } +} + +/// LowerCallResult - Lower the result values of a call into the +/// appropriate copies out of appropriate physical registers. +SDValue AArch64TargetLowering::LowerCallResult( + SDValue Chain, SDValue InFlag, CallingConv::ID CallConv, bool isVarArg, + const SmallVectorImpl &Ins, SDLoc DL, SelectionDAG &DAG, + SmallVectorImpl &InVals, bool isThisReturn, + SDValue ThisVal) const { + CCAssignFn *RetCC = CallConv == CallingConv::WebKit_JS + ? RetCC_AArch64_WebKit_JS + : RetCC_AArch64_AAPCS; + // Assign locations to each value returned by this call. + SmallVector RVLocs; + CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), + getTargetMachine(), RVLocs, *DAG.getContext()); + CCInfo.AnalyzeCallResult(Ins, RetCC); + + // Copy all of the result registers out of their specified physreg. + for (unsigned i = 0; i != RVLocs.size(); ++i) { + CCValAssign VA = RVLocs[i]; + + // Pass 'this' value directly from the argument to return value, to avoid + // reg unit interference + if (i == 0 && isThisReturn) { + assert(!VA.needsCustom() && VA.getLocVT() == MVT::i64 && + "unexpected return calling convention register assignment"); + InVals.push_back(ThisVal); + continue; + } + + SDValue Val = + DAG.getCopyFromReg(Chain, DL, VA.getLocReg(), VA.getLocVT(), InFlag); + Chain = Val.getValue(1); + InFlag = Val.getValue(2); + + switch (VA.getLocInfo()) { + default: + llvm_unreachable("Unknown loc info!"); + case CCValAssign::Full: + break; + case CCValAssign::BCvt: + Val = DAG.getNode(ISD::BITCAST, DL, VA.getValVT(), Val); + break; + } + + InVals.push_back(Val); + } + + return Chain; +} + +bool AArch64TargetLowering::isEligibleForTailCallOptimization( + SDValue Callee, CallingConv::ID CalleeCC, bool isVarArg, + bool isCalleeStructRet, bool isCallerStructRet, + const SmallVectorImpl &Outs, + const SmallVectorImpl &OutVals, + const SmallVectorImpl &Ins, SelectionDAG &DAG) const { + // For CallingConv::C this function knows whether the ABI needs + // changing. That's not true for other conventions so they will have to opt in + // manually. + if (!IsTailCallConvention(CalleeCC) && CalleeCC != CallingConv::C) + return false; + + const MachineFunction &MF = DAG.getMachineFunction(); + const Function *CallerF = MF.getFunction(); + CallingConv::ID CallerCC = CallerF->getCallingConv(); + bool CCMatch = CallerCC == CalleeCC; + + // Byval parameters hand the function a pointer directly into the stack area + // we want to reuse during a tail call. Working around this *is* possible (see + // X86) but less efficient and uglier in LowerCall. + for (Function::const_arg_iterator i = CallerF->arg_begin(), + e = CallerF->arg_end(); + i != e; ++i) + if (i->hasByValAttr()) + return false; + + if (getTargetMachine().Options.GuaranteedTailCallOpt) { + if (IsTailCallConvention(CalleeCC) && CCMatch) + return true; + return false; + } + + // Now we search for cases where we can use a tail call without changing the + // ABI. Sibcall is used in some places (particularly gcc) to refer to this + // concept. + + // I want anyone implementing a new calling convention to think long and hard + // about this assert. 
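For reference, a minimal sketch of the sibling-call shape that the eligibility checks here (continuing below) are meant to accept, assuming both functions use the plain C convention; the function names are illustrative only.

    // Hypothetical functions: same calling convention, no byval or variadic
    // arguments, and no stack arguments, so under the rules checked here the
    // call can become a plain "b callee" without touching the caller's frame.
    int callee(int a, int b);

    int caller(int a, int b) { return callee(b, a); }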
+ assert((!isVarArg || CalleeCC == CallingConv::C) && + "Unexpected variadic calling convention"); + + if (isVarArg && !Outs.empty()) { + // At least two cases here: if caller is fastcc then we can't have any + // memory arguments (we'd be expected to clean up the stack afterwards). If + // caller is C then we could potentially use its argument area. + + // FIXME: for now we take the most conservative of these in both cases: + // disallow all variadic memory operands. + SmallVector ArgLocs; + CCState CCInfo(CalleeCC, isVarArg, DAG.getMachineFunction(), + getTargetMachine(), ArgLocs, *DAG.getContext()); + + CCInfo.AnalyzeCallOperands(Outs, CCAssignFnForCall(CalleeCC, true)); + for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) + if (!ArgLocs[i].isRegLoc()) + return false; + } + + // If the calling conventions do not match, then we'd better make sure the + // results are returned in the same way as what the caller expects. + if (!CCMatch) { + SmallVector RVLocs1; + CCState CCInfo1(CalleeCC, false, DAG.getMachineFunction(), + getTargetMachine(), RVLocs1, *DAG.getContext()); + CCInfo1.AnalyzeCallResult(Ins, CCAssignFnForCall(CalleeCC, isVarArg)); + + SmallVector RVLocs2; + CCState CCInfo2(CallerCC, false, DAG.getMachineFunction(), + getTargetMachine(), RVLocs2, *DAG.getContext()); + CCInfo2.AnalyzeCallResult(Ins, CCAssignFnForCall(CallerCC, isVarArg)); + + if (RVLocs1.size() != RVLocs2.size()) + return false; + for (unsigned i = 0, e = RVLocs1.size(); i != e; ++i) { + if (RVLocs1[i].isRegLoc() != RVLocs2[i].isRegLoc()) + return false; + if (RVLocs1[i].getLocInfo() != RVLocs2[i].getLocInfo()) + return false; + if (RVLocs1[i].isRegLoc()) { + if (RVLocs1[i].getLocReg() != RVLocs2[i].getLocReg()) + return false; + } else { + if (RVLocs1[i].getLocMemOffset() != RVLocs2[i].getLocMemOffset()) + return false; + } + } + } + + // Nothing more to check if the callee is taking no arguments + if (Outs.empty()) + return true; + + SmallVector ArgLocs; + CCState CCInfo(CalleeCC, isVarArg, DAG.getMachineFunction(), + getTargetMachine(), ArgLocs, *DAG.getContext()); + + CCInfo.AnalyzeCallOperands(Outs, CCAssignFnForCall(CalleeCC, isVarArg)); + + const AArch64FunctionInfo *FuncInfo = MF.getInfo(); + + // If the stack arguments for this call would fit into our own save area then + // the call can be made tail. + return CCInfo.getNextStackOffset() <= FuncInfo->getBytesInStackArgArea(); +} + +SDValue AArch64TargetLowering::addTokenForArgument(SDValue Chain, + SelectionDAG &DAG, + MachineFrameInfo *MFI, + int ClobberedFI) const { + SmallVector ArgChains; + int64_t FirstByte = MFI->getObjectOffset(ClobberedFI); + int64_t LastByte = FirstByte + MFI->getObjectSize(ClobberedFI) - 1; + + // Include the original chain at the beginning of the list. When this is + // used by target LowerCall hooks, this helps legalize find the + // CALLSEQ_BEGIN node. 
+ ArgChains.push_back(Chain); + + // Add a chain value for each stack argument corresponding + for (SDNode::use_iterator U = DAG.getEntryNode().getNode()->use_begin(), + UE = DAG.getEntryNode().getNode()->use_end(); + U != UE; ++U) + if (LoadSDNode *L = dyn_cast(*U)) + if (FrameIndexSDNode *FI = dyn_cast(L->getBasePtr())) + if (FI->getIndex() < 0) { + int64_t InFirstByte = MFI->getObjectOffset(FI->getIndex()); + int64_t InLastByte = InFirstByte; + InLastByte += MFI->getObjectSize(FI->getIndex()) - 1; + + if ((InFirstByte <= FirstByte && FirstByte <= InLastByte) || + (FirstByte <= InFirstByte && InFirstByte <= LastByte)) + ArgChains.push_back(SDValue(L, 1)); + } + + // Build a tokenfactor for all the chains. + return DAG.getNode(ISD::TokenFactor, SDLoc(Chain), MVT::Other, ArgChains); +} + +bool AArch64TargetLowering::DoesCalleeRestoreStack(CallingConv::ID CallCC, + bool TailCallOpt) const { + return CallCC == CallingConv::Fast && TailCallOpt; +} + +bool AArch64TargetLowering::IsTailCallConvention(CallingConv::ID CallCC) const { + return CallCC == CallingConv::Fast; +} + +/// LowerCall - Lower a call to a callseq_start + CALL + callseq_end chain, +/// and add input and output parameter nodes. +SDValue +AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI, + SmallVectorImpl &InVals) const { + SelectionDAG &DAG = CLI.DAG; + SDLoc &DL = CLI.DL; + SmallVector &Outs = CLI.Outs; + SmallVector &OutVals = CLI.OutVals; + SmallVector &Ins = CLI.Ins; + SDValue Chain = CLI.Chain; + SDValue Callee = CLI.Callee; + bool &IsTailCall = CLI.IsTailCall; + CallingConv::ID CallConv = CLI.CallConv; + bool IsVarArg = CLI.IsVarArg; + + MachineFunction &MF = DAG.getMachineFunction(); + bool IsStructRet = (Outs.empty()) ? false : Outs[0].Flags.isSRet(); + bool IsThisReturn = false; + + AArch64FunctionInfo *FuncInfo = MF.getInfo(); + bool TailCallOpt = MF.getTarget().Options.GuaranteedTailCallOpt; + bool IsSibCall = false; + + if (IsTailCall) { + // Check if it's really possible to do a tail call. + IsTailCall = isEligibleForTailCallOptimization( + Callee, CallConv, IsVarArg, IsStructRet, + MF.getFunction()->hasStructRetAttr(), Outs, OutVals, Ins, DAG); + if (!IsTailCall && CLI.CS && CLI.CS->isMustTailCall()) + report_fatal_error("failed to perform tail call elimination on a call " + "site marked musttail"); + + // A sibling call is one where we're under the usual C ABI and not planning + // to change that but can still do a tail call: + if (!TailCallOpt && IsTailCall) + IsSibCall = true; + + if (IsTailCall) + ++NumTailCalls; + } + + // Analyze operands of the call, assigning locations to each operand. + SmallVector ArgLocs; + CCState CCInfo(CallConv, IsVarArg, DAG.getMachineFunction(), + getTargetMachine(), ArgLocs, *DAG.getContext()); + + if (IsVarArg) { + // Handle fixed and variable vector arguments differently. + // Variable vector arguments always go into memory. + unsigned NumArgs = Outs.size(); + + for (unsigned i = 0; i != NumArgs; ++i) { + MVT ArgVT = Outs[i].VT; + ISD::ArgFlagsTy ArgFlags = Outs[i].Flags; + CCAssignFn *AssignFn = CCAssignFnForCall(CallConv, + /*IsVarArg=*/ !Outs[i].IsFixed); + bool Res = AssignFn(i, ArgVT, ArgVT, CCValAssign::Full, ArgFlags, CCInfo); + assert(!Res && "Call operand has unhandled type"); + (void)Res; + } + } else { + // At this point, Outs[].VT may already be promoted to i32. To correctly + // handle passing i8 as i8 instead of i32 on stack, we pass in both i32 and + // i8 to CC_AArch64_AAPCS with i32 being ValVT and i8 being LocVT. 
+ // Since AnalyzeCallOperands uses Ins[].VT for both ValVT and LocVT, here + // we use a special version of AnalyzeCallOperands to pass in ValVT and + // LocVT. + unsigned NumArgs = Outs.size(); + for (unsigned i = 0; i != NumArgs; ++i) { + MVT ValVT = Outs[i].VT; + // Get type of the original argument. + EVT ActualVT = getValueType(CLI.getArgs()[Outs[i].OrigArgIndex].Ty, + /*AllowUnknown*/ true); + MVT ActualMVT = ActualVT.isSimple() ? ActualVT.getSimpleVT() : ValVT; + ISD::ArgFlagsTy ArgFlags = Outs[i].Flags; + // If ActualMVT is i1/i8/i16, we should set LocVT to i8/i8/i16. + MVT LocVT = ValVT; + if (ActualMVT == MVT::i1 || ActualMVT == MVT::i8) + LocVT = MVT::i8; + else if (ActualMVT == MVT::i16) + LocVT = MVT::i16; + + CCAssignFn *AssignFn = CCAssignFnForCall(CallConv, /*IsVarArg=*/false); + bool Res = AssignFn(i, ValVT, LocVT, CCValAssign::Full, ArgFlags, CCInfo); + assert(!Res && "Call operand has unhandled type"); + (void)Res; + } + } + + // Get a count of how many bytes are to be pushed on the stack. + unsigned NumBytes = CCInfo.getNextStackOffset(); + + if (IsSibCall) { + // Since we're not changing the ABI to make this a tail call, the memory + // operands are already available in the caller's incoming argument space. + NumBytes = 0; + } + + // FPDiff is the byte offset of the call's argument area from the callee's. + // Stores to callee stack arguments will be placed in FixedStackSlots offset + // by this amount for a tail call. In a sibling call it must be 0 because the + // caller will deallocate the entire stack and the callee still expects its + // arguments to begin at SP+0. Completely unused for non-tail calls. + int FPDiff = 0; + + if (IsTailCall && !IsSibCall) { + unsigned NumReusableBytes = FuncInfo->getBytesInStackArgArea(); + + // Since callee will pop argument stack as a tail call, we must keep the + // popped size 16-byte aligned. + NumBytes = RoundUpToAlignment(NumBytes, 16); + + // FPDiff will be negative if this tail call requires more space than we + // would automatically have in our incoming argument space. Positive if we + // can actually shrink the stack. + FPDiff = NumReusableBytes - NumBytes; + + // The stack pointer must be 16-byte aligned at all times it's used for a + // memory operation, which in practice means at *all* times and in + // particular across call boundaries. Therefore our own arguments started at + // a 16-byte aligned SP and the delta applied for the tail call should + // satisfy the same constraint. + assert(FPDiff % 16 == 0 && "unaligned stack on tail call"); + } + + // Adjust the stack pointer for the new arguments... + // These operations are automatically eliminated by the prolog/epilog pass + if (!IsSibCall) + Chain = + DAG.getCALLSEQ_START(Chain, DAG.getIntPtrConstant(NumBytes, true), DL); + + SDValue StackPtr = DAG.getCopyFromReg(Chain, DL, AArch64::SP, getPointerTy()); + + SmallVector, 8> RegsToPass; + SmallVector MemOpChains; + + // Walk the register/memloc assignments, inserting copies/loads. + for (unsigned i = 0, realArgIdx = 0, e = ArgLocs.size(); i != e; + ++i, ++realArgIdx) { + CCValAssign &VA = ArgLocs[i]; + SDValue Arg = OutVals[realArgIdx]; + ISD::ArgFlagsTy Flags = Outs[realArgIdx].Flags; + + // Promote the value if needed. 
+ switch (VA.getLocInfo()) { + default: + llvm_unreachable("Unknown loc info!"); + case CCValAssign::Full: + break; + case CCValAssign::SExt: + Arg = DAG.getNode(ISD::SIGN_EXTEND, DL, VA.getLocVT(), Arg); + break; + case CCValAssign::ZExt: + Arg = DAG.getNode(ISD::ZERO_EXTEND, DL, VA.getLocVT(), Arg); + break; + case CCValAssign::AExt: + Arg = DAG.getNode(ISD::ANY_EXTEND, DL, VA.getLocVT(), Arg); + break; + case CCValAssign::BCvt: + Arg = DAG.getNode(ISD::BITCAST, DL, VA.getLocVT(), Arg); + break; + case CCValAssign::FPExt: + Arg = DAG.getNode(ISD::FP_EXTEND, DL, VA.getLocVT(), Arg); + break; + } + + if (VA.isRegLoc()) { + if (realArgIdx == 0 && Flags.isReturned() && Outs[0].VT == MVT::i64) { + assert(VA.getLocVT() == MVT::i64 && + "unexpected calling convention register assignment"); + assert(!Ins.empty() && Ins[0].VT == MVT::i64 && + "unexpected use of 'returned'"); + IsThisReturn = true; + } + RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg)); + } else { + assert(VA.isMemLoc()); + + SDValue DstAddr; + MachinePointerInfo DstInfo; + + // FIXME: This works on big-endian for composite byvals, which are the + // common case. It should also work for fundamental types too. + uint32_t BEAlign = 0; + unsigned OpSize = Flags.isByVal() ? Flags.getByValSize() * 8 + : VA.getLocVT().getSizeInBits(); + OpSize = (OpSize + 7) / 8; + if (!Subtarget->isLittleEndian() && !Flags.isByVal()) { + if (OpSize < 8) + BEAlign = 8 - OpSize; + } + unsigned LocMemOffset = VA.getLocMemOffset(); + int32_t Offset = LocMemOffset + BEAlign; + SDValue PtrOff = DAG.getIntPtrConstant(Offset); + PtrOff = DAG.getNode(ISD::ADD, DL, getPointerTy(), StackPtr, PtrOff); + + if (IsTailCall) { + Offset = Offset + FPDiff; + int FI = MF.getFrameInfo()->CreateFixedObject(OpSize, Offset, true); + + DstAddr = DAG.getFrameIndex(FI, getPointerTy()); + DstInfo = MachinePointerInfo::getFixedStack(FI); + + // Make sure any stack arguments overlapping with where we're storing + // are loaded before this eventual operation. Otherwise they'll be + // clobbered. + Chain = addTokenForArgument(Chain, DAG, MF.getFrameInfo(), FI); + } else { + SDValue PtrOff = DAG.getIntPtrConstant(Offset); + + DstAddr = DAG.getNode(ISD::ADD, DL, getPointerTy(), StackPtr, PtrOff); + DstInfo = MachinePointerInfo::getStack(LocMemOffset); + } + + if (Outs[i].Flags.isByVal()) { + SDValue SizeNode = + DAG.getConstant(Outs[i].Flags.getByValSize(), MVT::i64); + SDValue Cpy = DAG.getMemcpy( + Chain, DL, DstAddr, Arg, SizeNode, Outs[i].Flags.getByValAlign(), + /*isVolatile = */ false, + /*alwaysInline = */ false, DstInfo, MachinePointerInfo()); + + MemOpChains.push_back(Cpy); + } else { + // Since we pass i1/i8/i16 as i1/i8/i16 on stack and Arg is already + // promoted to a legal register type i32, we should truncate Arg back to + // i1/i8/i16. + if (Arg.getValueType().isSimple() && + Arg.getValueType().getSimpleVT() == MVT::i32 && + (VA.getLocVT() == MVT::i1 || VA.getLocVT() == MVT::i8 || + VA.getLocVT() == MVT::i16)) + Arg = DAG.getNode(ISD::TRUNCATE, DL, VA.getLocVT(), Arg); + + SDValue Store = + DAG.getStore(Chain, DL, Arg, DstAddr, DstInfo, false, false, 0); + MemOpChains.push_back(Store); + } + } + } + + if (!MemOpChains.empty()) + Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOpChains); + + // Build a sequence of copy-to-reg nodes chained together with token chain + // and flag operands which copy the outgoing args into the appropriate regs. 
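A rough illustration of the truncate-and-store case just above, assuming the Darwin variant of the PCS in which small integers keep their natural size on the stack (the ninth integer argument no longer fits in x0-x7); the function is hypothetical.

    // Hypothetical example: "ninth" is passed on the stack; because it was
    // promoted to i32 in the DAG, it is truncated back to i8 before the
    // single-byte store described above.
    int take_nine(int a, int b, int c, int d, int e, int f, int g, int h,
                  signed char ninth) {
      return a + ninth;
    }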
+  SDValue InFlag;
+  for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) {
+    Chain = DAG.getCopyToReg(Chain, DL, RegsToPass[i].first,
+                             RegsToPass[i].second, InFlag);
+    InFlag = Chain.getValue(1);
+  }
+
+  // If the callee is a GlobalAddress/ExternalSymbol node (quite common, every
+  // direct call is) turn it into a TargetGlobalAddress/TargetExternalSymbol
+  // node so that legalize doesn't hack it.
+  if (getTargetMachine().getCodeModel() == CodeModel::Large &&
+      Subtarget->isTargetMachO()) {
+    if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee)) {
+      const GlobalValue *GV = G->getGlobal();
+      bool InternalLinkage = GV->hasInternalLinkage();
+      if (InternalLinkage)
+        Callee = DAG.getTargetGlobalAddress(GV, DL, getPointerTy(), 0, 0);
+      else {
+        Callee = DAG.getTargetGlobalAddress(GV, DL, getPointerTy(), 0,
+                                            AArch64II::MO_GOT);
+        Callee = DAG.getNode(AArch64ISD::LOADgot, DL, getPointerTy(), Callee);
+      }
+    } else if (ExternalSymbolSDNode *S =
+                   dyn_cast<ExternalSymbolSDNode>(Callee)) {
+      const char *Sym = S->getSymbol();
+      Callee =
+          DAG.getTargetExternalSymbol(Sym, getPointerTy(), AArch64II::MO_GOT);
+      Callee = DAG.getNode(AArch64ISD::LOADgot, DL, getPointerTy(), Callee);
+    }
+  } else if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee)) {
+    const GlobalValue *GV = G->getGlobal();
+    Callee = DAG.getTargetGlobalAddress(GV, DL, getPointerTy(), 0, 0);
+  } else if (ExternalSymbolSDNode *S = dyn_cast<ExternalSymbolSDNode>(Callee)) {
+    const char *Sym = S->getSymbol();
+    Callee = DAG.getTargetExternalSymbol(Sym, getPointerTy(), 0);
+  }
+
+  // We don't usually want to end the call-sequence here because we would tidy
+  // the frame up *after* the call, however in the ABI-changing tail-call case
+  // we've carefully laid out the parameters so that when sp is reset they'll be
+  // in the correct location.
+  if (IsTailCall && !IsSibCall) {
+    Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(NumBytes, true),
+                               DAG.getIntPtrConstant(0, true), InFlag, DL);
+    InFlag = Chain.getValue(1);
+  }
+
+  std::vector<SDValue> Ops;
+  Ops.push_back(Chain);
+  Ops.push_back(Callee);
+
+  if (IsTailCall) {
+    // Each tail call may have to adjust the stack by a different amount, so
+    // this information must travel along with the operation for eventual
+    // consumption by emitEpilogue.
+    Ops.push_back(DAG.getTargetConstant(FPDiff, MVT::i32));
+  }
+
+  // Add argument registers to the end of the list so that they are known live
+  // into the call.
+  for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i)
+    Ops.push_back(DAG.getRegister(RegsToPass[i].first,
+                                  RegsToPass[i].second.getValueType()));
+
+  // Add a register mask operand representing the call-preserved registers.
+  const uint32_t *Mask;
+  const TargetRegisterInfo *TRI = getTargetMachine().getRegisterInfo();
+  const AArch64RegisterInfo *ARI =
+      static_cast<const AArch64RegisterInfo *>(TRI);
+  if (IsThisReturn) {
+    // For 'this' returns, use the X0-preserving mask if applicable
+    Mask = ARI->getThisReturnPreservedMask(CallConv);
+    if (!Mask) {
+      IsThisReturn = false;
+      Mask = ARI->getCallPreservedMask(CallConv);
+    }
+  } else
+    Mask = ARI->getCallPreservedMask(CallConv);
+
+  assert(Mask && "Missing call preserved mask for calling convention");
+  Ops.push_back(DAG.getRegisterMask(Mask));
+
+  if (InFlag.getNode())
+    Ops.push_back(InFlag);
+
+  SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
+
+  // If we're doing a tail call, use a TC_RETURN here rather than an
+  // actual call instruction.
+  if (IsTailCall)
+    return DAG.getNode(AArch64ISD::TC_RETURN, DL, NodeTys, Ops);
+
+  // Returns a chain and a flag for retval copy to use.
+ Chain = DAG.getNode(AArch64ISD::CALL, DL, NodeTys, Ops); + InFlag = Chain.getValue(1); + + uint64_t CalleePopBytes = DoesCalleeRestoreStack(CallConv, TailCallOpt) + ? RoundUpToAlignment(NumBytes, 16) + : 0; + + Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(NumBytes, true), + DAG.getIntPtrConstant(CalleePopBytes, true), + InFlag, DL); + if (!Ins.empty()) + InFlag = Chain.getValue(1); + + // Handle result values, copying them out of physregs into vregs that we + // return. + return LowerCallResult(Chain, InFlag, CallConv, IsVarArg, Ins, DL, DAG, + InVals, IsThisReturn, + IsThisReturn ? OutVals[0] : SDValue()); +} + +bool AArch64TargetLowering::CanLowerReturn( + CallingConv::ID CallConv, MachineFunction &MF, bool isVarArg, + const SmallVectorImpl &Outs, LLVMContext &Context) const { + CCAssignFn *RetCC = CallConv == CallingConv::WebKit_JS + ? RetCC_AArch64_WebKit_JS + : RetCC_AArch64_AAPCS; + SmallVector RVLocs; + CCState CCInfo(CallConv, isVarArg, MF, getTargetMachine(), RVLocs, Context); + return CCInfo.CheckReturn(Outs, RetCC); +} + +SDValue +AArch64TargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv, + bool isVarArg, + const SmallVectorImpl &Outs, + const SmallVectorImpl &OutVals, + SDLoc DL, SelectionDAG &DAG) const { + CCAssignFn *RetCC = CallConv == CallingConv::WebKit_JS + ? RetCC_AArch64_WebKit_JS + : RetCC_AArch64_AAPCS; + SmallVector RVLocs; + CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), + getTargetMachine(), RVLocs, *DAG.getContext()); + CCInfo.AnalyzeReturn(Outs, RetCC); + + // Copy the result values into the output registers. + SDValue Flag; + SmallVector RetOps(1, Chain); + for (unsigned i = 0, realRVLocIdx = 0; i != RVLocs.size(); + ++i, ++realRVLocIdx) { + CCValAssign &VA = RVLocs[i]; + assert(VA.isRegLoc() && "Can only return in registers!"); + SDValue Arg = OutVals[realRVLocIdx]; + + switch (VA.getLocInfo()) { + default: + llvm_unreachable("Unknown loc info!"); + case CCValAssign::Full: + break; + case CCValAssign::BCvt: + Arg = DAG.getNode(ISD::BITCAST, DL, VA.getLocVT(), Arg); + break; + } + + Chain = DAG.getCopyToReg(Chain, DL, VA.getLocReg(), Arg, Flag); + Flag = Chain.getValue(1); + RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT())); + } + + RetOps[0] = Chain; // Update chain. + + // Add the flag if we have it. + if (Flag.getNode()) + RetOps.push_back(Flag); + + return DAG.getNode(AArch64ISD::RET_FLAG, DL, MVT::Other, RetOps); +} + +//===----------------------------------------------------------------------===// +// Other Lowering Code +//===----------------------------------------------------------------------===// + +SDValue AArch64TargetLowering::LowerGlobalAddress(SDValue Op, + SelectionDAG &DAG) const { + EVT PtrVT = getPointerTy(); + SDLoc DL(Op); + const GlobalValue *GV = cast(Op)->getGlobal(); + unsigned char OpFlags = + Subtarget->ClassifyGlobalReference(GV, getTargetMachine()); + + assert(cast(Op)->getOffset() == 0 && + "unexpected offset in global node"); + + // This also catched the large code model case for Darwin. + if ((OpFlags & AArch64II::MO_GOT) != 0) { + SDValue GotAddr = DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, OpFlags); + // FIXME: Once remat is capable of dealing with instructions with register + // operands, expand this into two nodes instead of using a wrapper node. 
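A small example of the input LowerGlobalAddress deals with (illustrative only): a direct reference in the small code model becomes the ADRP/ADD or ADRP/LDR pair built further down, while a reference that must go through the GOT, for instance a preemptible symbol in position-independent code, takes this LOADgot path instead.

    // Hypothetical external symbol: depending on code model and whether the
    // symbol may be preempted, the load is materialized either via
    // adrp + :lo12: addressing or via a GOT entry (the MO_GOT case here).
    extern int external_counter;

    int read_counter() { return external_counter; }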
+ return DAG.getNode(AArch64ISD::LOADgot, DL, PtrVT, GotAddr); + } + + if (getTargetMachine().getCodeModel() == CodeModel::Large) { + const unsigned char MO_NC = AArch64II::MO_NC; + return DAG.getNode( + AArch64ISD::WrapperLarge, DL, PtrVT, + DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, AArch64II::MO_G3), + DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, AArch64II::MO_G2 | MO_NC), + DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, AArch64II::MO_G1 | MO_NC), + DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, AArch64II::MO_G0 | MO_NC)); + } else { + // Use ADRP/ADD or ADRP/LDR for everything else: the small model on ELF and + // the only correct model on Darwin. + SDValue Hi = DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, + OpFlags | AArch64II::MO_PAGE); + unsigned char LoFlags = OpFlags | AArch64II::MO_PAGEOFF | AArch64II::MO_NC; + SDValue Lo = DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, LoFlags); + + SDValue ADRP = DAG.getNode(AArch64ISD::ADRP, DL, PtrVT, Hi); + return DAG.getNode(AArch64ISD::ADDlow, DL, PtrVT, ADRP, Lo); + } +} + +/// \brief Convert a TLS address reference into the correct sequence of loads +/// and calls to compute the variable's address (for Darwin, currently) and +/// return an SDValue containing the final node. + +/// Darwin only has one TLS scheme which must be capable of dealing with the +/// fully general situation, in the worst case. This means: +/// + "extern __thread" declaration. +/// + Defined in a possibly unknown dynamic library. +/// +/// The general system is that each __thread variable has a [3 x i64] descriptor +/// which contains information used by the runtime to calculate the address. The +/// only part of this the compiler needs to know about is the first xword, which +/// contains a function pointer that must be called with the address of the +/// entire descriptor in "x0". +/// +/// Since this descriptor may be in a different unit, in general even the +/// descriptor must be accessed via an indirect load. The "ideal" code sequence +/// is: +/// adrp x0, _var@TLVPPAGE +/// ldr x0, [x0, _var@TLVPPAGEOFF] ; x0 now contains address of descriptor +/// ldr x1, [x0] ; x1 contains 1st entry of descriptor, +/// ; the function pointer +/// blr x1 ; Uses descriptor address in x0 +/// ; Address of _var is now in x0. +/// +/// If the address of _var's descriptor *is* known to the linker, then it can +/// change the first "ldr" instruction to an appropriate "add x0, x0, #imm" for +/// a slight efficiency gain. +SDValue +AArch64TargetLowering::LowerDarwinGlobalTLSAddress(SDValue Op, + SelectionDAG &DAG) const { + assert(Subtarget->isTargetDarwin() && "TLS only supported on Darwin"); + + SDLoc DL(Op); + MVT PtrVT = getPointerTy(); + const GlobalValue *GV = cast(Op)->getGlobal(); + + SDValue TLVPAddr = + DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, AArch64II::MO_TLS); + SDValue DescAddr = DAG.getNode(AArch64ISD::LOADgot, DL, PtrVT, TLVPAddr); + + // The first entry in the descriptor is a function pointer that we must call + // to obtain the address of the variable. + SDValue Chain = DAG.getEntryNode(); + SDValue FuncTLVGet = + DAG.getLoad(MVT::i64, DL, Chain, DescAddr, MachinePointerInfo::getGOT(), + false, true, true, 8); + Chain = FuncTLVGet.getValue(1); + + MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo(); + MFI->setAdjustsStack(true); + + // TLS calls preserve all registers except those that absolutely must be + // trashed: X0 (it takes an argument), LR (it's a call) and NZCV (let's not be + // silly). 
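For orientation, the source pattern that reaches this Darwin lowering is simply a thread-local access; the variable and function names below are made up.

    // Hypothetical example: each access loads the descriptor's function
    // pointer and calls it with the descriptor address in x0, as in the
    // sequence sketched in the comment above.
    extern __thread int counter;

    int bump() { return ++counter; }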
+ const TargetRegisterInfo *TRI = getTargetMachine().getRegisterInfo(); + const AArch64RegisterInfo *ARI = + static_cast(TRI); + const uint32_t *Mask = ARI->getTLSCallPreservedMask(); + + // Finally, we can make the call. This is just a degenerate version of a + // normal AArch64 call node: x0 takes the address of the descriptor, and + // returns the address of the variable in this thread. + Chain = DAG.getCopyToReg(Chain, DL, AArch64::X0, DescAddr, SDValue()); + Chain = + DAG.getNode(AArch64ISD::CALL, DL, DAG.getVTList(MVT::Other, MVT::Glue), + Chain, FuncTLVGet, DAG.getRegister(AArch64::X0, MVT::i64), + DAG.getRegisterMask(Mask), Chain.getValue(1)); + return DAG.getCopyFromReg(Chain, DL, AArch64::X0, PtrVT, Chain.getValue(1)); +} + +/// When accessing thread-local variables under either the general-dynamic or +/// local-dynamic system, we make a "TLS-descriptor" call. The variable will +/// have a descriptor, accessible via a PC-relative ADRP, and whose first entry +/// is a function pointer to carry out the resolution. This function takes the +/// address of the descriptor in X0 and returns the TPIDR_EL0 offset in X0. All +/// other registers (except LR, NZCV) are preserved. +/// +/// Thus, the ideal call sequence on AArch64 is: +/// +/// adrp x0, :tlsdesc:thread_var +/// ldr x8, [x0, :tlsdesc_lo12:thread_var] +/// add x0, x0, :tlsdesc_lo12:thread_var +/// .tlsdesccall thread_var +/// blr x8 +/// (TPIDR_EL0 offset now in x0). +/// +/// The ".tlsdesccall" directive instructs the assembler to insert a particular +/// relocation to help the linker relax this sequence if it turns out to be too +/// conservative. +/// +/// FIXME: we currently produce an extra, duplicated, ADRP instruction, but this +/// is harmless. +SDValue AArch64TargetLowering::LowerELFTLSDescCall(SDValue SymAddr, + SDValue DescAddr, SDLoc DL, + SelectionDAG &DAG) const { + EVT PtrVT = getPointerTy(); + + // The function we need to call is simply the first entry in the GOT for this + // descriptor, load it in preparation. + SDValue Func = DAG.getNode(AArch64ISD::LOADgot, DL, PtrVT, SymAddr); + + // TLS calls preserve all registers except those that absolutely must be + // trashed: X0 (it takes an argument), LR (it's a call) and NZCV (let's not be + // silly). + const TargetRegisterInfo *TRI = getTargetMachine().getRegisterInfo(); + const AArch64RegisterInfo *ARI = + static_cast(TRI); + const uint32_t *Mask = ARI->getTLSCallPreservedMask(); + + // The function takes only one argument: the address of the descriptor itself + // in X0. 
+ SDValue Glue, Chain; + Chain = DAG.getCopyToReg(DAG.getEntryNode(), DL, AArch64::X0, DescAddr, Glue); + Glue = Chain.getValue(1); + + // We're now ready to populate the argument list, as with a normal call: + SmallVector Ops; + Ops.push_back(Chain); + Ops.push_back(Func); + Ops.push_back(SymAddr); + Ops.push_back(DAG.getRegister(AArch64::X0, PtrVT)); + Ops.push_back(DAG.getRegisterMask(Mask)); + Ops.push_back(Glue); + + SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue); + Chain = DAG.getNode(AArch64ISD::TLSDESC_CALL, DL, NodeTys, Ops); + Glue = Chain.getValue(1); + + return DAG.getCopyFromReg(Chain, DL, AArch64::X0, PtrVT, Glue); +} + +SDValue +AArch64TargetLowering::LowerELFGlobalTLSAddress(SDValue Op, + SelectionDAG &DAG) const { + assert(Subtarget->isTargetELF() && "This function expects an ELF target"); + assert(getTargetMachine().getCodeModel() == CodeModel::Small && + "ELF TLS only supported in small memory model"); + const GlobalAddressSDNode *GA = cast(Op); + + TLSModel::Model Model = getTargetMachine().getTLSModel(GA->getGlobal()); + + SDValue TPOff; + EVT PtrVT = getPointerTy(); + SDLoc DL(Op); + const GlobalValue *GV = GA->getGlobal(); + + SDValue ThreadBase = DAG.getNode(AArch64ISD::THREAD_POINTER, DL, PtrVT); + + if (Model == TLSModel::LocalExec) { + SDValue HiVar = DAG.getTargetGlobalAddress( + GV, DL, PtrVT, 0, AArch64II::MO_TLS | AArch64II::MO_G1); + SDValue LoVar = DAG.getTargetGlobalAddress( + GV, DL, PtrVT, 0, + AArch64II::MO_TLS | AArch64II::MO_G0 | AArch64II::MO_NC); + + TPOff = SDValue(DAG.getMachineNode(AArch64::MOVZXi, DL, PtrVT, HiVar, + DAG.getTargetConstant(16, MVT::i32)), + 0); + TPOff = SDValue(DAG.getMachineNode(AArch64::MOVKXi, DL, PtrVT, TPOff, LoVar, + DAG.getTargetConstant(0, MVT::i32)), + 0); + } else if (Model == TLSModel::InitialExec) { + TPOff = DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, AArch64II::MO_TLS); + TPOff = DAG.getNode(AArch64ISD::LOADgot, DL, PtrVT, TPOff); + } else if (Model == TLSModel::LocalDynamic) { + // Local-dynamic accesses proceed in two phases. A general-dynamic TLS + // descriptor call against the special symbol _TLS_MODULE_BASE_ to calculate + // the beginning of the module's TLS region, followed by a DTPREL offset + // calculation. + + // These accesses will need deduplicating if there's more than one. + AArch64FunctionInfo *MFI = + DAG.getMachineFunction().getInfo(); + MFI->incNumLocalDynamicTLSAccesses(); + + // Accesses used in this sequence go via the TLS descriptor which lives in + // the GOT. Prepare an address we can use to handle this. + SDValue HiDesc = DAG.getTargetExternalSymbol( + "_TLS_MODULE_BASE_", PtrVT, AArch64II::MO_TLS | AArch64II::MO_PAGE); + SDValue LoDesc = DAG.getTargetExternalSymbol( + "_TLS_MODULE_BASE_", PtrVT, + AArch64II::MO_TLS | AArch64II::MO_PAGEOFF | AArch64II::MO_NC); + + // First argument to the descriptor call is the address of the descriptor + // itself. + SDValue DescAddr = DAG.getNode(AArch64ISD::ADRP, DL, PtrVT, HiDesc); + DescAddr = DAG.getNode(AArch64ISD::ADDlow, DL, PtrVT, DescAddr, LoDesc); + + // The call needs a relocation too for linker relaxation. It doesn't make + // sense to call it MO_PAGE or MO_PAGEOFF though so we need another copy of + // the address. + SDValue SymAddr = DAG.getTargetExternalSymbol("_TLS_MODULE_BASE_", PtrVT, + AArch64II::MO_TLS); + + // Now we can calculate the offset from TPIDR_EL0 to this module's + // thread-local area. 
+ TPOff = LowerELFTLSDescCall(SymAddr, DescAddr, DL, DAG); + + // Now use :dtprel_whatever: operations to calculate this variable's offset + // in its thread-storage area. + SDValue HiVar = DAG.getTargetGlobalAddress( + GV, DL, MVT::i64, 0, AArch64II::MO_TLS | AArch64II::MO_G1); + SDValue LoVar = DAG.getTargetGlobalAddress( + GV, DL, MVT::i64, 0, + AArch64II::MO_TLS | AArch64II::MO_G0 | AArch64II::MO_NC); + + SDValue DTPOff = + SDValue(DAG.getMachineNode(AArch64::MOVZXi, DL, PtrVT, HiVar, + DAG.getTargetConstant(16, MVT::i32)), + 0); + DTPOff = + SDValue(DAG.getMachineNode(AArch64::MOVKXi, DL, PtrVT, DTPOff, LoVar, + DAG.getTargetConstant(0, MVT::i32)), + 0); + + TPOff = DAG.getNode(ISD::ADD, DL, PtrVT, TPOff, DTPOff); + } else if (Model == TLSModel::GeneralDynamic) { + // Accesses used in this sequence go via the TLS descriptor which lives in + // the GOT. Prepare an address we can use to handle this. + SDValue HiDesc = DAG.getTargetGlobalAddress( + GV, DL, PtrVT, 0, AArch64II::MO_TLS | AArch64II::MO_PAGE); + SDValue LoDesc = DAG.getTargetGlobalAddress( + GV, DL, PtrVT, 0, + AArch64II::MO_TLS | AArch64II::MO_PAGEOFF | AArch64II::MO_NC); + + // First argument to the descriptor call is the address of the descriptor + // itself. + SDValue DescAddr = DAG.getNode(AArch64ISD::ADRP, DL, PtrVT, HiDesc); + DescAddr = DAG.getNode(AArch64ISD::ADDlow, DL, PtrVT, DescAddr, LoDesc); + + // The call needs a relocation too for linker relaxation. It doesn't make + // sense to call it MO_PAGE or MO_PAGEOFF though so we need another copy of + // the address. + SDValue SymAddr = + DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, AArch64II::MO_TLS); + + // Finally we can make a call to calculate the offset from tpidr_el0. + TPOff = LowerELFTLSDescCall(SymAddr, DescAddr, DL, DAG); + } else + llvm_unreachable("Unsupported ELF TLS access model"); + + return DAG.getNode(ISD::ADD, DL, PtrVT, ThreadBase, TPOff); +} + +SDValue AArch64TargetLowering::LowerGlobalTLSAddress(SDValue Op, + SelectionDAG &DAG) const { + if (Subtarget->isTargetDarwin()) + return LowerDarwinGlobalTLSAddress(Op, DAG); + else if (Subtarget->isTargetELF()) + return LowerELFGlobalTLSAddress(Op, DAG); + + llvm_unreachable("Unexpected platform trying to use TLS"); +} +SDValue AArch64TargetLowering::LowerBR_CC(SDValue Op, SelectionDAG &DAG) const { + SDValue Chain = Op.getOperand(0); + ISD::CondCode CC = cast(Op.getOperand(1))->get(); + SDValue LHS = Op.getOperand(2); + SDValue RHS = Op.getOperand(3); + SDValue Dest = Op.getOperand(4); + SDLoc dl(Op); + + // Handle f128 first, since lowering it will result in comparing the return + // value of a libcall against zero, which is just what the rest of LowerBR_CC + // is expecting to deal with. + if (LHS.getValueType() == MVT::f128) { + softenSetCCOperands(DAG, MVT::f128, LHS, RHS, CC, dl); + + // If softenSetCCOperands returned a scalar, we need to compare the result + // against zero to select between true and false values. + if (!RHS.getNode()) { + RHS = DAG.getConstant(0, LHS.getValueType()); + CC = ISD::SETNE; + } + } + + // Optimize {s|u}{add|sub|mul}.with.overflow feeding into a branch + // instruction. + unsigned Opc = LHS.getOpcode(); + if (LHS.getResNo() == 1 && isa(RHS) && + cast(RHS)->isOne() && + (Opc == ISD::SADDO || Opc == ISD::UADDO || Opc == ISD::SSUBO || + Opc == ISD::USUBO || Opc == ISD::SMULO || Opc == ISD::UMULO)) { + assert((CC == ISD::SETEQ || CC == ISD::SETNE) && + "Unexpected condition code."); + // Only lower legal XALUO ops. 
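A sketch of the source pattern that produces an overflow result feeding a branch, assuming clang maps __builtin_add_overflow onto llvm.sadd.with.overflow for matching operand types; the function is hypothetical.

    #include <cstdint>
    #include <limits>

    // Hypothetical example: the overflow bit of the intrinsic feeds the
    // branch, so the lowering here can use the flag-setting add and a
    // conditional branch on the V flag instead of a separate compare.
    bool add_clamped(int32_t a, int32_t b, int32_t &out) {
      if (__builtin_add_overflow(a, b, &out)) {
        out = std::numeric_limits<int32_t>::max();
        return false;
      }
      return true;
    }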
+ if (!DAG.getTargetLoweringInfo().isTypeLegal(LHS->getValueType(0))) + return SDValue(); + + // The actual operation with overflow check. + AArch64CC::CondCode OFCC; + SDValue Value, Overflow; + std::tie(Value, Overflow) = getAArch64XALUOOp(OFCC, LHS.getValue(0), DAG); + + if (CC == ISD::SETNE) + OFCC = getInvertedCondCode(OFCC); + SDValue CCVal = DAG.getConstant(OFCC, MVT::i32); + + return DAG.getNode(AArch64ISD::BRCOND, SDLoc(LHS), MVT::Other, Chain, Dest, + CCVal, Overflow); + } + + if (LHS.getValueType().isInteger()) { + assert((LHS.getValueType() == RHS.getValueType()) && + (LHS.getValueType() == MVT::i32 || LHS.getValueType() == MVT::i64)); + + // If the RHS of the comparison is zero, we can potentially fold this + // to a specialized branch. + const ConstantSDNode *RHSC = dyn_cast(RHS); + if (RHSC && RHSC->getZExtValue() == 0) { + if (CC == ISD::SETEQ) { + // See if we can use a TBZ to fold in an AND as well. + // TBZ has a smaller branch displacement than CBZ. If the offset is + // out of bounds, a late MI-layer pass rewrites branches. + // 403.gcc is an example that hits this case. + if (LHS.getOpcode() == ISD::AND && + isa(LHS.getOperand(1)) && + isPowerOf2_64(LHS.getConstantOperandVal(1))) { + SDValue Test = LHS.getOperand(0); + uint64_t Mask = LHS.getConstantOperandVal(1); + + // TBZ only operates on i64's, but the ext should be free. + if (Test.getValueType() == MVT::i32) + Test = DAG.getAnyExtOrTrunc(Test, dl, MVT::i64); + + return DAG.getNode(AArch64ISD::TBZ, dl, MVT::Other, Chain, Test, + DAG.getConstant(Log2_64(Mask), MVT::i64), Dest); + } + + return DAG.getNode(AArch64ISD::CBZ, dl, MVT::Other, Chain, LHS, Dest); + } else if (CC == ISD::SETNE) { + // See if we can use a TBZ to fold in an AND as well. + // TBZ has a smaller branch displacement than CBZ. If the offset is + // out of bounds, a late MI-layer pass rewrites branches. + // 403.gcc is an example that hits this case. + if (LHS.getOpcode() == ISD::AND && + isa(LHS.getOperand(1)) && + isPowerOf2_64(LHS.getConstantOperandVal(1))) { + SDValue Test = LHS.getOperand(0); + uint64_t Mask = LHS.getConstantOperandVal(1); + + // TBNZ only operates on i64's, but the ext should be free. + if (Test.getValueType() == MVT::i32) + Test = DAG.getAnyExtOrTrunc(Test, dl, MVT::i64); + + return DAG.getNode(AArch64ISD::TBNZ, dl, MVT::Other, Chain, Test, + DAG.getConstant(Log2_64(Mask), MVT::i64), Dest); + } + + return DAG.getNode(AArch64ISD::CBNZ, dl, MVT::Other, Chain, LHS, Dest); + } + } + + SDValue CCVal; + SDValue Cmp = getAArch64Cmp(LHS, RHS, CC, CCVal, DAG, dl); + return DAG.getNode(AArch64ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal, + Cmp); + } + + assert(LHS.getValueType() == MVT::f32 || LHS.getValueType() == MVT::f64); + + // Unfortunately, the mapping of LLVM FP CC's onto AArch64 CC's isn't totally + // clean. Some of them require two branches to implement. 
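One predicate that needs the two-branch form handled below is "ordered and not equal" (fcmp one), which __builtin_islessgreater typically produces; a minimal, hypothetical sketch:

    // Hypothetical example: there is no single NZCV condition for
    // "less than or greater than", so the branch is split into two
    // conditional branches (or, if if-converted, two CSELs).
    int pick(double a, double b, int x, int y) {
      if (__builtin_islessgreater(a, b))
        return x;
      return y;
    }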
+  SDValue Cmp = emitComparison(LHS, RHS, CC, dl, DAG);
+  AArch64CC::CondCode CC1, CC2;
+  changeFPCCToAArch64CC(CC, CC1, CC2);
+  SDValue CC1Val = DAG.getConstant(CC1, MVT::i32);
+  SDValue BR1 =
+      DAG.getNode(AArch64ISD::BRCOND, dl, MVT::Other, Chain, Dest, CC1Val, Cmp);
+  if (CC2 != AArch64CC::AL) {
+    SDValue CC2Val = DAG.getConstant(CC2, MVT::i32);
+    return DAG.getNode(AArch64ISD::BRCOND, dl, MVT::Other, BR1, Dest, CC2Val,
+                       Cmp);
+  }
+
+  return BR1;
+}
+
+SDValue AArch64TargetLowering::LowerFCOPYSIGN(SDValue Op,
+                                              SelectionDAG &DAG) const {
+  EVT VT = Op.getValueType();
+  SDLoc DL(Op);
+
+  SDValue In1 = Op.getOperand(0);
+  SDValue In2 = Op.getOperand(1);
+  EVT SrcVT = In2.getValueType();
+  if (SrcVT != VT) {
+    if (SrcVT == MVT::f32 && VT == MVT::f64)
+      In2 = DAG.getNode(ISD::FP_EXTEND, DL, VT, In2);
+    else if (SrcVT == MVT::f64 && VT == MVT::f32)
+      In2 = DAG.getNode(ISD::FP_ROUND, DL, VT, In2, DAG.getIntPtrConstant(0));
+    else
+      // FIXME: Src type is different, bail out for now. Can VT really be a
+      // vector type?
+      return SDValue();
+  }
+
+  EVT VecVT;
+  EVT EltVT;
+  SDValue EltMask, VecVal1, VecVal2;
+  if (VT == MVT::f32 || VT == MVT::v2f32 || VT == MVT::v4f32) {
+    EltVT = MVT::i32;
+    VecVT = MVT::v4i32;
+    EltMask = DAG.getConstant(0x80000000ULL, EltVT);
+
+    if (!VT.isVector()) {
+      VecVal1 = DAG.getTargetInsertSubreg(AArch64::ssub, DL, VecVT,
+                                          DAG.getUNDEF(VecVT), In1);
+      VecVal2 = DAG.getTargetInsertSubreg(AArch64::ssub, DL, VecVT,
+                                          DAG.getUNDEF(VecVT), In2);
+    } else {
+      VecVal1 = DAG.getNode(ISD::BITCAST, DL, VecVT, In1);
+      VecVal2 = DAG.getNode(ISD::BITCAST, DL, VecVT, In2);
+    }
+  } else if (VT == MVT::f64 || VT == MVT::v2f64) {
+    EltVT = MVT::i64;
+    VecVT = MVT::v2i64;
+
+    // We want to materialize a mask with the high bit set, but the AdvSIMD
+    // immediate moves cannot materialize that in a single instruction for
+    // 64-bit elements. Instead, materialize zero and then negate it.
+    EltMask = DAG.getConstant(0, EltVT);
+
+    if (!VT.isVector()) {
+      VecVal1 = DAG.getTargetInsertSubreg(AArch64::dsub, DL, VecVT,
+                                          DAG.getUNDEF(VecVT), In1);
+      VecVal2 = DAG.getTargetInsertSubreg(AArch64::dsub, DL, VecVT,
+                                          DAG.getUNDEF(VecVT), In2);
+    } else {
+      VecVal1 = DAG.getNode(ISD::BITCAST, DL, VecVT, In1);
+      VecVal2 = DAG.getNode(ISD::BITCAST, DL, VecVT, In2);
+    }
+  } else {
+    llvm_unreachable("Invalid type for copysign!");
+  }
+
+  std::vector<SDValue> BuildVectorOps;
+  for (unsigned i = 0; i < VecVT.getVectorNumElements(); ++i)
+    BuildVectorOps.push_back(EltMask);
+
+  SDValue BuildVec = DAG.getNode(ISD::BUILD_VECTOR, DL, VecVT, BuildVectorOps);
+
+  // If we couldn't materialize the mask above, then the mask vector will be
+  // the zero vector, and we need to negate it here.
+ if (VT == MVT::f64 || VT == MVT::v2f64) { + BuildVec = DAG.getNode(ISD::BITCAST, DL, MVT::v2f64, BuildVec); + BuildVec = DAG.getNode(ISD::FNEG, DL, MVT::v2f64, BuildVec); + BuildVec = DAG.getNode(ISD::BITCAST, DL, MVT::v2i64, BuildVec); + } + + SDValue Sel = + DAG.getNode(AArch64ISD::BIT, DL, VecVT, VecVal1, VecVal2, BuildVec); + + if (VT == MVT::f32) + return DAG.getTargetExtractSubreg(AArch64::ssub, DL, VT, Sel); + else if (VT == MVT::f64) + return DAG.getTargetExtractSubreg(AArch64::dsub, DL, VT, Sel); + else + return DAG.getNode(ISD::BITCAST, DL, VT, Sel); +} + +SDValue AArch64TargetLowering::LowerCTPOP(SDValue Op, SelectionDAG &DAG) const { + if (DAG.getMachineFunction().getFunction()->getAttributes().hasAttribute( + AttributeSet::FunctionIndex, Attribute::NoImplicitFloat)) + return SDValue(); + + // While there is no integer popcount instruction, it can + // be more efficiently lowered to the following sequence that uses + // AdvSIMD registers/instructions as long as the copies to/from + // the AdvSIMD registers are cheap. + // FMOV D0, X0 // copy 64-bit int to vector, high bits zero'd + // CNT V0.8B, V0.8B // 8xbyte pop-counts + // ADDV B0, V0.8B // sum 8xbyte pop-counts + // UMOV X0, V0.B[0] // copy byte result back to integer reg + SDValue Val = Op.getOperand(0); + SDLoc DL(Op); + EVT VT = Op.getValueType(); + SDValue ZeroVec = DAG.getUNDEF(MVT::v8i8); + + SDValue VecVal; + if (VT == MVT::i32) { + VecVal = DAG.getNode(ISD::BITCAST, DL, MVT::f32, Val); + VecVal = DAG.getTargetInsertSubreg(AArch64::ssub, DL, MVT::v8i8, ZeroVec, + VecVal); + } else { + VecVal = DAG.getNode(ISD::BITCAST, DL, MVT::v8i8, Val); + } + + SDValue CtPop = DAG.getNode(ISD::CTPOP, DL, MVT::v8i8, VecVal); + SDValue UaddLV = DAG.getNode( + ISD::INTRINSIC_WO_CHAIN, DL, MVT::i32, + DAG.getConstant(Intrinsic::aarch64_neon_uaddlv, MVT::i32), CtPop); + + if (VT == MVT::i64) + UaddLV = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, UaddLV); + return UaddLV; +} + +SDValue AArch64TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const { + + if (Op.getValueType().isVector()) + return LowerVSETCC(Op, DAG); + + SDValue LHS = Op.getOperand(0); + SDValue RHS = Op.getOperand(1); + ISD::CondCode CC = cast(Op.getOperand(2))->get(); + SDLoc dl(Op); + + // We chose ZeroOrOneBooleanContents, so use zero and one. + EVT VT = Op.getValueType(); + SDValue TVal = DAG.getConstant(1, VT); + SDValue FVal = DAG.getConstant(0, VT); + + // Handle f128 first, since one possible outcome is a normal integer + // comparison which gets picked up by the next if statement. + if (LHS.getValueType() == MVT::f128) { + softenSetCCOperands(DAG, MVT::f128, LHS, RHS, CC, dl); + + // If softenSetCCOperands returned a scalar, use it. + if (!RHS.getNode()) { + assert(LHS.getValueType() == Op.getValueType() && + "Unexpected setcc expansion!"); + return LHS; + } + } + + if (LHS.getValueType().isInteger()) { + SDValue CCVal; + SDValue Cmp = + getAArch64Cmp(LHS, RHS, ISD::getSetCCInverse(CC, true), CCVal, DAG, dl); + + // Note that we inverted the condition above, so we reverse the order of + // the true and false operands here. This will allow the setcc to be + // matched to a single CSINC instruction. + return DAG.getNode(AArch64ISD::CSEL, dl, VT, FVal, TVal, CCVal, Cmp); + } + + // Now we know we're dealing with FP values. + assert(LHS.getValueType() == MVT::f32 || LHS.getValueType() == MVT::f64); + + // If that fails, we'll need to perform an FCMP + CSEL sequence. Go ahead + // and do the comparison. 
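A minimal sketch of a boolean produced from an FP comparison, which takes the fcmp-plus-CSEL path prepared below; as noted above for the integer case, the inverted operand order lets the common case match a single CSINC. The function is hypothetical.

    // Hypothetical example: fcmp + cset (an alias of CSINC) for a simple
    // ordered comparison; awkward predicates fall through to the
    // two-CSEL form handled afterwards.
    bool not_less_than(double a, double b) { return a >= b; }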
+ SDValue Cmp = emitComparison(LHS, RHS, CC, dl, DAG); + + AArch64CC::CondCode CC1, CC2; + changeFPCCToAArch64CC(CC, CC1, CC2); + if (CC2 == AArch64CC::AL) { + changeFPCCToAArch64CC(ISD::getSetCCInverse(CC, false), CC1, CC2); + SDValue CC1Val = DAG.getConstant(CC1, MVT::i32); + + // Note that we inverted the condition above, so we reverse the order of + // the true and false operands here. This will allow the setcc to be + // matched to a single CSINC instruction. + return DAG.getNode(AArch64ISD::CSEL, dl, VT, FVal, TVal, CC1Val, Cmp); + } else { + // Unfortunately, the mapping of LLVM FP CC's onto AArch64 CC's isn't + // totally clean. Some of them require two CSELs to implement. As is in + // this case, we emit the first CSEL and then emit a second using the output + // of the first as the RHS. We're effectively OR'ing the two CC's together. + + // FIXME: It would be nice if we could match the two CSELs to two CSINCs. + SDValue CC1Val = DAG.getConstant(CC1, MVT::i32); + SDValue CS1 = + DAG.getNode(AArch64ISD::CSEL, dl, VT, TVal, FVal, CC1Val, Cmp); + + SDValue CC2Val = DAG.getConstant(CC2, MVT::i32); + return DAG.getNode(AArch64ISD::CSEL, dl, VT, TVal, CS1, CC2Val, Cmp); + } +} + +/// A SELECT_CC operation is really some kind of max or min if both values being +/// compared are, in some sense, equal to the results in either case. However, +/// it is permissible to compare f32 values and produce directly extended f64 +/// values. +/// +/// Extending the comparison operands would also be allowed, but is less likely +/// to happen in practice since their use is right here. Note that truncate +/// operations would *not* be semantically equivalent. +static bool selectCCOpsAreFMaxCompatible(SDValue Cmp, SDValue Result) { + if (Cmp == Result) + return true; + + ConstantFPSDNode *CCmp = dyn_cast(Cmp); + ConstantFPSDNode *CResult = dyn_cast(Result); + if (CCmp && CResult && Cmp.getValueType() == MVT::f32 && + Result.getValueType() == MVT::f64) { + bool Lossy; + APFloat CmpVal = CCmp->getValueAPF(); + CmpVal.convert(APFloat::IEEEdouble, APFloat::rmNearestTiesToEven, &Lossy); + return CResult->getValueAPF().bitwiseIsEqual(CmpVal); + } + + return Result->getOpcode() == ISD::FP_EXTEND && Result->getOperand(0) == Cmp; +} + +SDValue AArch64TargetLowering::LowerSELECT(SDValue Op, + SelectionDAG &DAG) const { + SDValue CC = Op->getOperand(0); + SDValue TVal = Op->getOperand(1); + SDValue FVal = Op->getOperand(2); + SDLoc DL(Op); + + unsigned Opc = CC.getOpcode(); + // Optimize {s|u}{add|sub|mul}.with.overflow feeding into a select + // instruction. + if (CC.getResNo() == 1 && + (Opc == ISD::SADDO || Opc == ISD::UADDO || Opc == ISD::SSUBO || + Opc == ISD::USUBO || Opc == ISD::SMULO || Opc == ISD::UMULO)) { + // Only lower legal XALUO ops. 
+ if (!DAG.getTargetLoweringInfo().isTypeLegal(CC->getValueType(0))) + return SDValue(); + + AArch64CC::CondCode OFCC; + SDValue Value, Overflow; + std::tie(Value, Overflow) = getAArch64XALUOOp(OFCC, CC.getValue(0), DAG); + SDValue CCVal = DAG.getConstant(OFCC, MVT::i32); + + return DAG.getNode(AArch64ISD::CSEL, DL, Op.getValueType(), TVal, FVal, + CCVal, Overflow); + } + + if (CC.getOpcode() == ISD::SETCC) + return DAG.getSelectCC(DL, CC.getOperand(0), CC.getOperand(1), TVal, FVal, + cast(CC.getOperand(2))->get()); + else + return DAG.getSelectCC(DL, CC, DAG.getConstant(0, CC.getValueType()), TVal, + FVal, ISD::SETNE); +} + +SDValue AArch64TargetLowering::LowerSELECT_CC(SDValue Op, + SelectionDAG &DAG) const { + ISD::CondCode CC = cast(Op.getOperand(4))->get(); + SDValue LHS = Op.getOperand(0); + SDValue RHS = Op.getOperand(1); + SDValue TVal = Op.getOperand(2); + SDValue FVal = Op.getOperand(3); + SDLoc dl(Op); + + // Handle f128 first, because it will result in a comparison of some RTLIB + // call result against zero. + if (LHS.getValueType() == MVT::f128) { + softenSetCCOperands(DAG, MVT::f128, LHS, RHS, CC, dl); + + // If softenSetCCOperands returned a scalar, we need to compare the result + // against zero to select between true and false values. + if (!RHS.getNode()) { + RHS = DAG.getConstant(0, LHS.getValueType()); + CC = ISD::SETNE; + } + } + + // Handle integers first. + if (LHS.getValueType().isInteger()) { + assert((LHS.getValueType() == RHS.getValueType()) && + (LHS.getValueType() == MVT::i32 || LHS.getValueType() == MVT::i64)); + + unsigned Opcode = AArch64ISD::CSEL; + + // If both the TVal and the FVal are constants, see if we can swap them in + // order to for a CSINV or CSINC out of them. + ConstantSDNode *CFVal = dyn_cast(FVal); + ConstantSDNode *CTVal = dyn_cast(TVal); + + if (CTVal && CFVal && CTVal->isAllOnesValue() && CFVal->isNullValue()) { + std::swap(TVal, FVal); + std::swap(CTVal, CFVal); + CC = ISD::getSetCCInverse(CC, true); + } else if (CTVal && CFVal && CTVal->isOne() && CFVal->isNullValue()) { + std::swap(TVal, FVal); + std::swap(CTVal, CFVal); + CC = ISD::getSetCCInverse(CC, true); + } else if (TVal.getOpcode() == ISD::XOR) { + // If TVal is a NOT we want to swap TVal and FVal so that we can match + // with a CSINV rather than a CSEL. + ConstantSDNode *CVal = dyn_cast(TVal.getOperand(1)); + + if (CVal && CVal->isAllOnesValue()) { + std::swap(TVal, FVal); + std::swap(CTVal, CFVal); + CC = ISD::getSetCCInverse(CC, true); + } + } else if (TVal.getOpcode() == ISD::SUB) { + // If TVal is a negation (SUB from 0) we want to swap TVal and FVal so + // that we can match with a CSNEG rather than a CSEL. + ConstantSDNode *CVal = dyn_cast(TVal.getOperand(0)); + + if (CVal && CVal->isNullValue()) { + std::swap(TVal, FVal); + std::swap(CTVal, CFVal); + CC = ISD::getSetCCInverse(CC, true); + } + } else if (CTVal && CFVal) { + const int64_t TrueVal = CTVal->getSExtValue(); + const int64_t FalseVal = CFVal->getSExtValue(); + bool Swap = false; + + // If both TVal and FVal are constants, see if FVal is the + // inverse/negation/increment of TVal and generate a CSINV/CSNEG/CSINC + // instead of a CSEL in that case. + if (TrueVal == ~FalseVal) { + Opcode = AArch64ISD::CSINV; + } else if (TrueVal == -FalseVal) { + Opcode = AArch64ISD::CSNEG; + } else if (TVal.getValueType() == MVT::i32) { + // If our operands are only 32-bit wide, make sure we use 32-bit + // arithmetic for the check whether we can use CSINC. 
This ensures that + // the addition in the check will wrap around properly in case there is + // an overflow (which would not be the case if we do the check with + // 64-bit arithmetic). + const uint32_t TrueVal32 = CTVal->getZExtValue(); + const uint32_t FalseVal32 = CFVal->getZExtValue(); + + if ((TrueVal32 == FalseVal32 + 1) || (TrueVal32 + 1 == FalseVal32)) { + Opcode = AArch64ISD::CSINC; + + if (TrueVal32 > FalseVal32) { + Swap = true; + } + } + // 64-bit check whether we can use CSINC. + } else if ((TrueVal == FalseVal + 1) || (TrueVal + 1 == FalseVal)) { + Opcode = AArch64ISD::CSINC; + + if (TrueVal > FalseVal) { + Swap = true; + } + } + + // Swap TVal and FVal if necessary. + if (Swap) { + std::swap(TVal, FVal); + std::swap(CTVal, CFVal); + CC = ISD::getSetCCInverse(CC, true); + } + + if (Opcode != AArch64ISD::CSEL) { + // Drop FVal since we can get its value by simply inverting/negating + // TVal. + FVal = TVal; + } + } + + SDValue CCVal; + SDValue Cmp = getAArch64Cmp(LHS, RHS, CC, CCVal, DAG, dl); + + EVT VT = Op.getValueType(); + return DAG.getNode(Opcode, dl, VT, TVal, FVal, CCVal, Cmp); + } + + // Now we know we're dealing with FP values. + assert(LHS.getValueType() == MVT::f32 || LHS.getValueType() == MVT::f64); + assert(LHS.getValueType() == RHS.getValueType()); + EVT VT = Op.getValueType(); + + // Try to match this select into a max/min operation, which have dedicated + // opcode in the instruction set. + // FIXME: This is not correct in the presence of NaNs, so we only enable this + // in no-NaNs mode. + if (getTargetMachine().Options.NoNaNsFPMath) { + SDValue MinMaxLHS = TVal, MinMaxRHS = FVal; + if (selectCCOpsAreFMaxCompatible(LHS, MinMaxRHS) && + selectCCOpsAreFMaxCompatible(RHS, MinMaxLHS)) { + CC = ISD::getSetCCSwappedOperands(CC); + std::swap(MinMaxLHS, MinMaxRHS); + } + + if (selectCCOpsAreFMaxCompatible(LHS, MinMaxLHS) && + selectCCOpsAreFMaxCompatible(RHS, MinMaxRHS)) { + switch (CC) { + default: + break; + case ISD::SETGT: + case ISD::SETGE: + case ISD::SETUGT: + case ISD::SETUGE: + case ISD::SETOGT: + case ISD::SETOGE: + return DAG.getNode(AArch64ISD::FMAX, dl, VT, MinMaxLHS, MinMaxRHS); + break; + case ISD::SETLT: + case ISD::SETLE: + case ISD::SETULT: + case ISD::SETULE: + case ISD::SETOLT: + case ISD::SETOLE: + return DAG.getNode(AArch64ISD::FMIN, dl, VT, MinMaxLHS, MinMaxRHS); + break; + } + } + } + + // If that fails, we'll need to perform an FCMP + CSEL sequence. Go ahead + // and do the comparison. + SDValue Cmp = emitComparison(LHS, RHS, CC, dl, DAG); + + // Unfortunately, the mapping of LLVM FP CC's onto AArch64 CC's isn't totally + // clean. Some of them require two CSELs to implement. + AArch64CC::CondCode CC1, CC2; + changeFPCCToAArch64CC(CC, CC1, CC2); + SDValue CC1Val = DAG.getConstant(CC1, MVT::i32); + SDValue CS1 = DAG.getNode(AArch64ISD::CSEL, dl, VT, TVal, FVal, CC1Val, Cmp); + + // If we need a second CSEL, emit it, using the output of the first as the + // RHS. We're effectively OR'ing the two CC's together. + if (CC2 != AArch64CC::AL) { + SDValue CC2Val = DAG.getConstant(CC2, MVT::i32); + return DAG.getNode(AArch64ISD::CSEL, dl, VT, TVal, CS1, CC2Val, Cmp); + } + + // Otherwise, return the output of the first CSEL. + return CS1; +} + +SDValue AArch64TargetLowering::LowerJumpTable(SDValue Op, + SelectionDAG &DAG) const { + // Jump table entries as PC relative offsets. No additional tweaking + // is necessary here. Just get the address of the jump table. 
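A dense switch is the usual source of these jump-table nodes; under the default small code model the table address is built with the ADRP/ADDlow pair emitted below (assumed output shape, label name invented):

    void f0(); void f1(); void f2(); void f3(); void f4(); void f5();
    void dispatch(int x) {
      switch (x) {             // dense switch: typically lowered through a jump table
      case 0: f0(); break;
      case 1: f1(); break;
      case 2: f2(); break;
      case 3: f3(); break;
      case 4: f4(); break;
      case 5: f5(); break;
      }
    }
    //   adrp x9, .LJTI0_0               ; page of the table       (MO_PAGE)
    //   add  x9, x9, :lo12:.LJTI0_0     ; offset within the page  (MO_PAGEOFF | MO_NC)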
+ JumpTableSDNode *JT = cast(Op); + EVT PtrVT = getPointerTy(); + SDLoc DL(Op); + + if (getTargetMachine().getCodeModel() == CodeModel::Large && + !Subtarget->isTargetMachO()) { + const unsigned char MO_NC = AArch64II::MO_NC; + return DAG.getNode( + AArch64ISD::WrapperLarge, DL, PtrVT, + DAG.getTargetJumpTable(JT->getIndex(), PtrVT, AArch64II::MO_G3), + DAG.getTargetJumpTable(JT->getIndex(), PtrVT, AArch64II::MO_G2 | MO_NC), + DAG.getTargetJumpTable(JT->getIndex(), PtrVT, AArch64II::MO_G1 | MO_NC), + DAG.getTargetJumpTable(JT->getIndex(), PtrVT, + AArch64II::MO_G0 | MO_NC)); + } + + SDValue Hi = + DAG.getTargetJumpTable(JT->getIndex(), PtrVT, AArch64II::MO_PAGE); + SDValue Lo = DAG.getTargetJumpTable(JT->getIndex(), PtrVT, + AArch64II::MO_PAGEOFF | AArch64II::MO_NC); + SDValue ADRP = DAG.getNode(AArch64ISD::ADRP, DL, PtrVT, Hi); + return DAG.getNode(AArch64ISD::ADDlow, DL, PtrVT, ADRP, Lo); +} + +SDValue AArch64TargetLowering::LowerConstantPool(SDValue Op, + SelectionDAG &DAG) const { + ConstantPoolSDNode *CP = cast(Op); + EVT PtrVT = getPointerTy(); + SDLoc DL(Op); + + if (getTargetMachine().getCodeModel() == CodeModel::Large) { + // Use the GOT for the large code model on iOS. + if (Subtarget->isTargetMachO()) { + SDValue GotAddr = DAG.getTargetConstantPool( + CP->getConstVal(), PtrVT, CP->getAlignment(), CP->getOffset(), + AArch64II::MO_GOT); + return DAG.getNode(AArch64ISD::LOADgot, DL, PtrVT, GotAddr); + } + + const unsigned char MO_NC = AArch64II::MO_NC; + return DAG.getNode( + AArch64ISD::WrapperLarge, DL, PtrVT, + DAG.getTargetConstantPool(CP->getConstVal(), PtrVT, CP->getAlignment(), + CP->getOffset(), AArch64II::MO_G3), + DAG.getTargetConstantPool(CP->getConstVal(), PtrVT, CP->getAlignment(), + CP->getOffset(), AArch64II::MO_G2 | MO_NC), + DAG.getTargetConstantPool(CP->getConstVal(), PtrVT, CP->getAlignment(), + CP->getOffset(), AArch64II::MO_G1 | MO_NC), + DAG.getTargetConstantPool(CP->getConstVal(), PtrVT, CP->getAlignment(), + CP->getOffset(), AArch64II::MO_G0 | MO_NC)); + } else { + // Use ADRP/ADD or ADRP/LDR for everything else: the small memory model on + // ELF, the only valid one on Darwin. 
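An FP literal that cannot be encoded as an FMOV immediate is the typical client of this path; in the small code model it becomes a page/pageoff pair around the pool entry (sketch, label name invented):

    double pi() { return 3.141592653589793; }
    //   adrp x8, .LCPI0_0
    //   ldr  d0, [x8, :lo12:.LCPI0_0]   ; MO_PAGE / MO_PAGEOFF|MO_NC, as below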
+ SDValue Hi = + DAG.getTargetConstantPool(CP->getConstVal(), PtrVT, CP->getAlignment(), + CP->getOffset(), AArch64II::MO_PAGE); + SDValue Lo = DAG.getTargetConstantPool( + CP->getConstVal(), PtrVT, CP->getAlignment(), CP->getOffset(), + AArch64II::MO_PAGEOFF | AArch64II::MO_NC); + + SDValue ADRP = DAG.getNode(AArch64ISD::ADRP, DL, PtrVT, Hi); + return DAG.getNode(AArch64ISD::ADDlow, DL, PtrVT, ADRP, Lo); + } +} + +SDValue AArch64TargetLowering::LowerBlockAddress(SDValue Op, + SelectionDAG &DAG) const { + const BlockAddress *BA = cast(Op)->getBlockAddress(); + EVT PtrVT = getPointerTy(); + SDLoc DL(Op); + if (getTargetMachine().getCodeModel() == CodeModel::Large && + !Subtarget->isTargetMachO()) { + const unsigned char MO_NC = AArch64II::MO_NC; + return DAG.getNode( + AArch64ISD::WrapperLarge, DL, PtrVT, + DAG.getTargetBlockAddress(BA, PtrVT, 0, AArch64II::MO_G3), + DAG.getTargetBlockAddress(BA, PtrVT, 0, AArch64II::MO_G2 | MO_NC), + DAG.getTargetBlockAddress(BA, PtrVT, 0, AArch64II::MO_G1 | MO_NC), + DAG.getTargetBlockAddress(BA, PtrVT, 0, AArch64II::MO_G0 | MO_NC)); + } else { + SDValue Hi = DAG.getTargetBlockAddress(BA, PtrVT, 0, AArch64II::MO_PAGE); + SDValue Lo = DAG.getTargetBlockAddress(BA, PtrVT, 0, AArch64II::MO_PAGEOFF | + AArch64II::MO_NC); + SDValue ADRP = DAG.getNode(AArch64ISD::ADRP, DL, PtrVT, Hi); + return DAG.getNode(AArch64ISD::ADDlow, DL, PtrVT, ADRP, Lo); + } +} + +SDValue AArch64TargetLowering::LowerDarwin_VASTART(SDValue Op, + SelectionDAG &DAG) const { + AArch64FunctionInfo *FuncInfo = + DAG.getMachineFunction().getInfo(); + + SDLoc DL(Op); + SDValue FR = + DAG.getFrameIndex(FuncInfo->getVarArgsStackIndex(), getPointerTy()); + const Value *SV = cast(Op.getOperand(2))->getValue(); + return DAG.getStore(Op.getOperand(0), DL, FR, Op.getOperand(1), + MachinePointerInfo(SV), false, false, 0); +} + +SDValue AArch64TargetLowering::LowerAAPCS_VASTART(SDValue Op, + SelectionDAG &DAG) const { + // The layout of the va_list struct is specified in the AArch64 Procedure Call + // Standard, section B.3. 
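The offsets stored below correspond to this C-level view of the AAPCS64 va_list (a reference sketch assuming an LP64 target; field names follow the ABI document):

    #include <cstddef>
    struct aapcs64_va_list {
      void *__stack;    // next stacked argument               (offset 0)
      void *__gr_top;   // end of the saved GP register area   (offset 8)
      void *__vr_top;   // end of the saved FP/SIMD save area  (offset 16)
      int   __gr_offs;  // -(GP register bytes left), i.e. -GPRSize  (offset 24)
      int   __vr_offs;  // -(FP/SIMD register bytes left)            (offset 28)
    };
    static_assert(sizeof(aapcs64_va_list) == 32, "matches the 32-byte copy in LowerVACOPY");
    static_assert(offsetof(aapcs64_va_list, __vr_offs) == 28, "matches the stores below");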
+ MachineFunction &MF = DAG.getMachineFunction(); + AArch64FunctionInfo *FuncInfo = MF.getInfo(); + SDLoc DL(Op); + + SDValue Chain = Op.getOperand(0); + SDValue VAList = Op.getOperand(1); + const Value *SV = cast(Op.getOperand(2))->getValue(); + SmallVector MemOps; + + // void *__stack at offset 0 + SDValue Stack = + DAG.getFrameIndex(FuncInfo->getVarArgsStackIndex(), getPointerTy()); + MemOps.push_back(DAG.getStore(Chain, DL, Stack, VAList, + MachinePointerInfo(SV), false, false, 8)); + + // void *__gr_top at offset 8 + int GPRSize = FuncInfo->getVarArgsGPRSize(); + if (GPRSize > 0) { + SDValue GRTop, GRTopAddr; + + GRTopAddr = DAG.getNode(ISD::ADD, DL, getPointerTy(), VAList, + DAG.getConstant(8, getPointerTy())); + + GRTop = DAG.getFrameIndex(FuncInfo->getVarArgsGPRIndex(), getPointerTy()); + GRTop = DAG.getNode(ISD::ADD, DL, getPointerTy(), GRTop, + DAG.getConstant(GPRSize, getPointerTy())); + + MemOps.push_back(DAG.getStore(Chain, DL, GRTop, GRTopAddr, + MachinePointerInfo(SV, 8), false, false, 8)); + } + + // void *__vr_top at offset 16 + int FPRSize = FuncInfo->getVarArgsFPRSize(); + if (FPRSize > 0) { + SDValue VRTop, VRTopAddr; + VRTopAddr = DAG.getNode(ISD::ADD, DL, getPointerTy(), VAList, + DAG.getConstant(16, getPointerTy())); + + VRTop = DAG.getFrameIndex(FuncInfo->getVarArgsFPRIndex(), getPointerTy()); + VRTop = DAG.getNode(ISD::ADD, DL, getPointerTy(), VRTop, + DAG.getConstant(FPRSize, getPointerTy())); + + MemOps.push_back(DAG.getStore(Chain, DL, VRTop, VRTopAddr, + MachinePointerInfo(SV, 16), false, false, 8)); + } + + // int __gr_offs at offset 24 + SDValue GROffsAddr = DAG.getNode(ISD::ADD, DL, getPointerTy(), VAList, + DAG.getConstant(24, getPointerTy())); + MemOps.push_back(DAG.getStore(Chain, DL, DAG.getConstant(-GPRSize, MVT::i32), + GROffsAddr, MachinePointerInfo(SV, 24), false, + false, 4)); + + // int __vr_offs at offset 28 + SDValue VROffsAddr = DAG.getNode(ISD::ADD, DL, getPointerTy(), VAList, + DAG.getConstant(28, getPointerTy())); + MemOps.push_back(DAG.getStore(Chain, DL, DAG.getConstant(-FPRSize, MVT::i32), + VROffsAddr, MachinePointerInfo(SV, 28), false, + false, 4)); + + return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOps); +} + +SDValue AArch64TargetLowering::LowerVASTART(SDValue Op, + SelectionDAG &DAG) const { + return Subtarget->isTargetDarwin() ? LowerDarwin_VASTART(Op, DAG) + : LowerAAPCS_VASTART(Op, DAG); +} + +SDValue AArch64TargetLowering::LowerVACOPY(SDValue Op, + SelectionDAG &DAG) const { + // AAPCS has three pointers and two ints (= 32 bytes), Darwin has single + // pointer. + unsigned VaListSize = Subtarget->isTargetDarwin() ? 
8 : 32; + const Value *DestSV = cast(Op.getOperand(3))->getValue(); + const Value *SrcSV = cast(Op.getOperand(4))->getValue(); + + return DAG.getMemcpy(Op.getOperand(0), SDLoc(Op), Op.getOperand(1), + Op.getOperand(2), DAG.getConstant(VaListSize, MVT::i32), + 8, false, false, MachinePointerInfo(DestSV), + MachinePointerInfo(SrcSV)); +} + +SDValue AArch64TargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG) const { + assert(Subtarget->isTargetDarwin() && + "automatic va_arg instruction only works on Darwin"); + + const Value *V = cast(Op.getOperand(2))->getValue(); + EVT VT = Op.getValueType(); + SDLoc DL(Op); + SDValue Chain = Op.getOperand(0); + SDValue Addr = Op.getOperand(1); + unsigned Align = Op.getConstantOperandVal(3); + + SDValue VAList = DAG.getLoad(getPointerTy(), DL, Chain, Addr, + MachinePointerInfo(V), false, false, false, 0); + Chain = VAList.getValue(1); + + if (Align > 8) { + assert(((Align & (Align - 1)) == 0) && "Expected Align to be a power of 2"); + VAList = DAG.getNode(ISD::ADD, DL, getPointerTy(), VAList, + DAG.getConstant(Align - 1, getPointerTy())); + VAList = DAG.getNode(ISD::AND, DL, getPointerTy(), VAList, + DAG.getConstant(-(int64_t)Align, getPointerTy())); + } + + Type *ArgTy = VT.getTypeForEVT(*DAG.getContext()); + uint64_t ArgSize = getDataLayout()->getTypeAllocSize(ArgTy); + + // Scalar integer and FP values smaller than 64 bits are implicitly extended + // up to 64 bits. At the very least, we have to increase the striding of the + // vaargs list to match this, and for FP values we need to introduce + // FP_ROUND nodes as well. + if (VT.isInteger() && !VT.isVector()) + ArgSize = 8; + bool NeedFPTrunc = false; + if (VT.isFloatingPoint() && !VT.isVector() && VT != MVT::f64) { + ArgSize = 8; + NeedFPTrunc = true; + } + + // Increment the pointer, VAList, to the next vaarg + SDValue VANext = DAG.getNode(ISD::ADD, DL, getPointerTy(), VAList, + DAG.getConstant(ArgSize, getPointerTy())); + // Store the incremented VAList to the legalized pointer + SDValue APStore = DAG.getStore(Chain, DL, VANext, Addr, MachinePointerInfo(V), + false, false, 0); + + // Load the actual argument out of the pointer VAList + if (NeedFPTrunc) { + // Load the value as an f64. + SDValue WideFP = DAG.getLoad(MVT::f64, DL, APStore, VAList, + MachinePointerInfo(), false, false, false, 0); + // Round the value down to an f32. + SDValue NarrowFP = DAG.getNode(ISD::FP_ROUND, DL, VT, WideFP.getValue(0), + DAG.getIntPtrConstant(1)); + SDValue Ops[] = { NarrowFP, WideFP.getValue(1) }; + // Merge the rounded value with the chain output of the load. + return DAG.getMergeValues(Ops, DL); + } + + return DAG.getLoad(VT, DL, APStore, VAList, MachinePointerInfo(), false, + false, false, 0); +} + +SDValue AArch64TargetLowering::LowerFRAMEADDR(SDValue Op, + SelectionDAG &DAG) const { + MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo(); + MFI->setFrameAddressIsTaken(true); + + EVT VT = Op.getValueType(); + SDLoc DL(Op); + unsigned Depth = cast(Op.getOperand(0))->getZExtValue(); + SDValue FrameAddr = + DAG.getCopyFromReg(DAG.getEntryNode(), DL, AArch64::FP, VT); + while (Depth--) + FrameAddr = DAG.getLoad(VT, DL, DAG.getEntryNode(), FrameAddr, + MachinePointerInfo(), false, false, false, 0); + return FrameAddr; +} + +// FIXME? Maybe this could be a TableGen attribute on some registers and +// this table could be generated automatically from RegInfo. 
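LowerFRAMEADDR above and LowerRETURNADDR below serve the usual builtins: depth 0 is a plain copy out of FP/LR, and deeper frames walk the saved-FP chain with loads (LR being found at offset 8 from the saved frame pointer, as the code assumes). A small usage example:

    void *where_am_i[2];
    void capture() {
      where_am_i[0] = __builtin_frame_address(0);   // CopyFromReg of x29 (AArch64::FP)
      where_am_i[1] = __builtin_return_address(0);  // implicit live-in copy of x30 (LR)
    }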
+unsigned AArch64TargetLowering::getRegisterByName(const char* RegName,
+                                                  EVT VT) const {
+  unsigned Reg = StringSwitch<unsigned>(RegName)
+                     .Case("sp", AArch64::SP)
+                     .Default(0);
+  if (Reg)
+    return Reg;
+  report_fatal_error("Invalid register name global variable");
+}
+
+SDValue AArch64TargetLowering::LowerRETURNADDR(SDValue Op,
+                                               SelectionDAG &DAG) const {
+  MachineFunction &MF = DAG.getMachineFunction();
+  MachineFrameInfo *MFI = MF.getFrameInfo();
+  MFI->setReturnAddressIsTaken(true);
+
+  EVT VT = Op.getValueType();
+  SDLoc DL(Op);
+  unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
+  if (Depth) {
+    SDValue FrameAddr = LowerFRAMEADDR(Op, DAG);
+    SDValue Offset = DAG.getConstant(8, getPointerTy());
+    return DAG.getLoad(VT, DL, DAG.getEntryNode(),
+                       DAG.getNode(ISD::ADD, DL, VT, FrameAddr, Offset),
+                       MachinePointerInfo(), false, false, false, 0);
+  }
+
+  // Return LR, which contains the return address. Mark it an implicit live-in.
+  unsigned Reg = MF.addLiveIn(AArch64::LR, &AArch64::GPR64RegClass);
+  return DAG.getCopyFromReg(DAG.getEntryNode(), DL, Reg, VT);
+}
+
+/// LowerShiftRightParts - Lower SRA_PARTS, which returns two
+/// i64 values and takes a 2 x i64 value to shift plus a shift amount.
+SDValue AArch64TargetLowering::LowerShiftRightParts(SDValue Op,
+                                                    SelectionDAG &DAG) const {
+  assert(Op.getNumOperands() == 3 && "Not a double-shift!");
+  EVT VT = Op.getValueType();
+  unsigned VTBits = VT.getSizeInBits();
+  SDLoc dl(Op);
+  SDValue ShOpLo = Op.getOperand(0);
+  SDValue ShOpHi = Op.getOperand(1);
+  SDValue ShAmt = Op.getOperand(2);
+  SDValue ARMcc;
+  unsigned Opc = (Op.getOpcode() == ISD::SRA_PARTS) ? ISD::SRA : ISD::SRL;
+
+  assert(Op.getOpcode() == ISD::SRA_PARTS || Op.getOpcode() == ISD::SRL_PARTS);
+
+  SDValue RevShAmt = DAG.getNode(ISD::SUB, dl, MVT::i64,
+                                 DAG.getConstant(VTBits, MVT::i64), ShAmt);
+  SDValue Tmp1 = DAG.getNode(ISD::SRL, dl, VT, ShOpLo, ShAmt);
+  SDValue ExtraShAmt = DAG.getNode(ISD::SUB, dl, MVT::i64, ShAmt,
+                                   DAG.getConstant(VTBits, MVT::i64));
+  SDValue Tmp2 = DAG.getNode(ISD::SHL, dl, VT, ShOpHi, RevShAmt);
+
+  SDValue Cmp = emitComparison(ExtraShAmt, DAG.getConstant(0, MVT::i64),
+                               ISD::SETGE, dl, DAG);
+  SDValue CCVal = DAG.getConstant(AArch64CC::GE, MVT::i32);
+
+  SDValue FalseValLo = DAG.getNode(ISD::OR, dl, VT, Tmp1, Tmp2);
+  SDValue TrueValLo = DAG.getNode(Opc, dl, VT, ShOpHi, ExtraShAmt);
+  SDValue Lo =
+      DAG.getNode(AArch64ISD::CSEL, dl, VT, TrueValLo, FalseValLo, CCVal, Cmp);
+
+  // AArch64 shifts larger than the register width are wrapped rather than
+  // clamped, so we can't just emit "hi >> x".
+  SDValue FalseValHi = DAG.getNode(Opc, dl, VT, ShOpHi, ShAmt);
+  SDValue TrueValHi = Opc == ISD::SRA
+                          ? DAG.getNode(Opc, dl, VT, ShOpHi,
+                                        DAG.getConstant(VTBits - 1, MVT::i64))
+                          : DAG.getConstant(0, VT);
+  SDValue Hi =
+      DAG.getNode(AArch64ISD::CSEL, dl, VT, TrueValHi, FalseValHi, CCVal, Cmp);
+
+  SDValue Ops[2] = { Lo, Hi };
+  return DAG.getMergeValues(Ops, dl);
+}
+
+/// LowerShiftLeftParts - Lower SHL_PARTS, which returns two
+/// i64 values and takes a 2 x i64 value to shift plus a shift amount.
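As a portable model of what the two parts lowerings compute (LowerShiftRightParts above, LowerShiftLeftParts below), the right-shift case looks like this; the CSELs select between the small-shift and large-shift results, and the sketch assumes the usual arithmetic behaviour of signed >> on the host:

    #include <cstdint>
    // i128 arithmetic shift right, given as {lo, hi} halves.
    void ashr128(uint64_t lo, int64_t hi, unsigned amt,
                 uint64_t &outLo, int64_t &outHi) {
      amt &= 127;                       // shift amounts wrap rather than clamp
      if (amt == 0) { outLo = lo; outHi = hi; return; }
      if (amt < 64) {
        // FalseValLo: OR of the two partial shifts; FalseValHi: plain shift.
        outLo = (lo >> amt) | ((uint64_t)hi << (64 - amt));
        outHi = hi >> amt;
      } else {
        // TrueValLo: shift the high half by (amt - 64); TrueValHi: sign-fill.
        outLo = (uint64_t)(hi >> (amt - 64));
        outHi = hi >> 63;
      }
    }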
+SDValue AArch64TargetLowering::LowerShiftLeftParts(SDValue Op, + SelectionDAG &DAG) const { + assert(Op.getNumOperands() == 3 && "Not a double-shift!"); + EVT VT = Op.getValueType(); + unsigned VTBits = VT.getSizeInBits(); + SDLoc dl(Op); + SDValue ShOpLo = Op.getOperand(0); + SDValue ShOpHi = Op.getOperand(1); + SDValue ShAmt = Op.getOperand(2); + SDValue ARMcc; + + assert(Op.getOpcode() == ISD::SHL_PARTS); + SDValue RevShAmt = DAG.getNode(ISD::SUB, dl, MVT::i64, + DAG.getConstant(VTBits, MVT::i64), ShAmt); + SDValue Tmp1 = DAG.getNode(ISD::SRL, dl, VT, ShOpLo, RevShAmt); + SDValue ExtraShAmt = DAG.getNode(ISD::SUB, dl, MVT::i64, ShAmt, + DAG.getConstant(VTBits, MVT::i64)); + SDValue Tmp2 = DAG.getNode(ISD::SHL, dl, VT, ShOpHi, ShAmt); + SDValue Tmp3 = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, ExtraShAmt); + + SDValue FalseVal = DAG.getNode(ISD::OR, dl, VT, Tmp1, Tmp2); + + SDValue Cmp = emitComparison(ExtraShAmt, DAG.getConstant(0, MVT::i64), + ISD::SETGE, dl, DAG); + SDValue CCVal = DAG.getConstant(AArch64CC::GE, MVT::i32); + SDValue Hi = + DAG.getNode(AArch64ISD::CSEL, dl, VT, Tmp3, FalseVal, CCVal, Cmp); + + // AArch64 shifts of larger than register sizes are wrapped rather than + // clamped, so we can't just emit "lo << a" if a is too big. + SDValue TrueValLo = DAG.getConstant(0, VT); + SDValue FalseValLo = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, ShAmt); + SDValue Lo = + DAG.getNode(AArch64ISD::CSEL, dl, VT, TrueValLo, FalseValLo, CCVal, Cmp); + + SDValue Ops[2] = { Lo, Hi }; + return DAG.getMergeValues(Ops, dl); +} + +bool AArch64TargetLowering::isOffsetFoldingLegal( + const GlobalAddressSDNode *GA) const { + // The AArch64 target doesn't support folding offsets into global addresses. + return false; +} + +bool AArch64TargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT) const { + // We can materialize #0.0 as fmov $Rd, XZR for 64-bit and 32-bit cases. + // FIXME: We should be able to handle f128 as well with a clever lowering. + if (Imm.isPosZero() && (VT == MVT::f64 || VT == MVT::f32)) + return true; + + if (VT == MVT::f64) + return AArch64_AM::getFP64Imm(Imm) != -1; + else if (VT == MVT::f32) + return AArch64_AM::getFP32Imm(Imm) != -1; + return false; +} + +//===----------------------------------------------------------------------===// +// AArch64 Optimization Hooks +//===----------------------------------------------------------------------===// + +//===----------------------------------------------------------------------===// +// AArch64 Inline Assembly Support +//===----------------------------------------------------------------------===// + +// Table of Constraints +// TODO: This is the current set of constraints supported by ARM for the +// compiler, not all of them may make sense, e.g. S may be difficult to support. 
+// +// r - A general register +// w - An FP/SIMD register of some size in the range v0-v31 +// x - An FP/SIMD register of some size in the range v0-v15 +// I - Constant that can be used with an ADD instruction +// J - Constant that can be used with a SUB instruction +// K - Constant that can be used with a 32-bit logical instruction +// L - Constant that can be used with a 64-bit logical instruction +// M - Constant that can be used as a 32-bit MOV immediate +// N - Constant that can be used as a 64-bit MOV immediate +// Q - A memory reference with base register and no offset +// S - A symbolic address +// Y - Floating point constant zero +// Z - Integer constant zero +// +// Note that general register operands will be output using their 64-bit x +// register name, whatever the size of the variable, unless the asm operand +// is prefixed by the %w modifier. Floating-point and SIMD register operands +// will be output with the v prefix unless prefixed by the %b, %h, %s, %d or +// %q modifier. + +/// getConstraintType - Given a constraint letter, return the type of +/// constraint it is for this target. +AArch64TargetLowering::ConstraintType +AArch64TargetLowering::getConstraintType(const std::string &Constraint) const { + if (Constraint.size() == 1) { + switch (Constraint[0]) { + default: + break; + case 'z': + return C_Other; + case 'x': + case 'w': + return C_RegisterClass; + // An address with a single base register. Due to the way we + // currently handle addresses it is the same as 'r'. + case 'Q': + return C_Memory; + } + } + return TargetLowering::getConstraintType(Constraint); +} + +/// Examine constraint type and operand type and determine a weight value. +/// This object must already have been set up with the operand type +/// and the current alternative constraint selected. +TargetLowering::ConstraintWeight +AArch64TargetLowering::getSingleConstraintMatchWeight( + AsmOperandInfo &info, const char *constraint) const { + ConstraintWeight weight = CW_Invalid; + Value *CallOperandVal = info.CallOperandVal; + // If we don't have a value, we can't do a match, + // but allow it at the lowest weight. + if (!CallOperandVal) + return CW_Default; + Type *type = CallOperandVal->getType(); + // Look at the constraint type. + switch (*constraint) { + default: + weight = TargetLowering::getSingleConstraintMatchWeight(info, constraint); + break; + case 'x': + case 'w': + if (type->isFloatingPointTy() || type->isVectorTy()) + weight = CW_Register; + break; + case 'z': + weight = CW_Constant; + break; + } + return weight; +} + +std::pair +AArch64TargetLowering::getRegForInlineAsmConstraint( + const std::string &Constraint, MVT VT) const { + if (Constraint.size() == 1) { + switch (Constraint[0]) { + case 'r': + if (VT.getSizeInBits() == 64) + return std::make_pair(0U, &AArch64::GPR64commonRegClass); + return std::make_pair(0U, &AArch64::GPR32commonRegClass); + case 'w': + if (VT == MVT::f32) + return std::make_pair(0U, &AArch64::FPR32RegClass); + if (VT.getSizeInBits() == 64) + return std::make_pair(0U, &AArch64::FPR64RegClass); + if (VT.getSizeInBits() == 128) + return std::make_pair(0U, &AArch64::FPR128RegClass); + break; + // The instructions that this constraint is designed for can + // only take 128-bit registers so just use that regclass. 
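Usage-wise, these single-letter classes are what GNU-style inline asm constraints select, and they map onto the register classes chosen in getRegForInlineAsmConstraint here. A small 'r' versus 'w' example (only meaningful when compiling for AArch64, hence the guard; assumed, not taken from this patch):

    #if defined(__aarch64__)
    double twice(double x) {
      double r;
      // 'w' places operands in FP/SIMD registers; %d prints the d-form name.
      asm("fadd %d0, %d1, %d1" : "=w"(r) : "w"(x));
      return r;
    }
    long pass(long x) {
      long r;
      asm("mov %0, %1" : "=r"(r) : "r"(x));   // 'r': general register, x-form by default
      return r;
    }
    #endif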
+ case 'x': + if (VT.getSizeInBits() == 128) + return std::make_pair(0U, &AArch64::FPR128_loRegClass); + break; + } + } + if (StringRef("{cc}").equals_lower(Constraint)) + return std::make_pair(unsigned(AArch64::NZCV), &AArch64::CCRRegClass); + + // Use the default implementation in TargetLowering to convert the register + // constraint into a member of a register class. + std::pair Res; + Res = TargetLowering::getRegForInlineAsmConstraint(Constraint, VT); + + // Not found as a standard register? + if (!Res.second) { + unsigned Size = Constraint.size(); + if ((Size == 4 || Size == 5) && Constraint[0] == '{' && + tolower(Constraint[1]) == 'v' && Constraint[Size - 1] == '}') { + const std::string Reg = + std::string(&Constraint[2], &Constraint[Size - 1]); + int RegNo = atoi(Reg.c_str()); + if (RegNo >= 0 && RegNo <= 31) { + // v0 - v31 are aliases of q0 - q31. + // By default we'll emit v0-v31 for this unless there's a modifier where + // we'll emit the correct register as well. + Res.first = AArch64::FPR128RegClass.getRegister(RegNo); + Res.second = &AArch64::FPR128RegClass; + } + } + } + + return Res; +} + +/// LowerAsmOperandForConstraint - Lower the specified operand into the Ops +/// vector. If it is invalid, don't add anything to Ops. +void AArch64TargetLowering::LowerAsmOperandForConstraint( + SDValue Op, std::string &Constraint, std::vector &Ops, + SelectionDAG &DAG) const { + SDValue Result; + + // Currently only support length 1 constraints. + if (Constraint.length() != 1) + return; + + char ConstraintLetter = Constraint[0]; + switch (ConstraintLetter) { + default: + break; + + // This set of constraints deal with valid constants for various instructions. + // Validate and return a target constant for them if we can. + case 'z': { + // 'z' maps to xzr or wzr so it needs an input of 0. + ConstantSDNode *C = dyn_cast(Op); + if (!C || C->getZExtValue() != 0) + return; + + if (Op.getValueType() == MVT::i64) + Result = DAG.getRegister(AArch64::XZR, MVT::i64); + else + Result = DAG.getRegister(AArch64::WZR, MVT::i32); + break; + } + + case 'I': + case 'J': + case 'K': + case 'L': + case 'M': + case 'N': + ConstantSDNode *C = dyn_cast(Op); + if (!C) + return; + + // Grab the value and do some validation. + uint64_t CVal = C->getZExtValue(); + switch (ConstraintLetter) { + // The I constraint applies only to simple ADD or SUB immediate operands: + // i.e. 0 to 4095 with optional shift by 12 + // The J constraint applies only to ADD or SUB immediates that would be + // valid when negated, i.e. if [an add pattern] were to be output as a SUB + // instruction [or vice versa], in other words -1 to -4095 with optional + // left shift by 12. + case 'I': + if (isUInt<12>(CVal) || isShiftedUInt<12, 12>(CVal)) + break; + return; + case 'J': { + uint64_t NVal = -C->getSExtValue(); + if (isUInt<12>(NVal) || isShiftedUInt<12, 12>(NVal)) + break; + return; + } + // The K and L constraints apply *only* to logical immediates, including + // what used to be the MOVI alias for ORR (though the MOVI alias has now + // been removed and MOV should be used). So these constraints have to + // distinguish between bit patterns that are valid 32-bit or 64-bit + // "bitmask immediates": for example 0xaaaaaaaa is a valid bimm32 (K), but + // not a valid bimm64 (L) where 0xaaaaaaaaaaaaaaaa would be valid, and vice + // versa. 
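For the immediate constraints, a hedged usage sketch ('I' and 'K' shown; exact assembler spelling of the substituted immediates may vary by toolchain):

    #if defined(__aarch64__)
    unsigned add_imm(unsigned x) {
      unsigned r;
      asm("add %w0, %w1, %2" : "=r"(r) : "r"(x), "I"(4095));         // 'I': 0..4095, optionally <<12
      return r;
    }
    unsigned mask_alternating(unsigned x) {
      unsigned r;
      asm("and %w0, %w1, %2" : "=r"(r) : "r"(x), "K"(0xaaaaaaaaU));  // 'K': valid bimm32, not bimm64
      return r;
    }
    #endif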
+ case 'K': + if (AArch64_AM::isLogicalImmediate(CVal, 32)) + break; + return; + case 'L': + if (AArch64_AM::isLogicalImmediate(CVal, 64)) + break; + return; + // The M and N constraints are a superset of K and L respectively, for use + // with the MOV (immediate) alias. As well as the logical immediates they + // also match 32 or 64-bit immediates that can be loaded either using a + // *single* MOVZ or MOVN , such as 32-bit 0x12340000, 0x00001234, 0xffffedca + // (M) or 64-bit 0x1234000000000000 (N) etc. + // As a note some of this code is liberally stolen from the asm parser. + case 'M': { + if (!isUInt<32>(CVal)) + return; + if (AArch64_AM::isLogicalImmediate(CVal, 32)) + break; + if ((CVal & 0xFFFF) == CVal) + break; + if ((CVal & 0xFFFF0000ULL) == CVal) + break; + uint64_t NCVal = ~(uint32_t)CVal; + if ((NCVal & 0xFFFFULL) == NCVal) + break; + if ((NCVal & 0xFFFF0000ULL) == NCVal) + break; + return; + } + case 'N': { + if (AArch64_AM::isLogicalImmediate(CVal, 64)) + break; + if ((CVal & 0xFFFFULL) == CVal) + break; + if ((CVal & 0xFFFF0000ULL) == CVal) + break; + if ((CVal & 0xFFFF00000000ULL) == CVal) + break; + if ((CVal & 0xFFFF000000000000ULL) == CVal) + break; + uint64_t NCVal = ~CVal; + if ((NCVal & 0xFFFFULL) == NCVal) + break; + if ((NCVal & 0xFFFF0000ULL) == NCVal) + break; + if ((NCVal & 0xFFFF00000000ULL) == NCVal) + break; + if ((NCVal & 0xFFFF000000000000ULL) == NCVal) + break; + return; + } + default: + return; + } + + // All assembler immediates are 64-bit integers. + Result = DAG.getTargetConstant(CVal, MVT::i64); + break; + } + + if (Result.getNode()) { + Ops.push_back(Result); + return; + } + + return TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG); +} + +//===----------------------------------------------------------------------===// +// AArch64 Advanced SIMD Support +//===----------------------------------------------------------------------===// + +/// WidenVector - Given a value in the V64 register class, produce the +/// equivalent value in the V128 register class. +static SDValue WidenVector(SDValue V64Reg, SelectionDAG &DAG) { + EVT VT = V64Reg.getValueType(); + unsigned NarrowSize = VT.getVectorNumElements(); + MVT EltTy = VT.getVectorElementType().getSimpleVT(); + MVT WideTy = MVT::getVectorVT(EltTy, 2 * NarrowSize); + SDLoc DL(V64Reg); + + return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, WideTy, DAG.getUNDEF(WideTy), + V64Reg, DAG.getConstant(0, MVT::i32)); +} + +/// getExtFactor - Determine the adjustment factor for the position when +/// generating an "extract from vector registers" instruction. +static unsigned getExtFactor(SDValue &V) { + EVT EltType = V.getValueType().getVectorElementType(); + return EltType.getSizeInBits() / 8; +} + +/// NarrowVector - Given a value in the V128 register class, produce the +/// equivalent value in the V64 register class. +static SDValue NarrowVector(SDValue V128Reg, SelectionDAG &DAG) { + EVT VT = V128Reg.getValueType(); + unsigned WideSize = VT.getVectorNumElements(); + MVT EltTy = VT.getVectorElementType().getSimpleVT(); + MVT NarrowTy = MVT::getVectorVT(EltTy, WideSize / 2); + SDLoc DL(V128Reg); + + return DAG.getTargetExtractSubreg(AArch64::dsub, DL, NarrowTy, V128Reg); +} + +// Gather data to see if the operation can be modelled as a +// shuffle in combination with VEXTs. 
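The BUILD_VECTOR-of-extracts input that ReconstructShuffle (below) targets typically originates from element-wise rearrangements written by hand, for example with the GCC/clang vector extension; the mask shown is roughly what the rebuild produces:

    typedef int v4i32 __attribute__((vector_size(16)));
    v4i32 pick(v4i32 a, v4i32 b) {
      v4i32 r;
      r[0] = a[1]; r[1] = a[2];   // extracts from source vector 0
      r[2] = b[0]; r[3] = b[3];   // extracts from source vector 1
      return r;                   // reconstructed as a two-source shuffle <1,2,4,7>
    }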
+SDValue AArch64TargetLowering::ReconstructShuffle(SDValue Op, + SelectionDAG &DAG) const { + SDLoc dl(Op); + EVT VT = Op.getValueType(); + unsigned NumElts = VT.getVectorNumElements(); + + SmallVector SourceVecs; + SmallVector MinElts; + SmallVector MaxElts; + + for (unsigned i = 0; i < NumElts; ++i) { + SDValue V = Op.getOperand(i); + if (V.getOpcode() == ISD::UNDEF) + continue; + else if (V.getOpcode() != ISD::EXTRACT_VECTOR_ELT) { + // A shuffle can only come from building a vector from various + // elements of other vectors. + return SDValue(); + } + + // Record this extraction against the appropriate vector if possible... + SDValue SourceVec = V.getOperand(0); + unsigned EltNo = cast(V.getOperand(1))->getZExtValue(); + bool FoundSource = false; + for (unsigned j = 0; j < SourceVecs.size(); ++j) { + if (SourceVecs[j] == SourceVec) { + if (MinElts[j] > EltNo) + MinElts[j] = EltNo; + if (MaxElts[j] < EltNo) + MaxElts[j] = EltNo; + FoundSource = true; + break; + } + } + + // Or record a new source if not... + if (!FoundSource) { + SourceVecs.push_back(SourceVec); + MinElts.push_back(EltNo); + MaxElts.push_back(EltNo); + } + } + + // Currently only do something sane when at most two source vectors + // involved. + if (SourceVecs.size() > 2) + return SDValue(); + + SDValue ShuffleSrcs[2] = { DAG.getUNDEF(VT), DAG.getUNDEF(VT) }; + int VEXTOffsets[2] = { 0, 0 }; + + // This loop extracts the usage patterns of the source vectors + // and prepares appropriate SDValues for a shuffle if possible. + for (unsigned i = 0; i < SourceVecs.size(); ++i) { + if (SourceVecs[i].getValueType() == VT) { + // No VEXT necessary + ShuffleSrcs[i] = SourceVecs[i]; + VEXTOffsets[i] = 0; + continue; + } else if (SourceVecs[i].getValueType().getVectorNumElements() < NumElts) { + // We can pad out the smaller vector for free, so if it's part of a + // shuffle... + ShuffleSrcs[i] = DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, SourceVecs[i], + DAG.getUNDEF(SourceVecs[i].getValueType())); + continue; + } + + // Don't attempt to extract subvectors from BUILD_VECTOR sources + // that expand or trunc the original value. + // TODO: We can try to bitcast and ANY_EXTEND the result but + // we need to consider the cost of vector ANY_EXTEND, and the + // legality of all the types. + if (SourceVecs[i].getValueType().getVectorElementType() != + VT.getVectorElementType()) + return SDValue(); + + // Since only 64-bit and 128-bit vectors are legal on ARM and + // we've eliminated the other cases... 
+ assert(SourceVecs[i].getValueType().getVectorNumElements() == 2 * NumElts && + "unexpected vector sizes in ReconstructShuffle"); + + if (MaxElts[i] - MinElts[i] >= NumElts) { + // Span too large for a VEXT to cope + return SDValue(); + } + + if (MinElts[i] >= NumElts) { + // The extraction can just take the second half + VEXTOffsets[i] = NumElts; + ShuffleSrcs[i] = + DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, SourceVecs[i], + DAG.getIntPtrConstant(NumElts)); + } else if (MaxElts[i] < NumElts) { + // The extraction can just take the first half + VEXTOffsets[i] = 0; + ShuffleSrcs[i] = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, + SourceVecs[i], DAG.getIntPtrConstant(0)); + } else { + // An actual VEXT is needed + VEXTOffsets[i] = MinElts[i]; + SDValue VEXTSrc1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, + SourceVecs[i], DAG.getIntPtrConstant(0)); + SDValue VEXTSrc2 = + DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, SourceVecs[i], + DAG.getIntPtrConstant(NumElts)); + unsigned Imm = VEXTOffsets[i] * getExtFactor(VEXTSrc1); + ShuffleSrcs[i] = DAG.getNode(AArch64ISD::EXT, dl, VT, VEXTSrc1, VEXTSrc2, + DAG.getConstant(Imm, MVT::i32)); + } + } + + SmallVector Mask; + + for (unsigned i = 0; i < NumElts; ++i) { + SDValue Entry = Op.getOperand(i); + if (Entry.getOpcode() == ISD::UNDEF) { + Mask.push_back(-1); + continue; + } + + SDValue ExtractVec = Entry.getOperand(0); + int ExtractElt = + cast(Op.getOperand(i).getOperand(1))->getSExtValue(); + if (ExtractVec == SourceVecs[0]) { + Mask.push_back(ExtractElt - VEXTOffsets[0]); + } else { + Mask.push_back(ExtractElt + NumElts - VEXTOffsets[1]); + } + } + + // Final check before we try to produce nonsense... + if (isShuffleMaskLegal(Mask, VT)) + return DAG.getVectorShuffle(VT, dl, ShuffleSrcs[0], ShuffleSrcs[1], + &Mask[0]); + + return SDValue(); +} + +// check if an EXT instruction can handle the shuffle mask when the +// vector sources of the shuffle are the same. +static bool isSingletonEXTMask(ArrayRef M, EVT VT, unsigned &Imm) { + unsigned NumElts = VT.getVectorNumElements(); + + // Assume that the first shuffle index is not UNDEF. Fail if it is. + if (M[0] < 0) + return false; + + Imm = M[0]; + + // If this is a VEXT shuffle, the immediate value is the index of the first + // element. The other shuffle indices must be the successive elements after + // the first one. + unsigned ExpectedElt = Imm; + for (unsigned i = 1; i < NumElts; ++i) { + // Increment the expected index. If it wraps around, just follow it + // back to index zero and keep going. + ++ExpectedElt; + if (ExpectedElt == NumElts) + ExpectedElt = 0; + + if (M[i] < 0) + continue; // ignore UNDEF indices + if (ExpectedElt != static_cast(M[i])) + return false; + } + + return true; +} + +// check if an EXT instruction can handle the shuffle mask when the +// vector sources of the shuffle are different. +static bool isEXTMask(ArrayRef M, EVT VT, bool &ReverseEXT, + unsigned &Imm) { + // Look for the first non-undef element. + const int *FirstRealElt = std::find_if(M.begin(), M.end(), + [](int Elt) {return Elt >= 0;}); + + // Benefit form APInt to handle overflow when calculating expected element. + unsigned NumElts = VT.getVectorNumElements(); + unsigned MaskBits = APInt(32, NumElts * 2).logBase2(); + APInt ExpectedElt = APInt(MaskBits, *FirstRealElt + 1); + // The following shuffle indices must be the successive elements after the + // first real element. 
+ const int *FirstWrongElt = std::find_if(FirstRealElt + 1, M.end(), + [&](int Elt) {return Elt != ExpectedElt++ && Elt != -1;}); + if (FirstWrongElt != M.end()) + return false; + + // The index of an EXT is the first element if it is not UNDEF. + // Watch out for the beginning UNDEFs. The EXT index should be the expected + // value of the first element. E.g. + // <-1, -1, 3, ...> is treated as <1, 2, 3, ...>. + // <-1, -1, 0, 1, ...> is treated as <2*NumElts-2, 2*NumElts-1, 0, 1, ...>. + // ExpectedElt is the last mask index plus 1. + Imm = ExpectedElt.getZExtValue(); + + // There are two difference cases requiring to reverse input vectors. + // For example, for vector <4 x i32> we have the following cases, + // Case 1: shufflevector(<4 x i32>,<4 x i32>,<-1, -1, -1, 0>) + // Case 2: shufflevector(<4 x i32>,<4 x i32>,<-1, -1, 7, 0>) + // For both cases, we finally use mask <5, 6, 7, 0>, which requires + // to reverse two input vectors. + if (Imm < NumElts) + ReverseEXT = true; + else + Imm -= NumElts; + + return true; +} + +/// isREVMask - Check if a vector shuffle corresponds to a REV +/// instruction with the specified blocksize. (The order of the elements +/// within each block of the vector is reversed.) +static bool isREVMask(ArrayRef M, EVT VT, unsigned BlockSize) { + assert((BlockSize == 16 || BlockSize == 32 || BlockSize == 64) && + "Only possible block sizes for REV are: 16, 32, 64"); + + unsigned EltSz = VT.getVectorElementType().getSizeInBits(); + if (EltSz == 64) + return false; + + unsigned NumElts = VT.getVectorNumElements(); + unsigned BlockElts = M[0] + 1; + // If the first shuffle index is UNDEF, be optimistic. + if (M[0] < 0) + BlockElts = BlockSize / EltSz; + + if (BlockSize <= EltSz || BlockSize != BlockElts * EltSz) + return false; + + for (unsigned i = 0; i < NumElts; ++i) { + if (M[i] < 0) + continue; // ignore UNDEF indices + if ((unsigned)M[i] != (i - i % BlockElts) + (BlockElts - 1 - i % BlockElts)) + return false; + } + + return true; +} + +static bool isZIPMask(ArrayRef M, EVT VT, unsigned &WhichResult) { + unsigned NumElts = VT.getVectorNumElements(); + WhichResult = (M[0] == 0 ? 0 : 1); + unsigned Idx = WhichResult * NumElts / 2; + for (unsigned i = 0; i != NumElts; i += 2) { + if ((M[i] >= 0 && (unsigned)M[i] != Idx) || + (M[i + 1] >= 0 && (unsigned)M[i + 1] != Idx + NumElts)) + return false; + Idx += 1; + } + + return true; +} + +static bool isUZPMask(ArrayRef M, EVT VT, unsigned &WhichResult) { + unsigned NumElts = VT.getVectorNumElements(); + WhichResult = (M[0] == 0 ? 0 : 1); + for (unsigned i = 0; i != NumElts; ++i) { + if (M[i] < 0) + continue; // ignore UNDEF indices + if ((unsigned)M[i] != 2 * i + WhichResult) + return false; + } + + return true; +} + +static bool isTRNMask(ArrayRef M, EVT VT, unsigned &WhichResult) { + unsigned NumElts = VT.getVectorNumElements(); + WhichResult = (M[0] == 0 ? 0 : 1); + for (unsigned i = 0; i < NumElts; i += 2) { + if ((M[i] >= 0 && (unsigned)M[i] != i + WhichResult) || + (M[i + 1] >= 0 && (unsigned)M[i + 1] != i + NumElts + WhichResult)) + return false; + } + return true; +} + +/// isZIP_v_undef_Mask - Special case of isZIPMask for canonical form of +/// "vector_shuffle v, v", i.e., "vector_shuffle v, undef". +/// Mask is e.g., <0, 0, 1, 1> instead of <0, 4, 1, 5>. +static bool isZIP_v_undef_Mask(ArrayRef M, EVT VT, unsigned &WhichResult) { + unsigned NumElts = VT.getVectorNumElements(); + WhichResult = (M[0] == 0 ? 
0 : 1); + unsigned Idx = WhichResult * NumElts / 2; + for (unsigned i = 0; i != NumElts; i += 2) { + if ((M[i] >= 0 && (unsigned)M[i] != Idx) || + (M[i + 1] >= 0 && (unsigned)M[i + 1] != Idx)) + return false; + Idx += 1; + } + + return true; +} + +/// isUZP_v_undef_Mask - Special case of isUZPMask for canonical form of +/// "vector_shuffle v, v", i.e., "vector_shuffle v, undef". +/// Mask is e.g., <0, 2, 0, 2> instead of <0, 2, 4, 6>, +static bool isUZP_v_undef_Mask(ArrayRef M, EVT VT, unsigned &WhichResult) { + unsigned Half = VT.getVectorNumElements() / 2; + WhichResult = (M[0] == 0 ? 0 : 1); + for (unsigned j = 0; j != 2; ++j) { + unsigned Idx = WhichResult; + for (unsigned i = 0; i != Half; ++i) { + int MIdx = M[i + j * Half]; + if (MIdx >= 0 && (unsigned)MIdx != Idx) + return false; + Idx += 2; + } + } + + return true; +} + +/// isTRN_v_undef_Mask - Special case of isTRNMask for canonical form of +/// "vector_shuffle v, v", i.e., "vector_shuffle v, undef". +/// Mask is e.g., <0, 0, 2, 2> instead of <0, 4, 2, 6>. +static bool isTRN_v_undef_Mask(ArrayRef M, EVT VT, unsigned &WhichResult) { + unsigned NumElts = VT.getVectorNumElements(); + WhichResult = (M[0] == 0 ? 0 : 1); + for (unsigned i = 0; i < NumElts; i += 2) { + if ((M[i] >= 0 && (unsigned)M[i] != i + WhichResult) || + (M[i + 1] >= 0 && (unsigned)M[i + 1] != i + WhichResult)) + return false; + } + return true; +} + +static bool isINSMask(ArrayRef M, int NumInputElements, + bool &DstIsLeft, int &Anomaly) { + if (M.size() != static_cast(NumInputElements)) + return false; + + int NumLHSMatch = 0, NumRHSMatch = 0; + int LastLHSMismatch = -1, LastRHSMismatch = -1; + + for (int i = 0; i < NumInputElements; ++i) { + if (M[i] == -1) { + ++NumLHSMatch; + ++NumRHSMatch; + continue; + } + + if (M[i] == i) + ++NumLHSMatch; + else + LastLHSMismatch = i; + + if (M[i] == i + NumInputElements) + ++NumRHSMatch; + else + LastRHSMismatch = i; + } + + if (NumLHSMatch == NumInputElements - 1) { + DstIsLeft = true; + Anomaly = LastLHSMismatch; + return true; + } else if (NumRHSMatch == NumInputElements - 1) { + DstIsLeft = false; + Anomaly = LastRHSMismatch; + return true; + } + + return false; +} + +static bool isConcatMask(ArrayRef Mask, EVT VT, bool SplitLHS) { + if (VT.getSizeInBits() != 128) + return false; + + unsigned NumElts = VT.getVectorNumElements(); + + for (int I = 0, E = NumElts / 2; I != E; I++) { + if (Mask[I] != I) + return false; + } + + int Offset = NumElts / 2; + for (int I = NumElts / 2, E = NumElts; I != E; I++) { + if (Mask[I] != I + SplitLHS * Offset) + return false; + } + + return true; +} + +static SDValue tryFormConcatFromShuffle(SDValue Op, SelectionDAG &DAG) { + SDLoc DL(Op); + EVT VT = Op.getValueType(); + SDValue V0 = Op.getOperand(0); + SDValue V1 = Op.getOperand(1); + ArrayRef Mask = cast(Op)->getMask(); + + if (VT.getVectorElementType() != V0.getValueType().getVectorElementType() || + VT.getVectorElementType() != V1.getValueType().getVectorElementType()) + return SDValue(); + + bool SplitV0 = V0.getValueType().getSizeInBits() == 128; + + if (!isConcatMask(Mask, VT, SplitV0)) + return SDValue(); + + EVT CastVT = EVT::getVectorVT(*DAG.getContext(), VT.getVectorElementType(), + VT.getVectorNumElements() / 2); + if (SplitV0) { + V0 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, CastVT, V0, + DAG.getConstant(0, MVT::i64)); + } + if (V1.getValueType().getSizeInBits() == 128) { + V1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, CastVT, V1, + DAG.getConstant(0, MVT::i64)); + } + return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, V0, 
V1); +} + +/// GeneratePerfectShuffle - Given an entry in the perfect-shuffle table, emit +/// the specified operations to build the shuffle. +static SDValue GeneratePerfectShuffle(unsigned PFEntry, SDValue LHS, + SDValue RHS, SelectionDAG &DAG, + SDLoc dl) { + unsigned OpNum = (PFEntry >> 26) & 0x0F; + unsigned LHSID = (PFEntry >> 13) & ((1 << 13) - 1); + unsigned RHSID = (PFEntry >> 0) & ((1 << 13) - 1); + + enum { + OP_COPY = 0, // Copy, used for things like to say it is <0,1,2,3> + OP_VREV, + OP_VDUP0, + OP_VDUP1, + OP_VDUP2, + OP_VDUP3, + OP_VEXT1, + OP_VEXT2, + OP_VEXT3, + OP_VUZPL, // VUZP, left result + OP_VUZPR, // VUZP, right result + OP_VZIPL, // VZIP, left result + OP_VZIPR, // VZIP, right result + OP_VTRNL, // VTRN, left result + OP_VTRNR // VTRN, right result + }; + + if (OpNum == OP_COPY) { + if (LHSID == (1 * 9 + 2) * 9 + 3) + return LHS; + assert(LHSID == ((4 * 9 + 5) * 9 + 6) * 9 + 7 && "Illegal OP_COPY!"); + return RHS; + } + + SDValue OpLHS, OpRHS; + OpLHS = GeneratePerfectShuffle(PerfectShuffleTable[LHSID], LHS, RHS, DAG, dl); + OpRHS = GeneratePerfectShuffle(PerfectShuffleTable[RHSID], LHS, RHS, DAG, dl); + EVT VT = OpLHS.getValueType(); + + switch (OpNum) { + default: + llvm_unreachable("Unknown shuffle opcode!"); + case OP_VREV: + // VREV divides the vector in half and swaps within the half. + if (VT.getVectorElementType() == MVT::i32 || + VT.getVectorElementType() == MVT::f32) + return DAG.getNode(AArch64ISD::REV64, dl, VT, OpLHS); + // vrev <4 x i16> -> REV32 + if (VT.getVectorElementType() == MVT::i16) + return DAG.getNode(AArch64ISD::REV32, dl, VT, OpLHS); + // vrev <4 x i8> -> REV16 + assert(VT.getVectorElementType() == MVT::i8); + return DAG.getNode(AArch64ISD::REV16, dl, VT, OpLHS); + case OP_VDUP0: + case OP_VDUP1: + case OP_VDUP2: + case OP_VDUP3: { + EVT EltTy = VT.getVectorElementType(); + unsigned Opcode; + if (EltTy == MVT::i8) + Opcode = AArch64ISD::DUPLANE8; + else if (EltTy == MVT::i16) + Opcode = AArch64ISD::DUPLANE16; + else if (EltTy == MVT::i32 || EltTy == MVT::f32) + Opcode = AArch64ISD::DUPLANE32; + else if (EltTy == MVT::i64 || EltTy == MVT::f64) + Opcode = AArch64ISD::DUPLANE64; + else + llvm_unreachable("Invalid vector element type?"); + + if (VT.getSizeInBits() == 64) + OpLHS = WidenVector(OpLHS, DAG); + SDValue Lane = DAG.getConstant(OpNum - OP_VDUP0, MVT::i64); + return DAG.getNode(Opcode, dl, VT, OpLHS, Lane); + } + case OP_VEXT1: + case OP_VEXT2: + case OP_VEXT3: { + unsigned Imm = (OpNum - OP_VEXT1 + 1) * getExtFactor(OpLHS); + return DAG.getNode(AArch64ISD::EXT, dl, VT, OpLHS, OpRHS, + DAG.getConstant(Imm, MVT::i32)); + } + case OP_VUZPL: + return DAG.getNode(AArch64ISD::UZP1, dl, DAG.getVTList(VT, VT), OpLHS, + OpRHS); + case OP_VUZPR: + return DAG.getNode(AArch64ISD::UZP2, dl, DAG.getVTList(VT, VT), OpLHS, + OpRHS); + case OP_VZIPL: + return DAG.getNode(AArch64ISD::ZIP1, dl, DAG.getVTList(VT, VT), OpLHS, + OpRHS); + case OP_VZIPR: + return DAG.getNode(AArch64ISD::ZIP2, dl, DAG.getVTList(VT, VT), OpLHS, + OpRHS); + case OP_VTRNL: + return DAG.getNode(AArch64ISD::TRN1, dl, DAG.getVTList(VT, VT), OpLHS, + OpRHS); + case OP_VTRNR: + return DAG.getNode(AArch64ISD::TRN2, dl, DAG.getVTList(VT, VT), OpLHS, + OpRHS); + } +} + +static SDValue GenerateTBL(SDValue Op, ArrayRef ShuffleMask, + SelectionDAG &DAG) { + // Check to see if we can use the TBL instruction. 
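TBL semantics, for reference: each result byte indexes a byte table built from the concatenated source registers, and out-of-range indices produce zero; the element mask is expanded into byte indices just below (BytesPerElt). A scalar model of the single-table form:

    #include <cstdint>
    void tbl1(const uint8_t tbl[16], const uint8_t idx[16], uint8_t out[16]) {
      for (int i = 0; i < 16; ++i)
        out[i] = idx[i] < 16 ? tbl[idx[i]] : 0;   // out-of-range lanes read as zero
    }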
+ SDValue V1 = Op.getOperand(0); + SDValue V2 = Op.getOperand(1); + SDLoc DL(Op); + + EVT EltVT = Op.getValueType().getVectorElementType(); + unsigned BytesPerElt = EltVT.getSizeInBits() / 8; + + SmallVector TBLMask; + for (int Val : ShuffleMask) { + for (unsigned Byte = 0; Byte < BytesPerElt; ++Byte) { + unsigned Offset = Byte + Val * BytesPerElt; + TBLMask.push_back(DAG.getConstant(Offset, MVT::i32)); + } + } + + MVT IndexVT = MVT::v8i8; + unsigned IndexLen = 8; + if (Op.getValueType().getSizeInBits() == 128) { + IndexVT = MVT::v16i8; + IndexLen = 16; + } + + SDValue V1Cst = DAG.getNode(ISD::BITCAST, DL, IndexVT, V1); + SDValue V2Cst = DAG.getNode(ISD::BITCAST, DL, IndexVT, V2); + + SDValue Shuffle; + if (V2.getNode()->getOpcode() == ISD::UNDEF) { + if (IndexLen == 8) + V1Cst = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v16i8, V1Cst, V1Cst); + Shuffle = DAG.getNode( + ISD::INTRINSIC_WO_CHAIN, DL, IndexVT, + DAG.getConstant(Intrinsic::aarch64_neon_tbl1, MVT::i32), V1Cst, + DAG.getNode(ISD::BUILD_VECTOR, DL, IndexVT, + makeArrayRef(TBLMask.data(), IndexLen))); + } else { + if (IndexLen == 8) { + V1Cst = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v16i8, V1Cst, V2Cst); + Shuffle = DAG.getNode( + ISD::INTRINSIC_WO_CHAIN, DL, IndexVT, + DAG.getConstant(Intrinsic::aarch64_neon_tbl1, MVT::i32), V1Cst, + DAG.getNode(ISD::BUILD_VECTOR, DL, IndexVT, + makeArrayRef(TBLMask.data(), IndexLen))); + } else { + // FIXME: We cannot, for the moment, emit a TBL2 instruction because we + // cannot currently represent the register constraints on the input + // table registers. + // Shuffle = DAG.getNode(AArch64ISD::TBL2, DL, IndexVT, V1Cst, V2Cst, + // DAG.getNode(ISD::BUILD_VECTOR, DL, IndexVT, + // &TBLMask[0], IndexLen)); + Shuffle = DAG.getNode( + ISD::INTRINSIC_WO_CHAIN, DL, IndexVT, + DAG.getConstant(Intrinsic::aarch64_neon_tbl2, MVT::i32), V1Cst, V2Cst, + DAG.getNode(ISD::BUILD_VECTOR, DL, IndexVT, + makeArrayRef(TBLMask.data(), IndexLen))); + } + } + return DAG.getNode(ISD::BITCAST, DL, Op.getValueType(), Shuffle); +} + +static unsigned getDUPLANEOp(EVT EltType) { + if (EltType == MVT::i8) + return AArch64ISD::DUPLANE8; + if (EltType == MVT::i16) + return AArch64ISD::DUPLANE16; + if (EltType == MVT::i32 || EltType == MVT::f32) + return AArch64ISD::DUPLANE32; + if (EltType == MVT::i64 || EltType == MVT::f64) + return AArch64ISD::DUPLANE64; + + llvm_unreachable("Invalid vector element type?"); +} + +SDValue AArch64TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, + SelectionDAG &DAG) const { + SDLoc dl(Op); + EVT VT = Op.getValueType(); + + ShuffleVectorSDNode *SVN = cast(Op.getNode()); + + // Convert shuffles that are directly supported on NEON to target-specific + // DAG nodes, instead of keeping them as shuffles and matching them again + // during code selection. This is more efficient and avoids the possibility + // of inconsistencies between legalization and selection. + ArrayRef ShuffleMask = SVN->getMask(); + + SDValue V1 = Op.getOperand(0); + SDValue V2 = Op.getOperand(1); + + if (ShuffleVectorSDNode::isSplatMask(&ShuffleMask[0], + V1.getValueType().getSimpleVT())) { + int Lane = SVN->getSplatIndex(); + // If this is undef splat, generate it via "just" vdup, if possible. + if (Lane == -1) + Lane = 0; + + if (Lane == 0 && V1.getOpcode() == ISD::SCALAR_TO_VECTOR) + return DAG.getNode(AArch64ISD::DUP, dl, V1.getValueType(), + V1.getOperand(0)); + // Test if V1 is a BUILD_VECTOR and the lane being referenced is a non- + // constant. If so, we can just reference the lane's definition directly. 
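Splat-mask shuffles are what the isSplatMask path above recognizes; rather than a general permute they become a single DUP from a lane (builtin example; the commented asm is the expected shape, not verified):

    typedef int v4i32 __attribute__((vector_size(16)));
    v4i32 splat_lane1(v4i32 v) {
      return __builtin_shufflevector(v, v, 1, 1, 1, 1);   // dup v0.4s, v0.s[1]
    }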
+ if (V1.getOpcode() == ISD::BUILD_VECTOR && + !isa(V1.getOperand(Lane))) + return DAG.getNode(AArch64ISD::DUP, dl, VT, V1.getOperand(Lane)); + + // Otherwise, duplicate from the lane of the input vector. + unsigned Opcode = getDUPLANEOp(V1.getValueType().getVectorElementType()); + + // SelectionDAGBuilder may have "helpfully" already extracted or conatenated + // to make a vector of the same size as this SHUFFLE. We can ignore the + // extract entirely, and canonicalise the concat using WidenVector. + if (V1.getOpcode() == ISD::EXTRACT_SUBVECTOR) { + Lane += cast(V1.getOperand(1))->getZExtValue(); + V1 = V1.getOperand(0); + } else if (V1.getOpcode() == ISD::CONCAT_VECTORS) { + unsigned Idx = Lane >= (int)VT.getVectorNumElements() / 2; + Lane -= Idx * VT.getVectorNumElements() / 2; + V1 = WidenVector(V1.getOperand(Idx), DAG); + } else if (VT.getSizeInBits() == 64) + V1 = WidenVector(V1, DAG); + + return DAG.getNode(Opcode, dl, VT, V1, DAG.getConstant(Lane, MVT::i64)); + } + + if (isREVMask(ShuffleMask, VT, 64)) + return DAG.getNode(AArch64ISD::REV64, dl, V1.getValueType(), V1, V2); + if (isREVMask(ShuffleMask, VT, 32)) + return DAG.getNode(AArch64ISD::REV32, dl, V1.getValueType(), V1, V2); + if (isREVMask(ShuffleMask, VT, 16)) + return DAG.getNode(AArch64ISD::REV16, dl, V1.getValueType(), V1, V2); + + bool ReverseEXT = false; + unsigned Imm; + if (isEXTMask(ShuffleMask, VT, ReverseEXT, Imm)) { + if (ReverseEXT) + std::swap(V1, V2); + Imm *= getExtFactor(V1); + return DAG.getNode(AArch64ISD::EXT, dl, V1.getValueType(), V1, V2, + DAG.getConstant(Imm, MVT::i32)); + } else if (V2->getOpcode() == ISD::UNDEF && + isSingletonEXTMask(ShuffleMask, VT, Imm)) { + Imm *= getExtFactor(V1); + return DAG.getNode(AArch64ISD::EXT, dl, V1.getValueType(), V1, V1, + DAG.getConstant(Imm, MVT::i32)); + } + + unsigned WhichResult; + if (isZIPMask(ShuffleMask, VT, WhichResult)) { + unsigned Opc = (WhichResult == 0) ? AArch64ISD::ZIP1 : AArch64ISD::ZIP2; + return DAG.getNode(Opc, dl, V1.getValueType(), V1, V2); + } + if (isUZPMask(ShuffleMask, VT, WhichResult)) { + unsigned Opc = (WhichResult == 0) ? AArch64ISD::UZP1 : AArch64ISD::UZP2; + return DAG.getNode(Opc, dl, V1.getValueType(), V1, V2); + } + if (isTRNMask(ShuffleMask, VT, WhichResult)) { + unsigned Opc = (WhichResult == 0) ? AArch64ISD::TRN1 : AArch64ISD::TRN2; + return DAG.getNode(Opc, dl, V1.getValueType(), V1, V2); + } + + if (isZIP_v_undef_Mask(ShuffleMask, VT, WhichResult)) { + unsigned Opc = (WhichResult == 0) ? AArch64ISD::ZIP1 : AArch64ISD::ZIP2; + return DAG.getNode(Opc, dl, V1.getValueType(), V1, V1); + } + if (isUZP_v_undef_Mask(ShuffleMask, VT, WhichResult)) { + unsigned Opc = (WhichResult == 0) ? AArch64ISD::UZP1 : AArch64ISD::UZP2; + return DAG.getNode(Opc, dl, V1.getValueType(), V1, V1); + } + if (isTRN_v_undef_Mask(ShuffleMask, VT, WhichResult)) { + unsigned Opc = (WhichResult == 0) ? AArch64ISD::TRN1 : AArch64ISD::TRN2; + return DAG.getNode(Opc, dl, V1.getValueType(), V1, V1); + } + + SDValue Concat = tryFormConcatFromShuffle(Op, DAG); + if (Concat.getNode()) + return Concat; + + bool DstIsLeft; + int Anomaly; + int NumInputElements = V1.getValueType().getVectorNumElements(); + if (isINSMask(ShuffleMask, NumInputElements, DstIsLeft, Anomaly)) { + SDValue DstVec = DstIsLeft ? 
V1 : V2; + SDValue DstLaneV = DAG.getConstant(Anomaly, MVT::i64); + + SDValue SrcVec = V1; + int SrcLane = ShuffleMask[Anomaly]; + if (SrcLane >= NumInputElements) { + SrcVec = V2; + SrcLane -= VT.getVectorNumElements(); + } + SDValue SrcLaneV = DAG.getConstant(SrcLane, MVT::i64); + + EVT ScalarVT = VT.getVectorElementType(); + if (ScalarVT.getSizeInBits() < 32) + ScalarVT = MVT::i32; + + return DAG.getNode( + ISD::INSERT_VECTOR_ELT, dl, VT, DstVec, + DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, ScalarVT, SrcVec, SrcLaneV), + DstLaneV); + } + + // If the shuffle is not directly supported and it has 4 elements, use + // the PerfectShuffle-generated table to synthesize it from other shuffles. + unsigned NumElts = VT.getVectorNumElements(); + if (NumElts == 4) { + unsigned PFIndexes[4]; + for (unsigned i = 0; i != 4; ++i) { + if (ShuffleMask[i] < 0) + PFIndexes[i] = 8; + else + PFIndexes[i] = ShuffleMask[i]; + } + + // Compute the index in the perfect shuffle table. + unsigned PFTableIndex = PFIndexes[0] * 9 * 9 * 9 + PFIndexes[1] * 9 * 9 + + PFIndexes[2] * 9 + PFIndexes[3]; + unsigned PFEntry = PerfectShuffleTable[PFTableIndex]; + unsigned Cost = (PFEntry >> 30); + + if (Cost <= 4) + return GeneratePerfectShuffle(PFEntry, V1, V2, DAG, dl); + } + + return GenerateTBL(Op, ShuffleMask, DAG); +} + +static bool resolveBuildVector(BuildVectorSDNode *BVN, APInt &CnstBits, + APInt &UndefBits) { + EVT VT = BVN->getValueType(0); + APInt SplatBits, SplatUndef; + unsigned SplatBitSize; + bool HasAnyUndefs; + if (BVN->isConstantSplat(SplatBits, SplatUndef, SplatBitSize, HasAnyUndefs)) { + unsigned NumSplats = VT.getSizeInBits() / SplatBitSize; + + for (unsigned i = 0; i < NumSplats; ++i) { + CnstBits <<= SplatBitSize; + UndefBits <<= SplatBitSize; + CnstBits |= SplatBits.zextOrTrunc(VT.getSizeInBits()); + UndefBits |= (SplatBits ^ SplatUndef).zextOrTrunc(VT.getSizeInBits()); + } + + return true; + } + + return false; +} + +SDValue AArch64TargetLowering::LowerVectorAND(SDValue Op, + SelectionDAG &DAG) const { + BuildVectorSDNode *BVN = + dyn_cast(Op.getOperand(1).getNode()); + SDValue LHS = Op.getOperand(0); + SDLoc dl(Op); + EVT VT = Op.getValueType(); + + if (!BVN) + return Op; + + APInt CnstBits(VT.getSizeInBits(), 0); + APInt UndefBits(VT.getSizeInBits(), 0); + if (resolveBuildVector(BVN, CnstBits, UndefBits)) { + // We only have BIC vector immediate instruction, which is and-not. + CnstBits = ~CnstBits; + + // We make use of a little bit of goto ickiness in order to avoid having to + // duplicate the immediate matching logic for the undef toggled case. + bool SecondTry = false; + AttemptModImm: + + if (CnstBits.getHiBits(64) == CnstBits.getLoBits(64)) { + CnstBits = CnstBits.zextOrTrunc(64); + uint64_t CnstVal = CnstBits.getZExtValue(); + + if (AArch64_AM::isAdvSIMDModImmType1(CnstVal)) { + CnstVal = AArch64_AM::encodeAdvSIMDModImmType1(CnstVal); + MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32; + SDValue Mov = DAG.getNode(AArch64ISD::BICi, dl, MovTy, LHS, + DAG.getConstant(CnstVal, MVT::i32), + DAG.getConstant(0, MVT::i32)); + return DAG.getNode(ISD::BITCAST, dl, VT, Mov); + } + + if (AArch64_AM::isAdvSIMDModImmType2(CnstVal)) { + CnstVal = AArch64_AM::encodeAdvSIMDModImmType2(CnstVal); + MVT MovTy = (VT.getSizeInBits() == 128) ? 
MVT::v4i32 : MVT::v2i32; + SDValue Mov = DAG.getNode(AArch64ISD::BICi, dl, MovTy, LHS, + DAG.getConstant(CnstVal, MVT::i32), + DAG.getConstant(8, MVT::i32)); + return DAG.getNode(ISD::BITCAST, dl, VT, Mov); + } + + if (AArch64_AM::isAdvSIMDModImmType3(CnstVal)) { + CnstVal = AArch64_AM::encodeAdvSIMDModImmType3(CnstVal); + MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32; + SDValue Mov = DAG.getNode(AArch64ISD::BICi, dl, MovTy, LHS, + DAG.getConstant(CnstVal, MVT::i32), + DAG.getConstant(16, MVT::i32)); + return DAG.getNode(ISD::BITCAST, dl, VT, Mov); + } + + if (AArch64_AM::isAdvSIMDModImmType4(CnstVal)) { + CnstVal = AArch64_AM::encodeAdvSIMDModImmType4(CnstVal); + MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32; + SDValue Mov = DAG.getNode(AArch64ISD::BICi, dl, MovTy, LHS, + DAG.getConstant(CnstVal, MVT::i32), + DAG.getConstant(24, MVT::i32)); + return DAG.getNode(ISD::BITCAST, dl, VT, Mov); + } + + if (AArch64_AM::isAdvSIMDModImmType5(CnstVal)) { + CnstVal = AArch64_AM::encodeAdvSIMDModImmType5(CnstVal); + MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v8i16 : MVT::v4i16; + SDValue Mov = DAG.getNode(AArch64ISD::BICi, dl, MovTy, LHS, + DAG.getConstant(CnstVal, MVT::i32), + DAG.getConstant(0, MVT::i32)); + return DAG.getNode(ISD::BITCAST, dl, VT, Mov); + } + + if (AArch64_AM::isAdvSIMDModImmType6(CnstVal)) { + CnstVal = AArch64_AM::encodeAdvSIMDModImmType6(CnstVal); + MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v8i16 : MVT::v4i16; + SDValue Mov = DAG.getNode(AArch64ISD::BICi, dl, MovTy, LHS, + DAG.getConstant(CnstVal, MVT::i32), + DAG.getConstant(8, MVT::i32)); + return DAG.getNode(ISD::BITCAST, dl, VT, Mov); + } + } + + if (SecondTry) + goto FailedModImm; + SecondTry = true; + CnstBits = ~UndefBits; + goto AttemptModImm; + } + +// We can always fall back to a non-immediate AND. +FailedModImm: + return Op; +} + +// Specialized code to quickly find if PotentialBVec is a BuildVector that +// consists of only the same constant int value, returned in reference arg +// ConstVal +static bool isAllConstantBuildVector(const SDValue &PotentialBVec, + uint64_t &ConstVal) { + BuildVectorSDNode *Bvec = dyn_cast(PotentialBVec); + if (!Bvec) + return false; + ConstantSDNode *FirstElt = dyn_cast(Bvec->getOperand(0)); + if (!FirstElt) + return false; + EVT VT = Bvec->getValueType(0); + unsigned NumElts = VT.getVectorNumElements(); + for (unsigned i = 1; i < NumElts; ++i) + if (dyn_cast(Bvec->getOperand(i)) != FirstElt) + return false; + ConstVal = FirstElt->getZExtValue(); + return true; +} + +static unsigned getIntrinsicID(const SDNode *N) { + unsigned Opcode = N->getOpcode(); + switch (Opcode) { + default: + return Intrinsic::not_intrinsic; + case ISD::INTRINSIC_WO_CHAIN: { + unsigned IID = cast(N->getOperand(0))->getZExtValue(); + if (IID < Intrinsic::num_intrinsics) + return IID; + return Intrinsic::not_intrinsic; + } + } +} + +// Attempt to form a vector S[LR]I from (or (and X, BvecC1), (lsl Y, C2)), +// to (SLI X, Y, C2), where X and Y have matching vector types, BvecC1 is a +// BUILD_VECTORs with constant element C1, C2 is a constant, and C1 == ~C2. +// Also, logical shift right -> sri, with the same structure. +static SDValue tryLowerToSLI(SDNode *N, SelectionDAG &DAG) { + EVT VT = N->getValueType(0); + + if (!VT.isVector()) + return SDValue(); + + SDLoc DL(N); + + // Is the first op an AND? + const SDValue And = N->getOperand(0); + if (And.getOpcode() != ISD::AND) + return SDValue(); + + // Is the second op an shl or lshr? 
+ SDValue Shift = N->getOperand(1); + // This will have been turned into: AArch64ISD::VSHL vector, #shift + // or AArch64ISD::VLSHR vector, #shift + unsigned ShiftOpc = Shift.getOpcode(); + if ((ShiftOpc != AArch64ISD::VSHL && ShiftOpc != AArch64ISD::VLSHR)) + return SDValue(); + bool IsShiftRight = ShiftOpc == AArch64ISD::VLSHR; + + // Is the shift amount constant? + ConstantSDNode *C2node = dyn_cast(Shift.getOperand(1)); + if (!C2node) + return SDValue(); + + // Is the and mask vector all constant? + uint64_t C1; + if (!isAllConstantBuildVector(And.getOperand(1), C1)) + return SDValue(); + + // Is C1 == ~C2, taking into account how much one can shift elements of a + // particular size? + uint64_t C2 = C2node->getZExtValue(); + unsigned ElemSizeInBits = VT.getVectorElementType().getSizeInBits(); + if (C2 > ElemSizeInBits) + return SDValue(); + unsigned ElemMask = (1 << ElemSizeInBits) - 1; + if ((C1 & ElemMask) != (~C2 & ElemMask)) + return SDValue(); + + SDValue X = And.getOperand(0); + SDValue Y = Shift.getOperand(0); + + unsigned Intrin = + IsShiftRight ? Intrinsic::aarch64_neon_vsri : Intrinsic::aarch64_neon_vsli; + SDValue ResultSLI = + DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT, + DAG.getConstant(Intrin, MVT::i32), X, Y, Shift.getOperand(1)); + + DEBUG(dbgs() << "aarch64-lower: transformed: \n"); + DEBUG(N->dump(&DAG)); + DEBUG(dbgs() << "into: \n"); + DEBUG(ResultSLI->dump(&DAG)); + + ++NumShiftInserts; + return ResultSLI; +} + +SDValue AArch64TargetLowering::LowerVectorOR(SDValue Op, + SelectionDAG &DAG) const { + // Attempt to form a vector S[LR]I from (or (and X, C1), (lsl Y, C2)) + if (EnableAArch64SlrGeneration) { + SDValue Res = tryLowerToSLI(Op.getNode(), DAG); + if (Res.getNode()) + return Res; + } + + BuildVectorSDNode *BVN = + dyn_cast(Op.getOperand(0).getNode()); + SDValue LHS = Op.getOperand(1); + SDLoc dl(Op); + EVT VT = Op.getValueType(); + + // OR commutes, so try swapping the operands. + if (!BVN) { + LHS = Op.getOperand(0); + BVN = dyn_cast(Op.getOperand(1).getNode()); + } + if (!BVN) + return Op; + + APInt CnstBits(VT.getSizeInBits(), 0); + APInt UndefBits(VT.getSizeInBits(), 0); + if (resolveBuildVector(BVN, CnstBits, UndefBits)) { + // We make use of a little bit of goto ickiness in order to avoid having to + // duplicate the immediate matching logic for the undef toggled case. + bool SecondTry = false; + AttemptModImm: + + if (CnstBits.getHiBits(64) == CnstBits.getLoBits(64)) { + CnstBits = CnstBits.zextOrTrunc(64); + uint64_t CnstVal = CnstBits.getZExtValue(); + + if (AArch64_AM::isAdvSIMDModImmType1(CnstVal)) { + CnstVal = AArch64_AM::encodeAdvSIMDModImmType1(CnstVal); + MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32; + SDValue Mov = DAG.getNode(AArch64ISD::ORRi, dl, MovTy, LHS, + DAG.getConstant(CnstVal, MVT::i32), + DAG.getConstant(0, MVT::i32)); + return DAG.getNode(ISD::BITCAST, dl, VT, Mov); + } + + if (AArch64_AM::isAdvSIMDModImmType2(CnstVal)) { + CnstVal = AArch64_AM::encodeAdvSIMDModImmType2(CnstVal); + MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32; + SDValue Mov = DAG.getNode(AArch64ISD::ORRi, dl, MovTy, LHS, + DAG.getConstant(CnstVal, MVT::i32), + DAG.getConstant(8, MVT::i32)); + return DAG.getNode(ISD::BITCAST, dl, VT, Mov); + } + + if (AArch64_AM::isAdvSIMDModImmType3(CnstVal)) { + CnstVal = AArch64_AM::encodeAdvSIMDModImmType3(CnstVal); + MVT MovTy = (VT.getSizeInBits() == 128) ? 
MVT::v4i32 : MVT::v2i32; + SDValue Mov = DAG.getNode(AArch64ISD::ORRi, dl, MovTy, LHS, + DAG.getConstant(CnstVal, MVT::i32), + DAG.getConstant(16, MVT::i32)); + return DAG.getNode(ISD::BITCAST, dl, VT, Mov); + } + + if (AArch64_AM::isAdvSIMDModImmType4(CnstVal)) { + CnstVal = AArch64_AM::encodeAdvSIMDModImmType4(CnstVal); + MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32; + SDValue Mov = DAG.getNode(AArch64ISD::ORRi, dl, MovTy, LHS, + DAG.getConstant(CnstVal, MVT::i32), + DAG.getConstant(24, MVT::i32)); + return DAG.getNode(ISD::BITCAST, dl, VT, Mov); + } + + if (AArch64_AM::isAdvSIMDModImmType5(CnstVal)) { + CnstVal = AArch64_AM::encodeAdvSIMDModImmType5(CnstVal); + MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v8i16 : MVT::v4i16; + SDValue Mov = DAG.getNode(AArch64ISD::ORRi, dl, MovTy, LHS, + DAG.getConstant(CnstVal, MVT::i32), + DAG.getConstant(0, MVT::i32)); + return DAG.getNode(ISD::BITCAST, dl, VT, Mov); + } + + if (AArch64_AM::isAdvSIMDModImmType6(CnstVal)) { + CnstVal = AArch64_AM::encodeAdvSIMDModImmType6(CnstVal); + MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v8i16 : MVT::v4i16; + SDValue Mov = DAG.getNode(AArch64ISD::ORRi, dl, MovTy, LHS, + DAG.getConstant(CnstVal, MVT::i32), + DAG.getConstant(8, MVT::i32)); + return DAG.getNode(ISD::BITCAST, dl, VT, Mov); + } + } + + if (SecondTry) + goto FailedModImm; + SecondTry = true; + CnstBits = UndefBits; + goto AttemptModImm; + } + +// We can always fall back to a non-immediate OR. +FailedModImm: + return Op; +} + +SDValue AArch64TargetLowering::LowerBUILD_VECTOR(SDValue Op, + SelectionDAG &DAG) const { + BuildVectorSDNode *BVN = cast(Op.getNode()); + SDLoc dl(Op); + EVT VT = Op.getValueType(); + + APInt CnstBits(VT.getSizeInBits(), 0); + APInt UndefBits(VT.getSizeInBits(), 0); + if (resolveBuildVector(BVN, CnstBits, UndefBits)) { + // We make use of a little bit of goto ickiness in order to avoid having to + // duplicate the immediate matching logic for the undef toggled case. + bool SecondTry = false; + AttemptModImm: + + if (CnstBits.getHiBits(64) == CnstBits.getLoBits(64)) { + CnstBits = CnstBits.zextOrTrunc(64); + uint64_t CnstVal = CnstBits.getZExtValue(); + + // Certain magic vector constants (used to express things like NOT + // and NEG) are passed through unmodified. This allows codegen patterns + // for these operations to match. Special-purpose patterns will lower + // these immediates to MOVIs if it proves necessary. + if (VT.isInteger() && (CnstVal == 0 || CnstVal == ~0ULL)) + return Op; + + // The many faces of MOVI... + if (AArch64_AM::isAdvSIMDModImmType10(CnstVal)) { + CnstVal = AArch64_AM::encodeAdvSIMDModImmType10(CnstVal); + if (VT.getSizeInBits() == 128) { + SDValue Mov = DAG.getNode(AArch64ISD::MOVIedit, dl, MVT::v2i64, + DAG.getConstant(CnstVal, MVT::i32)); + return DAG.getNode(ISD::BITCAST, dl, VT, Mov); + } + + // Support the V64 version via subregister insertion. + SDValue Mov = DAG.getNode(AArch64ISD::MOVIedit, dl, MVT::f64, + DAG.getConstant(CnstVal, MVT::i32)); + return DAG.getNode(ISD::BITCAST, dl, VT, Mov); + } + + if (AArch64_AM::isAdvSIMDModImmType1(CnstVal)) { + CnstVal = AArch64_AM::encodeAdvSIMDModImmType1(CnstVal); + MVT MovTy = (VT.getSizeInBits() == 128) ? 
MVT::v4i32 : MVT::v2i32; + SDValue Mov = DAG.getNode(AArch64ISD::MOVIshift, dl, MovTy, + DAG.getConstant(CnstVal, MVT::i32), + DAG.getConstant(0, MVT::i32)); + return DAG.getNode(ISD::BITCAST, dl, VT, Mov); + } + + if (AArch64_AM::isAdvSIMDModImmType2(CnstVal)) { + CnstVal = AArch64_AM::encodeAdvSIMDModImmType2(CnstVal); + MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32; + SDValue Mov = DAG.getNode(AArch64ISD::MOVIshift, dl, MovTy, + DAG.getConstant(CnstVal, MVT::i32), + DAG.getConstant(8, MVT::i32)); + return DAG.getNode(ISD::BITCAST, dl, VT, Mov); + } + + if (AArch64_AM::isAdvSIMDModImmType3(CnstVal)) { + CnstVal = AArch64_AM::encodeAdvSIMDModImmType3(CnstVal); + MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32; + SDValue Mov = DAG.getNode(AArch64ISD::MOVIshift, dl, MovTy, + DAG.getConstant(CnstVal, MVT::i32), + DAG.getConstant(16, MVT::i32)); + return DAG.getNode(ISD::BITCAST, dl, VT, Mov); + } + + if (AArch64_AM::isAdvSIMDModImmType4(CnstVal)) { + CnstVal = AArch64_AM::encodeAdvSIMDModImmType4(CnstVal); + MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32; + SDValue Mov = DAG.getNode(AArch64ISD::MOVIshift, dl, MovTy, + DAG.getConstant(CnstVal, MVT::i32), + DAG.getConstant(24, MVT::i32)); + return DAG.getNode(ISD::BITCAST, dl, VT, Mov); + } + + if (AArch64_AM::isAdvSIMDModImmType5(CnstVal)) { + CnstVal = AArch64_AM::encodeAdvSIMDModImmType5(CnstVal); + MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v8i16 : MVT::v4i16; + SDValue Mov = DAG.getNode(AArch64ISD::MOVIshift, dl, MovTy, + DAG.getConstant(CnstVal, MVT::i32), + DAG.getConstant(0, MVT::i32)); + return DAG.getNode(ISD::BITCAST, dl, VT, Mov); + } + + if (AArch64_AM::isAdvSIMDModImmType6(CnstVal)) { + CnstVal = AArch64_AM::encodeAdvSIMDModImmType6(CnstVal); + MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v8i16 : MVT::v4i16; + SDValue Mov = DAG.getNode(AArch64ISD::MOVIshift, dl, MovTy, + DAG.getConstant(CnstVal, MVT::i32), + DAG.getConstant(8, MVT::i32)); + return DAG.getNode(ISD::BITCAST, dl, VT, Mov); + } + + if (AArch64_AM::isAdvSIMDModImmType7(CnstVal)) { + CnstVal = AArch64_AM::encodeAdvSIMDModImmType7(CnstVal); + MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32; + SDValue Mov = DAG.getNode(AArch64ISD::MOVImsl, dl, MovTy, + DAG.getConstant(CnstVal, MVT::i32), + DAG.getConstant(264, MVT::i32)); + return DAG.getNode(ISD::BITCAST, dl, VT, Mov); + } + + if (AArch64_AM::isAdvSIMDModImmType8(CnstVal)) { + CnstVal = AArch64_AM::encodeAdvSIMDModImmType8(CnstVal); + MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32; + SDValue Mov = DAG.getNode(AArch64ISD::MOVImsl, dl, MovTy, + DAG.getConstant(CnstVal, MVT::i32), + DAG.getConstant(272, MVT::i32)); + return DAG.getNode(ISD::BITCAST, dl, VT, Mov); + } + + if (AArch64_AM::isAdvSIMDModImmType9(CnstVal)) { + CnstVal = AArch64_AM::encodeAdvSIMDModImmType9(CnstVal); + MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v16i8 : MVT::v8i8; + SDValue Mov = DAG.getNode(AArch64ISD::MOVI, dl, MovTy, + DAG.getConstant(CnstVal, MVT::i32)); + return DAG.getNode(ISD::BITCAST, dl, VT, Mov); + } + + // The few faces of FMOV... + if (AArch64_AM::isAdvSIMDModImmType11(CnstVal)) { + CnstVal = AArch64_AM::encodeAdvSIMDModImmType11(CnstVal); + MVT MovTy = (VT.getSizeInBits() == 128) ? 
MVT::v4f32 : MVT::v2f32; + SDValue Mov = DAG.getNode(AArch64ISD::FMOV, dl, MovTy, + DAG.getConstant(CnstVal, MVT::i32)); + return DAG.getNode(ISD::BITCAST, dl, VT, Mov); + } + + if (AArch64_AM::isAdvSIMDModImmType12(CnstVal) && + VT.getSizeInBits() == 128) { + CnstVal = AArch64_AM::encodeAdvSIMDModImmType12(CnstVal); + SDValue Mov = DAG.getNode(AArch64ISD::FMOV, dl, MVT::v2f64, + DAG.getConstant(CnstVal, MVT::i32)); + return DAG.getNode(ISD::BITCAST, dl, VT, Mov); + } + + // The many faces of MVNI... + CnstVal = ~CnstVal; + if (AArch64_AM::isAdvSIMDModImmType1(CnstVal)) { + CnstVal = AArch64_AM::encodeAdvSIMDModImmType1(CnstVal); + MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32; + SDValue Mov = DAG.getNode(AArch64ISD::MVNIshift, dl, MovTy, + DAG.getConstant(CnstVal, MVT::i32), + DAG.getConstant(0, MVT::i32)); + return DAG.getNode(ISD::BITCAST, dl, VT, Mov); + } + + if (AArch64_AM::isAdvSIMDModImmType2(CnstVal)) { + CnstVal = AArch64_AM::encodeAdvSIMDModImmType2(CnstVal); + MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32; + SDValue Mov = DAG.getNode(AArch64ISD::MVNIshift, dl, MovTy, + DAG.getConstant(CnstVal, MVT::i32), + DAG.getConstant(8, MVT::i32)); + return DAG.getNode(ISD::BITCAST, dl, VT, Mov); + } + + if (AArch64_AM::isAdvSIMDModImmType3(CnstVal)) { + CnstVal = AArch64_AM::encodeAdvSIMDModImmType3(CnstVal); + MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32; + SDValue Mov = DAG.getNode(AArch64ISD::MVNIshift, dl, MovTy, + DAG.getConstant(CnstVal, MVT::i32), + DAG.getConstant(16, MVT::i32)); + return DAG.getNode(ISD::BITCAST, dl, VT, Mov); + } + + if (AArch64_AM::isAdvSIMDModImmType4(CnstVal)) { + CnstVal = AArch64_AM::encodeAdvSIMDModImmType4(CnstVal); + MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32; + SDValue Mov = DAG.getNode(AArch64ISD::MVNIshift, dl, MovTy, + DAG.getConstant(CnstVal, MVT::i32), + DAG.getConstant(24, MVT::i32)); + return DAG.getNode(ISD::BITCAST, dl, VT, Mov); + } + + if (AArch64_AM::isAdvSIMDModImmType5(CnstVal)) { + CnstVal = AArch64_AM::encodeAdvSIMDModImmType5(CnstVal); + MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v8i16 : MVT::v4i16; + SDValue Mov = DAG.getNode(AArch64ISD::MVNIshift, dl, MovTy, + DAG.getConstant(CnstVal, MVT::i32), + DAG.getConstant(0, MVT::i32)); + return DAG.getNode(ISD::BITCAST, dl, VT, Mov); + } + + if (AArch64_AM::isAdvSIMDModImmType6(CnstVal)) { + CnstVal = AArch64_AM::encodeAdvSIMDModImmType6(CnstVal); + MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v8i16 : MVT::v4i16; + SDValue Mov = DAG.getNode(AArch64ISD::MVNIshift, dl, MovTy, + DAG.getConstant(CnstVal, MVT::i32), + DAG.getConstant(8, MVT::i32)); + return DAG.getNode(ISD::BITCAST, dl, VT, Mov); + } + + if (AArch64_AM::isAdvSIMDModImmType7(CnstVal)) { + CnstVal = AArch64_AM::encodeAdvSIMDModImmType7(CnstVal); + MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32; + SDValue Mov = DAG.getNode(AArch64ISD::MVNImsl, dl, MovTy, + DAG.getConstant(CnstVal, MVT::i32), + DAG.getConstant(264, MVT::i32)); + return DAG.getNode(ISD::BITCAST, dl, VT, Mov); + } + + if (AArch64_AM::isAdvSIMDModImmType8(CnstVal)) { + CnstVal = AArch64_AM::encodeAdvSIMDModImmType8(CnstVal); + MVT MovTy = (VT.getSizeInBits() == 128) ? 
MVT::v4i32 : MVT::v2i32; + SDValue Mov = DAG.getNode(AArch64ISD::MVNImsl, dl, MovTy, + DAG.getConstant(CnstVal, MVT::i32), + DAG.getConstant(272, MVT::i32)); + return DAG.getNode(ISD::BITCAST, dl, VT, Mov); + } + } + + if (SecondTry) + goto FailedModImm; + SecondTry = true; + CnstBits = UndefBits; + goto AttemptModImm; + } +FailedModImm: + + // Scan through the operands to find some interesting properties we can + // exploit: + // 1) If only one value is used, we can use a DUP, or + // 2) if only the low element is not undef, we can just insert that, or + // 3) if only one constant value is used (w/ some non-constant lanes), + // we can splat the constant value into the whole vector then fill + // in the non-constant lanes. + // 4) FIXME: If different constant values are used, but we can intelligently + // select the values we'll be overwriting for the non-constant + // lanes such that we can directly materialize the vector + // some other way (MOVI, e.g.), we can be sneaky. + unsigned NumElts = VT.getVectorNumElements(); + bool isOnlyLowElement = true; + bool usesOnlyOneValue = true; + bool usesOnlyOneConstantValue = true; + bool isConstant = true; + unsigned NumConstantLanes = 0; + SDValue Value; + SDValue ConstantValue; + for (unsigned i = 0; i < NumElts; ++i) { + SDValue V = Op.getOperand(i); + if (V.getOpcode() == ISD::UNDEF) + continue; + if (i > 0) + isOnlyLowElement = false; + if (!isa(V) && !isa(V)) + isConstant = false; + + if (isa(V) || isa(V)) { + ++NumConstantLanes; + if (!ConstantValue.getNode()) + ConstantValue = V; + else if (ConstantValue != V) + usesOnlyOneConstantValue = false; + } + + if (!Value.getNode()) + Value = V; + else if (V != Value) + usesOnlyOneValue = false; + } + + if (!Value.getNode()) + return DAG.getUNDEF(VT); + + if (isOnlyLowElement) + return DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Value); + + // Use DUP for non-constant splats. For f32 constant splats, reduce to + // i32 and try again. + if (usesOnlyOneValue) { + if (!isConstant) { + if (Value.getOpcode() != ISD::EXTRACT_VECTOR_ELT || + Value.getValueType() != VT) + return DAG.getNode(AArch64ISD::DUP, dl, VT, Value); + + // This is actually a DUPLANExx operation, which keeps everything vectory. + + // DUPLANE works on 128-bit vectors, widen it if necessary. + SDValue Lane = Value.getOperand(1); + Value = Value.getOperand(0); + if (Value.getValueType().getSizeInBits() == 64) + Value = WidenVector(Value, DAG); + + unsigned Opcode = getDUPLANEOp(VT.getVectorElementType()); + return DAG.getNode(Opcode, dl, VT, Value, Lane); + } + + if (VT.getVectorElementType().isFloatingPoint()) { + SmallVector Ops; + MVT NewType = + (VT.getVectorElementType() == MVT::f32) ? MVT::i32 : MVT::i64; + for (unsigned i = 0; i < NumElts; ++i) + Ops.push_back(DAG.getNode(ISD::BITCAST, dl, NewType, Op.getOperand(i))); + EVT VecVT = EVT::getVectorVT(*DAG.getContext(), NewType, NumElts); + SDValue Val = DAG.getNode(ISD::BUILD_VECTOR, dl, VecVT, Ops); + Val = LowerBUILD_VECTOR(Val, DAG); + if (Val.getNode()) + return DAG.getNode(ISD::BITCAST, dl, VT, Val); + } + } + + // If there was only one constant value used and for more than one lane, + // start by splatting that value, then replace the non-constant lanes. This + // is better than the default, which will perform a separate initialization + // for each lane. + if (NumConstantLanes > 0 && usesOnlyOneConstantValue) { + SDValue Val = DAG.getNode(AArch64ISD::DUP, dl, VT, ConstantValue); + // Now insert the non-constant lanes. 
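+    // E.g. a v4i32 of the form {1, 1, 1, X} is built as a DUP of the
+    // constant 1 followed by a single INSERT_VECTOR_ELT of X into lane 3,
+    // rather than four separate lane initializations.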
+ for (unsigned i = 0; i < NumElts; ++i) { + SDValue V = Op.getOperand(i); + SDValue LaneIdx = DAG.getConstant(i, MVT::i64); + if (!isa(V) && !isa(V)) { + // Note that type legalization likely mucked about with the VT of the + // source operand, so we may have to convert it here before inserting. + Val = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Val, V, LaneIdx); + } + } + return Val; + } + + // If all elements are constants and the case above didn't get hit, fall back + // to the default expansion, which will generate a load from the constant + // pool. + if (isConstant) + return SDValue(); + + // Empirical tests suggest this is rarely worth it for vectors of length <= 2. + if (NumElts >= 4) { + SDValue shuffle = ReconstructShuffle(Op, DAG); + if (shuffle != SDValue()) + return shuffle; + } + + // If all else fails, just use a sequence of INSERT_VECTOR_ELT when we + // know the default expansion would otherwise fall back on something even + // worse. For a vector with one or two non-undef values, that's + // scalar_to_vector for the elements followed by a shuffle (provided the + // shuffle is valid for the target) and materialization element by element + // on the stack followed by a load for everything else. + if (!isConstant && !usesOnlyOneValue) { + SDValue Vec = DAG.getUNDEF(VT); + SDValue Op0 = Op.getOperand(0); + unsigned ElemSize = VT.getVectorElementType().getSizeInBits(); + unsigned i = 0; + // For 32 and 64 bit types, use INSERT_SUBREG for lane zero to + // a) Avoid a RMW dependency on the full vector register, and + // b) Allow the register coalescer to fold away the copy if the + // value is already in an S or D register. + if (Op0.getOpcode() != ISD::UNDEF && (ElemSize == 32 || ElemSize == 64)) { + unsigned SubIdx = ElemSize == 32 ? AArch64::ssub : AArch64::dsub; + MachineSDNode *N = + DAG.getMachineNode(TargetOpcode::INSERT_SUBREG, dl, VT, Vec, Op0, + DAG.getTargetConstant(SubIdx, MVT::i32)); + Vec = SDValue(N, 0); + ++i; + } + for (; i < NumElts; ++i) { + SDValue V = Op.getOperand(i); + if (V.getOpcode() == ISD::UNDEF) + continue; + SDValue LaneIdx = DAG.getConstant(i, MVT::i64); + Vec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Vec, V, LaneIdx); + } + return Vec; + } + + // Just use the default expansion. We failed to find a better alternative. + return SDValue(); +} + +SDValue AArch64TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op, + SelectionDAG &DAG) const { + assert(Op.getOpcode() == ISD::INSERT_VECTOR_ELT && "Unknown opcode!"); + + // Check for non-constant lane. + if (!isa(Op.getOperand(2))) + return SDValue(); + + EVT VT = Op.getOperand(0).getValueType(); + + // Insertion/extraction are legal for V128 types. + if (VT == MVT::v16i8 || VT == MVT::v8i16 || VT == MVT::v4i32 || + VT == MVT::v2i64 || VT == MVT::v4f32 || VT == MVT::v2f64) + return Op; + + if (VT != MVT::v8i8 && VT != MVT::v4i16 && VT != MVT::v2i32 && + VT != MVT::v1i64 && VT != MVT::v2f32) + return SDValue(); + + // For V64 types, we perform insertion by expanding the value + // to a V128 type and perform the insertion on that. + SDLoc DL(Op); + SDValue WideVec = WidenVector(Op.getOperand(0), DAG); + EVT WideTy = WideVec.getValueType(); + + SDValue Node = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, WideTy, WideVec, + Op.getOperand(1), Op.getOperand(2)); + // Re-narrow the resultant vector. 
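+  // NarrowVector recovers the low 64 bits of the widened result, so the
+  // insert is effectively performed at V128 width and then narrowed back to
+  // the original V64 type.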
+ return NarrowVector(Node, DAG); +} + +SDValue +AArch64TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op, + SelectionDAG &DAG) const { + assert(Op.getOpcode() == ISD::EXTRACT_VECTOR_ELT && "Unknown opcode!"); + + // Check for non-constant lane. + if (!isa(Op.getOperand(1))) + return SDValue(); + + EVT VT = Op.getOperand(0).getValueType(); + + // Insertion/extraction are legal for V128 types. + if (VT == MVT::v16i8 || VT == MVT::v8i16 || VT == MVT::v4i32 || + VT == MVT::v2i64 || VT == MVT::v4f32 || VT == MVT::v2f64) + return Op; + + if (VT != MVT::v8i8 && VT != MVT::v4i16 && VT != MVT::v2i32 && + VT != MVT::v1i64 && VT != MVT::v2f32) + return SDValue(); + + // For V64 types, we perform extraction by expanding the value + // to a V128 type and perform the extraction on that. + SDLoc DL(Op); + SDValue WideVec = WidenVector(Op.getOperand(0), DAG); + EVT WideTy = WideVec.getValueType(); + + EVT ExtrTy = WideTy.getVectorElementType(); + if (ExtrTy == MVT::i16 || ExtrTy == MVT::i8) + ExtrTy = MVT::i32; + + // For extractions, we just return the result directly. + return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ExtrTy, WideVec, + Op.getOperand(1)); +} + +SDValue AArch64TargetLowering::LowerEXTRACT_SUBVECTOR(SDValue Op, + SelectionDAG &DAG) const { + EVT VT = Op.getOperand(0).getValueType(); + SDLoc dl(Op); + // Just in case... + if (!VT.isVector()) + return SDValue(); + + ConstantSDNode *Cst = dyn_cast(Op.getOperand(1)); + if (!Cst) + return SDValue(); + unsigned Val = Cst->getZExtValue(); + + unsigned Size = Op.getValueType().getSizeInBits(); + if (Val == 0) { + switch (Size) { + case 8: + return DAG.getTargetExtractSubreg(AArch64::bsub, dl, Op.getValueType(), + Op.getOperand(0)); + case 16: + return DAG.getTargetExtractSubreg(AArch64::hsub, dl, Op.getValueType(), + Op.getOperand(0)); + case 32: + return DAG.getTargetExtractSubreg(AArch64::ssub, dl, Op.getValueType(), + Op.getOperand(0)); + case 64: + return DAG.getTargetExtractSubreg(AArch64::dsub, dl, Op.getValueType(), + Op.getOperand(0)); + default: + llvm_unreachable("Unexpected vector type in extract_subvector!"); + } + } + // If this is extracting the upper 64-bits of a 128-bit vector, we match + // that directly. + if (Size == 64 && Val * VT.getVectorElementType().getSizeInBits() == 64) + return Op; + + return SDValue(); +} + +bool AArch64TargetLowering::isShuffleMaskLegal(const SmallVectorImpl &M, + EVT VT) const { + if (VT.getVectorNumElements() == 4 && + (VT.is128BitVector() || VT.is64BitVector())) { + unsigned PFIndexes[4]; + for (unsigned i = 0; i != 4; ++i) { + if (M[i] < 0) + PFIndexes[i] = 8; + else + PFIndexes[i] = M[i]; + } + + // Compute the index in the perfect shuffle table. + unsigned PFTableIndex = PFIndexes[0] * 9 * 9 * 9 + PFIndexes[1] * 9 * 9 + + PFIndexes[2] * 9 + PFIndexes[3]; + unsigned PFEntry = PerfectShuffleTable[PFTableIndex]; + unsigned Cost = (PFEntry >> 30); + + if (Cost <= 4) + return true; + } + + bool DummyBool; + int DummyInt; + unsigned DummyUnsigned; + + return (ShuffleVectorSDNode::isSplatMask(&M[0], VT) || isREVMask(M, VT, 64) || + isREVMask(M, VT, 32) || isREVMask(M, VT, 16) || + isEXTMask(M, VT, DummyBool, DummyUnsigned) || + // isTBLMask(M, VT) || // FIXME: Port TBL support from ARM. 
+ isTRNMask(M, VT, DummyUnsigned) || isUZPMask(M, VT, DummyUnsigned) || + isZIPMask(M, VT, DummyUnsigned) || + isTRN_v_undef_Mask(M, VT, DummyUnsigned) || + isUZP_v_undef_Mask(M, VT, DummyUnsigned) || + isZIP_v_undef_Mask(M, VT, DummyUnsigned) || + isINSMask(M, VT.getVectorNumElements(), DummyBool, DummyInt) || + isConcatMask(M, VT, VT.getSizeInBits() == 128)); +} + +/// getVShiftImm - Check if this is a valid build_vector for the immediate +/// operand of a vector shift operation, where all the elements of the +/// build_vector must have the same constant integer value. +static bool getVShiftImm(SDValue Op, unsigned ElementBits, int64_t &Cnt) { + // Ignore bit_converts. + while (Op.getOpcode() == ISD::BITCAST) + Op = Op.getOperand(0); + BuildVectorSDNode *BVN = dyn_cast(Op.getNode()); + APInt SplatBits, SplatUndef; + unsigned SplatBitSize; + bool HasAnyUndefs; + if (!BVN || !BVN->isConstantSplat(SplatBits, SplatUndef, SplatBitSize, + HasAnyUndefs, ElementBits) || + SplatBitSize > ElementBits) + return false; + Cnt = SplatBits.getSExtValue(); + return true; +} + +/// isVShiftLImm - Check if this is a valid build_vector for the immediate +/// operand of a vector shift left operation. That value must be in the range: +/// 0 <= Value < ElementBits for a left shift; or +/// 0 <= Value <= ElementBits for a long left shift. +static bool isVShiftLImm(SDValue Op, EVT VT, bool isLong, int64_t &Cnt) { + assert(VT.isVector() && "vector shift count is not a vector type"); + unsigned ElementBits = VT.getVectorElementType().getSizeInBits(); + if (!getVShiftImm(Op, ElementBits, Cnt)) + return false; + return (Cnt >= 0 && (isLong ? Cnt - 1 : Cnt) < ElementBits); +} + +/// isVShiftRImm - Check if this is a valid build_vector for the immediate +/// operand of a vector shift right operation. For a shift opcode, the value +/// is positive, but for an intrinsic the value count must be negative. The +/// absolute value must be in the range: +/// 1 <= |Value| <= ElementBits for a right shift; or +/// 1 <= |Value| <= ElementBits/2 for a narrow right shift. +static bool isVShiftRImm(SDValue Op, EVT VT, bool isNarrow, bool isIntrinsic, + int64_t &Cnt) { + assert(VT.isVector() && "vector shift count is not a vector type"); + unsigned ElementBits = VT.getVectorElementType().getSizeInBits(); + if (!getVShiftImm(Op, ElementBits, Cnt)) + return false; + if (isIntrinsic) + Cnt = -Cnt; + return (Cnt >= 1 && Cnt <= (isNarrow ? ElementBits / 2 : ElementBits)); +} + +SDValue AArch64TargetLowering::LowerVectorSRA_SRL_SHL(SDValue Op, + SelectionDAG &DAG) const { + EVT VT = Op.getValueType(); + SDLoc DL(Op); + int64_t Cnt; + + if (!Op.getOperand(1).getValueType().isVector()) + return Op; + unsigned EltSize = VT.getVectorElementType().getSizeInBits(); + + switch (Op.getOpcode()) { + default: + llvm_unreachable("unexpected shift opcode"); + + case ISD::SHL: + if (isVShiftLImm(Op.getOperand(1), VT, false, Cnt) && Cnt < EltSize) + return DAG.getNode(AArch64ISD::VSHL, SDLoc(Op), VT, Op.getOperand(0), + DAG.getConstant(Cnt, MVT::i32)); + return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT, + DAG.getConstant(Intrinsic::aarch64_neon_ushl, MVT::i32), + Op.getOperand(0), Op.getOperand(1)); + case ISD::SRA: + case ISD::SRL: + // Right shift immediate + if (isVShiftRImm(Op.getOperand(1), VT, false, false, Cnt) && + Cnt < EltSize) { + unsigned Opc = + (Op.getOpcode() == ISD::SRA) ? 
AArch64ISD::VASHR : AArch64ISD::VLSHR; + return DAG.getNode(Opc, SDLoc(Op), VT, Op.getOperand(0), + DAG.getConstant(Cnt, MVT::i32)); + } + + // Right shift register. Note, there is not a shift right register + // instruction, but the shift left register instruction takes a signed + // value, where negative numbers specify a right shift. + unsigned Opc = (Op.getOpcode() == ISD::SRA) ? Intrinsic::aarch64_neon_sshl + : Intrinsic::aarch64_neon_ushl; + // negate the shift amount + SDValue NegShift = DAG.getNode(AArch64ISD::NEG, DL, VT, Op.getOperand(1)); + SDValue NegShiftLeft = + DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT, + DAG.getConstant(Opc, MVT::i32), Op.getOperand(0), NegShift); + return NegShiftLeft; + } + + return SDValue(); +} + +static SDValue EmitVectorComparison(SDValue LHS, SDValue RHS, + AArch64CC::CondCode CC, bool NoNans, EVT VT, + SDLoc dl, SelectionDAG &DAG) { + EVT SrcVT = LHS.getValueType(); + + BuildVectorSDNode *BVN = dyn_cast(RHS.getNode()); + APInt CnstBits(VT.getSizeInBits(), 0); + APInt UndefBits(VT.getSizeInBits(), 0); + bool IsCnst = BVN && resolveBuildVector(BVN, CnstBits, UndefBits); + bool IsZero = IsCnst && (CnstBits == 0); + + if (SrcVT.getVectorElementType().isFloatingPoint()) { + switch (CC) { + default: + return SDValue(); + case AArch64CC::NE: { + SDValue Fcmeq; + if (IsZero) + Fcmeq = DAG.getNode(AArch64ISD::FCMEQz, dl, VT, LHS); + else + Fcmeq = DAG.getNode(AArch64ISD::FCMEQ, dl, VT, LHS, RHS); + return DAG.getNode(AArch64ISD::NOT, dl, VT, Fcmeq); + } + case AArch64CC::EQ: + if (IsZero) + return DAG.getNode(AArch64ISD::FCMEQz, dl, VT, LHS); + return DAG.getNode(AArch64ISD::FCMEQ, dl, VT, LHS, RHS); + case AArch64CC::GE: + if (IsZero) + return DAG.getNode(AArch64ISD::FCMGEz, dl, VT, LHS); + return DAG.getNode(AArch64ISD::FCMGE, dl, VT, LHS, RHS); + case AArch64CC::GT: + if (IsZero) + return DAG.getNode(AArch64ISD::FCMGTz, dl, VT, LHS); + return DAG.getNode(AArch64ISD::FCMGT, dl, VT, LHS, RHS); + case AArch64CC::LS: + if (IsZero) + return DAG.getNode(AArch64ISD::FCMLEz, dl, VT, LHS); + return DAG.getNode(AArch64ISD::FCMGE, dl, VT, RHS, LHS); + case AArch64CC::LT: + if (!NoNans) + return SDValue(); + // If we ignore NaNs then we can use to the MI implementation. + // Fallthrough. 
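+    // (An FCMLT against a non-zero RHS is emitted below as FCMGT with the
+    // operands swapped.)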
+ case AArch64CC::MI: + if (IsZero) + return DAG.getNode(AArch64ISD::FCMLTz, dl, VT, LHS); + return DAG.getNode(AArch64ISD::FCMGT, dl, VT, RHS, LHS); + } + } + + switch (CC) { + default: + return SDValue(); + case AArch64CC::NE: { + SDValue Cmeq; + if (IsZero) + Cmeq = DAG.getNode(AArch64ISD::CMEQz, dl, VT, LHS); + else + Cmeq = DAG.getNode(AArch64ISD::CMEQ, dl, VT, LHS, RHS); + return DAG.getNode(AArch64ISD::NOT, dl, VT, Cmeq); + } + case AArch64CC::EQ: + if (IsZero) + return DAG.getNode(AArch64ISD::CMEQz, dl, VT, LHS); + return DAG.getNode(AArch64ISD::CMEQ, dl, VT, LHS, RHS); + case AArch64CC::GE: + if (IsZero) + return DAG.getNode(AArch64ISD::CMGEz, dl, VT, LHS); + return DAG.getNode(AArch64ISD::CMGE, dl, VT, LHS, RHS); + case AArch64CC::GT: + if (IsZero) + return DAG.getNode(AArch64ISD::CMGTz, dl, VT, LHS); + return DAG.getNode(AArch64ISD::CMGT, dl, VT, LHS, RHS); + case AArch64CC::LE: + if (IsZero) + return DAG.getNode(AArch64ISD::CMLEz, dl, VT, LHS); + return DAG.getNode(AArch64ISD::CMGE, dl, VT, RHS, LHS); + case AArch64CC::LS: + return DAG.getNode(AArch64ISD::CMHS, dl, VT, RHS, LHS); + case AArch64CC::LO: + return DAG.getNode(AArch64ISD::CMHI, dl, VT, RHS, LHS); + case AArch64CC::LT: + if (IsZero) + return DAG.getNode(AArch64ISD::CMLTz, dl, VT, LHS); + return DAG.getNode(AArch64ISD::CMGT, dl, VT, RHS, LHS); + case AArch64CC::HI: + return DAG.getNode(AArch64ISD::CMHI, dl, VT, LHS, RHS); + case AArch64CC::HS: + return DAG.getNode(AArch64ISD::CMHS, dl, VT, LHS, RHS); + } +} + +SDValue AArch64TargetLowering::LowerVSETCC(SDValue Op, + SelectionDAG &DAG) const { + ISD::CondCode CC = cast(Op.getOperand(2))->get(); + SDValue LHS = Op.getOperand(0); + SDValue RHS = Op.getOperand(1); + SDLoc dl(Op); + + if (LHS.getValueType().getVectorElementType().isInteger()) { + assert(LHS.getValueType() == RHS.getValueType()); + AArch64CC::CondCode AArch64CC = changeIntCCToAArch64CC(CC); + return EmitVectorComparison(LHS, RHS, AArch64CC, false, Op.getValueType(), + dl, DAG); + } + + assert(LHS.getValueType().getVectorElementType() == MVT::f32 || + LHS.getValueType().getVectorElementType() == MVT::f64); + + // Unfortunately, the mapping of LLVM FP CC's onto AArch64 CC's isn't totally + // clean. Some of them require two branches to implement. + AArch64CC::CondCode CC1, CC2; + bool ShouldInvert; + changeVectorFPCCToAArch64CC(CC, CC1, CC2, ShouldInvert); + + bool NoNaNs = getTargetMachine().Options.NoNaNsFPMath; + SDValue Cmp = + EmitVectorComparison(LHS, RHS, CC1, NoNaNs, Op.getValueType(), dl, DAG); + if (!Cmp.getNode()) + return SDValue(); + + if (CC2 != AArch64CC::AL) { + SDValue Cmp2 = + EmitVectorComparison(LHS, RHS, CC2, NoNaNs, Op.getValueType(), dl, DAG); + if (!Cmp2.getNode()) + return SDValue(); + + Cmp = DAG.getNode(ISD::OR, dl, Cmp.getValueType(), Cmp, Cmp2); + } + + if (ShouldInvert) + return Cmp = DAG.getNOT(dl, Cmp, Cmp.getValueType()); + + return Cmp; +} + +/// getTgtMemIntrinsic - Represent NEON load and store intrinsics as +/// MemIntrinsicNodes. The associated MachineMemOperands record the alignment +/// specified in the intrinsic calls. 
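+/// For example, an aarch64_neon_ld2 of two v4i32 vectors is described
+/// conservatively as a single 32-byte read from its pointer operand.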
+bool AArch64TargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info, + const CallInst &I, + unsigned Intrinsic) const { + switch (Intrinsic) { + case Intrinsic::aarch64_neon_ld2: + case Intrinsic::aarch64_neon_ld3: + case Intrinsic::aarch64_neon_ld4: + case Intrinsic::aarch64_neon_ld1x2: + case Intrinsic::aarch64_neon_ld1x3: + case Intrinsic::aarch64_neon_ld1x4: + case Intrinsic::aarch64_neon_ld2lane: + case Intrinsic::aarch64_neon_ld3lane: + case Intrinsic::aarch64_neon_ld4lane: + case Intrinsic::aarch64_neon_ld2r: + case Intrinsic::aarch64_neon_ld3r: + case Intrinsic::aarch64_neon_ld4r: { + Info.opc = ISD::INTRINSIC_W_CHAIN; + // Conservatively set memVT to the entire set of vectors loaded. + uint64_t NumElts = getDataLayout()->getTypeAllocSize(I.getType()) / 8; + Info.memVT = EVT::getVectorVT(I.getType()->getContext(), MVT::i64, NumElts); + Info.ptrVal = I.getArgOperand(I.getNumArgOperands() - 1); + Info.offset = 0; + Info.align = 0; + Info.vol = false; // volatile loads with NEON intrinsics not supported + Info.readMem = true; + Info.writeMem = false; + return true; + } + case Intrinsic::aarch64_neon_st2: + case Intrinsic::aarch64_neon_st3: + case Intrinsic::aarch64_neon_st4: + case Intrinsic::aarch64_neon_st1x2: + case Intrinsic::aarch64_neon_st1x3: + case Intrinsic::aarch64_neon_st1x4: + case Intrinsic::aarch64_neon_st2lane: + case Intrinsic::aarch64_neon_st3lane: + case Intrinsic::aarch64_neon_st4lane: { + Info.opc = ISD::INTRINSIC_VOID; + // Conservatively set memVT to the entire set of vectors stored. + unsigned NumElts = 0; + for (unsigned ArgI = 1, ArgE = I.getNumArgOperands(); ArgI < ArgE; ++ArgI) { + Type *ArgTy = I.getArgOperand(ArgI)->getType(); + if (!ArgTy->isVectorTy()) + break; + NumElts += getDataLayout()->getTypeAllocSize(ArgTy) / 8; + } + Info.memVT = EVT::getVectorVT(I.getType()->getContext(), MVT::i64, NumElts); + Info.ptrVal = I.getArgOperand(I.getNumArgOperands() - 1); + Info.offset = 0; + Info.align = 0; + Info.vol = false; // volatile stores with NEON intrinsics not supported + Info.readMem = false; + Info.writeMem = true; + return true; + } + case Intrinsic::aarch64_ldaxr: + case Intrinsic::aarch64_ldxr: { + PointerType *PtrTy = cast(I.getArgOperand(0)->getType()); + Info.opc = ISD::INTRINSIC_W_CHAIN; + Info.memVT = MVT::getVT(PtrTy->getElementType()); + Info.ptrVal = I.getArgOperand(0); + Info.offset = 0; + Info.align = getDataLayout()->getABITypeAlignment(PtrTy->getElementType()); + Info.vol = true; + Info.readMem = true; + Info.writeMem = false; + return true; + } + case Intrinsic::aarch64_stlxr: + case Intrinsic::aarch64_stxr: { + PointerType *PtrTy = cast(I.getArgOperand(1)->getType()); + Info.opc = ISD::INTRINSIC_W_CHAIN; + Info.memVT = MVT::getVT(PtrTy->getElementType()); + Info.ptrVal = I.getArgOperand(1); + Info.offset = 0; + Info.align = getDataLayout()->getABITypeAlignment(PtrTy->getElementType()); + Info.vol = true; + Info.readMem = false; + Info.writeMem = true; + return true; + } + case Intrinsic::aarch64_ldaxp: + case Intrinsic::aarch64_ldxp: { + Info.opc = ISD::INTRINSIC_W_CHAIN; + Info.memVT = MVT::i128; + Info.ptrVal = I.getArgOperand(0); + Info.offset = 0; + Info.align = 16; + Info.vol = true; + Info.readMem = true; + Info.writeMem = false; + return true; + } + case Intrinsic::aarch64_stlxp: + case Intrinsic::aarch64_stxp: { + Info.opc = ISD::INTRINSIC_W_CHAIN; + Info.memVT = MVT::i128; + Info.ptrVal = I.getArgOperand(2); + Info.offset = 0; + Info.align = 16; + Info.vol = true; + Info.readMem = false; + Info.writeMem = true; + return true; + 
} + default: + break; + } + + return false; +} + +// Truncations from 64-bit GPR to 32-bit GPR is free. +bool AArch64TargetLowering::isTruncateFree(Type *Ty1, Type *Ty2) const { + if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy()) + return false; + unsigned NumBits1 = Ty1->getPrimitiveSizeInBits(); + unsigned NumBits2 = Ty2->getPrimitiveSizeInBits(); + if (NumBits1 <= NumBits2) + return false; + return true; +} +bool AArch64TargetLowering::isTruncateFree(EVT VT1, EVT VT2) const { + if (!VT1.isInteger() || !VT2.isInteger()) + return false; + unsigned NumBits1 = VT1.getSizeInBits(); + unsigned NumBits2 = VT2.getSizeInBits(); + if (NumBits1 <= NumBits2) + return false; + return true; +} + +// All 32-bit GPR operations implicitly zero the high-half of the corresponding +// 64-bit GPR. +bool AArch64TargetLowering::isZExtFree(Type *Ty1, Type *Ty2) const { + if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy()) + return false; + unsigned NumBits1 = Ty1->getPrimitiveSizeInBits(); + unsigned NumBits2 = Ty2->getPrimitiveSizeInBits(); + if (NumBits1 == 32 && NumBits2 == 64) + return true; + return false; +} +bool AArch64TargetLowering::isZExtFree(EVT VT1, EVT VT2) const { + if (!VT1.isInteger() || !VT2.isInteger()) + return false; + unsigned NumBits1 = VT1.getSizeInBits(); + unsigned NumBits2 = VT2.getSizeInBits(); + if (NumBits1 == 32 && NumBits2 == 64) + return true; + return false; +} + +bool AArch64TargetLowering::isZExtFree(SDValue Val, EVT VT2) const { + EVT VT1 = Val.getValueType(); + if (isZExtFree(VT1, VT2)) { + return true; + } + + if (Val.getOpcode() != ISD::LOAD) + return false; + + // 8-, 16-, and 32-bit integer loads all implicitly zero-extend. + return (VT1.isSimple() && VT1.isInteger() && VT2.isSimple() && + VT2.isInteger() && VT1.getSizeInBits() <= 32); +} + +bool AArch64TargetLowering::hasPairedLoad(Type *LoadedType, + unsigned &RequiredAligment) const { + if (!LoadedType->isIntegerTy() && !LoadedType->isFloatTy()) + return false; + // Cyclone supports unaligned accesses. + RequiredAligment = 0; + unsigned NumBits = LoadedType->getPrimitiveSizeInBits(); + return NumBits == 32 || NumBits == 64; +} + +bool AArch64TargetLowering::hasPairedLoad(EVT LoadedType, + unsigned &RequiredAligment) const { + if (!LoadedType.isSimple() || + (!LoadedType.isInteger() && !LoadedType.isFloatingPoint())) + return false; + // Cyclone supports unaligned accesses. + RequiredAligment = 0; + unsigned NumBits = LoadedType.getSizeInBits(); + return NumBits == 32 || NumBits == 64; +} + +static bool memOpAlign(unsigned DstAlign, unsigned SrcAlign, + unsigned AlignCheck) { + return ((SrcAlign == 0 || SrcAlign % AlignCheck == 0) && + (DstAlign == 0 || DstAlign % AlignCheck == 0)); +} + +EVT AArch64TargetLowering::getOptimalMemOpType(uint64_t Size, unsigned DstAlign, + unsigned SrcAlign, bool IsMemset, + bool ZeroMemset, + bool MemcpyStrSrc, + MachineFunction &MF) const { + // Don't use AdvSIMD to implement 16-byte memset. It would have taken one + // instruction to materialize the v2i64 zero and one store (with restrictive + // addressing mode). Just do two i64 store of zero-registers. + bool Fast; + const Function *F = MF.getFunction(); + if (Subtarget->hasFPARMv8() && !IsMemset && Size >= 16 && + !F->getAttributes().hasAttribute(AttributeSet::FunctionIndex, + Attribute::NoImplicitFloat) && + (memOpAlign(SrcAlign, DstAlign, 16) || + (allowsUnalignedMemoryAccesses(MVT::f128, 0, &Fast) && Fast))) + return MVT::f128; + + return Size >= 8 ? 
MVT::i64 : MVT::i32; +} + +// 12-bit optionally shifted immediates are legal for adds. +bool AArch64TargetLowering::isLegalAddImmediate(int64_t Immed) const { + if ((Immed >> 12) == 0 || ((Immed & 0xfff) == 0 && Immed >> 24 == 0)) + return true; + return false; +} + +// Integer comparisons are implemented with ADDS/SUBS, so the range of valid +// immediates is the same as for an add or a sub. +bool AArch64TargetLowering::isLegalICmpImmediate(int64_t Immed) const { + if (Immed < 0) + Immed *= -1; + return isLegalAddImmediate(Immed); +} + +/// isLegalAddressingMode - Return true if the addressing mode represented +/// by AM is legal for this target, for a load/store of the specified type. +bool AArch64TargetLowering::isLegalAddressingMode(const AddrMode &AM, + Type *Ty) const { + // AArch64 has five basic addressing modes: + // reg + // reg + 9-bit signed offset + // reg + SIZE_IN_BYTES * 12-bit unsigned offset + // reg1 + reg2 + // reg + SIZE_IN_BYTES * reg + + // No global is ever allowed as a base. + if (AM.BaseGV) + return false; + + // No reg+reg+imm addressing. + if (AM.HasBaseReg && AM.BaseOffs && AM.Scale) + return false; + + // check reg + imm case: + // i.e., reg + 0, reg + imm9, reg + SIZE_IN_BYTES * uimm12 + uint64_t NumBytes = 0; + if (Ty->isSized()) { + uint64_t NumBits = getDataLayout()->getTypeSizeInBits(Ty); + NumBytes = NumBits / 8; + if (!isPowerOf2_64(NumBits)) + NumBytes = 0; + } + + if (!AM.Scale) { + int64_t Offset = AM.BaseOffs; + + // 9-bit signed offset + if (Offset >= -(1LL << 9) && Offset <= (1LL << 9) - 1) + return true; + + // 12-bit unsigned offset + unsigned shift = Log2_64(NumBytes); + if (NumBytes && Offset > 0 && (Offset / NumBytes) <= (1LL << 12) - 1 && + // Must be a multiple of NumBytes (NumBytes is a power of 2) + (Offset >> shift) << shift == Offset) + return true; + return false; + } + + // Check reg1 + SIZE_IN_BYTES * reg2 and reg1 + reg2 + + if (!AM.Scale || AM.Scale == 1 || + (AM.Scale > 0 && (uint64_t)AM.Scale == NumBytes)) + return true; + return false; +} + +int AArch64TargetLowering::getScalingFactorCost(const AddrMode &AM, + Type *Ty) const { + // Scaling factors are not free at all. + // Operands | Rt Latency + // ------------------------------------------- + // Rt, [Xn, Xm] | 4 + // ------------------------------------------- + // Rt, [Xn, Xm, lsl #imm] | Rn: 4 Rm: 5 + // Rt, [Xn, Wm, #imm] | + if (isLegalAddressingMode(AM, Ty)) + // Scale represents reg2 * scale, thus account for 1 if + // it is not equal to 0 or 1. + return AM.Scale != 0 && AM.Scale != 1; + return -1; +} + +bool AArch64TargetLowering::isFMAFasterThanFMulAndFAdd(EVT VT) const { + VT = VT.getScalarType(); + + if (!VT.isSimple()) + return false; + + switch (VT.getSimpleVT().SimpleTy) { + case MVT::f32: + case MVT::f64: + return true; + default: + break; + } + + return false; +} + +const MCPhysReg * +AArch64TargetLowering::getScratchRegisters(CallingConv::ID) const { + // LR is a callee-save register, but we must treat it as clobbered by any call + // site. Hence we include LR in the scratch registers, which are in turn added + // as implicit-defs for stackmaps and patchpoints. + static const MCPhysReg ScratchRegs[] = { + AArch64::X16, AArch64::X17, AArch64::LR, 0 + }; + return ScratchRegs; +} + +bool +AArch64TargetLowering::isDesirableToCommuteWithShift(const SDNode *N) const { + EVT VT = N->getValueType(0); + // If N is unsigned bit extraction: ((x >> C) & mask), then do not combine + // it with shift to let it be lowered to UBFX. 
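+  // E.g. ((x >> 3) & 0xff) is a contiguous 8-bit field starting at bit 3 and
+  // maps directly onto UBFX; letting the shift be commuted away would hide
+  // that pattern.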
+ if (N->getOpcode() == ISD::AND && (VT == MVT::i32 || VT == MVT::i64) && + isa(N->getOperand(1))) { + uint64_t TruncMask = N->getConstantOperandVal(1); + if (isMask_64(TruncMask) && + N->getOperand(0).getOpcode() == ISD::SRL && + isa(N->getOperand(0)->getOperand(1))) + return false; + } + return true; +} + +bool AArch64TargetLowering::shouldConvertConstantLoadToIntImm(const APInt &Imm, + Type *Ty) const { + assert(Ty->isIntegerTy()); + + unsigned BitSize = Ty->getPrimitiveSizeInBits(); + if (BitSize == 0) + return false; + + int64_t Val = Imm.getSExtValue(); + if (Val == 0 || AArch64_AM::isLogicalImmediate(Val, BitSize)) + return true; + + if ((int64_t)Val < 0) + Val = ~Val; + if (BitSize == 32) + Val &= (1LL << 32) - 1; + + unsigned LZ = countLeadingZeros((uint64_t)Val); + unsigned Shift = (63 - LZ) / 16; + // MOVZ is free so return true for one or fewer MOVK. + return (Shift < 3) ? true : false; +} + +// Generate SUBS and CSEL for integer abs. +static SDValue performIntegerAbsCombine(SDNode *N, SelectionDAG &DAG) { + EVT VT = N->getValueType(0); + + SDValue N0 = N->getOperand(0); + SDValue N1 = N->getOperand(1); + SDLoc DL(N); + + // Check pattern of XOR(ADD(X,Y), Y) where Y is SRA(X, size(X)-1) + // and change it to SUB and CSEL. + if (VT.isInteger() && N->getOpcode() == ISD::XOR && + N0.getOpcode() == ISD::ADD && N0.getOperand(1) == N1 && + N1.getOpcode() == ISD::SRA && N1.getOperand(0) == N0.getOperand(0)) + if (ConstantSDNode *Y1C = dyn_cast(N1.getOperand(1))) + if (Y1C->getAPIntValue() == VT.getSizeInBits() - 1) { + SDValue Neg = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, VT), + N0.getOperand(0)); + // Generate SUBS & CSEL. + SDValue Cmp = + DAG.getNode(AArch64ISD::SUBS, DL, DAG.getVTList(VT, MVT::i32), + N0.getOperand(0), DAG.getConstant(0, VT)); + return DAG.getNode(AArch64ISD::CSEL, DL, VT, N0.getOperand(0), Neg, + DAG.getConstant(AArch64CC::PL, MVT::i32), + SDValue(Cmp.getNode(), 1)); + } + return SDValue(); +} + +// performXorCombine - Attempts to handle integer ABS. +static SDValue performXorCombine(SDNode *N, SelectionDAG &DAG, + TargetLowering::DAGCombinerInfo &DCI, + const AArch64Subtarget *Subtarget) { + if (DCI.isBeforeLegalizeOps()) + return SDValue(); + + return performIntegerAbsCombine(N, DAG); +} + +static SDValue performMulCombine(SDNode *N, SelectionDAG &DAG, + TargetLowering::DAGCombinerInfo &DCI, + const AArch64Subtarget *Subtarget) { + if (DCI.isBeforeLegalizeOps()) + return SDValue(); + + // Multiplication of a power of two plus/minus one can be done more + // cheaply as as shift+add/sub. For now, this is true unilaterally. If + // future CPUs have a cheaper MADD instruction, this may need to be + // gated on a subtarget feature. For Cyclone, 32-bit MADD is 4 cycles and + // 64-bit is 5 cycles, so this is always a win. + if (ConstantSDNode *C = dyn_cast(N->getOperand(1))) { + APInt Value = C->getAPIntValue(); + EVT VT = N->getValueType(0); + APInt VP1 = Value + 1; + if (VP1.isPowerOf2()) { + // Multiplying by one less than a power of two, replace with a shift + // and a subtract. + SDValue ShiftedVal = + DAG.getNode(ISD::SHL, SDLoc(N), VT, N->getOperand(0), + DAG.getConstant(VP1.logBase2(), MVT::i64)); + return DAG.getNode(ISD::SUB, SDLoc(N), VT, ShiftedVal, N->getOperand(0)); + } + APInt VM1 = Value - 1; + if (VM1.isPowerOf2()) { + // Multiplying by one more than a power of two, replace with a shift + // and an add. 
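+      // E.g. mul x, 9 becomes (x << 3) + x here, just as mul x, 7 above
+      // becomes (x << 3) - x.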
+ SDValue ShiftedVal = + DAG.getNode(ISD::SHL, SDLoc(N), VT, N->getOperand(0), + DAG.getConstant(VM1.logBase2(), MVT::i64)); + return DAG.getNode(ISD::ADD, SDLoc(N), VT, ShiftedVal, N->getOperand(0)); + } + } + return SDValue(); +} + +static SDValue performIntToFpCombine(SDNode *N, SelectionDAG &DAG) { + EVT VT = N->getValueType(0); + if (VT != MVT::f32 && VT != MVT::f64) + return SDValue(); + // Only optimize when the source and destination types have the same width. + if (VT.getSizeInBits() != N->getOperand(0).getValueType().getSizeInBits()) + return SDValue(); + + // If the result of an integer load is only used by an integer-to-float + // conversion, use a fp load instead and a AdvSIMD scalar {S|U}CVTF instead. + // This eliminates an "integer-to-vector-move UOP and improve throughput. + SDValue N0 = N->getOperand(0); + if (ISD::isNormalLoad(N0.getNode()) && N0.hasOneUse() && + // Do not change the width of a volatile load. + !cast(N0)->isVolatile()) { + LoadSDNode *LN0 = cast(N0); + SDValue Load = DAG.getLoad(VT, SDLoc(N), LN0->getChain(), LN0->getBasePtr(), + LN0->getPointerInfo(), LN0->isVolatile(), + LN0->isNonTemporal(), LN0->isInvariant(), + LN0->getAlignment()); + + // Make sure successors of the original load stay after it by updating them + // to use the new Chain. + DAG.ReplaceAllUsesOfValueWith(SDValue(LN0, 1), Load.getValue(1)); + + unsigned Opcode = + (N->getOpcode() == ISD::SINT_TO_FP) ? AArch64ISD::SITOF : AArch64ISD::UITOF; + return DAG.getNode(Opcode, SDLoc(N), VT, Load); + } + + return SDValue(); +} + +/// An EXTR instruction is made up of two shifts, ORed together. This helper +/// searches for and classifies those shifts. +static bool findEXTRHalf(SDValue N, SDValue &Src, uint32_t &ShiftAmount, + bool &FromHi) { + if (N.getOpcode() == ISD::SHL) + FromHi = false; + else if (N.getOpcode() == ISD::SRL) + FromHi = true; + else + return false; + + if (!isa(N.getOperand(1))) + return false; + + ShiftAmount = N->getConstantOperandVal(1); + Src = N->getOperand(0); + return true; +} + +/// EXTR instruction extracts a contiguous chunk of bits from two existing +/// registers viewed as a high/low pair. This function looks for the pattern: +/// (or (shl VAL1, #N), (srl VAL2, #RegWidth-N)) and replaces it with an +/// EXTR. Can't quite be done in TableGen because the two immediates aren't +/// independent. +static SDValue tryCombineToEXTR(SDNode *N, + TargetLowering::DAGCombinerInfo &DCI) { + SelectionDAG &DAG = DCI.DAG; + SDLoc DL(N); + EVT VT = N->getValueType(0); + + assert(N->getOpcode() == ISD::OR && "Unexpected root"); + + if (VT != MVT::i32 && VT != MVT::i64) + return SDValue(); + + SDValue LHS; + uint32_t ShiftLHS = 0; + bool LHSFromHi = 0; + if (!findEXTRHalf(N->getOperand(0), LHS, ShiftLHS, LHSFromHi)) + return SDValue(); + + SDValue RHS; + uint32_t ShiftRHS = 0; + bool RHSFromHi = 0; + if (!findEXTRHalf(N->getOperand(1), RHS, ShiftRHS, RHSFromHi)) + return SDValue(); + + // If they're both trying to come from the high part of the register, they're + // not really an EXTR. 
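+  // E.g. for i32, (or (shl A, #24), (srl B, #8)) is the 32-bit window
+  // starting at bit 8 of the A:B pair, which is exactly what EXTR computes.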
+ if (LHSFromHi == RHSFromHi) + return SDValue(); + + if (ShiftLHS + ShiftRHS != VT.getSizeInBits()) + return SDValue(); + + if (LHSFromHi) { + std::swap(LHS, RHS); + std::swap(ShiftLHS, ShiftRHS); + } + + return DAG.getNode(AArch64ISD::EXTR, DL, VT, LHS, RHS, + DAG.getConstant(ShiftRHS, MVT::i64)); +} + +static SDValue tryCombineToBSL(SDNode *N, + TargetLowering::DAGCombinerInfo &DCI) { + EVT VT = N->getValueType(0); + SelectionDAG &DAG = DCI.DAG; + SDLoc DL(N); + + if (!VT.isVector()) + return SDValue(); + + SDValue N0 = N->getOperand(0); + if (N0.getOpcode() != ISD::AND) + return SDValue(); + + SDValue N1 = N->getOperand(1); + if (N1.getOpcode() != ISD::AND) + return SDValue(); + + // We only have to look for constant vectors here since the general, variable + // case can be handled in TableGen. + unsigned Bits = VT.getVectorElementType().getSizeInBits(); + uint64_t BitMask = Bits == 64 ? -1ULL : ((1ULL << Bits) - 1); + for (int i = 1; i >= 0; --i) + for (int j = 1; j >= 0; --j) { + BuildVectorSDNode *BVN0 = dyn_cast(N0->getOperand(i)); + BuildVectorSDNode *BVN1 = dyn_cast(N1->getOperand(j)); + if (!BVN0 || !BVN1) + continue; + + bool FoundMatch = true; + for (unsigned k = 0; k < VT.getVectorNumElements(); ++k) { + ConstantSDNode *CN0 = dyn_cast(BVN0->getOperand(k)); + ConstantSDNode *CN1 = dyn_cast(BVN1->getOperand(k)); + if (!CN0 || !CN1 || + CN0->getZExtValue() != (BitMask & ~CN1->getZExtValue())) { + FoundMatch = false; + break; + } + } + + if (FoundMatch) + return DAG.getNode(AArch64ISD::BSL, DL, VT, SDValue(BVN0, 0), + N0->getOperand(1 - i), N1->getOperand(1 - j)); + } + + return SDValue(); +} + +static SDValue performORCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, + const AArch64Subtarget *Subtarget) { + // Attempt to form an EXTR from (or (shl VAL1, #N), (srl VAL2, #RegWidth-N)) + if (!EnableAArch64ExtrGeneration) + return SDValue(); + SelectionDAG &DAG = DCI.DAG; + EVT VT = N->getValueType(0); + + if (!DAG.getTargetLoweringInfo().isTypeLegal(VT)) + return SDValue(); + + SDValue Res = tryCombineToEXTR(N, DCI); + if (Res.getNode()) + return Res; + + Res = tryCombineToBSL(N, DCI); + if (Res.getNode()) + return Res; + + return SDValue(); +} + +static SDValue performBitcastCombine(SDNode *N, + TargetLowering::DAGCombinerInfo &DCI, + SelectionDAG &DAG) { + // Wait 'til after everything is legalized to try this. That way we have + // legal vector types and such. + if (DCI.isBeforeLegalizeOps()) + return SDValue(); + + // Remove extraneous bitcasts around an extract_subvector. + // For example, + // (v4i16 (bitconvert + // (extract_subvector (v2i64 (bitconvert (v8i16 ...)), (i64 1))))) + // becomes + // (extract_subvector ((v8i16 ...), (i64 4))) + + // Only interested in 64-bit vectors as the ultimate result. + EVT VT = N->getValueType(0); + if (!VT.isVector()) + return SDValue(); + if (VT.getSimpleVT().getSizeInBits() != 64) + return SDValue(); + // Is the operand an extract_subvector starting at the beginning or halfway + // point of the vector? A low half may also come through as an + // EXTRACT_SUBREG, so look for that, too. 
+ SDValue Op0 = N->getOperand(0); + if (Op0->getOpcode() != ISD::EXTRACT_SUBVECTOR && + !(Op0->isMachineOpcode() && + Op0->getMachineOpcode() == AArch64::EXTRACT_SUBREG)) + return SDValue(); + uint64_t idx = cast(Op0->getOperand(1))->getZExtValue(); + if (Op0->getOpcode() == ISD::EXTRACT_SUBVECTOR) { + if (Op0->getValueType(0).getVectorNumElements() != idx && idx != 0) + return SDValue(); + } else if (Op0->getMachineOpcode() == AArch64::EXTRACT_SUBREG) { + if (idx != AArch64::dsub) + return SDValue(); + // The dsub reference is equivalent to a lane zero subvector reference. + idx = 0; + } + // Look through the bitcast of the input to the extract. + if (Op0->getOperand(0)->getOpcode() != ISD::BITCAST) + return SDValue(); + SDValue Source = Op0->getOperand(0)->getOperand(0); + // If the source type has twice the number of elements as our destination + // type, we know this is an extract of the high or low half of the vector. + EVT SVT = Source->getValueType(0); + if (SVT.getVectorNumElements() != VT.getVectorNumElements() * 2) + return SDValue(); + + DEBUG(dbgs() << "aarch64-lower: bitcast extract_subvector simplification\n"); + + // Create the simplified form to just extract the low or high half of the + // vector directly rather than bothering with the bitcasts. + SDLoc dl(N); + unsigned NumElements = VT.getVectorNumElements(); + if (idx) { + SDValue HalfIdx = DAG.getConstant(NumElements, MVT::i64); + return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, Source, HalfIdx); + } else { + SDValue SubReg = DAG.getTargetConstant(AArch64::dsub, MVT::i32); + return SDValue(DAG.getMachineNode(TargetOpcode::EXTRACT_SUBREG, dl, VT, + Source, SubReg), + 0); + } +} + +static SDValue performConcatVectorsCombine(SDNode *N, + TargetLowering::DAGCombinerInfo &DCI, + SelectionDAG &DAG) { + // Wait 'til after everything is legalized to try this. That way we have + // legal vector types and such. + if (DCI.isBeforeLegalizeOps()) + return SDValue(); + + SDLoc dl(N); + EVT VT = N->getValueType(0); + + // If we see a (concat_vectors (v1x64 A), (v1x64 A)) it's really a vector + // splat. The indexed instructions are going to be expecting a DUPLANE64, so + // canonicalise to that. + if (N->getOperand(0) == N->getOperand(1) && VT.getVectorNumElements() == 2) { + assert(VT.getVectorElementType().getSizeInBits() == 64); + return DAG.getNode(AArch64ISD::DUPLANE64, dl, VT, + WidenVector(N->getOperand(0), DAG), + DAG.getConstant(0, MVT::i64)); + } + + // Canonicalise concat_vectors so that the right-hand vector has as few + // bit-casts as possible before its real operation. The primary matching + // destination for these operations will be the narrowing "2" instructions, + // which depend on the operation being performed on this right-hand vector. + // For example, + // (concat_vectors LHS, (v1i64 (bitconvert (v4i16 RHS)))) + // becomes + // (bitconvert (concat_vectors (v4i16 (bitconvert LHS)), RHS)) + + SDValue Op1 = N->getOperand(1); + if (Op1->getOpcode() != ISD::BITCAST) + return SDValue(); + SDValue RHS = Op1->getOperand(0); + MVT RHSTy = RHS.getValueType().getSimpleVT(); + // If the RHS is not a vector, this is not the pattern we're looking for. 
+ if (!RHSTy.isVector()) + return SDValue(); + + DEBUG(dbgs() << "aarch64-lower: concat_vectors bitcast simplification\n"); + + MVT ConcatTy = MVT::getVectorVT(RHSTy.getVectorElementType(), + RHSTy.getVectorNumElements() * 2); + return DAG.getNode( + ISD::BITCAST, dl, VT, + DAG.getNode(ISD::CONCAT_VECTORS, dl, ConcatTy, + DAG.getNode(ISD::BITCAST, dl, RHSTy, N->getOperand(0)), RHS)); +} + +static SDValue tryCombineFixedPointConvert(SDNode *N, + TargetLowering::DAGCombinerInfo &DCI, + SelectionDAG &DAG) { + // Wait 'til after everything is legalized to try this. That way we have + // legal vector types and such. + if (DCI.isBeforeLegalizeOps()) + return SDValue(); + // Transform a scalar conversion of a value from a lane extract into a + // lane extract of a vector conversion. E.g., from foo1 to foo2: + // double foo1(int64x2_t a) { return vcvtd_n_f64_s64(a[1], 9); } + // double foo2(int64x2_t a) { return vcvtq_n_f64_s64(a, 9)[1]; } + // + // The second form interacts better with instruction selection and the + // register allocator to avoid cross-class register copies that aren't + // coalescable due to a lane reference. + + // Check the operand and see if it originates from a lane extract. + SDValue Op1 = N->getOperand(1); + if (Op1.getOpcode() == ISD::EXTRACT_VECTOR_ELT) { + // Yep, no additional predication needed. Perform the transform. + SDValue IID = N->getOperand(0); + SDValue Shift = N->getOperand(2); + SDValue Vec = Op1.getOperand(0); + SDValue Lane = Op1.getOperand(1); + EVT ResTy = N->getValueType(0); + EVT VecResTy; + SDLoc DL(N); + + // The vector width should be 128 bits by the time we get here, even + // if it started as 64 bits (the extract_vector handling will have + // done so). + assert(Vec.getValueType().getSizeInBits() == 128 && + "unexpected vector size on extract_vector_elt!"); + if (Vec.getValueType() == MVT::v4i32) + VecResTy = MVT::v4f32; + else if (Vec.getValueType() == MVT::v2i64) + VecResTy = MVT::v2f64; + else + assert(0 && "unexpected vector type!"); + + SDValue Convert = + DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VecResTy, IID, Vec, Shift); + return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ResTy, Convert, Lane); + } + return SDValue(); +} + +// AArch64 high-vector "long" operations are formed by performing the non-high +// version on an extract_subvector of each operand which gets the high half: +// +// (longop2 LHS, RHS) == (longop (extract_high LHS), (extract_high RHS)) +// +// However, there are cases which don't have an extract_high explicitly, but +// have another operation that can be made compatible with one for free. For +// example: +// +// (dupv64 scalar) --> (extract_high (dup128 scalar)) +// +// This routine does the actual conversion of such DUPs, once outer routines +// have determined that everything else is in order. +static SDValue tryExtendDUPToExtractHigh(SDValue N, SelectionDAG &DAG) { + // We can handle most types of duplicate, but the lane ones have an extra + // operand saying *which* lane, so we need to know. 
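The comment above is easier to see on a concrete widening operation (a sketch using assumed ACLE intrinsics, not taken from the patch): the second operand of a *_high_n intrinsic is a duplicated scalar rather than an explicit extract_high, and rewriting the 64-bit DUP as an extract_high of a 128-bit DUP lets the "2" form of the instruction match.

#include <arm_neon.h>

// Roughly "(widening multiply (extract_high a), (dup b))" at the DAG level.
// tryExtendDUPToExtractHigh widens the dup of b to 128 bits and takes its
// high half, so both operands end up with the extract_high shape that the
// SMULL2 pattern expects.
int32x4_t widening_mul(int16x8_t a, int16_t b) {
  return vmull_high_n_s16(a, b);
}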
+ bool IsDUPLANE; + switch (N.getOpcode()) { + case AArch64ISD::DUP: + IsDUPLANE = false; + break; + case AArch64ISD::DUPLANE8: + case AArch64ISD::DUPLANE16: + case AArch64ISD::DUPLANE32: + case AArch64ISD::DUPLANE64: + IsDUPLANE = true; + break; + default: + return SDValue(); + } + + MVT NarrowTy = N.getSimpleValueType(); + if (!NarrowTy.is64BitVector()) + return SDValue(); + + MVT ElementTy = NarrowTy.getVectorElementType(); + unsigned NumElems = NarrowTy.getVectorNumElements(); + MVT NewDUPVT = MVT::getVectorVT(ElementTy, NumElems * 2); + + SDValue NewDUP; + if (IsDUPLANE) + NewDUP = DAG.getNode(N.getOpcode(), SDLoc(N), NewDUPVT, N.getOperand(0), + N.getOperand(1)); + else + NewDUP = DAG.getNode(AArch64ISD::DUP, SDLoc(N), NewDUPVT, N.getOperand(0)); + + return DAG.getNode(ISD::EXTRACT_SUBVECTOR, SDLoc(N.getNode()), NarrowTy, + NewDUP, DAG.getConstant(NumElems, MVT::i64)); +} + +static bool isEssentiallyExtractSubvector(SDValue N) { + if (N.getOpcode() == ISD::EXTRACT_SUBVECTOR) + return true; + + return N.getOpcode() == ISD::BITCAST && + N.getOperand(0).getOpcode() == ISD::EXTRACT_SUBVECTOR; +} + +/// \brief Helper structure to keep track of ISD::SET_CC operands. +struct GenericSetCCInfo { + const SDValue *Opnd0; + const SDValue *Opnd1; + ISD::CondCode CC; +}; + +/// \brief Helper structure to keep track of a SET_CC lowered into AArch64 code. +struct AArch64SetCCInfo { + const SDValue *Cmp; + AArch64CC::CondCode CC; +}; + +/// \brief Helper structure to keep track of SetCC information. +union SetCCInfo { + GenericSetCCInfo Generic; + AArch64SetCCInfo AArch64; +}; + +/// \brief Helper structure to be able to read SetCC information. If set to +/// true, IsAArch64 field, Info is a AArch64SetCCInfo, otherwise Info is a +/// GenericSetCCInfo. +struct SetCCInfoAndKind { + SetCCInfo Info; + bool IsAArch64; +}; + +/// \brief Check whether or not \p Op is a SET_CC operation, either a generic or +/// an +/// AArch64 lowered one. +/// \p SetCCInfo is filled accordingly. +/// \post SetCCInfo is meanginfull only when this function returns true. +/// \return True when Op is a kind of SET_CC operation. +static bool isSetCC(SDValue Op, SetCCInfoAndKind &SetCCInfo) { + // If this is a setcc, this is straight forward. + if (Op.getOpcode() == ISD::SETCC) { + SetCCInfo.Info.Generic.Opnd0 = &Op.getOperand(0); + SetCCInfo.Info.Generic.Opnd1 = &Op.getOperand(1); + SetCCInfo.Info.Generic.CC = cast(Op.getOperand(2))->get(); + SetCCInfo.IsAArch64 = false; + return true; + } + // Otherwise, check if this is a matching csel instruction. + // In other words: + // - csel 1, 0, cc + // - csel 0, 1, !cc + if (Op.getOpcode() != AArch64ISD::CSEL) + return false; + // Set the information about the operands. + // TODO: we want the operands of the Cmp not the csel + SetCCInfo.Info.AArch64.Cmp = &Op.getOperand(3); + SetCCInfo.IsAArch64 = true; + SetCCInfo.Info.AArch64.CC = static_cast( + cast(Op.getOperand(2))->getZExtValue()); + + // Check that the operands matches the constraints: + // (1) Both operands must be constants. + // (2) One must be 1 and the other must be 0. + ConstantSDNode *TValue = dyn_cast(Op.getOperand(0)); + ConstantSDNode *FValue = dyn_cast(Op.getOperand(1)); + + // Check (1). + if (!TValue || !FValue) + return false; + + // Check (2). + if (!TValue->isOne()) { + // Update the comparison when we are interested in !cc. 
+ std::swap(TValue, FValue); + SetCCInfo.Info.AArch64.CC = + AArch64CC::getInvertedCondCode(SetCCInfo.Info.AArch64.CC); + } + return TValue->isOne() && FValue->isNullValue(); +} + +// Returns true if Op is setcc or zext of setcc. +static bool isSetCCOrZExtSetCC(const SDValue& Op, SetCCInfoAndKind &Info) { + if (isSetCC(Op, Info)) + return true; + return ((Op.getOpcode() == ISD::ZERO_EXTEND) && + isSetCC(Op->getOperand(0), Info)); +} + +// The folding we want to perform is: +// (add x, [zext] (setcc cc ...) ) +// --> +// (csel x, (add x, 1), !cc ...) +// +// The latter will get matched to a CSINC instruction. +static SDValue performSetccAddFolding(SDNode *Op, SelectionDAG &DAG) { + assert(Op && Op->getOpcode() == ISD::ADD && "Unexpected operation!"); + SDValue LHS = Op->getOperand(0); + SDValue RHS = Op->getOperand(1); + SetCCInfoAndKind InfoAndKind; + + // If neither operand is a SET_CC, give up. + if (!isSetCCOrZExtSetCC(LHS, InfoAndKind)) { + std::swap(LHS, RHS); + if (!isSetCCOrZExtSetCC(LHS, InfoAndKind)) + return SDValue(); + } + + // FIXME: This could be generatized to work for FP comparisons. + EVT CmpVT = InfoAndKind.IsAArch64 + ? InfoAndKind.Info.AArch64.Cmp->getOperand(0).getValueType() + : InfoAndKind.Info.Generic.Opnd0->getValueType(); + if (CmpVT != MVT::i32 && CmpVT != MVT::i64) + return SDValue(); + + SDValue CCVal; + SDValue Cmp; + SDLoc dl(Op); + if (InfoAndKind.IsAArch64) { + CCVal = DAG.getConstant( + AArch64CC::getInvertedCondCode(InfoAndKind.Info.AArch64.CC), MVT::i32); + Cmp = *InfoAndKind.Info.AArch64.Cmp; + } else + Cmp = getAArch64Cmp(*InfoAndKind.Info.Generic.Opnd0, + *InfoAndKind.Info.Generic.Opnd1, + ISD::getSetCCInverse(InfoAndKind.Info.Generic.CC, true), + CCVal, DAG, dl); + + EVT VT = Op->getValueType(0); + LHS = DAG.getNode(ISD::ADD, dl, VT, RHS, DAG.getConstant(1, VT)); + return DAG.getNode(AArch64ISD::CSEL, dl, VT, RHS, LHS, CCVal, Cmp); +} + +// The basic add/sub long vector instructions have variants with "2" on the end +// which act on the high-half of their inputs. They are normally matched by +// patterns like: +// +// (add (zeroext (extract_high LHS)), +// (zeroext (extract_high RHS))) +// -> uaddl2 vD, vN, vM +// +// However, if one of the extracts is something like a duplicate, this +// instruction can still be used profitably. This function puts the DAG into a +// more appropriate form for those patterns to trigger. +static SDValue performAddSubLongCombine(SDNode *N, + TargetLowering::DAGCombinerInfo &DCI, + SelectionDAG &DAG) { + if (DCI.isBeforeLegalizeOps()) + return SDValue(); + + MVT VT = N->getSimpleValueType(0); + if (!VT.is128BitVector()) { + if (N->getOpcode() == ISD::ADD) + return performSetccAddFolding(N, DAG); + return SDValue(); + } + + // Make sure both branches are extended in the same way. + SDValue LHS = N->getOperand(0); + SDValue RHS = N->getOperand(1); + if ((LHS.getOpcode() != ISD::ZERO_EXTEND && + LHS.getOpcode() != ISD::SIGN_EXTEND) || + LHS.getOpcode() != RHS.getOpcode()) + return SDValue(); + + unsigned ExtType = LHS.getOpcode(); + + // It's not worth doing if at least one of the inputs isn't already an + // extract, but we don't know which it'll be so we have to try both. 
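As a source-level illustration of the (add x, [zext] (setcc ...)) folding implemented by performSetccAddFolding above (sketch only): adding a comparison result to an integer becomes a compare plus CSINC instead of materialising a 0/1 value and adding it.

#include <cstdint>

// "x + (a < b)" is (add x, (zext (setcc lt a, b))). The fold rewrites it to
// (csel x, (add x, 1), ge, (cmp a, b)), which instruction selection emits as
// a CMP followed by CSINC; no separate CSET or ADD is needed.
int64_t add_flag(int64_t x, int64_t a, int64_t b) {
  return x + (a < b);
}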
+ if (isEssentiallyExtractSubvector(LHS.getOperand(0))) { + RHS = tryExtendDUPToExtractHigh(RHS.getOperand(0), DAG); + if (!RHS.getNode()) + return SDValue(); + + RHS = DAG.getNode(ExtType, SDLoc(N), VT, RHS); + } else if (isEssentiallyExtractSubvector(RHS.getOperand(0))) { + LHS = tryExtendDUPToExtractHigh(LHS.getOperand(0), DAG); + if (!LHS.getNode()) + return SDValue(); + + LHS = DAG.getNode(ExtType, SDLoc(N), VT, LHS); + } + + return DAG.getNode(N->getOpcode(), SDLoc(N), VT, LHS, RHS); +} + +// Massage DAGs which we can use the high-half "long" operations on into +// something isel will recognize better. E.g. +// +// (aarch64_neon_umull (extract_high vec) (dupv64 scalar)) --> +// (aarch64_neon_umull (extract_high (v2i64 vec))) +// (extract_high (v2i64 (dup128 scalar))))) +// +static SDValue tryCombineLongOpWithDup(unsigned IID, SDNode *N, + TargetLowering::DAGCombinerInfo &DCI, + SelectionDAG &DAG) { + if (DCI.isBeforeLegalizeOps()) + return SDValue(); + + SDValue LHS = N->getOperand(1); + SDValue RHS = N->getOperand(2); + assert(LHS.getValueType().is64BitVector() && + RHS.getValueType().is64BitVector() && + "unexpected shape for long operation"); + + // Either node could be a DUP, but it's not worth doing both of them (you'd + // just as well use the non-high version) so look for a corresponding extract + // operation on the other "wing". + if (isEssentiallyExtractSubvector(LHS)) { + RHS = tryExtendDUPToExtractHigh(RHS, DAG); + if (!RHS.getNode()) + return SDValue(); + } else if (isEssentiallyExtractSubvector(RHS)) { + LHS = tryExtendDUPToExtractHigh(LHS, DAG); + if (!LHS.getNode()) + return SDValue(); + } + + return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SDLoc(N), N->getValueType(0), + N->getOperand(0), LHS, RHS); +} + +static SDValue tryCombineShiftImm(unsigned IID, SDNode *N, SelectionDAG &DAG) { + MVT ElemTy = N->getSimpleValueType(0).getScalarType(); + unsigned ElemBits = ElemTy.getSizeInBits(); + + int64_t ShiftAmount; + if (BuildVectorSDNode *BVN = dyn_cast(N->getOperand(2))) { + APInt SplatValue, SplatUndef; + unsigned SplatBitSize; + bool HasAnyUndefs; + if (!BVN->isConstantSplat(SplatValue, SplatUndef, SplatBitSize, + HasAnyUndefs, ElemBits) || + SplatBitSize != ElemBits) + return SDValue(); + + ShiftAmount = SplatValue.getSExtValue(); + } else if (ConstantSDNode *CVN = dyn_cast(N->getOperand(2))) { + ShiftAmount = CVN->getSExtValue(); + } else + return SDValue(); + + unsigned Opcode; + bool IsRightShift; + switch (IID) { + default: + llvm_unreachable("Unknown shift intrinsic"); + case Intrinsic::aarch64_neon_sqshl: + Opcode = AArch64ISD::SQSHL_I; + IsRightShift = false; + break; + case Intrinsic::aarch64_neon_uqshl: + Opcode = AArch64ISD::UQSHL_I; + IsRightShift = false; + break; + case Intrinsic::aarch64_neon_srshl: + Opcode = AArch64ISD::SRSHR_I; + IsRightShift = true; + break; + case Intrinsic::aarch64_neon_urshl: + Opcode = AArch64ISD::URSHR_I; + IsRightShift = true; + break; + case Intrinsic::aarch64_neon_sqshlu: + Opcode = AArch64ISD::SQSHLU_I; + IsRightShift = false; + break; + } + + if (IsRightShift && ShiftAmount <= -1 && ShiftAmount >= -(int)ElemBits) + return DAG.getNode(Opcode, SDLoc(N), N->getValueType(0), N->getOperand(1), + DAG.getConstant(-ShiftAmount, MVT::i32)); + else if (!IsRightShift && ShiftAmount >= 0 && ShiftAmount <= ElemBits) + return DAG.getNode(Opcode, SDLoc(N), N->getValueType(0), N->getOperand(1), + DAG.getConstant(ShiftAmount, MVT::i32)); + + return SDValue(); +} + +// The CRC32[BH] instructions ignore the high bits of their data operand. 
Since +// the intrinsics must be legal and take an i32, this means there's almost +// certainly going to be a zext in the DAG which we can eliminate. +static SDValue tryCombineCRC32(unsigned Mask, SDNode *N, SelectionDAG &DAG) { + SDValue AndN = N->getOperand(2); + if (AndN.getOpcode() != ISD::AND) + return SDValue(); + + ConstantSDNode *CMask = dyn_cast(AndN.getOperand(1)); + if (!CMask || CMask->getZExtValue() != Mask) + return SDValue(); + + return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SDLoc(N), MVT::i32, + N->getOperand(0), N->getOperand(1), AndN.getOperand(0)); +} + +static SDValue performIntrinsicCombine(SDNode *N, + TargetLowering::DAGCombinerInfo &DCI, + const AArch64Subtarget *Subtarget) { + SelectionDAG &DAG = DCI.DAG; + unsigned IID = getIntrinsicID(N); + switch (IID) { + default: + break; + case Intrinsic::aarch64_neon_vcvtfxs2fp: + case Intrinsic::aarch64_neon_vcvtfxu2fp: + return tryCombineFixedPointConvert(N, DCI, DAG); + break; + case Intrinsic::aarch64_neon_fmax: + return DAG.getNode(AArch64ISD::FMAX, SDLoc(N), N->getValueType(0), + N->getOperand(1), N->getOperand(2)); + case Intrinsic::aarch64_neon_fmin: + return DAG.getNode(AArch64ISD::FMIN, SDLoc(N), N->getValueType(0), + N->getOperand(1), N->getOperand(2)); + case Intrinsic::aarch64_neon_smull: + case Intrinsic::aarch64_neon_umull: + case Intrinsic::aarch64_neon_pmull: + case Intrinsic::aarch64_neon_sqdmull: + return tryCombineLongOpWithDup(IID, N, DCI, DAG); + case Intrinsic::aarch64_neon_sqshl: + case Intrinsic::aarch64_neon_uqshl: + case Intrinsic::aarch64_neon_sqshlu: + case Intrinsic::aarch64_neon_srshl: + case Intrinsic::aarch64_neon_urshl: + return tryCombineShiftImm(IID, N, DAG); + case Intrinsic::aarch64_crc32b: + case Intrinsic::aarch64_crc32cb: + return tryCombineCRC32(0xff, N, DAG); + case Intrinsic::aarch64_crc32h: + case Intrinsic::aarch64_crc32ch: + return tryCombineCRC32(0xffff, N, DAG); + } + return SDValue(); +} + +static SDValue performExtendCombine(SDNode *N, + TargetLowering::DAGCombinerInfo &DCI, + SelectionDAG &DAG) { + // If we see something like (zext (sabd (extract_high ...), (DUP ...))) then + // we can convert that DUP into another extract_high (of a bigger DUP), which + // helps the backend to decide that an sabdl2 would be useful, saving a real + // extract_high operation. + if (!DCI.isBeforeLegalizeOps() && N->getOpcode() == ISD::ZERO_EXTEND && + N->getOperand(0).getOpcode() == ISD::INTRINSIC_WO_CHAIN) { + SDNode *ABDNode = N->getOperand(0).getNode(); + unsigned IID = getIntrinsicID(ABDNode); + if (IID == Intrinsic::aarch64_neon_sabd || + IID == Intrinsic::aarch64_neon_uabd) { + SDValue NewABD = tryCombineLongOpWithDup(IID, ABDNode, DCI, DAG); + if (!NewABD.getNode()) + return SDValue(); + + return DAG.getNode(ISD::ZERO_EXTEND, SDLoc(N), N->getValueType(0), + NewABD); + } + } + + // This is effectively a custom type legalization for AArch64. + // + // Type legalization will split an extend of a small, legal, type to a larger + // illegal type by first splitting the destination type, often creating + // illegal source types, which then get legalized in isel-confusing ways, + // leading to really terrible codegen. E.g., + // %result = v8i32 sext v8i8 %value + // becomes + // %losrc = extract_subreg %value, ... + // %hisrc = extract_subreg %value, ... + // %lo = v4i32 sext v4i8 %losrc + // %hi = v4i32 sext v4i8 %hisrc + // Things go rapidly downhill from there. 
+ // + // For AArch64, the [sz]ext vector instructions can only go up one element + // size, so we can, e.g., extend from i8 to i16, but to go from i8 to i32 + // take two instructions. + // + // This implies that the most efficient way to do the extend from v8i8 + // to two v4i32 values is to first extend the v8i8 to v8i16, then do + // the normal splitting to happen for the v8i16->v8i32. + + // This is pre-legalization to catch some cases where the default + // type legalization will create ill-tempered code. + if (!DCI.isBeforeLegalizeOps()) + return SDValue(); + + // We're only interested in cleaning things up for non-legal vector types + // here. If both the source and destination are legal, things will just + // work naturally without any fiddling. + const TargetLowering &TLI = DAG.getTargetLoweringInfo(); + EVT ResVT = N->getValueType(0); + if (!ResVT.isVector() || TLI.isTypeLegal(ResVT)) + return SDValue(); + // If the vector type isn't a simple VT, it's beyond the scope of what + // we're worried about here. Let legalization do its thing and hope for + // the best. + if (!ResVT.isSimple()) + return SDValue(); + + SDValue Src = N->getOperand(0); + MVT SrcVT = Src->getValueType(0).getSimpleVT(); + // If the source VT is a 64-bit vector, we can play games and get the + // better results we want. + if (SrcVT.getSizeInBits() != 64) + return SDValue(); + + unsigned SrcEltSize = SrcVT.getVectorElementType().getSizeInBits(); + unsigned ElementCount = SrcVT.getVectorNumElements(); + SrcVT = MVT::getVectorVT(MVT::getIntegerVT(SrcEltSize * 2), ElementCount); + SDLoc DL(N); + Src = DAG.getNode(N->getOpcode(), DL, SrcVT, Src); + + // Now split the rest of the operation into two halves, each with a 64 + // bit source. + EVT LoVT, HiVT; + SDValue Lo, Hi; + unsigned NumElements = ResVT.getVectorNumElements(); + assert(!(NumElements & 1) && "Splitting vector, but not in half!"); + LoVT = HiVT = EVT::getVectorVT(*DAG.getContext(), + ResVT.getVectorElementType(), NumElements / 2); + + EVT InNVT = EVT::getVectorVT(*DAG.getContext(), SrcVT.getVectorElementType(), + LoVT.getVectorNumElements()); + Lo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, InNVT, Src, + DAG.getIntPtrConstant(0)); + Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, InNVT, Src, + DAG.getIntPtrConstant(InNVT.getVectorNumElements())); + Lo = DAG.getNode(N->getOpcode(), DL, LoVT, Lo); + Hi = DAG.getNode(N->getOpcode(), DL, HiVT, Hi); + + // Now combine the parts back together so we still have a single result + // like the combiner expects. + return DAG.getNode(ISD::CONCAT_VECTORS, DL, ResVT, Lo, Hi); +} + +/// Replace a splat of a scalar to a vector store by scalar stores of the scalar +/// value. The load store optimizer pass will merge them to store pair stores. +/// This has better performance than a splat of the scalar followed by a split +/// vector store. Even if the stores are not merged it is four stores vs a dup, +/// followed by an ext.b and two stores. +static SDValue replaceSplatVectorStore(SelectionDAG &DAG, StoreSDNode *St) { + SDValue StVal = St->getValue(); + EVT VT = StVal.getValueType(); + + // Don't replace floating point stores, they possibly won't be transformed to + // stp because of the store pair suppress pass. + if (VT.isFloatingPoint()) + return SDValue(); + + // Check for insert vector elements. + if (StVal.getOpcode() != ISD::INSERT_VECTOR_ELT) + return SDValue(); + + // We can express a splat as store pair(s) for 2 or 4 elements. 
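For the splat case just described, a sketch of the scalar-store form the combine produces for a v4i32 splat (illustrative only):

#include <cstdint>

// Instead of building the splat in a vector register and storing it with a
// dup/ext/two-store sequence, the combine emits four 32-bit scalar stores at
// offsets 0, 4, 8 and 12; the load/store optimizer can later pair them into
// two STP instructions.
void store_splat4(int32_t *p, int32_t v) {
  p[0] = v;
  p[1] = v;
  p[2] = v;
  p[3] = v;
}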
+ unsigned NumVecElts = VT.getVectorNumElements(); + if (NumVecElts != 4 && NumVecElts != 2) + return SDValue(); + SDValue SplatVal = StVal.getOperand(1); + unsigned RemainInsertElts = NumVecElts - 1; + + // Check that this is a splat. + while (--RemainInsertElts) { + SDValue NextInsertElt = StVal.getOperand(0); + if (NextInsertElt.getOpcode() != ISD::INSERT_VECTOR_ELT) + return SDValue(); + if (NextInsertElt.getOperand(1) != SplatVal) + return SDValue(); + StVal = NextInsertElt; + } + unsigned OrigAlignment = St->getAlignment(); + unsigned EltOffset = NumVecElts == 4 ? 4 : 8; + unsigned Alignment = std::min(OrigAlignment, EltOffset); + + // Create scalar stores. This is at least as good as the code sequence for a + // split unaligned store wich is a dup.s, ext.b, and two stores. + // Most of the time the three stores should be replaced by store pair + // instructions (stp). + SDLoc DL(St); + SDValue BasePtr = St->getBasePtr(); + SDValue NewST1 = + DAG.getStore(St->getChain(), DL, SplatVal, BasePtr, St->getPointerInfo(), + St->isVolatile(), St->isNonTemporal(), St->getAlignment()); + + unsigned Offset = EltOffset; + while (--NumVecElts) { + SDValue OffsetPtr = DAG.getNode(ISD::ADD, DL, MVT::i64, BasePtr, + DAG.getConstant(Offset, MVT::i64)); + NewST1 = DAG.getStore(NewST1.getValue(0), DL, SplatVal, OffsetPtr, + St->getPointerInfo(), St->isVolatile(), + St->isNonTemporal(), Alignment); + Offset += EltOffset; + } + return NewST1; +} + +static SDValue performSTORECombine(SDNode *N, + TargetLowering::DAGCombinerInfo &DCI, + SelectionDAG &DAG, + const AArch64Subtarget *Subtarget) { + if (!DCI.isBeforeLegalize()) + return SDValue(); + + StoreSDNode *S = cast(N); + if (S->isVolatile()) + return SDValue(); + + // Cyclone has bad performance on unaligned 16B stores when crossing line and + // page boundries. We want to split such stores. + if (!Subtarget->isCyclone()) + return SDValue(); + + // Don't split at Oz. + MachineFunction &MF = DAG.getMachineFunction(); + bool IsMinSize = MF.getFunction()->getAttributes().hasAttribute( + AttributeSet::FunctionIndex, Attribute::MinSize); + if (IsMinSize) + return SDValue(); + + SDValue StVal = S->getValue(); + EVT VT = StVal.getValueType(); + + // Don't split v2i64 vectors. Memcpy lowering produces those and splitting + // those up regresses performance on micro-benchmarks and olden/bh. + if (!VT.isVector() || VT.getVectorNumElements() < 2 || VT == MVT::v2i64) + return SDValue(); + + // Split unaligned 16B stores. They are terrible for performance. + // Don't split stores with alignment of 1 or 2. Code that uses clang vector + // extensions can use this to mark that it does not want splitting to happen + // (by underspecifying alignment to be 1 or 2). Furthermore, the chance of + // eliminating alignment hazards is only 1 in 8 for alignment of 2. + if (VT.getSizeInBits() != 128 || S->getAlignment() >= 16 || + S->getAlignment() <= 2) + return SDValue(); + + // If we get a splat of a scalar convert this vector store to a store of + // scalars. They will be merged into store pairs thereby removing two + // instructions. + SDValue ReplacedSplat = replaceSplatVectorStore(DAG, S); + if (ReplacedSplat != SDValue()) + return ReplacedSplat; + + SDLoc DL(S); + unsigned NumElts = VT.getVectorNumElements() / 2; + // Split VT into two. 
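For the non-splat path that follows, the effect of the split is simply two independent 8-byte stores at offsets 0 and 8 (a sketch under the assumption of a 16-byte value stored with low alignment):

#include <cstdint>
#include <cstring>

// Source-level analogue of splitting one unaligned 16-byte store into two
// 8-byte halves (EXTRACT_SUBVECTOR at index 0 and at NumElts / 2), which
// reduces the chance of straddling a cache line or page boundary.
void store_split16(uint8_t *dst, const uint8_t (&val)[16]) {
  std::memcpy(dst, val, 8);
  std::memcpy(dst + 8, val + 8, 8);
}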
+ EVT HalfVT = + EVT::getVectorVT(*DAG.getContext(), VT.getVectorElementType(), NumElts); + SDValue SubVector0 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, StVal, + DAG.getIntPtrConstant(0)); + SDValue SubVector1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, StVal, + DAG.getIntPtrConstant(NumElts)); + SDValue BasePtr = S->getBasePtr(); + SDValue NewST1 = + DAG.getStore(S->getChain(), DL, SubVector0, BasePtr, S->getPointerInfo(), + S->isVolatile(), S->isNonTemporal(), S->getAlignment()); + SDValue OffsetPtr = DAG.getNode(ISD::ADD, DL, MVT::i64, BasePtr, + DAG.getConstant(8, MVT::i64)); + return DAG.getStore(NewST1.getValue(0), DL, SubVector1, OffsetPtr, + S->getPointerInfo(), S->isVolatile(), S->isNonTemporal(), + S->getAlignment()); +} + +/// Target-specific DAG combine function for post-increment LD1 (lane) and +/// post-increment LD1R. +static SDValue performPostLD1Combine(SDNode *N, + TargetLowering::DAGCombinerInfo &DCI, + bool IsLaneOp) { + if (DCI.isBeforeLegalizeOps()) + return SDValue(); + + SelectionDAG &DAG = DCI.DAG; + EVT VT = N->getValueType(0); + + unsigned LoadIdx = IsLaneOp ? 1 : 0; + SDNode *LD = N->getOperand(LoadIdx).getNode(); + // If it is not LOAD, can not do such combine. + if (LD->getOpcode() != ISD::LOAD) + return SDValue(); + + LoadSDNode *LoadSDN = cast(LD); + EVT MemVT = LoadSDN->getMemoryVT(); + // Check if memory operand is the same type as the vector element. + if (MemVT != VT.getVectorElementType()) + return SDValue(); + + // Check if there are other uses. If so, do not combine as it will introduce + // an extra load. + for (SDNode::use_iterator UI = LD->use_begin(), UE = LD->use_end(); UI != UE; + ++UI) { + if (UI.getUse().getResNo() == 1) // Ignore uses of the chain result. + continue; + if (*UI != N) + return SDValue(); + } + + SDValue Addr = LD->getOperand(1); + SDValue Vector = N->getOperand(0); + // Search for a use of the address operand that is an increment. + for (SDNode::use_iterator UI = Addr.getNode()->use_begin(), UE = + Addr.getNode()->use_end(); UI != UE; ++UI) { + SDNode *User = *UI; + if (User->getOpcode() != ISD::ADD + || UI.getUse().getResNo() != Addr.getResNo()) + continue; + + // Check that the add is independent of the load. Otherwise, folding it + // would create a cycle. + if (User->isPredecessorOf(LD) || LD->isPredecessorOf(User)) + continue; + // Also check that add is not used in the vector operand. This would also + // create a cycle. + if (User->isPredecessorOf(Vector.getNode())) + continue; + + // If the increment is a constant, it must match the memory ref size. + SDValue Inc = User->getOperand(User->getOperand(0) == Addr ? 1 : 0); + if (ConstantSDNode *CInc = dyn_cast(Inc.getNode())) { + uint32_t IncVal = CInc->getZExtValue(); + unsigned NumBytes = VT.getScalarSizeInBits() / 8; + if (IncVal != NumBytes) + continue; + Inc = DAG.getRegister(AArch64::XZR, MVT::i64); + } + + SmallVector Ops; + Ops.push_back(LD->getOperand(0)); // Chain + if (IsLaneOp) { + Ops.push_back(Vector); // The vector to be inserted + Ops.push_back(N->getOperand(2)); // The lane to be inserted in the vector + } + Ops.push_back(Addr); + Ops.push_back(Inc); + + EVT Tys[3] = { VT, MVT::i64, MVT::Other }; + SDVTList SDTys = DAG.getVTList(ArrayRef(Tys, 3)); + unsigned NewOp = IsLaneOp ? AArch64ISD::LD1LANEpost : AArch64ISD::LD1DUPpost; + SDValue UpdN = DAG.getMemIntrinsicNode(NewOp, SDLoc(N), SDTys, Ops, + MemVT, + LoadSDN->getMemOperand()); + + // Update the uses. 
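A source-level shape that can exercise the LD1R path of performPostLD1Combine (a sketch with invented names, assuming ACLE intrinsics): a scalar load duplicated into all lanes, where the only other use of the address is its increment by the element size.

#include <arm_neon.h>

// Each iteration contains (AArch64ISD::DUP (load p)) and (add p, 4). When the
// load has no other users and the increment matches the element size, the
// combine can form LD1DUPpost, i.e. "ld1r { v0.4s }, [x0], #4".
int32x4_t sum_dups(const int32_t *p, int n) {
  int32x4_t acc = vdupq_n_s32(0);
  for (int i = 0; i < n; ++i) {
    acc = vaddq_s32(acc, vdupq_n_s32(*p));
    p += 1; // 4-byte increment, matching the i32 element size
  }
  return acc;
}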
+ std::vector NewResults; + NewResults.push_back(SDValue(LD, 0)); // The result of load + NewResults.push_back(SDValue(UpdN.getNode(), 2)); // Chain + DCI.CombineTo(LD, NewResults); + DCI.CombineTo(N, SDValue(UpdN.getNode(), 0)); // Dup/Inserted Result + DCI.CombineTo(User, SDValue(UpdN.getNode(), 1)); // Write back register + + break; + } + return SDValue(); +} + +/// Target-specific DAG combine function for NEON load/store intrinsics +/// to merge base address updates. +static SDValue performNEONPostLDSTCombine(SDNode *N, + TargetLowering::DAGCombinerInfo &DCI, + SelectionDAG &DAG) { + if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer()) + return SDValue(); + + unsigned AddrOpIdx = N->getNumOperands() - 1; + SDValue Addr = N->getOperand(AddrOpIdx); + + // Search for a use of the address operand that is an increment. + for (SDNode::use_iterator UI = Addr.getNode()->use_begin(), + UE = Addr.getNode()->use_end(); UI != UE; ++UI) { + SDNode *User = *UI; + if (User->getOpcode() != ISD::ADD || + UI.getUse().getResNo() != Addr.getResNo()) + continue; + + // Check that the add is independent of the load/store. Otherwise, folding + // it would create a cycle. + if (User->isPredecessorOf(N) || N->isPredecessorOf(User)) + continue; + + // Find the new opcode for the updating load/store. + bool IsStore = false; + bool IsLaneOp = false; + bool IsDupOp = false; + unsigned NewOpc = 0; + unsigned NumVecs = 0; + unsigned IntNo = cast(N->getOperand(1))->getZExtValue(); + switch (IntNo) { + default: llvm_unreachable("unexpected intrinsic for Neon base update"); + case Intrinsic::aarch64_neon_ld2: NewOpc = AArch64ISD::LD2post; + NumVecs = 2; break; + case Intrinsic::aarch64_neon_ld3: NewOpc = AArch64ISD::LD3post; + NumVecs = 3; break; + case Intrinsic::aarch64_neon_ld4: NewOpc = AArch64ISD::LD4post; + NumVecs = 4; break; + case Intrinsic::aarch64_neon_st2: NewOpc = AArch64ISD::ST2post; + NumVecs = 2; IsStore = true; break; + case Intrinsic::aarch64_neon_st3: NewOpc = AArch64ISD::ST3post; + NumVecs = 3; IsStore = true; break; + case Intrinsic::aarch64_neon_st4: NewOpc = AArch64ISD::ST4post; + NumVecs = 4; IsStore = true; break; + case Intrinsic::aarch64_neon_ld1x2: NewOpc = AArch64ISD::LD1x2post; + NumVecs = 2; break; + case Intrinsic::aarch64_neon_ld1x3: NewOpc = AArch64ISD::LD1x3post; + NumVecs = 3; break; + case Intrinsic::aarch64_neon_ld1x4: NewOpc = AArch64ISD::LD1x4post; + NumVecs = 4; break; + case Intrinsic::aarch64_neon_st1x2: NewOpc = AArch64ISD::ST1x2post; + NumVecs = 2; IsStore = true; break; + case Intrinsic::aarch64_neon_st1x3: NewOpc = AArch64ISD::ST1x3post; + NumVecs = 3; IsStore = true; break; + case Intrinsic::aarch64_neon_st1x4: NewOpc = AArch64ISD::ST1x4post; + NumVecs = 4; IsStore = true; break; + case Intrinsic::aarch64_neon_ld2r: NewOpc = AArch64ISD::LD2DUPpost; + NumVecs = 2; IsDupOp = true; break; + case Intrinsic::aarch64_neon_ld3r: NewOpc = AArch64ISD::LD3DUPpost; + NumVecs = 3; IsDupOp = true; break; + case Intrinsic::aarch64_neon_ld4r: NewOpc = AArch64ISD::LD4DUPpost; + NumVecs = 4; IsDupOp = true; break; + case Intrinsic::aarch64_neon_ld2lane: NewOpc = AArch64ISD::LD2LANEpost; + NumVecs = 2; IsLaneOp = true; break; + case Intrinsic::aarch64_neon_ld3lane: NewOpc = AArch64ISD::LD3LANEpost; + NumVecs = 3; IsLaneOp = true; break; + case Intrinsic::aarch64_neon_ld4lane: NewOpc = AArch64ISD::LD4LANEpost; + NumVecs = 4; IsLaneOp = true; break; + case Intrinsic::aarch64_neon_st2lane: NewOpc = AArch64ISD::ST2LANEpost; + NumVecs = 2; IsStore = true; IsLaneOp = true; break; + case 
Intrinsic::aarch64_neon_st3lane: NewOpc = AArch64ISD::ST3LANEpost; + NumVecs = 3; IsStore = true; IsLaneOp = true; break; + case Intrinsic::aarch64_neon_st4lane: NewOpc = AArch64ISD::ST4LANEpost; + NumVecs = 4; IsStore = true; IsLaneOp = true; break; + } + + EVT VecTy; + if (IsStore) + VecTy = N->getOperand(2).getValueType(); + else + VecTy = N->getValueType(0); + + // If the increment is a constant, it must match the memory ref size. + SDValue Inc = User->getOperand(User->getOperand(0) == Addr ? 1 : 0); + if (ConstantSDNode *CInc = dyn_cast(Inc.getNode())) { + uint32_t IncVal = CInc->getZExtValue(); + unsigned NumBytes = NumVecs * VecTy.getSizeInBits() / 8; + if (IsLaneOp || IsDupOp) + NumBytes /= VecTy.getVectorNumElements(); + if (IncVal != NumBytes) + continue; + Inc = DAG.getRegister(AArch64::XZR, MVT::i64); + } + SmallVector Ops; + Ops.push_back(N->getOperand(0)); // Incoming chain + // Load lane and store have vector list as input. + if (IsLaneOp || IsStore) + for (unsigned i = 2; i < AddrOpIdx; ++i) + Ops.push_back(N->getOperand(i)); + Ops.push_back(Addr); // Base register + Ops.push_back(Inc); + + // Return Types. + EVT Tys[6]; + unsigned NumResultVecs = (IsStore ? 0 : NumVecs); + unsigned n; + for (n = 0; n < NumResultVecs; ++n) + Tys[n] = VecTy; + Tys[n++] = MVT::i64; // Type of write back register + Tys[n] = MVT::Other; // Type of the chain + SDVTList SDTys = DAG.getVTList(ArrayRef(Tys, NumResultVecs + 2)); + + MemIntrinsicSDNode *MemInt = cast(N); + SDValue UpdN = DAG.getMemIntrinsicNode(NewOpc, SDLoc(N), SDTys, Ops, + MemInt->getMemoryVT(), + MemInt->getMemOperand()); + + // Update the uses. + std::vector NewResults; + for (unsigned i = 0; i < NumResultVecs; ++i) { + NewResults.push_back(SDValue(UpdN.getNode(), i)); + } + NewResults.push_back(SDValue(UpdN.getNode(), NumResultVecs + 1)); + DCI.CombineTo(N, NewResults); + DCI.CombineTo(User, SDValue(UpdN.getNode(), NumResultVecs)); + + break; + } + return SDValue(); +} + +// Optimize compare with zero and branch. +static SDValue performBRCONDCombine(SDNode *N, + TargetLowering::DAGCombinerInfo &DCI, + SelectionDAG &DAG) { + SDValue Chain = N->getOperand(0); + SDValue Dest = N->getOperand(1); + SDValue CCVal = N->getOperand(2); + SDValue Cmp = N->getOperand(3); + + assert(isa(CCVal) && "Expected a ConstantSDNode here!"); + unsigned CC = cast(CCVal)->getZExtValue(); + if (CC != AArch64CC::EQ && CC != AArch64CC::NE) + return SDValue(); + + unsigned CmpOpc = Cmp.getOpcode(); + if (CmpOpc != AArch64ISD::ADDS && CmpOpc != AArch64ISD::SUBS) + return SDValue(); + + // Only attempt folding if there is only one use of the flag and no use of the + // value. + if (!Cmp->hasNUsesOfValue(0, 0) || !Cmp->hasNUsesOfValue(1, 1)) + return SDValue(); + + SDValue LHS = Cmp.getOperand(0); + SDValue RHS = Cmp.getOperand(1); + + assert(LHS.getValueType() == RHS.getValueType() && + "Expected the value type to be the same for both operands!"); + if (LHS.getValueType() != MVT::i32 && LHS.getValueType() != MVT::i64) + return SDValue(); + + if (isa(LHS) && cast(LHS)->isNullValue()) + std::swap(LHS, RHS); + + if (!isa(RHS) || !cast(RHS)->isNullValue()) + return SDValue(); + + if (LHS.getOpcode() == ISD::SHL || LHS.getOpcode() == ISD::SRA || + LHS.getOpcode() == ISD::SRL) + return SDValue(); + + // Fold the compare into the branch instruction. 
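A minimal C-level case for the branch combine above (sketch only; hit() is a placeholder): branching on a comparison with zero, where the flags have no other user, folds the SUBS into the branch as CBZ or CBNZ.

#include <cstdint>

extern void hit();

// "(brcond (subs x, 0), eq)" with no other use of the flags or of the
// subtraction result becomes "cbz x0, label"; the explicit compare disappears.
void branch_on_zero(int64_t x) {
  if (x == 0)
    hit();
}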
+ SDValue BR; + if (CC == AArch64CC::EQ) + BR = DAG.getNode(AArch64ISD::CBZ, SDLoc(N), MVT::Other, Chain, LHS, Dest); + else + BR = DAG.getNode(AArch64ISD::CBNZ, SDLoc(N), MVT::Other, Chain, LHS, Dest); + + // Do not add new nodes to DAG combiner worklist. + DCI.CombineTo(N, BR, false); + + return SDValue(); +} + +// vselect (v1i1 setcc) -> +// vselect (v1iXX setcc) (XX is the size of the compared operand type) +// FIXME: Currently the type legalizer can't handle VSELECT having v1i1 as +// condition. If it can legalize "VSELECT v1i1" correctly, no need to combine +// such VSELECT. +static SDValue performVSelectCombine(SDNode *N, SelectionDAG &DAG) { + SDValue N0 = N->getOperand(0); + EVT CCVT = N0.getValueType(); + + if (N0.getOpcode() != ISD::SETCC || CCVT.getVectorNumElements() != 1 || + CCVT.getVectorElementType() != MVT::i1) + return SDValue(); + + EVT ResVT = N->getValueType(0); + EVT CmpVT = N0.getOperand(0).getValueType(); + // Only combine when the result type is of the same size as the compared + // operands. + if (ResVT.getSizeInBits() != CmpVT.getSizeInBits()) + return SDValue(); + + SDValue IfTrue = N->getOperand(1); + SDValue IfFalse = N->getOperand(2); + SDValue SetCC = + DAG.getSetCC(SDLoc(N), CmpVT.changeVectorElementTypeToInteger(), + N0.getOperand(0), N0.getOperand(1), + cast(N0.getOperand(2))->get()); + return DAG.getNode(ISD::VSELECT, SDLoc(N), ResVT, SetCC, + IfTrue, IfFalse); +} + +/// A vector select: "(select vL, vR, (setcc LHS, RHS))" is best performed with +/// the compare-mask instructions rather than going via NZCV, even if LHS and +/// RHS are really scalar. This replaces any scalar setcc in the above pattern +/// with a vector one followed by a DUP shuffle on the result. +static SDValue performSelectCombine(SDNode *N, SelectionDAG &DAG) { + SDValue N0 = N->getOperand(0); + EVT ResVT = N->getValueType(0); + + if (!N->getOperand(1).getValueType().isVector()) + return SDValue(); + + if (N0.getOpcode() != ISD::SETCC || N0.getValueType() != MVT::i1) + return SDValue(); + + SDLoc DL(N0); + + EVT SrcVT = N0.getOperand(0).getValueType(); + SrcVT = EVT::getVectorVT(*DAG.getContext(), SrcVT, + ResVT.getSizeInBits() / SrcVT.getSizeInBits()); + EVT CCVT = SrcVT.changeVectorElementTypeToInteger(); + + // First perform a vector comparison, where lane 0 is the one we're interested + // in. + SDValue LHS = + DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, SrcVT, N0.getOperand(0)); + SDValue RHS = + DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, SrcVT, N0.getOperand(1)); + SDValue SetCC = DAG.getNode(ISD::SETCC, DL, CCVT, LHS, RHS, N0.getOperand(2)); + + // Now duplicate the comparison mask we want across all other lanes. 
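To make the select combine concrete (a sketch; names and types are illustrative): selecting between two vectors on a scalar floating-point comparison is handled with a vector compare whose lane 0 holds the real result, duplicated across the remaining lanes, rather than going through NZCV.

#include <arm_neon.h>

// "(select (setcc a, b), x, y)" with vector x and y: performSelectCombine
// rebuilds the setcc as a v2f64 compare in lane 0, duplicates the resulting
// mask across both lanes and selects with the compare-mask form (BSL).
float64x2_t pick(double a, double b, float64x2_t x, float64x2_t y) {
  return a < b ? x : y;
}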
+ SmallVector DUPMask(CCVT.getVectorNumElements(), 0); + SDValue Mask = DAG.getVectorShuffle(CCVT, DL, SetCC, SetCC, DUPMask.data()); + Mask = DAG.getNode(ISD::BITCAST, DL, ResVT.changeVectorElementTypeToInteger(), + Mask); + + return DAG.getSelect(DL, ResVT, Mask, N->getOperand(1), N->getOperand(2)); +} + +SDValue AArch64TargetLowering::PerformDAGCombine(SDNode *N, + DAGCombinerInfo &DCI) const { + SelectionDAG &DAG = DCI.DAG; + switch (N->getOpcode()) { + default: + break; + case ISD::ADD: + case ISD::SUB: + return performAddSubLongCombine(N, DCI, DAG); + case ISD::XOR: + return performXorCombine(N, DAG, DCI, Subtarget); + case ISD::MUL: + return performMulCombine(N, DAG, DCI, Subtarget); + case ISD::SINT_TO_FP: + case ISD::UINT_TO_FP: + return performIntToFpCombine(N, DAG); + case ISD::OR: + return performORCombine(N, DCI, Subtarget); + case ISD::INTRINSIC_WO_CHAIN: + return performIntrinsicCombine(N, DCI, Subtarget); + case ISD::ANY_EXTEND: + case ISD::ZERO_EXTEND: + case ISD::SIGN_EXTEND: + return performExtendCombine(N, DCI, DAG); + case ISD::BITCAST: + return performBitcastCombine(N, DCI, DAG); + case ISD::CONCAT_VECTORS: + return performConcatVectorsCombine(N, DCI, DAG); + case ISD::SELECT: + return performSelectCombine(N, DAG); + case ISD::VSELECT: + return performVSelectCombine(N, DCI.DAG); + case ISD::STORE: + return performSTORECombine(N, DCI, DAG, Subtarget); + case AArch64ISD::BRCOND: + return performBRCONDCombine(N, DCI, DAG); + case AArch64ISD::DUP: + return performPostLD1Combine(N, DCI, false); + case ISD::INSERT_VECTOR_ELT: + return performPostLD1Combine(N, DCI, true); + case ISD::INTRINSIC_VOID: + case ISD::INTRINSIC_W_CHAIN: + switch (cast(N->getOperand(1))->getZExtValue()) { + case Intrinsic::aarch64_neon_ld2: + case Intrinsic::aarch64_neon_ld3: + case Intrinsic::aarch64_neon_ld4: + case Intrinsic::aarch64_neon_ld1x2: + case Intrinsic::aarch64_neon_ld1x3: + case Intrinsic::aarch64_neon_ld1x4: + case Intrinsic::aarch64_neon_ld2lane: + case Intrinsic::aarch64_neon_ld3lane: + case Intrinsic::aarch64_neon_ld4lane: + case Intrinsic::aarch64_neon_ld2r: + case Intrinsic::aarch64_neon_ld3r: + case Intrinsic::aarch64_neon_ld4r: + case Intrinsic::aarch64_neon_st2: + case Intrinsic::aarch64_neon_st3: + case Intrinsic::aarch64_neon_st4: + case Intrinsic::aarch64_neon_st1x2: + case Intrinsic::aarch64_neon_st1x3: + case Intrinsic::aarch64_neon_st1x4: + case Intrinsic::aarch64_neon_st2lane: + case Intrinsic::aarch64_neon_st3lane: + case Intrinsic::aarch64_neon_st4lane: + return performNEONPostLDSTCombine(N, DCI, DAG); + default: + break; + } + } + return SDValue(); +} + +// Check if the return value is used as only a return value, as otherwise +// we can't perform a tail-call. In particular, we need to check for +// target ISD nodes that are returns and any other "odd" constructs +// that the generic analysis code won't necessarily catch. +bool AArch64TargetLowering::isUsedByReturnOnly(SDNode *N, + SDValue &Chain) const { + if (N->getNumValues() != 1) + return false; + if (!N->hasNUsesOfValue(1, 0)) + return false; + + SDValue TCChain = Chain; + SDNode *Copy = *N->use_begin(); + if (Copy->getOpcode() == ISD::CopyToReg) { + // If the copy has a glue operand, we conservatively assume it isn't safe to + // perform a tail call. 
+ if (Copy->getOperand(Copy->getNumOperands() - 1).getValueType() == + MVT::Glue) + return false; + TCChain = Copy->getOperand(0); + } else if (Copy->getOpcode() != ISD::FP_EXTEND) + return false; + + bool HasRet = false; + for (SDNode *Node : Copy->uses()) { + if (Node->getOpcode() != AArch64ISD::RET_FLAG) + return false; + HasRet = true; + } + + if (!HasRet) + return false; + + Chain = TCChain; + return true; +} + +// Return whether the an instruction can potentially be optimized to a tail +// call. This will cause the optimizers to attempt to move, or duplicate, +// return instructions to help enable tail call optimizations for this +// instruction. +bool AArch64TargetLowering::mayBeEmittedAsTailCall(CallInst *CI) const { + if (!CI->isTailCall()) + return false; + + return true; +} + +bool AArch64TargetLowering::getIndexedAddressParts(SDNode *Op, SDValue &Base, + SDValue &Offset, + ISD::MemIndexedMode &AM, + bool &IsInc, + SelectionDAG &DAG) const { + if (Op->getOpcode() != ISD::ADD && Op->getOpcode() != ISD::SUB) + return false; + + Base = Op->getOperand(0); + // All of the indexed addressing mode instructions take a signed + // 9 bit immediate offset. + if (ConstantSDNode *RHS = dyn_cast(Op->getOperand(1))) { + int64_t RHSC = (int64_t)RHS->getZExtValue(); + if (RHSC >= 256 || RHSC <= -256) + return false; + IsInc = (Op->getOpcode() == ISD::ADD); + Offset = Op->getOperand(1); + return true; + } + return false; +} + +bool AArch64TargetLowering::getPreIndexedAddressParts(SDNode *N, SDValue &Base, + SDValue &Offset, + ISD::MemIndexedMode &AM, + SelectionDAG &DAG) const { + EVT VT; + SDValue Ptr; + if (LoadSDNode *LD = dyn_cast(N)) { + VT = LD->getMemoryVT(); + Ptr = LD->getBasePtr(); + } else if (StoreSDNode *ST = dyn_cast(N)) { + VT = ST->getMemoryVT(); + Ptr = ST->getBasePtr(); + } else + return false; + + bool IsInc; + if (!getIndexedAddressParts(Ptr.getNode(), Base, Offset, AM, IsInc, DAG)) + return false; + AM = IsInc ? ISD::PRE_INC : ISD::PRE_DEC; + return true; +} + +bool AArch64TargetLowering::getPostIndexedAddressParts( + SDNode *N, SDNode *Op, SDValue &Base, SDValue &Offset, + ISD::MemIndexedMode &AM, SelectionDAG &DAG) const { + EVT VT; + SDValue Ptr; + if (LoadSDNode *LD = dyn_cast(N)) { + VT = LD->getMemoryVT(); + Ptr = LD->getBasePtr(); + } else if (StoreSDNode *ST = dyn_cast(N)) { + VT = ST->getMemoryVT(); + Ptr = ST->getBasePtr(); + } else + return false; + + bool IsInc; + if (!getIndexedAddressParts(Op, Base, Offset, AM, IsInc, DAG)) + return false; + // Post-indexing updates the base, so it's not a valid transform + // if that's not the same as the load's pointer. + if (Ptr != Base) + return false; + AM = IsInc ? ISD::POST_INC : ISD::POST_DEC; + return true; +} + +void AArch64TargetLowering::ReplaceNodeResults( + SDNode *N, SmallVectorImpl &Results, SelectionDAG &DAG) const { + switch (N->getOpcode()) { + default: + llvm_unreachable("Don't know how to custom expand this"); + case ISD::FP_TO_UINT: + case ISD::FP_TO_SINT: + assert(N->getValueType(0) == MVT::i128 && "unexpected illegal conversion"); + // Let normal code take care of it by not adding anything to Results. 
+ return; + } +} + +bool AArch64TargetLowering::shouldExpandAtomicInIR(Instruction *Inst) const { + // Loads and stores less than 128-bits are already atomic; ones above that + // are doomed anyway, so defer to the default libcall and blame the OS when + // things go wrong: + if (StoreInst *SI = dyn_cast(Inst)) + return SI->getValueOperand()->getType()->getPrimitiveSizeInBits() == 128; + else if (LoadInst *LI = dyn_cast(Inst)) + return LI->getType()->getPrimitiveSizeInBits() == 128; + + // For the real atomic operations, we have ldxr/stxr up to 128 bits. + return Inst->getType()->getPrimitiveSizeInBits() <= 128; +} + +Value *AArch64TargetLowering::emitLoadLinked(IRBuilder<> &Builder, Value *Addr, + AtomicOrdering Ord) const { + Module *M = Builder.GetInsertBlock()->getParent()->getParent(); + Type *ValTy = cast(Addr->getType())->getElementType(); + bool IsAcquire = + Ord == Acquire || Ord == AcquireRelease || Ord == SequentiallyConsistent; + + // Since i128 isn't legal and intrinsics don't get type-lowered, the ldrexd + // intrinsic must return {i64, i64} and we have to recombine them into a + // single i128 here. + if (ValTy->getPrimitiveSizeInBits() == 128) { + Intrinsic::ID Int = + IsAcquire ? Intrinsic::aarch64_ldaxp : Intrinsic::aarch64_ldxp; + Function *Ldxr = llvm::Intrinsic::getDeclaration(M, Int); + + Addr = Builder.CreateBitCast(Addr, Type::getInt8PtrTy(M->getContext())); + Value *LoHi = Builder.CreateCall(Ldxr, Addr, "lohi"); + + Value *Lo = Builder.CreateExtractValue(LoHi, 0, "lo"); + Value *Hi = Builder.CreateExtractValue(LoHi, 1, "hi"); + Lo = Builder.CreateZExt(Lo, ValTy, "lo64"); + Hi = Builder.CreateZExt(Hi, ValTy, "hi64"); + return Builder.CreateOr( + Lo, Builder.CreateShl(Hi, ConstantInt::get(ValTy, 64)), "val64"); + } + + Type *Tys[] = { Addr->getType() }; + Intrinsic::ID Int = + IsAcquire ? Intrinsic::aarch64_ldaxr : Intrinsic::aarch64_ldxr; + Function *Ldxr = llvm::Intrinsic::getDeclaration(M, Int, Tys); + + return Builder.CreateTruncOrBitCast( + Builder.CreateCall(Ldxr, Addr), + cast(Addr->getType())->getElementType()); +} + +Value *AArch64TargetLowering::emitStoreConditional(IRBuilder<> &Builder, + Value *Val, Value *Addr, + AtomicOrdering Ord) const { + Module *M = Builder.GetInsertBlock()->getParent()->getParent(); + bool IsRelease = + Ord == Release || Ord == AcquireRelease || Ord == SequentiallyConsistent; + + // Since the intrinsics must have legal type, the i128 intrinsics take two + // parameters: "i64, i64". We must marshal Val into the appropriate form + // before the call. + if (Val->getType()->getPrimitiveSizeInBits() == 128) { + Intrinsic::ID Int = + IsRelease ? Intrinsic::aarch64_stlxp : Intrinsic::aarch64_stxp; + Function *Stxr = Intrinsic::getDeclaration(M, Int); + Type *Int64Ty = Type::getInt64Ty(M->getContext()); + + Value *Lo = Builder.CreateTrunc(Val, Int64Ty, "lo"); + Value *Hi = Builder.CreateTrunc(Builder.CreateLShr(Val, 64), Int64Ty, "hi"); + Addr = Builder.CreateBitCast(Addr, Type::getInt8PtrTy(M->getContext())); + return Builder.CreateCall3(Stxr, Lo, Hi, Addr); + } + + Intrinsic::ID Int = + IsRelease ? 
Intrinsic::aarch64_stlxr : Intrinsic::aarch64_stxr; + Type *Tys[] = { Addr->getType() }; + Function *Stxr = Intrinsic::getDeclaration(M, Int, Tys); + + return Builder.CreateCall2( + Stxr, Builder.CreateZExtOrBitCast( + Val, Stxr->getFunctionType()->getParamType(0)), + Addr); +} diff --git a/lib/Target/AArch64/AArch64ISelLowering.h b/lib/Target/AArch64/AArch64ISelLowering.h new file mode 100644 index 00000000000..de16c4d9d4b --- /dev/null +++ b/lib/Target/AArch64/AArch64ISelLowering.h @@ -0,0 +1,464 @@ +//==-- AArch64ISelLowering.h - AArch64 DAG Lowering Interface ----*- C++ -*-==// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file defines the interfaces that AArch64 uses to lower LLVM code into a +// selection DAG. +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_TARGET_AArch64_ISELLOWERING_H +#define LLVM_TARGET_AArch64_ISELLOWERING_H + +#include "llvm/CodeGen/CallingConvLower.h" +#include "llvm/CodeGen/SelectionDAG.h" +#include "llvm/IR/CallingConv.h" +#include "llvm/Target/TargetLowering.h" + +namespace llvm { + +namespace AArch64ISD { + +enum { + FIRST_NUMBER = ISD::BUILTIN_OP_END, + WrapperLarge, // 4-instruction MOVZ/MOVK sequence for 64-bit addresses. + CALL, // Function call. + + // Almost the same as a normal call node, except that a TLSDesc relocation is + // needed so the linker can relax it correctly if possible. + TLSDESC_CALL, + ADRP, // Page address of a TargetGlobalAddress operand. + ADDlow, // Add the low 12 bits of a TargetGlobalAddress operand. + LOADgot, // Load from automatically generated descriptor (e.g. Global + // Offset Table, TLS record). + RET_FLAG, // Return with a flag operand. Operand 0 is the chain operand. + BRCOND, // Conditional branch instruction; "b.cond". + CSEL, + FCSEL, // Conditional move instruction. + CSINV, // Conditional select invert. + CSNEG, // Conditional select negate. + CSINC, // Conditional select increment. + + // Pointer to the thread's local storage area. Materialised from TPIDR_EL0 on + // ELF. + THREAD_POINTER, + ADC, + SBC, // adc, sbc instructions + + // Arithmetic instructions which write flags. + ADDS, + SUBS, + ADCS, + SBCS, + ANDS, + + // Floating point comparison + FCMP, + + // Floating point max and min instructions. + FMAX, + FMIN, + + // Scalar extract + EXTR, + + // Scalar-to-vector duplication + DUP, + DUPLANE8, + DUPLANE16, + DUPLANE32, + DUPLANE64, + + // Vector immedate moves + MOVI, + MOVIshift, + MOVIedit, + MOVImsl, + FMOV, + MVNIshift, + MVNImsl, + + // Vector immediate ops + BICi, + ORRi, + + // Vector bit select: similar to ISD::VSELECT but not all bits within an + // element must be identical. 
+ BSL, + + // Vector arithmetic negation + NEG, + + // Vector shuffles + ZIP1, + ZIP2, + UZP1, + UZP2, + TRN1, + TRN2, + REV16, + REV32, + REV64, + EXT, + + // Vector shift by scalar + VSHL, + VLSHR, + VASHR, + + // Vector shift by scalar (again) + SQSHL_I, + UQSHL_I, + SQSHLU_I, + SRSHR_I, + URSHR_I, + + // Vector comparisons + CMEQ, + CMGE, + CMGT, + CMHI, + CMHS, + FCMEQ, + FCMGE, + FCMGT, + + // Vector zero comparisons + CMEQz, + CMGEz, + CMGTz, + CMLEz, + CMLTz, + FCMEQz, + FCMGEz, + FCMGTz, + FCMLEz, + FCMLTz, + + // Vector bitwise negation + NOT, + + // Vector bitwise selection + BIT, + + // Compare-and-branch + CBZ, + CBNZ, + TBZ, + TBNZ, + + // Tail calls + TC_RETURN, + + // Custom prefetch handling + PREFETCH, + + // {s|u}int to FP within a FP register. + SITOF, + UITOF, + + // NEON Load/Store with post-increment base updates + LD2post = ISD::FIRST_TARGET_MEMORY_OPCODE, + LD3post, + LD4post, + ST2post, + ST3post, + ST4post, + LD1x2post, + LD1x3post, + LD1x4post, + ST1x2post, + ST1x3post, + ST1x4post, + LD1DUPpost, + LD2DUPpost, + LD3DUPpost, + LD4DUPpost, + LD1LANEpost, + LD2LANEpost, + LD3LANEpost, + LD4LANEpost, + ST2LANEpost, + ST3LANEpost, + ST4LANEpost +}; + +} // end namespace AArch64ISD + +class AArch64Subtarget; +class AArch64TargetMachine; + +class AArch64TargetLowering : public TargetLowering { + bool RequireStrictAlign; + +public: + explicit AArch64TargetLowering(AArch64TargetMachine &TM); + + /// Selects the correct CCAssignFn for a the given CallingConvention + /// value. + CCAssignFn *CCAssignFnForCall(CallingConv::ID CC, bool IsVarArg) const; + + /// computeKnownBitsForTargetNode - Determine which of the bits specified in + /// Mask are known to be either zero or one and return them in the + /// KnownZero/KnownOne bitsets. + void computeKnownBitsForTargetNode(const SDValue Op, APInt &KnownZero, + APInt &KnownOne, const SelectionDAG &DAG, + unsigned Depth = 0) const override; + + MVT getScalarShiftAmountTy(EVT LHSTy) const override; + + /// allowsUnalignedMemoryAccesses - Returns true if the target allows + /// unaligned memory accesses. of the specified type. + bool allowsUnalignedMemoryAccesses(EVT VT, unsigned AddrSpace = 0, + bool *Fast = nullptr) const override { + if (RequireStrictAlign) + return false; + // FIXME: True for Cyclone, but not necessary others. + if (Fast) + *Fast = true; + return true; + } + + /// LowerOperation - Provide custom lowering hooks for some operations. + SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override; + + const char *getTargetNodeName(unsigned Opcode) const override; + + SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const override; + + /// getFunctionAlignment - Return the Log2 alignment of this function. + unsigned getFunctionAlignment(const Function *F) const; + + /// getMaximalGlobalOffset - Returns the maximal possible offset which can + /// be used for loads / stores from the global. + unsigned getMaximalGlobalOffset() const override; + + /// Returns true if a cast between SrcAS and DestAS is a noop. + bool isNoopAddrSpaceCast(unsigned SrcAS, unsigned DestAS) const override { + // Addrspacecasts are always noops. + return true; + } + + /// createFastISel - This method returns a target specific FastISel object, + /// or null if the target does not support "fast" ISel. 
+ FastISel *createFastISel(FunctionLoweringInfo &funcInfo, + const TargetLibraryInfo *libInfo) const override; + + bool isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const override; + + bool isFPImmLegal(const APFloat &Imm, EVT VT) const override; + + /// isShuffleMaskLegal - Return true if the given shuffle mask can be + /// codegen'd directly, or if it should be stack expanded. + bool isShuffleMaskLegal(const SmallVectorImpl &M, EVT VT) const override; + + /// getSetCCResultType - Return the ISD::SETCC ValueType + EVT getSetCCResultType(LLVMContext &Context, EVT VT) const override; + + SDValue ReconstructShuffle(SDValue Op, SelectionDAG &DAG) const; + + MachineBasicBlock *EmitF128CSEL(MachineInstr *MI, + MachineBasicBlock *BB) const; + + MachineBasicBlock * + EmitInstrWithCustomInserter(MachineInstr *MI, + MachineBasicBlock *MBB) const override; + + bool getTgtMemIntrinsic(IntrinsicInfo &Info, const CallInst &I, + unsigned Intrinsic) const override; + + bool isTruncateFree(Type *Ty1, Type *Ty2) const override; + bool isTruncateFree(EVT VT1, EVT VT2) const override; + + bool isZExtFree(Type *Ty1, Type *Ty2) const override; + bool isZExtFree(EVT VT1, EVT VT2) const override; + bool isZExtFree(SDValue Val, EVT VT2) const override; + + bool hasPairedLoad(Type *LoadedType, + unsigned &RequiredAligment) const override; + bool hasPairedLoad(EVT LoadedType, unsigned &RequiredAligment) const override; + + bool isLegalAddImmediate(int64_t) const override; + bool isLegalICmpImmediate(int64_t) const override; + + EVT getOptimalMemOpType(uint64_t Size, unsigned DstAlign, unsigned SrcAlign, + bool IsMemset, bool ZeroMemset, bool MemcpyStrSrc, + MachineFunction &MF) const override; + + /// isLegalAddressingMode - Return true if the addressing mode represented + /// by AM is legal for this target, for a load/store of the specified type. + bool isLegalAddressingMode(const AddrMode &AM, Type *Ty) const override; + + /// \brief Return the cost of the scaling factor used in the addressing + /// mode represented by AM for this target, for a load/store + /// of the specified type. + /// If the AM is supported, the return value must be >= 0. + /// If the AM is not supported, it returns a negative value. + int getScalingFactorCost(const AddrMode &AM, Type *Ty) const override; + + /// isFMAFasterThanFMulAndFAdd - Return true if an FMA operation is faster + /// than a pair of fmul and fadd instructions. fmuladd intrinsics will be + /// expanded to FMAs when this method returns true, otherwise fmuladd is + /// expanded to fmul + fadd. + bool isFMAFasterThanFMulAndFAdd(EVT VT) const override; + + const MCPhysReg *getScratchRegisters(CallingConv::ID CC) const override; + + /// \brief Returns false if N is a bit extraction pattern of (X >> C) & Mask. + bool isDesirableToCommuteWithShift(const SDNode *N) const override; + + /// \brief Returns true if it is beneficial to convert a load of a constant + /// to just the constant itself. + bool shouldConvertConstantLoadToIntImm(const APInt &Imm, + Type *Ty) const override; + + Value *emitLoadLinked(IRBuilder<> &Builder, Value *Addr, + AtomicOrdering Ord) const override; + Value *emitStoreConditional(IRBuilder<> &Builder, Value *Val, + Value *Addr, AtomicOrdering Ord) const override; + + bool shouldExpandAtomicInIR(Instruction *Inst) const override; + +private: + /// Subtarget - Keep a pointer to the AArch64Subtarget around so that we can + /// make the right decision when generating code for different targets. 
+ const AArch64Subtarget *Subtarget; + + void addTypeForNEON(EVT VT, EVT PromotedBitwiseVT); + void addDRTypeForNEON(MVT VT); + void addQRTypeForNEON(MVT VT); + + SDValue + LowerFormalArguments(SDValue Chain, CallingConv::ID CallConv, bool isVarArg, + const SmallVectorImpl &Ins, SDLoc DL, + SelectionDAG &DAG, + SmallVectorImpl &InVals) const override; + + SDValue LowerCall(CallLoweringInfo & /*CLI*/, + SmallVectorImpl &InVals) const override; + + SDValue LowerCallResult(SDValue Chain, SDValue InFlag, + CallingConv::ID CallConv, bool isVarArg, + const SmallVectorImpl &Ins, SDLoc DL, + SelectionDAG &DAG, SmallVectorImpl &InVals, + bool isThisReturn, SDValue ThisVal) const; + + bool isEligibleForTailCallOptimization( + SDValue Callee, CallingConv::ID CalleeCC, bool isVarArg, + bool isCalleeStructRet, bool isCallerStructRet, + const SmallVectorImpl &Outs, + const SmallVectorImpl &OutVals, + const SmallVectorImpl &Ins, SelectionDAG &DAG) const; + + /// Finds the incoming stack arguments which overlap the given fixed stack + /// object and incorporates their load into the current chain. This prevents + /// an upcoming store from clobbering the stack argument before it's used. + SDValue addTokenForArgument(SDValue Chain, SelectionDAG &DAG, + MachineFrameInfo *MFI, int ClobberedFI) const; + + bool DoesCalleeRestoreStack(CallingConv::ID CallCC, bool TailCallOpt) const; + + bool IsTailCallConvention(CallingConv::ID CallCC) const; + + void saveVarArgRegisters(CCState &CCInfo, SelectionDAG &DAG, SDLoc DL, + SDValue &Chain) const; + + bool CanLowerReturn(CallingConv::ID CallConv, MachineFunction &MF, + bool isVarArg, + const SmallVectorImpl &Outs, + LLVMContext &Context) const override; + + SDValue LowerReturn(SDValue Chain, CallingConv::ID CallConv, bool isVarArg, + const SmallVectorImpl &Outs, + const SmallVectorImpl &OutVals, SDLoc DL, + SelectionDAG &DAG) const override; + + SDValue LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerDarwinGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerELFGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerELFTLSDescCall(SDValue SymAddr, SDValue DescAddr, SDLoc DL, + SelectionDAG &DAG) const; + SDValue LowerSETCC(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerBR_CC(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerSELECT(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerJumpTable(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerConstantPool(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerBlockAddress(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerAAPCS_VASTART(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerDarwin_VASTART(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerVASTART(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerVACOPY(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerVAARG(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerRETURNADDR(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerINSERT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerEXTRACT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerSCALAR_TO_VECTOR(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerEXTRACT_SUBVECTOR(SDValue 
Op, SelectionDAG &DAG) const; + SDValue LowerVectorSRA_SRL_SHL(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerShiftLeftParts(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerShiftRightParts(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerVSETCC(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerCTPOP(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerF128Call(SDValue Op, SelectionDAG &DAG, + RTLIB::Libcall Call) const; + SDValue LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerINT_TO_FP(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerVectorAND(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerVectorOR(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerFSINCOS(SDValue Op, SelectionDAG &DAG) const; + + ConstraintType + getConstraintType(const std::string &Constraint) const override; + unsigned getRegisterByName(const char* RegName, EVT VT) const override; + + /// Examine constraint string and operand type and determine a weight value. + /// The operand object must already have been set up with the operand type. + ConstraintWeight + getSingleConstraintMatchWeight(AsmOperandInfo &info, + const char *constraint) const override; + + std::pair + getRegForInlineAsmConstraint(const std::string &Constraint, + MVT VT) const override; + void LowerAsmOperandForConstraint(SDValue Op, std::string &Constraint, + std::vector &Ops, + SelectionDAG &DAG) const override; + + bool isUsedByReturnOnly(SDNode *N, SDValue &Chain) const override; + bool mayBeEmittedAsTailCall(CallInst *CI) const override; + bool getIndexedAddressParts(SDNode *Op, SDValue &Base, SDValue &Offset, + ISD::MemIndexedMode &AM, bool &IsInc, + SelectionDAG &DAG) const; + bool getPreIndexedAddressParts(SDNode *N, SDValue &Base, SDValue &Offset, + ISD::MemIndexedMode &AM, + SelectionDAG &DAG) const override; + bool getPostIndexedAddressParts(SDNode *N, SDNode *Op, SDValue &Base, + SDValue &Offset, ISD::MemIndexedMode &AM, + SelectionDAG &DAG) const override; + + void ReplaceNodeResults(SDNode *N, SmallVectorImpl &Results, + SelectionDAG &DAG) const override; +}; + +namespace AArch64 { +FastISel *createFastISel(FunctionLoweringInfo &funcInfo, + const TargetLibraryInfo *libInfo); +} // end namespace AArch64 + +} // end namespace llvm + +#endif // LLVM_TARGET_AArch64_ISELLOWERING_H diff --git a/lib/Target/AArch64/AArch64InstrAtomics.td b/lib/Target/AArch64/AArch64InstrAtomics.td new file mode 100644 index 00000000000..3b9e3c63059 --- /dev/null +++ b/lib/Target/AArch64/AArch64InstrAtomics.td @@ -0,0 +1,364 @@ +//=- AArch64InstrAtomics.td - AArch64 Atomic codegen support -*- tablegen -*-=// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// AArch64 Atomic operand code-gen constructs. 
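An illustrative aside, not part of the patch: the acquiring/relaxed load and releasing/relaxed store fragments defined in this file distinguish exactly the cases sketched below, so only genuinely ordered accesses use the LDAR/STLR forms while relaxed ones keep the full set of addressing modes.

#include <atomic>

// Acquire load: selected as an LDAR-family instruction (single
// base-register addressing mode).
int load_acquire(const std::atomic<int> &p) {
  return p.load(std::memory_order_acquire);
}

// Relaxed load: an ordinary LDR, so the register-offset/immediate-offset
// patterns (the ro_*/am_* operands below) may be folded into the access.
int load_relaxed(const std::atomic<int> *base, long i) {
  return base[i].load(std::memory_order_relaxed);
}

// Release store: STLR.  A relaxed store would be a plain STR.
void store_release(std::atomic<int> &p, int v) {
  p.store(v, std::memory_order_release);
}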
+// +//===----------------------------------------------------------------------===// + +//===---------------------------------- +// Atomic fences +//===---------------------------------- +def : Pat<(atomic_fence (i64 4), (imm)), (DMB (i32 0x9))>; +def : Pat<(atomic_fence (imm), (imm)), (DMB (i32 0xb))>; + +//===---------------------------------- +// Atomic loads +//===---------------------------------- + +// When they're actually atomic, only one addressing mode (GPR64sp) is +// supported, but when they're relaxed and anything can be used, all the +// standard modes would be valid and may give efficiency gains. + +// A atomic load operation that actually needs acquire semantics. +class acquiring_load + : PatFrag<(ops node:$ptr), (base node:$ptr), [{ + AtomicOrdering Ordering = cast(N)->getOrdering(); + assert(Ordering != AcquireRelease && "unexpected load ordering"); + return Ordering == Acquire || Ordering == SequentiallyConsistent; +}]>; + +// An atomic load operation that does not need either acquire or release +// semantics. +class relaxed_load + : PatFrag<(ops node:$ptr), (base node:$ptr), [{ + AtomicOrdering Ordering = cast(N)->getOrdering(); + return Ordering == Monotonic || Ordering == Unordered; +}]>; + +// 8-bit loads +def : Pat<(acquiring_load GPR64sp:$ptr), (LDARB GPR64sp:$ptr)>; +def : Pat<(relaxed_load (ro_Windexed8 GPR64sp:$Rn, GPR32:$Rm, + ro_Wextend8:$offset)), + (LDRBBroW GPR64sp:$Rn, GPR32:$Rm, ro_Wextend8:$offset)>; +def : Pat<(relaxed_load (ro_Xindexed8 GPR64sp:$Rn, GPR64:$Rm, + ro_Xextend8:$offset)), + (LDRBBroX GPR64sp:$Rn, GPR64:$Rm, ro_Xextend8:$offset)>; +def : Pat<(relaxed_load (am_indexed8 GPR64sp:$Rn, + uimm12s1:$offset)), + (LDRBBui GPR64sp:$Rn, uimm12s1:$offset)>; +def : Pat<(relaxed_load + (am_unscaled8 GPR64sp:$Rn, simm9:$offset)), + (LDURBBi GPR64sp:$Rn, simm9:$offset)>; + +// 16-bit loads +def : Pat<(acquiring_load GPR64sp:$ptr), (LDARH GPR64sp:$ptr)>; +def : Pat<(relaxed_load (ro_Windexed16 GPR64sp:$Rn, GPR32:$Rm, + ro_Wextend16:$extend)), + (LDRHHroW GPR64sp:$Rn, GPR32:$Rm, ro_Wextend16:$extend)>; +def : Pat<(relaxed_load (ro_Xindexed16 GPR64sp:$Rn, GPR64:$Rm, + ro_Xextend16:$extend)), + (LDRHHroX GPR64sp:$Rn, GPR64:$Rm, ro_Xextend16:$extend)>; +def : Pat<(relaxed_load (am_indexed16 GPR64sp:$Rn, + uimm12s2:$offset)), + (LDRHHui GPR64sp:$Rn, uimm12s2:$offset)>; +def : Pat<(relaxed_load + (am_unscaled16 GPR64sp:$Rn, simm9:$offset)), + (LDURHHi GPR64sp:$Rn, simm9:$offset)>; + +// 32-bit loads +def : Pat<(acquiring_load GPR64sp:$ptr), (LDARW GPR64sp:$ptr)>; +def : Pat<(relaxed_load (ro_Windexed32 GPR64sp:$Rn, GPR32:$Rm, + ro_Wextend32:$extend)), + (LDRWroW GPR64sp:$Rn, GPR32:$Rm, ro_Wextend32:$extend)>; +def : Pat<(relaxed_load (ro_Xindexed32 GPR64sp:$Rn, GPR64:$Rm, + ro_Xextend32:$extend)), + (LDRWroX GPR64sp:$Rn, GPR64:$Rm, ro_Xextend32:$extend)>; +def : Pat<(relaxed_load (am_indexed32 GPR64sp:$Rn, + uimm12s4:$offset)), + (LDRWui GPR64sp:$Rn, uimm12s4:$offset)>; +def : Pat<(relaxed_load + (am_unscaled32 GPR64sp:$Rn, simm9:$offset)), + (LDURWi GPR64sp:$Rn, simm9:$offset)>; + +// 64-bit loads +def : Pat<(acquiring_load GPR64sp:$ptr), (LDARX GPR64sp:$ptr)>; +def : Pat<(relaxed_load (ro_Windexed64 GPR64sp:$Rn, GPR32:$Rm, + ro_Wextend64:$extend)), + (LDRXroW GPR64sp:$Rn, GPR32:$Rm, ro_Wextend64:$extend)>; +def : Pat<(relaxed_load (ro_Xindexed64 GPR64sp:$Rn, GPR64:$Rm, + ro_Xextend64:$extend)), + (LDRXroX GPR64sp:$Rn, GPR64:$Rm, ro_Xextend64:$extend)>; +def : Pat<(relaxed_load (am_indexed64 GPR64sp:$Rn, + uimm12s8:$offset)), + (LDRXui GPR64sp:$Rn, 
uimm12s8:$offset)>; +def : Pat<(relaxed_load + (am_unscaled64 GPR64sp:$Rn, simm9:$offset)), + (LDURXi GPR64sp:$Rn, simm9:$offset)>; + +//===---------------------------------- +// Atomic stores +//===---------------------------------- + +// When they're actually atomic, only one addressing mode (GPR64sp) is +// supported, but when they're relaxed and anything can be used, all the +// standard modes would be valid and may give efficiency gains. + +// A store operation that actually needs release semantics. +class releasing_store + : PatFrag<(ops node:$ptr, node:$val), (base node:$ptr, node:$val), [{ + AtomicOrdering Ordering = cast(N)->getOrdering(); + assert(Ordering != AcquireRelease && "unexpected store ordering"); + return Ordering == Release || Ordering == SequentiallyConsistent; +}]>; + +// An atomic store operation that doesn't actually need to be atomic on AArch64. +class relaxed_store + : PatFrag<(ops node:$ptr, node:$val), (base node:$ptr, node:$val), [{ + AtomicOrdering Ordering = cast(N)->getOrdering(); + return Ordering == Monotonic || Ordering == Unordered; +}]>; + +// 8-bit stores +def : Pat<(releasing_store GPR64sp:$ptr, GPR32:$val), + (STLRB GPR32:$val, GPR64sp:$ptr)>; +def : Pat<(relaxed_store + (ro_Windexed8 GPR64sp:$Rn, GPR32:$Rm, ro_Wextend8:$extend), + GPR32:$val), + (STRBBroW GPR32:$val, GPR64sp:$Rn, GPR32:$Rm, ro_Wextend8:$extend)>; +def : Pat<(relaxed_store + (ro_Xindexed8 GPR64sp:$Rn, GPR64:$Rm, ro_Xextend8:$extend), + GPR32:$val), + (STRBBroX GPR32:$val, GPR64sp:$Rn, GPR64:$Rm, ro_Xextend8:$extend)>; +def : Pat<(relaxed_store + (am_indexed8 GPR64sp:$Rn, uimm12s1:$offset), GPR32:$val), + (STRBBui GPR32:$val, GPR64sp:$Rn, uimm12s1:$offset)>; +def : Pat<(relaxed_store + (am_unscaled8 GPR64sp:$Rn, simm9:$offset), GPR32:$val), + (STURBBi GPR32:$val, GPR64sp:$Rn, simm9:$offset)>; + +// 16-bit stores +def : Pat<(releasing_store GPR64sp:$ptr, GPR32:$val), + (STLRH GPR32:$val, GPR64sp:$ptr)>; +def : Pat<(relaxed_store (ro_Windexed16 GPR64sp:$Rn, GPR32:$Rm, + ro_Wextend16:$extend), + GPR32:$val), + (STRHHroW GPR32:$val, GPR64sp:$Rn, GPR32:$Rm, ro_Wextend16:$extend)>; +def : Pat<(relaxed_store (ro_Xindexed16 GPR64sp:$Rn, GPR64:$Rm, + ro_Xextend16:$extend), + GPR32:$val), + (STRHHroX GPR32:$val, GPR64sp:$Rn, GPR64:$Rm, ro_Xextend16:$extend)>; +def : Pat<(relaxed_store + (am_indexed16 GPR64sp:$Rn, uimm12s2:$offset), GPR32:$val), + (STRHHui GPR32:$val, GPR64sp:$Rn, uimm12s2:$offset)>; +def : Pat<(relaxed_store + (am_unscaled16 GPR64sp:$Rn, simm9:$offset), GPR32:$val), + (STURHHi GPR32:$val, GPR64sp:$Rn, simm9:$offset)>; + +// 32-bit stores +def : Pat<(releasing_store GPR64sp:$ptr, GPR32:$val), + (STLRW GPR32:$val, GPR64sp:$ptr)>; +def : Pat<(relaxed_store (ro_Windexed32 GPR64sp:$Rn, GPR32:$Rm, + ro_Wextend32:$extend), + GPR32:$val), + (STRWroW GPR32:$val, GPR64sp:$Rn, GPR32:$Rm, ro_Wextend32:$extend)>; +def : Pat<(relaxed_store (ro_Xindexed32 GPR64sp:$Rn, GPR64:$Rm, + ro_Xextend32:$extend), + GPR32:$val), + (STRWroX GPR32:$val, GPR64sp:$Rn, GPR64:$Rm, ro_Xextend32:$extend)>; +def : Pat<(relaxed_store + (am_indexed32 GPR64sp:$Rn, uimm12s4:$offset), GPR32:$val), + (STRWui GPR32:$val, GPR64sp:$Rn, uimm12s4:$offset)>; +def : Pat<(relaxed_store + (am_unscaled32 GPR64sp:$Rn, simm9:$offset), GPR32:$val), + (STURWi GPR32:$val, GPR64sp:$Rn, simm9:$offset)>; + +// 64-bit stores +def : Pat<(releasing_store GPR64sp:$ptr, GPR64:$val), + (STLRX GPR64:$val, GPR64sp:$ptr)>; +def : Pat<(relaxed_store (ro_Windexed64 GPR64sp:$Rn, GPR32:$Rm, + ro_Wextend16:$extend), + GPR64:$val), + (STRXroW 
GPR64:$val, GPR64sp:$Rn, GPR32:$Rm, ro_Wextend64:$extend)>; +def : Pat<(relaxed_store (ro_Xindexed64 GPR64sp:$Rn, GPR64:$Rm, + ro_Xextend16:$extend), + GPR64:$val), + (STRXroX GPR64:$val, GPR64sp:$Rn, GPR64:$Rm, ro_Xextend64:$extend)>; +def : Pat<(relaxed_store + (am_indexed64 GPR64sp:$Rn, uimm12s8:$offset), GPR64:$val), + (STRXui GPR64:$val, GPR64sp:$Rn, uimm12s8:$offset)>; +def : Pat<(relaxed_store + (am_unscaled64 GPR64sp:$Rn, simm9:$offset), GPR64:$val), + (STURXi GPR64:$val, GPR64sp:$Rn, simm9:$offset)>; + +//===---------------------------------- +// Low-level exclusive operations +//===---------------------------------- + +// Load-exclusives. + +def ldxr_1 : PatFrag<(ops node:$ptr), (int_aarch64_ldxr node:$ptr), [{ + return cast(N)->getMemoryVT() == MVT::i8; +}]>; + +def ldxr_2 : PatFrag<(ops node:$ptr), (int_aarch64_ldxr node:$ptr), [{ + return cast(N)->getMemoryVT() == MVT::i16; +}]>; + +def ldxr_4 : PatFrag<(ops node:$ptr), (int_aarch64_ldxr node:$ptr), [{ + return cast(N)->getMemoryVT() == MVT::i32; +}]>; + +def ldxr_8 : PatFrag<(ops node:$ptr), (int_aarch64_ldxr node:$ptr), [{ + return cast(N)->getMemoryVT() == MVT::i64; +}]>; + +def : Pat<(ldxr_1 GPR64sp:$addr), + (SUBREG_TO_REG (i64 0), (LDXRB GPR64sp:$addr), sub_32)>; +def : Pat<(ldxr_2 GPR64sp:$addr), + (SUBREG_TO_REG (i64 0), (LDXRH GPR64sp:$addr), sub_32)>; +def : Pat<(ldxr_4 GPR64sp:$addr), + (SUBREG_TO_REG (i64 0), (LDXRW GPR64sp:$addr), sub_32)>; +def : Pat<(ldxr_8 GPR64sp:$addr), (LDXRX GPR64sp:$addr)>; + +def : Pat<(and (ldxr_1 GPR64sp:$addr), 0xff), + (SUBREG_TO_REG (i64 0), (LDXRB GPR64sp:$addr), sub_32)>; +def : Pat<(and (ldxr_2 GPR64sp:$addr), 0xffff), + (SUBREG_TO_REG (i64 0), (LDXRH GPR64sp:$addr), sub_32)>; +def : Pat<(and (ldxr_4 GPR64sp:$addr), 0xffffffff), + (SUBREG_TO_REG (i64 0), (LDXRW GPR64sp:$addr), sub_32)>; + +// Load-exclusives. + +def ldaxr_1 : PatFrag<(ops node:$ptr), (int_aarch64_ldaxr node:$ptr), [{ + return cast(N)->getMemoryVT() == MVT::i8; +}]>; + +def ldaxr_2 : PatFrag<(ops node:$ptr), (int_aarch64_ldaxr node:$ptr), [{ + return cast(N)->getMemoryVT() == MVT::i16; +}]>; + +def ldaxr_4 : PatFrag<(ops node:$ptr), (int_aarch64_ldaxr node:$ptr), [{ + return cast(N)->getMemoryVT() == MVT::i32; +}]>; + +def ldaxr_8 : PatFrag<(ops node:$ptr), (int_aarch64_ldaxr node:$ptr), [{ + return cast(N)->getMemoryVT() == MVT::i64; +}]>; + +def : Pat<(ldaxr_1 GPR64sp:$addr), + (SUBREG_TO_REG (i64 0), (LDAXRB GPR64sp:$addr), sub_32)>; +def : Pat<(ldaxr_2 GPR64sp:$addr), + (SUBREG_TO_REG (i64 0), (LDAXRH GPR64sp:$addr), sub_32)>; +def : Pat<(ldaxr_4 GPR64sp:$addr), + (SUBREG_TO_REG (i64 0), (LDAXRW GPR64sp:$addr), sub_32)>; +def : Pat<(ldaxr_8 GPR64sp:$addr), (LDAXRX GPR64sp:$addr)>; + +def : Pat<(and (ldaxr_1 GPR64sp:$addr), 0xff), + (SUBREG_TO_REG (i64 0), (LDAXRB GPR64sp:$addr), sub_32)>; +def : Pat<(and (ldaxr_2 GPR64sp:$addr), 0xffff), + (SUBREG_TO_REG (i64 0), (LDAXRH GPR64sp:$addr), sub_32)>; +def : Pat<(and (ldaxr_4 GPR64sp:$addr), 0xffffffff), + (SUBREG_TO_REG (i64 0), (LDAXRW GPR64sp:$addr), sub_32)>; + +// Store-exclusives. 
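An illustrative aside, not part of the patch, and assuming Clang's __builtin_arm_ldrex/__builtin_arm_strex/__builtin_arm_clrex builtins (which lower to the int_aarch64_ldxr, int_aarch64_stxr and int_aarch64_clrex intrinsics matched in this file): a hand-rolled compare-and-swap built on the load-exclusive patterns above and the store-exclusive patterns that follow might look like:

// Hypothetical helper; each iteration emits an LDXR and, on success, an STXR.
static inline bool cas32(volatile int *p, int expected, int desired) {
  for (;;) {
    int old = __builtin_arm_ldrex(p);          // int_aarch64_ldxr -> LDXR
    if (old != expected) {
      __builtin_arm_clrex();                   // int_aarch64_clrex -> CLREX
      return false;
    }
    if (__builtin_arm_strex(desired, p) == 0)  // int_aarch64_stxr -> STXR
      return true;
  }
}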
+ +def stxr_1 : PatFrag<(ops node:$val, node:$ptr), + (int_aarch64_stxr node:$val, node:$ptr), [{ + return cast(N)->getMemoryVT() == MVT::i8; +}]>; + +def stxr_2 : PatFrag<(ops node:$val, node:$ptr), + (int_aarch64_stxr node:$val, node:$ptr), [{ + return cast(N)->getMemoryVT() == MVT::i16; +}]>; + +def stxr_4 : PatFrag<(ops node:$val, node:$ptr), + (int_aarch64_stxr node:$val, node:$ptr), [{ + return cast(N)->getMemoryVT() == MVT::i32; +}]>; + +def stxr_8 : PatFrag<(ops node:$val, node:$ptr), + (int_aarch64_stxr node:$val, node:$ptr), [{ + return cast(N)->getMemoryVT() == MVT::i64; +}]>; + + +def : Pat<(stxr_1 GPR64:$val, GPR64sp:$addr), + (STXRB (EXTRACT_SUBREG GPR64:$val, sub_32), GPR64sp:$addr)>; +def : Pat<(stxr_2 GPR64:$val, GPR64sp:$addr), + (STXRH (EXTRACT_SUBREG GPR64:$val, sub_32), GPR64sp:$addr)>; +def : Pat<(stxr_4 GPR64:$val, GPR64sp:$addr), + (STXRW (EXTRACT_SUBREG GPR64:$val, sub_32), GPR64sp:$addr)>; +def : Pat<(stxr_8 GPR64:$val, GPR64sp:$addr), + (STXRX GPR64:$val, GPR64sp:$addr)>; + +def : Pat<(stxr_1 (zext (and GPR32:$val, 0xff)), GPR64sp:$addr), + (STXRB GPR32:$val, GPR64sp:$addr)>; +def : Pat<(stxr_2 (zext (and GPR32:$val, 0xffff)), GPR64sp:$addr), + (STXRH GPR32:$val, GPR64sp:$addr)>; +def : Pat<(stxr_4 (zext GPR32:$val), GPR64sp:$addr), + (STXRW GPR32:$val, GPR64sp:$addr)>; + +def : Pat<(stxr_1 (and GPR64:$val, 0xff), GPR64sp:$addr), + (STXRB (EXTRACT_SUBREG GPR64:$val, sub_32), GPR64sp:$addr)>; +def : Pat<(stxr_2 (and GPR64:$val, 0xffff), GPR64sp:$addr), + (STXRH (EXTRACT_SUBREG GPR64:$val, sub_32), GPR64sp:$addr)>; +def : Pat<(stxr_4 (and GPR64:$val, 0xffffffff), GPR64sp:$addr), + (STXRW (EXTRACT_SUBREG GPR64:$val, sub_32), GPR64sp:$addr)>; + +// Store-release-exclusives. + +def stlxr_1 : PatFrag<(ops node:$val, node:$ptr), + (int_aarch64_stlxr node:$val, node:$ptr), [{ + return cast(N)->getMemoryVT() == MVT::i8; +}]>; + +def stlxr_2 : PatFrag<(ops node:$val, node:$ptr), + (int_aarch64_stlxr node:$val, node:$ptr), [{ + return cast(N)->getMemoryVT() == MVT::i16; +}]>; + +def stlxr_4 : PatFrag<(ops node:$val, node:$ptr), + (int_aarch64_stlxr node:$val, node:$ptr), [{ + return cast(N)->getMemoryVT() == MVT::i32; +}]>; + +def stlxr_8 : PatFrag<(ops node:$val, node:$ptr), + (int_aarch64_stlxr node:$val, node:$ptr), [{ + return cast(N)->getMemoryVT() == MVT::i64; +}]>; + + +def : Pat<(stlxr_1 GPR64:$val, GPR64sp:$addr), + (STLXRB (EXTRACT_SUBREG GPR64:$val, sub_32), GPR64sp:$addr)>; +def : Pat<(stlxr_2 GPR64:$val, GPR64sp:$addr), + (STLXRH (EXTRACT_SUBREG GPR64:$val, sub_32), GPR64sp:$addr)>; +def : Pat<(stlxr_4 GPR64:$val, GPR64sp:$addr), + (STLXRW (EXTRACT_SUBREG GPR64:$val, sub_32), GPR64sp:$addr)>; +def : Pat<(stlxr_8 GPR64:$val, GPR64sp:$addr), + (STLXRX GPR64:$val, GPR64sp:$addr)>; + +def : Pat<(stlxr_1 (zext (and GPR32:$val, 0xff)), GPR64sp:$addr), + (STLXRB GPR32:$val, GPR64sp:$addr)>; +def : Pat<(stlxr_2 (zext (and GPR32:$val, 0xffff)), GPR64sp:$addr), + (STLXRH GPR32:$val, GPR64sp:$addr)>; +def : Pat<(stlxr_4 (zext GPR32:$val), GPR64sp:$addr), + (STLXRW GPR32:$val, GPR64sp:$addr)>; + +def : Pat<(stlxr_1 (and GPR64:$val, 0xff), GPR64sp:$addr), + (STLXRB (EXTRACT_SUBREG GPR64:$val, sub_32), GPR64sp:$addr)>; +def : Pat<(stlxr_2 (and GPR64:$val, 0xffff), GPR64sp:$addr), + (STLXRH (EXTRACT_SUBREG GPR64:$val, sub_32), GPR64sp:$addr)>; +def : Pat<(stlxr_4 (and GPR64:$val, 0xffffffff), GPR64sp:$addr), + (STLXRW (EXTRACT_SUBREG GPR64:$val, sub_32), GPR64sp:$addr)>; + + +// And clear exclusive. 
+ +def : Pat<(int_aarch64_clrex), (CLREX 0xf)>; diff --git a/lib/Target/AArch64/AArch64InstrFormats.td b/lib/Target/AArch64/AArch64InstrFormats.td new file mode 100644 index 00000000000..d455d7e45e0 --- /dev/null +++ b/lib/Target/AArch64/AArch64InstrFormats.td @@ -0,0 +1,8574 @@ +//===- AArch64InstrFormats.td - AArch64 Instruction Formats --*- tblgen -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// + +//===----------------------------------------------------------------------===// +// Describe AArch64 instructions format here +// + +// Format specifies the encoding used by the instruction. This is part of the +// ad-hoc solution used to emit machine instruction encodings by our machine +// code emitter. +class Format val> { + bits<2> Value = val; +} + +def PseudoFrm : Format<0>; +def NormalFrm : Format<1>; // Do we need any others? + +// AArch64 Instruction Format +class AArch64Inst : Instruction { + field bits<32> Inst; // Instruction encoding. + // Mask of bits that cause an encoding to be UNPREDICTABLE. + // If a bit is set, then if the corresponding bit in the + // target encoding differs from its value in the "Inst" field, + // the instruction is UNPREDICTABLE (SoftFail in abstract parlance). + field bits<32> Unpredictable = 0; + // SoftFail is the generic name for this field, but we alias it so + // as to make it more obvious what it means in ARM-land. + field bits<32> SoftFail = Unpredictable; + let Namespace = "AArch64"; + Format F = f; + bits<2> Form = F.Value; + let Pattern = []; + let Constraints = cstr; +} + +// Pseudo instructions (don't have encoding information) +class Pseudo pattern, string cstr = ""> + : AArch64Inst { + dag OutOperandList = oops; + dag InOperandList = iops; + let Pattern = pattern; + let isCodeGenOnly = 1; +} + +// Real instructions (have encoding information) +class EncodedI pattern> : AArch64Inst { + let Pattern = pattern; + let Size = 4; +} + +// Normal instructions +class I pattern> + : EncodedI { + dag OutOperandList = oops; + dag InOperandList = iops; + let AsmString = !strconcat(asm, operands); +} + +class TriOpFrag : PatFrag<(ops node:$LHS, node:$MHS, node:$RHS), res>; +class BinOpFrag : PatFrag<(ops node:$LHS, node:$RHS), res>; +class UnOpFrag : PatFrag<(ops node:$LHS), res>; + +// Helper fragment for an extract of the high portion of a 128-bit vector. +def extract_high_v16i8 : + UnOpFrag<(extract_subvector (v16i8 node:$LHS), (i64 8))>; +def extract_high_v8i16 : + UnOpFrag<(extract_subvector (v8i16 node:$LHS), (i64 4))>; +def extract_high_v4i32 : + UnOpFrag<(extract_subvector (v4i32 node:$LHS), (i64 2))>; +def extract_high_v2i64 : + UnOpFrag<(extract_subvector (v2i64 node:$LHS), (i64 1))>; + +//===----------------------------------------------------------------------===// +// Asm Operand Classes. +// + +// Shifter operand for arithmetic shifted encodings. +def ShifterOperand : AsmOperandClass { + let Name = "Shifter"; +} + +// Shifter operand for mov immediate encodings. 
+def MovImm32ShifterOperand : AsmOperandClass { + let SuperClasses = [ShifterOperand]; + let Name = "MovImm32Shifter"; + let RenderMethod = "addShifterOperands"; + let DiagnosticType = "InvalidMovImm32Shift"; +} +def MovImm64ShifterOperand : AsmOperandClass { + let SuperClasses = [ShifterOperand]; + let Name = "MovImm64Shifter"; + let RenderMethod = "addShifterOperands"; + let DiagnosticType = "InvalidMovImm64Shift"; +} + +// Shifter operand for arithmetic register shifted encodings. +class ArithmeticShifterOperand : AsmOperandClass { + let SuperClasses = [ShifterOperand]; + let Name = "ArithmeticShifter" # width; + let PredicateMethod = "isArithmeticShifter<" # width # ">"; + let RenderMethod = "addShifterOperands"; + let DiagnosticType = "AddSubRegShift" # width; +} + +def ArithmeticShifterOperand32 : ArithmeticShifterOperand<32>; +def ArithmeticShifterOperand64 : ArithmeticShifterOperand<64>; + +// Shifter operand for logical register shifted encodings. +class LogicalShifterOperand : AsmOperandClass { + let SuperClasses = [ShifterOperand]; + let Name = "LogicalShifter" # width; + let PredicateMethod = "isLogicalShifter<" # width # ">"; + let RenderMethod = "addShifterOperands"; + let DiagnosticType = "AddSubRegShift" # width; +} + +def LogicalShifterOperand32 : LogicalShifterOperand<32>; +def LogicalShifterOperand64 : LogicalShifterOperand<64>; + +// Shifter operand for logical vector 128/64-bit shifted encodings. +def LogicalVecShifterOperand : AsmOperandClass { + let SuperClasses = [ShifterOperand]; + let Name = "LogicalVecShifter"; + let RenderMethod = "addShifterOperands"; +} +def LogicalVecHalfWordShifterOperand : AsmOperandClass { + let SuperClasses = [LogicalVecShifterOperand]; + let Name = "LogicalVecHalfWordShifter"; + let RenderMethod = "addShifterOperands"; +} + +// The "MSL" shifter on the vector MOVI instruction. +def MoveVecShifterOperand : AsmOperandClass { + let SuperClasses = [ShifterOperand]; + let Name = "MoveVecShifter"; + let RenderMethod = "addShifterOperands"; +} + +// Extend operand for arithmetic encodings. +def ExtendOperand : AsmOperandClass { + let Name = "Extend"; + let DiagnosticType = "AddSubRegExtendLarge"; +} +def ExtendOperand64 : AsmOperandClass { + let SuperClasses = [ExtendOperand]; + let Name = "Extend64"; + let DiagnosticType = "AddSubRegExtendSmall"; +} +// 'extend' that's a lsl of a 64-bit register. +def ExtendOperandLSL64 : AsmOperandClass { + let SuperClasses = [ExtendOperand]; + let Name = "ExtendLSL64"; + let RenderMethod = "addExtend64Operands"; + let DiagnosticType = "AddSubRegExtendLarge"; +} + +// 8-bit floating-point immediate encodings. +def FPImmOperand : AsmOperandClass { + let Name = "FPImm"; + let ParserMethod = "tryParseFPImm"; + let DiagnosticType = "InvalidFPImm"; +} + +def CondCode : AsmOperandClass { + let Name = "CondCode"; + let DiagnosticType = "InvalidCondCode"; +} + +// A 32-bit register pasrsed as 64-bit +def GPR32as64Operand : AsmOperandClass { + let Name = "GPR32as64"; +} +def GPR32as64 : RegisterOperand { + let ParserMatchClass = GPR32as64Operand; +} + +// 8-bit immediate for AdvSIMD where 64-bit values of the form: +// aaaaaaaa bbbbbbbb cccccccc dddddddd eeeeeeee ffffffff gggggggg hhhhhhhh +// are encoded as the eight bit value 'abcdefgh'. +def SIMDImmType10Operand : AsmOperandClass { let Name = "SIMDImmType10"; } + + +//===----------------------------------------------------------------------===// +// Operand Definitions. +// + +// ADR[P] instruction labels. 
+def AdrpOperand : AsmOperandClass { + let Name = "AdrpLabel"; + let ParserMethod = "tryParseAdrpLabel"; + let DiagnosticType = "InvalidLabel"; +} +def adrplabel : Operand { + let EncoderMethod = "getAdrLabelOpValue"; + let PrintMethod = "printAdrpLabel"; + let ParserMatchClass = AdrpOperand; +} + +def AdrOperand : AsmOperandClass { + let Name = "AdrLabel"; + let ParserMethod = "tryParseAdrLabel"; + let DiagnosticType = "InvalidLabel"; +} +def adrlabel : Operand { + let EncoderMethod = "getAdrLabelOpValue"; + let ParserMatchClass = AdrOperand; +} + +// simm9 predicate - True if the immediate is in the range [-256, 255]. +def SImm9Operand : AsmOperandClass { + let Name = "SImm9"; + let DiagnosticType = "InvalidMemoryIndexedSImm9"; +} +def simm9 : Operand, ImmLeaf= -256 && Imm < 256; }]> { + let ParserMatchClass = SImm9Operand; +} + +// simm7sN predicate - True if the immediate is a multiple of N in the range +// [-64 * N, 63 * N]. +class SImm7Scaled : AsmOperandClass { + let Name = "SImm7s" # Scale; + let DiagnosticType = "InvalidMemoryIndexed" # Scale # "SImm7"; +} + +def SImm7s4Operand : SImm7Scaled<4>; +def SImm7s8Operand : SImm7Scaled<8>; +def SImm7s16Operand : SImm7Scaled<16>; + +def simm7s4 : Operand { + let ParserMatchClass = SImm7s4Operand; + let PrintMethod = "printImmScale<4>"; +} + +def simm7s8 : Operand { + let ParserMatchClass = SImm7s8Operand; + let PrintMethod = "printImmScale<8>"; +} + +def simm7s16 : Operand { + let ParserMatchClass = SImm7s16Operand; + let PrintMethod = "printImmScale<16>"; +} + +class AsmImmRange : AsmOperandClass { + let Name = "Imm" # Low # "_" # High; + let DiagnosticType = "InvalidImm" # Low # "_" # High; +} + +def Imm1_8Operand : AsmImmRange<1, 8>; +def Imm1_16Operand : AsmImmRange<1, 16>; +def Imm1_32Operand : AsmImmRange<1, 32>; +def Imm1_64Operand : AsmImmRange<1, 64>; + +def MovZSymbolG3AsmOperand : AsmOperandClass { + let Name = "MovZSymbolG3"; + let RenderMethod = "addImmOperands"; +} + +def movz_symbol_g3 : Operand { + let ParserMatchClass = MovZSymbolG3AsmOperand; +} + +def MovZSymbolG2AsmOperand : AsmOperandClass { + let Name = "MovZSymbolG2"; + let RenderMethod = "addImmOperands"; +} + +def movz_symbol_g2 : Operand { + let ParserMatchClass = MovZSymbolG2AsmOperand; +} + +def MovZSymbolG1AsmOperand : AsmOperandClass { + let Name = "MovZSymbolG1"; + let RenderMethod = "addImmOperands"; +} + +def movz_symbol_g1 : Operand { + let ParserMatchClass = MovZSymbolG1AsmOperand; +} + +def MovZSymbolG0AsmOperand : AsmOperandClass { + let Name = "MovZSymbolG0"; + let RenderMethod = "addImmOperands"; +} + +def movz_symbol_g0 : Operand { + let ParserMatchClass = MovZSymbolG0AsmOperand; +} + +def MovKSymbolG3AsmOperand : AsmOperandClass { + let Name = "MovKSymbolG3"; + let RenderMethod = "addImmOperands"; +} + +def movk_symbol_g3 : Operand { + let ParserMatchClass = MovKSymbolG3AsmOperand; +} + +def MovKSymbolG2AsmOperand : AsmOperandClass { + let Name = "MovKSymbolG2"; + let RenderMethod = "addImmOperands"; +} + +def movk_symbol_g2 : Operand { + let ParserMatchClass = MovKSymbolG2AsmOperand; +} + +def MovKSymbolG1AsmOperand : AsmOperandClass { + let Name = "MovKSymbolG1"; + let RenderMethod = "addImmOperands"; +} + +def movk_symbol_g1 : Operand { + let ParserMatchClass = MovKSymbolG1AsmOperand; +} + +def MovKSymbolG0AsmOperand : AsmOperandClass { + let Name = "MovKSymbolG0"; + let RenderMethod = "addImmOperands"; +} + +def movk_symbol_g0 : Operand { + let ParserMatchClass = MovKSymbolG0AsmOperand; +} + +class fixedpoint_i32 + : Operand, + ComplexPattern", 
[fpimm, ld]> { + let EncoderMethod = "getFixedPointScaleOpValue"; + let DecoderMethod = "DecodeFixedPointScaleImm32"; + let ParserMatchClass = Imm1_32Operand; +} + +class fixedpoint_i64 + : Operand, + ComplexPattern", [fpimm, ld]> { + let EncoderMethod = "getFixedPointScaleOpValue"; + let DecoderMethod = "DecodeFixedPointScaleImm64"; + let ParserMatchClass = Imm1_64Operand; +} + +def fixedpoint_f32_i32 : fixedpoint_i32; +def fixedpoint_f64_i32 : fixedpoint_i32; + +def fixedpoint_f32_i64 : fixedpoint_i64; +def fixedpoint_f64_i64 : fixedpoint_i64; + +def vecshiftR8 : Operand, ImmLeaf 0) && (((uint32_t)Imm) < 9); +}]> { + let EncoderMethod = "getVecShiftR8OpValue"; + let DecoderMethod = "DecodeVecShiftR8Imm"; + let ParserMatchClass = Imm1_8Operand; +} +def vecshiftR16 : Operand, ImmLeaf 0) && (((uint32_t)Imm) < 17); +}]> { + let EncoderMethod = "getVecShiftR16OpValue"; + let DecoderMethod = "DecodeVecShiftR16Imm"; + let ParserMatchClass = Imm1_16Operand; +} +def vecshiftR16Narrow : Operand, ImmLeaf 0) && (((uint32_t)Imm) < 9); +}]> { + let EncoderMethod = "getVecShiftR16OpValue"; + let DecoderMethod = "DecodeVecShiftR16ImmNarrow"; + let ParserMatchClass = Imm1_8Operand; +} +def vecshiftR32 : Operand, ImmLeaf 0) && (((uint32_t)Imm) < 33); +}]> { + let EncoderMethod = "getVecShiftR32OpValue"; + let DecoderMethod = "DecodeVecShiftR32Imm"; + let ParserMatchClass = Imm1_32Operand; +} +def vecshiftR32Narrow : Operand, ImmLeaf 0) && (((uint32_t)Imm) < 17); +}]> { + let EncoderMethod = "getVecShiftR32OpValue"; + let DecoderMethod = "DecodeVecShiftR32ImmNarrow"; + let ParserMatchClass = Imm1_16Operand; +} +def vecshiftR64 : Operand, ImmLeaf 0) && (((uint32_t)Imm) < 65); +}]> { + let EncoderMethod = "getVecShiftR64OpValue"; + let DecoderMethod = "DecodeVecShiftR64Imm"; + let ParserMatchClass = Imm1_64Operand; +} +def vecshiftR64Narrow : Operand, ImmLeaf 0) && (((uint32_t)Imm) < 33); +}]> { + let EncoderMethod = "getVecShiftR64OpValue"; + let DecoderMethod = "DecodeVecShiftR64ImmNarrow"; + let ParserMatchClass = Imm1_32Operand; +} + +def Imm0_7Operand : AsmImmRange<0, 7>; +def Imm0_15Operand : AsmImmRange<0, 15>; +def Imm0_31Operand : AsmImmRange<0, 31>; +def Imm0_63Operand : AsmImmRange<0, 63>; + +def vecshiftL8 : Operand, ImmLeaf { + let EncoderMethod = "getVecShiftL8OpValue"; + let DecoderMethod = "DecodeVecShiftL8Imm"; + let ParserMatchClass = Imm0_7Operand; +} +def vecshiftL16 : Operand, ImmLeaf { + let EncoderMethod = "getVecShiftL16OpValue"; + let DecoderMethod = "DecodeVecShiftL16Imm"; + let ParserMatchClass = Imm0_15Operand; +} +def vecshiftL32 : Operand, ImmLeaf { + let EncoderMethod = "getVecShiftL32OpValue"; + let DecoderMethod = "DecodeVecShiftL32Imm"; + let ParserMatchClass = Imm0_31Operand; +} +def vecshiftL64 : Operand, ImmLeaf { + let EncoderMethod = "getVecShiftL64OpValue"; + let DecoderMethod = "DecodeVecShiftL64Imm"; + let ParserMatchClass = Imm0_63Operand; +} + + +// Crazy immediate formats used by 32-bit and 64-bit logical immediate +// instructions for splatting repeating bit patterns across the immediate. 
+def logical_imm32_XFORM : SDNodeXFormgetZExtValue(), 32); + return CurDAG->getTargetConstant(enc, MVT::i32); +}]>; +def logical_imm64_XFORM : SDNodeXFormgetZExtValue(), 64); + return CurDAG->getTargetConstant(enc, MVT::i32); +}]>; + +def LogicalImm32Operand : AsmOperandClass { + let Name = "LogicalImm32"; + let DiagnosticType = "LogicalSecondSource"; +} +def LogicalImm64Operand : AsmOperandClass { + let Name = "LogicalImm64"; + let DiagnosticType = "LogicalSecondSource"; +} +def logical_imm32 : Operand, PatLeaf<(imm), [{ + return AArch64_AM::isLogicalImmediate(N->getZExtValue(), 32); +}], logical_imm32_XFORM> { + let PrintMethod = "printLogicalImm32"; + let ParserMatchClass = LogicalImm32Operand; +} +def logical_imm64 : Operand, PatLeaf<(imm), [{ + return AArch64_AM::isLogicalImmediate(N->getZExtValue(), 64); +}], logical_imm64_XFORM> { + let PrintMethod = "printLogicalImm64"; + let ParserMatchClass = LogicalImm64Operand; +} + +// imm0_65535 predicate - True if the immediate is in the range [0,65535]. +def Imm0_65535Operand : AsmImmRange<0, 65535>; +def imm0_65535 : Operand, ImmLeaf { + let ParserMatchClass = Imm0_65535Operand; + let PrintMethod = "printHexImm"; +} + +// imm0_255 predicate - True if the immediate is in the range [0,255]. +def Imm0_255Operand : AsmOperandClass { let Name = "Imm0_255"; } +def imm0_255 : Operand, ImmLeaf { + let ParserMatchClass = Imm0_255Operand; + let PrintMethod = "printHexImm"; +} + +// imm0_127 predicate - True if the immediate is in the range [0,127] +def Imm0_127Operand : AsmImmRange<0, 127>; +def imm0_127 : Operand, ImmLeaf { + let ParserMatchClass = Imm0_127Operand; + let PrintMethod = "printHexImm"; +} + +// NOTE: These imm0_N operands have to be of type i64 because i64 is the size +// for all shift-amounts. 
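A concrete illustration, not part of the patch, of what the isLogicalImmediate checks above accept: a constant is only encodable as a logical immediate if it is a replicated, rotated run of contiguous ones, so some masks fold straight into the instruction while others must be materialized first.

#include <cstdint>

// Encodable: 0x00ff00ff00ff00ff is the 16-bit element 0x00ff replicated
// across the register, a contiguous run of ones, so this is a single AND
// with a logical immediate.
uint64_t mask_low_bytes(uint64_t x) { return x & 0x00ff00ff00ff00ffULL; }

// Not encodable: 0x123456789abcdef0 is not a rotated run of ones, so the
// constant is expected to be built in a register (e.g. MOVZ/MOVK) first.
uint64_t mask_arbitrary(uint64_t x) { return x & 0x123456789abcdef0ULL; }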
+ +// imm0_63 predicate - True if the immediate is in the range [0,63] +def imm0_63 : Operand, ImmLeaf { + let ParserMatchClass = Imm0_63Operand; +} + +// imm0_31 predicate - True if the immediate is in the range [0,31] +def imm0_31 : Operand, ImmLeaf { + let ParserMatchClass = Imm0_31Operand; +} + +// imm0_15 predicate - True if the immediate is in the range [0,15] +def imm0_15 : Operand, ImmLeaf { + let ParserMatchClass = Imm0_15Operand; +} + +// imm0_7 predicate - True if the immediate is in the range [0,7] +def imm0_7 : Operand, ImmLeaf { + let ParserMatchClass = Imm0_7Operand; +} + +// An arithmetic shifter operand: +// {7-6} - shift type: 00 = lsl, 01 = lsr, 10 = asr +// {5-0} - imm6 +class arith_shift : Operand { + let PrintMethod = "printShifter"; + let ParserMatchClass = !cast( + "ArithmeticShifterOperand" # width); +} + +def arith_shift32 : arith_shift; +def arith_shift64 : arith_shift; + +class arith_shifted_reg + : Operand, + ComplexPattern { + let PrintMethod = "printShiftedRegister"; + let MIOperandInfo = (ops regclass, !cast("arith_shift" # width)); +} + +def arith_shifted_reg32 : arith_shifted_reg; +def arith_shifted_reg64 : arith_shifted_reg; + +// An arithmetic shifter operand: +// {7-6} - shift type: 00 = lsl, 01 = lsr, 10 = asr, 11 = ror +// {5-0} - imm6 +class logical_shift : Operand { + let PrintMethod = "printShifter"; + let ParserMatchClass = !cast( + "LogicalShifterOperand" # width); +} + +def logical_shift32 : logical_shift<32>; +def logical_shift64 : logical_shift<64>; + +class logical_shifted_reg + : Operand, + ComplexPattern { + let PrintMethod = "printShiftedRegister"; + let MIOperandInfo = (ops regclass, shiftop); +} + +def logical_shifted_reg32 : logical_shifted_reg; +def logical_shifted_reg64 : logical_shifted_reg; + +// A logical vector shifter operand: +// {7-6} - shift type: 00 = lsl +// {5-0} - imm6: #0, #8, #16, or #24 +def logical_vec_shift : Operand { + let PrintMethod = "printShifter"; + let EncoderMethod = "getVecShifterOpValue"; + let ParserMatchClass = LogicalVecShifterOperand; +} + +// A logical vector half-word shifter operand: +// {7-6} - shift type: 00 = lsl +// {5-0} - imm6: #0 or #8 +def logical_vec_hw_shift : Operand { + let PrintMethod = "printShifter"; + let EncoderMethod = "getVecShifterOpValue"; + let ParserMatchClass = LogicalVecHalfWordShifterOperand; +} + +// A vector move shifter operand: +// {0} - imm1: #8 or #16 +def move_vec_shift : Operand { + let PrintMethod = "printShifter"; + let EncoderMethod = "getMoveVecShifterOpValue"; + let ParserMatchClass = MoveVecShifterOperand; +} + +def AddSubImmOperand : AsmOperandClass { + let Name = "AddSubImm"; + let ParserMethod = "tryParseAddSubImm"; + let DiagnosticType = "AddSubSecondSource"; +} +// An ADD/SUB immediate shifter operand: +// second operand: +// {7-6} - shift type: 00 = lsl +// {5-0} - imm6: #0 or #12 +class addsub_shifted_imm + : Operand, ComplexPattern { + let PrintMethod = "printAddSubImm"; + let EncoderMethod = "getAddSubImmOpValue"; + let ParserMatchClass = AddSubImmOperand; + let MIOperandInfo = (ops i32imm, i32imm); +} + +def addsub_shifted_imm32 : addsub_shifted_imm; +def addsub_shifted_imm64 : addsub_shifted_imm; + +class neg_addsub_shifted_imm + : Operand, ComplexPattern { + let PrintMethod = "printAddSubImm"; + let EncoderMethod = "getAddSubImmOpValue"; + let ParserMatchClass = AddSubImmOperand; + let MIOperandInfo = (ops i32imm, i32imm); +} + +def neg_addsub_shifted_imm32 : neg_addsub_shifted_imm; +def neg_addsub_shifted_imm64 : neg_addsub_shifted_imm; + +// An 
extend operand: +// {5-3} - extend type +// {2-0} - imm3 +def arith_extend : Operand { + let PrintMethod = "printArithExtend"; + let ParserMatchClass = ExtendOperand; +} +def arith_extend64 : Operand { + let PrintMethod = "printArithExtend"; + let ParserMatchClass = ExtendOperand64; +} + +// 'extend' that's a lsl of a 64-bit register. +def arith_extendlsl64 : Operand { + let PrintMethod = "printArithExtend"; + let ParserMatchClass = ExtendOperandLSL64; +} + +class arith_extended_reg32 : Operand, + ComplexPattern { + let PrintMethod = "printExtendedRegister"; + let MIOperandInfo = (ops GPR32, arith_extend); +} + +class arith_extended_reg32to64 : Operand, + ComplexPattern { + let PrintMethod = "printExtendedRegister"; + let MIOperandInfo = (ops GPR32, arith_extend64); +} + +// Floating-point immediate. +def fpimm32 : Operand, + PatLeaf<(f32 fpimm), [{ + return AArch64_AM::getFP32Imm(N->getValueAPF()) != -1; + }], SDNodeXFormgetValueAPF(); + uint32_t enc = AArch64_AM::getFP32Imm(InVal); + return CurDAG->getTargetConstant(enc, MVT::i32); + }]>> { + let ParserMatchClass = FPImmOperand; + let PrintMethod = "printFPImmOperand"; +} +def fpimm64 : Operand, + PatLeaf<(f64 fpimm), [{ + return AArch64_AM::getFP64Imm(N->getValueAPF()) != -1; + }], SDNodeXFormgetValueAPF(); + uint32_t enc = AArch64_AM::getFP64Imm(InVal); + return CurDAG->getTargetConstant(enc, MVT::i32); + }]>> { + let ParserMatchClass = FPImmOperand; + let PrintMethod = "printFPImmOperand"; +} + +def fpimm8 : Operand { + let ParserMatchClass = FPImmOperand; + let PrintMethod = "printFPImmOperand"; +} + +def fpimm0 : PatLeaf<(fpimm), [{ + return N->isExactlyValue(+0.0); +}]>; + +// Vector lane operands +class AsmVectorIndex : AsmOperandClass { + let Name = "VectorIndex" # Suffix; + let DiagnosticType = "InvalidIndex" # Suffix; +} +def VectorIndex1Operand : AsmVectorIndex<"1">; +def VectorIndexBOperand : AsmVectorIndex<"B">; +def VectorIndexHOperand : AsmVectorIndex<"H">; +def VectorIndexSOperand : AsmVectorIndex<"S">; +def VectorIndexDOperand : AsmVectorIndex<"D">; + +def VectorIndex1 : Operand, ImmLeaf { + let ParserMatchClass = VectorIndex1Operand; + let PrintMethod = "printVectorIndex"; + let MIOperandInfo = (ops i64imm); +} +def VectorIndexB : Operand, ImmLeaf { + let ParserMatchClass = VectorIndexBOperand; + let PrintMethod = "printVectorIndex"; + let MIOperandInfo = (ops i64imm); +} +def VectorIndexH : Operand, ImmLeaf { + let ParserMatchClass = VectorIndexHOperand; + let PrintMethod = "printVectorIndex"; + let MIOperandInfo = (ops i64imm); +} +def VectorIndexS : Operand, ImmLeaf { + let ParserMatchClass = VectorIndexSOperand; + let PrintMethod = "printVectorIndex"; + let MIOperandInfo = (ops i64imm); +} +def VectorIndexD : Operand, ImmLeaf { + let ParserMatchClass = VectorIndexDOperand; + let PrintMethod = "printVectorIndex"; + let MIOperandInfo = (ops i64imm); +} + +// 8-bit immediate for AdvSIMD where 64-bit values of the form: +// aaaaaaaa bbbbbbbb cccccccc dddddddd eeeeeeee ffffffff gggggggg hhhhhhhh +// are encoded as the eight bit value 'abcdefgh'. 
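A worked example, not part of the patch, of the 'abcdefgh' encoding described above: each of the eight immediate bits is replicated into a full byte of the 64-bit value.

#include <cstdint>

// Decode an AdvSIMD "type 10" immediate: bit i of 'imm' becomes byte i
// (0x00 or 0xff) of the result, so 0b10110001 -> 0xff00ffff000000ff.
uint64_t expandType10(uint8_t imm) {
  uint64_t result = 0;
  for (int i = 0; i < 8; ++i)
    if (imm & (1u << i))
      result |= 0xffULL << (8 * i);
  return result;
}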
+def simdimmtype10 : Operand, + PatLeaf<(f64 fpimm), [{ + return AArch64_AM::isAdvSIMDModImmType10(N->getValueAPF() + .bitcastToAPInt() + .getZExtValue()); + }], SDNodeXFormgetValueAPF(); + uint32_t enc = AArch64_AM::encodeAdvSIMDModImmType10(N->getValueAPF() + .bitcastToAPInt() + .getZExtValue()); + return CurDAG->getTargetConstant(enc, MVT::i32); + }]>> { + let ParserMatchClass = SIMDImmType10Operand; + let PrintMethod = "printSIMDType10Operand"; +} + + +//--- +// System management +//--- + +// Base encoding for system instruction operands. +let mayLoad = 0, mayStore = 0, hasSideEffects = 1 in +class BaseSystemI + : I { + let Inst{31-22} = 0b1101010100; + let Inst{21} = L; +} + +// System instructions which do not have an Rt register. +class SimpleSystemI + : BaseSystemI { + let Inst{4-0} = 0b11111; +} + +// System instructions which have an Rt register. +class RtSystemI + : BaseSystemI, + Sched<[WriteSys]> { + bits<5> Rt; + let Inst{4-0} = Rt; +} + +// Hint instructions that take both a CRm and a 3-bit immediate. +class HintI + : SimpleSystemI<0, (ins imm0_127:$imm), mnemonic#" $imm", "">, + Sched<[WriteHint]> { + bits <7> imm; + let Inst{20-12} = 0b000110010; + let Inst{11-5} = imm; +} + +// System instructions taking a single literal operand which encodes into +// CRm. op2 differentiates the opcodes. +def BarrierAsmOperand : AsmOperandClass { + let Name = "Barrier"; + let ParserMethod = "tryParseBarrierOperand"; +} +def barrier_op : Operand { + let PrintMethod = "printBarrierOption"; + let ParserMatchClass = BarrierAsmOperand; +} +class CRmSystemI opc, string asm> + : SimpleSystemI<0, (ins crmtype:$CRm), asm, "\t$CRm">, + Sched<[WriteBarrier]> { + bits<4> CRm; + let Inst{20-12} = 0b000110011; + let Inst{11-8} = CRm; + let Inst{7-5} = opc; +} + +// MRS/MSR system instructions. These have different operand classes because +// a different subset of registers can be accessed through each instruction. +def MRSSystemRegisterOperand : AsmOperandClass { + let Name = "MRSSystemRegister"; + let ParserMethod = "tryParseSysReg"; + let DiagnosticType = "MRS"; +} +// concatenation of 1, op0, op1, CRn, CRm, op2. 16-bit immediate. +def mrs_sysreg_op : Operand { + let ParserMatchClass = MRSSystemRegisterOperand; + let DecoderMethod = "DecodeMRSSystemRegister"; + let PrintMethod = "printMRSSystemRegister"; +} + +def MSRSystemRegisterOperand : AsmOperandClass { + let Name = "MSRSystemRegister"; + let ParserMethod = "tryParseSysReg"; + let DiagnosticType = "MSR"; +} +def msr_sysreg_op : Operand { + let ParserMatchClass = MSRSystemRegisterOperand; + let DecoderMethod = "DecodeMSRSystemRegister"; + let PrintMethod = "printMSRSystemRegister"; +} + +class MRSI : RtSystemI<1, (outs GPR64:$Rt), (ins mrs_sysreg_op:$systemreg), + "mrs", "\t$Rt, $systemreg"> { + bits<15> systemreg; + let Inst{20} = 1; + let Inst{19-5} = systemreg; +} + +// FIXME: Some of these def NZCV, others don't. Best way to model that? +// Explicitly modeling each of the system register as a register class +// would do it, but feels like overkill at this point. 
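For orientation, an aside that is not part of the patch: the MRS form defined above is what reads of EL0-accessible system registers turn into, for example the thread-pointer register via inline assembly.

#include <cstdint>

// Read TPIDR_EL0 with an MRS instruction; the system register operand is
// what mrs_sysreg_op above parses and prints.  Illustrative inline asm only.
static inline uint64_t read_thread_pointer() {
  uint64_t tp;
  asm volatile("mrs %0, TPIDR_EL0" : "=r"(tp));
  return tp;
}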
+class MSRI : RtSystemI<0, (outs), (ins msr_sysreg_op:$systemreg, GPR64:$Rt), + "msr", "\t$systemreg, $Rt"> { + bits<15> systemreg; + let Inst{20} = 1; + let Inst{19-5} = systemreg; +} + +def SystemPStateFieldOperand : AsmOperandClass { + let Name = "SystemPStateField"; + let ParserMethod = "tryParseSysReg"; +} +def pstatefield_op : Operand { + let ParserMatchClass = SystemPStateFieldOperand; + let PrintMethod = "printSystemPStateField"; +} + +let Defs = [NZCV] in +class MSRpstateI + : SimpleSystemI<0, (ins pstatefield_op:$pstate_field, imm0_15:$imm), + "msr", "\t$pstate_field, $imm">, + Sched<[WriteSys]> { + bits<6> pstatefield; + bits<4> imm; + let Inst{20-19} = 0b00; + let Inst{18-16} = pstatefield{5-3}; + let Inst{15-12} = 0b0100; + let Inst{11-8} = imm; + let Inst{7-5} = pstatefield{2-0}; + + let DecoderMethod = "DecodeSystemPStateInstruction"; +} + +// SYS and SYSL generic system instructions. +def SysCRAsmOperand : AsmOperandClass { + let Name = "SysCR"; + let ParserMethod = "tryParseSysCROperand"; +} + +def sys_cr_op : Operand { + let PrintMethod = "printSysCROperand"; + let ParserMatchClass = SysCRAsmOperand; +} + +class SystemXtI + : RtSystemI { + bits<3> op1; + bits<4> Cn; + bits<4> Cm; + bits<3> op2; + let Inst{20-19} = 0b01; + let Inst{18-16} = op1; + let Inst{15-12} = Cn; + let Inst{11-8} = Cm; + let Inst{7-5} = op2; +} + +class SystemLXtI + : RtSystemI { + bits<3> op1; + bits<4> Cn; + bits<4> Cm; + bits<3> op2; + let Inst{20-19} = 0b01; + let Inst{18-16} = op1; + let Inst{15-12} = Cn; + let Inst{11-8} = Cm; + let Inst{7-5} = op2; +} + + +// Branch (register) instructions: +// +// case opc of +// 0001 blr +// 0000 br +// 0101 dret +// 0100 eret +// 0010 ret +// otherwise UNDEFINED +class BaseBranchReg opc, dag oops, dag iops, string asm, + string operands, list pattern> + : I, Sched<[WriteBrReg]> { + let Inst{31-25} = 0b1101011; + let Inst{24-21} = opc; + let Inst{20-16} = 0b11111; + let Inst{15-10} = 0b000000; + let Inst{4-0} = 0b00000; +} + +class BranchReg opc, string asm, list pattern> + : BaseBranchReg { + bits<5> Rn; + let Inst{9-5} = Rn; +} + +let mayLoad = 0, mayStore = 0, hasSideEffects = 1, isReturn = 1 in +class SpecialReturn opc, string asm> + : BaseBranchReg { + let Inst{9-5} = 0b11111; +} + +//--- +// Conditional branch instruction. +//--- + +// Condition code. +// 4-bit immediate. Pretty-printed as +def ccode : Operand { + let PrintMethod = "printCondCode"; + let ParserMatchClass = CondCode; +} +def inv_ccode : Operand { + let PrintMethod = "printInverseCondCode"; + let ParserMatchClass = CondCode; +} + +// Conditional branch target. 19-bit immediate. The low two bits of the target +// offset are implied zero and so are not part of the immediate. +def PCRelLabel19Operand : AsmOperandClass { + let Name = "PCRelLabel19"; + let DiagnosticType = "InvalidLabel"; +} +def am_brcond : Operand { + let EncoderMethod = "getCondBranchTargetOpValue"; + let DecoderMethod = "DecodePCRelLabel19"; + let PrintMethod = "printAlignedLabel"; + let ParserMatchClass = PCRelLabel19Operand; +} + +class BranchCond : I<(outs), (ins ccode:$cond, am_brcond:$target), + "b", ".$cond\t$target", "", + [(AArch64brcond bb:$target, imm:$cond, NZCV)]>, + Sched<[WriteBr]> { + let isBranch = 1; + let isTerminator = 1; + let Uses = [NZCV]; + + bits<4> cond; + bits<19> target; + let Inst{31-24} = 0b01010100; + let Inst{23-5} = target; + let Inst{4} = 0; + let Inst{3-0} = cond; +} + +//--- +// Compare-and-branch instructions. 
+//--- +class BaseCmpBranch + : I<(outs), (ins regtype:$Rt, am_brcond:$target), + asm, "\t$Rt, $target", "", + [(node regtype:$Rt, bb:$target)]>, + Sched<[WriteBr]> { + let isBranch = 1; + let isTerminator = 1; + + bits<5> Rt; + bits<19> target; + let Inst{30-25} = 0b011010; + let Inst{24} = op; + let Inst{23-5} = target; + let Inst{4-0} = Rt; +} + +multiclass CmpBranch { + def W : BaseCmpBranch { + let Inst{31} = 0; + } + def X : BaseCmpBranch { + let Inst{31} = 1; + } +} + +//--- +// Test-bit-and-branch instructions. +//--- +// Test-and-branch target. 14-bit sign-extended immediate. The low two bits of +// the target offset are implied zero and so are not part of the immediate. +def BranchTarget14Operand : AsmOperandClass { + let Name = "BranchTarget14"; +} +def am_tbrcond : Operand { + let EncoderMethod = "getTestBranchTargetOpValue"; + let PrintMethod = "printAlignedLabel"; + let ParserMatchClass = BranchTarget14Operand; +} + +// AsmOperand classes to emit (or not) special diagnostics +def TBZImm0_31Operand : AsmOperandClass { + let Name = "TBZImm0_31"; + let PredicateMethod = "isImm0_31"; + let RenderMethod = "addImm0_31Operands"; +} +def TBZImm32_63Operand : AsmOperandClass { + let Name = "Imm32_63"; + let DiagnosticType = "InvalidImm0_63"; +} + +class tbz_imm0_31 : Operand, ImmLeaf { + let ParserMatchClass = matcher; +} + +def tbz_imm0_31_diag : tbz_imm0_31; +def tbz_imm0_31_nodiag : tbz_imm0_31; + +def tbz_imm32_63 : Operand, ImmLeaf 31) && (((uint32_t)Imm) < 64); +}]> { + let ParserMatchClass = TBZImm32_63Operand; +} + +class BaseTestBranch + : I<(outs), (ins regtype:$Rt, immtype:$bit_off, am_tbrcond:$target), + asm, "\t$Rt, $bit_off, $target", "", + [(node regtype:$Rt, immtype:$bit_off, bb:$target)]>, + Sched<[WriteBr]> { + let isBranch = 1; + let isTerminator = 1; + + bits<5> Rt; + bits<6> bit_off; + bits<14> target; + + let Inst{30-25} = 0b011011; + let Inst{24} = op; + let Inst{23-19} = bit_off{4-0}; + let Inst{18-5} = target; + let Inst{4-0} = Rt; + + let DecoderMethod = "DecodeTestAndBranch"; +} + +multiclass TestBranch { + def W : BaseTestBranch { + let Inst{31} = 0; + } + + def X : BaseTestBranch { + let Inst{31} = 1; + } + + // Alias X-reg with 0-31 imm to W-Reg. + def : InstAlias(NAME#"W") GPR32as64:$Rd, + tbz_imm0_31_nodiag:$imm, am_tbrcond:$target), 0>; + def : Pat<(node GPR64:$Rn, tbz_imm0_31_diag:$imm, bb:$target), + (!cast(NAME#"W") (EXTRACT_SUBREG GPR64:$Rn, sub_32), + tbz_imm0_31_diag:$imm, bb:$target)>; +} + +//--- +// Unconditional branch (immediate) instructions. +//--- +def BranchTarget26Operand : AsmOperandClass { + let Name = "BranchTarget26"; + let DiagnosticType = "InvalidLabel"; +} +def am_b_target : Operand { + let EncoderMethod = "getBranchTargetOpValue"; + let PrintMethod = "printAlignedLabel"; + let ParserMatchClass = BranchTarget26Operand; +} +def am_bl_target : Operand { + let EncoderMethod = "getBranchTargetOpValue"; + let PrintMethod = "printAlignedLabel"; + let ParserMatchClass = BranchTarget26Operand; +} + +class BImm pattern> + : I<(outs), iops, asm, "\t$addr", "", pattern>, Sched<[WriteBr]> { + bits<26> addr; + let Inst{31} = op; + let Inst{30-26} = 0b00101; + let Inst{25-0} = addr; + + let DecoderMethod = "DecodeUnconditionalBranch"; +} + +class BranchImm pattern> + : BImm; +class CallImm pattern> + : BImm; + +//--- +// Basic one-operand data processing instructions. 
+//--- + +let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in +class BaseOneOperandData opc, RegisterClass regtype, string asm, + SDPatternOperator node> + : I<(outs regtype:$Rd), (ins regtype:$Rn), asm, "\t$Rd, $Rn", "", + [(set regtype:$Rd, (node regtype:$Rn))]>, + Sched<[WriteI, ReadI]> { + bits<5> Rd; + bits<5> Rn; + + let Inst{30-13} = 0b101101011000000000; + let Inst{12-10} = opc; + let Inst{9-5} = Rn; + let Inst{4-0} = Rd; +} + +let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in +multiclass OneOperandData opc, string asm, + SDPatternOperator node = null_frag> { + def Wr : BaseOneOperandData { + let Inst{31} = 0; + } + + def Xr : BaseOneOperandData { + let Inst{31} = 1; + } +} + +class OneWRegData opc, string asm, SDPatternOperator node> + : BaseOneOperandData { + let Inst{31} = 0; +} + +class OneXRegData opc, string asm, SDPatternOperator node> + : BaseOneOperandData { + let Inst{31} = 1; +} + +//--- +// Basic two-operand data processing instructions. +//--- +class BaseBaseAddSubCarry pattern> + : I<(outs regtype:$Rd), (ins regtype:$Rn, regtype:$Rm), + asm, "\t$Rd, $Rn, $Rm", "", pattern>, + Sched<[WriteI, ReadI, ReadI]> { + let Uses = [NZCV]; + bits<5> Rd; + bits<5> Rn; + bits<5> Rm; + let Inst{30} = isSub; + let Inst{28-21} = 0b11010000; + let Inst{20-16} = Rm; + let Inst{15-10} = 0; + let Inst{9-5} = Rn; + let Inst{4-0} = Rd; +} + +class BaseAddSubCarry + : BaseBaseAddSubCarry; + +class BaseAddSubCarrySetFlags + : BaseBaseAddSubCarry { + let Defs = [NZCV]; +} + +multiclass AddSubCarry { + def Wr : BaseAddSubCarry { + let Inst{31} = 0; + let Inst{29} = 0; + } + def Xr : BaseAddSubCarry { + let Inst{31} = 1; + let Inst{29} = 0; + } + + // Sets flags. + def SWr : BaseAddSubCarrySetFlags { + let Inst{31} = 0; + let Inst{29} = 1; + } + def SXr : BaseAddSubCarrySetFlags { + let Inst{31} = 1; + let Inst{29} = 1; + } +} + +class BaseTwoOperand opc, RegisterClass regtype, string asm, + SDPatternOperator OpNode> + : I<(outs regtype:$Rd), (ins regtype:$Rn, regtype:$Rm), + asm, "\t$Rd, $Rn, $Rm", "", + [(set regtype:$Rd, (OpNode regtype:$Rn, regtype:$Rm))]> { + bits<5> Rd; + bits<5> Rn; + bits<5> Rm; + let Inst{30-21} = 0b0011010110; + let Inst{20-16} = Rm; + let Inst{15-14} = 0b00; + let Inst{13-10} = opc; + let Inst{9-5} = Rn; + let Inst{4-0} = Rd; +} + +class BaseDiv + : BaseTwoOperand<{0,0,1,?}, regtype, asm, OpNode> { + let Inst{10} = isSigned; +} + +multiclass Div { + def Wr : BaseDiv, + Sched<[WriteID32, ReadID, ReadID]> { + let Inst{31} = 0; + } + def Xr : BaseDiv, + Sched<[WriteID64, ReadID, ReadID]> { + let Inst{31} = 1; + } +} + +class BaseShift shift_type, RegisterClass regtype, string asm, + SDPatternOperator OpNode = null_frag> + : BaseTwoOperand<{1,0,?,?}, regtype, asm, OpNode>, + Sched<[WriteIS, ReadI]> { + let Inst{11-10} = shift_type; +} + +multiclass Shift shift_type, string asm, SDNode OpNode> { + def Wr : BaseShift { + let Inst{31} = 0; + } + + def Xr : BaseShift { + let Inst{31} = 1; + } + + def : Pat<(i32 (OpNode GPR32:$Rn, i64:$Rm)), + (!cast(NAME # "Wr") GPR32:$Rn, + (EXTRACT_SUBREG i64:$Rm, sub_32))>; + + def : Pat<(i32 (OpNode GPR32:$Rn, (i64 (zext GPR32:$Rm)))), + (!cast(NAME # "Wr") GPR32:$Rn, GPR32:$Rm)>; + + def : Pat<(i32 (OpNode GPR32:$Rn, (i64 (anyext GPR32:$Rm)))), + (!cast(NAME # "Wr") GPR32:$Rn, GPR32:$Rm)>; + + def : Pat<(i32 (OpNode GPR32:$Rn, (i64 (sext GPR32:$Rm)))), + (!cast(NAME # "Wr") GPR32:$Rn, GPR32:$Rm)>; +} + +class ShiftAlias + : InstAlias; + +class BaseMulAccum opc, RegisterClass multype, + RegisterClass addtype, string asm, + list 
pattern> + : I<(outs addtype:$Rd), (ins multype:$Rn, multype:$Rm, addtype:$Ra), + asm, "\t$Rd, $Rn, $Rm, $Ra", "", pattern> { + bits<5> Rd; + bits<5> Rn; + bits<5> Rm; + bits<5> Ra; + let Inst{30-24} = 0b0011011; + let Inst{23-21} = opc; + let Inst{20-16} = Rm; + let Inst{15} = isSub; + let Inst{14-10} = Ra; + let Inst{9-5} = Rn; + let Inst{4-0} = Rd; +} + +multiclass MulAccum { + def Wrrr : BaseMulAccum, + Sched<[WriteIM32, ReadIMA, ReadIM, ReadIM]> { + let Inst{31} = 0; + } + + def Xrrr : BaseMulAccum, + Sched<[WriteIM64, ReadIMA, ReadIM, ReadIM]> { + let Inst{31} = 1; + } +} + +class WideMulAccum opc, string asm, + SDNode AccNode, SDNode ExtNode> + : BaseMulAccum, + Sched<[WriteIM32, ReadIMA, ReadIM, ReadIM]> { + let Inst{31} = 1; +} + +class MulHi opc, string asm, SDNode OpNode> + : I<(outs GPR64:$Rd), (ins GPR64:$Rn, GPR64:$Rm), + asm, "\t$Rd, $Rn, $Rm", "", + [(set GPR64:$Rd, (OpNode GPR64:$Rn, GPR64:$Rm))]>, + Sched<[WriteIM64, ReadIM, ReadIM]> { + bits<5> Rd; + bits<5> Rn; + bits<5> Rm; + let Inst{31-24} = 0b10011011; + let Inst{23-21} = opc; + let Inst{20-16} = Rm; + let Inst{15} = 0; + let Inst{9-5} = Rn; + let Inst{4-0} = Rd; + + // The Ra field of SMULH and UMULH is unused: it should be assembled as 31 + // (i.e. all bits 1) but is ignored by the processor. + let PostEncoderMethod = "fixMulHigh"; +} + +class MulAccumWAlias + : InstAlias; +class MulAccumXAlias + : InstAlias; +class WideMulAccumAlias + : InstAlias; + +class BaseCRC32 sz, bit C, RegisterClass StreamReg, + SDPatternOperator OpNode, string asm> + : I<(outs GPR32:$Rd), (ins GPR32:$Rn, StreamReg:$Rm), + asm, "\t$Rd, $Rn, $Rm", "", + [(set GPR32:$Rd, (OpNode GPR32:$Rn, StreamReg:$Rm))]>, + Sched<[WriteISReg, ReadI, ReadISReg]> { + bits<5> Rd; + bits<5> Rn; + bits<5> Rm; + + let Inst{31} = sf; + let Inst{30-21} = 0b0011010110; + let Inst{20-16} = Rm; + let Inst{15-13} = 0b010; + let Inst{12} = C; + let Inst{11-10} = sz; + let Inst{9-5} = Rn; + let Inst{4-0} = Rd; + let Predicates = [HasCRC]; +} + +//--- +// Address generation. +//--- + +class ADRI pattern> + : I<(outs GPR64:$Xd), (ins adr:$label), asm, "\t$Xd, $label", "", + pattern>, + Sched<[WriteI]> { + bits<5> Xd; + bits<21> label; + let Inst{31} = page; + let Inst{30-29} = label{1-0}; + let Inst{28-24} = 0b10000; + let Inst{23-5} = label{20-2}; + let Inst{4-0} = Xd; + + let DecoderMethod = "DecodeAdrInstruction"; +} + +//--- +// Move immediate. 
+//--- + +def movimm32_imm : Operand { + let ParserMatchClass = Imm0_65535Operand; + let EncoderMethod = "getMoveWideImmOpValue"; + let PrintMethod = "printHexImm"; +} +def movimm32_shift : Operand { + let PrintMethod = "printShifter"; + let ParserMatchClass = MovImm32ShifterOperand; +} +def movimm64_shift : Operand { + let PrintMethod = "printShifter"; + let ParserMatchClass = MovImm64ShifterOperand; +} + +let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in +class BaseMoveImmediate opc, RegisterClass regtype, Operand shifter, + string asm> + : I<(outs regtype:$Rd), (ins movimm32_imm:$imm, shifter:$shift), + asm, "\t$Rd, $imm$shift", "", []>, + Sched<[WriteImm]> { + bits<5> Rd; + bits<16> imm; + bits<6> shift; + let Inst{30-29} = opc; + let Inst{28-23} = 0b100101; + let Inst{22-21} = shift{5-4}; + let Inst{20-5} = imm; + let Inst{4-0} = Rd; + + let DecoderMethod = "DecodeMoveImmInstruction"; +} + +multiclass MoveImmediate opc, string asm> { + def Wi : BaseMoveImmediate { + let Inst{31} = 0; + } + + def Xi : BaseMoveImmediate { + let Inst{31} = 1; + } +} + +let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in +class BaseInsertImmediate opc, RegisterClass regtype, Operand shifter, + string asm> + : I<(outs regtype:$Rd), + (ins regtype:$src, movimm32_imm:$imm, shifter:$shift), + asm, "\t$Rd, $imm$shift", "$src = $Rd", []>, + Sched<[WriteI, ReadI]> { + bits<5> Rd; + bits<16> imm; + bits<6> shift; + let Inst{30-29} = opc; + let Inst{28-23} = 0b100101; + let Inst{22-21} = shift{5-4}; + let Inst{20-5} = imm; + let Inst{4-0} = Rd; + + let DecoderMethod = "DecodeMoveImmInstruction"; +} + +multiclass InsertImmediate opc, string asm> { + def Wi : BaseInsertImmediate { + let Inst{31} = 0; + } + + def Xi : BaseInsertImmediate { + let Inst{31} = 1; + } +} + +//--- +// Add/Subtract +//--- + +class BaseAddSubImm + : I<(outs dstRegtype:$Rd), (ins srcRegtype:$Rn, immtype:$imm), + asm, "\t$Rd, $Rn, $imm", "", + [(set dstRegtype:$Rd, (OpNode srcRegtype:$Rn, immtype:$imm))]>, + Sched<[WriteI, ReadI]> { + bits<5> Rd; + bits<5> Rn; + bits<14> imm; + let Inst{30} = isSub; + let Inst{29} = setFlags; + let Inst{28-24} = 0b10001; + let Inst{23-22} = imm{13-12}; // '00' => lsl #0, '01' => lsl #12 + let Inst{21-10} = imm{11-0}; + let Inst{9-5} = Rn; + let Inst{4-0} = Rd; + let DecoderMethod = "DecodeBaseAddSubImm"; +} + +class BaseAddSubRegPseudo + : Pseudo<(outs regtype:$Rd), (ins regtype:$Rn, regtype:$Rm), + [(set regtype:$Rd, (OpNode regtype:$Rn, regtype:$Rm))]>, + Sched<[WriteI, ReadI, ReadI]>; + +class BaseAddSubSReg + : I<(outs regtype:$Rd), (ins regtype:$Rn, shifted_regtype:$Rm), + asm, "\t$Rd, $Rn, $Rm", "", + [(set regtype:$Rd, (OpNode regtype:$Rn, shifted_regtype:$Rm))]>, + Sched<[WriteISReg, ReadI, ReadISReg]> { + // The operands are in order to match the 'addr' MI operands, so we + // don't need an encoder method and by-name matching. Just use the default + // in-order handling. Since we're using by-order, make sure the names + // do not match. 
+ bits<5> dst; + bits<5> src1; + bits<5> src2; + bits<8> shift; + let Inst{30} = isSub; + let Inst{29} = setFlags; + let Inst{28-24} = 0b01011; + let Inst{23-22} = shift{7-6}; + let Inst{21} = 0; + let Inst{20-16} = src2; + let Inst{15-10} = shift{5-0}; + let Inst{9-5} = src1; + let Inst{4-0} = dst; + + let DecoderMethod = "DecodeThreeAddrSRegInstruction"; +} + +class BaseAddSubEReg + : I<(outs dstRegtype:$R1), + (ins src1Regtype:$R2, src2Regtype:$R3), + asm, "\t$R1, $R2, $R3", "", + [(set dstRegtype:$R1, (OpNode src1Regtype:$R2, src2Regtype:$R3))]>, + Sched<[WriteIEReg, ReadI, ReadIEReg]> { + bits<5> Rd; + bits<5> Rn; + bits<5> Rm; + bits<6> ext; + let Inst{30} = isSub; + let Inst{29} = setFlags; + let Inst{28-24} = 0b01011; + let Inst{23-21} = 0b001; + let Inst{20-16} = Rm; + let Inst{15-13} = ext{5-3}; + let Inst{12-10} = ext{2-0}; + let Inst{9-5} = Rn; + let Inst{4-0} = Rd; + + let DecoderMethod = "DecodeAddSubERegInstruction"; +} + +let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in +class BaseAddSubEReg64 + : I<(outs dstRegtype:$Rd), + (ins src1Regtype:$Rn, src2Regtype:$Rm, ext_op:$ext), + asm, "\t$Rd, $Rn, $Rm$ext", "", []>, + Sched<[WriteIEReg, ReadI, ReadIEReg]> { + bits<5> Rd; + bits<5> Rn; + bits<5> Rm; + bits<6> ext; + let Inst{30} = isSub; + let Inst{29} = setFlags; + let Inst{28-24} = 0b01011; + let Inst{23-21} = 0b001; + let Inst{20-16} = Rm; + let Inst{15} = ext{5}; + let Inst{12-10} = ext{2-0}; + let Inst{9-5} = Rn; + let Inst{4-0} = Rd; + + let DecoderMethod = "DecodeAddSubERegInstruction"; +} + +// Aliases for register+register add/subtract. +class AddSubRegAlias + : InstAlias; + +multiclass AddSub { + let hasSideEffects = 0 in { + // Add/Subtract immediate + def Wri : BaseAddSubImm { + let Inst{31} = 0; + } + def Xri : BaseAddSubImm { + let Inst{31} = 1; + } + + // Add/Subtract register - Only used for CodeGen + def Wrr : BaseAddSubRegPseudo; + def Xrr : BaseAddSubRegPseudo; + + // Add/Subtract shifted register + def Wrs : BaseAddSubSReg { + let Inst{31} = 0; + } + def Xrs : BaseAddSubSReg { + let Inst{31} = 1; + } + } + + // Add/Subtract extended register + let AddedComplexity = 1, hasSideEffects = 0 in { + def Wrx : BaseAddSubEReg, mnemonic, OpNode> { + let Inst{31} = 0; + } + def Xrx : BaseAddSubEReg, mnemonic, OpNode> { + let Inst{31} = 1; + } + } + + def Xrx64 : BaseAddSubEReg64 { + // UXTX and SXTX only. + let Inst{14-13} = 0b11; + let Inst{31} = 1; + } + + // Register/register aliases with no shift when SP is not used. + def : AddSubRegAlias(NAME#"Wrs"), + GPR32, GPR32, GPR32, 0>; + def : AddSubRegAlias(NAME#"Xrs"), + GPR64, GPR64, GPR64, 0>; + + // Register/register aliases with no shift when either the destination or + // first source register is SP. 
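+  // In the shifted-register encoding, register number 31 means WZR/XZR, so
+  // SP is only reachable through the extended-register forms; e.g.
+  // "add sp, x0, x1" is accepted via the Xrx64 form below with an implicit
+  // UXTX #0 (the 16/24 operands encode UXTW #0 / UXTX #0, as the comments
+  // note).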
+ def : AddSubRegAlias(NAME#"Wrx"), + GPR32sponly, GPR32sp, GPR32, 16>; // UXTW #0 + def : AddSubRegAlias(NAME#"Wrx"), + GPR32sp, GPR32sponly, GPR32, 16>; // UXTW #0 + def : AddSubRegAlias(NAME#"Xrx64"), + GPR64sponly, GPR64sp, GPR64, 24>; // UXTX #0 + def : AddSubRegAlias(NAME#"Xrx64"), + GPR64sp, GPR64sponly, GPR64, 24>; // UXTX #0 +} + +multiclass AddSubS { + let isCompare = 1, Defs = [NZCV] in { + // Add/Subtract immediate + def Wri : BaseAddSubImm { + let Inst{31} = 0; + } + def Xri : BaseAddSubImm { + let Inst{31} = 1; + } + + // Add/Subtract register + def Wrr : BaseAddSubRegPseudo; + def Xrr : BaseAddSubRegPseudo; + + // Add/Subtract shifted register + def Wrs : BaseAddSubSReg { + let Inst{31} = 0; + } + def Xrs : BaseAddSubSReg { + let Inst{31} = 1; + } + + // Add/Subtract extended register + let AddedComplexity = 1 in { + def Wrx : BaseAddSubEReg, mnemonic, OpNode> { + let Inst{31} = 0; + } + def Xrx : BaseAddSubEReg, mnemonic, OpNode> { + let Inst{31} = 1; + } + } + + def Xrx64 : BaseAddSubEReg64 { + // UXTX and SXTX only. + let Inst{14-13} = 0b11; + let Inst{31} = 1; + } + } // Defs = [NZCV] + + // Compare aliases + def : InstAlias(NAME#"Wri") + WZR, GPR32sp:$src, addsub_shifted_imm32:$imm), 5>; + def : InstAlias(NAME#"Xri") + XZR, GPR64sp:$src, addsub_shifted_imm64:$imm), 5>; + def : InstAlias(NAME#"Wrx") + WZR, GPR32sp:$src1, GPR32:$src2, arith_extend:$sh), 4>; + def : InstAlias(NAME#"Xrx") + XZR, GPR64sp:$src1, GPR32:$src2, arith_extend:$sh), 4>; + def : InstAlias(NAME#"Xrx64") + XZR, GPR64sp:$src1, GPR64:$src2, arith_extendlsl64:$sh), 4>; + def : InstAlias(NAME#"Wrs") + WZR, GPR32:$src1, GPR32:$src2, arith_shift32:$sh), 4>; + def : InstAlias(NAME#"Xrs") + XZR, GPR64:$src1, GPR64:$src2, arith_shift64:$sh), 4>; + + // Compare shorthands + def : InstAlias(NAME#"Wrs") + WZR, GPR32:$src1, GPR32:$src2, 0), 5>; + def : InstAlias(NAME#"Xrs") + XZR, GPR64:$src1, GPR64:$src2, 0), 5>; + + // Register/register aliases with no shift when SP is not used. + def : AddSubRegAlias(NAME#"Wrs"), + GPR32, GPR32, GPR32, 0>; + def : AddSubRegAlias(NAME#"Xrs"), + GPR64, GPR64, GPR64, 0>; + + // Register/register aliases with no shift when the first source register + // is SP. + def : AddSubRegAlias(NAME#"Wrx"), + GPR32, GPR32sponly, GPR32, 16>; // UXTW #0 + def : AddSubRegAlias(NAME#"Xrx64"), + GPR64, GPR64sponly, GPR64, 24>; // UXTX #0 +} + +//--- +// Extract +//--- +def SDTA64EXTR : SDTypeProfile<1, 3, [SDTCisSameAs<0, 1>, SDTCisSameAs<0, 2>, + SDTCisPtrTy<3>]>; +def AArch64Extr : SDNode<"AArch64ISD::EXTR", SDTA64EXTR>; + +class BaseExtractImm patterns> + : I<(outs regtype:$Rd), (ins regtype:$Rn, regtype:$Rm, imm_type:$imm), + asm, "\t$Rd, $Rn, $Rm, $imm", "", patterns>, + Sched<[WriteExtr, ReadExtrHi]> { + bits<5> Rd; + bits<5> Rn; + bits<5> Rm; + bits<6> imm; + + let Inst{30-23} = 0b00100111; + let Inst{21} = 0; + let Inst{20-16} = Rm; + let Inst{15-10} = imm; + let Inst{9-5} = Rn; + let Inst{4-0} = Rd; +} + +multiclass ExtractImm { + def Wrri : BaseExtractImm { + let Inst{31} = 0; + let Inst{22} = 0; + // imm<5> must be zero. 
+ let imm{5} = 0; + } + def Xrri : BaseExtractImm { + + let Inst{31} = 1; + let Inst{22} = 1; + } +} + +//--- +// Bitfield +//--- + +let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in +class BaseBitfieldImm opc, + RegisterClass regtype, Operand imm_type, string asm> + : I<(outs regtype:$Rd), (ins regtype:$Rn, imm_type:$immr, imm_type:$imms), + asm, "\t$Rd, $Rn, $immr, $imms", "", []>, + Sched<[WriteIS, ReadI]> { + bits<5> Rd; + bits<5> Rn; + bits<6> immr; + bits<6> imms; + + let Inst{30-29} = opc; + let Inst{28-23} = 0b100110; + let Inst{21-16} = immr; + let Inst{15-10} = imms; + let Inst{9-5} = Rn; + let Inst{4-0} = Rd; +} + +multiclass BitfieldImm opc, string asm> { + def Wri : BaseBitfieldImm { + let Inst{31} = 0; + let Inst{22} = 0; + // imms<5> and immr<5> must be zero, else ReservedValue(). + let Inst{21} = 0; + let Inst{15} = 0; + } + def Xri : BaseBitfieldImm { + let Inst{31} = 1; + let Inst{22} = 1; + } +} + +let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in +class BaseBitfieldImmWith2RegArgs opc, + RegisterClass regtype, Operand imm_type, string asm> + : I<(outs regtype:$Rd), (ins regtype:$src, regtype:$Rn, imm_type:$immr, + imm_type:$imms), + asm, "\t$Rd, $Rn, $immr, $imms", "$src = $Rd", []>, + Sched<[WriteIS, ReadI]> { + bits<5> Rd; + bits<5> Rn; + bits<6> immr; + bits<6> imms; + + let Inst{30-29} = opc; + let Inst{28-23} = 0b100110; + let Inst{21-16} = immr; + let Inst{15-10} = imms; + let Inst{9-5} = Rn; + let Inst{4-0} = Rd; +} + +multiclass BitfieldImmWith2RegArgs opc, string asm> { + def Wri : BaseBitfieldImmWith2RegArgs { + let Inst{31} = 0; + let Inst{22} = 0; + // imms<5> and immr<5> must be zero, else ReservedValue(). + let Inst{21} = 0; + let Inst{15} = 0; + } + def Xri : BaseBitfieldImmWith2RegArgs { + let Inst{31} = 1; + let Inst{22} = 1; + } +} + +//--- +// Logical +//--- + +// Logical (immediate) +class BaseLogicalImm opc, RegisterClass dregtype, + RegisterClass sregtype, Operand imm_type, string asm, + list pattern> + : I<(outs dregtype:$Rd), (ins sregtype:$Rn, imm_type:$imm), + asm, "\t$Rd, $Rn, $imm", "", pattern>, + Sched<[WriteI, ReadI]> { + bits<5> Rd; + bits<5> Rn; + bits<13> imm; + let Inst{30-29} = opc; + let Inst{28-23} = 0b100100; + let Inst{22} = imm{12}; + let Inst{21-16} = imm{11-6}; + let Inst{15-10} = imm{5-0}; + let Inst{9-5} = Rn; + let Inst{4-0} = Rd; + + let DecoderMethod = "DecodeLogicalImmInstruction"; +} + +// Logical (shifted register) +class BaseLogicalSReg opc, bit N, RegisterClass regtype, + logical_shifted_reg shifted_regtype, string asm, + list pattern> + : I<(outs regtype:$Rd), (ins regtype:$Rn, shifted_regtype:$Rm), + asm, "\t$Rd, $Rn, $Rm", "", pattern>, + Sched<[WriteISReg, ReadI, ReadISReg]> { + // The operands are in order to match the 'addr' MI operands, so we + // don't need an encoder method and by-name matching. Just use the default + // in-order handling. Since we're using by-order, make sure the names + // do not match. + bits<5> dst; + bits<5> src1; + bits<5> src2; + bits<8> shift; + let Inst{30-29} = opc; + let Inst{28-24} = 0b01010; + let Inst{23-22} = shift{7-6}; + let Inst{21} = N; + let Inst{20-16} = src2; + let Inst{15-10} = shift{5-0}; + let Inst{9-5} = src1; + let Inst{4-0} = dst; + + let DecoderMethod = "DecodeThreeAddrSRegInstruction"; +} + +// Aliases for register+register logical instructions. 
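+// These play the same role as AddSubRegAlias above: e.g. "and w0, w1, w2"
+// is parsed and printed as the Wrs (shifted-register) form with a shift
+// amount of 0.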
+class LogicalRegAlias + : InstAlias; + +let AddedComplexity = 6 in +multiclass LogicalImm opc, string mnemonic, SDNode OpNode> { + def Wri : BaseLogicalImm { + let Inst{31} = 0; + let Inst{22} = 0; // 64-bit version has an additional bit of immediate. + } + def Xri : BaseLogicalImm { + let Inst{31} = 1; + } +} + +multiclass LogicalImmS opc, string mnemonic, SDNode OpNode> { + let isCompare = 1, Defs = [NZCV] in { + def Wri : BaseLogicalImm { + let Inst{31} = 0; + let Inst{22} = 0; // 64-bit version has an additional bit of immediate. + } + def Xri : BaseLogicalImm { + let Inst{31} = 1; + } + } // end Defs = [NZCV] +} + +class BaseLogicalRegPseudo + : Pseudo<(outs regtype:$Rd), (ins regtype:$Rn, regtype:$Rm), + [(set regtype:$Rd, (OpNode regtype:$Rn, regtype:$Rm))]>, + Sched<[WriteI, ReadI, ReadI]>; + +// Split from LogicalImm as not all instructions have both. +multiclass LogicalReg opc, bit N, string mnemonic, + SDPatternOperator OpNode> { + def Wrr : BaseLogicalRegPseudo; + def Xrr : BaseLogicalRegPseudo; + + def Wrs : BaseLogicalSReg { + let Inst{31} = 0; + } + def Xrs : BaseLogicalSReg { + let Inst{31} = 1; + } + + def : LogicalRegAlias(NAME#"Wrs"), GPR32>; + def : LogicalRegAlias(NAME#"Xrs"), GPR64>; +} + +// Split from LogicalReg to allow setting NZCV Defs +multiclass LogicalRegS opc, bit N, string mnemonic, + SDPatternOperator OpNode = null_frag> { + let Defs = [NZCV], mayLoad = 0, mayStore = 0, hasSideEffects = 0 in { + def Wrr : BaseLogicalRegPseudo; + def Xrr : BaseLogicalRegPseudo; + + def Wrs : BaseLogicalSReg { + let Inst{31} = 0; + } + def Xrs : BaseLogicalSReg { + let Inst{31} = 1; + } + } // Defs = [NZCV] + + def : LogicalRegAlias(NAME#"Wrs"), GPR32>; + def : LogicalRegAlias(NAME#"Xrs"), GPR64>; +} + +//--- +// Conditionally set flags +//--- + +let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in +class BaseCondSetFlagsImm + : I<(outs), (ins regtype:$Rn, imm0_31:$imm, imm0_15:$nzcv, ccode:$cond), + asm, "\t$Rn, $imm, $nzcv, $cond", "", []>, + Sched<[WriteI, ReadI]> { + let Uses = [NZCV]; + let Defs = [NZCV]; + + bits<5> Rn; + bits<5> imm; + bits<4> nzcv; + bits<4> cond; + + let Inst{30} = op; + let Inst{29-21} = 0b111010010; + let Inst{20-16} = imm; + let Inst{15-12} = cond; + let Inst{11-10} = 0b10; + let Inst{9-5} = Rn; + let Inst{4} = 0b0; + let Inst{3-0} = nzcv; +} + +multiclass CondSetFlagsImm { + def Wi : BaseCondSetFlagsImm { + let Inst{31} = 0; + } + def Xi : BaseCondSetFlagsImm { + let Inst{31} = 1; + } +} + +let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in +class BaseCondSetFlagsReg + : I<(outs), (ins regtype:$Rn, regtype:$Rm, imm0_15:$nzcv, ccode:$cond), + asm, "\t$Rn, $Rm, $nzcv, $cond", "", []>, + Sched<[WriteI, ReadI, ReadI]> { + let Uses = [NZCV]; + let Defs = [NZCV]; + + bits<5> Rn; + bits<5> Rm; + bits<4> nzcv; + bits<4> cond; + + let Inst{30} = op; + let Inst{29-21} = 0b111010010; + let Inst{20-16} = Rm; + let Inst{15-12} = cond; + let Inst{11-10} = 0b00; + let Inst{9-5} = Rn; + let Inst{4} = 0b0; + let Inst{3-0} = nzcv; +} + +multiclass CondSetFlagsReg { + def Wr : BaseCondSetFlagsReg { + let Inst{31} = 0; + } + def Xr : BaseCondSetFlagsReg { + let Inst{31} = 1; + } +} + +//--- +// Conditional select +//--- + +class BaseCondSelect op2, RegisterClass regtype, string asm> + : I<(outs regtype:$Rd), (ins regtype:$Rn, regtype:$Rm, ccode:$cond), + asm, "\t$Rd, $Rn, $Rm, $cond", "", + [(set regtype:$Rd, + (AArch64csel regtype:$Rn, regtype:$Rm, (i32 imm:$cond), NZCV))]>, + Sched<[WriteI, ReadI, ReadI]> { + let Uses = [NZCV]; + + bits<5> Rd; + bits<5> Rn; + 
bits<5> Rm; + bits<4> cond; + + let Inst{30} = op; + let Inst{29-21} = 0b011010100; + let Inst{20-16} = Rm; + let Inst{15-12} = cond; + let Inst{11-10} = op2; + let Inst{9-5} = Rn; + let Inst{4-0} = Rd; +} + +multiclass CondSelect op2, string asm> { + def Wr : BaseCondSelect { + let Inst{31} = 0; + } + def Xr : BaseCondSelect { + let Inst{31} = 1; + } +} + +class BaseCondSelectOp op2, RegisterClass regtype, string asm, + PatFrag frag> + : I<(outs regtype:$Rd), (ins regtype:$Rn, regtype:$Rm, ccode:$cond), + asm, "\t$Rd, $Rn, $Rm, $cond", "", + [(set regtype:$Rd, + (AArch64csel regtype:$Rn, (frag regtype:$Rm), + (i32 imm:$cond), NZCV))]>, + Sched<[WriteI, ReadI, ReadI]> { + let Uses = [NZCV]; + + bits<5> Rd; + bits<5> Rn; + bits<5> Rm; + bits<4> cond; + + let Inst{30} = op; + let Inst{29-21} = 0b011010100; + let Inst{20-16} = Rm; + let Inst{15-12} = cond; + let Inst{11-10} = op2; + let Inst{9-5} = Rn; + let Inst{4-0} = Rd; +} + +def inv_cond_XFORM : SDNodeXForm(N->getZExtValue()); + return CurDAG->getTargetConstant(AArch64CC::getInvertedCondCode(CC), MVT::i32); +}]>; + +multiclass CondSelectOp op2, string asm, PatFrag frag> { + def Wr : BaseCondSelectOp { + let Inst{31} = 0; + } + def Xr : BaseCondSelectOp { + let Inst{31} = 1; + } + + def : Pat<(AArch64csel (frag GPR32:$Rm), GPR32:$Rn, (i32 imm:$cond), NZCV), + (!cast(NAME # Wr) GPR32:$Rn, GPR32:$Rm, + (inv_cond_XFORM imm:$cond))>; + + def : Pat<(AArch64csel (frag GPR64:$Rm), GPR64:$Rn, (i32 imm:$cond), NZCV), + (!cast(NAME # Xr) GPR64:$Rn, GPR64:$Rm, + (inv_cond_XFORM imm:$cond))>; +} + +//--- +// Special Mask Value +//--- +def maski8_or_more : Operand, + ImmLeaf { +} +def maski16_or_more : Operand, + ImmLeaf { +} + + +//--- +// Load/store +//--- + +// (unsigned immediate) +// Indexed for 8-bit registers. offset is in range [0,4095]. 
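+// The wider accesses below follow the same pattern with the byte offset
+// scaled by the access size, e.g. "ldr x0, [x1, #16]" uses uimm12s8 and
+// encodes 2 in the offset field; byte offsets that are not a multiple of
+// the access size are handled by the unscaled (LDUR/STUR) forms further
+// down in this file.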
+def am_indexed8 : ComplexPattern; +def am_indexed16 : ComplexPattern; +def am_indexed32 : ComplexPattern; +def am_indexed64 : ComplexPattern; +def am_indexed128 : ComplexPattern; + +class UImm12OffsetOperand : AsmOperandClass { + let Name = "UImm12Offset" # Scale; + let RenderMethod = "addUImm12OffsetOperands<" # Scale # ">"; + let PredicateMethod = "isUImm12Offset<" # Scale # ">"; + let DiagnosticType = "InvalidMemoryIndexed" # Scale; +} + +def UImm12OffsetScale1Operand : UImm12OffsetOperand<1>; +def UImm12OffsetScale2Operand : UImm12OffsetOperand<2>; +def UImm12OffsetScale4Operand : UImm12OffsetOperand<4>; +def UImm12OffsetScale8Operand : UImm12OffsetOperand<8>; +def UImm12OffsetScale16Operand : UImm12OffsetOperand<16>; + +class uimm12_scaled : Operand { + let ParserMatchClass + = !cast("UImm12OffsetScale" # Scale # "Operand"); + let EncoderMethod + = "getLdStUImm12OpValue"; + let PrintMethod = "printUImm12Offset<" # Scale # ">"; +} + +def uimm12s1 : uimm12_scaled<1>; +def uimm12s2 : uimm12_scaled<2>; +def uimm12s4 : uimm12_scaled<4>; +def uimm12s8 : uimm12_scaled<8>; +def uimm12s16 : uimm12_scaled<16>; + +class BaseLoadStoreUI sz, bit V, bits<2> opc, dag oops, dag iops, + string asm, list pattern> + : I { + bits<5> Rt; + + bits<5> Rn; + bits<12> offset; + + let Inst{31-30} = sz; + let Inst{29-27} = 0b111; + let Inst{26} = V; + let Inst{25-24} = 0b01; + let Inst{23-22} = opc; + let Inst{21-10} = offset; + let Inst{9-5} = Rn; + let Inst{4-0} = Rt; + + let DecoderMethod = "DecodeUnsignedLdStInstruction"; +} + +multiclass LoadUI sz, bit V, bits<2> opc, RegisterClass regtype, + Operand indextype, string asm, list pattern> { + let AddedComplexity = 10, mayLoad = 1, mayStore = 0, hasSideEffects = 0 in + def ui : BaseLoadStoreUI, + Sched<[WriteLD]>; + + def : InstAlias(NAME # "ui") regtype:$Rt, GPR64sp:$Rn, 0)>; +} + +multiclass StoreUI sz, bit V, bits<2> opc, RegisterClass regtype, + Operand indextype, string asm, list pattern> { + let AddedComplexity = 10, mayLoad = 0, mayStore = 1, hasSideEffects = 0 in + def ui : BaseLoadStoreUI, + Sched<[WriteST]>; + + def : InstAlias(NAME # "ui") regtype:$Rt, GPR64sp:$Rn, 0)>; +} + +def PrefetchOperand : AsmOperandClass { + let Name = "Prefetch"; + let ParserMethod = "tryParsePrefetch"; +} +def prfop : Operand { + let PrintMethod = "printPrefetchOp"; + let ParserMatchClass = PrefetchOperand; +} + +let mayLoad = 0, mayStore = 0, hasSideEffects = 1 in +class PrefetchUI sz, bit V, bits<2> opc, string asm, list pat> + : BaseLoadStoreUI, + Sched<[WriteLD]>; + +//--- +// Load literal +//--- + +// Load literal address: 19-bit immediate. The low two bits of the target +// offset are implied zero and so are not part of the immediate. 
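+// That gives a PC-relative range of +/-1MiB (19 signed bits scaled by 4);
+// e.g. "ldr x0, lbl" can only reach labels within 1MiB of the instruction.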
+def am_ldrlit : Operand { + let EncoderMethod = "getLoadLiteralOpValue"; + let DecoderMethod = "DecodePCRelLabel19"; + let PrintMethod = "printAlignedLabel"; + let ParserMatchClass = PCRelLabel19Operand; +} + +let mayLoad = 1, mayStore = 0, hasSideEffects = 0 in +class LoadLiteral opc, bit V, RegisterClass regtype, string asm> + : I<(outs regtype:$Rt), (ins am_ldrlit:$label), + asm, "\t$Rt, $label", "", []>, + Sched<[WriteLD]> { + bits<5> Rt; + bits<19> label; + let Inst{31-30} = opc; + let Inst{29-27} = 0b011; + let Inst{26} = V; + let Inst{25-24} = 0b00; + let Inst{23-5} = label; + let Inst{4-0} = Rt; +} + +let mayLoad = 0, mayStore = 0, hasSideEffects = 1 in +class PrefetchLiteral opc, bit V, string asm, list pat> + : I<(outs), (ins prfop:$Rt, am_ldrlit:$label), + asm, "\t$Rt, $label", "", pat>, + Sched<[WriteLD]> { + bits<5> Rt; + bits<19> label; + let Inst{31-30} = opc; + let Inst{29-27} = 0b011; + let Inst{26} = V; + let Inst{25-24} = 0b00; + let Inst{23-5} = label; + let Inst{4-0} = Rt; +} + +//--- +// Load/store register offset +//--- + +def ro_Xindexed8 : ComplexPattern", []>; +def ro_Xindexed16 : ComplexPattern", []>; +def ro_Xindexed32 : ComplexPattern", []>; +def ro_Xindexed64 : ComplexPattern", []>; +def ro_Xindexed128 : ComplexPattern", []>; + +def ro_Windexed8 : ComplexPattern", []>; +def ro_Windexed16 : ComplexPattern", []>; +def ro_Windexed32 : ComplexPattern", []>; +def ro_Windexed64 : ComplexPattern", []>; +def ro_Windexed128 : ComplexPattern", []>; + +class MemExtendOperand : AsmOperandClass { + let Name = "Mem" # Reg # "Extend" # Width; + let PredicateMethod = "isMem" # Reg # "Extend<" # Width # ">"; + let RenderMethod = "addMemExtendOperands"; + let DiagnosticType = "InvalidMemory" # Reg # "Extend" # Width; +} + +def MemWExtend8Operand : MemExtendOperand<"W", 8> { + // The address "[x0, x1, lsl #0]" actually maps to the variant which performs + // the trivial shift. + let RenderMethod = "addMemExtend8Operands"; +} +def MemWExtend16Operand : MemExtendOperand<"W", 16>; +def MemWExtend32Operand : MemExtendOperand<"W", 32>; +def MemWExtend64Operand : MemExtendOperand<"W", 64>; +def MemWExtend128Operand : MemExtendOperand<"W", 128>; + +def MemXExtend8Operand : MemExtendOperand<"X", 8> { + // The address "[x0, x1, lsl #0]" actually maps to the variant which performs + // the trivial shift. + let RenderMethod = "addMemExtend8Operands"; +} +def MemXExtend16Operand : MemExtendOperand<"X", 16>; +def MemXExtend32Operand : MemExtendOperand<"X", 32>; +def MemXExtend64Operand : MemExtendOperand<"X", 64>; +def MemXExtend128Operand : MemExtendOperand<"X", 128>; + +class ro_extend + : Operand { + let ParserMatchClass = ParserClass; + let PrintMethod = "printMemExtend<'" # Reg # "', " # Width # ">"; + let DecoderMethod = "DecodeMemExtend"; + let EncoderMethod = "getMemExtendOpValue"; + let MIOperandInfo = (ops i32imm:$signed, i32imm:$doshift); +} + +def ro_Wextend8 : ro_extend; +def ro_Wextend16 : ro_extend; +def ro_Wextend32 : ro_extend; +def ro_Wextend64 : ro_extend; +def ro_Wextend128 : ro_extend; + +def ro_Xextend8 : ro_extend; +def ro_Xextend16 : ro_extend; +def ro_Xextend32 : ro_extend; +def ro_Xextend64 : ro_extend; +def ro_Xextend128 : ro_extend; + +class ROAddrMode { + // CodeGen-level pattern covering the entire addressing mode. + ComplexPattern Wpat = windex; + ComplexPattern Xpat = xindex; + + // Asm-level Operand covering the valid "uxtw #3" style syntax. 
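+  // When an explicit shift amount is written it must be zero or log2 of the
+  // access size, e.g. "ldr x0, [x1, x2, lsl #3]" or "ldr x0, [x1, w2, uxtw #3]"
+  // for an 8-byte load.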
+ Operand Wext = wextend; + Operand Xext = xextend; +} + +def ro8 : ROAddrMode; +def ro16 : ROAddrMode; +def ro32 : ROAddrMode; +def ro64 : ROAddrMode; +def ro128 : ROAddrMode; + +class LoadStore8RO sz, bit V, bits<2> opc, RegisterClass regtype, + string asm, dag ins, dag outs, list pat> + : I { + bits<5> Rt; + bits<5> Rn; + bits<5> Rm; + bits<2> extend; + let Inst{31-30} = sz; + let Inst{29-27} = 0b111; + let Inst{26} = V; + let Inst{25-24} = 0b00; + let Inst{23-22} = opc; + let Inst{21} = 1; + let Inst{20-16} = Rm; + let Inst{15} = extend{1}; // sign extend Rm? + let Inst{14} = 1; + let Inst{12} = extend{0}; // do shift? + let Inst{11-10} = 0b10; + let Inst{9-5} = Rn; + let Inst{4-0} = Rt; +} + +class ROInstAlias + : InstAlias; + +multiclass Load8RO sz, bit V, bits<2> opc, RegisterClass regtype, + string asm, ValueType Ty, SDPatternOperator loadop> { + let AddedComplexity = 10 in + def roW : LoadStore8RO, + Sched<[WriteLDIdx, ReadAdrBase]> { + let Inst{13} = 0b0; + } + + let AddedComplexity = 10 in + def roX : LoadStore8RO, + Sched<[WriteLDIdx, ReadAdrBase]> { + let Inst{13} = 0b1; + } + + def : ROInstAlias(NAME # "roX")>; +} + +multiclass Store8RO sz, bit V, bits<2> opc, RegisterClass regtype, + string asm, ValueType Ty, SDPatternOperator storeop> { + let AddedComplexity = 10 in + def roW : LoadStore8RO, + Sched<[WriteSTIdx, ReadAdrBase]> { + let Inst{13} = 0b0; + } + + let AddedComplexity = 10 in + def roX : LoadStore8RO, + Sched<[WriteSTIdx, ReadAdrBase]> { + let Inst{13} = 0b1; + } + + def : ROInstAlias(NAME # "roX")>; +} + +class LoadStore16RO sz, bit V, bits<2> opc, RegisterClass regtype, + string asm, dag ins, dag outs, list pat> + : I { + bits<5> Rt; + bits<5> Rn; + bits<5> Rm; + bits<2> extend; + let Inst{31-30} = sz; + let Inst{29-27} = 0b111; + let Inst{26} = V; + let Inst{25-24} = 0b00; + let Inst{23-22} = opc; + let Inst{21} = 1; + let Inst{20-16} = Rm; + let Inst{15} = extend{1}; // sign extend Rm? + let Inst{14} = 1; + let Inst{12} = extend{0}; // do shift? + let Inst{11-10} = 0b10; + let Inst{9-5} = Rn; + let Inst{4-0} = Rt; +} + +multiclass Load16RO sz, bit V, bits<2> opc, RegisterClass regtype, + string asm, ValueType Ty, SDPatternOperator loadop> { + let AddedComplexity = 10 in + def roW : LoadStore16RO, + Sched<[WriteLDIdx, ReadAdrBase]> { + let Inst{13} = 0b0; + } + + let AddedComplexity = 10 in + def roX : LoadStore16RO, + Sched<[WriteLDIdx, ReadAdrBase]> { + let Inst{13} = 0b1; + } + + def : ROInstAlias(NAME # "roX")>; +} + +multiclass Store16RO sz, bit V, bits<2> opc, RegisterClass regtype, + string asm, ValueType Ty, SDPatternOperator storeop> { + let AddedComplexity = 10 in + def roW : LoadStore16RO, + Sched<[WriteSTIdx, ReadAdrBase]> { + let Inst{13} = 0b0; + } + + let AddedComplexity = 10 in + def roX : LoadStore16RO, + Sched<[WriteSTIdx, ReadAdrBase]> { + let Inst{13} = 0b1; + } + + def : ROInstAlias(NAME # "roX")>; +} + +class LoadStore32RO sz, bit V, bits<2> opc, RegisterClass regtype, + string asm, dag ins, dag outs, list pat> + : I { + bits<5> Rt; + bits<5> Rn; + bits<5> Rm; + bits<2> extend; + let Inst{31-30} = sz; + let Inst{29-27} = 0b111; + let Inst{26} = V; + let Inst{25-24} = 0b00; + let Inst{23-22} = opc; + let Inst{21} = 1; + let Inst{20-16} = Rm; + let Inst{15} = extend{1}; // sign extend Rm? + let Inst{14} = 1; + let Inst{12} = extend{0}; // do shift? 
+ let Inst{11-10} = 0b10; + let Inst{9-5} = Rn; + let Inst{4-0} = Rt; +} + +multiclass Load32RO sz, bit V, bits<2> opc, RegisterClass regtype, + string asm, ValueType Ty, SDPatternOperator loadop> { + let AddedComplexity = 10 in + def roW : LoadStore32RO, + Sched<[WriteLDIdx, ReadAdrBase]> { + let Inst{13} = 0b0; + } + + let AddedComplexity = 10 in + def roX : LoadStore32RO, + Sched<[WriteLDIdx, ReadAdrBase]> { + let Inst{13} = 0b1; + } + + def : ROInstAlias(NAME # "roX")>; +} + +multiclass Store32RO sz, bit V, bits<2> opc, RegisterClass regtype, + string asm, ValueType Ty, SDPatternOperator storeop> { + let AddedComplexity = 10 in + def roW : LoadStore32RO, + Sched<[WriteSTIdx, ReadAdrBase]> { + let Inst{13} = 0b0; + } + + let AddedComplexity = 10 in + def roX : LoadStore32RO, + Sched<[WriteSTIdx, ReadAdrBase]> { + let Inst{13} = 0b1; + } + + def : ROInstAlias(NAME # "roX")>; +} + +class LoadStore64RO sz, bit V, bits<2> opc, RegisterClass regtype, + string asm, dag ins, dag outs, list pat> + : I { + bits<5> Rt; + bits<5> Rn; + bits<5> Rm; + bits<2> extend; + let Inst{31-30} = sz; + let Inst{29-27} = 0b111; + let Inst{26} = V; + let Inst{25-24} = 0b00; + let Inst{23-22} = opc; + let Inst{21} = 1; + let Inst{20-16} = Rm; + let Inst{15} = extend{1}; // sign extend Rm? + let Inst{14} = 1; + let Inst{12} = extend{0}; // do shift? + let Inst{11-10} = 0b10; + let Inst{9-5} = Rn; + let Inst{4-0} = Rt; +} + +multiclass Load64RO sz, bit V, bits<2> opc, RegisterClass regtype, + string asm, ValueType Ty, SDPatternOperator loadop> { + let AddedComplexity = 10, mayLoad = 1, mayStore = 0, hasSideEffects = 0 in + def roW : LoadStore64RO, + Sched<[WriteLDIdx, ReadAdrBase]> { + let Inst{13} = 0b0; + } + + let AddedComplexity = 10, mayLoad = 1, mayStore = 0, hasSideEffects = 0 in + def roX : LoadStore64RO, + Sched<[WriteLDIdx, ReadAdrBase]> { + let Inst{13} = 0b1; + } + + def : ROInstAlias(NAME # "roX")>; +} + +multiclass Store64RO sz, bit V, bits<2> opc, RegisterClass regtype, + string asm, ValueType Ty, SDPatternOperator storeop> { + let AddedComplexity = 10, mayLoad = 0, mayStore = 1, hasSideEffects = 0 in + def roW : LoadStore64RO, + Sched<[WriteSTIdx, ReadAdrBase]> { + let Inst{13} = 0b0; + } + + let AddedComplexity = 10, mayLoad = 0, mayStore = 1, hasSideEffects = 0 in + def roX : LoadStore64RO, + Sched<[WriteSTIdx, ReadAdrBase]> { + let Inst{13} = 0b1; + } + + def : ROInstAlias(NAME # "roX")>; +} + +class LoadStore128RO sz, bit V, bits<2> opc, RegisterClass regtype, + string asm, dag ins, dag outs, list pat> + : I { + bits<5> Rt; + bits<5> Rn; + bits<5> Rm; + bits<2> extend; + let Inst{31-30} = sz; + let Inst{29-27} = 0b111; + let Inst{26} = V; + let Inst{25-24} = 0b00; + let Inst{23-22} = opc; + let Inst{21} = 1; + let Inst{20-16} = Rm; + let Inst{15} = extend{1}; // sign extend Rm? + let Inst{14} = 1; + let Inst{12} = extend{0}; // do shift? 
+ let Inst{11-10} = 0b10; + let Inst{9-5} = Rn; + let Inst{4-0} = Rt; +} + +multiclass Load128RO sz, bit V, bits<2> opc, RegisterClass regtype, + string asm, ValueType Ty, SDPatternOperator loadop> { + let AddedComplexity = 10, mayLoad = 1, mayStore = 0, hasSideEffects = 0 in + def roW : LoadStore128RO, + Sched<[WriteLDIdx, ReadAdrBase]> { + let Inst{13} = 0b0; + } + + let AddedComplexity = 10, mayLoad = 1, mayStore = 0, hasSideEffects = 0 in + def roX : LoadStore128RO, + Sched<[WriteLDIdx, ReadAdrBase]> { + let Inst{13} = 0b1; + } + + def : ROInstAlias(NAME # "roX")>; +} + +multiclass Store128RO sz, bit V, bits<2> opc, RegisterClass regtype, + string asm, ValueType Ty, SDPatternOperator storeop> { + let AddedComplexity = 10, mayLoad = 0, mayStore = 1, hasSideEffects = 0 in + def roW : LoadStore128RO, + Sched<[WriteSTIdx, ReadAdrBase]> { + let Inst{13} = 0b0; + } + + let AddedComplexity = 10, mayLoad = 0, mayStore = 1, hasSideEffects = 0 in + def roX : LoadStore128RO, + Sched<[WriteSTIdx, ReadAdrBase]> { + let Inst{13} = 0b1; + } + + def : ROInstAlias(NAME # "roX")>; +} + +let mayLoad = 0, mayStore = 0, hasSideEffects = 1 in +class BasePrefetchRO sz, bit V, bits<2> opc, dag outs, dag ins, + string asm, list pat> + : I, + Sched<[WriteLD]> { + bits<5> Rt; + bits<5> Rn; + bits<5> Rm; + bits<2> extend; + let Inst{31-30} = sz; + let Inst{29-27} = 0b111; + let Inst{26} = V; + let Inst{25-24} = 0b00; + let Inst{23-22} = opc; + let Inst{21} = 1; + let Inst{20-16} = Rm; + let Inst{15} = extend{1}; // sign extend Rm? + let Inst{14} = 1; + let Inst{12} = extend{0}; // do shift? + let Inst{11-10} = 0b10; + let Inst{9-5} = Rn; + let Inst{4-0} = Rt; +} + +multiclass PrefetchRO sz, bit V, bits<2> opc, string asm> { + def roW : BasePrefetchRO { + let Inst{13} = 0b0; + } + + def roX : BasePrefetchRO { + let Inst{13} = 0b1; + } + + def : InstAlias<"prfm $Rt, [$Rn, $Rm]", + (!cast(NAME # "roX") prfop:$Rt, + GPR64sp:$Rn, GPR64:$Rm, 0, 0)>; +} + +//--- +// Load/store unscaled immediate +//--- + +def am_unscaled8 : ComplexPattern; +def am_unscaled16 : ComplexPattern; +def am_unscaled32 : ComplexPattern; +def am_unscaled64 : ComplexPattern; +def am_unscaled128 :ComplexPattern; + +class BaseLoadStoreUnscale sz, bit V, bits<2> opc, dag oops, dag iops, + string asm, list pattern> + : I { + bits<5> Rt; + bits<5> Rn; + bits<9> offset; + let Inst{31-30} = sz; + let Inst{29-27} = 0b111; + let Inst{26} = V; + let Inst{25-24} = 0b00; + let Inst{23-22} = opc; + let Inst{21} = 0; + let Inst{20-12} = offset; + let Inst{11-10} = 0b00; + let Inst{9-5} = Rn; + let Inst{4-0} = Rt; + + let DecoderMethod = "DecodeSignedLdStInstruction"; +} + +multiclass LoadUnscaled sz, bit V, bits<2> opc, RegisterClass regtype, + string asm, list pattern> { + let AddedComplexity = 1 in // try this before LoadUI + def i : BaseLoadStoreUnscale, + Sched<[WriteLD]>; + + def : InstAlias(NAME # "i") regtype:$Rt, GPR64sp:$Rn, 0)>; +} + +multiclass StoreUnscaled sz, bit V, bits<2> opc, RegisterClass regtype, + string asm, list pattern> { + let AddedComplexity = 1 in // try this before StoreUI + def i : BaseLoadStoreUnscale, + Sched<[WriteST]>; + + def : InstAlias(NAME # "i") regtype:$Rt, GPR64sp:$Rn, 0)>; +} + +multiclass PrefetchUnscaled sz, bit V, bits<2> opc, string asm, + list pat> { + let mayLoad = 0, mayStore = 0, hasSideEffects = 1 in + def i : BaseLoadStoreUnscale, + Sched<[WriteLD]>; + + def : InstAlias(NAME # "i") prfop:$Rt, GPR64sp:$Rn, 0)>; +} + +//--- +// Load/store unscaled immediate, unprivileged +//--- + +class BaseLoadStoreUnprivileged 
sz, bit V, bits<2> opc, + dag oops, dag iops, string asm> + : I { + bits<5> Rt; + bits<5> Rn; + bits<9> offset; + let Inst{31-30} = sz; + let Inst{29-27} = 0b111; + let Inst{26} = V; + let Inst{25-24} = 0b00; + let Inst{23-22} = opc; + let Inst{21} = 0; + let Inst{20-12} = offset; + let Inst{11-10} = 0b10; + let Inst{9-5} = Rn; + let Inst{4-0} = Rt; + + let DecoderMethod = "DecodeSignedLdStInstruction"; +} + +multiclass LoadUnprivileged sz, bit V, bits<2> opc, + RegisterClass regtype, string asm> { + let mayStore = 0, mayLoad = 1, hasSideEffects = 0 in + def i : BaseLoadStoreUnprivileged, + Sched<[WriteLD]>; + + def : InstAlias(NAME # "i") regtype:$Rt, GPR64sp:$Rn, 0)>; +} + +multiclass StoreUnprivileged sz, bit V, bits<2> opc, + RegisterClass regtype, string asm> { + let mayStore = 1, mayLoad = 0, hasSideEffects = 0 in + def i : BaseLoadStoreUnprivileged, + Sched<[WriteST]>; + + def : InstAlias(NAME # "i") regtype:$Rt, GPR64sp:$Rn, 0)>; +} + +//--- +// Load/store pre-indexed +//--- + +class BaseLoadStorePreIdx sz, bit V, bits<2> opc, dag oops, dag iops, + string asm, string cstr, list pat> + : I { + bits<5> Rt; + bits<5> Rn; + bits<9> offset; + let Inst{31-30} = sz; + let Inst{29-27} = 0b111; + let Inst{26} = V; + let Inst{25-24} = 0; + let Inst{23-22} = opc; + let Inst{21} = 0; + let Inst{20-12} = offset; + let Inst{11-10} = 0b11; + let Inst{9-5} = Rn; + let Inst{4-0} = Rt; + + let DecoderMethod = "DecodeSignedLdStInstruction"; +} + +let hasSideEffects = 0 in { +let mayStore = 0, mayLoad = 1 in +class LoadPreIdx sz, bit V, bits<2> opc, RegisterClass regtype, + string asm> + : BaseLoadStorePreIdx, + Sched<[WriteLD, WriteAdr]>; + +let mayStore = 1, mayLoad = 0 in +class StorePreIdx sz, bit V, bits<2> opc, RegisterClass regtype, + string asm, SDPatternOperator storeop, ValueType Ty> + : BaseLoadStorePreIdx, + Sched<[WriteAdr, WriteST]>; +} // hasSideEffects = 0 + +//--- +// Load/store post-indexed +//--- + +// (pre-index) load/stores. 
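+// The post-indexed forms write the updated base register back after the
+// access ("ldr x0, [x1], #8"), distinguished from the pre-indexed forms
+// above ("ldr x0, [x1, #8]!") only by Inst{11-10} (0b01 here vs 0b11).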
+class BaseLoadStorePostIdx sz, bit V, bits<2> opc, dag oops, dag iops, + string asm, string cstr, list pat> + : I { + bits<5> Rt; + bits<5> Rn; + bits<9> offset; + let Inst{31-30} = sz; + let Inst{29-27} = 0b111; + let Inst{26} = V; + let Inst{25-24} = 0b00; + let Inst{23-22} = opc; + let Inst{21} = 0b0; + let Inst{20-12} = offset; + let Inst{11-10} = 0b01; + let Inst{9-5} = Rn; + let Inst{4-0} = Rt; + + let DecoderMethod = "DecodeSignedLdStInstruction"; +} + +let hasSideEffects = 0 in { +let mayStore = 0, mayLoad = 1 in +class LoadPostIdx sz, bit V, bits<2> opc, RegisterClass regtype, + string asm> + : BaseLoadStorePostIdx, + Sched<[WriteLD, WriteI]>; + +let mayStore = 1, mayLoad = 0 in +class StorePostIdx sz, bit V, bits<2> opc, RegisterClass regtype, + string asm, SDPatternOperator storeop, ValueType Ty> + : BaseLoadStorePostIdx, + Sched<[WriteAdr, WriteST, ReadAdrBase]>; +} // hasSideEffects = 0 + + +//--- +// Load/store pair +//--- + +// (indexed, offset) + +class BaseLoadStorePairOffset opc, bit V, bit L, dag oops, dag iops, + string asm> + : I { + bits<5> Rt; + bits<5> Rt2; + bits<5> Rn; + bits<7> offset; + let Inst{31-30} = opc; + let Inst{29-27} = 0b101; + let Inst{26} = V; + let Inst{25-23} = 0b010; + let Inst{22} = L; + let Inst{21-15} = offset; + let Inst{14-10} = Rt2; + let Inst{9-5} = Rn; + let Inst{4-0} = Rt; + + let DecoderMethod = "DecodePairLdStInstruction"; +} + +multiclass LoadPairOffset opc, bit V, RegisterClass regtype, + Operand indextype, string asm> { + let hasSideEffects = 0, mayStore = 0, mayLoad = 1 in + def i : BaseLoadStorePairOffset, + Sched<[WriteLD, WriteLDHi]>; + + def : InstAlias(NAME # "i") regtype:$Rt, regtype:$Rt2, + GPR64sp:$Rn, 0)>; +} + + +multiclass StorePairOffset opc, bit V, RegisterClass regtype, + Operand indextype, string asm> { + let hasSideEffects = 0, mayLoad = 0, mayStore = 1 in + def i : BaseLoadStorePairOffset, + Sched<[WriteSTP]>; + + def : InstAlias(NAME # "i") regtype:$Rt, regtype:$Rt2, + GPR64sp:$Rn, 0)>; +} + +// (pre-indexed) +class BaseLoadStorePairPreIdx opc, bit V, bit L, dag oops, dag iops, + string asm> + : I { + bits<5> Rt; + bits<5> Rt2; + bits<5> Rn; + bits<7> offset; + let Inst{31-30} = opc; + let Inst{29-27} = 0b101; + let Inst{26} = V; + let Inst{25-23} = 0b011; + let Inst{22} = L; + let Inst{21-15} = offset; + let Inst{14-10} = Rt2; + let Inst{9-5} = Rn; + let Inst{4-0} = Rt; + + let DecoderMethod = "DecodePairLdStInstruction"; +} + +let hasSideEffects = 0 in { +let mayStore = 0, mayLoad = 1 in +class LoadPairPreIdx opc, bit V, RegisterClass regtype, + Operand indextype, string asm> + : BaseLoadStorePairPreIdx, + Sched<[WriteLD, WriteLDHi, WriteAdr]>; + +let mayStore = 1, mayLoad = 0 in +class StorePairPreIdx opc, bit V, RegisterClass regtype, + Operand indextype, string asm> + : BaseLoadStorePairPreIdx, + Sched<[WriteAdr, WriteSTP]>; +} // hasSideEffects = 0 + +// (post-indexed) + +class BaseLoadStorePairPostIdx opc, bit V, bit L, dag oops, dag iops, + string asm> + : I { + bits<5> Rt; + bits<5> Rt2; + bits<5> Rn; + bits<7> offset; + let Inst{31-30} = opc; + let Inst{29-27} = 0b101; + let Inst{26} = V; + let Inst{25-23} = 0b001; + let Inst{22} = L; + let Inst{21-15} = offset; + let Inst{14-10} = Rt2; + let Inst{9-5} = Rn; + let Inst{4-0} = Rt; + + let DecoderMethod = "DecodePairLdStInstruction"; +} + +let hasSideEffects = 0 in { +let mayStore = 0, mayLoad = 1 in +class LoadPairPostIdx opc, bit V, RegisterClass regtype, + Operand idxtype, string asm> + : BaseLoadStorePairPostIdx, + Sched<[WriteLD, WriteLDHi, 
WriteAdr]>; + +let mayStore = 1, mayLoad = 0 in +class StorePairPostIdx opc, bit V, RegisterClass regtype, + Operand idxtype, string asm> + : BaseLoadStorePairPostIdx, + Sched<[WriteAdr, WriteSTP]>; +} // hasSideEffects = 0 + +// (no-allocate) + +class BaseLoadStorePairNoAlloc opc, bit V, bit L, dag oops, dag iops, + string asm> + : I { + bits<5> Rt; + bits<5> Rt2; + bits<5> Rn; + bits<7> offset; + let Inst{31-30} = opc; + let Inst{29-27} = 0b101; + let Inst{26} = V; + let Inst{25-23} = 0b000; + let Inst{22} = L; + let Inst{21-15} = offset; + let Inst{14-10} = Rt2; + let Inst{9-5} = Rn; + let Inst{4-0} = Rt; + + let DecoderMethod = "DecodePairLdStInstruction"; +} + +multiclass LoadPairNoAlloc opc, bit V, RegisterClass regtype, + Operand indextype, string asm> { + let hasSideEffects = 0, mayStore = 0, mayLoad = 1 in + def i : BaseLoadStorePairNoAlloc, + Sched<[WriteLD, WriteLDHi]>; + + + def : InstAlias(NAME # "i") regtype:$Rt, regtype:$Rt2, + GPR64sp:$Rn, 0)>; +} + +multiclass StorePairNoAlloc opc, bit V, RegisterClass regtype, + Operand indextype, string asm> { + let hasSideEffects = 0, mayStore = 1, mayLoad = 0 in + def i : BaseLoadStorePairNoAlloc, + Sched<[WriteSTP]>; + + def : InstAlias(NAME # "i") regtype:$Rt, regtype:$Rt2, + GPR64sp:$Rn, 0)>; +} + +//--- +// Load/store exclusive +//--- + +// True exclusive operations write to and/or read from the system's exclusive +// monitors, which as far as a compiler is concerned can be modelled as a +// random shared memory address. Hence LoadExclusive mayStore. +// +// Since these instructions have the undefined register bits set to 1 in +// their canonical form, we need a post encoder method to set those bits +// to 1 when encoding these instructions. We do this using the +// fixLoadStoreExclusive function. This function has template parameters: +// +// fixLoadStoreExclusive +// +// hasRs indicates that the instruction uses the Rs field, so we won't set +// it to 1 (and the same for Rt2). We don't need template parameters for +// the other register fields since Rt and Rn are always used. +// +let hasSideEffects = 1, mayLoad = 1, mayStore = 1 in +class BaseLoadStoreExclusive sz, bit o2, bit L, bit o1, bit o0, + dag oops, dag iops, string asm, string operands> + : I { + let Inst{31-30} = sz; + let Inst{29-24} = 0b001000; + let Inst{23} = o2; + let Inst{22} = L; + let Inst{21} = o1; + let Inst{15} = o0; + + let DecoderMethod = "DecodeExclusiveLdStInstruction"; +} + +// Neither Rs nor Rt2 operands. +class LoadStoreExclusiveSimple sz, bit o2, bit L, bit o1, bit o0, + dag oops, dag iops, string asm, string operands> + : BaseLoadStoreExclusive { + bits<5> Rt; + bits<5> Rn; + let Inst{9-5} = Rn; + let Inst{4-0} = Rt; + + let PostEncoderMethod = "fixLoadStoreExclusive<0,0>"; +} + +// Simple load acquires don't set the exclusive monitor +let mayLoad = 1, mayStore = 0 in +class LoadAcquire sz, bit o2, bit L, bit o1, bit o0, + RegisterClass regtype, string asm> + : LoadStoreExclusiveSimple, + Sched<[WriteLD]>; + +class LoadExclusive sz, bit o2, bit L, bit o1, bit o0, + RegisterClass regtype, string asm> + : LoadStoreExclusiveSimple, + Sched<[WriteLD]>; + +class LoadExclusivePair sz, bit o2, bit L, bit o1, bit o0, + RegisterClass regtype, string asm> + : BaseLoadStoreExclusive, + Sched<[WriteLD, WriteLDHi]> { + bits<5> Rt; + bits<5> Rt2; + bits<5> Rn; + let Inst{14-10} = Rt2; + let Inst{9-5} = Rn; + let Inst{4-0} = Rt; + + let PostEncoderMethod = "fixLoadStoreExclusive<0,1>"; +} + +// Simple store release operations do not check the exclusive monitor. 
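+// e.g. "stlr w0, [x1]" only provides release ordering, whereas the
+// store-exclusives below ("stlxr w2, w0, [x1]") also write a status
+// register (0 on success), which is why the StoreExclusive class
+// earlyclobbers $Ws.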
+let mayLoad = 0, mayStore = 1 in +class StoreRelease sz, bit o2, bit L, bit o1, bit o0, + RegisterClass regtype, string asm> + : LoadStoreExclusiveSimple, + Sched<[WriteST]>; + +let mayLoad = 1, mayStore = 1 in +class StoreExclusive sz, bit o2, bit L, bit o1, bit o0, + RegisterClass regtype, string asm> + : BaseLoadStoreExclusive, + Sched<[WriteSTX]> { + bits<5> Ws; + bits<5> Rt; + bits<5> Rn; + let Inst{20-16} = Ws; + let Inst{9-5} = Rn; + let Inst{4-0} = Rt; + + let Constraints = "@earlyclobber $Ws"; + let PostEncoderMethod = "fixLoadStoreExclusive<1,0>"; +} + +class StoreExclusivePair sz, bit o2, bit L, bit o1, bit o0, + RegisterClass regtype, string asm> + : BaseLoadStoreExclusive, + Sched<[WriteSTX]> { + bits<5> Ws; + bits<5> Rt; + bits<5> Rt2; + bits<5> Rn; + let Inst{20-16} = Ws; + let Inst{14-10} = Rt2; + let Inst{9-5} = Rn; + let Inst{4-0} = Rt; + + let Constraints = "@earlyclobber $Ws"; +} + +//--- +// Exception generation +//--- + +let mayLoad = 0, mayStore = 0, hasSideEffects = 1 in +class ExceptionGeneration op1, bits<2> ll, string asm> + : I<(outs), (ins imm0_65535:$imm), asm, "\t$imm", "", []>, + Sched<[WriteSys]> { + bits<16> imm; + let Inst{31-24} = 0b11010100; + let Inst{23-21} = op1; + let Inst{20-5} = imm; + let Inst{4-2} = 0b000; + let Inst{1-0} = ll; +} + +let Predicates = [HasFPARMv8] in { + +//--- +// Floating point to integer conversion +//--- + +class BaseFPToIntegerUnscaled type, bits<2> rmode, bits<3> opcode, + RegisterClass srcType, RegisterClass dstType, + string asm, list pattern> + : I<(outs dstType:$Rd), (ins srcType:$Rn), + asm, "\t$Rd, $Rn", "", pattern>, + Sched<[WriteFCvt]> { + bits<5> Rd; + bits<5> Rn; + let Inst{30-29} = 0b00; + let Inst{28-24} = 0b11110; + let Inst{23-22} = type; + let Inst{21} = 1; + let Inst{20-19} = rmode; + let Inst{18-16} = opcode; + let Inst{15-10} = 0; + let Inst{9-5} = Rn; + let Inst{4-0} = Rd; +} + +let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in +class BaseFPToInteger type, bits<2> rmode, bits<3> opcode, + RegisterClass srcType, RegisterClass dstType, + Operand immType, string asm, list pattern> + : I<(outs dstType:$Rd), (ins srcType:$Rn, immType:$scale), + asm, "\t$Rd, $Rn, $scale", "", pattern>, + Sched<[WriteFCvt]> { + bits<5> Rd; + bits<5> Rn; + bits<6> scale; + let Inst{30-29} = 0b00; + let Inst{28-24} = 0b11110; + let Inst{23-22} = type; + let Inst{21} = 0; + let Inst{20-19} = rmode; + let Inst{18-16} = opcode; + let Inst{15-10} = scale; + let Inst{9-5} = Rn; + let Inst{4-0} = Rd; +} + +multiclass FPToIntegerUnscaled rmode, bits<3> opcode, string asm, + SDPatternOperator OpN> { + // Unscaled single-precision to 32-bit + def UWSr : BaseFPToIntegerUnscaled<0b00, rmode, opcode, FPR32, GPR32, asm, + [(set GPR32:$Rd, (OpN FPR32:$Rn))]> { + let Inst{31} = 0; // 32-bit GPR flag + } + + // Unscaled single-precision to 64-bit + def UXSr : BaseFPToIntegerUnscaled<0b00, rmode, opcode, FPR32, GPR64, asm, + [(set GPR64:$Rd, (OpN FPR32:$Rn))]> { + let Inst{31} = 1; // 64-bit GPR flag + } + + // Unscaled double-precision to 32-bit + def UWDr : BaseFPToIntegerUnscaled<0b01, rmode, opcode, FPR64, GPR32, asm, + [(set GPR32:$Rd, (OpN (f64 FPR64:$Rn)))]> { + let Inst{31} = 0; // 32-bit GPR flag + } + + // Unscaled double-precision to 64-bit + def UXDr : BaseFPToIntegerUnscaled<0b01, rmode, opcode, FPR64, GPR64, asm, + [(set GPR64:$Rd, (OpN (f64 FPR64:$Rn)))]> { + let Inst{31} = 1; // 64-bit GPR flag + } +} + +multiclass FPToIntegerScaled rmode, bits<3> opcode, string asm, + SDPatternOperator OpN> { + // Scaled single-precision to 
32-bit + def SWSri : BaseFPToInteger<0b00, rmode, opcode, FPR32, GPR32, + fixedpoint_f32_i32, asm, + [(set GPR32:$Rd, (OpN (fmul FPR32:$Rn, + fixedpoint_f32_i32:$scale)))]> { + let Inst{31} = 0; // 32-bit GPR flag + let scale{5} = 1; + } + + // Scaled single-precision to 64-bit + def SXSri : BaseFPToInteger<0b00, rmode, opcode, FPR32, GPR64, + fixedpoint_f32_i64, asm, + [(set GPR64:$Rd, (OpN (fmul FPR32:$Rn, + fixedpoint_f32_i64:$scale)))]> { + let Inst{31} = 1; // 64-bit GPR flag + } + + // Scaled double-precision to 32-bit + def SWDri : BaseFPToInteger<0b01, rmode, opcode, FPR64, GPR32, + fixedpoint_f64_i32, asm, + [(set GPR32:$Rd, (OpN (fmul FPR64:$Rn, + fixedpoint_f64_i32:$scale)))]> { + let Inst{31} = 0; // 32-bit GPR flag + let scale{5} = 1; + } + + // Scaled double-precision to 64-bit + def SXDri : BaseFPToInteger<0b01, rmode, opcode, FPR64, GPR64, + fixedpoint_f64_i64, asm, + [(set GPR64:$Rd, (OpN (fmul FPR64:$Rn, + fixedpoint_f64_i64:$scale)))]> { + let Inst{31} = 1; // 64-bit GPR flag + } +} + +//--- +// Integer to floating point conversion +//--- + +let mayStore = 0, mayLoad = 0, hasSideEffects = 0 in +class BaseIntegerToFP pattern> + : I<(outs dstType:$Rd), (ins srcType:$Rn, immType:$scale), + asm, "\t$Rd, $Rn, $scale", "", pattern>, + Sched<[WriteFCvt]> { + bits<5> Rd; + bits<5> Rn; + bits<6> scale; + let Inst{30-23} = 0b00111100; + let Inst{21-17} = 0b00001; + let Inst{16} = isUnsigned; + let Inst{15-10} = scale; + let Inst{9-5} = Rn; + let Inst{4-0} = Rd; +} + +class BaseIntegerToFPUnscaled + : I<(outs dstType:$Rd), (ins srcType:$Rn), + asm, "\t$Rd, $Rn", "", [(set (dvt dstType:$Rd), (node srcType:$Rn))]>, + Sched<[WriteFCvt]> { + bits<5> Rd; + bits<5> Rn; + bits<6> scale; + let Inst{30-23} = 0b00111100; + let Inst{21-17} = 0b10001; + let Inst{16} = isUnsigned; + let Inst{15-10} = 0b000000; + let Inst{9-5} = Rn; + let Inst{4-0} = Rd; +} + +multiclass IntegerToFP { + // Unscaled + def UWSri: BaseIntegerToFPUnscaled { + let Inst{31} = 0; // 32-bit GPR flag + let Inst{22} = 0; // 32-bit FPR flag + } + + def UWDri: BaseIntegerToFPUnscaled { + let Inst{31} = 0; // 32-bit GPR flag + let Inst{22} = 1; // 64-bit FPR flag + } + + def UXSri: BaseIntegerToFPUnscaled { + let Inst{31} = 1; // 64-bit GPR flag + let Inst{22} = 0; // 32-bit FPR flag + } + + def UXDri: BaseIntegerToFPUnscaled { + let Inst{31} = 1; // 64-bit GPR flag + let Inst{22} = 1; // 64-bit FPR flag + } + + // Scaled + def SWSri: BaseIntegerToFP { + let Inst{31} = 0; // 32-bit GPR flag + let Inst{22} = 0; // 32-bit FPR flag + let scale{5} = 1; + } + + def SWDri: BaseIntegerToFP { + let Inst{31} = 0; // 32-bit GPR flag + let Inst{22} = 1; // 64-bit FPR flag + let scale{5} = 1; + } + + def SXSri: BaseIntegerToFP { + let Inst{31} = 1; // 64-bit GPR flag + let Inst{22} = 0; // 32-bit FPR flag + } + + def SXDri: BaseIntegerToFP { + let Inst{31} = 1; // 64-bit GPR flag + let Inst{22} = 1; // 64-bit FPR flag + } +} + +//--- +// Unscaled integer <-> floating point conversion (i.e. FMOV) +//--- + +let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in +class BaseUnscaledConversion rmode, bits<3> opcode, + RegisterClass srcType, RegisterClass dstType, + string asm> + : I<(outs dstType:$Rd), (ins srcType:$Rn), asm, "\t$Rd, $Rn", "", + // We use COPY_TO_REGCLASS for these bitconvert operations. + // copyPhysReg() expands the resultant COPY instructions after + // regalloc is done. This gives greater freedom for the allocator + // and related passes (coalescing, copy propagation, et. al.) to + // be more effective. 
+ [/*(set (dvt dstType:$Rd), (bitconvert (svt srcType:$Rn)))*/]>, + Sched<[WriteFCopy]> { + bits<5> Rd; + bits<5> Rn; + let Inst{30-23} = 0b00111100; + let Inst{21} = 1; + let Inst{20-19} = rmode; + let Inst{18-16} = opcode; + let Inst{15-10} = 0b000000; + let Inst{9-5} = Rn; + let Inst{4-0} = Rd; +} + +let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in +class BaseUnscaledConversionToHigh rmode, bits<3> opcode, + RegisterClass srcType, RegisterOperand dstType, string asm, + string kind> + : I<(outs dstType:$Rd), (ins srcType:$Rn, VectorIndex1:$idx), asm, + "{\t$Rd"#kind#"$idx, $Rn|"#kind#"\t$Rd$idx, $Rn}", "", []>, + Sched<[WriteFCopy]> { + bits<5> Rd; + bits<5> Rn; + let Inst{30-23} = 0b00111101; + let Inst{21} = 1; + let Inst{20-19} = rmode; + let Inst{18-16} = opcode; + let Inst{15-10} = 0b000000; + let Inst{9-5} = Rn; + let Inst{4-0} = Rd; + + let DecoderMethod = "DecodeFMOVLaneInstruction"; +} + +let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in +class BaseUnscaledConversionFromHigh rmode, bits<3> opcode, + RegisterOperand srcType, RegisterClass dstType, string asm, + string kind> + : I<(outs dstType:$Rd), (ins srcType:$Rn, VectorIndex1:$idx), asm, + "{\t$Rd, $Rn"#kind#"$idx|"#kind#"\t$Rd, $Rn$idx}", "", []>, + Sched<[WriteFCopy]> { + bits<5> Rd; + bits<5> Rn; + let Inst{30-23} = 0b00111101; + let Inst{21} = 1; + let Inst{20-19} = rmode; + let Inst{18-16} = opcode; + let Inst{15-10} = 0b000000; + let Inst{9-5} = Rn; + let Inst{4-0} = Rd; + + let DecoderMethod = "DecodeFMOVLaneInstruction"; +} + + + +multiclass UnscaledConversion { + def WSr : BaseUnscaledConversion<0b00, 0b111, GPR32, FPR32, asm> { + let Inst{31} = 0; // 32-bit GPR flag + let Inst{22} = 0; // 32-bit FPR flag + } + + def XDr : BaseUnscaledConversion<0b00, 0b111, GPR64, FPR64, asm> { + let Inst{31} = 1; // 64-bit GPR flag + let Inst{22} = 1; // 64-bit FPR flag + } + + def SWr : BaseUnscaledConversion<0b00, 0b110, FPR32, GPR32, asm> { + let Inst{31} = 0; // 32-bit GPR flag + let Inst{22} = 0; // 32-bit FPR flag + } + + def DXr : BaseUnscaledConversion<0b00, 0b110, FPR64, GPR64, asm> { + let Inst{31} = 1; // 64-bit GPR flag + let Inst{22} = 1; // 64-bit FPR flag + } + + def XDHighr : BaseUnscaledConversionToHigh<0b01, 0b111, GPR64, V128, + asm, ".d"> { + let Inst{31} = 1; + let Inst{22} = 0; + } + + def DXHighr : BaseUnscaledConversionFromHigh<0b01, 0b110, V128, GPR64, + asm, ".d"> { + let Inst{31} = 1; + let Inst{22} = 0; + } +} + +//--- +// Floating point conversion +//--- + +class BaseFPConversion type, bits<2> opcode, RegisterClass dstType, + RegisterClass srcType, string asm, list pattern> + : I<(outs dstType:$Rd), (ins srcType:$Rn), asm, "\t$Rd, $Rn", "", pattern>, + Sched<[WriteFCvt]> { + bits<5> Rd; + bits<5> Rn; + let Inst{31-24} = 0b00011110; + let Inst{23-22} = type; + let Inst{21-17} = 0b10001; + let Inst{16-15} = opcode; + let Inst{14-10} = 0b10000; + let Inst{9-5} = Rn; + let Inst{4-0} = Rd; +} + +multiclass FPConversion { + // Double-precision to Half-precision + def HDr : BaseFPConversion<0b01, 0b11, FPR16, FPR64, asm, + [(set FPR16:$Rd, (fround FPR64:$Rn))]>; + + // Double-precision to Single-precision + def SDr : BaseFPConversion<0b01, 0b00, FPR32, FPR64, asm, + [(set FPR32:$Rd, (fround FPR64:$Rn))]>; + + // Half-precision to Double-precision + def DHr : BaseFPConversion<0b11, 0b01, FPR64, FPR16, asm, + [(set FPR64:$Rd, (fextend FPR16:$Rn))]>; + + // Half-precision to Single-precision + def SHr : BaseFPConversion<0b11, 0b00, FPR32, FPR16, asm, + [(set FPR32:$Rd, (fextend FPR16:$Rn))]>; + + // 
Single-precision to Double-precision + def DSr : BaseFPConversion<0b00, 0b01, FPR64, FPR32, asm, + [(set FPR64:$Rd, (fextend FPR32:$Rn))]>; + + // Single-precision to Half-precision + def HSr : BaseFPConversion<0b00, 0b11, FPR16, FPR32, asm, + [(set FPR16:$Rd, (fround FPR32:$Rn))]>; +} + +//--- +// Single operand floating point data processing +//--- + +let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in +class BaseSingleOperandFPData opcode, RegisterClass regtype, + ValueType vt, string asm, SDPatternOperator node> + : I<(outs regtype:$Rd), (ins regtype:$Rn), asm, "\t$Rd, $Rn", "", + [(set (vt regtype:$Rd), (node (vt regtype:$Rn)))]>, + Sched<[WriteF]> { + bits<5> Rd; + bits<5> Rn; + let Inst{31-23} = 0b000111100; + let Inst{21-19} = 0b100; + let Inst{18-15} = opcode; + let Inst{14-10} = 0b10000; + let Inst{9-5} = Rn; + let Inst{4-0} = Rd; +} + +multiclass SingleOperandFPData opcode, string asm, + SDPatternOperator node = null_frag> { + def Sr : BaseSingleOperandFPData { + let Inst{22} = 0; // 32-bit size flag + } + + def Dr : BaseSingleOperandFPData { + let Inst{22} = 1; // 64-bit size flag + } +} + +//--- +// Two operand floating point data processing +//--- + +let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in +class BaseTwoOperandFPData opcode, RegisterClass regtype, + string asm, list pat> + : I<(outs regtype:$Rd), (ins regtype:$Rn, regtype:$Rm), + asm, "\t$Rd, $Rn, $Rm", "", pat>, + Sched<[WriteF]> { + bits<5> Rd; + bits<5> Rn; + bits<5> Rm; + let Inst{31-23} = 0b000111100; + let Inst{21} = 1; + let Inst{20-16} = Rm; + let Inst{15-12} = opcode; + let Inst{11-10} = 0b10; + let Inst{9-5} = Rn; + let Inst{4-0} = Rd; +} + +multiclass TwoOperandFPData opcode, string asm, + SDPatternOperator node = null_frag> { + def Srr : BaseTwoOperandFPData { + let Inst{22} = 0; // 32-bit size flag + } + + def Drr : BaseTwoOperandFPData { + let Inst{22} = 1; // 64-bit size flag + } +} + +multiclass TwoOperandFPDataNeg opcode, string asm, SDNode node> { + def Srr : BaseTwoOperandFPData { + let Inst{22} = 0; // 32-bit size flag + } + + def Drr : BaseTwoOperandFPData { + let Inst{22} = 1; // 64-bit size flag + } +} + + +//--- +// Three operand floating point data processing +//--- + +class BaseThreeOperandFPData pat> + : I<(outs regtype:$Rd), (ins regtype:$Rn, regtype:$Rm, regtype: $Ra), + asm, "\t$Rd, $Rn, $Rm, $Ra", "", pat>, + Sched<[WriteFMul]> { + bits<5> Rd; + bits<5> Rn; + bits<5> Rm; + bits<5> Ra; + let Inst{31-23} = 0b000111110; + let Inst{21} = isNegated; + let Inst{20-16} = Rm; + let Inst{15} = isSub; + let Inst{14-10} = Ra; + let Inst{9-5} = Rn; + let Inst{4-0} = Rd; +} + +multiclass ThreeOperandFPData { + def Srrr : BaseThreeOperandFPData { + let Inst{22} = 0; // 32-bit size flag + } + + def Drrr : BaseThreeOperandFPData { + let Inst{22} = 1; // 64-bit size flag + } +} + +//--- +// Floating point data comparisons +//--- + +let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in +class BaseOneOperandFPComparison pat> + : I<(outs), (ins regtype:$Rn), asm, "\t$Rn, #0.0", "", pat>, + Sched<[WriteFCmp]> { + bits<5> Rn; + let Inst{31-23} = 0b000111100; + let Inst{21} = 1; + + let Inst{15-10} = 0b001000; + let Inst{9-5} = Rn; + let Inst{4} = signalAllNans; + let Inst{3-0} = 0b1000; + + // Rm should be 0b00000 canonically, but we need to accept any value. 
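+  // The post-encoder below canonicalises those bits on output, in the same
+  // spirit as fixMulHigh for SMULH/UMULH earlier in this file.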
+ let PostEncoderMethod = "fixOneOperandFPComparison"; +} + +let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in +class BaseTwoOperandFPComparison pat> + : I<(outs), (ins regtype:$Rn, regtype:$Rm), asm, "\t$Rn, $Rm", "", pat>, + Sched<[WriteFCmp]> { + bits<5> Rm; + bits<5> Rn; + let Inst{31-23} = 0b000111100; + let Inst{21} = 1; + let Inst{20-16} = Rm; + let Inst{15-10} = 0b001000; + let Inst{9-5} = Rn; + let Inst{4} = signalAllNans; + let Inst{3-0} = 0b0000; +} + +multiclass FPComparison { + let Defs = [NZCV] in { + def Srr : BaseTwoOperandFPComparison { + let Inst{22} = 0; + } + + def Sri : BaseOneOperandFPComparison { + let Inst{22} = 0; + } + + def Drr : BaseTwoOperandFPComparison { + let Inst{22} = 1; + } + + def Dri : BaseOneOperandFPComparison { + let Inst{22} = 1; + } + } // Defs = [NZCV] +} + +//--- +// Floating point conditional comparisons +//--- + +let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in +class BaseFPCondComparison + : I<(outs), (ins regtype:$Rn, regtype:$Rm, imm0_15:$nzcv, ccode:$cond), + asm, "\t$Rn, $Rm, $nzcv, $cond", "", []>, + Sched<[WriteFCmp]> { + bits<5> Rn; + bits<5> Rm; + bits<4> nzcv; + bits<4> cond; + + let Inst{31-23} = 0b000111100; + let Inst{21} = 1; + let Inst{20-16} = Rm; + let Inst{15-12} = cond; + let Inst{11-10} = 0b01; + let Inst{9-5} = Rn; + let Inst{4} = signalAllNans; + let Inst{3-0} = nzcv; +} + +multiclass FPCondComparison { + let Defs = [NZCV], Uses = [NZCV] in { + def Srr : BaseFPCondComparison { + let Inst{22} = 0; + } + + def Drr : BaseFPCondComparison { + let Inst{22} = 1; + } + } // Defs = [NZCV], Uses = [NZCV] +} + +//--- +// Floating point conditional select +//--- + +class BaseFPCondSelect + : I<(outs regtype:$Rd), (ins regtype:$Rn, regtype:$Rm, ccode:$cond), + asm, "\t$Rd, $Rn, $Rm, $cond", "", + [(set regtype:$Rd, + (AArch64csel (vt regtype:$Rn), regtype:$Rm, + (i32 imm:$cond), NZCV))]>, + Sched<[WriteF]> { + bits<5> Rd; + bits<5> Rn; + bits<5> Rm; + bits<4> cond; + + let Inst{31-23} = 0b000111100; + let Inst{21} = 1; + let Inst{20-16} = Rm; + let Inst{15-12} = cond; + let Inst{11-10} = 0b11; + let Inst{9-5} = Rn; + let Inst{4-0} = Rd; +} + +multiclass FPCondSelect { + let Uses = [NZCV] in { + def Srrr : BaseFPCondSelect { + let Inst{22} = 0; + } + + def Drrr : BaseFPCondSelect { + let Inst{22} = 1; + } + } // Uses = [NZCV] +} + +//--- +// Floating move immediate +//--- + +class BaseFPMoveImmediate + : I<(outs regtype:$Rd), (ins fpimmtype:$imm), asm, "\t$Rd, $imm", "", + [(set regtype:$Rd, fpimmtype:$imm)]>, + Sched<[WriteFImm]> { + bits<5> Rd; + bits<8> imm; + let Inst{31-23} = 0b000111100; + let Inst{21} = 1; + let Inst{20-13} = imm; + let Inst{12-5} = 0b10000000; + let Inst{4-0} = Rd; +} + +multiclass FPMoveImmediate { + def Si : BaseFPMoveImmediate { + let Inst{22} = 0; + } + + def Di : BaseFPMoveImmediate { + let Inst{22} = 1; + } +} +} // end of 'let Predicates = [HasFPARMv8]' + +//---------------------------------------------------------------------------- +// AdvSIMD +//---------------------------------------------------------------------------- + +let Predicates = [HasNEON] in { + +//---------------------------------------------------------------------------- +// AdvSIMD three register vector instructions +//---------------------------------------------------------------------------- + +let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in +class BaseSIMDThreeSameVector size, bits<5> opcode, + RegisterOperand regtype, string asm, string kind, + list pattern> + : I<(outs regtype:$Rd), (ins regtype:$Rn, regtype:$Rm), 
asm, + "{\t$Rd" # kind # ", $Rn" # kind # ", $Rm" # kind # + "|" # kind # "\t$Rd, $Rn, $Rm|}", "", pattern>, + Sched<[WriteV]> { + bits<5> Rd; + bits<5> Rn; + bits<5> Rm; + let Inst{31} = 0; + let Inst{30} = Q; + let Inst{29} = U; + let Inst{28-24} = 0b01110; + let Inst{23-22} = size; + let Inst{21} = 1; + let Inst{20-16} = Rm; + let Inst{15-11} = opcode; + let Inst{10} = 1; + let Inst{9-5} = Rn; + let Inst{4-0} = Rd; +} + +let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in +class BaseSIMDThreeSameVectorTied size, bits<5> opcode, + RegisterOperand regtype, string asm, string kind, + list pattern> + : I<(outs regtype:$dst), (ins regtype:$Rd, regtype:$Rn, regtype:$Rm), asm, + "{\t$Rd" # kind # ", $Rn" # kind # ", $Rm" # kind # + "|" # kind # "\t$Rd, $Rn, $Rm}", "$Rd = $dst", pattern>, + Sched<[WriteV]> { + bits<5> Rd; + bits<5> Rn; + bits<5> Rm; + let Inst{31} = 0; + let Inst{30} = Q; + let Inst{29} = U; + let Inst{28-24} = 0b01110; + let Inst{23-22} = size; + let Inst{21} = 1; + let Inst{20-16} = Rm; + let Inst{15-11} = opcode; + let Inst{10} = 1; + let Inst{9-5} = Rn; + let Inst{4-0} = Rd; +} + +// All operand sizes distinguished in the encoding. +multiclass SIMDThreeSameVector opc, string asm, + SDPatternOperator OpNode> { + def v8i8 : BaseSIMDThreeSameVector<0, U, 0b00, opc, V64, + asm, ".8b", + [(set (v8i8 V64:$Rd), (OpNode (v8i8 V64:$Rn), (v8i8 V64:$Rm)))]>; + def v16i8 : BaseSIMDThreeSameVector<1, U, 0b00, opc, V128, + asm, ".16b", + [(set (v16i8 V128:$Rd), (OpNode (v16i8 V128:$Rn), (v16i8 V128:$Rm)))]>; + def v4i16 : BaseSIMDThreeSameVector<0, U, 0b01, opc, V64, + asm, ".4h", + [(set (v4i16 V64:$Rd), (OpNode (v4i16 V64:$Rn), (v4i16 V64:$Rm)))]>; + def v8i16 : BaseSIMDThreeSameVector<1, U, 0b01, opc, V128, + asm, ".8h", + [(set (v8i16 V128:$Rd), (OpNode (v8i16 V128:$Rn), (v8i16 V128:$Rm)))]>; + def v2i32 : BaseSIMDThreeSameVector<0, U, 0b10, opc, V64, + asm, ".2s", + [(set (v2i32 V64:$Rd), (OpNode (v2i32 V64:$Rn), (v2i32 V64:$Rm)))]>; + def v4i32 : BaseSIMDThreeSameVector<1, U, 0b10, opc, V128, + asm, ".4s", + [(set (v4i32 V128:$Rd), (OpNode (v4i32 V128:$Rn), (v4i32 V128:$Rm)))]>; + def v2i64 : BaseSIMDThreeSameVector<1, U, 0b11, opc, V128, + asm, ".2d", + [(set (v2i64 V128:$Rd), (OpNode (v2i64 V128:$Rn), (v2i64 V128:$Rm)))]>; +} + +// As above, but D sized elements unsupported. 
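+// (e.g. mul, which has no .2d form. Illustrative instantiations with assumed
+// opcode values (the real definitions live in AArch64InstrInfo.td):
+//   defm ADD : SIMDThreeSameVector<0, 0b10000, "add", add>;
+//   defm MUL : SIMDThreeSameVectorBHS<0, 0b10011, "mul", mul>; )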
+multiclass SIMDThreeSameVectorBHS opc, string asm, + SDPatternOperator OpNode> { + def v8i8 : BaseSIMDThreeSameVector<0, U, 0b00, opc, V64, + asm, ".8b", + [(set V64:$Rd, (v8i8 (OpNode (v8i8 V64:$Rn), (v8i8 V64:$Rm))))]>; + def v16i8 : BaseSIMDThreeSameVector<1, U, 0b00, opc, V128, + asm, ".16b", + [(set V128:$Rd, (v16i8 (OpNode (v16i8 V128:$Rn), (v16i8 V128:$Rm))))]>; + def v4i16 : BaseSIMDThreeSameVector<0, U, 0b01, opc, V64, + asm, ".4h", + [(set V64:$Rd, (v4i16 (OpNode (v4i16 V64:$Rn), (v4i16 V64:$Rm))))]>; + def v8i16 : BaseSIMDThreeSameVector<1, U, 0b01, opc, V128, + asm, ".8h", + [(set V128:$Rd, (v8i16 (OpNode (v8i16 V128:$Rn), (v8i16 V128:$Rm))))]>; + def v2i32 : BaseSIMDThreeSameVector<0, U, 0b10, opc, V64, + asm, ".2s", + [(set V64:$Rd, (v2i32 (OpNode (v2i32 V64:$Rn), (v2i32 V64:$Rm))))]>; + def v4i32 : BaseSIMDThreeSameVector<1, U, 0b10, opc, V128, + asm, ".4s", + [(set V128:$Rd, (v4i32 (OpNode (v4i32 V128:$Rn), (v4i32 V128:$Rm))))]>; +} + +multiclass SIMDThreeSameVectorBHSTied opc, string asm, + SDPatternOperator OpNode> { + def v8i8 : BaseSIMDThreeSameVectorTied<0, U, 0b00, opc, V64, + asm, ".8b", + [(set (v8i8 V64:$dst), + (OpNode (v8i8 V64:$Rd), (v8i8 V64:$Rn), (v8i8 V64:$Rm)))]>; + def v16i8 : BaseSIMDThreeSameVectorTied<1, U, 0b00, opc, V128, + asm, ".16b", + [(set (v16i8 V128:$dst), + (OpNode (v16i8 V128:$Rd), (v16i8 V128:$Rn), (v16i8 V128:$Rm)))]>; + def v4i16 : BaseSIMDThreeSameVectorTied<0, U, 0b01, opc, V64, + asm, ".4h", + [(set (v4i16 V64:$dst), + (OpNode (v4i16 V64:$Rd), (v4i16 V64:$Rn), (v4i16 V64:$Rm)))]>; + def v8i16 : BaseSIMDThreeSameVectorTied<1, U, 0b01, opc, V128, + asm, ".8h", + [(set (v8i16 V128:$dst), + (OpNode (v8i16 V128:$Rd), (v8i16 V128:$Rn), (v8i16 V128:$Rm)))]>; + def v2i32 : BaseSIMDThreeSameVectorTied<0, U, 0b10, opc, V64, + asm, ".2s", + [(set (v2i32 V64:$dst), + (OpNode (v2i32 V64:$Rd), (v2i32 V64:$Rn), (v2i32 V64:$Rm)))]>; + def v4i32 : BaseSIMDThreeSameVectorTied<1, U, 0b10, opc, V128, + asm, ".4s", + [(set (v4i32 V128:$dst), + (OpNode (v4i32 V128:$Rd), (v4i32 V128:$Rn), (v4i32 V128:$Rm)))]>; +} + +// As above, but only B sized elements supported. +multiclass SIMDThreeSameVectorB opc, string asm, + SDPatternOperator OpNode> { + def v8i8 : BaseSIMDThreeSameVector<0, U, 0b00, opc, V64, + asm, ".8b", + [(set (v8i8 V64:$Rd), (OpNode (v8i8 V64:$Rn), (v8i8 V64:$Rm)))]>; + def v16i8 : BaseSIMDThreeSameVector<1, U, 0b00, opc, V128, + asm, ".16b", + [(set (v16i8 V128:$Rd), + (OpNode (v16i8 V128:$Rn), (v16i8 V128:$Rm)))]>; +} + +// As above, but only S and D sized floating point elements supported. 
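+// (An illustrative instantiation, with the opcode bits being an assumption:
+//   defm FADD : SIMDThreeSameVectorFP<0, 0, 0b11010, "fadd", fadd>; )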
+multiclass SIMDThreeSameVectorFP opc, + string asm, SDPatternOperator OpNode> { + def v2f32 : BaseSIMDThreeSameVector<0, U, {S,0}, opc, V64, + asm, ".2s", + [(set (v2f32 V64:$Rd), (OpNode (v2f32 V64:$Rn), (v2f32 V64:$Rm)))]>; + def v4f32 : BaseSIMDThreeSameVector<1, U, {S,0}, opc, V128, + asm, ".4s", + [(set (v4f32 V128:$Rd), (OpNode (v4f32 V128:$Rn), (v4f32 V128:$Rm)))]>; + def v2f64 : BaseSIMDThreeSameVector<1, U, {S,1}, opc, V128, + asm, ".2d", + [(set (v2f64 V128:$Rd), (OpNode (v2f64 V128:$Rn), (v2f64 V128:$Rm)))]>; +} + +multiclass SIMDThreeSameVectorFPCmp opc, + string asm, + SDPatternOperator OpNode> { + def v2f32 : BaseSIMDThreeSameVector<0, U, {S,0}, opc, V64, + asm, ".2s", + [(set (v2i32 V64:$Rd), (OpNode (v2f32 V64:$Rn), (v2f32 V64:$Rm)))]>; + def v4f32 : BaseSIMDThreeSameVector<1, U, {S,0}, opc, V128, + asm, ".4s", + [(set (v4i32 V128:$Rd), (OpNode (v4f32 V128:$Rn), (v4f32 V128:$Rm)))]>; + def v2f64 : BaseSIMDThreeSameVector<1, U, {S,1}, opc, V128, + asm, ".2d", + [(set (v2i64 V128:$Rd), (OpNode (v2f64 V128:$Rn), (v2f64 V128:$Rm)))]>; +} + +multiclass SIMDThreeSameVectorFPTied opc, + string asm, SDPatternOperator OpNode> { + def v2f32 : BaseSIMDThreeSameVectorTied<0, U, {S,0}, opc, V64, + asm, ".2s", + [(set (v2f32 V64:$dst), + (OpNode (v2f32 V64:$Rd), (v2f32 V64:$Rn), (v2f32 V64:$Rm)))]>; + def v4f32 : BaseSIMDThreeSameVectorTied<1, U, {S,0}, opc, V128, + asm, ".4s", + [(set (v4f32 V128:$dst), + (OpNode (v4f32 V128:$Rd), (v4f32 V128:$Rn), (v4f32 V128:$Rm)))]>; + def v2f64 : BaseSIMDThreeSameVectorTied<1, U, {S,1}, opc, V128, + asm, ".2d", + [(set (v2f64 V128:$dst), + (OpNode (v2f64 V128:$Rd), (v2f64 V128:$Rn), (v2f64 V128:$Rm)))]>; +} + +// As above, but D and B sized elements unsupported. +multiclass SIMDThreeSameVectorHS opc, string asm, + SDPatternOperator OpNode> { + def v4i16 : BaseSIMDThreeSameVector<0, U, 0b01, opc, V64, + asm, ".4h", + [(set (v4i16 V64:$Rd), (OpNode (v4i16 V64:$Rn), (v4i16 V64:$Rm)))]>; + def v8i16 : BaseSIMDThreeSameVector<1, U, 0b01, opc, V128, + asm, ".8h", + [(set (v8i16 V128:$Rd), (OpNode (v8i16 V128:$Rn), (v8i16 V128:$Rm)))]>; + def v2i32 : BaseSIMDThreeSameVector<0, U, 0b10, opc, V64, + asm, ".2s", + [(set (v2i32 V64:$Rd), (OpNode (v2i32 V64:$Rn), (v2i32 V64:$Rm)))]>; + def v4i32 : BaseSIMDThreeSameVector<1, U, 0b10, opc, V128, + asm, ".4s", + [(set (v4i32 V128:$Rd), (OpNode (v4i32 V128:$Rn), (v4i32 V128:$Rm)))]>; +} + +// Logical three vector ops share opcode bits, and only use B sized elements. 
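+// They are distinguished by the U bit and the size field instead; illustrative
+// instantiations, with assumed field values, would be:
+//   defm AND : SIMDLogicalThreeVector<0, 0b00, "and", and>;
+//   defm ORR : SIMDLogicalThreeVector<0, 0b10, "orr", or>;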
+multiclass SIMDLogicalThreeVector size, string asm, + SDPatternOperator OpNode = null_frag> { + def v8i8 : BaseSIMDThreeSameVector<0, U, size, 0b00011, V64, + asm, ".8b", + [(set (v8i8 V64:$Rd), (OpNode V64:$Rn, V64:$Rm))]>; + def v16i8 : BaseSIMDThreeSameVector<1, U, size, 0b00011, V128, + asm, ".16b", + [(set (v16i8 V128:$Rd), (OpNode V128:$Rn, V128:$Rm))]>; + + def : Pat<(v4i16 (OpNode V64:$LHS, V64:$RHS)), + (!cast(NAME#"v8i8") V64:$LHS, V64:$RHS)>; + def : Pat<(v2i32 (OpNode V64:$LHS, V64:$RHS)), + (!cast(NAME#"v8i8") V64:$LHS, V64:$RHS)>; + def : Pat<(v1i64 (OpNode V64:$LHS, V64:$RHS)), + (!cast(NAME#"v8i8") V64:$LHS, V64:$RHS)>; + + def : Pat<(v8i16 (OpNode V128:$LHS, V128:$RHS)), + (!cast(NAME#"v16i8") V128:$LHS, V128:$RHS)>; + def : Pat<(v4i32 (OpNode V128:$LHS, V128:$RHS)), + (!cast(NAME#"v16i8") V128:$LHS, V128:$RHS)>; + def : Pat<(v2i64 (OpNode V128:$LHS, V128:$RHS)), + (!cast(NAME#"v16i8") V128:$LHS, V128:$RHS)>; +} + +multiclass SIMDLogicalThreeVectorTied size, + string asm, SDPatternOperator OpNode> { + def v8i8 : BaseSIMDThreeSameVectorTied<0, U, size, 0b00011, V64, + asm, ".8b", + [(set (v8i8 V64:$dst), + (OpNode (v8i8 V64:$Rd), (v8i8 V64:$Rn), (v8i8 V64:$Rm)))]>; + def v16i8 : BaseSIMDThreeSameVectorTied<1, U, size, 0b00011, V128, + asm, ".16b", + [(set (v16i8 V128:$dst), + (OpNode (v16i8 V128:$Rd), (v16i8 V128:$Rn), + (v16i8 V128:$Rm)))]>; + + def : Pat<(v4i16 (OpNode (v4i16 V64:$LHS), (v4i16 V64:$MHS), + (v4i16 V64:$RHS))), + (!cast(NAME#"v8i8") + V64:$LHS, V64:$MHS, V64:$RHS)>; + def : Pat<(v2i32 (OpNode (v2i32 V64:$LHS), (v2i32 V64:$MHS), + (v2i32 V64:$RHS))), + (!cast(NAME#"v8i8") + V64:$LHS, V64:$MHS, V64:$RHS)>; + def : Pat<(v1i64 (OpNode (v1i64 V64:$LHS), (v1i64 V64:$MHS), + (v1i64 V64:$RHS))), + (!cast(NAME#"v8i8") + V64:$LHS, V64:$MHS, V64:$RHS)>; + + def : Pat<(v8i16 (OpNode (v8i16 V128:$LHS), (v8i16 V128:$MHS), + (v8i16 V128:$RHS))), + (!cast(NAME#"v16i8") + V128:$LHS, V128:$MHS, V128:$RHS)>; + def : Pat<(v4i32 (OpNode (v4i32 V128:$LHS), (v4i32 V128:$MHS), + (v4i32 V128:$RHS))), + (!cast(NAME#"v16i8") + V128:$LHS, V128:$MHS, V128:$RHS)>; + def : Pat<(v2i64 (OpNode (v2i64 V128:$LHS), (v2i64 V128:$MHS), + (v2i64 V128:$RHS))), + (!cast(NAME#"v16i8") + V128:$LHS, V128:$MHS, V128:$RHS)>; +} + + +//---------------------------------------------------------------------------- +// AdvSIMD two register vector instructions. 
+//---------------------------------------------------------------------------- + +let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in +class BaseSIMDTwoSameVector size, bits<5> opcode, + RegisterOperand regtype, string asm, string dstkind, + string srckind, list pattern> + : I<(outs regtype:$Rd), (ins regtype:$Rn), asm, + "{\t$Rd" # dstkind # ", $Rn" # srckind # + "|" # dstkind # "\t$Rd, $Rn}", "", pattern>, + Sched<[WriteV]> { + bits<5> Rd; + bits<5> Rn; + let Inst{31} = 0; + let Inst{30} = Q; + let Inst{29} = U; + let Inst{28-24} = 0b01110; + let Inst{23-22} = size; + let Inst{21-17} = 0b10000; + let Inst{16-12} = opcode; + let Inst{11-10} = 0b10; + let Inst{9-5} = Rn; + let Inst{4-0} = Rd; +} + +let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in +class BaseSIMDTwoSameVectorTied size, bits<5> opcode, + RegisterOperand regtype, string asm, string dstkind, + string srckind, list pattern> + : I<(outs regtype:$dst), (ins regtype:$Rd, regtype:$Rn), asm, + "{\t$Rd" # dstkind # ", $Rn" # srckind # + "|" # dstkind # "\t$Rd, $Rn}", "$Rd = $dst", pattern>, + Sched<[WriteV]> { + bits<5> Rd; + bits<5> Rn; + let Inst{31} = 0; + let Inst{30} = Q; + let Inst{29} = U; + let Inst{28-24} = 0b01110; + let Inst{23-22} = size; + let Inst{21-17} = 0b10000; + let Inst{16-12} = opcode; + let Inst{11-10} = 0b10; + let Inst{9-5} = Rn; + let Inst{4-0} = Rd; +} + +// Supports B, H, and S element sizes. +multiclass SIMDTwoVectorBHS opc, string asm, + SDPatternOperator OpNode> { + def v8i8 : BaseSIMDTwoSameVector<0, U, 0b00, opc, V64, + asm, ".8b", ".8b", + [(set (v8i8 V64:$Rd), (OpNode (v8i8 V64:$Rn)))]>; + def v16i8 : BaseSIMDTwoSameVector<1, U, 0b00, opc, V128, + asm, ".16b", ".16b", + [(set (v16i8 V128:$Rd), (OpNode (v16i8 V128:$Rn)))]>; + def v4i16 : BaseSIMDTwoSameVector<0, U, 0b01, opc, V64, + asm, ".4h", ".4h", + [(set (v4i16 V64:$Rd), (OpNode (v4i16 V64:$Rn)))]>; + def v8i16 : BaseSIMDTwoSameVector<1, U, 0b01, opc, V128, + asm, ".8h", ".8h", + [(set (v8i16 V128:$Rd), (OpNode (v8i16 V128:$Rn)))]>; + def v2i32 : BaseSIMDTwoSameVector<0, U, 0b10, opc, V64, + asm, ".2s", ".2s", + [(set (v2i32 V64:$Rd), (OpNode (v2i32 V64:$Rn)))]>; + def v4i32 : BaseSIMDTwoSameVector<1, U, 0b10, opc, V128, + asm, ".4s", ".4s", + [(set (v4i32 V128:$Rd), (OpNode (v4i32 V128:$Rn)))]>; +} + +class BaseSIMDVectorLShiftLongBySize size, + RegisterOperand regtype, string asm, string dstkind, + string srckind, string amount> + : I<(outs V128:$Rd), (ins regtype:$Rn), asm, + "{\t$Rd" # dstkind # ", $Rn" # srckind # ", #" # amount # + "|" # dstkind # "\t$Rd, $Rn, #" # amount # "}", "", []>, + Sched<[WriteV]> { + bits<5> Rd; + bits<5> Rn; + let Inst{31} = 0; + let Inst{30} = Q; + let Inst{29-24} = 0b101110; + let Inst{23-22} = size; + let Inst{21-10} = 0b100001001110; + let Inst{9-5} = Rn; + let Inst{4-0} = Rd; +} + +multiclass SIMDVectorLShiftLongBySizeBHS { + let neverHasSideEffects = 1 in { + def v8i8 : BaseSIMDVectorLShiftLongBySize<0, 0b00, V64, + "shll", ".8h", ".8b", "8">; + def v16i8 : BaseSIMDVectorLShiftLongBySize<1, 0b00, V128, + "shll2", ".8h", ".16b", "8">; + def v4i16 : BaseSIMDVectorLShiftLongBySize<0, 0b01, V64, + "shll", ".4s", ".4h", "16">; + def v8i16 : BaseSIMDVectorLShiftLongBySize<1, 0b01, V128, + "shll2", ".4s", ".8h", "16">; + def v2i32 : BaseSIMDVectorLShiftLongBySize<0, 0b10, V64, + "shll", ".2d", ".2s", "32">; + def v4i32 : BaseSIMDVectorLShiftLongBySize<1, 0b10, V128, + "shll2", ".2d", ".4s", "32">; + } +} + +// Supports all element sizes. 
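+// (An illustrative, assumed instantiation for a widening pairwise form:
+//   defm SADDLP : SIMDLongTwoVector<0, 0b00010, "saddlp",
+//                                   int_aarch64_neon_saddlp>; )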
+multiclass SIMDLongTwoVector opc, string asm, + SDPatternOperator OpNode> { + def v8i8_v4i16 : BaseSIMDTwoSameVector<0, U, 0b00, opc, V64, + asm, ".4h", ".8b", + [(set (v4i16 V64:$Rd), (OpNode (v8i8 V64:$Rn)))]>; + def v16i8_v8i16 : BaseSIMDTwoSameVector<1, U, 0b00, opc, V128, + asm, ".8h", ".16b", + [(set (v8i16 V128:$Rd), (OpNode (v16i8 V128:$Rn)))]>; + def v4i16_v2i32 : BaseSIMDTwoSameVector<0, U, 0b01, opc, V64, + asm, ".2s", ".4h", + [(set (v2i32 V64:$Rd), (OpNode (v4i16 V64:$Rn)))]>; + def v8i16_v4i32 : BaseSIMDTwoSameVector<1, U, 0b01, opc, V128, + asm, ".4s", ".8h", + [(set (v4i32 V128:$Rd), (OpNode (v8i16 V128:$Rn)))]>; + def v2i32_v1i64 : BaseSIMDTwoSameVector<0, U, 0b10, opc, V64, + asm, ".1d", ".2s", + [(set (v1i64 V64:$Rd), (OpNode (v2i32 V64:$Rn)))]>; + def v4i32_v2i64 : BaseSIMDTwoSameVector<1, U, 0b10, opc, V128, + asm, ".2d", ".4s", + [(set (v2i64 V128:$Rd), (OpNode (v4i32 V128:$Rn)))]>; +} + +multiclass SIMDLongTwoVectorTied opc, string asm, + SDPatternOperator OpNode> { + def v8i8_v4i16 : BaseSIMDTwoSameVectorTied<0, U, 0b00, opc, V64, + asm, ".4h", ".8b", + [(set (v4i16 V64:$dst), (OpNode (v4i16 V64:$Rd), + (v8i8 V64:$Rn)))]>; + def v16i8_v8i16 : BaseSIMDTwoSameVectorTied<1, U, 0b00, opc, V128, + asm, ".8h", ".16b", + [(set (v8i16 V128:$dst), (OpNode (v8i16 V128:$Rd), + (v16i8 V128:$Rn)))]>; + def v4i16_v2i32 : BaseSIMDTwoSameVectorTied<0, U, 0b01, opc, V64, + asm, ".2s", ".4h", + [(set (v2i32 V64:$dst), (OpNode (v2i32 V64:$Rd), + (v4i16 V64:$Rn)))]>; + def v8i16_v4i32 : BaseSIMDTwoSameVectorTied<1, U, 0b01, opc, V128, + asm, ".4s", ".8h", + [(set (v4i32 V128:$dst), (OpNode (v4i32 V128:$Rd), + (v8i16 V128:$Rn)))]>; + def v2i32_v1i64 : BaseSIMDTwoSameVectorTied<0, U, 0b10, opc, V64, + asm, ".1d", ".2s", + [(set (v1i64 V64:$dst), (OpNode (v1i64 V64:$Rd), + (v2i32 V64:$Rn)))]>; + def v4i32_v2i64 : BaseSIMDTwoSameVectorTied<1, U, 0b10, opc, V128, + asm, ".2d", ".4s", + [(set (v2i64 V128:$dst), (OpNode (v2i64 V128:$Rd), + (v4i32 V128:$Rn)))]>; +} + +// Supports all element sizes, except 1xD. 
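+// (These are accumulating forms, so $Rd is both a source and the destination;
+// an illustrative, assumed instantiation:
+//   defm SUQADD : SIMDTwoVectorBHSDTied<0, 0b00011, "suqadd",
+//                                       int_aarch64_neon_suqadd>; )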
+multiclass SIMDTwoVectorBHSDTied opc, string asm, + SDPatternOperator OpNode> { + def v8i8 : BaseSIMDTwoSameVectorTied<0, U, 0b00, opc, V64, + asm, ".8b", ".8b", + [(set (v8i8 V64:$dst), (OpNode (v8i8 V64:$Rd), (v8i8 V64:$Rn)))]>; + def v16i8 : BaseSIMDTwoSameVectorTied<1, U, 0b00, opc, V128, + asm, ".16b", ".16b", + [(set (v16i8 V128:$dst), (OpNode (v16i8 V128:$Rd), (v16i8 V128:$Rn)))]>; + def v4i16 : BaseSIMDTwoSameVectorTied<0, U, 0b01, opc, V64, + asm, ".4h", ".4h", + [(set (v4i16 V64:$dst), (OpNode (v4i16 V64:$Rd), (v4i16 V64:$Rn)))]>; + def v8i16 : BaseSIMDTwoSameVectorTied<1, U, 0b01, opc, V128, + asm, ".8h", ".8h", + [(set (v8i16 V128:$dst), (OpNode (v8i16 V128:$Rd), (v8i16 V128:$Rn)))]>; + def v2i32 : BaseSIMDTwoSameVectorTied<0, U, 0b10, opc, V64, + asm, ".2s", ".2s", + [(set (v2i32 V64:$dst), (OpNode (v2i32 V64:$Rd), (v2i32 V64:$Rn)))]>; + def v4i32 : BaseSIMDTwoSameVectorTied<1, U, 0b10, opc, V128, + asm, ".4s", ".4s", + [(set (v4i32 V128:$dst), (OpNode (v4i32 V128:$Rd), (v4i32 V128:$Rn)))]>; + def v2i64 : BaseSIMDTwoSameVectorTied<1, U, 0b11, opc, V128, + asm, ".2d", ".2d", + [(set (v2i64 V128:$dst), (OpNode (v2i64 V128:$Rd), (v2i64 V128:$Rn)))]>; +} + +multiclass SIMDTwoVectorBHSD opc, string asm, + SDPatternOperator OpNode = null_frag> { + def v8i8 : BaseSIMDTwoSameVector<0, U, 0b00, opc, V64, + asm, ".8b", ".8b", + [(set (v8i8 V64:$Rd), (OpNode (v8i8 V64:$Rn)))]>; + def v16i8 : BaseSIMDTwoSameVector<1, U, 0b00, opc, V128, + asm, ".16b", ".16b", + [(set (v16i8 V128:$Rd), (OpNode (v16i8 V128:$Rn)))]>; + def v4i16 : BaseSIMDTwoSameVector<0, U, 0b01, opc, V64, + asm, ".4h", ".4h", + [(set (v4i16 V64:$Rd), (OpNode (v4i16 V64:$Rn)))]>; + def v8i16 : BaseSIMDTwoSameVector<1, U, 0b01, opc, V128, + asm, ".8h", ".8h", + [(set (v8i16 V128:$Rd), (OpNode (v8i16 V128:$Rn)))]>; + def v2i32 : BaseSIMDTwoSameVector<0, U, 0b10, opc, V64, + asm, ".2s", ".2s", + [(set (v2i32 V64:$Rd), (OpNode (v2i32 V64:$Rn)))]>; + def v4i32 : BaseSIMDTwoSameVector<1, U, 0b10, opc, V128, + asm, ".4s", ".4s", + [(set (v4i32 V128:$Rd), (OpNode (v4i32 V128:$Rn)))]>; + def v2i64 : BaseSIMDTwoSameVector<1, U, 0b11, opc, V128, + asm, ".2d", ".2d", + [(set (v2i64 V128:$Rd), (OpNode (v2i64 V128:$Rn)))]>; +} + + +// Supports only B element sizes. +multiclass SIMDTwoVectorB size, bits<5> opc, string asm, + SDPatternOperator OpNode> { + def v8i8 : BaseSIMDTwoSameVector<0, U, size, opc, V64, + asm, ".8b", ".8b", + [(set (v8i8 V64:$Rd), (OpNode (v8i8 V64:$Rn)))]>; + def v16i8 : BaseSIMDTwoSameVector<1, U, size, opc, V128, + asm, ".16b", ".16b", + [(set (v16i8 V128:$Rd), (OpNode (v16i8 V128:$Rn)))]>; + +} + +// Supports only B and H element sizes. +multiclass SIMDTwoVectorBH opc, string asm, + SDPatternOperator OpNode> { + def v8i8 : BaseSIMDTwoSameVector<0, U, 0b00, opc, V64, + asm, ".8b", ".8b", + [(set (v8i8 V64:$Rd), (OpNode V64:$Rn))]>; + def v16i8 : BaseSIMDTwoSameVector<1, U, 0b00, opc, V128, + asm, ".16b", ".16b", + [(set (v16i8 V128:$Rd), (OpNode V128:$Rn))]>; + def v4i16 : BaseSIMDTwoSameVector<0, U, 0b01, opc, V64, + asm, ".4h", ".4h", + [(set (v4i16 V64:$Rd), (OpNode V64:$Rn))]>; + def v8i16 : BaseSIMDTwoSameVector<1, U, 0b01, opc, V128, + asm, ".8h", ".8h", + [(set (v8i16 V128:$Rd), (OpNode V128:$Rn))]>; +} + +// Supports only S and D element sizes, uses high bit of the size field +// as an extra opcode bit. 
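+// e.g. (illustrative; the field values are assumptions):
+//   defm FABS : SIMDTwoVectorFP<0, 1, 0b01111, "fabs", fabs>;
+//   defm FNEG : SIMDTwoVectorFP<1, 1, 0b01111, "fneg", fneg>;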
+multiclass SIMDTwoVectorFP opc, string asm, + SDPatternOperator OpNode> { + def v2f32 : BaseSIMDTwoSameVector<0, U, {S,0}, opc, V64, + asm, ".2s", ".2s", + [(set (v2f32 V64:$Rd), (OpNode (v2f32 V64:$Rn)))]>; + def v4f32 : BaseSIMDTwoSameVector<1, U, {S,0}, opc, V128, + asm, ".4s", ".4s", + [(set (v4f32 V128:$Rd), (OpNode (v4f32 V128:$Rn)))]>; + def v2f64 : BaseSIMDTwoSameVector<1, U, {S,1}, opc, V128, + asm, ".2d", ".2d", + [(set (v2f64 V128:$Rd), (OpNode (v2f64 V128:$Rn)))]>; +} + +// Supports only S element size. +multiclass SIMDTwoVectorS opc, string asm, + SDPatternOperator OpNode> { + def v2i32 : BaseSIMDTwoSameVector<0, U, {S,0}, opc, V64, + asm, ".2s", ".2s", + [(set (v2i32 V64:$Rd), (OpNode (v2i32 V64:$Rn)))]>; + def v4i32 : BaseSIMDTwoSameVector<1, U, {S,0}, opc, V128, + asm, ".4s", ".4s", + [(set (v4i32 V128:$Rd), (OpNode (v4i32 V128:$Rn)))]>; +} + + +multiclass SIMDTwoVectorFPToInt opc, string asm, + SDPatternOperator OpNode> { + def v2f32 : BaseSIMDTwoSameVector<0, U, {S,0}, opc, V64, + asm, ".2s", ".2s", + [(set (v2i32 V64:$Rd), (OpNode (v2f32 V64:$Rn)))]>; + def v4f32 : BaseSIMDTwoSameVector<1, U, {S,0}, opc, V128, + asm, ".4s", ".4s", + [(set (v4i32 V128:$Rd), (OpNode (v4f32 V128:$Rn)))]>; + def v2f64 : BaseSIMDTwoSameVector<1, U, {S,1}, opc, V128, + asm, ".2d", ".2d", + [(set (v2i64 V128:$Rd), (OpNode (v2f64 V128:$Rn)))]>; +} + +multiclass SIMDTwoVectorIntToFP opc, string asm, + SDPatternOperator OpNode> { + def v2f32 : BaseSIMDTwoSameVector<0, U, {S,0}, opc, V64, + asm, ".2s", ".2s", + [(set (v2f32 V64:$Rd), (OpNode (v2i32 V64:$Rn)))]>; + def v4f32 : BaseSIMDTwoSameVector<1, U, {S,0}, opc, V128, + asm, ".4s", ".4s", + [(set (v4f32 V128:$Rd), (OpNode (v4i32 V128:$Rn)))]>; + def v2f64 : BaseSIMDTwoSameVector<1, U, {S,1}, opc, V128, + asm, ".2d", ".2d", + [(set (v2f64 V128:$Rd), (OpNode (v2i64 V128:$Rn)))]>; +} + + +class BaseSIMDMixedTwoVector size, bits<5> opcode, + RegisterOperand inreg, RegisterOperand outreg, + string asm, string outkind, string inkind, + list pattern> + : I<(outs outreg:$Rd), (ins inreg:$Rn), asm, + "{\t$Rd" # outkind # ", $Rn" # inkind # + "|" # outkind # "\t$Rd, $Rn}", "", pattern>, + Sched<[WriteV]> { + bits<5> Rd; + bits<5> Rn; + let Inst{31} = 0; + let Inst{30} = Q; + let Inst{29} = U; + let Inst{28-24} = 0b01110; + let Inst{23-22} = size; + let Inst{21-17} = 0b10000; + let Inst{16-12} = opcode; + let Inst{11-10} = 0b10; + let Inst{9-5} = Rn; + let Inst{4-0} = Rd; +} + +class BaseSIMDMixedTwoVectorTied size, bits<5> opcode, + RegisterOperand inreg, RegisterOperand outreg, + string asm, string outkind, string inkind, + list pattern> + : I<(outs outreg:$dst), (ins outreg:$Rd, inreg:$Rn), asm, + "{\t$Rd" # outkind # ", $Rn" # inkind # + "|" # outkind # "\t$Rd, $Rn}", "$Rd = $dst", pattern>, + Sched<[WriteV]> { + bits<5> Rd; + bits<5> Rn; + let Inst{31} = 0; + let Inst{30} = Q; + let Inst{29} = U; + let Inst{28-24} = 0b01110; + let Inst{23-22} = size; + let Inst{21-17} = 0b10000; + let Inst{16-12} = opcode; + let Inst{11-10} = 0b10; + let Inst{9-5} = Rn; + let Inst{4-0} = Rd; +} + +multiclass SIMDMixedTwoVector opc, string asm, + SDPatternOperator OpNode> { + def v8i8 : BaseSIMDMixedTwoVector<0, U, 0b00, opc, V128, V64, + asm, ".8b", ".8h", + [(set (v8i8 V64:$Rd), (OpNode (v8i16 V128:$Rn)))]>; + def v16i8 : BaseSIMDMixedTwoVectorTied<1, U, 0b00, opc, V128, V128, + asm#"2", ".16b", ".8h", []>; + def v4i16 : BaseSIMDMixedTwoVector<0, U, 0b01, opc, V128, V64, + asm, ".4h", ".4s", + [(set (v4i16 V64:$Rd), (OpNode (v4i32 V128:$Rn)))]>; + def v8i16 : 
BaseSIMDMixedTwoVectorTied<1, U, 0b01, opc, V128, V128, + asm#"2", ".8h", ".4s", []>; + def v2i32 : BaseSIMDMixedTwoVector<0, U, 0b10, opc, V128, V64, + asm, ".2s", ".2d", + [(set (v2i32 V64:$Rd), (OpNode (v2i64 V128:$Rn)))]>; + def v4i32 : BaseSIMDMixedTwoVectorTied<1, U, 0b10, opc, V128, V128, + asm#"2", ".4s", ".2d", []>; + + def : Pat<(concat_vectors (v8i8 V64:$Rd), (OpNode (v8i16 V128:$Rn))), + (!cast(NAME # "v16i8") + (INSERT_SUBREG (IMPLICIT_DEF), V64:$Rd, dsub), V128:$Rn)>; + def : Pat<(concat_vectors (v4i16 V64:$Rd), (OpNode (v4i32 V128:$Rn))), + (!cast(NAME # "v8i16") + (INSERT_SUBREG (IMPLICIT_DEF), V64:$Rd, dsub), V128:$Rn)>; + def : Pat<(concat_vectors (v2i32 V64:$Rd), (OpNode (v2i64 V128:$Rn))), + (!cast(NAME # "v4i32") + (INSERT_SUBREG (IMPLICIT_DEF), V64:$Rd, dsub), V128:$Rn)>; +} + +class BaseSIMDCmpTwoVector size, bits<5> opcode, + RegisterOperand regtype, + string asm, string kind, string zero, + ValueType dty, ValueType sty, SDNode OpNode> + : I<(outs regtype:$Rd), (ins regtype:$Rn), asm, + "{\t$Rd" # kind # ", $Rn" # kind # ", #" # zero # + "|" # kind # "\t$Rd, $Rn, #" # zero # "}", "", + [(set (dty regtype:$Rd), (OpNode (sty regtype:$Rn)))]>, + Sched<[WriteV]> { + bits<5> Rd; + bits<5> Rn; + let Inst{31} = 0; + let Inst{30} = Q; + let Inst{29} = U; + let Inst{28-24} = 0b01110; + let Inst{23-22} = size; + let Inst{21-17} = 0b10000; + let Inst{16-12} = opcode; + let Inst{11-10} = 0b10; + let Inst{9-5} = Rn; + let Inst{4-0} = Rd; +} + +// Comparisons support all element sizes, except 1xD. +multiclass SIMDCmpTwoVector opc, string asm, + SDNode OpNode> { + def v8i8rz : BaseSIMDCmpTwoVector<0, U, 0b00, opc, V64, + asm, ".8b", "0", + v8i8, v8i8, OpNode>; + def v16i8rz : BaseSIMDCmpTwoVector<1, U, 0b00, opc, V128, + asm, ".16b", "0", + v16i8, v16i8, OpNode>; + def v4i16rz : BaseSIMDCmpTwoVector<0, U, 0b01, opc, V64, + asm, ".4h", "0", + v4i16, v4i16, OpNode>; + def v8i16rz : BaseSIMDCmpTwoVector<1, U, 0b01, opc, V128, + asm, ".8h", "0", + v8i16, v8i16, OpNode>; + def v2i32rz : BaseSIMDCmpTwoVector<0, U, 0b10, opc, V64, + asm, ".2s", "0", + v2i32, v2i32, OpNode>; + def v4i32rz : BaseSIMDCmpTwoVector<1, U, 0b10, opc, V128, + asm, ".4s", "0", + v4i32, v4i32, OpNode>; + def v2i64rz : BaseSIMDCmpTwoVector<1, U, 0b11, opc, V128, + asm, ".2d", "0", + v2i64, v2i64, OpNode>; +} + +// FP Comparisons support only S and D element sizes. 
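+// (These are the compare-against-zero forms; an illustrative instantiation,
+// with assumed opcode bits and node name:
+//   defm FCMEQ : SIMDFPCmpTwoVector<0, 1, 0b01101, "fcmeq", AArch64fcmeqz>; )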
+multiclass SIMDFPCmpTwoVector opc, + string asm, SDNode OpNode> { + + def v2i32rz : BaseSIMDCmpTwoVector<0, U, {S,0}, opc, V64, + asm, ".2s", "0.0", + v2i32, v2f32, OpNode>; + def v4i32rz : BaseSIMDCmpTwoVector<1, U, {S,0}, opc, V128, + asm, ".4s", "0.0", + v4i32, v4f32, OpNode>; + def v2i64rz : BaseSIMDCmpTwoVector<1, U, {S,1}, opc, V128, + asm, ".2d", "0.0", + v2i64, v2f64, OpNode>; + + def : InstAlias(NAME # v2i32rz) V64:$Vd, V64:$Vn), 0>; + def : InstAlias(NAME # v4i32rz) V128:$Vd, V128:$Vn), 0>; + def : InstAlias(NAME # v2i64rz) V128:$Vd, V128:$Vn), 0>; + def : InstAlias(NAME # v2i32rz) V64:$Vd, V64:$Vn), 0>; + def : InstAlias(NAME # v4i32rz) V128:$Vd, V128:$Vn), 0>; + def : InstAlias(NAME # v2i64rz) V128:$Vd, V128:$Vn), 0>; +} + +let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in +class BaseSIMDFPCvtTwoVector size, bits<5> opcode, + RegisterOperand outtype, RegisterOperand intype, + string asm, string VdTy, string VnTy, + list pattern> + : I<(outs outtype:$Rd), (ins intype:$Rn), asm, + !strconcat("\t$Rd", VdTy, ", $Rn", VnTy), "", pattern>, + Sched<[WriteV]> { + bits<5> Rd; + bits<5> Rn; + let Inst{31} = 0; + let Inst{30} = Q; + let Inst{29} = U; + let Inst{28-24} = 0b01110; + let Inst{23-22} = size; + let Inst{21-17} = 0b10000; + let Inst{16-12} = opcode; + let Inst{11-10} = 0b10; + let Inst{9-5} = Rn; + let Inst{4-0} = Rd; +} + +class BaseSIMDFPCvtTwoVectorTied size, bits<5> opcode, + RegisterOperand outtype, RegisterOperand intype, + string asm, string VdTy, string VnTy, + list pattern> + : I<(outs outtype:$dst), (ins outtype:$Rd, intype:$Rn), asm, + !strconcat("\t$Rd", VdTy, ", $Rn", VnTy), "$Rd = $dst", pattern>, + Sched<[WriteV]> { + bits<5> Rd; + bits<5> Rn; + let Inst{31} = 0; + let Inst{30} = Q; + let Inst{29} = U; + let Inst{28-24} = 0b01110; + let Inst{23-22} = size; + let Inst{21-17} = 0b10000; + let Inst{16-12} = opcode; + let Inst{11-10} = 0b10; + let Inst{9-5} = Rn; + let Inst{4-0} = Rd; +} + +multiclass SIMDFPWidenTwoVector opc, string asm> { + def v4i16 : BaseSIMDFPCvtTwoVector<0, U, {S,0}, opc, V128, V64, + asm, ".4s", ".4h", []>; + def v8i16 : BaseSIMDFPCvtTwoVector<1, U, {S,0}, opc, V128, V128, + asm#"2", ".4s", ".8h", []>; + def v2i32 : BaseSIMDFPCvtTwoVector<0, U, {S,1}, opc, V128, V64, + asm, ".2d", ".2s", []>; + def v4i32 : BaseSIMDFPCvtTwoVector<1, U, {S,1}, opc, V128, V128, + asm#"2", ".2d", ".4s", []>; +} + +multiclass SIMDFPNarrowTwoVector opc, string asm> { + def v4i16 : BaseSIMDFPCvtTwoVector<0, U, {S,0}, opc, V64, V128, + asm, ".4h", ".4s", []>; + def v8i16 : BaseSIMDFPCvtTwoVectorTied<1, U, {S,0}, opc, V128, V128, + asm#"2", ".8h", ".4s", []>; + def v2i32 : BaseSIMDFPCvtTwoVector<0, U, {S,1}, opc, V64, V128, + asm, ".2s", ".2d", []>; + def v4i32 : BaseSIMDFPCvtTwoVectorTied<1, U, {S,1}, opc, V128, V128, + asm#"2", ".4s", ".2d", []>; +} + +multiclass SIMDFPInexactCvtTwoVector opc, string asm, + Intrinsic OpNode> { + def v2f32 : BaseSIMDFPCvtTwoVector<0, U, {S,1}, opc, V64, V128, + asm, ".2s", ".2d", + [(set (v2f32 V64:$Rd), (OpNode (v2f64 V128:$Rn)))]>; + def v4f32 : BaseSIMDFPCvtTwoVectorTied<1, U, {S,1}, opc, V128, V128, + asm#"2", ".4s", ".2d", []>; + + def : Pat<(concat_vectors (v2f32 V64:$Rd), (OpNode (v2f64 V128:$Rn))), + (!cast(NAME # "v4f32") + (INSERT_SUBREG (IMPLICIT_DEF), V64:$Rd, dsub), V128:$Rn)>; +} + +//---------------------------------------------------------------------------- +// AdvSIMD three register different-size vector instructions. 
+//---------------------------------------------------------------------------- + +let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in +class BaseSIMDDifferentThreeVector size, bits<4> opcode, + RegisterOperand outtype, RegisterOperand intype1, + RegisterOperand intype2, string asm, + string outkind, string inkind1, string inkind2, + list pattern> + : I<(outs outtype:$Rd), (ins intype1:$Rn, intype2:$Rm), asm, + "{\t$Rd" # outkind # ", $Rn" # inkind1 # ", $Rm" # inkind2 # + "|" # outkind # "\t$Rd, $Rn, $Rm}", "", pattern>, + Sched<[WriteV]> { + bits<5> Rd; + bits<5> Rn; + bits<5> Rm; + let Inst{31} = 0; + let Inst{30} = size{0}; + let Inst{29} = U; + let Inst{28-24} = 0b01110; + let Inst{23-22} = size{2-1}; + let Inst{21} = 1; + let Inst{20-16} = Rm; + let Inst{15-12} = opcode; + let Inst{11-10} = 0b00; + let Inst{9-5} = Rn; + let Inst{4-0} = Rd; +} + +let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in +class BaseSIMDDifferentThreeVectorTied size, bits<4> opcode, + RegisterOperand outtype, RegisterOperand intype1, + RegisterOperand intype2, string asm, + string outkind, string inkind1, string inkind2, + list pattern> + : I<(outs outtype:$dst), (ins outtype:$Rd, intype1:$Rn, intype2:$Rm), asm, + "{\t$Rd" # outkind # ", $Rn" # inkind1 # ", $Rm" # inkind2 # + "|" # outkind # "\t$Rd, $Rn, $Rm}", "$Rd = $dst", pattern>, + Sched<[WriteV]> { + bits<5> Rd; + bits<5> Rn; + bits<5> Rm; + let Inst{31} = 0; + let Inst{30} = size{0}; + let Inst{29} = U; + let Inst{28-24} = 0b01110; + let Inst{23-22} = size{2-1}; + let Inst{21} = 1; + let Inst{20-16} = Rm; + let Inst{15-12} = opcode; + let Inst{11-10} = 0b00; + let Inst{9-5} = Rn; + let Inst{4-0} = Rd; +} + +// FIXME: TableGen doesn't know how to deal with expanded types that also +// change the element count (in this case, placing the results in +// the high elements of the result register rather than the low +// elements). Until that's fixed, we can't code-gen those. +multiclass SIMDNarrowThreeVectorBHS opc, string asm, + Intrinsic IntOp> { + def v8i16_v8i8 : BaseSIMDDifferentThreeVector; + def v8i16_v16i8 : BaseSIMDDifferentThreeVectorTied; + def v4i32_v4i16 : BaseSIMDDifferentThreeVector; + def v4i32_v8i16 : BaseSIMDDifferentThreeVectorTied; + def v2i64_v2i32 : BaseSIMDDifferentThreeVector; + def v2i64_v4i32 : BaseSIMDDifferentThreeVectorTied; + + + // Patterns for the '2' variants involve INSERT_SUBREG, which you can't put in + // a version attached to an instruction. 
+ def : Pat<(concat_vectors (v8i8 V64:$Rd), (IntOp (v8i16 V128:$Rn), + (v8i16 V128:$Rm))), + (!cast(NAME # "v8i16_v16i8") + (INSERT_SUBREG (IMPLICIT_DEF), V64:$Rd, dsub), + V128:$Rn, V128:$Rm)>; + def : Pat<(concat_vectors (v4i16 V64:$Rd), (IntOp (v4i32 V128:$Rn), + (v4i32 V128:$Rm))), + (!cast(NAME # "v4i32_v8i16") + (INSERT_SUBREG (IMPLICIT_DEF), V64:$Rd, dsub), + V128:$Rn, V128:$Rm)>; + def : Pat<(concat_vectors (v2i32 V64:$Rd), (IntOp (v2i64 V128:$Rn), + (v2i64 V128:$Rm))), + (!cast(NAME # "v2i64_v4i32") + (INSERT_SUBREG (IMPLICIT_DEF), V64:$Rd, dsub), + V128:$Rn, V128:$Rm)>; +} + +multiclass SIMDDifferentThreeVectorBD opc, string asm, + Intrinsic IntOp> { + def v8i8 : BaseSIMDDifferentThreeVector; + def v16i8 : BaseSIMDDifferentThreeVector; + let Predicates = [HasCrypto] in { + def v1i64 : BaseSIMDDifferentThreeVector; + def v2i64 : BaseSIMDDifferentThreeVector; + } + + def : Pat<(v8i16 (IntOp (v8i8 (extract_high_v16i8 V128:$Rn)), + (v8i8 (extract_high_v16i8 V128:$Rm)))), + (!cast(NAME#"v16i8") V128:$Rn, V128:$Rm)>; +} + +multiclass SIMDLongThreeVectorHS opc, string asm, + SDPatternOperator OpNode> { + def v4i16_v4i32 : BaseSIMDDifferentThreeVector; + def v8i16_v4i32 : BaseSIMDDifferentThreeVector; + def v2i32_v2i64 : BaseSIMDDifferentThreeVector; + def v4i32_v2i64 : BaseSIMDDifferentThreeVector; +} + +multiclass SIMDLongThreeVectorBHSabdl opc, string asm, + SDPatternOperator OpNode = null_frag> { + def v8i8_v8i16 : BaseSIMDDifferentThreeVector; + def v16i8_v8i16 : BaseSIMDDifferentThreeVector; + def v4i16_v4i32 : BaseSIMDDifferentThreeVector; + def v8i16_v4i32 : BaseSIMDDifferentThreeVector; + def v2i32_v2i64 : BaseSIMDDifferentThreeVector; + def v4i32_v2i64 : BaseSIMDDifferentThreeVector; +} + +multiclass SIMDLongThreeVectorTiedBHSabal opc, + string asm, + SDPatternOperator OpNode> { + def v8i8_v8i16 : BaseSIMDDifferentThreeVectorTied; + def v16i8_v8i16 : BaseSIMDDifferentThreeVectorTied; + def v4i16_v4i32 : BaseSIMDDifferentThreeVectorTied; + def v8i16_v4i32 : BaseSIMDDifferentThreeVectorTied; + def v2i32_v2i64 : BaseSIMDDifferentThreeVectorTied; + def v4i32_v2i64 : BaseSIMDDifferentThreeVectorTied; +} + +multiclass SIMDLongThreeVectorBHS opc, string asm, + SDPatternOperator OpNode = null_frag> { + def v8i8_v8i16 : BaseSIMDDifferentThreeVector; + def v16i8_v8i16 : BaseSIMDDifferentThreeVector; + def v4i16_v4i32 : BaseSIMDDifferentThreeVector; + def v8i16_v4i32 : BaseSIMDDifferentThreeVector; + def v2i32_v2i64 : BaseSIMDDifferentThreeVector; + def v4i32_v2i64 : BaseSIMDDifferentThreeVector; +} + +multiclass SIMDLongThreeVectorTiedBHS opc, + string asm, + SDPatternOperator OpNode> { + def v8i8_v8i16 : BaseSIMDDifferentThreeVectorTied; + def v16i8_v8i16 : BaseSIMDDifferentThreeVectorTied; + def v4i16_v4i32 : BaseSIMDDifferentThreeVectorTied; + def v8i16_v4i32 : BaseSIMDDifferentThreeVectorTied; + def v2i32_v2i64 : BaseSIMDDifferentThreeVectorTied; + def v4i32_v2i64 : BaseSIMDDifferentThreeVectorTied; +} + +multiclass SIMDLongThreeVectorSQDMLXTiedHS opc, string asm, + SDPatternOperator Accum> { + def v4i16_v4i32 : BaseSIMDDifferentThreeVectorTied; + def v8i16_v4i32 : BaseSIMDDifferentThreeVectorTied; + def v2i32_v2i64 : BaseSIMDDifferentThreeVectorTied; + def v4i32_v2i64 : BaseSIMDDifferentThreeVectorTied; +} + +multiclass SIMDWideThreeVectorBHS opc, string asm, + SDPatternOperator OpNode> { + def v8i8_v8i16 : BaseSIMDDifferentThreeVector; + def v16i8_v8i16 : BaseSIMDDifferentThreeVector; + def v4i16_v4i32 : BaseSIMDDifferentThreeVector; + def v8i16_v4i32 : 
BaseSIMDDifferentThreeVector; + def v2i32_v2i64 : BaseSIMDDifferentThreeVector; + def v4i32_v2i64 : BaseSIMDDifferentThreeVector; +} + +//---------------------------------------------------------------------------- +// AdvSIMD bitwise extract from vector +//---------------------------------------------------------------------------- + +class BaseSIMDBitwiseExtract + : I<(outs regtype:$Rd), (ins regtype:$Rn, regtype:$Rm, i32imm:$imm), asm, + "{\t$Rd" # kind # ", $Rn" # kind # ", $Rm" # kind # ", $imm" # + "|" # kind # "\t$Rd, $Rn, $Rm, $imm}", "", + [(set (vty regtype:$Rd), + (AArch64ext regtype:$Rn, regtype:$Rm, (i32 imm:$imm)))]>, + Sched<[WriteV]> { + bits<5> Rd; + bits<5> Rn; + bits<5> Rm; + bits<4> imm; + let Inst{31} = 0; + let Inst{30} = size; + let Inst{29-21} = 0b101110000; + let Inst{20-16} = Rm; + let Inst{15} = 0; + let Inst{14-11} = imm; + let Inst{10} = 0; + let Inst{9-5} = Rn; + let Inst{4-0} = Rd; +} + + +multiclass SIMDBitwiseExtract { + def v8i8 : BaseSIMDBitwiseExtract<0, V64, v8i8, asm, ".8b"> { + let imm{3} = 0; + } + def v16i8 : BaseSIMDBitwiseExtract<1, V128, v16i8, asm, ".16b">; +} + +//---------------------------------------------------------------------------- +// AdvSIMD zip vector +//---------------------------------------------------------------------------- + +class BaseSIMDZipVector size, bits<3> opc, RegisterOperand regtype, + string asm, string kind, SDNode OpNode, ValueType valty> + : I<(outs regtype:$Rd), (ins regtype:$Rn, regtype:$Rm), asm, + "{\t$Rd" # kind # ", $Rn" # kind # ", $Rm" # kind # + "|" # kind # "\t$Rd, $Rn, $Rm}", "", + [(set (valty regtype:$Rd), (OpNode regtype:$Rn, regtype:$Rm))]>, + Sched<[WriteV]> { + bits<5> Rd; + bits<5> Rn; + bits<5> Rm; + let Inst{31} = 0; + let Inst{30} = size{0}; + let Inst{29-24} = 0b001110; + let Inst{23-22} = size{2-1}; + let Inst{21} = 0; + let Inst{20-16} = Rm; + let Inst{15} = 0; + let Inst{14-12} = opc; + let Inst{11-10} = 0b10; + let Inst{9-5} = Rn; + let Inst{4-0} = Rd; +} + +multiclass SIMDZipVectoropc, string asm, + SDNode OpNode> { + def v8i8 : BaseSIMDZipVector<0b000, opc, V64, + asm, ".8b", OpNode, v8i8>; + def v16i8 : BaseSIMDZipVector<0b001, opc, V128, + asm, ".16b", OpNode, v16i8>; + def v4i16 : BaseSIMDZipVector<0b010, opc, V64, + asm, ".4h", OpNode, v4i16>; + def v8i16 : BaseSIMDZipVector<0b011, opc, V128, + asm, ".8h", OpNode, v8i16>; + def v2i32 : BaseSIMDZipVector<0b100, opc, V64, + asm, ".2s", OpNode, v2i32>; + def v4i32 : BaseSIMDZipVector<0b101, opc, V128, + asm, ".4s", OpNode, v4i32>; + def v2i64 : BaseSIMDZipVector<0b111, opc, V128, + asm, ".2d", OpNode, v2i64>; + + def : Pat<(v2f32 (OpNode V64:$Rn, V64:$Rm)), + (!cast(NAME#"v2i32") V64:$Rn, V64:$Rm)>; + def : Pat<(v4f32 (OpNode V128:$Rn, V128:$Rm)), + (!cast(NAME#"v4i32") V128:$Rn, V128:$Rm)>; + def : Pat<(v2f64 (OpNode V128:$Rn, V128:$Rm)), + (!cast(NAME#"v2i64") V128:$Rn, V128:$Rm)>; +} + +//---------------------------------------------------------------------------- +// AdvSIMD three register scalar instructions +//---------------------------------------------------------------------------- + +let mayStore = 0, mayLoad = 0, hasSideEffects = 0 in +class BaseSIMDThreeScalar size, bits<5> opcode, + RegisterClass regtype, string asm, + list pattern> + : I<(outs regtype:$Rd), (ins regtype:$Rn, regtype:$Rm), asm, + "\t$Rd, $Rn, $Rm", "", pattern>, + Sched<[WriteV]> { + bits<5> Rd; + bits<5> Rn; + bits<5> Rm; + let Inst{31-30} = 0b01; + let Inst{29} = U; + let Inst{28-24} = 0b11110; + let Inst{23-22} = size; + let Inst{21} = 1; + let 
Inst{20-16} = Rm; + let Inst{15-11} = opcode; + let Inst{10} = 1; + let Inst{9-5} = Rn; + let Inst{4-0} = Rd; +} + +multiclass SIMDThreeScalarD opc, string asm, + SDPatternOperator OpNode> { + def v1i64 : BaseSIMDThreeScalar; +} + +multiclass SIMDThreeScalarBHSD opc, string asm, + SDPatternOperator OpNode> { + def v1i64 : BaseSIMDThreeScalar; + def v1i32 : BaseSIMDThreeScalar; + def v1i16 : BaseSIMDThreeScalar; + def v1i8 : BaseSIMDThreeScalar; + + def : Pat<(i64 (OpNode (i64 FPR64:$Rn), (i64 FPR64:$Rm))), + (!cast(NAME#"v1i64") FPR64:$Rn, FPR64:$Rm)>; + def : Pat<(i32 (OpNode (i32 FPR32:$Rn), (i32 FPR32:$Rm))), + (!cast(NAME#"v1i32") FPR32:$Rn, FPR32:$Rm)>; +} + +multiclass SIMDThreeScalarHS opc, string asm, + SDPatternOperator OpNode> { + def v1i32 : BaseSIMDThreeScalar; + def v1i16 : BaseSIMDThreeScalar; +} + +multiclass SIMDThreeScalarSD opc, string asm, + SDPatternOperator OpNode = null_frag> { + let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in { + def #NAME#64 : BaseSIMDThreeScalar; + def #NAME#32 : BaseSIMDThreeScalar; + } + + def : Pat<(v1f64 (OpNode (v1f64 FPR64:$Rn), (v1f64 FPR64:$Rm))), + (!cast(NAME # "64") FPR64:$Rn, FPR64:$Rm)>; +} + +multiclass SIMDThreeScalarFPCmp opc, string asm, + SDPatternOperator OpNode = null_frag> { + let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in { + def #NAME#64 : BaseSIMDThreeScalar; + def #NAME#32 : BaseSIMDThreeScalar; + } + + def : Pat<(v1i64 (OpNode (v1f64 FPR64:$Rn), (v1f64 FPR64:$Rm))), + (!cast(NAME # "64") FPR64:$Rn, FPR64:$Rm)>; +} + +class BaseSIMDThreeScalarMixed size, bits<5> opcode, + dag oops, dag iops, string asm, string cstr, list pat> + : I, + Sched<[WriteV]> { + bits<5> Rd; + bits<5> Rn; + bits<5> Rm; + let Inst{31-30} = 0b01; + let Inst{29} = U; + let Inst{28-24} = 0b11110; + let Inst{23-22} = size; + let Inst{21} = 1; + let Inst{20-16} = Rm; + let Inst{15-11} = opcode; + let Inst{10} = 0; + let Inst{9-5} = Rn; + let Inst{4-0} = Rd; +} + +let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in +multiclass SIMDThreeScalarMixedHS opc, string asm, + SDPatternOperator OpNode = null_frag> { + def i16 : BaseSIMDThreeScalarMixed; + def i32 : BaseSIMDThreeScalarMixed; +} + +let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in +multiclass SIMDThreeScalarMixedTiedHS opc, string asm, + SDPatternOperator OpNode = null_frag> { + def i16 : BaseSIMDThreeScalarMixed; + def i32 : BaseSIMDThreeScalarMixed; +} + +//---------------------------------------------------------------------------- +// AdvSIMD two register scalar instructions +//---------------------------------------------------------------------------- + +let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in +class BaseSIMDTwoScalar size, bits<5> opcode, + RegisterClass regtype, RegisterClass regtype2, + string asm, list pat> + : I<(outs regtype:$Rd), (ins regtype2:$Rn), asm, + "\t$Rd, $Rn", "", pat>, + Sched<[WriteV]> { + bits<5> Rd; + bits<5> Rn; + let Inst{31-30} = 0b01; + let Inst{29} = U; + let Inst{28-24} = 0b11110; + let Inst{23-22} = size; + let Inst{21-17} = 0b10000; + let Inst{16-12} = opcode; + let Inst{11-10} = 0b10; + let Inst{9-5} = Rn; + let Inst{4-0} = Rd; +} + +let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in +class BaseSIMDTwoScalarTied size, bits<5> opcode, + RegisterClass regtype, RegisterClass regtype2, + string asm, list pat> + : I<(outs regtype:$dst), (ins regtype:$Rd, regtype2:$Rn), asm, + "\t$Rd, $Rn", "$Rd = $dst", pat>, + Sched<[WriteV]> { + bits<5> Rd; + bits<5> Rn; + let Inst{31-30} = 0b01; + let Inst{29} = U; + let Inst{28-24} = 0b11110; + let 
Inst{23-22} = size; + let Inst{21-17} = 0b10000; + let Inst{16-12} = opcode; + let Inst{11-10} = 0b10; + let Inst{9-5} = Rn; + let Inst{4-0} = Rd; +} + + +let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in +class BaseSIMDCmpTwoScalar size, bits<5> opcode, + RegisterClass regtype, string asm, string zero> + : I<(outs regtype:$Rd), (ins regtype:$Rn), asm, + "\t$Rd, $Rn, #" # zero, "", []>, + Sched<[WriteV]> { + bits<5> Rd; + bits<5> Rn; + let Inst{31-30} = 0b01; + let Inst{29} = U; + let Inst{28-24} = 0b11110; + let Inst{23-22} = size; + let Inst{21-17} = 0b10000; + let Inst{16-12} = opcode; + let Inst{11-10} = 0b10; + let Inst{9-5} = Rn; + let Inst{4-0} = Rd; +} + +class SIMDInexactCvtTwoScalar opcode, string asm> + : I<(outs FPR32:$Rd), (ins FPR64:$Rn), asm, "\t$Rd, $Rn", "", + [(set (f32 FPR32:$Rd), (int_aarch64_sisd_fcvtxn (f64 FPR64:$Rn)))]>, + Sched<[WriteV]> { + bits<5> Rd; + bits<5> Rn; + let Inst{31-17} = 0b011111100110000; + let Inst{16-12} = opcode; + let Inst{11-10} = 0b10; + let Inst{9-5} = Rn; + let Inst{4-0} = Rd; +} + +multiclass SIMDCmpTwoScalarD opc, string asm, + SDPatternOperator OpNode> { + def v1i64rz : BaseSIMDCmpTwoScalar; + + def : Pat<(v1i64 (OpNode FPR64:$Rn)), + (!cast(NAME # v1i64rz) FPR64:$Rn)>; +} + +multiclass SIMDCmpTwoScalarSD opc, string asm, + SDPatternOperator OpNode> { + def v1i64rz : BaseSIMDCmpTwoScalar; + def v1i32rz : BaseSIMDCmpTwoScalar; + + def : InstAlias(NAME # v1i64rz) FPR64:$Rd, FPR64:$Rn), 0>; + def : InstAlias(NAME # v1i32rz) FPR32:$Rd, FPR32:$Rn), 0>; + + def : Pat<(v1i64 (OpNode (v1f64 FPR64:$Rn))), + (!cast(NAME # v1i64rz) FPR64:$Rn)>; +} + +multiclass SIMDTwoScalarD opc, string asm, + SDPatternOperator OpNode = null_frag> { + def v1i64 : BaseSIMDTwoScalar; + + def : Pat<(i64 (OpNode (i64 FPR64:$Rn))), + (!cast(NAME # "v1i64") FPR64:$Rn)>; +} + +multiclass SIMDTwoScalarSD opc, string asm> { + def v1i64 : BaseSIMDTwoScalar; + def v1i32 : BaseSIMDTwoScalar; +} + +multiclass SIMDTwoScalarCVTSD opc, string asm, + SDPatternOperator OpNode> { + def v1i64 : BaseSIMDTwoScalar; + def v1i32 : BaseSIMDTwoScalar; +} + +multiclass SIMDTwoScalarBHSD opc, string asm, + SDPatternOperator OpNode = null_frag> { + let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in { + def v1i64 : BaseSIMDTwoScalar; + def v1i32 : BaseSIMDTwoScalar; + def v1i16 : BaseSIMDTwoScalar; + def v1i8 : BaseSIMDTwoScalar; + } + + def : Pat<(v1i64 (OpNode (v1i64 FPR64:$Rn))), + (!cast(NAME # v1i64) FPR64:$Rn)>; +} + +multiclass SIMDTwoScalarBHSDTied opc, string asm, + Intrinsic OpNode> { + let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in { + def v1i64 : BaseSIMDTwoScalarTied; + def v1i32 : BaseSIMDTwoScalarTied; + def v1i16 : BaseSIMDTwoScalarTied; + def v1i8 : BaseSIMDTwoScalarTied; + } + + def : Pat<(v1i64 (OpNode (v1i64 FPR64:$Rd), (v1i64 FPR64:$Rn))), + (!cast(NAME # v1i64) FPR64:$Rd, FPR64:$Rn)>; +} + + + +let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in +multiclass SIMDTwoScalarMixedBHS opc, string asm, + SDPatternOperator OpNode = null_frag> { + def v1i32 : BaseSIMDTwoScalar; + def v1i16 : BaseSIMDTwoScalar; + def v1i8 : BaseSIMDTwoScalar; +} + +//---------------------------------------------------------------------------- +// AdvSIMD scalar pairwise instructions +//---------------------------------------------------------------------------- + +let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in +class BaseSIMDPairwiseScalar size, bits<5> opcode, + RegisterOperand regtype, RegisterOperand vectype, + string asm, string kind> + : I<(outs regtype:$Rd), (ins 
vectype:$Rn), asm, + "{\t$Rd, $Rn" # kind # "|" # kind # "\t$Rd, $Rn}", "", []>, + Sched<[WriteV]> { + bits<5> Rd; + bits<5> Rn; + let Inst{31-30} = 0b01; + let Inst{29} = U; + let Inst{28-24} = 0b11110; + let Inst{23-22} = size; + let Inst{21-17} = 0b11000; + let Inst{16-12} = opcode; + let Inst{11-10} = 0b10; + let Inst{9-5} = Rn; + let Inst{4-0} = Rd; +} + +multiclass SIMDPairwiseScalarD opc, string asm> { + def v2i64p : BaseSIMDPairwiseScalar; +} + +multiclass SIMDPairwiseScalarSD opc, string asm> { + def v2i32p : BaseSIMDPairwiseScalar; + def v2i64p : BaseSIMDPairwiseScalar; +} + +//---------------------------------------------------------------------------- +// AdvSIMD across lanes instructions +//---------------------------------------------------------------------------- + +let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in +class BaseSIMDAcrossLanes size, bits<5> opcode, + RegisterClass regtype, RegisterOperand vectype, + string asm, string kind, list pattern> + : I<(outs regtype:$Rd), (ins vectype:$Rn), asm, + "{\t$Rd, $Rn" # kind # "|" # kind # "\t$Rd, $Rn}", "", pattern>, + Sched<[WriteV]> { + bits<5> Rd; + bits<5> Rn; + let Inst{31} = 0; + let Inst{30} = Q; + let Inst{29} = U; + let Inst{28-24} = 0b01110; + let Inst{23-22} = size; + let Inst{21-17} = 0b11000; + let Inst{16-12} = opcode; + let Inst{11-10} = 0b10; + let Inst{9-5} = Rn; + let Inst{4-0} = Rd; +} + +multiclass SIMDAcrossLanesBHS opcode, + string asm> { + def v8i8v : BaseSIMDAcrossLanes<0, U, 0b00, opcode, FPR8, V64, + asm, ".8b", []>; + def v16i8v : BaseSIMDAcrossLanes<1, U, 0b00, opcode, FPR8, V128, + asm, ".16b", []>; + def v4i16v : BaseSIMDAcrossLanes<0, U, 0b01, opcode, FPR16, V64, + asm, ".4h", []>; + def v8i16v : BaseSIMDAcrossLanes<1, U, 0b01, opcode, FPR16, V128, + asm, ".8h", []>; + def v4i32v : BaseSIMDAcrossLanes<1, U, 0b10, opcode, FPR32, V128, + asm, ".4s", []>; +} + +multiclass SIMDAcrossLanesHSD opcode, string asm> { + def v8i8v : BaseSIMDAcrossLanes<0, U, 0b00, opcode, FPR16, V64, + asm, ".8b", []>; + def v16i8v : BaseSIMDAcrossLanes<1, U, 0b00, opcode, FPR16, V128, + asm, ".16b", []>; + def v4i16v : BaseSIMDAcrossLanes<0, U, 0b01, opcode, FPR32, V64, + asm, ".4h", []>; + def v8i16v : BaseSIMDAcrossLanes<1, U, 0b01, opcode, FPR32, V128, + asm, ".8h", []>; + def v4i32v : BaseSIMDAcrossLanes<1, U, 0b10, opcode, FPR64, V128, + asm, ".4s", []>; +} + +multiclass SIMDAcrossLanesS opcode, bit sz1, string asm, + Intrinsic intOp> { + def v4i32v : BaseSIMDAcrossLanes<1, 1, {sz1, 0}, opcode, FPR32, V128, + asm, ".4s", + [(set FPR32:$Rd, (intOp (v4f32 V128:$Rn)))]>; +} + +//---------------------------------------------------------------------------- +// AdvSIMD INS/DUP instructions +//---------------------------------------------------------------------------- + +// FIXME: There has got to be a better way to factor these. ugh. 
+ +class BaseSIMDInsDup pattern> + : I, + Sched<[WriteV]> { + bits<5> Rd; + bits<5> Rn; + let Inst{31} = 0; + let Inst{30} = Q; + let Inst{29} = op; + let Inst{28-21} = 0b01110000; + let Inst{15} = 0; + let Inst{10} = 1; + let Inst{9-5} = Rn; + let Inst{4-0} = Rd; +} + +class SIMDDupFromMain imm5, string size, ValueType vectype, + RegisterOperand vecreg, RegisterClass regtype> + : BaseSIMDInsDup { + let Inst{20-16} = imm5; + let Inst{14-11} = 0b0001; +} + +class SIMDDupFromElement + : BaseSIMDInsDup { + let Inst{14-11} = 0b0000; +} + +class SIMDDup64FromElement + : SIMDDupFromElement<1, ".2d", ".d", v2i64, v2i64, V128, + VectorIndexD, i64, AArch64duplane64> { + bits<1> idx; + let Inst{20} = idx; + let Inst{19-16} = 0b1000; +} + +class SIMDDup32FromElement + : SIMDDupFromElement { + bits<2> idx; + let Inst{20-19} = idx; + let Inst{18-16} = 0b100; +} + +class SIMDDup16FromElement + : SIMDDupFromElement { + bits<3> idx; + let Inst{20-18} = idx; + let Inst{17-16} = 0b10; +} + +class SIMDDup8FromElement + : SIMDDupFromElement { + bits<4> idx; + let Inst{20-17} = idx; + let Inst{16} = 1; +} + +class BaseSIMDMov imm4, RegisterClass regtype, + Operand idxtype, string asm, list pattern> + : BaseSIMDInsDup { + let Inst{14-11} = imm4; +} + +class SIMDSMov + : BaseSIMDMov; +class SIMDUMov + : BaseSIMDMov; + +class SIMDMovAlias + : InstAlias; + +multiclass SMov { + def vi8to32 : SIMDSMov<0, ".b", GPR32, VectorIndexB> { + bits<4> idx; + let Inst{20-17} = idx; + let Inst{16} = 1; + } + def vi8to64 : SIMDSMov<1, ".b", GPR64, VectorIndexB> { + bits<4> idx; + let Inst{20-17} = idx; + let Inst{16} = 1; + } + def vi16to32 : SIMDSMov<0, ".h", GPR32, VectorIndexH> { + bits<3> idx; + let Inst{20-18} = idx; + let Inst{17-16} = 0b10; + } + def vi16to64 : SIMDSMov<1, ".h", GPR64, VectorIndexH> { + bits<3> idx; + let Inst{20-18} = idx; + let Inst{17-16} = 0b10; + } + def vi32to64 : SIMDSMov<1, ".s", GPR64, VectorIndexS> { + bits<2> idx; + let Inst{20-19} = idx; + let Inst{18-16} = 0b100; + } +} + +multiclass UMov { + def vi8 : SIMDUMov<0, ".b", v16i8, GPR32, VectorIndexB> { + bits<4> idx; + let Inst{20-17} = idx; + let Inst{16} = 1; + } + def vi16 : SIMDUMov<0, ".h", v8i16, GPR32, VectorIndexH> { + bits<3> idx; + let Inst{20-18} = idx; + let Inst{17-16} = 0b10; + } + def vi32 : SIMDUMov<0, ".s", v4i32, GPR32, VectorIndexS> { + bits<2> idx; + let Inst{20-19} = idx; + let Inst{18-16} = 0b100; + } + def vi64 : SIMDUMov<1, ".d", v2i64, GPR64, VectorIndexD> { + bits<1> idx; + let Inst{20} = idx; + let Inst{19-16} = 0b1000; + } + def : SIMDMovAlias<"mov", ".s", + !cast(NAME#"vi32"), + GPR32, VectorIndexS>; + def : SIMDMovAlias<"mov", ".d", + !cast(NAME#"vi64"), + GPR64, VectorIndexD>; +} + +class SIMDInsFromMain + : BaseSIMDInsDup<1, 0, (outs V128:$dst), + (ins V128:$Rd, idxtype:$idx, regtype:$Rn), "ins", + "{\t$Rd" # size # "$idx, $Rn" # + "|" # size # "\t$Rd$idx, $Rn}", + "$Rd = $dst", + [(set V128:$dst, + (vector_insert (vectype V128:$Rd), regtype:$Rn, idxtype:$idx))]> { + let Inst{14-11} = 0b0011; +} + +class SIMDInsFromElement + : BaseSIMDInsDup<1, 1, (outs V128:$dst), + (ins V128:$Rd, idxtype:$idx, V128:$Rn, idxtype:$idx2), "ins", + "{\t$Rd" # size # "$idx, $Rn" # size # "$idx2" # + "|" # size # "\t$Rd$idx, $Rn$idx2}", + "$Rd = $dst", + [(set V128:$dst, + (vector_insert + (vectype V128:$Rd), + (elttype (vector_extract (vectype V128:$Rn), idxtype:$idx2)), + idxtype:$idx))]>; + +class SIMDInsMainMovAlias + : InstAlias<"mov" # "{\t$dst" # size # "$idx, $src" # + "|" # size #"\t$dst$idx, $src}", + (inst V128:$dst, 
idxtype:$idx, regtype:$src)>; +class SIMDInsElementMovAlias + : InstAlias<"mov" # "{\t$dst" # size # "$idx, $src" # size # "$idx2" # + # "|" # size #" $dst$idx, $src$idx2}", + (inst V128:$dst, idxtype:$idx, V128:$src, idxtype:$idx2)>; + + +multiclass SIMDIns { + def vi8gpr : SIMDInsFromMain<".b", v16i8, GPR32, VectorIndexB> { + bits<4> idx; + let Inst{20-17} = idx; + let Inst{16} = 1; + } + def vi16gpr : SIMDInsFromMain<".h", v8i16, GPR32, VectorIndexH> { + bits<3> idx; + let Inst{20-18} = idx; + let Inst{17-16} = 0b10; + } + def vi32gpr : SIMDInsFromMain<".s", v4i32, GPR32, VectorIndexS> { + bits<2> idx; + let Inst{20-19} = idx; + let Inst{18-16} = 0b100; + } + def vi64gpr : SIMDInsFromMain<".d", v2i64, GPR64, VectorIndexD> { + bits<1> idx; + let Inst{20} = idx; + let Inst{19-16} = 0b1000; + } + + def vi8lane : SIMDInsFromElement<".b", v16i8, i32, VectorIndexB> { + bits<4> idx; + bits<4> idx2; + let Inst{20-17} = idx; + let Inst{16} = 1; + let Inst{14-11} = idx2; + } + def vi16lane : SIMDInsFromElement<".h", v8i16, i32, VectorIndexH> { + bits<3> idx; + bits<3> idx2; + let Inst{20-18} = idx; + let Inst{17-16} = 0b10; + let Inst{14-12} = idx2; + let Inst{11} = 0; + } + def vi32lane : SIMDInsFromElement<".s", v4i32, i32, VectorIndexS> { + bits<2> idx; + bits<2> idx2; + let Inst{20-19} = idx; + let Inst{18-16} = 0b100; + let Inst{14-13} = idx2; + let Inst{12-11} = 0; + } + def vi64lane : SIMDInsFromElement<".d", v2i64, i64, VectorIndexD> { + bits<1> idx; + bits<1> idx2; + let Inst{20} = idx; + let Inst{19-16} = 0b1000; + let Inst{14} = idx2; + let Inst{13-11} = 0; + } + + // For all forms of the INS instruction, the "mov" mnemonic is the + // preferred alias. Why they didn't just call the instruction "mov" in + // the first place is a very good question indeed... 
+ def : SIMDInsMainMovAlias<".b", !cast(NAME#"vi8gpr"), + GPR32, VectorIndexB>; + def : SIMDInsMainMovAlias<".h", !cast(NAME#"vi16gpr"), + GPR32, VectorIndexH>; + def : SIMDInsMainMovAlias<".s", !cast(NAME#"vi32gpr"), + GPR32, VectorIndexS>; + def : SIMDInsMainMovAlias<".d", !cast(NAME#"vi64gpr"), + GPR64, VectorIndexD>; + + def : SIMDInsElementMovAlias<".b", !cast(NAME#"vi8lane"), + VectorIndexB>; + def : SIMDInsElementMovAlias<".h", !cast(NAME#"vi16lane"), + VectorIndexH>; + def : SIMDInsElementMovAlias<".s", !cast(NAME#"vi32lane"), + VectorIndexS>; + def : SIMDInsElementMovAlias<".d", !cast(NAME#"vi64lane"), + VectorIndexD>; +} + +//---------------------------------------------------------------------------- +// AdvSIMD TBL/TBX +//---------------------------------------------------------------------------- + +let mayStore = 0, mayLoad = 0, hasSideEffects = 0 in +class BaseSIMDTableLookup len, bit op, RegisterOperand vectype, + RegisterOperand listtype, string asm, string kind> + : I<(outs vectype:$Vd), (ins listtype:$Vn, vectype:$Vm), asm, + "\t$Vd" # kind # ", $Vn, $Vm" # kind, "", []>, + Sched<[WriteV]> { + bits<5> Vd; + bits<5> Vn; + bits<5> Vm; + let Inst{31} = 0; + let Inst{30} = Q; + let Inst{29-21} = 0b001110000; + let Inst{20-16} = Vm; + let Inst{15} = 0; + let Inst{14-13} = len; + let Inst{12} = op; + let Inst{11-10} = 0b00; + let Inst{9-5} = Vn; + let Inst{4-0} = Vd; +} + +let mayStore = 0, mayLoad = 0, hasSideEffects = 0 in +class BaseSIMDTableLookupTied len, bit op, RegisterOperand vectype, + RegisterOperand listtype, string asm, string kind> + : I<(outs vectype:$dst), (ins vectype:$Vd, listtype:$Vn, vectype:$Vm), asm, + "\t$Vd" # kind # ", $Vn, $Vm" # kind, "$Vd = $dst", []>, + Sched<[WriteV]> { + bits<5> Vd; + bits<5> Vn; + bits<5> Vm; + let Inst{31} = 0; + let Inst{30} = Q; + let Inst{29-21} = 0b001110000; + let Inst{20-16} = Vm; + let Inst{15} = 0; + let Inst{14-13} = len; + let Inst{12} = op; + let Inst{11-10} = 0b00; + let Inst{9-5} = Vn; + let Inst{4-0} = Vd; +} + +class SIMDTableLookupAlias + : InstAlias; + +multiclass SIMDTableLookup { + def v8i8One : BaseSIMDTableLookup<0, 0b00, op, V64, VecListOne16b, + asm, ".8b">; + def v8i8Two : BaseSIMDTableLookup<0, 0b01, op, V64, VecListTwo16b, + asm, ".8b">; + def v8i8Three : BaseSIMDTableLookup<0, 0b10, op, V64, VecListThree16b, + asm, ".8b">; + def v8i8Four : BaseSIMDTableLookup<0, 0b11, op, V64, VecListFour16b, + asm, ".8b">; + def v16i8One : BaseSIMDTableLookup<1, 0b00, op, V128, VecListOne16b, + asm, ".16b">; + def v16i8Two : BaseSIMDTableLookup<1, 0b01, op, V128, VecListTwo16b, + asm, ".16b">; + def v16i8Three: BaseSIMDTableLookup<1, 0b10, op, V128, VecListThree16b, + asm, ".16b">; + def v16i8Four : BaseSIMDTableLookup<1, 0b11, op, V128, VecListFour16b, + asm, ".16b">; + + def : SIMDTableLookupAlias(NAME#"v8i8One"), + V64, VecListOne128>; + def : SIMDTableLookupAlias(NAME#"v8i8Two"), + V64, VecListTwo128>; + def : SIMDTableLookupAlias(NAME#"v8i8Three"), + V64, VecListThree128>; + def : SIMDTableLookupAlias(NAME#"v8i8Four"), + V64, VecListFour128>; + def : SIMDTableLookupAlias(NAME#"v16i8One"), + V128, VecListOne128>; + def : SIMDTableLookupAlias(NAME#"v16i8Two"), + V128, VecListTwo128>; + def : SIMDTableLookupAlias(NAME#"v16i8Three"), + V128, VecListThree128>; + def : SIMDTableLookupAlias(NAME#"v16i8Four"), + V128, VecListFour128>; +} + +multiclass SIMDTableLookupTied { + def v8i8One : BaseSIMDTableLookupTied<0, 0b00, op, V64, VecListOne16b, + asm, ".8b">; + def v8i8Two : BaseSIMDTableLookupTied<0, 0b01, op, V64, 
VecListTwo16b, + asm, ".8b">; + def v8i8Three : BaseSIMDTableLookupTied<0, 0b10, op, V64, VecListThree16b, + asm, ".8b">; + def v8i8Four : BaseSIMDTableLookupTied<0, 0b11, op, V64, VecListFour16b, + asm, ".8b">; + def v16i8One : BaseSIMDTableLookupTied<1, 0b00, op, V128, VecListOne16b, + asm, ".16b">; + def v16i8Two : BaseSIMDTableLookupTied<1, 0b01, op, V128, VecListTwo16b, + asm, ".16b">; + def v16i8Three: BaseSIMDTableLookupTied<1, 0b10, op, V128, VecListThree16b, + asm, ".16b">; + def v16i8Four : BaseSIMDTableLookupTied<1, 0b11, op, V128, VecListFour16b, + asm, ".16b">; + + def : SIMDTableLookupAlias(NAME#"v8i8One"), + V64, VecListOne128>; + def : SIMDTableLookupAlias(NAME#"v8i8Two"), + V64, VecListTwo128>; + def : SIMDTableLookupAlias(NAME#"v8i8Three"), + V64, VecListThree128>; + def : SIMDTableLookupAlias(NAME#"v8i8Four"), + V64, VecListFour128>; + def : SIMDTableLookupAlias(NAME#"v16i8One"), + V128, VecListOne128>; + def : SIMDTableLookupAlias(NAME#"v16i8Two"), + V128, VecListTwo128>; + def : SIMDTableLookupAlias(NAME#"v16i8Three"), + V128, VecListThree128>; + def : SIMDTableLookupAlias(NAME#"v16i8Four"), + V128, VecListFour128>; +} + + +//---------------------------------------------------------------------------- +// AdvSIMD scalar CPY +//---------------------------------------------------------------------------- +let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in +class BaseSIMDScalarCPY + : I<(outs regtype:$dst), (ins vectype:$src, idxtype:$idx), "mov", + "{\t$dst, $src" # kind # "$idx" # + "|\t$dst, $src$idx}", "", []>, + Sched<[WriteV]> { + bits<5> dst; + bits<5> src; + let Inst{31-21} = 0b01011110000; + let Inst{15-10} = 0b000001; + let Inst{9-5} = src; + let Inst{4-0} = dst; +} + +class SIMDScalarCPYAlias + : InstAlias; + + +multiclass SIMDScalarCPY { + def i8 : BaseSIMDScalarCPY { + bits<4> idx; + let Inst{20-17} = idx; + let Inst{16} = 1; + } + def i16 : BaseSIMDScalarCPY { + bits<3> idx; + let Inst{20-18} = idx; + let Inst{17-16} = 0b10; + } + def i32 : BaseSIMDScalarCPY { + bits<2> idx; + let Inst{20-19} = idx; + let Inst{18-16} = 0b100; + } + def i64 : BaseSIMDScalarCPY { + bits<1> idx; + let Inst{20} = idx; + let Inst{19-16} = 0b1000; + } + + def : Pat<(v1i64 (scalar_to_vector (i64 (vector_extract (v2i64 V128:$src), + VectorIndexD:$idx)))), + (!cast(NAME # i64) V128:$src, VectorIndexD:$idx)>; + + // 'DUP' mnemonic aliases. 
+ def : SIMDScalarCPYAlias<"dup", ".b", + !cast(NAME#"i8"), + FPR8, V128, VectorIndexB>; + def : SIMDScalarCPYAlias<"dup", ".h", + !cast(NAME#"i16"), + FPR16, V128, VectorIndexH>; + def : SIMDScalarCPYAlias<"dup", ".s", + !cast(NAME#"i32"), + FPR32, V128, VectorIndexS>; + def : SIMDScalarCPYAlias<"dup", ".d", + !cast(NAME#"i64"), + FPR64, V128, VectorIndexD>; +} + +//---------------------------------------------------------------------------- +// AdvSIMD modified immediate instructions +//---------------------------------------------------------------------------- + +class BaseSIMDModifiedImm pattern> + : I, + Sched<[WriteV]> { + bits<5> Rd; + bits<8> imm8; + let Inst{31} = 0; + let Inst{30} = Q; + let Inst{29} = op; + let Inst{28-19} = 0b0111100000; + let Inst{18-16} = imm8{7-5}; + let Inst{11-10} = 0b01; + let Inst{9-5} = imm8{4-0}; + let Inst{4-0} = Rd; +} + +class BaseSIMDModifiedImmVector pattern> + : BaseSIMDModifiedImm { + let DecoderMethod = "DecodeModImmInstruction"; +} + +class BaseSIMDModifiedImmVectorTied pattern> + : BaseSIMDModifiedImm { + let DecoderMethod = "DecodeModImmTiedInstruction"; +} + +class BaseSIMDModifiedImmVectorShift b15_b12, + RegisterOperand vectype, string asm, + string kind, list pattern> + : BaseSIMDModifiedImmVector { + bits<2> shift; + let Inst{15} = b15_b12{1}; + let Inst{14-13} = shift; + let Inst{12} = b15_b12{0}; +} + +class BaseSIMDModifiedImmVectorShiftTied b15_b12, + RegisterOperand vectype, string asm, + string kind, list pattern> + : BaseSIMDModifiedImmVectorTied { + bits<2> shift; + let Inst{15} = b15_b12{1}; + let Inst{14-13} = shift; + let Inst{12} = b15_b12{0}; +} + + +class BaseSIMDModifiedImmVectorShiftHalf b15_b12, + RegisterOperand vectype, string asm, + string kind, list pattern> + : BaseSIMDModifiedImmVector { + bits<2> shift; + let Inst{15} = b15_b12{1}; + let Inst{14} = 0; + let Inst{13} = shift{0}; + let Inst{12} = b15_b12{0}; +} + +class BaseSIMDModifiedImmVectorShiftHalfTied b15_b12, + RegisterOperand vectype, string asm, + string kind, list pattern> + : BaseSIMDModifiedImmVectorTied { + bits<2> shift; + let Inst{15} = b15_b12{1}; + let Inst{14} = 0; + let Inst{13} = shift{0}; + let Inst{12} = b15_b12{0}; +} + +multiclass SIMDModifiedImmVectorShift hw_cmode, bits<2> w_cmode, + string asm> { + def v4i16 : BaseSIMDModifiedImmVectorShiftHalf<0, op, hw_cmode, V64, + asm, ".4h", []>; + def v8i16 : BaseSIMDModifiedImmVectorShiftHalf<1, op, hw_cmode, V128, + asm, ".8h", []>; + + def v2i32 : BaseSIMDModifiedImmVectorShift<0, op, w_cmode, V64, + asm, ".2s", []>; + def v4i32 : BaseSIMDModifiedImmVectorShift<1, op, w_cmode, V128, + asm, ".4s", []>; +} + +multiclass SIMDModifiedImmVectorShiftTied hw_cmode, + bits<2> w_cmode, string asm, + SDNode OpNode> { + def v4i16 : BaseSIMDModifiedImmVectorShiftHalfTied<0, op, hw_cmode, V64, + asm, ".4h", + [(set (v4i16 V64:$dst), (OpNode V64:$Rd, + imm0_255:$imm8, + (i32 imm:$shift)))]>; + def v8i16 : BaseSIMDModifiedImmVectorShiftHalfTied<1, op, hw_cmode, V128, + asm, ".8h", + [(set (v8i16 V128:$dst), (OpNode V128:$Rd, + imm0_255:$imm8, + (i32 imm:$shift)))]>; + + def v2i32 : BaseSIMDModifiedImmVectorShiftTied<0, op, w_cmode, V64, + asm, ".2s", + [(set (v2i32 V64:$dst), (OpNode V64:$Rd, + imm0_255:$imm8, + (i32 imm:$shift)))]>; + def v4i32 : BaseSIMDModifiedImmVectorShiftTied<1, op, w_cmode, V128, + asm, ".4s", + [(set (v4i32 V128:$dst), (OpNode V128:$Rd, + imm0_255:$imm8, + (i32 imm:$shift)))]>; +} + +class SIMDModifiedImmMoveMSL cmode, + RegisterOperand vectype, string asm, + string kind, list 
pattern> + : BaseSIMDModifiedImmVector { + bits<1> shift; + let Inst{15-13} = cmode{3-1}; + let Inst{12} = shift; +} + +class SIMDModifiedImmVectorNoShift cmode, + RegisterOperand vectype, + Operand imm_type, string asm, + string kind, list pattern> + : BaseSIMDModifiedImmVector { + let Inst{15-12} = cmode; +} + +class SIMDModifiedImmScalarNoShift cmode, string asm, + list pattern> + : BaseSIMDModifiedImm { + let Inst{15-12} = cmode; + let DecoderMethod = "DecodeModImmInstruction"; +} + +//---------------------------------------------------------------------------- +// AdvSIMD indexed element +//---------------------------------------------------------------------------- + +let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in +class BaseSIMDIndexed size, bits<4> opc, + RegisterOperand dst_reg, RegisterOperand lhs_reg, + RegisterOperand rhs_reg, Operand vec_idx, string asm, + string apple_kind, string dst_kind, string lhs_kind, + string rhs_kind, list pattern> + : I<(outs dst_reg:$Rd), (ins lhs_reg:$Rn, rhs_reg:$Rm, vec_idx:$idx), + asm, + "{\t$Rd" # dst_kind # ", $Rn" # lhs_kind # ", $Rm" # rhs_kind # "$idx" # + "|" # apple_kind # "\t$Rd, $Rn, $Rm$idx}", "", pattern>, + Sched<[WriteV]> { + bits<5> Rd; + bits<5> Rn; + bits<5> Rm; + + let Inst{31} = 0; + let Inst{30} = Q; + let Inst{29} = U; + let Inst{28} = Scalar; + let Inst{27-24} = 0b1111; + let Inst{23-22} = size; + // Bit 21 must be set by the derived class. + let Inst{20-16} = Rm; + let Inst{15-12} = opc; + // Bit 11 must be set by the derived class. + let Inst{10} = 0; + let Inst{9-5} = Rn; + let Inst{4-0} = Rd; +} + +let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in +class BaseSIMDIndexedTied size, bits<4> opc, + RegisterOperand dst_reg, RegisterOperand lhs_reg, + RegisterOperand rhs_reg, Operand vec_idx, string asm, + string apple_kind, string dst_kind, string lhs_kind, + string rhs_kind, list pattern> + : I<(outs dst_reg:$dst), + (ins dst_reg:$Rd, lhs_reg:$Rn, rhs_reg:$Rm, vec_idx:$idx), asm, + "{\t$Rd" # dst_kind # ", $Rn" # lhs_kind # ", $Rm" # rhs_kind # "$idx" # + "|" # apple_kind # "\t$Rd, $Rn, $Rm$idx}", "$Rd = $dst", pattern>, + Sched<[WriteV]> { + bits<5> Rd; + bits<5> Rn; + bits<5> Rm; + + let Inst{31} = 0; + let Inst{30} = Q; + let Inst{29} = U; + let Inst{28} = Scalar; + let Inst{27-24} = 0b1111; + let Inst{23-22} = size; + // Bit 21 must be set by the derived class. + let Inst{20-16} = Rm; + let Inst{15-12} = opc; + // Bit 11 must be set by the derived class. 
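  // (Illustrative note, not from the patch: the derived defs below route the
  //  lane index into those bits, e.g. a ".s" index uses idx{0} for bit 21 and
  //  idx{1} for bit 11.)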
+ let Inst{10} = 0; + let Inst{9-5} = Rn; + let Inst{4-0} = Rd; +} + +multiclass SIMDFPIndexedSD opc, string asm, + SDPatternOperator OpNode> { + def v2i32_indexed : BaseSIMDIndexed<0, U, 0, 0b10, opc, + V64, V64, + V128, VectorIndexS, + asm, ".2s", ".2s", ".2s", ".s", + [(set (v2f32 V64:$Rd), + (OpNode (v2f32 V64:$Rn), + (v2f32 (AArch64duplane32 (v4f32 V128:$Rm), VectorIndexS:$idx))))]> { + bits<2> idx; + let Inst{11} = idx{1}; + let Inst{21} = idx{0}; + } + + def v4i32_indexed : BaseSIMDIndexed<1, U, 0, 0b10, opc, + V128, V128, + V128, VectorIndexS, + asm, ".4s", ".4s", ".4s", ".s", + [(set (v4f32 V128:$Rd), + (OpNode (v4f32 V128:$Rn), + (v4f32 (AArch64duplane32 (v4f32 V128:$Rm), VectorIndexS:$idx))))]> { + bits<2> idx; + let Inst{11} = idx{1}; + let Inst{21} = idx{0}; + } + + def v2i64_indexed : BaseSIMDIndexed<1, U, 0, 0b11, opc, + V128, V128, + V128, VectorIndexD, + asm, ".2d", ".2d", ".2d", ".d", + [(set (v2f64 V128:$Rd), + (OpNode (v2f64 V128:$Rn), + (v2f64 (AArch64duplane64 (v2f64 V128:$Rm), VectorIndexD:$idx))))]> { + bits<1> idx; + let Inst{11} = idx{0}; + let Inst{21} = 0; + } + + def v1i32_indexed : BaseSIMDIndexed<1, U, 1, 0b10, opc, + FPR32Op, FPR32Op, V128, VectorIndexS, + asm, ".s", "", "", ".s", + [(set (f32 FPR32Op:$Rd), + (OpNode (f32 FPR32Op:$Rn), + (f32 (vector_extract (v4f32 V128:$Rm), + VectorIndexS:$idx))))]> { + bits<2> idx; + let Inst{11} = idx{1}; + let Inst{21} = idx{0}; + } + + def v1i64_indexed : BaseSIMDIndexed<1, U, 1, 0b11, opc, + FPR64Op, FPR64Op, V128, VectorIndexD, + asm, ".d", "", "", ".d", + [(set (f64 FPR64Op:$Rd), + (OpNode (f64 FPR64Op:$Rn), + (f64 (vector_extract (v2f64 V128:$Rm), + VectorIndexD:$idx))))]> { + bits<1> idx; + let Inst{11} = idx{0}; + let Inst{21} = 0; + } +} + +multiclass SIMDFPIndexedSDTiedPatterns { + // 2 variants for the .2s version: DUPLANE from 128-bit and DUP scalar. + def : Pat<(v2f32 (OpNode (v2f32 V64:$Rd), (v2f32 V64:$Rn), + (AArch64duplane32 (v4f32 V128:$Rm), + VectorIndexS:$idx))), + (!cast(INST # v2i32_indexed) + V64:$Rd, V64:$Rn, V128:$Rm, VectorIndexS:$idx)>; + def : Pat<(v2f32 (OpNode (v2f32 V64:$Rd), (v2f32 V64:$Rn), + (AArch64dup (f32 FPR32Op:$Rm)))), + (!cast(INST # "v2i32_indexed") V64:$Rd, V64:$Rn, + (SUBREG_TO_REG (i32 0), FPR32Op:$Rm, ssub), (i64 0))>; + + + // 2 variants for the .4s version: DUPLANE from 128-bit and DUP scalar. + def : Pat<(v4f32 (OpNode (v4f32 V128:$Rd), (v4f32 V128:$Rn), + (AArch64duplane32 (v4f32 V128:$Rm), + VectorIndexS:$idx))), + (!cast(INST # "v4i32_indexed") + V128:$Rd, V128:$Rn, V128:$Rm, VectorIndexS:$idx)>; + def : Pat<(v4f32 (OpNode (v4f32 V128:$Rd), (v4f32 V128:$Rn), + (AArch64dup (f32 FPR32Op:$Rm)))), + (!cast(INST # "v4i32_indexed") V128:$Rd, V128:$Rn, + (SUBREG_TO_REG (i32 0), FPR32Op:$Rm, ssub), (i64 0))>; + + // 2 variants for the .2d version: DUPLANE from 128-bit and DUP scalar. 
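  // E.g. (illustrative, not from the patch) with fma as OpNode the DUPLANE
  // form covers "fmla v0.2d, v1.2d, v2.d[1]", while the DUP-scalar form folds
  // a broadcast FPR64 into lane 0 of the same indexed instruction.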
+ def : Pat<(v2f64 (OpNode (v2f64 V128:$Rd), (v2f64 V128:$Rn), + (AArch64duplane64 (v2f64 V128:$Rm), + VectorIndexD:$idx))), + (!cast(INST # "v2i64_indexed") + V128:$Rd, V128:$Rn, V128:$Rm, VectorIndexS:$idx)>; + def : Pat<(v2f64 (OpNode (v2f64 V128:$Rd), (v2f64 V128:$Rn), + (AArch64dup (f64 FPR64Op:$Rm)))), + (!cast(INST # "v2i64_indexed") V128:$Rd, V128:$Rn, + (SUBREG_TO_REG (i32 0), FPR64Op:$Rm, dsub), (i64 0))>; + + // 2 variants for 32-bit scalar version: extract from .2s or from .4s + def : Pat<(f32 (OpNode (f32 FPR32:$Rd), (f32 FPR32:$Rn), + (vector_extract (v4f32 V128:$Rm), VectorIndexS:$idx))), + (!cast(INST # "v1i32_indexed") FPR32:$Rd, FPR32:$Rn, + V128:$Rm, VectorIndexS:$idx)>; + def : Pat<(f32 (OpNode (f32 FPR32:$Rd), (f32 FPR32:$Rn), + (vector_extract (v2f32 V64:$Rm), VectorIndexS:$idx))), + (!cast(INST # "v1i32_indexed") FPR32:$Rd, FPR32:$Rn, + (SUBREG_TO_REG (i32 0), V64:$Rm, dsub), VectorIndexS:$idx)>; + + // 1 variant for 64-bit scalar version: extract from .1d or from .2d + def : Pat<(f64 (OpNode (f64 FPR64:$Rd), (f64 FPR64:$Rn), + (vector_extract (v2f64 V128:$Rm), VectorIndexD:$idx))), + (!cast(INST # "v1i64_indexed") FPR64:$Rd, FPR64:$Rn, + V128:$Rm, VectorIndexD:$idx)>; +} + +multiclass SIMDFPIndexedSDTied opc, string asm> { + def v2i32_indexed : BaseSIMDIndexedTied<0, U, 0, 0b10, opc, V64, V64, + V128, VectorIndexS, + asm, ".2s", ".2s", ".2s", ".s", []> { + bits<2> idx; + let Inst{11} = idx{1}; + let Inst{21} = idx{0}; + } + + def v4i32_indexed : BaseSIMDIndexedTied<1, U, 0, 0b10, opc, + V128, V128, + V128, VectorIndexS, + asm, ".4s", ".4s", ".4s", ".s", []> { + bits<2> idx; + let Inst{11} = idx{1}; + let Inst{21} = idx{0}; + } + + def v2i64_indexed : BaseSIMDIndexedTied<1, U, 0, 0b11, opc, + V128, V128, + V128, VectorIndexD, + asm, ".2d", ".2d", ".2d", ".d", []> { + bits<1> idx; + let Inst{11} = idx{0}; + let Inst{21} = 0; + } + + + def v1i32_indexed : BaseSIMDIndexedTied<1, U, 1, 0b10, opc, + FPR32Op, FPR32Op, V128, VectorIndexS, + asm, ".s", "", "", ".s", []> { + bits<2> idx; + let Inst{11} = idx{1}; + let Inst{21} = idx{0}; + } + + def v1i64_indexed : BaseSIMDIndexedTied<1, U, 1, 0b11, opc, + FPR64Op, FPR64Op, V128, VectorIndexD, + asm, ".d", "", "", ".d", []> { + bits<1> idx; + let Inst{11} = idx{0}; + let Inst{21} = 0; + } +} + +multiclass SIMDIndexedHS opc, string asm, + SDPatternOperator OpNode> { + def v4i16_indexed : BaseSIMDIndexed<0, U, 0, 0b01, opc, V64, V64, + V128_lo, VectorIndexH, + asm, ".4h", ".4h", ".4h", ".h", + [(set (v4i16 V64:$Rd), + (OpNode (v4i16 V64:$Rn), + (v4i16 (AArch64duplane16 (v8i16 V128_lo:$Rm), VectorIndexH:$idx))))]> { + bits<3> idx; + let Inst{11} = idx{2}; + let Inst{21} = idx{1}; + let Inst{20} = idx{0}; + } + + def v8i16_indexed : BaseSIMDIndexed<1, U, 0, 0b01, opc, + V128, V128, + V128_lo, VectorIndexH, + asm, ".8h", ".8h", ".8h", ".h", + [(set (v8i16 V128:$Rd), + (OpNode (v8i16 V128:$Rn), + (v8i16 (AArch64duplane16 (v8i16 V128_lo:$Rm), VectorIndexH:$idx))))]> { + bits<3> idx; + let Inst{11} = idx{2}; + let Inst{21} = idx{1}; + let Inst{20} = idx{0}; + } + + def v2i32_indexed : BaseSIMDIndexed<0, U, 0, 0b10, opc, + V64, V64, + V128, VectorIndexS, + asm, ".2s", ".2s", ".2s", ".s", + [(set (v2i32 V64:$Rd), + (OpNode (v2i32 V64:$Rn), + (v2i32 (AArch64duplane32 (v4i32 V128:$Rm), VectorIndexS:$idx))))]> { + bits<2> idx; + let Inst{11} = idx{1}; + let Inst{21} = idx{0}; + } + + def v4i32_indexed : BaseSIMDIndexed<1, U, 0, 0b10, opc, + V128, V128, + V128, VectorIndexS, + asm, ".4s", ".4s", ".4s", ".s", + [(set (v4i32 V128:$Rd), + 
(OpNode (v4i32 V128:$Rn), + (v4i32 (AArch64duplane32 (v4i32 V128:$Rm), VectorIndexS:$idx))))]> { + bits<2> idx; + let Inst{11} = idx{1}; + let Inst{21} = idx{0}; + } + + def v1i16_indexed : BaseSIMDIndexed<1, U, 1, 0b01, opc, + FPR16Op, FPR16Op, V128_lo, VectorIndexH, + asm, ".h", "", "", ".h", []> { + bits<3> idx; + let Inst{11} = idx{2}; + let Inst{21} = idx{1}; + let Inst{20} = idx{0}; + } + + def v1i32_indexed : BaseSIMDIndexed<1, U, 1, 0b10, opc, + FPR32Op, FPR32Op, V128, VectorIndexS, + asm, ".s", "", "", ".s", + [(set (i32 FPR32Op:$Rd), + (OpNode FPR32Op:$Rn, + (i32 (vector_extract (v4i32 V128:$Rm), + VectorIndexS:$idx))))]> { + bits<2> idx; + let Inst{11} = idx{1}; + let Inst{21} = idx{0}; + } +} + +multiclass SIMDVectorIndexedHS opc, string asm, + SDPatternOperator OpNode> { + def v4i16_indexed : BaseSIMDIndexed<0, U, 0, 0b01, opc, + V64, V64, + V128_lo, VectorIndexH, + asm, ".4h", ".4h", ".4h", ".h", + [(set (v4i16 V64:$Rd), + (OpNode (v4i16 V64:$Rn), + (v4i16 (AArch64duplane16 (v8i16 V128_lo:$Rm), VectorIndexH:$idx))))]> { + bits<3> idx; + let Inst{11} = idx{2}; + let Inst{21} = idx{1}; + let Inst{20} = idx{0}; + } + + def v8i16_indexed : BaseSIMDIndexed<1, U, 0, 0b01, opc, + V128, V128, + V128_lo, VectorIndexH, + asm, ".8h", ".8h", ".8h", ".h", + [(set (v8i16 V128:$Rd), + (OpNode (v8i16 V128:$Rn), + (v8i16 (AArch64duplane16 (v8i16 V128_lo:$Rm), VectorIndexH:$idx))))]> { + bits<3> idx; + let Inst{11} = idx{2}; + let Inst{21} = idx{1}; + let Inst{20} = idx{0}; + } + + def v2i32_indexed : BaseSIMDIndexed<0, U, 0, 0b10, opc, + V64, V64, + V128, VectorIndexS, + asm, ".2s", ".2s", ".2s", ".s", + [(set (v2i32 V64:$Rd), + (OpNode (v2i32 V64:$Rn), + (v2i32 (AArch64duplane32 (v4i32 V128:$Rm), VectorIndexS:$idx))))]> { + bits<2> idx; + let Inst{11} = idx{1}; + let Inst{21} = idx{0}; + } + + def v4i32_indexed : BaseSIMDIndexed<1, U, 0, 0b10, opc, + V128, V128, + V128, VectorIndexS, + asm, ".4s", ".4s", ".4s", ".s", + [(set (v4i32 V128:$Rd), + (OpNode (v4i32 V128:$Rn), + (v4i32 (AArch64duplane32 (v4i32 V128:$Rm), VectorIndexS:$idx))))]> { + bits<2> idx; + let Inst{11} = idx{1}; + let Inst{21} = idx{0}; + } +} + +multiclass SIMDVectorIndexedHSTied opc, string asm, + SDPatternOperator OpNode> { + def v4i16_indexed : BaseSIMDIndexedTied<0, U, 0, 0b01, opc, V64, V64, + V128_lo, VectorIndexH, + asm, ".4h", ".4h", ".4h", ".h", + [(set (v4i16 V64:$dst), + (OpNode (v4i16 V64:$Rd),(v4i16 V64:$Rn), + (v4i16 (AArch64duplane16 (v8i16 V128_lo:$Rm), VectorIndexH:$idx))))]> { + bits<3> idx; + let Inst{11} = idx{2}; + let Inst{21} = idx{1}; + let Inst{20} = idx{0}; + } + + def v8i16_indexed : BaseSIMDIndexedTied<1, U, 0, 0b01, opc, + V128, V128, + V128_lo, VectorIndexH, + asm, ".8h", ".8h", ".8h", ".h", + [(set (v8i16 V128:$dst), + (OpNode (v8i16 V128:$Rd), (v8i16 V128:$Rn), + (v8i16 (AArch64duplane16 (v8i16 V128_lo:$Rm), VectorIndexH:$idx))))]> { + bits<3> idx; + let Inst{11} = idx{2}; + let Inst{21} = idx{1}; + let Inst{20} = idx{0}; + } + + def v2i32_indexed : BaseSIMDIndexedTied<0, U, 0, 0b10, opc, + V64, V64, + V128, VectorIndexS, + asm, ".2s", ".2s", ".2s", ".s", + [(set (v2i32 V64:$dst), + (OpNode (v2i32 V64:$Rd), (v2i32 V64:$Rn), + (v2i32 (AArch64duplane32 (v4i32 V128:$Rm), VectorIndexS:$idx))))]> { + bits<2> idx; + let Inst{11} = idx{1}; + let Inst{21} = idx{0}; + } + + def v4i32_indexed : BaseSIMDIndexedTied<1, U, 0, 0b10, opc, + V128, V128, + V128, VectorIndexS, + asm, ".4s", ".4s", ".4s", ".s", + [(set (v4i32 V128:$dst), + (OpNode (v4i32 V128:$Rd), (v4i32 V128:$Rn), + (v4i32 (AArch64duplane32 
(v4i32 V128:$Rm), VectorIndexS:$idx))))]> { + bits<2> idx; + let Inst{11} = idx{1}; + let Inst{21} = idx{0}; + } +} + +multiclass SIMDIndexedLongSD opc, string asm, + SDPatternOperator OpNode> { + def v4i16_indexed : BaseSIMDIndexed<0, U, 0, 0b01, opc, + V128, V64, + V128_lo, VectorIndexH, + asm, ".4s", ".4s", ".4h", ".h", + [(set (v4i32 V128:$Rd), + (OpNode (v4i16 V64:$Rn), + (v4i16 (AArch64duplane16 (v8i16 V128_lo:$Rm), VectorIndexH:$idx))))]> { + bits<3> idx; + let Inst{11} = idx{2}; + let Inst{21} = idx{1}; + let Inst{20} = idx{0}; + } + + def v8i16_indexed : BaseSIMDIndexed<1, U, 0, 0b01, opc, + V128, V128, + V128_lo, VectorIndexH, + asm#"2", ".4s", ".4s", ".8h", ".h", + [(set (v4i32 V128:$Rd), + (OpNode (extract_high_v8i16 V128:$Rn), + (extract_high_v8i16 (AArch64duplane16 (v8i16 V128_lo:$Rm), + VectorIndexH:$idx))))]> { + + bits<3> idx; + let Inst{11} = idx{2}; + let Inst{21} = idx{1}; + let Inst{20} = idx{0}; + } + + def v2i32_indexed : BaseSIMDIndexed<0, U, 0, 0b10, opc, + V128, V64, + V128, VectorIndexS, + asm, ".2d", ".2d", ".2s", ".s", + [(set (v2i64 V128:$Rd), + (OpNode (v2i32 V64:$Rn), + (v2i32 (AArch64duplane32 (v4i32 V128:$Rm), VectorIndexS:$idx))))]> { + bits<2> idx; + let Inst{11} = idx{1}; + let Inst{21} = idx{0}; + } + + def v4i32_indexed : BaseSIMDIndexed<1, U, 0, 0b10, opc, + V128, V128, + V128, VectorIndexS, + asm#"2", ".2d", ".2d", ".4s", ".s", + [(set (v2i64 V128:$Rd), + (OpNode (extract_high_v4i32 V128:$Rn), + (extract_high_v4i32 (AArch64duplane32 (v4i32 V128:$Rm), + VectorIndexS:$idx))))]> { + bits<2> idx; + let Inst{11} = idx{1}; + let Inst{21} = idx{0}; + } + + def v1i32_indexed : BaseSIMDIndexed<1, U, 1, 0b01, opc, + FPR32Op, FPR16Op, V128_lo, VectorIndexH, + asm, ".h", "", "", ".h", []> { + bits<3> idx; + let Inst{11} = idx{2}; + let Inst{21} = idx{1}; + let Inst{20} = idx{0}; + } + + def v1i64_indexed : BaseSIMDIndexed<1, U, 1, 0b10, opc, + FPR64Op, FPR32Op, V128, VectorIndexS, + asm, ".s", "", "", ".s", []> { + bits<2> idx; + let Inst{11} = idx{1}; + let Inst{21} = idx{0}; + } +} + +multiclass SIMDIndexedLongSQDMLXSDTied opc, string asm, + SDPatternOperator Accum> { + def v4i16_indexed : BaseSIMDIndexedTied<0, U, 0, 0b01, opc, + V128, V64, + V128_lo, VectorIndexH, + asm, ".4s", ".4s", ".4h", ".h", + [(set (v4i32 V128:$dst), + (Accum (v4i32 V128:$Rd), + (v4i32 (int_aarch64_neon_sqdmull + (v4i16 V64:$Rn), + (v4i16 (AArch64duplane16 (v8i16 V128_lo:$Rm), + VectorIndexH:$idx))))))]> { + bits<3> idx; + let Inst{11} = idx{2}; + let Inst{21} = idx{1}; + let Inst{20} = idx{0}; + } + + // FIXME: it would be nice to use the scalar (v1i32) instruction here, but an + // intermediate EXTRACT_SUBREG would be untyped. 
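  // (Illustrative, editor's note: the pattern below catches an i32 accumulate
  //  of lane 0 of a lane-indexed sqdmull, selects the v4i16 vector form, and
  //  extracts the scalar result again with EXTRACT_SUBREG.)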
+ def : Pat<(i32 (Accum (i32 FPR32Op:$Rd), + (i32 (vector_extract (v4i32 + (int_aarch64_neon_sqdmull (v4i16 V64:$Rn), + (v4i16 (AArch64duplane16 (v8i16 V128_lo:$Rm), + VectorIndexH:$idx)))), + (i64 0))))), + (EXTRACT_SUBREG + (!cast(NAME # v4i16_indexed) + (SUBREG_TO_REG (i32 0), FPR32Op:$Rd, ssub), V64:$Rn, + V128_lo:$Rm, VectorIndexH:$idx), + ssub)>; + + def v8i16_indexed : BaseSIMDIndexedTied<1, U, 0, 0b01, opc, + V128, V128, + V128_lo, VectorIndexH, + asm#"2", ".4s", ".4s", ".8h", ".h", + [(set (v4i32 V128:$dst), + (Accum (v4i32 V128:$Rd), + (v4i32 (int_aarch64_neon_sqdmull + (extract_high_v8i16 V128:$Rn), + (extract_high_v8i16 + (AArch64duplane16 (v8i16 V128_lo:$Rm), + VectorIndexH:$idx))))))]> { + bits<3> idx; + let Inst{11} = idx{2}; + let Inst{21} = idx{1}; + let Inst{20} = idx{0}; + } + + def v2i32_indexed : BaseSIMDIndexedTied<0, U, 0, 0b10, opc, + V128, V64, + V128, VectorIndexS, + asm, ".2d", ".2d", ".2s", ".s", + [(set (v2i64 V128:$dst), + (Accum (v2i64 V128:$Rd), + (v2i64 (int_aarch64_neon_sqdmull + (v2i32 V64:$Rn), + (v2i32 (AArch64duplane32 (v4i32 V128:$Rm), + VectorIndexS:$idx))))))]> { + bits<2> idx; + let Inst{11} = idx{1}; + let Inst{21} = idx{0}; + } + + def v4i32_indexed : BaseSIMDIndexedTied<1, U, 0, 0b10, opc, + V128, V128, + V128, VectorIndexS, + asm#"2", ".2d", ".2d", ".4s", ".s", + [(set (v2i64 V128:$dst), + (Accum (v2i64 V128:$Rd), + (v2i64 (int_aarch64_neon_sqdmull + (extract_high_v4i32 V128:$Rn), + (extract_high_v4i32 + (AArch64duplane32 (v4i32 V128:$Rm), + VectorIndexS:$idx))))))]> { + bits<2> idx; + let Inst{11} = idx{1}; + let Inst{21} = idx{0}; + } + + def v1i32_indexed : BaseSIMDIndexedTied<1, U, 1, 0b01, opc, + FPR32Op, FPR16Op, V128_lo, VectorIndexH, + asm, ".h", "", "", ".h", []> { + bits<3> idx; + let Inst{11} = idx{2}; + let Inst{21} = idx{1}; + let Inst{20} = idx{0}; + } + + + def v1i64_indexed : BaseSIMDIndexedTied<1, U, 1, 0b10, opc, + FPR64Op, FPR32Op, V128, VectorIndexS, + asm, ".s", "", "", ".s", + [(set (i64 FPR64Op:$dst), + (Accum (i64 FPR64Op:$Rd), + (i64 (int_aarch64_neon_sqdmulls_scalar + (i32 FPR32Op:$Rn), + (i32 (vector_extract (v4i32 V128:$Rm), + VectorIndexS:$idx))))))]> { + + bits<2> idx; + let Inst{11} = idx{1}; + let Inst{21} = idx{0}; + } +} + +multiclass SIMDVectorIndexedLongSD opc, string asm, + SDPatternOperator OpNode> { + let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in { + def v4i16_indexed : BaseSIMDIndexed<0, U, 0, 0b01, opc, + V128, V64, + V128_lo, VectorIndexH, + asm, ".4s", ".4s", ".4h", ".h", + [(set (v4i32 V128:$Rd), + (OpNode (v4i16 V64:$Rn), + (v4i16 (AArch64duplane16 (v8i16 V128_lo:$Rm), VectorIndexH:$idx))))]> { + bits<3> idx; + let Inst{11} = idx{2}; + let Inst{21} = idx{1}; + let Inst{20} = idx{0}; + } + + def v8i16_indexed : BaseSIMDIndexed<1, U, 0, 0b01, opc, + V128, V128, + V128_lo, VectorIndexH, + asm#"2", ".4s", ".4s", ".8h", ".h", + [(set (v4i32 V128:$Rd), + (OpNode (extract_high_v8i16 V128:$Rn), + (extract_high_v8i16 (AArch64duplane16 (v8i16 V128_lo:$Rm), + VectorIndexH:$idx))))]> { + + bits<3> idx; + let Inst{11} = idx{2}; + let Inst{21} = idx{1}; + let Inst{20} = idx{0}; + } + + def v2i32_indexed : BaseSIMDIndexed<0, U, 0, 0b10, opc, + V128, V64, + V128, VectorIndexS, + asm, ".2d", ".2d", ".2s", ".s", + [(set (v2i64 V128:$Rd), + (OpNode (v2i32 V64:$Rn), + (v2i32 (AArch64duplane32 (v4i32 V128:$Rm), VectorIndexS:$idx))))]> { + bits<2> idx; + let Inst{11} = idx{1}; + let Inst{21} = idx{0}; + } + + def v4i32_indexed : BaseSIMDIndexed<1, U, 0, 0b10, opc, + V128, V128, + V128, VectorIndexS, + asm#"2", 
".2d", ".2d", ".4s", ".s", + [(set (v2i64 V128:$Rd), + (OpNode (extract_high_v4i32 V128:$Rn), + (extract_high_v4i32 (AArch64duplane32 (v4i32 V128:$Rm), + VectorIndexS:$idx))))]> { + bits<2> idx; + let Inst{11} = idx{1}; + let Inst{21} = idx{0}; + } + } +} + +multiclass SIMDVectorIndexedLongSDTied opc, string asm, + SDPatternOperator OpNode> { + let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in { + def v4i16_indexed : BaseSIMDIndexedTied<0, U, 0, 0b01, opc, + V128, V64, + V128_lo, VectorIndexH, + asm, ".4s", ".4s", ".4h", ".h", + [(set (v4i32 V128:$dst), + (OpNode (v4i32 V128:$Rd), (v4i16 V64:$Rn), + (v4i16 (AArch64duplane16 (v8i16 V128_lo:$Rm), VectorIndexH:$idx))))]> { + bits<3> idx; + let Inst{11} = idx{2}; + let Inst{21} = idx{1}; + let Inst{20} = idx{0}; + } + + def v8i16_indexed : BaseSIMDIndexedTied<1, U, 0, 0b01, opc, + V128, V128, + V128_lo, VectorIndexH, + asm#"2", ".4s", ".4s", ".8h", ".h", + [(set (v4i32 V128:$dst), + (OpNode (v4i32 V128:$Rd), + (extract_high_v8i16 V128:$Rn), + (extract_high_v8i16 (AArch64duplane16 (v8i16 V128_lo:$Rm), + VectorIndexH:$idx))))]> { + bits<3> idx; + let Inst{11} = idx{2}; + let Inst{21} = idx{1}; + let Inst{20} = idx{0}; + } + + def v2i32_indexed : BaseSIMDIndexedTied<0, U, 0, 0b10, opc, + V128, V64, + V128, VectorIndexS, + asm, ".2d", ".2d", ".2s", ".s", + [(set (v2i64 V128:$dst), + (OpNode (v2i64 V128:$Rd), (v2i32 V64:$Rn), + (v2i32 (AArch64duplane32 (v4i32 V128:$Rm), VectorIndexS:$idx))))]> { + bits<2> idx; + let Inst{11} = idx{1}; + let Inst{21} = idx{0}; + } + + def v4i32_indexed : BaseSIMDIndexedTied<1, U, 0, 0b10, opc, + V128, V128, + V128, VectorIndexS, + asm#"2", ".2d", ".2d", ".4s", ".s", + [(set (v2i64 V128:$dst), + (OpNode (v2i64 V128:$Rd), + (extract_high_v4i32 V128:$Rn), + (extract_high_v4i32 (AArch64duplane32 (v4i32 V128:$Rm), + VectorIndexS:$idx))))]> { + bits<2> idx; + let Inst{11} = idx{1}; + let Inst{21} = idx{0}; + } + } +} + +//---------------------------------------------------------------------------- +// AdvSIMD scalar shift by immediate +//---------------------------------------------------------------------------- + +let mayStore = 0, mayLoad = 0, hasSideEffects = 0 in +class BaseSIMDScalarShift opc, bits<7> fixed_imm, + RegisterClass regtype1, RegisterClass regtype2, + Operand immtype, string asm, list pattern> + : I<(outs regtype1:$Rd), (ins regtype2:$Rn, immtype:$imm), + asm, "\t$Rd, $Rn, $imm", "", pattern>, + Sched<[WriteV]> { + bits<5> Rd; + bits<5> Rn; + bits<7> imm; + let Inst{31-30} = 0b01; + let Inst{29} = U; + let Inst{28-23} = 0b111110; + let Inst{22-16} = fixed_imm; + let Inst{15-11} = opc; + let Inst{10} = 1; + let Inst{9-5} = Rn; + let Inst{4-0} = Rd; +} + +let mayStore = 0, mayLoad = 0, hasSideEffects = 0 in +class BaseSIMDScalarShiftTied opc, bits<7> fixed_imm, + RegisterClass regtype1, RegisterClass regtype2, + Operand immtype, string asm, list pattern> + : I<(outs regtype1:$dst), (ins regtype1:$Rd, regtype2:$Rn, immtype:$imm), + asm, "\t$Rd, $Rn, $imm", "$Rd = $dst", pattern>, + Sched<[WriteV]> { + bits<5> Rd; + bits<5> Rn; + bits<7> imm; + let Inst{31-30} = 0b01; + let Inst{29} = U; + let Inst{28-23} = 0b111110; + let Inst{22-16} = fixed_imm; + let Inst{15-11} = opc; + let Inst{10} = 1; + let Inst{9-5} = Rn; + let Inst{4-0} = Rd; +} + + +multiclass SIMDScalarRShiftSD opc, string asm> { + def s : BaseSIMDScalarShift { + let Inst{20-16} = imm{4-0}; + } + + def d : BaseSIMDScalarShift { + let Inst{21-16} = imm{5-0}; + } +} + +multiclass SIMDScalarRShiftD opc, string asm, + SDPatternOperator OpNode> { + 
def d : BaseSIMDScalarShift { + let Inst{21-16} = imm{5-0}; + } + + def : Pat<(v1i64 (OpNode (v1i64 FPR64:$Rn), (i32 vecshiftR64:$imm))), + (!cast(NAME # "d") FPR64:$Rn, vecshiftR64:$imm)>; +} + +multiclass SIMDScalarRShiftDTied opc, string asm, + SDPatternOperator OpNode = null_frag> { + def d : BaseSIMDScalarShiftTied { + let Inst{21-16} = imm{5-0}; + } + + def : Pat<(v1i64 (OpNode (v1i64 FPR64:$Rd), (v1i64 FPR64:$Rn), + (i32 vecshiftR64:$imm))), + (!cast(NAME # "d") FPR64:$Rd, FPR64:$Rn, + vecshiftR64:$imm)>; +} + +multiclass SIMDScalarLShiftD opc, string asm, + SDPatternOperator OpNode> { + def d : BaseSIMDScalarShift { + let Inst{21-16} = imm{5-0}; + } +} + +let mayStore = 0, mayLoad = 0, hasSideEffects = 0 in +multiclass SIMDScalarLShiftDTied opc, string asm> { + def d : BaseSIMDScalarShiftTied { + let Inst{21-16} = imm{5-0}; + } +} + +let mayStore = 0, mayLoad = 0, hasSideEffects = 0 in +multiclass SIMDScalarRShiftBHS opc, string asm, + SDPatternOperator OpNode = null_frag> { + def b : BaseSIMDScalarShift { + let Inst{18-16} = imm{2-0}; + } + + def h : BaseSIMDScalarShift { + let Inst{19-16} = imm{3-0}; + } + + def s : BaseSIMDScalarShift { + let Inst{20-16} = imm{4-0}; + } +} + +multiclass SIMDScalarLShiftBHSD opc, string asm, + SDPatternOperator OpNode> { + def b : BaseSIMDScalarShift { + let Inst{18-16} = imm{2-0}; + } + + def h : BaseSIMDScalarShift { + let Inst{19-16} = imm{3-0}; + } + + def s : BaseSIMDScalarShift { + let Inst{20-16} = imm{4-0}; + } + + def d : BaseSIMDScalarShift { + let Inst{21-16} = imm{5-0}; + } + + def : Pat<(v1i64 (OpNode (v1i64 FPR64:$Rn), (i32 vecshiftL64:$imm))), + (!cast(NAME # "d") FPR64:$Rn, vecshiftL64:$imm)>; +} + +multiclass SIMDScalarRShiftBHSD opc, string asm> { + def b : BaseSIMDScalarShift { + let Inst{18-16} = imm{2-0}; + } + + def h : BaseSIMDScalarShift { + let Inst{19-16} = imm{3-0}; + } + + def s : BaseSIMDScalarShift { + let Inst{20-16} = imm{4-0}; + } + + def d : BaseSIMDScalarShift { + let Inst{21-16} = imm{5-0}; + } +} + +//---------------------------------------------------------------------------- +// AdvSIMD vector x indexed element +//---------------------------------------------------------------------------- + +let mayStore = 0, mayLoad = 0, hasSideEffects = 0 in +class BaseSIMDVectorShift opc, bits<7> fixed_imm, + RegisterOperand dst_reg, RegisterOperand src_reg, + Operand immtype, + string asm, string dst_kind, string src_kind, + list pattern> + : I<(outs dst_reg:$Rd), (ins src_reg:$Rn, immtype:$imm), + asm, "{\t$Rd" # dst_kind # ", $Rn" # src_kind # ", $imm" # + "|" # dst_kind # "\t$Rd, $Rn, $imm}", "", pattern>, + Sched<[WriteV]> { + bits<5> Rd; + bits<5> Rn; + let Inst{31} = 0; + let Inst{30} = Q; + let Inst{29} = U; + let Inst{28-23} = 0b011110; + let Inst{22-16} = fixed_imm; + let Inst{15-11} = opc; + let Inst{10} = 1; + let Inst{9-5} = Rn; + let Inst{4-0} = Rd; +} + +let mayStore = 0, mayLoad = 0, hasSideEffects = 0 in +class BaseSIMDVectorShiftTied opc, bits<7> fixed_imm, + RegisterOperand vectype1, RegisterOperand vectype2, + Operand immtype, + string asm, string dst_kind, string src_kind, + list pattern> + : I<(outs vectype1:$dst), (ins vectype1:$Rd, vectype2:$Rn, immtype:$imm), + asm, "{\t$Rd" # dst_kind # ", $Rn" # src_kind # ", $imm" # + "|" # dst_kind # "\t$Rd, $Rn, $imm}", "$Rd = $dst", pattern>, + Sched<[WriteV]> { + bits<5> Rd; + bits<5> Rn; + let Inst{31} = 0; + let Inst{30} = Q; + let Inst{29} = U; + let Inst{28-23} = 0b011110; + let Inst{22-16} = fixed_imm; + let Inst{15-11} = opc; + let Inst{10} = 1; + 
let Inst{9-5} = Rn; + let Inst{4-0} = Rd; +} + +multiclass SIMDVectorRShiftSD opc, string asm, + Intrinsic OpNode> { + def v2i32_shift : BaseSIMDVectorShift<0, U, opc, {0,1,?,?,?,?,?}, + V64, V64, vecshiftR32, + asm, ".2s", ".2s", + [(set (v2i32 V64:$Rd), (OpNode (v2f32 V64:$Rn), (i32 imm:$imm)))]> { + bits<5> imm; + let Inst{20-16} = imm; + } + + def v4i32_shift : BaseSIMDVectorShift<1, U, opc, {0,1,?,?,?,?,?}, + V128, V128, vecshiftR32, + asm, ".4s", ".4s", + [(set (v4i32 V128:$Rd), (OpNode (v4f32 V128:$Rn), (i32 imm:$imm)))]> { + bits<5> imm; + let Inst{20-16} = imm; + } + + def v2i64_shift : BaseSIMDVectorShift<1, U, opc, {1,?,?,?,?,?,?}, + V128, V128, vecshiftR64, + asm, ".2d", ".2d", + [(set (v2i64 V128:$Rd), (OpNode (v2f64 V128:$Rn), (i32 imm:$imm)))]> { + bits<6> imm; + let Inst{21-16} = imm; + } +} + +multiclass SIMDVectorRShiftSDToFP opc, string asm, + Intrinsic OpNode> { + def v2i32_shift : BaseSIMDVectorShift<0, U, opc, {0,1,?,?,?,?,?}, + V64, V64, vecshiftR32, + asm, ".2s", ".2s", + [(set (v2f32 V64:$Rd), (OpNode (v2i32 V64:$Rn), (i32 imm:$imm)))]> { + bits<5> imm; + let Inst{20-16} = imm; + } + + def v4i32_shift : BaseSIMDVectorShift<1, U, opc, {0,1,?,?,?,?,?}, + V128, V128, vecshiftR32, + asm, ".4s", ".4s", + [(set (v4f32 V128:$Rd), (OpNode (v4i32 V128:$Rn), (i32 imm:$imm)))]> { + bits<5> imm; + let Inst{20-16} = imm; + } + + def v2i64_shift : BaseSIMDVectorShift<1, U, opc, {1,?,?,?,?,?,?}, + V128, V128, vecshiftR64, + asm, ".2d", ".2d", + [(set (v2f64 V128:$Rd), (OpNode (v2i64 V128:$Rn), (i32 imm:$imm)))]> { + bits<6> imm; + let Inst{21-16} = imm; + } +} + +multiclass SIMDVectorRShiftNarrowBHS opc, string asm, + SDPatternOperator OpNode> { + def v8i8_shift : BaseSIMDVectorShift<0, U, opc, {0,0,0,1,?,?,?}, + V64, V128, vecshiftR16Narrow, + asm, ".8b", ".8h", + [(set (v8i8 V64:$Rd), (OpNode (v8i16 V128:$Rn), vecshiftR16Narrow:$imm))]> { + bits<3> imm; + let Inst{18-16} = imm; + } + + def v16i8_shift : BaseSIMDVectorShiftTied<1, U, opc, {0,0,0,1,?,?,?}, + V128, V128, vecshiftR16Narrow, + asm#"2", ".16b", ".8h", []> { + bits<3> imm; + let Inst{18-16} = imm; + let hasSideEffects = 0; + } + + def v4i16_shift : BaseSIMDVectorShift<0, U, opc, {0,0,1,?,?,?,?}, + V64, V128, vecshiftR32Narrow, + asm, ".4h", ".4s", + [(set (v4i16 V64:$Rd), (OpNode (v4i32 V128:$Rn), vecshiftR32Narrow:$imm))]> { + bits<4> imm; + let Inst{19-16} = imm; + } + + def v8i16_shift : BaseSIMDVectorShiftTied<1, U, opc, {0,0,1,?,?,?,?}, + V128, V128, vecshiftR32Narrow, + asm#"2", ".8h", ".4s", []> { + bits<4> imm; + let Inst{19-16} = imm; + let hasSideEffects = 0; + } + + def v2i32_shift : BaseSIMDVectorShift<0, U, opc, {0,1,?,?,?,?,?}, + V64, V128, vecshiftR64Narrow, + asm, ".2s", ".2d", + [(set (v2i32 V64:$Rd), (OpNode (v2i64 V128:$Rn), vecshiftR64Narrow:$imm))]> { + bits<5> imm; + let Inst{20-16} = imm; + } + + def v4i32_shift : BaseSIMDVectorShiftTied<1, U, opc, {0,1,?,?,?,?,?}, + V128, V128, vecshiftR64Narrow, + asm#"2", ".4s", ".2d", []> { + bits<5> imm; + let Inst{20-16} = imm; + let hasSideEffects = 0; + } + + // TableGen doesn't like patters w/ INSERT_SUBREG on the instructions + // themselves, so put them here instead. + + // Patterns involving what's effectively an insert high and a normal + // intrinsic, represented by CONCAT_VECTORS. 
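  // E.g. (illustrative, not from the patch) "shrn2 v0.16b, v1.8h, #4" narrows
  // into the high half of v0; the concat_vectors operand is the low half that
  // is left untouched.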
+ def : Pat<(concat_vectors (v8i8 V64:$Rd),(OpNode (v8i16 V128:$Rn), + vecshiftR16Narrow:$imm)), + (!cast(NAME # "v16i8_shift") + (INSERT_SUBREG (IMPLICIT_DEF), V64:$Rd, dsub), + V128:$Rn, vecshiftR16Narrow:$imm)>; + def : Pat<(concat_vectors (v4i16 V64:$Rd), (OpNode (v4i32 V128:$Rn), + vecshiftR32Narrow:$imm)), + (!cast(NAME # "v8i16_shift") + (INSERT_SUBREG (IMPLICIT_DEF), V64:$Rd, dsub), + V128:$Rn, vecshiftR32Narrow:$imm)>; + def : Pat<(concat_vectors (v2i32 V64:$Rd), (OpNode (v2i64 V128:$Rn), + vecshiftR64Narrow:$imm)), + (!cast(NAME # "v4i32_shift") + (INSERT_SUBREG (IMPLICIT_DEF), V64:$Rd, dsub), + V128:$Rn, vecshiftR64Narrow:$imm)>; +} + +multiclass SIMDVectorLShiftBHSD opc, string asm, + SDPatternOperator OpNode> { + def v8i8_shift : BaseSIMDVectorShift<0, U, opc, {0,0,0,1,?,?,?}, + V64, V64, vecshiftL8, + asm, ".8b", ".8b", + [(set (v8i8 V64:$Rd), (OpNode (v8i8 V64:$Rn), + (i32 vecshiftL8:$imm)))]> { + bits<3> imm; + let Inst{18-16} = imm; + } + + def v16i8_shift : BaseSIMDVectorShift<1, U, opc, {0,0,0,1,?,?,?}, + V128, V128, vecshiftL8, + asm, ".16b", ".16b", + [(set (v16i8 V128:$Rd), (OpNode (v16i8 V128:$Rn), + (i32 vecshiftL8:$imm)))]> { + bits<3> imm; + let Inst{18-16} = imm; + } + + def v4i16_shift : BaseSIMDVectorShift<0, U, opc, {0,0,1,?,?,?,?}, + V64, V64, vecshiftL16, + asm, ".4h", ".4h", + [(set (v4i16 V64:$Rd), (OpNode (v4i16 V64:$Rn), + (i32 vecshiftL16:$imm)))]> { + bits<4> imm; + let Inst{19-16} = imm; + } + + def v8i16_shift : BaseSIMDVectorShift<1, U, opc, {0,0,1,?,?,?,?}, + V128, V128, vecshiftL16, + asm, ".8h", ".8h", + [(set (v8i16 V128:$Rd), (OpNode (v8i16 V128:$Rn), + (i32 vecshiftL16:$imm)))]> { + bits<4> imm; + let Inst{19-16} = imm; + } + + def v2i32_shift : BaseSIMDVectorShift<0, U, opc, {0,1,?,?,?,?,?}, + V64, V64, vecshiftL32, + asm, ".2s", ".2s", + [(set (v2i32 V64:$Rd), (OpNode (v2i32 V64:$Rn), + (i32 vecshiftL32:$imm)))]> { + bits<5> imm; + let Inst{20-16} = imm; + } + + def v4i32_shift : BaseSIMDVectorShift<1, U, opc, {0,1,?,?,?,?,?}, + V128, V128, vecshiftL32, + asm, ".4s", ".4s", + [(set (v4i32 V128:$Rd), (OpNode (v4i32 V128:$Rn), + (i32 vecshiftL32:$imm)))]> { + bits<5> imm; + let Inst{20-16} = imm; + } + + def v2i64_shift : BaseSIMDVectorShift<1, U, opc, {1,?,?,?,?,?,?}, + V128, V128, vecshiftL64, + asm, ".2d", ".2d", + [(set (v2i64 V128:$Rd), (OpNode (v2i64 V128:$Rn), + (i32 vecshiftL64:$imm)))]> { + bits<6> imm; + let Inst{21-16} = imm; + } +} + +multiclass SIMDVectorRShiftBHSD opc, string asm, + SDPatternOperator OpNode> { + def v8i8_shift : BaseSIMDVectorShift<0, U, opc, {0,0,0,1,?,?,?}, + V64, V64, vecshiftR8, + asm, ".8b", ".8b", + [(set (v8i8 V64:$Rd), (OpNode (v8i8 V64:$Rn), + (i32 vecshiftR8:$imm)))]> { + bits<3> imm; + let Inst{18-16} = imm; + } + + def v16i8_shift : BaseSIMDVectorShift<1, U, opc, {0,0,0,1,?,?,?}, + V128, V128, vecshiftR8, + asm, ".16b", ".16b", + [(set (v16i8 V128:$Rd), (OpNode (v16i8 V128:$Rn), + (i32 vecshiftR8:$imm)))]> { + bits<3> imm; + let Inst{18-16} = imm; + } + + def v4i16_shift : BaseSIMDVectorShift<0, U, opc, {0,0,1,?,?,?,?}, + V64, V64, vecshiftR16, + asm, ".4h", ".4h", + [(set (v4i16 V64:$Rd), (OpNode (v4i16 V64:$Rn), + (i32 vecshiftR16:$imm)))]> { + bits<4> imm; + let Inst{19-16} = imm; + } + + def v8i16_shift : BaseSIMDVectorShift<1, U, opc, {0,0,1,?,?,?,?}, + V128, V128, vecshiftR16, + asm, ".8h", ".8h", + [(set (v8i16 V128:$Rd), (OpNode (v8i16 V128:$Rn), + (i32 vecshiftR16:$imm)))]> { + bits<4> imm; + let Inst{19-16} = imm; + } + + def v2i32_shift : BaseSIMDVectorShift<0, U, opc, {0,1,?,?,?,?,?}, + 
V64, V64, vecshiftR32, + asm, ".2s", ".2s", + [(set (v2i32 V64:$Rd), (OpNode (v2i32 V64:$Rn), + (i32 vecshiftR32:$imm)))]> { + bits<5> imm; + let Inst{20-16} = imm; + } + + def v4i32_shift : BaseSIMDVectorShift<1, U, opc, {0,1,?,?,?,?,?}, + V128, V128, vecshiftR32, + asm, ".4s", ".4s", + [(set (v4i32 V128:$Rd), (OpNode (v4i32 V128:$Rn), + (i32 vecshiftR32:$imm)))]> { + bits<5> imm; + let Inst{20-16} = imm; + } + + def v2i64_shift : BaseSIMDVectorShift<1, U, opc, {1,?,?,?,?,?,?}, + V128, V128, vecshiftR64, + asm, ".2d", ".2d", + [(set (v2i64 V128:$Rd), (OpNode (v2i64 V128:$Rn), + (i32 vecshiftR64:$imm)))]> { + bits<6> imm; + let Inst{21-16} = imm; + } +} + +let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in +multiclass SIMDVectorRShiftBHSDTied opc, string asm, + SDPatternOperator OpNode = null_frag> { + def v8i8_shift : BaseSIMDVectorShiftTied<0, U, opc, {0,0,0,1,?,?,?}, + V64, V64, vecshiftR8, asm, ".8b", ".8b", + [(set (v8i8 V64:$dst), + (OpNode (v8i8 V64:$Rd), (v8i8 V64:$Rn), + (i32 vecshiftR8:$imm)))]> { + bits<3> imm; + let Inst{18-16} = imm; + } + + def v16i8_shift : BaseSIMDVectorShiftTied<1, U, opc, {0,0,0,1,?,?,?}, + V128, V128, vecshiftR8, asm, ".16b", ".16b", + [(set (v16i8 V128:$dst), + (OpNode (v16i8 V128:$Rd), (v16i8 V128:$Rn), + (i32 vecshiftR8:$imm)))]> { + bits<3> imm; + let Inst{18-16} = imm; + } + + def v4i16_shift : BaseSIMDVectorShiftTied<0, U, opc, {0,0,1,?,?,?,?}, + V64, V64, vecshiftR16, asm, ".4h", ".4h", + [(set (v4i16 V64:$dst), + (OpNode (v4i16 V64:$Rd), (v4i16 V64:$Rn), + (i32 vecshiftR16:$imm)))]> { + bits<4> imm; + let Inst{19-16} = imm; + } + + def v8i16_shift : BaseSIMDVectorShiftTied<1, U, opc, {0,0,1,?,?,?,?}, + V128, V128, vecshiftR16, asm, ".8h", ".8h", + [(set (v8i16 V128:$dst), + (OpNode (v8i16 V128:$Rd), (v8i16 V128:$Rn), + (i32 vecshiftR16:$imm)))]> { + bits<4> imm; + let Inst{19-16} = imm; + } + + def v2i32_shift : BaseSIMDVectorShiftTied<0, U, opc, {0,1,?,?,?,?,?}, + V64, V64, vecshiftR32, asm, ".2s", ".2s", + [(set (v2i32 V64:$dst), + (OpNode (v2i32 V64:$Rd), (v2i32 V64:$Rn), + (i32 vecshiftR32:$imm)))]> { + bits<5> imm; + let Inst{20-16} = imm; + } + + def v4i32_shift : BaseSIMDVectorShiftTied<1, U, opc, {0,1,?,?,?,?,?}, + V128, V128, vecshiftR32, asm, ".4s", ".4s", + [(set (v4i32 V128:$dst), + (OpNode (v4i32 V128:$Rd), (v4i32 V128:$Rn), + (i32 vecshiftR32:$imm)))]> { + bits<5> imm; + let Inst{20-16} = imm; + } + + def v2i64_shift : BaseSIMDVectorShiftTied<1, U, opc, {1,?,?,?,?,?,?}, + V128, V128, vecshiftR64, + asm, ".2d", ".2d", [(set (v2i64 V128:$dst), + (OpNode (v2i64 V128:$Rd), (v2i64 V128:$Rn), + (i32 vecshiftR64:$imm)))]> { + bits<6> imm; + let Inst{21-16} = imm; + } +} + +multiclass SIMDVectorLShiftBHSDTied opc, string asm, + SDPatternOperator OpNode = null_frag> { + def v8i8_shift : BaseSIMDVectorShiftTied<0, U, opc, {0,0,0,1,?,?,?}, + V64, V64, vecshiftL8, + asm, ".8b", ".8b", + [(set (v8i8 V64:$dst), + (OpNode (v8i8 V64:$Rd), (v8i8 V64:$Rn), + (i32 vecshiftL8:$imm)))]> { + bits<3> imm; + let Inst{18-16} = imm; + } + + def v16i8_shift : BaseSIMDVectorShiftTied<1, U, opc, {0,0,0,1,?,?,?}, + V128, V128, vecshiftL8, + asm, ".16b", ".16b", + [(set (v16i8 V128:$dst), + (OpNode (v16i8 V128:$Rd), (v16i8 V128:$Rn), + (i32 vecshiftL8:$imm)))]> { + bits<3> imm; + let Inst{18-16} = imm; + } + + def v4i16_shift : BaseSIMDVectorShiftTied<0, U, opc, {0,0,1,?,?,?,?}, + V64, V64, vecshiftL16, + asm, ".4h", ".4h", + [(set (v4i16 V64:$dst), + (OpNode (v4i16 V64:$Rd), (v4i16 V64:$Rn), + (i32 vecshiftL16:$imm)))]> { + bits<4> imm; + let Inst{19-16} = 
imm; + } + + def v8i16_shift : BaseSIMDVectorShiftTied<1, U, opc, {0,0,1,?,?,?,?}, + V128, V128, vecshiftL16, + asm, ".8h", ".8h", + [(set (v8i16 V128:$dst), + (OpNode (v8i16 V128:$Rd), (v8i16 V128:$Rn), + (i32 vecshiftL16:$imm)))]> { + bits<4> imm; + let Inst{19-16} = imm; + } + + def v2i32_shift : BaseSIMDVectorShiftTied<0, U, opc, {0,1,?,?,?,?,?}, + V64, V64, vecshiftL32, + asm, ".2s", ".2s", + [(set (v2i32 V64:$dst), + (OpNode (v2i32 V64:$Rd), (v2i32 V64:$Rn), + (i32 vecshiftL32:$imm)))]> { + bits<5> imm; + let Inst{20-16} = imm; + } + + def v4i32_shift : BaseSIMDVectorShiftTied<1, U, opc, {0,1,?,?,?,?,?}, + V128, V128, vecshiftL32, + asm, ".4s", ".4s", + [(set (v4i32 V128:$dst), + (OpNode (v4i32 V128:$Rd), (v4i32 V128:$Rn), + (i32 vecshiftL32:$imm)))]> { + bits<5> imm; + let Inst{20-16} = imm; + } + + def v2i64_shift : BaseSIMDVectorShiftTied<1, U, opc, {1,?,?,?,?,?,?}, + V128, V128, vecshiftL64, + asm, ".2d", ".2d", + [(set (v2i64 V128:$dst), + (OpNode (v2i64 V128:$Rd), (v2i64 V128:$Rn), + (i32 vecshiftL64:$imm)))]> { + bits<6> imm; + let Inst{21-16} = imm; + } +} + +multiclass SIMDVectorLShiftLongBHSD opc, string asm, + SDPatternOperator OpNode> { + def v8i8_shift : BaseSIMDVectorShift<0, U, opc, {0,0,0,1,?,?,?}, + V128, V64, vecshiftL8, asm, ".8h", ".8b", + [(set (v8i16 V128:$Rd), (OpNode (v8i8 V64:$Rn), vecshiftL8:$imm))]> { + bits<3> imm; + let Inst{18-16} = imm; + } + + def v16i8_shift : BaseSIMDVectorShift<1, U, opc, {0,0,0,1,?,?,?}, + V128, V128, vecshiftL8, + asm#"2", ".8h", ".16b", + [(set (v8i16 V128:$Rd), + (OpNode (extract_high_v16i8 V128:$Rn), vecshiftL8:$imm))]> { + bits<3> imm; + let Inst{18-16} = imm; + } + + def v4i16_shift : BaseSIMDVectorShift<0, U, opc, {0,0,1,?,?,?,?}, + V128, V64, vecshiftL16, asm, ".4s", ".4h", + [(set (v4i32 V128:$Rd), (OpNode (v4i16 V64:$Rn), vecshiftL16:$imm))]> { + bits<4> imm; + let Inst{19-16} = imm; + } + + def v8i16_shift : BaseSIMDVectorShift<1, U, opc, {0,0,1,?,?,?,?}, + V128, V128, vecshiftL16, + asm#"2", ".4s", ".8h", + [(set (v4i32 V128:$Rd), + (OpNode (extract_high_v8i16 V128:$Rn), vecshiftL16:$imm))]> { + + bits<4> imm; + let Inst{19-16} = imm; + } + + def v2i32_shift : BaseSIMDVectorShift<0, U, opc, {0,1,?,?,?,?,?}, + V128, V64, vecshiftL32, asm, ".2d", ".2s", + [(set (v2i64 V128:$Rd), (OpNode (v2i32 V64:$Rn), vecshiftL32:$imm))]> { + bits<5> imm; + let Inst{20-16} = imm; + } + + def v4i32_shift : BaseSIMDVectorShift<1, U, opc, {0,1,?,?,?,?,?}, + V128, V128, vecshiftL32, + asm#"2", ".2d", ".4s", + [(set (v2i64 V128:$Rd), + (OpNode (extract_high_v4i32 V128:$Rn), vecshiftL32:$imm))]> { + bits<5> imm; + let Inst{20-16} = imm; + } +} + + +//--- +// Vector load/store +//--- +// SIMD ldX/stX no-index memory references don't allow the optional +// ", #0" constant and handle post-indexing explicitly, so we use +// a more specialized parse method for them. Otherwise, it's the same as +// the general GPR64sp handling. 
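// E.g. (illustrative, editor's examples rather than text from the patch)
//   ld1 { v0.16b }, [x0]        // no-offset form; "[x0, #0]" is rejected
//   ld1 { v0.16b }, [x0], #16   // post-indexed by immediate (encoded via XZR)
//   ld1 { v0.16b }, [x0], x1    // post-indexed by register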
+ +class BaseSIMDLdSt opcode, bits<2> size, + string asm, dag oops, dag iops, list pattern> + : I { + bits<5> Vt; + bits<5> Rn; + let Inst{31} = 0; + let Inst{30} = Q; + let Inst{29-23} = 0b0011000; + let Inst{22} = L; + let Inst{21-16} = 0b000000; + let Inst{15-12} = opcode; + let Inst{11-10} = size; + let Inst{9-5} = Rn; + let Inst{4-0} = Vt; +} + +class BaseSIMDLdStPost opcode, bits<2> size, + string asm, dag oops, dag iops> + : I { + bits<5> Vt; + bits<5> Rn; + bits<5> Xm; + let Inst{31} = 0; + let Inst{30} = Q; + let Inst{29-23} = 0b0011001; + let Inst{22} = L; + let Inst{21} = 0; + let Inst{20-16} = Xm; + let Inst{15-12} = opcode; + let Inst{11-10} = size; + let Inst{9-5} = Rn; + let Inst{4-0} = Vt; +} + +// The immediate form of AdvSIMD post-indexed addressing is encoded with +// register post-index addressing from the zero register. +multiclass SIMDLdStAliases { + // E.g. "ld1 { v0.8b, v1.8b }, [x1], #16" + // "ld1\t$Vt, [$Rn], #16" + // may get mapped to + // (LD1Twov8b_POST VecListTwo8b:$Vt, GPR64sp:$Rn, XZR) + def : InstAlias(NAME # Count # "v" # layout # "_POST") + GPR64sp:$Rn, + !cast("VecList" # Count # layout):$Vt, + XZR), 1>; + + // E.g. "ld1.8b { v0, v1 }, [x1], #16" + // "ld1.8b\t$Vt, [$Rn], #16" + // may get mapped to + // (LD1Twov8b_POST VecListTwo64:$Vt, GPR64sp:$Rn, XZR) + def : InstAlias(NAME # Count # "v" # layout # "_POST") + GPR64sp:$Rn, + !cast("VecList" # Count # Size):$Vt, + XZR), 0>; + + // E.g. "ld1.8b { v0, v1 }, [x1]" + // "ld1\t$Vt, [$Rn]" + // may get mapped to + // (LD1Twov8b VecListTwo64:$Vt, GPR64sp:$Rn) + def : InstAlias(NAME # Count # "v" # layout) + !cast("VecList" # Count # Size):$Vt, + GPR64sp:$Rn), 0>; + + // E.g. "ld1.8b { v0, v1 }, [x1], x2" + // "ld1\t$Vt, [$Rn], $Xm" + // may get mapped to + // (LD1Twov8b_POST VecListTwo64:$Vt, GPR64sp:$Rn, GPR64pi8:$Xm) + def : InstAlias(NAME # Count # "v" # layout # "_POST") + GPR64sp:$Rn, + !cast("VecList" # Count # Size):$Vt, + !cast("GPR64pi" # Offset):$Xm), 0>; +} + +multiclass BaseSIMDLdN opcode> { + let hasSideEffects = 0, mayLoad = 1, mayStore = 0 in { + def v16b: BaseSIMDLdSt<1, 1, opcode, 0b00, asm, + (outs !cast(veclist # "16b"):$Vt), + (ins GPR64sp:$Rn), []>; + def v8h : BaseSIMDLdSt<1, 1, opcode, 0b01, asm, + (outs !cast(veclist # "8h"):$Vt), + (ins GPR64sp:$Rn), []>; + def v4s : BaseSIMDLdSt<1, 1, opcode, 0b10, asm, + (outs !cast(veclist # "4s"):$Vt), + (ins GPR64sp:$Rn), []>; + def v2d : BaseSIMDLdSt<1, 1, opcode, 0b11, asm, + (outs !cast(veclist # "2d"):$Vt), + (ins GPR64sp:$Rn), []>; + def v8b : BaseSIMDLdSt<0, 1, opcode, 0b00, asm, + (outs !cast(veclist # "8b"):$Vt), + (ins GPR64sp:$Rn), []>; + def v4h : BaseSIMDLdSt<0, 1, opcode, 0b01, asm, + (outs !cast(veclist # "4h"):$Vt), + (ins GPR64sp:$Rn), []>; + def v2s : BaseSIMDLdSt<0, 1, opcode, 0b10, asm, + (outs !cast(veclist # "2s"):$Vt), + (ins GPR64sp:$Rn), []>; + + + def v16b_POST: BaseSIMDLdStPost<1, 1, opcode, 0b00, asm, + (outs GPR64sp:$wback, + !cast(veclist # "16b"):$Vt), + (ins GPR64sp:$Rn, + !cast("GPR64pi" # Offset128):$Xm)>; + def v8h_POST : BaseSIMDLdStPost<1, 1, opcode, 0b01, asm, + (outs GPR64sp:$wback, + !cast(veclist # "8h"):$Vt), + (ins GPR64sp:$Rn, + !cast("GPR64pi" # Offset128):$Xm)>; + def v4s_POST : BaseSIMDLdStPost<1, 1, opcode, 0b10, asm, + (outs GPR64sp:$wback, + !cast(veclist # "4s"):$Vt), + (ins GPR64sp:$Rn, + !cast("GPR64pi" # Offset128):$Xm)>; + def v2d_POST : BaseSIMDLdStPost<1, 1, opcode, 0b11, asm, + (outs GPR64sp:$wback, + !cast(veclist # "2d"):$Vt), + (ins GPR64sp:$Rn, + !cast("GPR64pi" # Offset128):$Xm)>; + 
def v8b_POST : BaseSIMDLdStPost<0, 1, opcode, 0b00, asm, + (outs GPR64sp:$wback, + !cast(veclist # "8b"):$Vt), + (ins GPR64sp:$Rn, + !cast("GPR64pi" # Offset64):$Xm)>; + def v4h_POST : BaseSIMDLdStPost<0, 1, opcode, 0b01, asm, + (outs GPR64sp:$wback, + !cast(veclist # "4h"):$Vt), + (ins GPR64sp:$Rn, + !cast("GPR64pi" # Offset64):$Xm)>; + def v2s_POST : BaseSIMDLdStPost<0, 1, opcode, 0b10, asm, + (outs GPR64sp:$wback, + !cast(veclist # "2s"):$Vt), + (ins GPR64sp:$Rn, + !cast("GPR64pi" # Offset64):$Xm)>; + } + + defm : SIMDLdStAliases; + defm : SIMDLdStAliases; + defm : SIMDLdStAliases; + defm : SIMDLdStAliases; + defm : SIMDLdStAliases; + defm : SIMDLdStAliases; + defm : SIMDLdStAliases; +} + +// Only ld1/st1 has a v1d version. +multiclass BaseSIMDStN opcode> { + let hasSideEffects = 0, mayStore = 1, mayLoad = 0 in { + def v16b : BaseSIMDLdSt<1, 0, opcode, 0b00, asm, (outs), + (ins !cast(veclist # "16b"):$Vt, + GPR64sp:$Rn), []>; + def v8h : BaseSIMDLdSt<1, 0, opcode, 0b01, asm, (outs), + (ins !cast(veclist # "8h"):$Vt, + GPR64sp:$Rn), []>; + def v4s : BaseSIMDLdSt<1, 0, opcode, 0b10, asm, (outs), + (ins !cast(veclist # "4s"):$Vt, + GPR64sp:$Rn), []>; + def v2d : BaseSIMDLdSt<1, 0, opcode, 0b11, asm, (outs), + (ins !cast(veclist # "2d"):$Vt, + GPR64sp:$Rn), []>; + def v8b : BaseSIMDLdSt<0, 0, opcode, 0b00, asm, (outs), + (ins !cast(veclist # "8b"):$Vt, + GPR64sp:$Rn), []>; + def v4h : BaseSIMDLdSt<0, 0, opcode, 0b01, asm, (outs), + (ins !cast(veclist # "4h"):$Vt, + GPR64sp:$Rn), []>; + def v2s : BaseSIMDLdSt<0, 0, opcode, 0b10, asm, (outs), + (ins !cast(veclist # "2s"):$Vt, + GPR64sp:$Rn), []>; + + def v16b_POST : BaseSIMDLdStPost<1, 0, opcode, 0b00, asm, + (outs GPR64sp:$wback), + (ins !cast(veclist # "16b"):$Vt, + GPR64sp:$Rn, + !cast("GPR64pi" # Offset128):$Xm)>; + def v8h_POST : BaseSIMDLdStPost<1, 0, opcode, 0b01, asm, + (outs GPR64sp:$wback), + (ins !cast(veclist # "8h"):$Vt, + GPR64sp:$Rn, + !cast("GPR64pi" # Offset128):$Xm)>; + def v4s_POST : BaseSIMDLdStPost<1, 0, opcode, 0b10, asm, + (outs GPR64sp:$wback), + (ins !cast(veclist # "4s"):$Vt, + GPR64sp:$Rn, + !cast("GPR64pi" # Offset128):$Xm)>; + def v2d_POST : BaseSIMDLdStPost<1, 0, opcode, 0b11, asm, + (outs GPR64sp:$wback), + (ins !cast(veclist # "2d"):$Vt, + GPR64sp:$Rn, + !cast("GPR64pi" # Offset128):$Xm)>; + def v8b_POST : BaseSIMDLdStPost<0, 0, opcode, 0b00, asm, + (outs GPR64sp:$wback), + (ins !cast(veclist # "8b"):$Vt, + GPR64sp:$Rn, + !cast("GPR64pi" # Offset64):$Xm)>; + def v4h_POST : BaseSIMDLdStPost<0, 0, opcode, 0b01, asm, + (outs GPR64sp:$wback), + (ins !cast(veclist # "4h"):$Vt, + GPR64sp:$Rn, + !cast("GPR64pi" # Offset64):$Xm)>; + def v2s_POST : BaseSIMDLdStPost<0, 0, opcode, 0b10, asm, + (outs GPR64sp:$wback), + (ins !cast(veclist # "2s"):$Vt, + GPR64sp:$Rn, + !cast("GPR64pi" # Offset64):$Xm)>; + } + + defm : SIMDLdStAliases; + defm : SIMDLdStAliases; + defm : SIMDLdStAliases; + defm : SIMDLdStAliases; + defm : SIMDLdStAliases; + defm : SIMDLdStAliases; + defm : SIMDLdStAliases; +} + +multiclass BaseSIMDLd1 opcode> + : BaseSIMDLdN { + + // LD1 instructions have extra "1d" variants. 
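  // E.g. (illustrative, not from the patch) "ld1 { v0.1d }, [x0]" is valid,
  // but ld2/ld3/ld4 have no ".1d" arrangement.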
+ let hasSideEffects = 0, mayLoad = 1, mayStore = 0 in { + def v1d : BaseSIMDLdSt<0, 1, opcode, 0b11, asm, + (outs !cast(veclist # "1d"):$Vt), + (ins GPR64sp:$Rn), []>; + + def v1d_POST : BaseSIMDLdStPost<0, 1, opcode, 0b11, asm, + (outs GPR64sp:$wback, + !cast(veclist # "1d"):$Vt), + (ins GPR64sp:$Rn, + !cast("GPR64pi" # Offset64):$Xm)>; + } + + defm : SIMDLdStAliases; +} + +multiclass BaseSIMDSt1 opcode> + : BaseSIMDStN { + + // ST1 instructions have extra "1d" variants. + let hasSideEffects = 0, mayLoad = 0, mayStore = 1 in { + def v1d : BaseSIMDLdSt<0, 0, opcode, 0b11, asm, (outs), + (ins !cast(veclist # "1d"):$Vt, + GPR64sp:$Rn), []>; + + def v1d_POST : BaseSIMDLdStPost<0, 0, opcode, 0b11, asm, + (outs GPR64sp:$wback), + (ins !cast(veclist # "1d"):$Vt, + GPR64sp:$Rn, + !cast("GPR64pi" # Offset64):$Xm)>; + } + + defm : SIMDLdStAliases; +} + +multiclass SIMDLd1Multiple { + defm One : BaseSIMDLd1<"One", asm, "VecListOne", 16, 8, 0b0111>; + defm Two : BaseSIMDLd1<"Two", asm, "VecListTwo", 32, 16, 0b1010>; + defm Three : BaseSIMDLd1<"Three", asm, "VecListThree", 48, 24, 0b0110>; + defm Four : BaseSIMDLd1<"Four", asm, "VecListFour", 64, 32, 0b0010>; +} + +multiclass SIMDSt1Multiple { + defm One : BaseSIMDSt1<"One", asm, "VecListOne", 16, 8, 0b0111>; + defm Two : BaseSIMDSt1<"Two", asm, "VecListTwo", 32, 16, 0b1010>; + defm Three : BaseSIMDSt1<"Three", asm, "VecListThree", 48, 24, 0b0110>; + defm Four : BaseSIMDSt1<"Four", asm, "VecListFour", 64, 32, 0b0010>; +} + +multiclass SIMDLd2Multiple { + defm Two : BaseSIMDLdN<"Two", asm, "VecListTwo", 32, 16, 0b1000>; +} + +multiclass SIMDSt2Multiple { + defm Two : BaseSIMDStN<"Two", asm, "VecListTwo", 32, 16, 0b1000>; +} + +multiclass SIMDLd3Multiple { + defm Three : BaseSIMDLdN<"Three", asm, "VecListThree", 48, 24, 0b0100>; +} + +multiclass SIMDSt3Multiple { + defm Three : BaseSIMDStN<"Three", asm, "VecListThree", 48, 24, 0b0100>; +} + +multiclass SIMDLd4Multiple { + defm Four : BaseSIMDLdN<"Four", asm, "VecListFour", 64, 32, 0b0000>; +} + +multiclass SIMDSt4Multiple { + defm Four : BaseSIMDStN<"Four", asm, "VecListFour", 64, 32, 0b0000>; +} + +//--- +// AdvSIMD Load/store single-element +//--- + +class BaseSIMDLdStSingle opcode, + string asm, string operands, string cst, + dag oops, dag iops, list pattern> + : I { + bits<5> Vt; + bits<5> Rn; + let Inst{31} = 0; + let Inst{29-24} = 0b001101; + let Inst{22} = L; + let Inst{21} = R; + let Inst{15-13} = opcode; + let Inst{9-5} = Rn; + let Inst{4-0} = Vt; +} + +class BaseSIMDLdStSingleTied opcode, + string asm, string operands, string cst, + dag oops, dag iops, list pattern> + : I { + bits<5> Vt; + bits<5> Rn; + let Inst{31} = 0; + let Inst{29-24} = 0b001101; + let Inst{22} = L; + let Inst{21} = R; + let Inst{15-13} = opcode; + let Inst{9-5} = Rn; + let Inst{4-0} = Vt; +} + + +let mayLoad = 1, mayStore = 0, hasSideEffects = 0 in +class BaseSIMDLdR opcode, bit S, bits<2> size, string asm, + Operand listtype> + : BaseSIMDLdStSingle<1, R, opcode, asm, "\t$Vt, [$Rn]", "", + (outs listtype:$Vt), (ins GPR64sp:$Rn), + []> { + let Inst{30} = Q; + let Inst{23} = 0; + let Inst{20-16} = 0b00000; + let Inst{12} = S; + let Inst{11-10} = size; +} +let mayLoad = 1, mayStore = 0, hasSideEffects = 0 in +class BaseSIMDLdRPost opcode, bit S, bits<2> size, + string asm, Operand listtype, Operand GPR64pi> + : BaseSIMDLdStSingle<1, R, opcode, asm, "\t$Vt, [$Rn], $Xm", + "$Rn = $wback", + (outs GPR64sp:$wback, listtype:$Vt), + (ins GPR64sp:$Rn, GPR64pi:$Xm), []> { + bits<5> Xm; + let Inst{30} = Q; + let Inst{23} = 1; + let 
Inst{20-16} = Xm; + let Inst{12} = S; + let Inst{11-10} = size; +} + +multiclass SIMDLdrAliases { + // E.g. "ld1r { v0.8b }, [x1], #1" + // "ld1r.8b\t$Vt, [$Rn], #1" + // may get mapped to + // (LD1Rv8b_POST VecListOne8b:$Vt, GPR64sp:$Rn, XZR) + def : InstAlias(NAME # "v" # layout # "_POST") + GPR64sp:$Rn, + !cast("VecList" # Count # layout):$Vt, + XZR), 1>; + + // E.g. "ld1r.8b { v0 }, [x1], #1" + // "ld1r.8b\t$Vt, [$Rn], #1" + // may get mapped to + // (LD1Rv8b_POST VecListOne64:$Vt, GPR64sp:$Rn, XZR) + def : InstAlias(NAME # "v" # layout # "_POST") + GPR64sp:$Rn, + !cast("VecList" # Count # Size):$Vt, + XZR), 0>; + + // E.g. "ld1r.8b { v0 }, [x1]" + // "ld1r.8b\t$Vt, [$Rn]" + // may get mapped to + // (LD1Rv8b VecListOne64:$Vt, GPR64sp:$Rn) + def : InstAlias(NAME # "v" # layout) + !cast("VecList" # Count # Size):$Vt, + GPR64sp:$Rn), 0>; + + // E.g. "ld1r.8b { v0 }, [x1], x2" + // "ld1r.8b\t$Vt, [$Rn], $Xm" + // may get mapped to + // (LD1Rv8b_POST VecListOne64:$Vt, GPR64sp:$Rn, GPR64pi1:$Xm) + def : InstAlias(NAME # "v" # layout # "_POST") + GPR64sp:$Rn, + !cast("VecList" # Count # Size):$Vt, + !cast("GPR64pi" # Offset):$Xm), 0>; +} + +multiclass SIMDLdR opcode, bit S, string asm, string Count, + int Offset1, int Offset2, int Offset4, int Offset8> { + def v8b : BaseSIMDLdR<0, R, opcode, S, 0b00, asm, + !cast("VecList" # Count # "8b")>; + def v16b: BaseSIMDLdR<1, R, opcode, S, 0b00, asm, + !cast("VecList" # Count #"16b")>; + def v4h : BaseSIMDLdR<0, R, opcode, S, 0b01, asm, + !cast("VecList" # Count #"4h")>; + def v8h : BaseSIMDLdR<1, R, opcode, S, 0b01, asm, + !cast("VecList" # Count #"8h")>; + def v2s : BaseSIMDLdR<0, R, opcode, S, 0b10, asm, + !cast("VecList" # Count #"2s")>; + def v4s : BaseSIMDLdR<1, R, opcode, S, 0b10, asm, + !cast("VecList" # Count #"4s")>; + def v1d : BaseSIMDLdR<0, R, opcode, S, 0b11, asm, + !cast("VecList" # Count #"1d")>; + def v2d : BaseSIMDLdR<1, R, opcode, S, 0b11, asm, + !cast("VecList" # Count #"2d")>; + + def v8b_POST : BaseSIMDLdRPost<0, R, opcode, S, 0b00, asm, + !cast("VecList" # Count # "8b"), + !cast("GPR64pi" # Offset1)>; + def v16b_POST: BaseSIMDLdRPost<1, R, opcode, S, 0b00, asm, + !cast("VecList" # Count # "16b"), + !cast("GPR64pi" # Offset1)>; + def v4h_POST : BaseSIMDLdRPost<0, R, opcode, S, 0b01, asm, + !cast("VecList" # Count # "4h"), + !cast("GPR64pi" # Offset2)>; + def v8h_POST : BaseSIMDLdRPost<1, R, opcode, S, 0b01, asm, + !cast("VecList" # Count # "8h"), + !cast("GPR64pi" # Offset2)>; + def v2s_POST : BaseSIMDLdRPost<0, R, opcode, S, 0b10, asm, + !cast("VecList" # Count # "2s"), + !cast("GPR64pi" # Offset4)>; + def v4s_POST : BaseSIMDLdRPost<1, R, opcode, S, 0b10, asm, + !cast("VecList" # Count # "4s"), + !cast("GPR64pi" # Offset4)>; + def v1d_POST : BaseSIMDLdRPost<0, R, opcode, S, 0b11, asm, + !cast("VecList" # Count # "1d"), + !cast("GPR64pi" # Offset8)>; + def v2d_POST : BaseSIMDLdRPost<1, R, opcode, S, 0b11, asm, + !cast("VecList" # Count # "2d"), + !cast("GPR64pi" # Offset8)>; + + defm : SIMDLdrAliases; + defm : SIMDLdrAliases; + defm : SIMDLdrAliases; + defm : SIMDLdrAliases; + defm : SIMDLdrAliases; + defm : SIMDLdrAliases; + defm : SIMDLdrAliases; + defm : SIMDLdrAliases; +} + +class SIMDLdStSingleB opcode, string asm, + dag oops, dag iops, list pattern> + : BaseSIMDLdStSingle { + // idx encoded in Q:S:size fields. 
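  // (Illustrative note, not from the patch: for a byte lane such as
  //  "st1 { v0.b }[13], [x0]" the 4-bit index is split across Q (idx{3}),
  //  S (idx{2}) and size (idx{1-0}).)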
+ bits<4> idx; + let Inst{30} = idx{3}; + let Inst{23} = 0; + let Inst{20-16} = 0b00000; + let Inst{12} = idx{2}; + let Inst{11-10} = idx{1-0}; +} +class SIMDLdStSingleBTied opcode, string asm, + dag oops, dag iops, list pattern> + : BaseSIMDLdStSingleTied { + // idx encoded in Q:S:size fields. + bits<4> idx; + let Inst{30} = idx{3}; + let Inst{23} = 0; + let Inst{20-16} = 0b00000; + let Inst{12} = idx{2}; + let Inst{11-10} = idx{1-0}; +} +class SIMDLdStSingleBPost opcode, string asm, + dag oops, dag iops> + : BaseSIMDLdStSingle { + // idx encoded in Q:S:size fields. + bits<4> idx; + bits<5> Xm; + let Inst{30} = idx{3}; + let Inst{23} = 1; + let Inst{20-16} = Xm; + let Inst{12} = idx{2}; + let Inst{11-10} = idx{1-0}; +} +class SIMDLdStSingleBTiedPost opcode, string asm, + dag oops, dag iops> + : BaseSIMDLdStSingleTied { + // idx encoded in Q:S:size fields. + bits<4> idx; + bits<5> Xm; + let Inst{30} = idx{3}; + let Inst{23} = 1; + let Inst{20-16} = Xm; + let Inst{12} = idx{2}; + let Inst{11-10} = idx{1-0}; +} + +class SIMDLdStSingleH opcode, bit size, string asm, + dag oops, dag iops, list pattern> + : BaseSIMDLdStSingle { + // idx encoded in Q:S:size<1> fields. + bits<3> idx; + let Inst{30} = idx{2}; + let Inst{23} = 0; + let Inst{20-16} = 0b00000; + let Inst{12} = idx{1}; + let Inst{11} = idx{0}; + let Inst{10} = size; +} +class SIMDLdStSingleHTied opcode, bit size, string asm, + dag oops, dag iops, list pattern> + : BaseSIMDLdStSingleTied { + // idx encoded in Q:S:size<1> fields. + bits<3> idx; + let Inst{30} = idx{2}; + let Inst{23} = 0; + let Inst{20-16} = 0b00000; + let Inst{12} = idx{1}; + let Inst{11} = idx{0}; + let Inst{10} = size; +} + +class SIMDLdStSingleHPost opcode, bit size, string asm, + dag oops, dag iops> + : BaseSIMDLdStSingle { + // idx encoded in Q:S:size<1> fields. + bits<3> idx; + bits<5> Xm; + let Inst{30} = idx{2}; + let Inst{23} = 1; + let Inst{20-16} = Xm; + let Inst{12} = idx{1}; + let Inst{11} = idx{0}; + let Inst{10} = size; +} +class SIMDLdStSingleHTiedPost opcode, bit size, string asm, + dag oops, dag iops> + : BaseSIMDLdStSingleTied { + // idx encoded in Q:S:size<1> fields. + bits<3> idx; + bits<5> Xm; + let Inst{30} = idx{2}; + let Inst{23} = 1; + let Inst{20-16} = Xm; + let Inst{12} = idx{1}; + let Inst{11} = idx{0}; + let Inst{10} = size; +} +class SIMDLdStSingleS opcode, bits<2> size, string asm, + dag oops, dag iops, list pattern> + : BaseSIMDLdStSingle { + // idx encoded in Q:S fields. + bits<2> idx; + let Inst{30} = idx{1}; + let Inst{23} = 0; + let Inst{20-16} = 0b00000; + let Inst{12} = idx{0}; + let Inst{11-10} = size; +} +class SIMDLdStSingleSTied opcode, bits<2> size, string asm, + dag oops, dag iops, list pattern> + : BaseSIMDLdStSingleTied { + // idx encoded in Q:S fields. + bits<2> idx; + let Inst{30} = idx{1}; + let Inst{23} = 0; + let Inst{20-16} = 0b00000; + let Inst{12} = idx{0}; + let Inst{11-10} = size; +} +class SIMDLdStSingleSPost opcode, bits<2> size, + string asm, dag oops, dag iops> + : BaseSIMDLdStSingle { + // idx encoded in Q:S fields. + bits<2> idx; + bits<5> Xm; + let Inst{30} = idx{1}; + let Inst{23} = 1; + let Inst{20-16} = Xm; + let Inst{12} = idx{0}; + let Inst{11-10} = size; +} +class SIMDLdStSingleSTiedPost opcode, bits<2> size, + string asm, dag oops, dag iops> + : BaseSIMDLdStSingleTied { + // idx encoded in Q:S fields. 
+ bits<2> idx; + bits<5> Xm; + let Inst{30} = idx{1}; + let Inst{23} = 1; + let Inst{20-16} = Xm; + let Inst{12} = idx{0}; + let Inst{11-10} = size; +} +class SIMDLdStSingleD opcode, bits<2> size, string asm, + dag oops, dag iops, list pattern> + : BaseSIMDLdStSingle { + // idx encoded in Q field. + bits<1> idx; + let Inst{30} = idx; + let Inst{23} = 0; + let Inst{20-16} = 0b00000; + let Inst{12} = 0; + let Inst{11-10} = size; +} +class SIMDLdStSingleDTied opcode, bits<2> size, string asm, + dag oops, dag iops, list pattern> + : BaseSIMDLdStSingleTied { + // idx encoded in Q field. + bits<1> idx; + let Inst{30} = idx; + let Inst{23} = 0; + let Inst{20-16} = 0b00000; + let Inst{12} = 0; + let Inst{11-10} = size; +} +class SIMDLdStSingleDPost opcode, bits<2> size, + string asm, dag oops, dag iops> + : BaseSIMDLdStSingle { + // idx encoded in Q field. + bits<1> idx; + bits<5> Xm; + let Inst{30} = idx; + let Inst{23} = 1; + let Inst{20-16} = Xm; + let Inst{12} = 0; + let Inst{11-10} = size; +} +class SIMDLdStSingleDTiedPost opcode, bits<2> size, + string asm, dag oops, dag iops> + : BaseSIMDLdStSingleTied { + // idx encoded in Q field. + bits<1> idx; + bits<5> Xm; + let Inst{30} = idx; + let Inst{23} = 1; + let Inst{20-16} = Xm; + let Inst{12} = 0; + let Inst{11-10} = size; +} + +let mayLoad = 1, mayStore = 0, hasSideEffects = 0 in +multiclass SIMDLdSingleBTied opcode, string asm, + RegisterOperand listtype, + RegisterOperand GPR64pi> { + def i8 : SIMDLdStSingleBTied<1, R, opcode, asm, + (outs listtype:$dst), + (ins listtype:$Vt, VectorIndexB:$idx, + GPR64sp:$Rn), []>; + + def i8_POST : SIMDLdStSingleBTiedPost<1, R, opcode, asm, + (outs GPR64sp:$wback, listtype:$dst), + (ins listtype:$Vt, VectorIndexB:$idx, + GPR64sp:$Rn, GPR64pi:$Xm)>; +} +let mayLoad = 1, mayStore = 0, hasSideEffects = 0 in +multiclass SIMDLdSingleHTied opcode, bit size, string asm, + RegisterOperand listtype, + RegisterOperand GPR64pi> { + def i16 : SIMDLdStSingleHTied<1, R, opcode, size, asm, + (outs listtype:$dst), + (ins listtype:$Vt, VectorIndexH:$idx, + GPR64sp:$Rn), []>; + + def i16_POST : SIMDLdStSingleHTiedPost<1, R, opcode, size, asm, + (outs GPR64sp:$wback, listtype:$dst), + (ins listtype:$Vt, VectorIndexH:$idx, + GPR64sp:$Rn, GPR64pi:$Xm)>; +} +let mayLoad = 1, mayStore = 0, hasSideEffects = 0 in +multiclass SIMDLdSingleSTied opcode, bits<2> size,string asm, + RegisterOperand listtype, + RegisterOperand GPR64pi> { + def i32 : SIMDLdStSingleSTied<1, R, opcode, size, asm, + (outs listtype:$dst), + (ins listtype:$Vt, VectorIndexS:$idx, + GPR64sp:$Rn), []>; + + def i32_POST : SIMDLdStSingleSTiedPost<1, R, opcode, size, asm, + (outs GPR64sp:$wback, listtype:$dst), + (ins listtype:$Vt, VectorIndexS:$idx, + GPR64sp:$Rn, GPR64pi:$Xm)>; +} +let mayLoad = 1, mayStore = 0, hasSideEffects = 0 in +multiclass SIMDLdSingleDTied opcode, bits<2> size, string asm, + RegisterOperand listtype, RegisterOperand GPR64pi> { + def i64 : SIMDLdStSingleDTied<1, R, opcode, size, asm, + (outs listtype:$dst), + (ins listtype:$Vt, VectorIndexD:$idx, + GPR64sp:$Rn), []>; + + def i64_POST : SIMDLdStSingleDTiedPost<1, R, opcode, size, asm, + (outs GPR64sp:$wback, listtype:$dst), + (ins listtype:$Vt, VectorIndexD:$idx, + GPR64sp:$Rn, GPR64pi:$Xm)>; +} +let mayLoad = 0, mayStore = 1, hasSideEffects = 0 in +multiclass SIMDStSingleB opcode, string asm, + RegisterOperand listtype, RegisterOperand GPR64pi> { + def i8 : SIMDLdStSingleB<0, R, opcode, asm, + (outs), (ins listtype:$Vt, VectorIndexB:$idx, + GPR64sp:$Rn), []>; + + def i8_POST : 
SIMDLdStSingleBPost<0, R, opcode, asm, + (outs GPR64sp:$wback), + (ins listtype:$Vt, VectorIndexB:$idx, + GPR64sp:$Rn, GPR64pi:$Xm)>; +} +let mayLoad = 0, mayStore = 1, hasSideEffects = 0 in +multiclass SIMDStSingleH opcode, bit size, string asm, + RegisterOperand listtype, RegisterOperand GPR64pi> { + def i16 : SIMDLdStSingleH<0, R, opcode, size, asm, + (outs), (ins listtype:$Vt, VectorIndexH:$idx, + GPR64sp:$Rn), []>; + + def i16_POST : SIMDLdStSingleHPost<0, R, opcode, size, asm, + (outs GPR64sp:$wback), + (ins listtype:$Vt, VectorIndexH:$idx, + GPR64sp:$Rn, GPR64pi:$Xm)>; +} +let mayLoad = 0, mayStore = 1, hasSideEffects = 0 in +multiclass SIMDStSingleS opcode, bits<2> size,string asm, + RegisterOperand listtype, RegisterOperand GPR64pi> { + def i32 : SIMDLdStSingleS<0, R, opcode, size, asm, + (outs), (ins listtype:$Vt, VectorIndexS:$idx, + GPR64sp:$Rn), []>; + + def i32_POST : SIMDLdStSingleSPost<0, R, opcode, size, asm, + (outs GPR64sp:$wback), + (ins listtype:$Vt, VectorIndexS:$idx, + GPR64sp:$Rn, GPR64pi:$Xm)>; +} +let mayLoad = 0, mayStore = 1, hasSideEffects = 0 in +multiclass SIMDStSingleD opcode, bits<2> size, string asm, + RegisterOperand listtype, RegisterOperand GPR64pi> { + def i64 : SIMDLdStSingleD<0, R, opcode, size, asm, + (outs), (ins listtype:$Vt, VectorIndexD:$idx, + GPR64sp:$Rn), []>; + + def i64_POST : SIMDLdStSingleDPost<0, R, opcode, size, asm, + (outs GPR64sp:$wback), + (ins listtype:$Vt, VectorIndexD:$idx, + GPR64sp:$Rn, GPR64pi:$Xm)>; +} + +multiclass SIMDLdStSingleAliases { + // E.g. "ld1 { v0.8b }[0], [x1], #1" + // "ld1\t$Vt, [$Rn], #1" + // may get mapped to + // (LD1Rv8b_POST VecListOne8b:$Vt, GPR64sp:$Rn, XZR) + def : InstAlias(NAME # Type # "_POST") + GPR64sp:$Rn, + !cast("VecList" # Count # layout):$Vt, + idxtype:$idx, XZR), 1>; + + // E.g. "ld1.8b { v0 }[0], [x1], #1" + // "ld1.8b\t$Vt, [$Rn], #1" + // may get mapped to + // (LD1Rv8b_POST VecListOne64:$Vt, GPR64sp:$Rn, XZR) + def : InstAlias(NAME # Type # "_POST") + GPR64sp:$Rn, + !cast("VecList" # Count # "128"):$Vt, + idxtype:$idx, XZR), 0>; + + // E.g. "ld1.8b { v0 }[0], [x1]" + // "ld1.8b\t$Vt, [$Rn]" + // may get mapped to + // (LD1Rv8b VecListOne64:$Vt, GPR64sp:$Rn) + def : InstAlias(NAME # Type) + !cast("VecList" # Count # "128"):$Vt, + idxtype:$idx, GPR64sp:$Rn), 0>; + + // E.g. 
"ld1.8b { v0 }[0], [x1], x2" + // "ld1.8b\t$Vt, [$Rn], $Xm" + // may get mapped to + // (LD1Rv8b_POST VecListOne64:$Vt, GPR64sp:$Rn, GPR64pi1:$Xm) + def : InstAlias(NAME # Type # "_POST") + GPR64sp:$Rn, + !cast("VecList" # Count # "128"):$Vt, + idxtype:$idx, + !cast("GPR64pi" # Offset):$Xm), 0>; +} + +multiclass SIMDLdSt1SingleAliases { + defm : SIMDLdStSingleAliases; + defm : SIMDLdStSingleAliases; + defm : SIMDLdStSingleAliases; + defm : SIMDLdStSingleAliases; +} + +multiclass SIMDLdSt2SingleAliases { + defm : SIMDLdStSingleAliases; + defm : SIMDLdStSingleAliases; + defm : SIMDLdStSingleAliases; + defm : SIMDLdStSingleAliases; +} + +multiclass SIMDLdSt3SingleAliases { + defm : SIMDLdStSingleAliases; + defm : SIMDLdStSingleAliases; + defm : SIMDLdStSingleAliases; + defm : SIMDLdStSingleAliases; +} + +multiclass SIMDLdSt4SingleAliases { + defm : SIMDLdStSingleAliases; + defm : SIMDLdStSingleAliases; + defm : SIMDLdStSingleAliases; + defm : SIMDLdStSingleAliases; +} +} // end of 'let Predicates = [HasNEON]' + +//---------------------------------------------------------------------------- +// Crypto extensions +//---------------------------------------------------------------------------- + +let Predicates = [HasCrypto] in { +let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in +class AESBase opc, string asm, dag outs, dag ins, string cstr, + list pat> + : I, + Sched<[WriteV]>{ + bits<5> Rd; + bits<5> Rn; + let Inst{31-16} = 0b0100111000101000; + let Inst{15-12} = opc; + let Inst{11-10} = 0b10; + let Inst{9-5} = Rn; + let Inst{4-0} = Rd; +} + +class AESInst opc, string asm, Intrinsic OpNode> + : AESBase; + +class AESTiedInst opc, string asm, Intrinsic OpNode> + : AESBase; + +let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in +class SHA3OpTiedInst opc, string asm, string dst_lhs_kind, + dag oops, dag iops, list pat> + : I, + Sched<[WriteV]>{ + bits<5> Rd; + bits<5> Rn; + bits<5> Rm; + let Inst{31-21} = 0b01011110000; + let Inst{20-16} = Rm; + let Inst{15} = 0; + let Inst{14-12} = opc; + let Inst{11-10} = 0b00; + let Inst{9-5} = Rn; + let Inst{4-0} = Rd; +} + +class SHATiedInstQSV opc, string asm, Intrinsic OpNode> + : SHA3OpTiedInst; + +class SHATiedInstVVV opc, string asm, Intrinsic OpNode> + : SHA3OpTiedInst; + +class SHATiedInstQQV opc, string asm, Intrinsic OpNode> + : SHA3OpTiedInst; + +let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in +class SHA2OpInst opc, string asm, string kind, + string cstr, dag oops, dag iops, + list pat> + : I, + Sched<[WriteV]>{ + bits<5> Rd; + bits<5> Rn; + let Inst{31-16} = 0b0101111000101000; + let Inst{15-12} = opc; + let Inst{11-10} = 0b10; + let Inst{9-5} = Rn; + let Inst{4-0} = Rd; +} + +class SHATiedInstVV opc, string asm, Intrinsic OpNode> + : SHA2OpInst; + +class SHAInstSS opc, string asm, Intrinsic OpNode> + : SHA2OpInst; +} // end of 'let Predicates = [HasCrypto]' + +// Allow the size specifier tokens to be upper case, not just lower. 
+def : TokenAlias<".8B", ".8b">;
+def : TokenAlias<".4H", ".4h">;
+def : TokenAlias<".2S", ".2s">;
+def : TokenAlias<".1D", ".1d">;
+def : TokenAlias<".16B", ".16b">;
+def : TokenAlias<".8H", ".8h">;
+def : TokenAlias<".4S", ".4s">;
+def : TokenAlias<".2D", ".2d">;
+def : TokenAlias<".1Q", ".1q">;
+def : TokenAlias<".B", ".b">;
+def : TokenAlias<".H", ".h">;
+def : TokenAlias<".S", ".s">;
+def : TokenAlias<".D", ".d">;
+def : TokenAlias<".Q", ".q">;
diff --git a/lib/Target/AArch64/AArch64InstrInfo.cpp b/lib/Target/AArch64/AArch64InstrInfo.cpp
new file mode 100644
index 00000000000..52e3b333eb0
--- /dev/null
+++ b/lib/Target/AArch64/AArch64InstrInfo.cpp
@@ -0,0 +1,2065 @@
+//===- AArch64InstrInfo.cpp - AArch64 Instruction Information -------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the AArch64 implementation of the TargetInstrInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#include "AArch64InstrInfo.h"
+#include "AArch64Subtarget.h"
+#include "MCTargetDesc/AArch64AddressingModes.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineMemOperand.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/PseudoSourceValue.h"
+#include "llvm/MC/MCInst.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/TargetRegistry.h"
+
+using namespace llvm;
+
+#define GET_INSTRINFO_CTOR_DTOR
+#include "AArch64GenInstrInfo.inc"
+
+AArch64InstrInfo::AArch64InstrInfo(const AArch64Subtarget &STI)
+    : AArch64GenInstrInfo(AArch64::ADJCALLSTACKDOWN, AArch64::ADJCALLSTACKUP),
+      RI(this, &STI), Subtarget(STI) {}
+
+/// GetInstSize - Return the number of bytes of code the specified
+/// instruction may be. This returns the maximum number of bytes.
+unsigned AArch64InstrInfo::GetInstSizeInBytes(const MachineInstr *MI) const {
+  const MCInstrDesc &Desc = MI->getDesc();
+
+  switch (Desc.getOpcode()) {
+  default:
+    // Anything not explicitly designated otherwise is a normal 4-byte insn.
+    return 4;
+  case TargetOpcode::DBG_VALUE:
+  case TargetOpcode::EH_LABEL:
+  case TargetOpcode::IMPLICIT_DEF:
+  case TargetOpcode::KILL:
+    return 0;
+  }
+
+  llvm_unreachable("GetInstSizeInBytes()- Unable to determine insn size");
+}
+
+static void parseCondBranch(MachineInstr *LastInst, MachineBasicBlock *&Target,
+                            SmallVectorImpl<MachineOperand> &Cond) {
+  // Block ends with fall-through condbranch.
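+  // The Cond vector built here (and consumed by AnalyzeBranch,
+  // ReverseBranchCondition and insertSelect below) takes one of three shapes,
+  // matching the cases of this switch:
+  //   b.cc      -> [cc]
+  //   cbz/cbnz  -> [-1, opcode, reg]
+  //   tbz/tbnz  -> [-1, opcode, reg, bit]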
+ switch (LastInst->getOpcode()) { + default: + llvm_unreachable("Unknown branch instruction?"); + case AArch64::Bcc: + Target = LastInst->getOperand(1).getMBB(); + Cond.push_back(LastInst->getOperand(0)); + break; + case AArch64::CBZW: + case AArch64::CBZX: + case AArch64::CBNZW: + case AArch64::CBNZX: + Target = LastInst->getOperand(1).getMBB(); + Cond.push_back(MachineOperand::CreateImm(-1)); + Cond.push_back(MachineOperand::CreateImm(LastInst->getOpcode())); + Cond.push_back(LastInst->getOperand(0)); + break; + case AArch64::TBZW: + case AArch64::TBZX: + case AArch64::TBNZW: + case AArch64::TBNZX: + Target = LastInst->getOperand(2).getMBB(); + Cond.push_back(MachineOperand::CreateImm(-1)); + Cond.push_back(MachineOperand::CreateImm(LastInst->getOpcode())); + Cond.push_back(LastInst->getOperand(0)); + Cond.push_back(LastInst->getOperand(1)); + } +} + +// Branch analysis. +bool AArch64InstrInfo::AnalyzeBranch(MachineBasicBlock &MBB, + MachineBasicBlock *&TBB, + MachineBasicBlock *&FBB, + SmallVectorImpl &Cond, + bool AllowModify) const { + // If the block has no terminators, it just falls into the block after it. + MachineBasicBlock::iterator I = MBB.end(); + if (I == MBB.begin()) + return false; + --I; + while (I->isDebugValue()) { + if (I == MBB.begin()) + return false; + --I; + } + if (!isUnpredicatedTerminator(I)) + return false; + + // Get the last instruction in the block. + MachineInstr *LastInst = I; + + // If there is only one terminator instruction, process it. + unsigned LastOpc = LastInst->getOpcode(); + if (I == MBB.begin() || !isUnpredicatedTerminator(--I)) { + if (isUncondBranchOpcode(LastOpc)) { + TBB = LastInst->getOperand(0).getMBB(); + return false; + } + if (isCondBranchOpcode(LastOpc)) { + // Block ends with fall-through condbranch. + parseCondBranch(LastInst, TBB, Cond); + return false; + } + return true; // Can't handle indirect branch. + } + + // Get the instruction before it if it is a terminator. + MachineInstr *SecondLastInst = I; + unsigned SecondLastOpc = SecondLastInst->getOpcode(); + + // If AllowModify is true and the block ends with two or more unconditional + // branches, delete all but the first unconditional branch. + if (AllowModify && isUncondBranchOpcode(LastOpc)) { + while (isUncondBranchOpcode(SecondLastOpc)) { + LastInst->eraseFromParent(); + LastInst = SecondLastInst; + LastOpc = LastInst->getOpcode(); + if (I == MBB.begin() || !isUnpredicatedTerminator(--I)) { + // Return now the only terminator is an unconditional branch. + TBB = LastInst->getOperand(0).getMBB(); + return false; + } else { + SecondLastInst = I; + SecondLastOpc = SecondLastInst->getOpcode(); + } + } + } + + // If there are three terminators, we don't know what sort of block this is. + if (SecondLastInst && I != MBB.begin() && isUnpredicatedTerminator(--I)) + return true; + + // If the block ends with a B and a Bcc, handle it. + if (isCondBranchOpcode(SecondLastOpc) && isUncondBranchOpcode(LastOpc)) { + parseCondBranch(SecondLastInst, TBB, Cond); + FBB = LastInst->getOperand(0).getMBB(); + return false; + } + + // If the block ends with two unconditional branches, handle it. The second + // one is not executed, so remove it. + if (isUncondBranchOpcode(SecondLastOpc) && isUncondBranchOpcode(LastOpc)) { + TBB = SecondLastInst->getOperand(0).getMBB(); + I = LastInst; + if (AllowModify) + I->eraseFromParent(); + return false; + } + + // ...likewise if it ends with an indirect branch followed by an unconditional + // branch. 
+ if (isIndirectBranchOpcode(SecondLastOpc) && isUncondBranchOpcode(LastOpc)) { + I = LastInst; + if (AllowModify) + I->eraseFromParent(); + return true; + } + + // Otherwise, can't handle this. + return true; +} + +bool AArch64InstrInfo::ReverseBranchCondition( + SmallVectorImpl &Cond) const { + if (Cond[0].getImm() != -1) { + // Regular Bcc + AArch64CC::CondCode CC = (AArch64CC::CondCode)(int)Cond[0].getImm(); + Cond[0].setImm(AArch64CC::getInvertedCondCode(CC)); + } else { + // Folded compare-and-branch + switch (Cond[1].getImm()) { + default: + llvm_unreachable("Unknown conditional branch!"); + case AArch64::CBZW: + Cond[1].setImm(AArch64::CBNZW); + break; + case AArch64::CBNZW: + Cond[1].setImm(AArch64::CBZW); + break; + case AArch64::CBZX: + Cond[1].setImm(AArch64::CBNZX); + break; + case AArch64::CBNZX: + Cond[1].setImm(AArch64::CBZX); + break; + case AArch64::TBZW: + Cond[1].setImm(AArch64::TBNZW); + break; + case AArch64::TBNZW: + Cond[1].setImm(AArch64::TBZW); + break; + case AArch64::TBZX: + Cond[1].setImm(AArch64::TBNZX); + break; + case AArch64::TBNZX: + Cond[1].setImm(AArch64::TBZX); + break; + } + } + + return false; +} + +unsigned AArch64InstrInfo::RemoveBranch(MachineBasicBlock &MBB) const { + MachineBasicBlock::iterator I = MBB.end(); + if (I == MBB.begin()) + return 0; + --I; + while (I->isDebugValue()) { + if (I == MBB.begin()) + return 0; + --I; + } + if (!isUncondBranchOpcode(I->getOpcode()) && + !isCondBranchOpcode(I->getOpcode())) + return 0; + + // Remove the branch. + I->eraseFromParent(); + + I = MBB.end(); + + if (I == MBB.begin()) + return 1; + --I; + if (!isCondBranchOpcode(I->getOpcode())) + return 1; + + // Remove the branch. + I->eraseFromParent(); + return 2; +} + +void AArch64InstrInfo::instantiateCondBranch( + MachineBasicBlock &MBB, DebugLoc DL, MachineBasicBlock *TBB, + const SmallVectorImpl &Cond) const { + if (Cond[0].getImm() != -1) { + // Regular Bcc + BuildMI(&MBB, DL, get(AArch64::Bcc)).addImm(Cond[0].getImm()).addMBB(TBB); + } else { + // Folded compare-and-branch + const MachineInstrBuilder MIB = + BuildMI(&MBB, DL, get(Cond[1].getImm())).addReg(Cond[2].getReg()); + if (Cond.size() > 3) + MIB.addImm(Cond[3].getImm()); + MIB.addMBB(TBB); + } +} + +unsigned AArch64InstrInfo::InsertBranch( + MachineBasicBlock &MBB, MachineBasicBlock *TBB, MachineBasicBlock *FBB, + const SmallVectorImpl &Cond, DebugLoc DL) const { + // Shouldn't be a fall through. + assert(TBB && "InsertBranch must not be told to insert a fallthrough"); + + if (!FBB) { + if (Cond.empty()) // Unconditional branch? + BuildMI(&MBB, DL, get(AArch64::B)).addMBB(TBB); + else + instantiateCondBranch(MBB, DL, TBB, Cond); + return 1; + } + + // Two-way conditional branch. + instantiateCondBranch(MBB, DL, TBB, Cond); + BuildMI(&MBB, DL, get(AArch64::B)).addMBB(FBB); + return 2; +} + +// Find the original register that VReg is copied from. +static unsigned removeCopies(const MachineRegisterInfo &MRI, unsigned VReg) { + while (TargetRegisterInfo::isVirtualRegister(VReg)) { + const MachineInstr *DefMI = MRI.getVRegDef(VReg); + if (!DefMI->isFullCopy()) + return VReg; + VReg = DefMI->getOperand(1).getReg(); + } + return VReg; +} + +// Determine if VReg is defined by an instruction that can be folded into a +// csel instruction. If so, return the folded opcode, and the replacement +// register. 
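+// For example, the cases recognised below are:
+//   add Wd, Wn, #1     (x + 1)  -> folded into csinc
+//   orn Wd, wzr, Wm    (~x)     -> folded into csinv
+//   sub Wd, wzr, Wm    (-x)     -> folded into csneg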
+static unsigned canFoldIntoCSel(const MachineRegisterInfo &MRI, unsigned VReg, + unsigned *NewVReg = nullptr) { + VReg = removeCopies(MRI, VReg); + if (!TargetRegisterInfo::isVirtualRegister(VReg)) + return 0; + + bool Is64Bit = AArch64::GPR64allRegClass.hasSubClassEq(MRI.getRegClass(VReg)); + const MachineInstr *DefMI = MRI.getVRegDef(VReg); + unsigned Opc = 0; + unsigned SrcOpNum = 0; + switch (DefMI->getOpcode()) { + case AArch64::ADDSXri: + case AArch64::ADDSWri: + // if NZCV is used, do not fold. + if (DefMI->findRegisterDefOperandIdx(AArch64::NZCV, true) == -1) + return 0; + // fall-through to ADDXri and ADDWri. + case AArch64::ADDXri: + case AArch64::ADDWri: + // add x, 1 -> csinc. + if (!DefMI->getOperand(2).isImm() || DefMI->getOperand(2).getImm() != 1 || + DefMI->getOperand(3).getImm() != 0) + return 0; + SrcOpNum = 1; + Opc = Is64Bit ? AArch64::CSINCXr : AArch64::CSINCWr; + break; + + case AArch64::ORNXrr: + case AArch64::ORNWrr: { + // not x -> csinv, represented as orn dst, xzr, src. + unsigned ZReg = removeCopies(MRI, DefMI->getOperand(1).getReg()); + if (ZReg != AArch64::XZR && ZReg != AArch64::WZR) + return 0; + SrcOpNum = 2; + Opc = Is64Bit ? AArch64::CSINVXr : AArch64::CSINVWr; + break; + } + + case AArch64::SUBSXrr: + case AArch64::SUBSWrr: + // if NZCV is used, do not fold. + if (DefMI->findRegisterDefOperandIdx(AArch64::NZCV, true) == -1) + return 0; + // fall-through to SUBXrr and SUBWrr. + case AArch64::SUBXrr: + case AArch64::SUBWrr: { + // neg x -> csneg, represented as sub dst, xzr, src. + unsigned ZReg = removeCopies(MRI, DefMI->getOperand(1).getReg()); + if (ZReg != AArch64::XZR && ZReg != AArch64::WZR) + return 0; + SrcOpNum = 2; + Opc = Is64Bit ? AArch64::CSNEGXr : AArch64::CSNEGWr; + break; + } + default: + return 0; + } + assert(Opc && SrcOpNum && "Missing parameters"); + + if (NewVReg) + *NewVReg = DefMI->getOperand(SrcOpNum).getReg(); + return Opc; +} + +bool AArch64InstrInfo::canInsertSelect( + const MachineBasicBlock &MBB, const SmallVectorImpl &Cond, + unsigned TrueReg, unsigned FalseReg, int &CondCycles, int &TrueCycles, + int &FalseCycles) const { + // Check register classes. + const MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); + const TargetRegisterClass *RC = + RI.getCommonSubClass(MRI.getRegClass(TrueReg), MRI.getRegClass(FalseReg)); + if (!RC) + return false; + + // Expanding cbz/tbz requires an extra cycle of latency on the condition. + unsigned ExtraCondLat = Cond.size() != 1; + + // GPRs are handled by csel. + // FIXME: Fold in x+1, -x, and ~x when applicable. + if (AArch64::GPR64allRegClass.hasSubClassEq(RC) || + AArch64::GPR32allRegClass.hasSubClassEq(RC)) { + // Single-cycle csel, csinc, csinv, and csneg. + CondCycles = 1 + ExtraCondLat; + TrueCycles = FalseCycles = 1; + if (canFoldIntoCSel(MRI, TrueReg)) + TrueCycles = 0; + else if (canFoldIntoCSel(MRI, FalseReg)) + FalseCycles = 0; + return true; + } + + // Scalar floating point is handled by fcsel. + // FIXME: Form fabs, fmin, and fmax when applicable. + if (AArch64::FPR64RegClass.hasSubClassEq(RC) || + AArch64::FPR32RegClass.hasSubClassEq(RC)) { + CondCycles = 5 + ExtraCondLat; + TrueCycles = FalseCycles = 2; + return true; + } + + // Can't do vectors. 
+ return false; +} + +void AArch64InstrInfo::insertSelect(MachineBasicBlock &MBB, + MachineBasicBlock::iterator I, DebugLoc DL, + unsigned DstReg, + const SmallVectorImpl &Cond, + unsigned TrueReg, unsigned FalseReg) const { + MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); + + // Parse the condition code, see parseCondBranch() above. + AArch64CC::CondCode CC; + switch (Cond.size()) { + default: + llvm_unreachable("Unknown condition opcode in Cond"); + case 1: // b.cc + CC = AArch64CC::CondCode(Cond[0].getImm()); + break; + case 3: { // cbz/cbnz + // We must insert a compare against 0. + bool Is64Bit; + switch (Cond[1].getImm()) { + default: + llvm_unreachable("Unknown branch opcode in Cond"); + case AArch64::CBZW: + Is64Bit = 0; + CC = AArch64CC::EQ; + break; + case AArch64::CBZX: + Is64Bit = 1; + CC = AArch64CC::EQ; + break; + case AArch64::CBNZW: + Is64Bit = 0; + CC = AArch64CC::NE; + break; + case AArch64::CBNZX: + Is64Bit = 1; + CC = AArch64CC::NE; + break; + } + unsigned SrcReg = Cond[2].getReg(); + if (Is64Bit) { + // cmp reg, #0 is actually subs xzr, reg, #0. + MRI.constrainRegClass(SrcReg, &AArch64::GPR64spRegClass); + BuildMI(MBB, I, DL, get(AArch64::SUBSXri), AArch64::XZR) + .addReg(SrcReg) + .addImm(0) + .addImm(0); + } else { + MRI.constrainRegClass(SrcReg, &AArch64::GPR32spRegClass); + BuildMI(MBB, I, DL, get(AArch64::SUBSWri), AArch64::WZR) + .addReg(SrcReg) + .addImm(0) + .addImm(0); + } + break; + } + case 4: { // tbz/tbnz + // We must insert a tst instruction. + switch (Cond[1].getImm()) { + default: + llvm_unreachable("Unknown branch opcode in Cond"); + case AArch64::TBZW: + case AArch64::TBZX: + CC = AArch64CC::EQ; + break; + case AArch64::TBNZW: + case AArch64::TBNZX: + CC = AArch64CC::NE; + break; + } + // cmp reg, #foo is actually ands xzr, reg, #1< 64 bit extension case, these instructions can do + // much more. + if (MI.getOperand(2).getImm() != 0 || MI.getOperand(3).getImm() != 31) + return false; + // This is a signed or unsigned 32 -> 64 bit extension. + SrcReg = MI.getOperand(1).getReg(); + DstReg = MI.getOperand(0).getReg(); + SubIdx = AArch64::sub_32; + return true; + } +} + +/// analyzeCompare - For a comparison instruction, return the source registers +/// in SrcReg and SrcReg2, and the value it compares against in CmpValue. +/// Return true if the comparison instruction can be analyzed. +bool AArch64InstrInfo::analyzeCompare(const MachineInstr *MI, unsigned &SrcReg, + unsigned &SrcReg2, int &CmpMask, + int &CmpValue) const { + switch (MI->getOpcode()) { + default: + break; + case AArch64::SUBSWrr: + case AArch64::SUBSWrs: + case AArch64::SUBSWrx: + case AArch64::SUBSXrr: + case AArch64::SUBSXrs: + case AArch64::SUBSXrx: + case AArch64::ADDSWrr: + case AArch64::ADDSWrs: + case AArch64::ADDSWrx: + case AArch64::ADDSXrr: + case AArch64::ADDSXrs: + case AArch64::ADDSXrx: + // Replace SUBSWrr with SUBWrr if NZCV is not used. + SrcReg = MI->getOperand(1).getReg(); + SrcReg2 = MI->getOperand(2).getReg(); + CmpMask = ~0; + CmpValue = 0; + return true; + case AArch64::SUBSWri: + case AArch64::ADDSWri: + case AArch64::SUBSXri: + case AArch64::ADDSXri: + SrcReg = MI->getOperand(1).getReg(); + SrcReg2 = 0; + CmpMask = ~0; + CmpValue = MI->getOperand(2).getImm(); + return true; + case AArch64::ANDSWri: + case AArch64::ANDSXri: + // ANDS does not use the same encoding scheme as the others xxxS + // instructions. 
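+    // Its immediate operand is a bitmask-encoded logical immediate, so it is
+    // decoded (below) before being reported back as CmpValue.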
+ SrcReg = MI->getOperand(1).getReg(); + SrcReg2 = 0; + CmpMask = ~0; + CmpValue = AArch64_AM::decodeLogicalImmediate( + MI->getOperand(2).getImm(), + MI->getOpcode() == AArch64::ANDSWri ? 32 : 64); + return true; + } + + return false; +} + +static bool UpdateOperandRegClass(MachineInstr *Instr) { + MachineBasicBlock *MBB = Instr->getParent(); + assert(MBB && "Can't get MachineBasicBlock here"); + MachineFunction *MF = MBB->getParent(); + assert(MF && "Can't get MachineFunction here"); + const TargetMachine *TM = &MF->getTarget(); + const TargetInstrInfo *TII = TM->getInstrInfo(); + const TargetRegisterInfo *TRI = TM->getRegisterInfo(); + MachineRegisterInfo *MRI = &MF->getRegInfo(); + + for (unsigned OpIdx = 0, EndIdx = Instr->getNumOperands(); OpIdx < EndIdx; + ++OpIdx) { + MachineOperand &MO = Instr->getOperand(OpIdx); + const TargetRegisterClass *OpRegCstraints = + Instr->getRegClassConstraint(OpIdx, TII, TRI); + + // If there's no constraint, there's nothing to do. + if (!OpRegCstraints) + continue; + // If the operand is a frame index, there's nothing to do here. + // A frame index operand will resolve correctly during PEI. + if (MO.isFI()) + continue; + + assert(MO.isReg() && + "Operand has register constraints without being a register!"); + + unsigned Reg = MO.getReg(); + if (TargetRegisterInfo::isPhysicalRegister(Reg)) { + if (!OpRegCstraints->contains(Reg)) + return false; + } else if (!OpRegCstraints->hasSubClassEq(MRI->getRegClass(Reg)) && + !MRI->constrainRegClass(Reg, OpRegCstraints)) + return false; + } + + return true; +} + +/// optimizeCompareInstr - Convert the instruction supplying the argument to the +/// comparison into one that sets the zero bit in the flags register. +bool AArch64InstrInfo::optimizeCompareInstr( + MachineInstr *CmpInstr, unsigned SrcReg, unsigned SrcReg2, int CmpMask, + int CmpValue, const MachineRegisterInfo *MRI) const { + + // Replace SUBSWrr with SUBWrr if NZCV is not used. + int Cmp_NZCV = CmpInstr->findRegisterDefOperandIdx(AArch64::NZCV, true); + if (Cmp_NZCV != -1) { + unsigned NewOpc; + switch (CmpInstr->getOpcode()) { + default: + return false; + case AArch64::ADDSWrr: NewOpc = AArch64::ADDWrr; break; + case AArch64::ADDSWri: NewOpc = AArch64::ADDWri; break; + case AArch64::ADDSWrs: NewOpc = AArch64::ADDWrs; break; + case AArch64::ADDSWrx: NewOpc = AArch64::ADDWrx; break; + case AArch64::ADDSXrr: NewOpc = AArch64::ADDXrr; break; + case AArch64::ADDSXri: NewOpc = AArch64::ADDXri; break; + case AArch64::ADDSXrs: NewOpc = AArch64::ADDXrs; break; + case AArch64::ADDSXrx: NewOpc = AArch64::ADDXrx; break; + case AArch64::SUBSWrr: NewOpc = AArch64::SUBWrr; break; + case AArch64::SUBSWri: NewOpc = AArch64::SUBWri; break; + case AArch64::SUBSWrs: NewOpc = AArch64::SUBWrs; break; + case AArch64::SUBSWrx: NewOpc = AArch64::SUBWrx; break; + case AArch64::SUBSXrr: NewOpc = AArch64::SUBXrr; break; + case AArch64::SUBSXri: NewOpc = AArch64::SUBXri; break; + case AArch64::SUBSXrs: NewOpc = AArch64::SUBXrs; break; + case AArch64::SUBSXrx: NewOpc = AArch64::SUBXrx; break; + } + + const MCInstrDesc &MCID = get(NewOpc); + CmpInstr->setDesc(MCID); + CmpInstr->RemoveOperand(Cmp_NZCV); + bool succeeded = UpdateOperandRegClass(CmpInstr); + (void)succeeded; + assert(succeeded && "Some operands reg class are incompatible!"); + return true; + } + + // Continue only if we have a "ri" where immediate is zero. + if (CmpValue != 0 || SrcReg2 != 0) + return false; + + // CmpInstr is a Compare instruction if destination register is not used. 
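+  // (When every check below succeeds, the overall effect is, for example,
+  //  rewriting "add w8, w9, w10; cmp w8, #0; b.eq ..." into
+  //  "adds w8, w9, w10; b.eq ..." and deleting the compare.)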
+ if (!MRI->use_nodbg_empty(CmpInstr->getOperand(0).getReg())) + return false; + + // Get the unique definition of SrcReg. + MachineInstr *MI = MRI->getUniqueVRegDef(SrcReg); + if (!MI) + return false; + + // We iterate backward, starting from the instruction before CmpInstr and + // stop when reaching the definition of the source register or done with the + // basic block, to check whether NZCV is used or modified in between. + MachineBasicBlock::iterator I = CmpInstr, E = MI, + B = CmpInstr->getParent()->begin(); + + // Early exit if CmpInstr is at the beginning of the BB. + if (I == B) + return false; + + // Check whether the definition of SrcReg is in the same basic block as + // Compare. If not, we can't optimize away the Compare. + if (MI->getParent() != CmpInstr->getParent()) + return false; + + // Check that NZCV isn't set between the comparison instruction and the one we + // want to change. + const TargetRegisterInfo *TRI = &getRegisterInfo(); + for (--I; I != E; --I) { + const MachineInstr &Instr = *I; + + if (Instr.modifiesRegister(AArch64::NZCV, TRI) || + Instr.readsRegister(AArch64::NZCV, TRI)) + // This instruction modifies or uses NZCV after the one we want to + // change. We can't do this transformation. + return false; + if (I == B) + // The 'and' is below the comparison instruction. + return false; + } + + unsigned NewOpc = MI->getOpcode(); + switch (MI->getOpcode()) { + default: + return false; + case AArch64::ADDSWrr: + case AArch64::ADDSWri: + case AArch64::ADDSXrr: + case AArch64::ADDSXri: + case AArch64::SUBSWrr: + case AArch64::SUBSWri: + case AArch64::SUBSXrr: + case AArch64::SUBSXri: + break; + case AArch64::ADDWrr: NewOpc = AArch64::ADDSWrr; break; + case AArch64::ADDWri: NewOpc = AArch64::ADDSWri; break; + case AArch64::ADDXrr: NewOpc = AArch64::ADDSXrr; break; + case AArch64::ADDXri: NewOpc = AArch64::ADDSXri; break; + case AArch64::ADCWr: NewOpc = AArch64::ADCSWr; break; + case AArch64::ADCXr: NewOpc = AArch64::ADCSXr; break; + case AArch64::SUBWrr: NewOpc = AArch64::SUBSWrr; break; + case AArch64::SUBWri: NewOpc = AArch64::SUBSWri; break; + case AArch64::SUBXrr: NewOpc = AArch64::SUBSXrr; break; + case AArch64::SUBXri: NewOpc = AArch64::SUBSXri; break; + case AArch64::SBCWr: NewOpc = AArch64::SBCSWr; break; + case AArch64::SBCXr: NewOpc = AArch64::SBCSXr; break; + case AArch64::ANDWri: NewOpc = AArch64::ANDSWri; break; + case AArch64::ANDXri: NewOpc = AArch64::ANDSXri; break; + } + + // Scan forward for the use of NZCV. + // When checking against MI: if it's a conditional code requires + // checking of V bit, then this is not safe to do. + // It is safe to remove CmpInstr if NZCV is redefined or killed. + // If we are done with the basic block, we need to check whether NZCV is + // live-out. + bool IsSafe = false; + for (MachineBasicBlock::iterator I = CmpInstr, + E = CmpInstr->getParent()->end(); + !IsSafe && ++I != E;) { + const MachineInstr &Instr = *I; + for (unsigned IO = 0, EO = Instr.getNumOperands(); !IsSafe && IO != EO; + ++IO) { + const MachineOperand &MO = Instr.getOperand(IO); + if (MO.isRegMask() && MO.clobbersPhysReg(AArch64::NZCV)) { + IsSafe = true; + break; + } + if (!MO.isReg() || MO.getReg() != AArch64::NZCV) + continue; + if (MO.isDef()) { + IsSafe = true; + break; + } + + // Decode the condition code. 
+ unsigned Opc = Instr.getOpcode(); + AArch64CC::CondCode CC; + switch (Opc) { + default: + return false; + case AArch64::Bcc: + CC = (AArch64CC::CondCode)Instr.getOperand(IO - 2).getImm(); + break; + case AArch64::CSINVWr: + case AArch64::CSINVXr: + case AArch64::CSINCWr: + case AArch64::CSINCXr: + case AArch64::CSELWr: + case AArch64::CSELXr: + case AArch64::CSNEGWr: + case AArch64::CSNEGXr: + case AArch64::FCSELSrrr: + case AArch64::FCSELDrrr: + CC = (AArch64CC::CondCode)Instr.getOperand(IO - 1).getImm(); + break; + } + + // It is not safe to remove Compare instruction if Overflow(V) is used. + switch (CC) { + default: + // NZCV can be used multiple times, we should continue. + break; + case AArch64CC::VS: + case AArch64CC::VC: + case AArch64CC::GE: + case AArch64CC::LT: + case AArch64CC::GT: + case AArch64CC::LE: + return false; + } + } + } + + // If NZCV is not killed nor re-defined, we should check whether it is + // live-out. If it is live-out, do not optimize. + if (!IsSafe) { + MachineBasicBlock *ParentBlock = CmpInstr->getParent(); + for (auto *MBB : ParentBlock->successors()) + if (MBB->isLiveIn(AArch64::NZCV)) + return false; + } + + // Update the instruction to set NZCV. + MI->setDesc(get(NewOpc)); + CmpInstr->eraseFromParent(); + bool succeeded = UpdateOperandRegClass(MI); + (void)succeeded; + assert(succeeded && "Some operands reg class are incompatible!"); + MI->addRegisterDefined(AArch64::NZCV, TRI); + return true; +} + +/// Return true if this is this instruction has a non-zero immediate +bool AArch64InstrInfo::hasShiftedReg(const MachineInstr *MI) const { + switch (MI->getOpcode()) { + default: + break; + case AArch64::ADDSWrs: + case AArch64::ADDSXrs: + case AArch64::ADDWrs: + case AArch64::ADDXrs: + case AArch64::ANDSWrs: + case AArch64::ANDSXrs: + case AArch64::ANDWrs: + case AArch64::ANDXrs: + case AArch64::BICSWrs: + case AArch64::BICSXrs: + case AArch64::BICWrs: + case AArch64::BICXrs: + case AArch64::CRC32Brr: + case AArch64::CRC32CBrr: + case AArch64::CRC32CHrr: + case AArch64::CRC32CWrr: + case AArch64::CRC32CXrr: + case AArch64::CRC32Hrr: + case AArch64::CRC32Wrr: + case AArch64::CRC32Xrr: + case AArch64::EONWrs: + case AArch64::EONXrs: + case AArch64::EORWrs: + case AArch64::EORXrs: + case AArch64::ORNWrs: + case AArch64::ORNXrs: + case AArch64::ORRWrs: + case AArch64::ORRXrs: + case AArch64::SUBSWrs: + case AArch64::SUBSXrs: + case AArch64::SUBWrs: + case AArch64::SUBXrs: + if (MI->getOperand(3).isImm()) { + unsigned val = MI->getOperand(3).getImm(); + return (val != 0); + } + break; + } + return false; +} + +/// Return true if this is this instruction has a non-zero immediate +bool AArch64InstrInfo::hasExtendedReg(const MachineInstr *MI) const { + switch (MI->getOpcode()) { + default: + break; + case AArch64::ADDSWrx: + case AArch64::ADDSXrx: + case AArch64::ADDSXrx64: + case AArch64::ADDWrx: + case AArch64::ADDXrx: + case AArch64::ADDXrx64: + case AArch64::SUBSWrx: + case AArch64::SUBSXrx: + case AArch64::SUBSXrx64: + case AArch64::SUBWrx: + case AArch64::SUBXrx: + case AArch64::SUBXrx64: + if (MI->getOperand(3).isImm()) { + unsigned val = MI->getOperand(3).getImm(); + return (val != 0); + } + break; + } + + return false; +} + +// Return true if this instruction simply sets its single destination register +// to zero. This is equivalent to a register rename of the zero-register. 
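+// Examples of the forms matched below: "movz w0, #0", "and w0, wzr, #imm",
+// and a plain COPY from WZR.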
+bool AArch64InstrInfo::isGPRZero(const MachineInstr *MI) const { + switch (MI->getOpcode()) { + default: + break; + case AArch64::MOVZWi: + case AArch64::MOVZXi: // movz Rd, #0 (LSL #0) + if (MI->getOperand(1).isImm() && MI->getOperand(1).getImm() == 0) { + assert(MI->getDesc().getNumOperands() == 3 && + MI->getOperand(2).getImm() == 0 && "invalid MOVZi operands"); + return true; + } + break; + case AArch64::ANDWri: // and Rd, Rzr, #imm + return MI->getOperand(1).getReg() == AArch64::WZR; + case AArch64::ANDXri: + return MI->getOperand(1).getReg() == AArch64::XZR; + case TargetOpcode::COPY: + return MI->getOperand(1).getReg() == AArch64::WZR; + } + return false; +} + +// Return true if this instruction simply renames a general register without +// modifying bits. +bool AArch64InstrInfo::isGPRCopy(const MachineInstr *MI) const { + switch (MI->getOpcode()) { + default: + break; + case TargetOpcode::COPY: { + // GPR32 copies will by lowered to ORRXrs + unsigned DstReg = MI->getOperand(0).getReg(); + return (AArch64::GPR32RegClass.contains(DstReg) || + AArch64::GPR64RegClass.contains(DstReg)); + } + case AArch64::ORRXrs: // orr Xd, Xzr, Xm (LSL #0) + if (MI->getOperand(1).getReg() == AArch64::XZR) { + assert(MI->getDesc().getNumOperands() == 4 && + MI->getOperand(3).getImm() == 0 && "invalid ORRrs operands"); + return true; + } + case AArch64::ADDXri: // add Xd, Xn, #0 (LSL #0) + if (MI->getOperand(2).getImm() == 0) { + assert(MI->getDesc().getNumOperands() == 4 && + MI->getOperand(3).getImm() == 0 && "invalid ADDXri operands"); + return true; + } + } + return false; +} + +// Return true if this instruction simply renames a general register without +// modifying bits. +bool AArch64InstrInfo::isFPRCopy(const MachineInstr *MI) const { + switch (MI->getOpcode()) { + default: + break; + case TargetOpcode::COPY: { + // FPR64 copies will by lowered to ORR.16b + unsigned DstReg = MI->getOperand(0).getReg(); + return (AArch64::FPR64RegClass.contains(DstReg) || + AArch64::FPR128RegClass.contains(DstReg)); + } + case AArch64::ORRv16i8: + if (MI->getOperand(1).getReg() == MI->getOperand(2).getReg()) { + assert(MI->getDesc().getNumOperands() == 3 && MI->getOperand(0).isReg() && + "invalid ORRv16i8 operands"); + return true; + } + } + return false; +} + +unsigned AArch64InstrInfo::isLoadFromStackSlot(const MachineInstr *MI, + int &FrameIndex) const { + switch (MI->getOpcode()) { + default: + break; + case AArch64::LDRWui: + case AArch64::LDRXui: + case AArch64::LDRBui: + case AArch64::LDRHui: + case AArch64::LDRSui: + case AArch64::LDRDui: + case AArch64::LDRQui: + if (MI->getOperand(0).getSubReg() == 0 && MI->getOperand(1).isFI() && + MI->getOperand(2).isImm() && MI->getOperand(2).getImm() == 0) { + FrameIndex = MI->getOperand(1).getIndex(); + return MI->getOperand(0).getReg(); + } + break; + } + + return 0; +} + +unsigned AArch64InstrInfo::isStoreToStackSlot(const MachineInstr *MI, + int &FrameIndex) const { + switch (MI->getOpcode()) { + default: + break; + case AArch64::STRWui: + case AArch64::STRXui: + case AArch64::STRBui: + case AArch64::STRHui: + case AArch64::STRSui: + case AArch64::STRDui: + case AArch64::STRQui: + if (MI->getOperand(0).getSubReg() == 0 && MI->getOperand(1).isFI() && + MI->getOperand(2).isImm() && MI->getOperand(2).getImm() == 0) { + FrameIndex = MI->getOperand(1).getIndex(); + return MI->getOperand(0).getReg(); + } + break; + } + return 0; +} + +/// Return true if this is load/store scales or extends its register offset. 
+/// This refers to scaling a dynamic index as opposed to scaled immediates. +/// MI should be a memory op that allows scaled addressing. +bool AArch64InstrInfo::isScaledAddr(const MachineInstr *MI) const { + switch (MI->getOpcode()) { + default: + break; + case AArch64::LDRBBroW: + case AArch64::LDRBroW: + case AArch64::LDRDroW: + case AArch64::LDRHHroW: + case AArch64::LDRHroW: + case AArch64::LDRQroW: + case AArch64::LDRSBWroW: + case AArch64::LDRSBXroW: + case AArch64::LDRSHWroW: + case AArch64::LDRSHXroW: + case AArch64::LDRSWroW: + case AArch64::LDRSroW: + case AArch64::LDRWroW: + case AArch64::LDRXroW: + case AArch64::STRBBroW: + case AArch64::STRBroW: + case AArch64::STRDroW: + case AArch64::STRHHroW: + case AArch64::STRHroW: + case AArch64::STRQroW: + case AArch64::STRSroW: + case AArch64::STRWroW: + case AArch64::STRXroW: + case AArch64::LDRBBroX: + case AArch64::LDRBroX: + case AArch64::LDRDroX: + case AArch64::LDRHHroX: + case AArch64::LDRHroX: + case AArch64::LDRQroX: + case AArch64::LDRSBWroX: + case AArch64::LDRSBXroX: + case AArch64::LDRSHWroX: + case AArch64::LDRSHXroX: + case AArch64::LDRSWroX: + case AArch64::LDRSroX: + case AArch64::LDRWroX: + case AArch64::LDRXroX: + case AArch64::STRBBroX: + case AArch64::STRBroX: + case AArch64::STRDroX: + case AArch64::STRHHroX: + case AArch64::STRHroX: + case AArch64::STRQroX: + case AArch64::STRSroX: + case AArch64::STRWroX: + case AArch64::STRXroX: + + unsigned Val = MI->getOperand(3).getImm(); + AArch64_AM::ShiftExtendType ExtType = AArch64_AM::getMemExtendType(Val); + return (ExtType != AArch64_AM::UXTX) || AArch64_AM::getMemDoShift(Val); + } + return false; +} + +/// Check all MachineMemOperands for a hint to suppress pairing. +bool AArch64InstrInfo::isLdStPairSuppressed(const MachineInstr *MI) const { + assert(MOSuppressPair < (1 << MachineMemOperand::MOTargetNumBits) && + "Too many target MO flags"); + for (auto *MM : MI->memoperands()) { + if (MM->getFlags() & + (MOSuppressPair << MachineMemOperand::MOTargetStartBit)) { + return true; + } + } + return false; +} + +/// Set a flag on the first MachineMemOperand to suppress pairing. +void AArch64InstrInfo::suppressLdStPair(MachineInstr *MI) const { + if (MI->memoperands_empty()) + return; + + assert(MOSuppressPair < (1 << MachineMemOperand::MOTargetNumBits) && + "Too many target MO flags"); + (*MI->memoperands_begin()) + ->setFlags(MOSuppressPair << MachineMemOperand::MOTargetStartBit); +} + +bool +AArch64InstrInfo::getLdStBaseRegImmOfs(MachineInstr *LdSt, unsigned &BaseReg, + unsigned &Offset, + const TargetRegisterInfo *TRI) const { + switch (LdSt->getOpcode()) { + default: + return false; + case AArch64::STRSui: + case AArch64::STRDui: + case AArch64::STRQui: + case AArch64::STRXui: + case AArch64::STRWui: + case AArch64::LDRSui: + case AArch64::LDRDui: + case AArch64::LDRQui: + case AArch64::LDRXui: + case AArch64::LDRWui: + if (!LdSt->getOperand(1).isReg() || !LdSt->getOperand(2).isImm()) + return false; + BaseReg = LdSt->getOperand(1).getReg(); + MachineFunction &MF = *LdSt->getParent()->getParent(); + unsigned Width = getRegClass(LdSt->getDesc(), 0, TRI, MF)->getSize(); + Offset = LdSt->getOperand(2).getImm() * Width; + return true; + }; +} + +/// Detect opportunities for ldp/stp formation. +/// +/// Only called for LdSt for which getLdStBaseRegImmOfs returns true. +bool AArch64InstrInfo::shouldClusterLoads(MachineInstr *FirstLdSt, + MachineInstr *SecondLdSt, + unsigned NumLoads) const { + // Only cluster up to a single pair. 
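+  // (For example, "ldr x0, [x8]" and "ldr x1, [x8, #8]" satisfy these checks
+  //  and may later be merged into "ldp x0, x1, [x8]".)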
+ if (NumLoads > 1) + return false; + if (FirstLdSt->getOpcode() != SecondLdSt->getOpcode()) + return false; + // getLdStBaseRegImmOfs guarantees that oper 2 isImm. + unsigned Ofs1 = FirstLdSt->getOperand(2).getImm(); + // Allow 6 bits of positive range. + if (Ofs1 > 64) + return false; + // The caller should already have ordered First/SecondLdSt by offset. + unsigned Ofs2 = SecondLdSt->getOperand(2).getImm(); + return Ofs1 + 1 == Ofs2; +} + +bool AArch64InstrInfo::shouldScheduleAdjacent(MachineInstr *First, + MachineInstr *Second) const { + // Cyclone can fuse CMN, CMP followed by Bcc. + + // FIXME: B0 can also fuse: + // AND, BIC, ORN, ORR, or EOR (optional S) followed by Bcc or CBZ or CBNZ. + if (Second->getOpcode() != AArch64::Bcc) + return false; + switch (First->getOpcode()) { + default: + return false; + case AArch64::SUBSWri: + case AArch64::ADDSWri: + case AArch64::ANDSWri: + case AArch64::SUBSXri: + case AArch64::ADDSXri: + case AArch64::ANDSXri: + return true; + } +} + +MachineInstr *AArch64InstrInfo::emitFrameIndexDebugValue(MachineFunction &MF, + int FrameIx, + uint64_t Offset, + const MDNode *MDPtr, + DebugLoc DL) const { + MachineInstrBuilder MIB = BuildMI(MF, DL, get(AArch64::DBG_VALUE)) + .addFrameIndex(FrameIx) + .addImm(0) + .addImm(Offset) + .addMetadata(MDPtr); + return &*MIB; +} + +static const MachineInstrBuilder &AddSubReg(const MachineInstrBuilder &MIB, + unsigned Reg, unsigned SubIdx, + unsigned State, + const TargetRegisterInfo *TRI) { + if (!SubIdx) + return MIB.addReg(Reg, State); + + if (TargetRegisterInfo::isPhysicalRegister(Reg)) + return MIB.addReg(TRI->getSubReg(Reg, SubIdx), State); + return MIB.addReg(Reg, State, SubIdx); +} + +static bool forwardCopyWillClobberTuple(unsigned DestReg, unsigned SrcReg, + unsigned NumRegs) { + // We really want the positive remainder mod 32 here, that happens to be + // easily obtainable with a mask. + return ((DestReg - SrcReg) & 0x1f) < NumRegs; +} + +void AArch64InstrInfo::copyPhysRegTuple( + MachineBasicBlock &MBB, MachineBasicBlock::iterator I, DebugLoc DL, + unsigned DestReg, unsigned SrcReg, bool KillSrc, unsigned Opcode, + llvm::ArrayRef Indices) const { + assert(getSubTarget().hasNEON() && + "Unexpected register copy without NEON"); + const TargetRegisterInfo *TRI = &getRegisterInfo(); + uint16_t DestEncoding = TRI->getEncodingValue(DestReg); + uint16_t SrcEncoding = TRI->getEncodingValue(SrcReg); + unsigned NumRegs = Indices.size(); + + int SubReg = 0, End = NumRegs, Incr = 1; + if (forwardCopyWillClobberTuple(DestEncoding, SrcEncoding, NumRegs)) { + SubReg = NumRegs - 1; + End = -1; + Incr = -1; + } + + for (; SubReg != End; SubReg += Incr) { + const MachineInstrBuilder &MIB = BuildMI(MBB, I, DL, get(Opcode)); + AddSubReg(MIB, DestReg, Indices[SubReg], RegState::Define, TRI); + AddSubReg(MIB, SrcReg, Indices[SubReg], 0, TRI); + AddSubReg(MIB, SrcReg, Indices[SubReg], getKillRegState(KillSrc), TRI); + } +} + +void AArch64InstrInfo::copyPhysReg(MachineBasicBlock &MBB, + MachineBasicBlock::iterator I, DebugLoc DL, + unsigned DestReg, unsigned SrcReg, + bool KillSrc) const { + if (AArch64::GPR32spRegClass.contains(DestReg) && + (AArch64::GPR32spRegClass.contains(SrcReg) || SrcReg == AArch64::WZR)) { + const TargetRegisterInfo *TRI = &getRegisterInfo(); + + if (DestReg == AArch64::WSP || SrcReg == AArch64::WSP) { + // If either operand is WSP, expand to ADD #0. + if (Subtarget.hasZeroCycleRegMove()) { + // Cyclone recognizes "ADD Xd, Xn, #0" as a zero-cycle register move. 
+ unsigned DestRegX = TRI->getMatchingSuperReg(DestReg, AArch64::sub_32, + &AArch64::GPR64spRegClass); + unsigned SrcRegX = TRI->getMatchingSuperReg(SrcReg, AArch64::sub_32, + &AArch64::GPR64spRegClass); + // This instruction is reading and writing X registers. This may upset + // the register scavenger and machine verifier, so we need to indicate + // that we are reading an undefined value from SrcRegX, but a proper + // value from SrcReg. + BuildMI(MBB, I, DL, get(AArch64::ADDXri), DestRegX) + .addReg(SrcRegX, RegState::Undef) + .addImm(0) + .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, 0)) + .addReg(SrcReg, RegState::Implicit | getKillRegState(KillSrc)); + } else { + BuildMI(MBB, I, DL, get(AArch64::ADDWri), DestReg) + .addReg(SrcReg, getKillRegState(KillSrc)) + .addImm(0) + .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, 0)); + } + } else if (SrcReg == AArch64::WZR && Subtarget.hasZeroCycleZeroing()) { + BuildMI(MBB, I, DL, get(AArch64::MOVZWi), DestReg).addImm(0).addImm( + AArch64_AM::getShifterImm(AArch64_AM::LSL, 0)); + } else { + if (Subtarget.hasZeroCycleRegMove()) { + // Cyclone recognizes "ORR Xd, XZR, Xm" as a zero-cycle register move. + unsigned DestRegX = TRI->getMatchingSuperReg(DestReg, AArch64::sub_32, + &AArch64::GPR64spRegClass); + unsigned SrcRegX = TRI->getMatchingSuperReg(SrcReg, AArch64::sub_32, + &AArch64::GPR64spRegClass); + // This instruction is reading and writing X registers. This may upset + // the register scavenger and machine verifier, so we need to indicate + // that we are reading an undefined value from SrcRegX, but a proper + // value from SrcReg. + BuildMI(MBB, I, DL, get(AArch64::ORRXrr), DestRegX) + .addReg(AArch64::XZR) + .addReg(SrcRegX, RegState::Undef) + .addReg(SrcReg, RegState::Implicit | getKillRegState(KillSrc)); + } else { + // Otherwise, expand to ORR WZR. + BuildMI(MBB, I, DL, get(AArch64::ORRWrr), DestReg) + .addReg(AArch64::WZR) + .addReg(SrcReg, getKillRegState(KillSrc)); + } + } + return; + } + + if (AArch64::GPR64spRegClass.contains(DestReg) && + (AArch64::GPR64spRegClass.contains(SrcReg) || SrcReg == AArch64::XZR)) { + if (DestReg == AArch64::SP || SrcReg == AArch64::SP) { + // If either operand is SP, expand to ADD #0. + BuildMI(MBB, I, DL, get(AArch64::ADDXri), DestReg) + .addReg(SrcReg, getKillRegState(KillSrc)) + .addImm(0) + .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, 0)); + } else if (SrcReg == AArch64::XZR && Subtarget.hasZeroCycleZeroing()) { + BuildMI(MBB, I, DL, get(AArch64::MOVZXi), DestReg).addImm(0).addImm( + AArch64_AM::getShifterImm(AArch64_AM::LSL, 0)); + } else { + // Otherwise, expand to ORR XZR. + BuildMI(MBB, I, DL, get(AArch64::ORRXrr), DestReg) + .addReg(AArch64::XZR) + .addReg(SrcReg, getKillRegState(KillSrc)); + } + return; + } + + // Copy a DDDD register quad by copying the individual sub-registers. + if (AArch64::DDDDRegClass.contains(DestReg) && + AArch64::DDDDRegClass.contains(SrcReg)) { + static const unsigned Indices[] = { AArch64::dsub0, AArch64::dsub1, + AArch64::dsub2, AArch64::dsub3 }; + copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORRv8i8, + Indices); + return; + } + + // Copy a DDD register triple by copying the individual sub-registers. 
+ if (AArch64::DDDRegClass.contains(DestReg) && + AArch64::DDDRegClass.contains(SrcReg)) { + static const unsigned Indices[] = { AArch64::dsub0, AArch64::dsub1, + AArch64::dsub2 }; + copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORRv8i8, + Indices); + return; + } + + // Copy a DD register pair by copying the individual sub-registers. + if (AArch64::DDRegClass.contains(DestReg) && + AArch64::DDRegClass.contains(SrcReg)) { + static const unsigned Indices[] = { AArch64::dsub0, AArch64::dsub1 }; + copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORRv8i8, + Indices); + return; + } + + // Copy a QQQQ register quad by copying the individual sub-registers. + if (AArch64::QQQQRegClass.contains(DestReg) && + AArch64::QQQQRegClass.contains(SrcReg)) { + static const unsigned Indices[] = { AArch64::qsub0, AArch64::qsub1, + AArch64::qsub2, AArch64::qsub3 }; + copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORRv16i8, + Indices); + return; + } + + // Copy a QQQ register triple by copying the individual sub-registers. + if (AArch64::QQQRegClass.contains(DestReg) && + AArch64::QQQRegClass.contains(SrcReg)) { + static const unsigned Indices[] = { AArch64::qsub0, AArch64::qsub1, + AArch64::qsub2 }; + copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORRv16i8, + Indices); + return; + } + + // Copy a QQ register pair by copying the individual sub-registers. + if (AArch64::QQRegClass.contains(DestReg) && + AArch64::QQRegClass.contains(SrcReg)) { + static const unsigned Indices[] = { AArch64::qsub0, AArch64::qsub1 }; + copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORRv16i8, + Indices); + return; + } + + if (AArch64::FPR128RegClass.contains(DestReg) && + AArch64::FPR128RegClass.contains(SrcReg)) { + if(getSubTarget().hasNEON()) { + BuildMI(MBB, I, DL, get(AArch64::ORRv16i8), DestReg) + .addReg(SrcReg) + .addReg(SrcReg, getKillRegState(KillSrc)); + } else { + BuildMI(MBB, I, DL, get(AArch64::STRQpre)) + .addReg(AArch64::SP, RegState::Define) + .addReg(SrcReg, getKillRegState(KillSrc)) + .addReg(AArch64::SP) + .addImm(-16); + BuildMI(MBB, I, DL, get(AArch64::LDRQpre)) + .addReg(AArch64::SP, RegState::Define) + .addReg(DestReg, RegState::Define) + .addReg(AArch64::SP) + .addImm(16); + } + return; + } + + if (AArch64::FPR64RegClass.contains(DestReg) && + AArch64::FPR64RegClass.contains(SrcReg)) { + if(getSubTarget().hasNEON()) { + DestReg = RI.getMatchingSuperReg(DestReg, AArch64::dsub, + &AArch64::FPR128RegClass); + SrcReg = RI.getMatchingSuperReg(SrcReg, AArch64::dsub, + &AArch64::FPR128RegClass); + BuildMI(MBB, I, DL, get(AArch64::ORRv16i8), DestReg) + .addReg(SrcReg) + .addReg(SrcReg, getKillRegState(KillSrc)); + } else { + BuildMI(MBB, I, DL, get(AArch64::FMOVDr), DestReg) + .addReg(SrcReg, getKillRegState(KillSrc)); + } + return; + } + + if (AArch64::FPR32RegClass.contains(DestReg) && + AArch64::FPR32RegClass.contains(SrcReg)) { + if(getSubTarget().hasNEON()) { + DestReg = RI.getMatchingSuperReg(DestReg, AArch64::ssub, + &AArch64::FPR128RegClass); + SrcReg = RI.getMatchingSuperReg(SrcReg, AArch64::ssub, + &AArch64::FPR128RegClass); + BuildMI(MBB, I, DL, get(AArch64::ORRv16i8), DestReg) + .addReg(SrcReg) + .addReg(SrcReg, getKillRegState(KillSrc)); + } else { + BuildMI(MBB, I, DL, get(AArch64::FMOVSr), DestReg) + .addReg(SrcReg, getKillRegState(KillSrc)); + } + return; + } + + if (AArch64::FPR16RegClass.contains(DestReg) && + AArch64::FPR16RegClass.contains(SrcReg)) { + if(getSubTarget().hasNEON()) { + DestReg = 
RI.getMatchingSuperReg(DestReg, AArch64::hsub, + &AArch64::FPR128RegClass); + SrcReg = RI.getMatchingSuperReg(SrcReg, AArch64::hsub, + &AArch64::FPR128RegClass); + BuildMI(MBB, I, DL, get(AArch64::ORRv16i8), DestReg) + .addReg(SrcReg) + .addReg(SrcReg, getKillRegState(KillSrc)); + } else { + DestReg = RI.getMatchingSuperReg(DestReg, AArch64::hsub, + &AArch64::FPR32RegClass); + SrcReg = RI.getMatchingSuperReg(SrcReg, AArch64::hsub, + &AArch64::FPR32RegClass); + BuildMI(MBB, I, DL, get(AArch64::FMOVSr), DestReg) + .addReg(SrcReg, getKillRegState(KillSrc)); + } + return; + } + + if (AArch64::FPR8RegClass.contains(DestReg) && + AArch64::FPR8RegClass.contains(SrcReg)) { + if(getSubTarget().hasNEON()) { + DestReg = RI.getMatchingSuperReg(DestReg, AArch64::bsub, + &AArch64::FPR128RegClass); + SrcReg = RI.getMatchingSuperReg(SrcReg, AArch64::bsub, + &AArch64::FPR128RegClass); + BuildMI(MBB, I, DL, get(AArch64::ORRv16i8), DestReg) + .addReg(SrcReg) + .addReg(SrcReg, getKillRegState(KillSrc)); + } else { + DestReg = RI.getMatchingSuperReg(DestReg, AArch64::bsub, + &AArch64::FPR32RegClass); + SrcReg = RI.getMatchingSuperReg(SrcReg, AArch64::bsub, + &AArch64::FPR32RegClass); + BuildMI(MBB, I, DL, get(AArch64::FMOVSr), DestReg) + .addReg(SrcReg, getKillRegState(KillSrc)); + } + return; + } + + // Copies between GPR64 and FPR64. + if (AArch64::FPR64RegClass.contains(DestReg) && + AArch64::GPR64RegClass.contains(SrcReg)) { + BuildMI(MBB, I, DL, get(AArch64::FMOVXDr), DestReg) + .addReg(SrcReg, getKillRegState(KillSrc)); + return; + } + if (AArch64::GPR64RegClass.contains(DestReg) && + AArch64::FPR64RegClass.contains(SrcReg)) { + BuildMI(MBB, I, DL, get(AArch64::FMOVDXr), DestReg) + .addReg(SrcReg, getKillRegState(KillSrc)); + return; + } + // Copies between GPR32 and FPR32. 
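+  // (These lower to "fmov s0, w0" and "fmov w0, s0" style moves,
+  //  respectively.)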
+ if (AArch64::FPR32RegClass.contains(DestReg) && + AArch64::GPR32RegClass.contains(SrcReg)) { + BuildMI(MBB, I, DL, get(AArch64::FMOVWSr), DestReg) + .addReg(SrcReg, getKillRegState(KillSrc)); + return; + } + if (AArch64::GPR32RegClass.contains(DestReg) && + AArch64::FPR32RegClass.contains(SrcReg)) { + BuildMI(MBB, I, DL, get(AArch64::FMOVSWr), DestReg) + .addReg(SrcReg, getKillRegState(KillSrc)); + return; + } + + assert(0 && "unimplemented reg-to-reg copy"); +} + +void AArch64InstrInfo::storeRegToStackSlot( + MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, unsigned SrcReg, + bool isKill, int FI, const TargetRegisterClass *RC, + const TargetRegisterInfo *TRI) const { + DebugLoc DL; + if (MBBI != MBB.end()) + DL = MBBI->getDebugLoc(); + MachineFunction &MF = *MBB.getParent(); + MachineFrameInfo &MFI = *MF.getFrameInfo(); + unsigned Align = MFI.getObjectAlignment(FI); + + MachinePointerInfo PtrInfo(PseudoSourceValue::getFixedStack(FI)); + MachineMemOperand *MMO = MF.getMachineMemOperand( + PtrInfo, MachineMemOperand::MOStore, MFI.getObjectSize(FI), Align); + unsigned Opc = 0; + bool Offset = true; + switch (RC->getSize()) { + case 1: + if (AArch64::FPR8RegClass.hasSubClassEq(RC)) + Opc = AArch64::STRBui; + break; + case 2: + if (AArch64::FPR16RegClass.hasSubClassEq(RC)) + Opc = AArch64::STRHui; + break; + case 4: + if (AArch64::GPR32allRegClass.hasSubClassEq(RC)) { + Opc = AArch64::STRWui; + if (TargetRegisterInfo::isVirtualRegister(SrcReg)) + MF.getRegInfo().constrainRegClass(SrcReg, &AArch64::GPR32RegClass); + else + assert(SrcReg != AArch64::WSP); + } else if (AArch64::FPR32RegClass.hasSubClassEq(RC)) + Opc = AArch64::STRSui; + break; + case 8: + if (AArch64::GPR64allRegClass.hasSubClassEq(RC)) { + Opc = AArch64::STRXui; + if (TargetRegisterInfo::isVirtualRegister(SrcReg)) + MF.getRegInfo().constrainRegClass(SrcReg, &AArch64::GPR64RegClass); + else + assert(SrcReg != AArch64::SP); + } else if (AArch64::FPR64RegClass.hasSubClassEq(RC)) + Opc = AArch64::STRDui; + break; + case 16: + if (AArch64::FPR128RegClass.hasSubClassEq(RC)) + Opc = AArch64::STRQui; + else if (AArch64::DDRegClass.hasSubClassEq(RC)) { + assert(getSubTarget().hasNEON() && + "Unexpected register store without NEON"); + Opc = AArch64::ST1Twov1d, Offset = false; + } + break; + case 24: + if (AArch64::DDDRegClass.hasSubClassEq(RC)) { + assert(getSubTarget().hasNEON() && + "Unexpected register store without NEON"); + Opc = AArch64::ST1Threev1d, Offset = false; + } + break; + case 32: + if (AArch64::DDDDRegClass.hasSubClassEq(RC)) { + assert(getSubTarget().hasNEON() && + "Unexpected register store without NEON"); + Opc = AArch64::ST1Fourv1d, Offset = false; + } else if (AArch64::QQRegClass.hasSubClassEq(RC)) { + assert(getSubTarget().hasNEON() && + "Unexpected register store without NEON"); + Opc = AArch64::ST1Twov2d, Offset = false; + } + break; + case 48: + if (AArch64::QQQRegClass.hasSubClassEq(RC)) { + assert(getSubTarget().hasNEON() && + "Unexpected register store without NEON"); + Opc = AArch64::ST1Threev2d, Offset = false; + } + break; + case 64: + if (AArch64::QQQQRegClass.hasSubClassEq(RC)) { + assert(getSubTarget().hasNEON() && + "Unexpected register store without NEON"); + Opc = AArch64::ST1Fourv2d, Offset = false; + } + break; + } + assert(Opc && "Unknown register class"); + + const MachineInstrBuilder &MI = BuildMI(MBB, MBBI, DL, get(Opc)) + .addReg(SrcReg, getKillRegState(isKill)) + .addFrameIndex(FI); + + if (Offset) + MI.addImm(0); + MI.addMemOperand(MMO); +} + +void 
AArch64InstrInfo::loadRegFromStackSlot( + MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, unsigned DestReg, + int FI, const TargetRegisterClass *RC, + const TargetRegisterInfo *TRI) const { + DebugLoc DL; + if (MBBI != MBB.end()) + DL = MBBI->getDebugLoc(); + MachineFunction &MF = *MBB.getParent(); + MachineFrameInfo &MFI = *MF.getFrameInfo(); + unsigned Align = MFI.getObjectAlignment(FI); + MachinePointerInfo PtrInfo(PseudoSourceValue::getFixedStack(FI)); + MachineMemOperand *MMO = MF.getMachineMemOperand( + PtrInfo, MachineMemOperand::MOLoad, MFI.getObjectSize(FI), Align); + + unsigned Opc = 0; + bool Offset = true; + switch (RC->getSize()) { + case 1: + if (AArch64::FPR8RegClass.hasSubClassEq(RC)) + Opc = AArch64::LDRBui; + break; + case 2: + if (AArch64::FPR16RegClass.hasSubClassEq(RC)) + Opc = AArch64::LDRHui; + break; + case 4: + if (AArch64::GPR32allRegClass.hasSubClassEq(RC)) { + Opc = AArch64::LDRWui; + if (TargetRegisterInfo::isVirtualRegister(DestReg)) + MF.getRegInfo().constrainRegClass(DestReg, &AArch64::GPR32RegClass); + else + assert(DestReg != AArch64::WSP); + } else if (AArch64::FPR32RegClass.hasSubClassEq(RC)) + Opc = AArch64::LDRSui; + break; + case 8: + if (AArch64::GPR64allRegClass.hasSubClassEq(RC)) { + Opc = AArch64::LDRXui; + if (TargetRegisterInfo::isVirtualRegister(DestReg)) + MF.getRegInfo().constrainRegClass(DestReg, &AArch64::GPR64RegClass); + else + assert(DestReg != AArch64::SP); + } else if (AArch64::FPR64RegClass.hasSubClassEq(RC)) + Opc = AArch64::LDRDui; + break; + case 16: + if (AArch64::FPR128RegClass.hasSubClassEq(RC)) + Opc = AArch64::LDRQui; + else if (AArch64::DDRegClass.hasSubClassEq(RC)) { + assert(getSubTarget().hasNEON() && + "Unexpected register load without NEON"); + Opc = AArch64::LD1Twov1d, Offset = false; + } + break; + case 24: + if (AArch64::DDDRegClass.hasSubClassEq(RC)) { + assert(getSubTarget().hasNEON() && + "Unexpected register load without NEON"); + Opc = AArch64::LD1Threev1d, Offset = false; + } + break; + case 32: + if (AArch64::DDDDRegClass.hasSubClassEq(RC)) { + assert(getSubTarget().hasNEON() && + "Unexpected register load without NEON"); + Opc = AArch64::LD1Fourv1d, Offset = false; + } else if (AArch64::QQRegClass.hasSubClassEq(RC)) { + assert(getSubTarget().hasNEON() && + "Unexpected register load without NEON"); + Opc = AArch64::LD1Twov2d, Offset = false; + } + break; + case 48: + if (AArch64::QQQRegClass.hasSubClassEq(RC)) { + assert(getSubTarget().hasNEON() && + "Unexpected register load without NEON"); + Opc = AArch64::LD1Threev2d, Offset = false; + } + break; + case 64: + if (AArch64::QQQQRegClass.hasSubClassEq(RC)) { + assert(getSubTarget().hasNEON() && + "Unexpected register load without NEON"); + Opc = AArch64::LD1Fourv2d, Offset = false; + } + break; + } + assert(Opc && "Unknown register class"); + + const MachineInstrBuilder &MI = BuildMI(MBB, MBBI, DL, get(Opc)) + .addReg(DestReg, getDefRegState(true)) + .addFrameIndex(FI); + if (Offset) + MI.addImm(0); + MI.addMemOperand(MMO); +} + +void llvm::emitFrameOffset(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MBBI, DebugLoc DL, + unsigned DestReg, unsigned SrcReg, int Offset, + const AArch64InstrInfo *TII, + MachineInstr::MIFlag Flag, bool SetNZCV) { + if (DestReg == SrcReg && Offset == 0) + return; + + bool isSub = Offset < 0; + if (isSub) + Offset = -Offset; + + // FIXME: If the offset won't fit in 24-bits, compute the offset into a + // scratch register. 
If DestReg is a virtual register, use it as the + // scratch register; otherwise, create a new virtual register (to be + // replaced by the scavenger at the end of PEI). That case can be optimized + // slightly if DestReg is SP which is always 16-byte aligned, so the scratch + // register can be loaded with offset%8 and the add/sub can use an extending + // instruction with LSL#3. + // Currently the function handles any offsets but generates a poor sequence + // of code. + // assert(Offset < (1 << 24) && "unimplemented reg plus immediate"); + + unsigned Opc; + if (SetNZCV) + Opc = isSub ? AArch64::SUBSXri : AArch64::ADDSXri; + else + Opc = isSub ? AArch64::SUBXri : AArch64::ADDXri; + const unsigned MaxEncoding = 0xfff; + const unsigned ShiftSize = 12; + const unsigned MaxEncodableValue = MaxEncoding << ShiftSize; + while (((unsigned)Offset) >= (1 << ShiftSize)) { + unsigned ThisVal; + if (((unsigned)Offset) > MaxEncodableValue) { + ThisVal = MaxEncodableValue; + } else { + ThisVal = Offset & MaxEncodableValue; + } + assert((ThisVal >> ShiftSize) <= MaxEncoding && + "Encoding cannot handle value that big"); + BuildMI(MBB, MBBI, DL, TII->get(Opc), DestReg) + .addReg(SrcReg) + .addImm(ThisVal >> ShiftSize) + .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, ShiftSize)) + .setMIFlag(Flag); + + SrcReg = DestReg; + Offset -= ThisVal; + if (Offset == 0) + return; + } + BuildMI(MBB, MBBI, DL, TII->get(Opc), DestReg) + .addReg(SrcReg) + .addImm(Offset) + .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, 0)) + .setMIFlag(Flag); +} + +MachineInstr * +AArch64InstrInfo::foldMemoryOperandImpl(MachineFunction &MF, MachineInstr *MI, + const SmallVectorImpl &Ops, + int FrameIndex) const { + // This is a bit of a hack. Consider this instruction: + // + // %vreg0 = COPY %SP; GPR64all:%vreg0 + // + // We explicitly chose GPR64all for the virtual register so such a copy might + // be eliminated by RegisterCoalescer. However, that may not be possible, and + // %vreg0 may even spill. We can't spill %SP, and since it is in the GPR64all + // register class, TargetInstrInfo::foldMemoryOperand() is going to try. + // + // To prevent that, we are going to constrain the %vreg0 register class here. + // + // + // + if (MI->isCopy()) { + unsigned DstReg = MI->getOperand(0).getReg(); + unsigned SrcReg = MI->getOperand(1).getReg(); + if (SrcReg == AArch64::SP && + TargetRegisterInfo::isVirtualRegister(DstReg)) { + MF.getRegInfo().constrainRegClass(DstReg, &AArch64::GPR64RegClass); + return nullptr; + } + if (DstReg == AArch64::SP && + TargetRegisterInfo::isVirtualRegister(SrcReg)) { + MF.getRegInfo().constrainRegClass(SrcReg, &AArch64::GPR64RegClass); + return nullptr; + } + } + + // Cannot fold. + return nullptr; +} + +int llvm::isAArch64FrameOffsetLegal(const MachineInstr &MI, int &Offset, + bool *OutUseUnscaledOp, + unsigned *OutUnscaledOp, + int *EmittableOffset) { + int Scale = 1; + bool IsSigned = false; + // The ImmIdx should be changed case by case if it is not 2. + unsigned ImmIdx = 2; + unsigned UnscaledOp = 0; + // Set output values in case of early exit. + if (EmittableOffset) + *EmittableOffset = 0; + if (OutUseUnscaledOp) + *OutUseUnscaledOp = false; + if (OutUnscaledOp) + *OutUnscaledOp = 0; + switch (MI.getOpcode()) { + default: + assert(0 && "unhandled opcode in rewriteAArch64FrameIndex"); + // Vector spills/fills can't take an immediate offset. 
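The ST1/LD1 multi-register forms used for the D- and Q-tuple spills below have no immediate-offset addressing mode, which is why those cases report AArch64FrameOffsetCannotUpdate and leave the address to be materialized separately. For reference, a standalone sketch of the 12-bit chunking the emitFrameOffset loop above performs (plain C++, not the LLVM API; the real code also threads NZCV and MI flags):

#include <cstdio>
#include <cstdlib>
#include <vector>

// Split a byte offset into the (imm12, optionally LSL #12) chunks that the
// emitFrameOffset loop above would emit as successive ADD/SUB instructions.
std::vector<unsigned> chunkFrameOffset(int Offset) {
  unsigned Remaining = std::abs(Offset);
  const unsigned MaxEncoding = 0xfff, ShiftSize = 12;
  const unsigned MaxEncodableValue = MaxEncoding << ShiftSize; // 0xfff000
  std::vector<unsigned> Chunks;
  while (Remaining >= (1u << ShiftSize)) {
    unsigned ThisVal = Remaining > MaxEncodableValue
                           ? MaxEncodableValue
                           : (Remaining & MaxEncodableValue);
    Chunks.push_back(ThisVal); // ADD/SUB ..., #(ThisVal >> 12), lsl #12
    Remaining -= ThisVal;
    if (Remaining == 0)
      return Chunks;
  }
  Chunks.push_back(Remaining); // final ADD/SUB ..., #Remaining
  return Chunks;
}

int main() {
  for (unsigned C : chunkFrameOffset(0x1234567))
    std::printf("0x%x\n", C); // 0xfff000, 0x235000, 0x567
}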
+ case AArch64::LD1Twov2d: + case AArch64::LD1Threev2d: + case AArch64::LD1Fourv2d: + case AArch64::LD1Twov1d: + case AArch64::LD1Threev1d: + case AArch64::LD1Fourv1d: + case AArch64::ST1Twov2d: + case AArch64::ST1Threev2d: + case AArch64::ST1Fourv2d: + case AArch64::ST1Twov1d: + case AArch64::ST1Threev1d: + case AArch64::ST1Fourv1d: + return AArch64FrameOffsetCannotUpdate; + case AArch64::PRFMui: + Scale = 8; + UnscaledOp = AArch64::PRFUMi; + break; + case AArch64::LDRXui: + Scale = 8; + UnscaledOp = AArch64::LDURXi; + break; + case AArch64::LDRWui: + Scale = 4; + UnscaledOp = AArch64::LDURWi; + break; + case AArch64::LDRBui: + Scale = 1; + UnscaledOp = AArch64::LDURBi; + break; + case AArch64::LDRHui: + Scale = 2; + UnscaledOp = AArch64::LDURHi; + break; + case AArch64::LDRSui: + Scale = 4; + UnscaledOp = AArch64::LDURSi; + break; + case AArch64::LDRDui: + Scale = 8; + UnscaledOp = AArch64::LDURDi; + break; + case AArch64::LDRQui: + Scale = 16; + UnscaledOp = AArch64::LDURQi; + break; + case AArch64::LDRBBui: + Scale = 1; + UnscaledOp = AArch64::LDURBBi; + break; + case AArch64::LDRHHui: + Scale = 2; + UnscaledOp = AArch64::LDURHHi; + break; + case AArch64::LDRSBXui: + Scale = 1; + UnscaledOp = AArch64::LDURSBXi; + break; + case AArch64::LDRSBWui: + Scale = 1; + UnscaledOp = AArch64::LDURSBWi; + break; + case AArch64::LDRSHXui: + Scale = 2; + UnscaledOp = AArch64::LDURSHXi; + break; + case AArch64::LDRSHWui: + Scale = 2; + UnscaledOp = AArch64::LDURSHWi; + break; + case AArch64::LDRSWui: + Scale = 4; + UnscaledOp = AArch64::LDURSWi; + break; + + case AArch64::STRXui: + Scale = 8; + UnscaledOp = AArch64::STURXi; + break; + case AArch64::STRWui: + Scale = 4; + UnscaledOp = AArch64::STURWi; + break; + case AArch64::STRBui: + Scale = 1; + UnscaledOp = AArch64::STURBi; + break; + case AArch64::STRHui: + Scale = 2; + UnscaledOp = AArch64::STURHi; + break; + case AArch64::STRSui: + Scale = 4; + UnscaledOp = AArch64::STURSi; + break; + case AArch64::STRDui: + Scale = 8; + UnscaledOp = AArch64::STURDi; + break; + case AArch64::STRQui: + Scale = 16; + UnscaledOp = AArch64::STURQi; + break; + case AArch64::STRBBui: + Scale = 1; + UnscaledOp = AArch64::STURBBi; + break; + case AArch64::STRHHui: + Scale = 2; + UnscaledOp = AArch64::STURHHi; + break; + + case AArch64::LDPXi: + case AArch64::LDPDi: + case AArch64::STPXi: + case AArch64::STPDi: + IsSigned = true; + Scale = 8; + break; + case AArch64::LDPQi: + case AArch64::STPQi: + IsSigned = true; + Scale = 16; + break; + case AArch64::LDPWi: + case AArch64::LDPSi: + case AArch64::STPWi: + case AArch64::STPSi: + IsSigned = true; + Scale = 4; + break; + + case AArch64::LDURXi: + case AArch64::LDURWi: + case AArch64::LDURBi: + case AArch64::LDURHi: + case AArch64::LDURSi: + case AArch64::LDURDi: + case AArch64::LDURQi: + case AArch64::LDURHHi: + case AArch64::LDURBBi: + case AArch64::LDURSBXi: + case AArch64::LDURSBWi: + case AArch64::LDURSHXi: + case AArch64::LDURSHWi: + case AArch64::LDURSWi: + case AArch64::STURXi: + case AArch64::STURWi: + case AArch64::STURBi: + case AArch64::STURHi: + case AArch64::STURSi: + case AArch64::STURDi: + case AArch64::STURQi: + case AArch64::STURBBi: + case AArch64::STURHHi: + Scale = 1; + break; + } + + Offset += MI.getOperand(ImmIdx).getImm() * Scale; + + bool useUnscaledOp = false; + // If the offset doesn't match the scale, we rewrite the instruction to + // use the unscaled instruction instead. Likewise, if we have a negative + // offset (and have an unscaled op to use). 
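The choice being set up here is between the two immediate forms the plain loads and stores offer: the "ui" opcodes above take an unsigned 12-bit immediate scaled by the access size, while the LDUR/STUR ("unscaled") forms take a signed 9-bit byte offset (the LDP/STP cases use a signed 7-bit scaled immediate instead). A quick standalone check of the first two ranges, mirroring the MaskBits logic below (illustrative only):

#include <cstdio>

// "ui" forms: unsigned imm12 scaled by the access size (Scale).
bool fitsScaled(long Offset, int Scale) {
  return Offset >= 0 && Offset % Scale == 0 && Offset / Scale <= 0xfff;
}
// Unscaled LDUR/STUR forms: signed 9-bit byte offset.
bool fitsUnscaled(long Offset) { return Offset >= -256 && Offset <= 255; }

int main() {
  // An 8-byte access at offset -8 needs the unscaled form; at offset 32760
  // (4095 * 8) the scaled form still works.
  std::printf("%d %d %d\n", fitsScaled(-8, 8), fitsUnscaled(-8),
              fitsScaled(32760, 8));
}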
+ if ((Offset & (Scale - 1)) != 0 || (Offset < 0 && UnscaledOp != 0)) + useUnscaledOp = true; + + // Use an unscaled addressing mode if the instruction has a negative offset + // (or if the instruction is already using an unscaled addressing mode). + unsigned MaskBits; + if (IsSigned) { + // ldp/stp instructions. + MaskBits = 7; + Offset /= Scale; + } else if (UnscaledOp == 0 || useUnscaledOp) { + MaskBits = 9; + IsSigned = true; + Scale = 1; + } else { + MaskBits = 12; + IsSigned = false; + Offset /= Scale; + } + + // Attempt to fold address computation. + int MaxOff = (1 << (MaskBits - IsSigned)) - 1; + int MinOff = (IsSigned ? (-MaxOff - 1) : 0); + if (Offset >= MinOff && Offset <= MaxOff) { + if (EmittableOffset) + *EmittableOffset = Offset; + Offset = 0; + } else { + int NewOff = Offset < 0 ? MinOff : MaxOff; + if (EmittableOffset) + *EmittableOffset = NewOff; + Offset = (Offset - NewOff) * Scale; + } + if (OutUseUnscaledOp) + *OutUseUnscaledOp = useUnscaledOp; + if (OutUnscaledOp) + *OutUnscaledOp = UnscaledOp; + return AArch64FrameOffsetCanUpdate | + (Offset == 0 ? AArch64FrameOffsetIsLegal : 0); +} + +bool llvm::rewriteAArch64FrameIndex(MachineInstr &MI, unsigned FrameRegIdx, + unsigned FrameReg, int &Offset, + const AArch64InstrInfo *TII) { + unsigned Opcode = MI.getOpcode(); + unsigned ImmIdx = FrameRegIdx + 1; + + if (Opcode == AArch64::ADDSXri || Opcode == AArch64::ADDXri) { + Offset += MI.getOperand(ImmIdx).getImm(); + emitFrameOffset(*MI.getParent(), MI, MI.getDebugLoc(), + MI.getOperand(0).getReg(), FrameReg, Offset, TII, + MachineInstr::NoFlags, (Opcode == AArch64::ADDSXri)); + MI.eraseFromParent(); + Offset = 0; + return true; + } + + int NewOffset; + unsigned UnscaledOp; + bool UseUnscaledOp; + int Status = isAArch64FrameOffsetLegal(MI, Offset, &UseUnscaledOp, + &UnscaledOp, &NewOffset); + if (Status & AArch64FrameOffsetCanUpdate) { + if (Status & AArch64FrameOffsetIsLegal) + // Replace the FrameIndex with FrameReg. + MI.getOperand(FrameRegIdx).ChangeToRegister(FrameReg, false); + if (UseUnscaledOp) + MI.setDesc(TII->get(UnscaledOp)); + + MI.getOperand(ImmIdx).ChangeToImmediate(NewOffset); + return Offset == 0; + } + + return false; +} + +void AArch64InstrInfo::getNoopForMachoTarget(MCInst &NopInst) const { + NopInst.setOpcode(AArch64::HINT); + NopInst.addOperand(MCOperand::CreateImm(0)); +} diff --git a/lib/Target/AArch64/AArch64InstrInfo.h b/lib/Target/AArch64/AArch64InstrInfo.h new file mode 100644 index 00000000000..90ce75f26d4 --- /dev/null +++ b/lib/Target/AArch64/AArch64InstrInfo.h @@ -0,0 +1,231 @@ +//===- AArch64InstrInfo.h - AArch64 Instruction Information -----*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file contains the AArch64 implementation of the TargetInstrInfo class. +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_TARGET_AArch64INSTRINFO_H +#define LLVM_TARGET_AArch64INSTRINFO_H + +#include "AArch64.h" +#include "AArch64RegisterInfo.h" +#include "llvm/Target/TargetInstrInfo.h" + +#define GET_INSTRINFO_HEADER +#include "AArch64GenInstrInfo.inc" + +namespace llvm { + +class AArch64Subtarget; +class AArch64TargetMachine; + +class AArch64InstrInfo : public AArch64GenInstrInfo { + // Reserve bits in the MachineMemOperand target hint flags, starting at 1. 
+ // They will be shifted into MOTargetHintStart when accessed. + enum TargetMemOperandFlags { + MOSuppressPair = 1 + }; + + const AArch64RegisterInfo RI; + const AArch64Subtarget &Subtarget; + +public: + explicit AArch64InstrInfo(const AArch64Subtarget &STI); + + /// getRegisterInfo - TargetInstrInfo is a superset of MRegister info. As + /// such, whenever a client has an instance of instruction info, it should + /// always be able to get register info as well (through this method). + const AArch64RegisterInfo &getRegisterInfo() const { return RI; } + + const AArch64Subtarget &getSubTarget() const { return Subtarget; } + + unsigned GetInstSizeInBytes(const MachineInstr *MI) const; + + bool isCoalescableExtInstr(const MachineInstr &MI, unsigned &SrcReg, + unsigned &DstReg, unsigned &SubIdx) const override; + + unsigned isLoadFromStackSlot(const MachineInstr *MI, + int &FrameIndex) const override; + unsigned isStoreToStackSlot(const MachineInstr *MI, + int &FrameIndex) const override; + + /// Returns true if there is a shiftable register and that the shift value + /// is non-zero. + bool hasShiftedReg(const MachineInstr *MI) const; + + /// Returns true if there is an extendable register and that the extending + /// value is non-zero. + bool hasExtendedReg(const MachineInstr *MI) const; + + /// \brief Does this instruction set its full destination register to zero? + bool isGPRZero(const MachineInstr *MI) const; + + /// \brief Does this instruction rename a GPR without modifying bits? + bool isGPRCopy(const MachineInstr *MI) const; + + /// \brief Does this instruction rename an FPR without modifying bits? + bool isFPRCopy(const MachineInstr *MI) const; + + /// Return true if this is load/store scales or extends its register offset. + /// This refers to scaling a dynamic index as opposed to scaled immediates. + /// MI should be a memory op that allows scaled addressing. + bool isScaledAddr(const MachineInstr *MI) const; + + /// Return true if pairing the given load or store is hinted to be + /// unprofitable. + bool isLdStPairSuppressed(const MachineInstr *MI) const; + + /// Hint that pairing the given load or store is unprofitable. 
+ void suppressLdStPair(MachineInstr *MI) const; + + bool getLdStBaseRegImmOfs(MachineInstr *LdSt, unsigned &BaseReg, + unsigned &Offset, + const TargetRegisterInfo *TRI) const override; + + bool enableClusterLoads() const override { return true; } + + bool shouldClusterLoads(MachineInstr *FirstLdSt, MachineInstr *SecondLdSt, + unsigned NumLoads) const override; + + bool shouldScheduleAdjacent(MachineInstr *First, + MachineInstr *Second) const override; + + MachineInstr *emitFrameIndexDebugValue(MachineFunction &MF, int FrameIx, + uint64_t Offset, const MDNode *MDPtr, + DebugLoc DL) const; + void copyPhysRegTuple(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, + DebugLoc DL, unsigned DestReg, unsigned SrcReg, + bool KillSrc, unsigned Opcode, + llvm::ArrayRef Indices) const; + void copyPhysReg(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, + DebugLoc DL, unsigned DestReg, unsigned SrcReg, + bool KillSrc) const override; + + void storeRegToStackSlot(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MBBI, unsigned SrcReg, + bool isKill, int FrameIndex, + const TargetRegisterClass *RC, + const TargetRegisterInfo *TRI) const override; + + void loadRegFromStackSlot(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MBBI, unsigned DestReg, + int FrameIndex, const TargetRegisterClass *RC, + const TargetRegisterInfo *TRI) const override; + + MachineInstr * + foldMemoryOperandImpl(MachineFunction &MF, MachineInstr *MI, + const SmallVectorImpl &Ops, + int FrameIndex) const override; + + bool AnalyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB, + MachineBasicBlock *&FBB, + SmallVectorImpl &Cond, + bool AllowModify = false) const override; + unsigned RemoveBranch(MachineBasicBlock &MBB) const override; + unsigned InsertBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB, + MachineBasicBlock *FBB, + const SmallVectorImpl &Cond, + DebugLoc DL) const override; + bool + ReverseBranchCondition(SmallVectorImpl &Cond) const override; + bool canInsertSelect(const MachineBasicBlock &, + const SmallVectorImpl &Cond, unsigned, + unsigned, int &, int &, int &) const override; + void insertSelect(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, + DebugLoc DL, unsigned DstReg, + const SmallVectorImpl &Cond, + unsigned TrueReg, unsigned FalseReg) const override; + void getNoopForMachoTarget(MCInst &NopInst) const override; + + /// analyzeCompare - For a comparison instruction, return the source registers + /// in SrcReg and SrcReg2, and the value it compares against in CmpValue. + /// Return true if the comparison instruction can be analyzed. + bool analyzeCompare(const MachineInstr *MI, unsigned &SrcReg, + unsigned &SrcReg2, int &CmpMask, + int &CmpValue) const override; + /// optimizeCompareInstr - Convert the instruction supplying the argument to + /// the comparison into one that sets the zero bit in the flags register. + bool optimizeCompareInstr(MachineInstr *CmpInstr, unsigned SrcReg, + unsigned SrcReg2, int CmpMask, int CmpValue, + const MachineRegisterInfo *MRI) const override; + +private: + void instantiateCondBranch(MachineBasicBlock &MBB, DebugLoc DL, + MachineBasicBlock *TBB, + const SmallVectorImpl &Cond) const; +}; + +/// emitFrameOffset - Emit instructions as needed to set DestReg to SrcReg +/// plus Offset. This is intended to be used from within the prolog/epilog +/// insertion (PEI) pass, where a virtual scratch register may be allocated +/// if necessary, to be replaced by the scavenger at the end of PEI. 
+void emitFrameOffset(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, + DebugLoc DL, unsigned DestReg, unsigned SrcReg, int Offset, + const AArch64InstrInfo *TII, + MachineInstr::MIFlag = MachineInstr::NoFlags, + bool SetNZCV = false); + +/// rewriteAArch64FrameIndex - Rewrite MI to access 'Offset' bytes from the +/// FP. Return false if the offset could not be handled directly in MI, and +/// return the left-over portion by reference. +bool rewriteAArch64FrameIndex(MachineInstr &MI, unsigned FrameRegIdx, + unsigned FrameReg, int &Offset, + const AArch64InstrInfo *TII); + +/// \brief Use to report the frame offset status in isAArch64FrameOffsetLegal. +enum AArch64FrameOffsetStatus { + AArch64FrameOffsetCannotUpdate = 0x0, ///< Offset cannot apply. + AArch64FrameOffsetIsLegal = 0x1, ///< Offset is legal. + AArch64FrameOffsetCanUpdate = 0x2 ///< Offset can apply, at least partly. +}; + +/// \brief Check if the @p Offset is a valid frame offset for @p MI. +/// The returned value reports the validity of the frame offset for @p MI. +/// It uses the values defined by AArch64FrameOffsetStatus for that. +/// If result == AArch64FrameOffsetCannotUpdate, @p MI cannot be updated to +/// use an offset.eq +/// If result & AArch64FrameOffsetIsLegal, @p Offset can completely be +/// rewriten in @p MI. +/// If result & AArch64FrameOffsetCanUpdate, @p Offset contains the +/// amount that is off the limit of the legal offset. +/// If set, @p OutUseUnscaledOp will contain the whether @p MI should be +/// turned into an unscaled operator, which opcode is in @p OutUnscaledOp. +/// If set, @p EmittableOffset contains the amount that can be set in @p MI +/// (possibly with @p OutUnscaledOp if OutUseUnscaledOp is true) and that +/// is a legal offset. +int isAArch64FrameOffsetLegal(const MachineInstr &MI, int &Offset, + bool *OutUseUnscaledOp = nullptr, + unsigned *OutUnscaledOp = nullptr, + int *EmittableOffset = nullptr); + +static inline bool isUncondBranchOpcode(int Opc) { return Opc == AArch64::B; } + +static inline bool isCondBranchOpcode(int Opc) { + switch (Opc) { + case AArch64::Bcc: + case AArch64::CBZW: + case AArch64::CBZX: + case AArch64::CBNZW: + case AArch64::CBNZX: + case AArch64::TBZW: + case AArch64::TBZX: + case AArch64::TBNZW: + case AArch64::TBNZX: + return true; + default: + return false; + } +} + +static inline bool isIndirectBranchOpcode(int Opc) { return Opc == AArch64::BR; } + +} // end namespace llvm + +#endif diff --git a/lib/Target/AArch64/AArch64InstrInfo.td b/lib/Target/AArch64/AArch64InstrInfo.td new file mode 100644 index 00000000000..9ad36e8740d --- /dev/null +++ b/lib/Target/AArch64/AArch64InstrInfo.td @@ -0,0 +1,5284 @@ +//=- AArch64InstrInfo.td - Describe the AArch64 Instructions -*- tablegen -*-=// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// AArch64 Instruction definitions. +// +//===----------------------------------------------------------------------===// + +//===----------------------------------------------------------------------===// +// ARM Instruction Predicate Definitions. 
+// +def HasFPARMv8 : Predicate<"Subtarget->hasFPARMv8()">, + AssemblerPredicate<"FeatureFPARMv8", "fp-armv8">; +def HasNEON : Predicate<"Subtarget->hasNEON()">, + AssemblerPredicate<"FeatureNEON", "neon">; +def HasCrypto : Predicate<"Subtarget->hasCrypto()">, + AssemblerPredicate<"FeatureCrypto", "crypto">; +def HasCRC : Predicate<"Subtarget->hasCRC()">, + AssemblerPredicate<"FeatureCRC", "crc">; +def IsLE : Predicate<"Subtarget->isLittleEndian()">; +def IsBE : Predicate<"!Subtarget->isLittleEndian()">; + +//===----------------------------------------------------------------------===// +// AArch64-specific DAG Nodes. +// + +// SDTBinaryArithWithFlagsOut - RES1, FLAGS = op LHS, RHS +def SDTBinaryArithWithFlagsOut : SDTypeProfile<2, 2, + [SDTCisSameAs<0, 2>, + SDTCisSameAs<0, 3>, + SDTCisInt<0>, SDTCisVT<1, i32>]>; + +// SDTBinaryArithWithFlagsIn - RES1, FLAGS = op LHS, RHS, FLAGS +def SDTBinaryArithWithFlagsIn : SDTypeProfile<1, 3, + [SDTCisSameAs<0, 1>, + SDTCisSameAs<0, 2>, + SDTCisInt<0>, + SDTCisVT<3, i32>]>; + +// SDTBinaryArithWithFlagsInOut - RES1, FLAGS = op LHS, RHS, FLAGS +def SDTBinaryArithWithFlagsInOut : SDTypeProfile<2, 3, + [SDTCisSameAs<0, 2>, + SDTCisSameAs<0, 3>, + SDTCisInt<0>, + SDTCisVT<1, i32>, + SDTCisVT<4, i32>]>; + +def SDT_AArch64Brcond : SDTypeProfile<0, 3, + [SDTCisVT<0, OtherVT>, SDTCisVT<1, i32>, + SDTCisVT<2, i32>]>; +def SDT_AArch64cbz : SDTypeProfile<0, 2, [SDTCisInt<0>, SDTCisVT<1, OtherVT>]>; +def SDT_AArch64tbz : SDTypeProfile<0, 3, [SDTCisInt<0>, SDTCisInt<1>, + SDTCisVT<2, OtherVT>]>; + + +def SDT_AArch64CSel : SDTypeProfile<1, 4, + [SDTCisSameAs<0, 1>, + SDTCisSameAs<0, 2>, + SDTCisInt<3>, + SDTCisVT<4, i32>]>; +def SDT_AArch64FCmp : SDTypeProfile<0, 2, + [SDTCisFP<0>, + SDTCisSameAs<0, 1>]>; +def SDT_AArch64Dup : SDTypeProfile<1, 1, [SDTCisVec<0>]>; +def SDT_AArch64DupLane : SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisInt<2>]>; +def SDT_AArch64Zip : SDTypeProfile<1, 2, [SDTCisVec<0>, + SDTCisSameAs<0, 1>, + SDTCisSameAs<0, 2>]>; +def SDT_AArch64MOVIedit : SDTypeProfile<1, 1, [SDTCisInt<1>]>; +def SDT_AArch64MOVIshift : SDTypeProfile<1, 2, [SDTCisInt<1>, SDTCisInt<2>]>; +def SDT_AArch64vecimm : SDTypeProfile<1, 3, [SDTCisVec<0>, SDTCisSameAs<0,1>, + SDTCisInt<2>, SDTCisInt<3>]>; +def SDT_AArch64UnaryVec: SDTypeProfile<1, 1, [SDTCisVec<0>, SDTCisSameAs<0,1>]>; +def SDT_AArch64ExtVec: SDTypeProfile<1, 3, [SDTCisVec<0>, SDTCisSameAs<0,1>, + SDTCisSameAs<0,2>, SDTCisInt<3>]>; +def SDT_AArch64vshift : SDTypeProfile<1, 2, [SDTCisSameAs<0,1>, SDTCisInt<2>]>; + +def SDT_AArch64unvec : SDTypeProfile<1, 1, [SDTCisVec<0>, SDTCisSameAs<0,1>]>; +def SDT_AArch64fcmpz : SDTypeProfile<1, 1, []>; +def SDT_AArch64fcmp : SDTypeProfile<1, 2, [SDTCisSameAs<1,2>]>; +def SDT_AArch64binvec : SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisSameAs<0,1>, + SDTCisSameAs<0,2>]>; +def SDT_AArch64trivec : SDTypeProfile<1, 3, [SDTCisVec<0>, SDTCisSameAs<0,1>, + SDTCisSameAs<0,2>, + SDTCisSameAs<0,3>]>; +def SDT_AArch64TCRET : SDTypeProfile<0, 2, [SDTCisPtrTy<0>]>; +def SDT_AArch64PREFETCH : SDTypeProfile<0, 2, [SDTCisVT<0, i32>, SDTCisPtrTy<1>]>; + +def SDT_AArch64ITOF : SDTypeProfile<1, 1, [SDTCisFP<0>, SDTCisSameAs<0,1>]>; + +def SDT_AArch64TLSDescCall : SDTypeProfile<0, -2, [SDTCisPtrTy<0>, + SDTCisPtrTy<1>]>; +def SDT_AArch64WrapperLarge : SDTypeProfile<1, 4, + [SDTCisVT<0, i64>, SDTCisVT<1, i32>, + SDTCisSameAs<1, 2>, SDTCisSameAs<1, 3>, + SDTCisSameAs<1, 4>]>; + + +// Node definitions. 
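The node definitions that follow bind the AArch64ISD opcodes to the type profiles above; in particular, the *WithFlagsOut profiles describe nodes that yield both the arithmetic result and a 32-bit NZCV value. A scalar model of what the 32-bit flag-setting add computes (illustrative C++, not LLVM code):

#include <cstdint>
#include <cstdio>
#include <utility>

// Model of a 32-bit ADDS: returns the sum plus an NZCV word laid out as
// N=bit31, Z=bit30, C=bit29, V=bit28 (the AArch64 flag positions).
std::pair<uint32_t, uint32_t> adds32(uint32_t A, uint32_t B) {
  uint32_t R = A + B;
  bool N = (R >> 31) != 0;
  bool Z = R == 0;
  bool C = R < A;                             // unsigned carry-out
  bool V = ((~(A ^ B) & (A ^ R)) >> 31) != 0; // signed overflow
  uint32_t NZCV = ((uint32_t)N << 31) | ((uint32_t)Z << 30) |
                  ((uint32_t)C << 29) | ((uint32_t)V << 28);
  return {R, NZCV};
}

int main() {
  auto [R, NZCV] = adds32(0xffffffffu, 1); // wraps to 0: Z and C set
  std::printf("result=%u NZCV=0x%08x\n", R, NZCV);
}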
+def AArch64adrp : SDNode<"AArch64ISD::ADRP", SDTIntUnaryOp, []>; +def AArch64addlow : SDNode<"AArch64ISD::ADDlow", SDTIntBinOp, []>; +def AArch64LOADgot : SDNode<"AArch64ISD::LOADgot", SDTIntUnaryOp>; +def AArch64callseq_start : SDNode<"ISD::CALLSEQ_START", + SDCallSeqStart<[ SDTCisVT<0, i32> ]>, + [SDNPHasChain, SDNPOutGlue]>; +def AArch64callseq_end : SDNode<"ISD::CALLSEQ_END", + SDCallSeqEnd<[ SDTCisVT<0, i32>, + SDTCisVT<1, i32> ]>, + [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue]>; +def AArch64call : SDNode<"AArch64ISD::CALL", + SDTypeProfile<0, -1, [SDTCisPtrTy<0>]>, + [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue, + SDNPVariadic]>; +def AArch64brcond : SDNode<"AArch64ISD::BRCOND", SDT_AArch64Brcond, + [SDNPHasChain]>; +def AArch64cbz : SDNode<"AArch64ISD::CBZ", SDT_AArch64cbz, + [SDNPHasChain]>; +def AArch64cbnz : SDNode<"AArch64ISD::CBNZ", SDT_AArch64cbz, + [SDNPHasChain]>; +def AArch64tbz : SDNode<"AArch64ISD::TBZ", SDT_AArch64tbz, + [SDNPHasChain]>; +def AArch64tbnz : SDNode<"AArch64ISD::TBNZ", SDT_AArch64tbz, + [SDNPHasChain]>; + + +def AArch64csel : SDNode<"AArch64ISD::CSEL", SDT_AArch64CSel>; +def AArch64csinv : SDNode<"AArch64ISD::CSINV", SDT_AArch64CSel>; +def AArch64csneg : SDNode<"AArch64ISD::CSNEG", SDT_AArch64CSel>; +def AArch64csinc : SDNode<"AArch64ISD::CSINC", SDT_AArch64CSel>; +def AArch64retflag : SDNode<"AArch64ISD::RET_FLAG", SDTNone, + [SDNPHasChain, SDNPOptInGlue, SDNPVariadic]>; +def AArch64adc : SDNode<"AArch64ISD::ADC", SDTBinaryArithWithFlagsIn >; +def AArch64sbc : SDNode<"AArch64ISD::SBC", SDTBinaryArithWithFlagsIn>; +def AArch64add_flag : SDNode<"AArch64ISD::ADDS", SDTBinaryArithWithFlagsOut, + [SDNPCommutative]>; +def AArch64sub_flag : SDNode<"AArch64ISD::SUBS", SDTBinaryArithWithFlagsOut>; +def AArch64and_flag : SDNode<"AArch64ISD::ANDS", SDTBinaryArithWithFlagsOut, + [SDNPCommutative]>; +def AArch64adc_flag : SDNode<"AArch64ISD::ADCS", SDTBinaryArithWithFlagsInOut>; +def AArch64sbc_flag : SDNode<"AArch64ISD::SBCS", SDTBinaryArithWithFlagsInOut>; + +def AArch64threadpointer : SDNode<"AArch64ISD::THREAD_POINTER", SDTPtrLeaf>; + +def AArch64fcmp : SDNode<"AArch64ISD::FCMP", SDT_AArch64FCmp>; + +def AArch64fmax : SDNode<"AArch64ISD::FMAX", SDTFPBinOp>; +def AArch64fmin : SDNode<"AArch64ISD::FMIN", SDTFPBinOp>; + +def AArch64dup : SDNode<"AArch64ISD::DUP", SDT_AArch64Dup>; +def AArch64duplane8 : SDNode<"AArch64ISD::DUPLANE8", SDT_AArch64DupLane>; +def AArch64duplane16 : SDNode<"AArch64ISD::DUPLANE16", SDT_AArch64DupLane>; +def AArch64duplane32 : SDNode<"AArch64ISD::DUPLANE32", SDT_AArch64DupLane>; +def AArch64duplane64 : SDNode<"AArch64ISD::DUPLANE64", SDT_AArch64DupLane>; + +def AArch64zip1 : SDNode<"AArch64ISD::ZIP1", SDT_AArch64Zip>; +def AArch64zip2 : SDNode<"AArch64ISD::ZIP2", SDT_AArch64Zip>; +def AArch64uzp1 : SDNode<"AArch64ISD::UZP1", SDT_AArch64Zip>; +def AArch64uzp2 : SDNode<"AArch64ISD::UZP2", SDT_AArch64Zip>; +def AArch64trn1 : SDNode<"AArch64ISD::TRN1", SDT_AArch64Zip>; +def AArch64trn2 : SDNode<"AArch64ISD::TRN2", SDT_AArch64Zip>; + +def AArch64movi_edit : SDNode<"AArch64ISD::MOVIedit", SDT_AArch64MOVIedit>; +def AArch64movi_shift : SDNode<"AArch64ISD::MOVIshift", SDT_AArch64MOVIshift>; +def AArch64movi_msl : SDNode<"AArch64ISD::MOVImsl", SDT_AArch64MOVIshift>; +def AArch64mvni_shift : SDNode<"AArch64ISD::MVNIshift", SDT_AArch64MOVIshift>; +def AArch64mvni_msl : SDNode<"AArch64ISD::MVNImsl", SDT_AArch64MOVIshift>; +def AArch64movi : SDNode<"AArch64ISD::MOVI", SDT_AArch64MOVIedit>; +def AArch64fmov : SDNode<"AArch64ISD::FMOV", SDT_AArch64MOVIedit>; + 
+def AArch64rev16 : SDNode<"AArch64ISD::REV16", SDT_AArch64UnaryVec>; +def AArch64rev32 : SDNode<"AArch64ISD::REV32", SDT_AArch64UnaryVec>; +def AArch64rev64 : SDNode<"AArch64ISD::REV64", SDT_AArch64UnaryVec>; +def AArch64ext : SDNode<"AArch64ISD::EXT", SDT_AArch64ExtVec>; + +def AArch64vashr : SDNode<"AArch64ISD::VASHR", SDT_AArch64vshift>; +def AArch64vlshr : SDNode<"AArch64ISD::VLSHR", SDT_AArch64vshift>; +def AArch64vshl : SDNode<"AArch64ISD::VSHL", SDT_AArch64vshift>; +def AArch64sqshli : SDNode<"AArch64ISD::SQSHL_I", SDT_AArch64vshift>; +def AArch64uqshli : SDNode<"AArch64ISD::UQSHL_I", SDT_AArch64vshift>; +def AArch64sqshlui : SDNode<"AArch64ISD::SQSHLU_I", SDT_AArch64vshift>; +def AArch64srshri : SDNode<"AArch64ISD::SRSHR_I", SDT_AArch64vshift>; +def AArch64urshri : SDNode<"AArch64ISD::URSHR_I", SDT_AArch64vshift>; + +def AArch64not: SDNode<"AArch64ISD::NOT", SDT_AArch64unvec>; +def AArch64bit: SDNode<"AArch64ISD::BIT", SDT_AArch64trivec>; +def AArch64bsl: SDNode<"AArch64ISD::BSL", SDT_AArch64trivec>; + +def AArch64cmeq: SDNode<"AArch64ISD::CMEQ", SDT_AArch64binvec>; +def AArch64cmge: SDNode<"AArch64ISD::CMGE", SDT_AArch64binvec>; +def AArch64cmgt: SDNode<"AArch64ISD::CMGT", SDT_AArch64binvec>; +def AArch64cmhi: SDNode<"AArch64ISD::CMHI", SDT_AArch64binvec>; +def AArch64cmhs: SDNode<"AArch64ISD::CMHS", SDT_AArch64binvec>; + +def AArch64fcmeq: SDNode<"AArch64ISD::FCMEQ", SDT_AArch64fcmp>; +def AArch64fcmge: SDNode<"AArch64ISD::FCMGE", SDT_AArch64fcmp>; +def AArch64fcmgt: SDNode<"AArch64ISD::FCMGT", SDT_AArch64fcmp>; + +def AArch64cmeqz: SDNode<"AArch64ISD::CMEQz", SDT_AArch64unvec>; +def AArch64cmgez: SDNode<"AArch64ISD::CMGEz", SDT_AArch64unvec>; +def AArch64cmgtz: SDNode<"AArch64ISD::CMGTz", SDT_AArch64unvec>; +def AArch64cmlez: SDNode<"AArch64ISD::CMLEz", SDT_AArch64unvec>; +def AArch64cmltz: SDNode<"AArch64ISD::CMLTz", SDT_AArch64unvec>; +def AArch64cmtst : PatFrag<(ops node:$LHS, node:$RHS), + (AArch64not (AArch64cmeqz (and node:$LHS, node:$RHS)))>; + +def AArch64fcmeqz: SDNode<"AArch64ISD::FCMEQz", SDT_AArch64fcmpz>; +def AArch64fcmgez: SDNode<"AArch64ISD::FCMGEz", SDT_AArch64fcmpz>; +def AArch64fcmgtz: SDNode<"AArch64ISD::FCMGTz", SDT_AArch64fcmpz>; +def AArch64fcmlez: SDNode<"AArch64ISD::FCMLEz", SDT_AArch64fcmpz>; +def AArch64fcmltz: SDNode<"AArch64ISD::FCMLTz", SDT_AArch64fcmpz>; + +def AArch64bici: SDNode<"AArch64ISD::BICi", SDT_AArch64vecimm>; +def AArch64orri: SDNode<"AArch64ISD::ORRi", SDT_AArch64vecimm>; + +def AArch64neg : SDNode<"AArch64ISD::NEG", SDT_AArch64unvec>; + +def AArch64tcret: SDNode<"AArch64ISD::TC_RETURN", SDT_AArch64TCRET, + [SDNPHasChain, SDNPOptInGlue, SDNPVariadic]>; + +def AArch64Prefetch : SDNode<"AArch64ISD::PREFETCH", SDT_AArch64PREFETCH, + [SDNPHasChain, SDNPSideEffect]>; + +def AArch64sitof: SDNode<"AArch64ISD::SITOF", SDT_AArch64ITOF>; +def AArch64uitof: SDNode<"AArch64ISD::UITOF", SDT_AArch64ITOF>; + +def AArch64tlsdesc_call : SDNode<"AArch64ISD::TLSDESC_CALL", + SDT_AArch64TLSDescCall, + [SDNPInGlue, SDNPOutGlue, SDNPHasChain, + SDNPVariadic]>; + +def AArch64WrapperLarge : SDNode<"AArch64ISD::WrapperLarge", + SDT_AArch64WrapperLarge>; + + +//===----------------------------------------------------------------------===// + +//===----------------------------------------------------------------------===// + +// AArch64 Instruction Predicate Definitions. 
+// +def HasZCZ : Predicate<"Subtarget->hasZeroCycleZeroing()">; +def NoZCZ : Predicate<"!Subtarget->hasZeroCycleZeroing()">; +def IsDarwin : Predicate<"Subtarget->isTargetDarwin()">; +def IsNotDarwin: Predicate<"!Subtarget->isTargetDarwin()">; +def ForCodeSize : Predicate<"ForCodeSize">; +def NotForCodeSize : Predicate<"!ForCodeSize">; + +include "AArch64InstrFormats.td" + +//===----------------------------------------------------------------------===// + +//===----------------------------------------------------------------------===// +// Miscellaneous instructions. +//===----------------------------------------------------------------------===// + +let Defs = [SP], Uses = [SP], hasSideEffects = 1, isCodeGenOnly = 1 in { +def ADJCALLSTACKDOWN : Pseudo<(outs), (ins i32imm:$amt), + [(AArch64callseq_start timm:$amt)]>; +def ADJCALLSTACKUP : Pseudo<(outs), (ins i32imm:$amt1, i32imm:$amt2), + [(AArch64callseq_end timm:$amt1, timm:$amt2)]>; +} // Defs = [SP], Uses = [SP], hasSideEffects = 1, isCodeGenOnly = 1 + +let isReMaterializable = 1, isCodeGenOnly = 1 in { +// FIXME: The following pseudo instructions are only needed because remat +// cannot handle multiple instructions. When that changes, they can be +// removed, along with the AArch64Wrapper node. + +let AddedComplexity = 10 in +def LOADgot : Pseudo<(outs GPR64:$dst), (ins i64imm:$addr), + [(set GPR64:$dst, (AArch64LOADgot tglobaladdr:$addr))]>, + Sched<[WriteLDAdr]>; + +// The MOVaddr instruction should match only when the add is not folded +// into a load or store address. +def MOVaddr + : Pseudo<(outs GPR64:$dst), (ins i64imm:$hi, i64imm:$low), + [(set GPR64:$dst, (AArch64addlow (AArch64adrp tglobaladdr:$hi), + tglobaladdr:$low))]>, + Sched<[WriteAdrAdr]>; +def MOVaddrJT + : Pseudo<(outs GPR64:$dst), (ins i64imm:$hi, i64imm:$low), + [(set GPR64:$dst, (AArch64addlow (AArch64adrp tjumptable:$hi), + tjumptable:$low))]>, + Sched<[WriteAdrAdr]>; +def MOVaddrCP + : Pseudo<(outs GPR64:$dst), (ins i64imm:$hi, i64imm:$low), + [(set GPR64:$dst, (AArch64addlow (AArch64adrp tconstpool:$hi), + tconstpool:$low))]>, + Sched<[WriteAdrAdr]>; +def MOVaddrBA + : Pseudo<(outs GPR64:$dst), (ins i64imm:$hi, i64imm:$low), + [(set GPR64:$dst, (AArch64addlow (AArch64adrp tblockaddress:$hi), + tblockaddress:$low))]>, + Sched<[WriteAdrAdr]>; +def MOVaddrTLS + : Pseudo<(outs GPR64:$dst), (ins i64imm:$hi, i64imm:$low), + [(set GPR64:$dst, (AArch64addlow (AArch64adrp tglobaltlsaddr:$hi), + tglobaltlsaddr:$low))]>, + Sched<[WriteAdrAdr]>; +def MOVaddrEXT + : Pseudo<(outs GPR64:$dst), (ins i64imm:$hi, i64imm:$low), + [(set GPR64:$dst, (AArch64addlow (AArch64adrp texternalsym:$hi), + texternalsym:$low))]>, + Sched<[WriteAdrAdr]>; + +} // isReMaterializable, isCodeGenOnly + +def : Pat<(AArch64LOADgot tglobaltlsaddr:$addr), + (LOADgot tglobaltlsaddr:$addr)>; + +def : Pat<(AArch64LOADgot texternalsym:$addr), + (LOADgot texternalsym:$addr)>; + +def : Pat<(AArch64LOADgot tconstpool:$addr), + (LOADgot tconstpool:$addr)>; + +//===----------------------------------------------------------------------===// +// System instructions. +//===----------------------------------------------------------------------===// + +def HINT : HintI<"hint">; +def : InstAlias<"nop", (HINT 0b000)>; +def : InstAlias<"yield",(HINT 0b001)>; +def : InstAlias<"wfe", (HINT 0b010)>; +def : InstAlias<"wfi", (HINT 0b011)>; +def : InstAlias<"sev", (HINT 0b100)>; +def : InstAlias<"sevl", (HINT 0b101)>; + + // As far as LLVM is concerned this writes to the system's exclusive monitors. 
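For context on that comment: the exclusive monitors are the state behind AArch64's load-exclusive/store-exclusive (LDXR/STXR) retry loops, which is how atomic read-modify-write is built on ARMv8.0, and CLREX clears them. A C++-level equivalent of such a loop (illustrative; the compare-exchange lowers to an LDXR/STXR pair on these cores):

#include <atomic>
#include <cstdio>

// Portable equivalent of an LDXR/STXR retry loop: keep trying the
// compare-exchange until the exclusive store succeeds.
int fetch_add_seqcst(std::atomic<int> &V, int Delta) {
  int Old = V.load();
  while (!V.compare_exchange_weak(Old, Old + Delta))
    ; // a failed exclusive store (or a cleared monitor) just retries
  return Old;
}

int main() {
  std::atomic<int> V{41};
  int Old = fetch_add_seqcst(V, 1);
  std::printf("%d -> %d\n", Old, V.load());
}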
+let mayLoad = 1, mayStore = 1 in +def CLREX : CRmSystemI; + +def DMB : CRmSystemI; +def DSB : CRmSystemI; +def ISB : CRmSystemI; +def : InstAlias<"clrex", (CLREX 0xf)>; +def : InstAlias<"isb", (ISB 0xf)>; + +def MRS : MRSI; +def MSR : MSRI; +def MSRpstate: MSRpstateI; + +// The thread pointer (on Linux, at least, where this has been implemented) is +// TPIDR_EL0. +def : Pat<(AArch64threadpointer), (MRS 0xde82)>; + +// Generic system instructions +def SYSxt : SystemXtI<0, "sys">; +def SYSLxt : SystemLXtI<1, "sysl">; + +def : InstAlias<"sys $op1, $Cn, $Cm, $op2", + (SYSxt imm0_7:$op1, sys_cr_op:$Cn, + sys_cr_op:$Cm, imm0_7:$op2, XZR)>; + +//===----------------------------------------------------------------------===// +// Move immediate instructions. +//===----------------------------------------------------------------------===// + +defm MOVK : InsertImmediate<0b11, "movk">; +defm MOVN : MoveImmediate<0b00, "movn">; + +let PostEncoderMethod = "fixMOVZ" in +defm MOVZ : MoveImmediate<0b10, "movz">; + +// First group of aliases covers an implicit "lsl #0". +def : InstAlias<"movk $dst, $imm", (MOVKWi GPR32:$dst, imm0_65535:$imm, 0)>; +def : InstAlias<"movk $dst, $imm", (MOVKXi GPR64:$dst, imm0_65535:$imm, 0)>; +def : InstAlias<"movn $dst, $imm", (MOVNWi GPR32:$dst, imm0_65535:$imm, 0)>; +def : InstAlias<"movn $dst, $imm", (MOVNXi GPR64:$dst, imm0_65535:$imm, 0)>; +def : InstAlias<"movz $dst, $imm", (MOVZWi GPR32:$dst, imm0_65535:$imm, 0)>; +def : InstAlias<"movz $dst, $imm", (MOVZXi GPR64:$dst, imm0_65535:$imm, 0)>; + +// Next, we have various ELF relocations with the ":XYZ_g0:sym" syntax. +def : InstAlias<"movz $Rd, $sym", (MOVZXi GPR64:$Rd, movz_symbol_g3:$sym, 48)>; +def : InstAlias<"movz $Rd, $sym", (MOVZXi GPR64:$Rd, movz_symbol_g2:$sym, 32)>; +def : InstAlias<"movz $Rd, $sym", (MOVZXi GPR64:$Rd, movz_symbol_g1:$sym, 16)>; +def : InstAlias<"movz $Rd, $sym", (MOVZXi GPR64:$Rd, movz_symbol_g0:$sym, 0)>; + +def : InstAlias<"movn $Rd, $sym", (MOVNXi GPR64:$Rd, movz_symbol_g3:$sym, 48)>; +def : InstAlias<"movn $Rd, $sym", (MOVNXi GPR64:$Rd, movz_symbol_g2:$sym, 32)>; +def : InstAlias<"movn $Rd, $sym", (MOVNXi GPR64:$Rd, movz_symbol_g1:$sym, 16)>; +def : InstAlias<"movn $Rd, $sym", (MOVNXi GPR64:$Rd, movz_symbol_g0:$sym, 0)>; + +def : InstAlias<"movk $Rd, $sym", (MOVKXi GPR64:$Rd, movk_symbol_g3:$sym, 48)>; +def : InstAlias<"movk $Rd, $sym", (MOVKXi GPR64:$Rd, movk_symbol_g2:$sym, 32)>; +def : InstAlias<"movk $Rd, $sym", (MOVKXi GPR64:$Rd, movk_symbol_g1:$sym, 16)>; +def : InstAlias<"movk $Rd, $sym", (MOVKXi GPR64:$Rd, movk_symbol_g0:$sym, 0)>; + +def : InstAlias<"movz $Rd, $sym", (MOVZWi GPR32:$Rd, movz_symbol_g1:$sym, 16)>; +def : InstAlias<"movz $Rd, $sym", (MOVZWi GPR32:$Rd, movz_symbol_g0:$sym, 0)>; + +def : InstAlias<"movn $Rd, $sym", (MOVNWi GPR32:$Rd, movz_symbol_g1:$sym, 16)>; +def : InstAlias<"movn $Rd, $sym", (MOVNWi GPR32:$Rd, movz_symbol_g0:$sym, 0)>; + +def : InstAlias<"movk $Rd, $sym", (MOVKWi GPR32:$Rd, movk_symbol_g1:$sym, 16)>; +def : InstAlias<"movk $Rd, $sym", (MOVKWi GPR32:$Rd, movk_symbol_g0:$sym, 0)>; + +// Final group of aliases covers true "mov $Rd, $imm" cases. 
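Background for these aliases and the multiclass below: a general "mov Rd, #imm" is assembled to whichever of MOVZ/MOVN (plus MOVK for further pieces) can build the constant from 16-bit chunks at shifts of 0/16/32/48. A tiny standalone dump of those chunks for one constant (illustrative only, not the actual alias-selection logic):

#include <cstdint>
#include <cstdio>

// Print the four 16-bit chunks a 64-bit immediate splits into; MOVZ/MOVN set
// the first interesting chunk and MOVK patches in the rest.
int main() {
  uint64_t Imm = 0x00cafe00deadbeefULL;
  for (int Shift = 0; Shift < 64; Shift += 16)
    std::printf("lsl #%-2d -> 0x%04llx\n", Shift,
                (unsigned long long)((Imm >> Shift) & 0xffff));
}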
+multiclass movw_mov_alias { + def _asmoperand : AsmOperandClass { + let Name = basename # width # "_lsl" # shift # "MovAlias"; + let PredicateMethod = "is" # basename # "MovAlias<" # width # ", " + # shift # ">"; + let RenderMethod = "add" # basename # "MovAliasOperands<" # shift # ">"; + } + + def _movimm : Operand { + let ParserMatchClass = !cast(NAME # "_asmoperand"); + } + + def : InstAlias<"mov $Rd, $imm", + (INST GPR:$Rd, !cast(NAME # "_movimm"):$imm, shift)>; +} + +defm : movw_mov_alias<"MOVZ", MOVZWi, GPR32, 32, 0>; +defm : movw_mov_alias<"MOVZ", MOVZWi, GPR32, 32, 16>; + +defm : movw_mov_alias<"MOVZ", MOVZXi, GPR64, 64, 0>; +defm : movw_mov_alias<"MOVZ", MOVZXi, GPR64, 64, 16>; +defm : movw_mov_alias<"MOVZ", MOVZXi, GPR64, 64, 32>; +defm : movw_mov_alias<"MOVZ", MOVZXi, GPR64, 64, 48>; + +defm : movw_mov_alias<"MOVN", MOVNWi, GPR32, 32, 0>; +defm : movw_mov_alias<"MOVN", MOVNWi, GPR32, 32, 16>; + +defm : movw_mov_alias<"MOVN", MOVNXi, GPR64, 64, 0>; +defm : movw_mov_alias<"MOVN", MOVNXi, GPR64, 64, 16>; +defm : movw_mov_alias<"MOVN", MOVNXi, GPR64, 64, 32>; +defm : movw_mov_alias<"MOVN", MOVNXi, GPR64, 64, 48>; + +let isReMaterializable = 1, isCodeGenOnly = 1, isMoveImm = 1, + isAsCheapAsAMove = 1 in { +// FIXME: The following pseudo instructions are only needed because remat +// cannot handle multiple instructions. When that changes, we can select +// directly to the real instructions and get rid of these pseudos. + +def MOVi32imm + : Pseudo<(outs GPR32:$dst), (ins i32imm:$src), + [(set GPR32:$dst, imm:$src)]>, + Sched<[WriteImm]>; +def MOVi64imm + : Pseudo<(outs GPR64:$dst), (ins i64imm:$src), + [(set GPR64:$dst, imm:$src)]>, + Sched<[WriteImm]>; +} // isReMaterializable, isCodeGenOnly + +// If possible, we want to use MOVi32imm even for 64-bit moves. This gives the +// eventual expansion code fewer bits to worry about getting right. Marshalling +// the types is a little tricky though: +def i64imm_32bit : ImmLeaf(Imm); +}]>; + +def trunc_imm : SDNodeXFormgetTargetConstant(N->getZExtValue(), MVT::i32); +}]>; + +def : Pat<(i64 i64imm_32bit:$src), + (SUBREG_TO_REG (i64 0), (MOVi32imm (trunc_imm imm:$src)), sub_32)>; + +// Deal with the various forms of (ELF) large addressing with MOVZ/MOVK +// sequences. +def : Pat<(AArch64WrapperLarge tglobaladdr:$g3, tglobaladdr:$g2, + tglobaladdr:$g1, tglobaladdr:$g0), + (MOVKXi (MOVKXi (MOVKXi (MOVZXi tglobaladdr:$g3, 48), + tglobaladdr:$g2, 32), + tglobaladdr:$g1, 16), + tglobaladdr:$g0, 0)>; + +def : Pat<(AArch64WrapperLarge tblockaddress:$g3, tblockaddress:$g2, + tblockaddress:$g1, tblockaddress:$g0), + (MOVKXi (MOVKXi (MOVKXi (MOVZXi tblockaddress:$g3, 48), + tblockaddress:$g2, 32), + tblockaddress:$g1, 16), + tblockaddress:$g0, 0)>; + +def : Pat<(AArch64WrapperLarge tconstpool:$g3, tconstpool:$g2, + tconstpool:$g1, tconstpool:$g0), + (MOVKXi (MOVKXi (MOVKXi (MOVZXi tconstpool:$g3, 48), + tconstpool:$g2, 32), + tconstpool:$g1, 16), + tconstpool:$g0, 0)>; + +def : Pat<(AArch64WrapperLarge tjumptable:$g3, tjumptable:$g2, + tjumptable:$g1, tjumptable:$g0), + (MOVKXi (MOVKXi (MOVKXi (MOVZXi tjumptable:$g3, 48), + tjumptable:$g2, 32), + tjumptable:$g1, 16), + tjumptable:$g0, 0)>; + + +//===----------------------------------------------------------------------===// +// Arithmetic instructions. +//===----------------------------------------------------------------------===// + +// Add/subtract with carry. 
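As a reminder of why the carry-aware nodes exist: a 128-bit add is lowered to an ADDS on the low halves (producing the carry) followed by an ADC/ADCS on the high halves. A portable sketch of that split (illustrative C++):

#include <cstdint>
#include <cstdio>

// 128-bit addition out of 64-bit pieces: the low add provides the carry the
// high add consumes, which is the ADDS/ADC pairing the defs below enable.
int main() {
  uint64_t ALo = ~0ULL, AHi = 1, BLo = 1, BHi = 2;
  uint64_t Lo = ALo + BLo;         // ADDS xLo, ...
  uint64_t Carry = Lo < ALo;       // carry-out of the low half
  uint64_t Hi = AHi + BHi + Carry; // ADC xHi, ...
  std::printf("hi=0x%llx lo=0x%llx\n", (unsigned long long)Hi,
              (unsigned long long)Lo); // hi=0x4 lo=0x0
}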
+defm ADC : AddSubCarry<0, "adc", "adcs", AArch64adc, AArch64adc_flag>; +defm SBC : AddSubCarry<1, "sbc", "sbcs", AArch64sbc, AArch64sbc_flag>; + +def : InstAlias<"ngc $dst, $src", (SBCWr GPR32:$dst, WZR, GPR32:$src)>; +def : InstAlias<"ngc $dst, $src", (SBCXr GPR64:$dst, XZR, GPR64:$src)>; +def : InstAlias<"ngcs $dst, $src", (SBCSWr GPR32:$dst, WZR, GPR32:$src)>; +def : InstAlias<"ngcs $dst, $src", (SBCSXr GPR64:$dst, XZR, GPR64:$src)>; + +// Add/subtract +defm ADD : AddSub<0, "add", add>; +defm SUB : AddSub<1, "sub">; + +def : InstAlias<"mov $dst, $src", + (ADDWri GPR32sponly:$dst, GPR32sp:$src, 0, 0)>; +def : InstAlias<"mov $dst, $src", + (ADDWri GPR32sp:$dst, GPR32sponly:$src, 0, 0)>; +def : InstAlias<"mov $dst, $src", + (ADDXri GPR64sponly:$dst, GPR64sp:$src, 0, 0)>; +def : InstAlias<"mov $dst, $src", + (ADDXri GPR64sp:$dst, GPR64sponly:$src, 0, 0)>; + +defm ADDS : AddSubS<0, "adds", AArch64add_flag, "cmn">; +defm SUBS : AddSubS<1, "subs", AArch64sub_flag, "cmp">; + +// Use SUBS instead of SUB to enable CSE between SUBS and SUB. +def : Pat<(sub GPR32sp:$Rn, addsub_shifted_imm32:$imm), + (SUBSWri GPR32sp:$Rn, addsub_shifted_imm32:$imm)>; +def : Pat<(sub GPR64sp:$Rn, addsub_shifted_imm64:$imm), + (SUBSXri GPR64sp:$Rn, addsub_shifted_imm64:$imm)>; +def : Pat<(sub GPR32:$Rn, GPR32:$Rm), + (SUBSWrr GPR32:$Rn, GPR32:$Rm)>; +def : Pat<(sub GPR64:$Rn, GPR64:$Rm), + (SUBSXrr GPR64:$Rn, GPR64:$Rm)>; +def : Pat<(sub GPR32:$Rn, arith_shifted_reg32:$Rm), + (SUBSWrs GPR32:$Rn, arith_shifted_reg32:$Rm)>; +def : Pat<(sub GPR64:$Rn, arith_shifted_reg64:$Rm), + (SUBSXrs GPR64:$Rn, arith_shifted_reg64:$Rm)>; +def : Pat<(sub GPR32sp:$R2, arith_extended_reg32:$R3), + (SUBSWrx GPR32sp:$R2, arith_extended_reg32:$R3)>; +def : Pat<(sub GPR64sp:$R2, arith_extended_reg32to64:$R3), + (SUBSXrx GPR64sp:$R2, arith_extended_reg32to64:$R3)>; + +// Because of the immediate format for add/sub-imm instructions, the +// expression (add x, -1) must be transformed to (SUB{W,X}ri x, 1). +// These patterns capture that transformation. +let AddedComplexity = 1 in { +def : Pat<(add GPR32:$Rn, neg_addsub_shifted_imm32:$imm), + (SUBSWri GPR32:$Rn, neg_addsub_shifted_imm32:$imm)>; +def : Pat<(add GPR64:$Rn, neg_addsub_shifted_imm64:$imm), + (SUBSXri GPR64:$Rn, neg_addsub_shifted_imm64:$imm)>; +def : Pat<(sub GPR32:$Rn, neg_addsub_shifted_imm32:$imm), + (ADDWri GPR32:$Rn, neg_addsub_shifted_imm32:$imm)>; +def : Pat<(sub GPR64:$Rn, neg_addsub_shifted_imm64:$imm), + (ADDXri GPR64:$Rn, neg_addsub_shifted_imm64:$imm)>; +} + +// Because of the immediate format for add/sub-imm instructions, the +// expression (add x, -1) must be transformed to (SUB{W,X}ri x, 1). +// These patterns capture that transformation. 
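Since the add/sub immediate field is an unsigned 12-bit value (optionally shifted left by 12), a negative addend can only be encoded by flipping the opcode, which is what the patterns around this comment do. A trivial standalone sketch of that rewrite (illustrative):

#include <cstdio>

// Pick the encodable form: "add Rd, Rn, #imm" for imm in [0, 4095], otherwise
// flip to "sub" with the negated immediate, as the patterns here do.
int main() {
  int Addend = -1;
  bool UseSub = Addend < 0;
  unsigned Imm12 = UseSub ? (unsigned)(-Addend) : (unsigned)Addend;
  std::printf("%s w0, w0, #%u\n", UseSub ? "sub" : "add", Imm12);
}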
+let AddedComplexity = 1 in { +def : Pat<(AArch64add_flag GPR32:$Rn, neg_addsub_shifted_imm32:$imm), + (SUBSWri GPR32:$Rn, neg_addsub_shifted_imm32:$imm)>; +def : Pat<(AArch64add_flag GPR64:$Rn, neg_addsub_shifted_imm64:$imm), + (SUBSXri GPR64:$Rn, neg_addsub_shifted_imm64:$imm)>; +def : Pat<(AArch64sub_flag GPR32:$Rn, neg_addsub_shifted_imm32:$imm), + (ADDSWri GPR32:$Rn, neg_addsub_shifted_imm32:$imm)>; +def : Pat<(AArch64sub_flag GPR64:$Rn, neg_addsub_shifted_imm64:$imm), + (ADDSXri GPR64:$Rn, neg_addsub_shifted_imm64:$imm)>; +} + +def : InstAlias<"neg $dst, $src", (SUBWrs GPR32:$dst, WZR, GPR32:$src, 0), 3>; +def : InstAlias<"neg $dst, $src", (SUBXrs GPR64:$dst, XZR, GPR64:$src, 0), 3>; +def : InstAlias<"neg $dst, $src$shift", + (SUBWrs GPR32:$dst, WZR, GPR32:$src, arith_shift32:$shift), 2>; +def : InstAlias<"neg $dst, $src$shift", + (SUBXrs GPR64:$dst, XZR, GPR64:$src, arith_shift64:$shift), 2>; + +def : InstAlias<"negs $dst, $src", (SUBSWrs GPR32:$dst, WZR, GPR32:$src, 0), 3>; +def : InstAlias<"negs $dst, $src", (SUBSXrs GPR64:$dst, XZR, GPR64:$src, 0), 3>; +def : InstAlias<"negs $dst, $src$shift", + (SUBSWrs GPR32:$dst, WZR, GPR32:$src, arith_shift32:$shift), 2>; +def : InstAlias<"negs $dst, $src$shift", + (SUBSXrs GPR64:$dst, XZR, GPR64:$src, arith_shift64:$shift), 2>; + + +// Unsigned/Signed divide +defm UDIV : Div<0, "udiv", udiv>; +defm SDIV : Div<1, "sdiv", sdiv>; +let isCodeGenOnly = 1 in { +defm UDIV_Int : Div<0, "udiv", int_aarch64_udiv>; +defm SDIV_Int : Div<1, "sdiv", int_aarch64_sdiv>; +} + +// Variable shift +defm ASRV : Shift<0b10, "asr", sra>; +defm LSLV : Shift<0b00, "lsl", shl>; +defm LSRV : Shift<0b01, "lsr", srl>; +defm RORV : Shift<0b11, "ror", rotr>; + +def : ShiftAlias<"asrv", ASRVWr, GPR32>; +def : ShiftAlias<"asrv", ASRVXr, GPR64>; +def : ShiftAlias<"lslv", LSLVWr, GPR32>; +def : ShiftAlias<"lslv", LSLVXr, GPR64>; +def : ShiftAlias<"lsrv", LSRVWr, GPR32>; +def : ShiftAlias<"lsrv", LSRVXr, GPR64>; +def : ShiftAlias<"rorv", RORVWr, GPR32>; +def : ShiftAlias<"rorv", RORVXr, GPR64>; + +// Multiply-add +let AddedComplexity = 7 in { +defm MADD : MulAccum<0, "madd", add>; +defm MSUB : MulAccum<1, "msub", sub>; + +def : Pat<(i32 (mul GPR32:$Rn, GPR32:$Rm)), + (MADDWrrr GPR32:$Rn, GPR32:$Rm, WZR)>; +def : Pat<(i64 (mul GPR64:$Rn, GPR64:$Rm)), + (MADDXrrr GPR64:$Rn, GPR64:$Rm, XZR)>; + +def : Pat<(i32 (ineg (mul GPR32:$Rn, GPR32:$Rm))), + (MSUBWrrr GPR32:$Rn, GPR32:$Rm, WZR)>; +def : Pat<(i64 (ineg (mul GPR64:$Rn, GPR64:$Rm))), + (MSUBXrrr GPR64:$Rn, GPR64:$Rm, XZR)>; +} // AddedComplexity = 7 + +let AddedComplexity = 5 in { +def SMADDLrrr : WideMulAccum<0, 0b001, "smaddl", add, sext>; +def SMSUBLrrr : WideMulAccum<1, 0b001, "smsubl", sub, sext>; +def UMADDLrrr : WideMulAccum<0, 0b101, "umaddl", add, zext>; +def UMSUBLrrr : WideMulAccum<1, 0b101, "umsubl", sub, zext>; + +def : Pat<(i64 (mul (sext GPR32:$Rn), (sext GPR32:$Rm))), + (SMADDLrrr GPR32:$Rn, GPR32:$Rm, XZR)>; +def : Pat<(i64 (mul (zext GPR32:$Rn), (zext GPR32:$Rm))), + (UMADDLrrr GPR32:$Rn, GPR32:$Rm, XZR)>; + +def : Pat<(i64 (ineg (mul (sext GPR32:$Rn), (sext GPR32:$Rm)))), + (SMSUBLrrr GPR32:$Rn, GPR32:$Rm, XZR)>; +def : Pat<(i64 (ineg (mul (zext GPR32:$Rn), (zext GPR32:$Rm)))), + (UMSUBLrrr GPR32:$Rn, GPR32:$Rm, XZR)>; +} // AddedComplexity = 5 + +def : MulAccumWAlias<"mul", MADDWrrr>; +def : MulAccumXAlias<"mul", MADDXrrr>; +def : MulAccumWAlias<"mneg", MSUBWrrr>; +def : MulAccumXAlias<"mneg", MSUBXrrr>; +def : WideMulAccumAlias<"smull", SMADDLrrr>; +def : WideMulAccumAlias<"smnegl", SMSUBLrrr>; +def : 
WideMulAccumAlias<"umull", UMADDLrrr>; +def : WideMulAccumAlias<"umnegl", UMSUBLrrr>; + +// Multiply-high +def SMULHrr : MulHi<0b010, "smulh", mulhs>; +def UMULHrr : MulHi<0b110, "umulh", mulhu>; + +// CRC32 +def CRC32Brr : BaseCRC32<0, 0b00, 0, GPR32, int_aarch64_crc32b, "crc32b">; +def CRC32Hrr : BaseCRC32<0, 0b01, 0, GPR32, int_aarch64_crc32h, "crc32h">; +def CRC32Wrr : BaseCRC32<0, 0b10, 0, GPR32, int_aarch64_crc32w, "crc32w">; +def CRC32Xrr : BaseCRC32<1, 0b11, 0, GPR64, int_aarch64_crc32x, "crc32x">; + +def CRC32CBrr : BaseCRC32<0, 0b00, 1, GPR32, int_aarch64_crc32cb, "crc32cb">; +def CRC32CHrr : BaseCRC32<0, 0b01, 1, GPR32, int_aarch64_crc32ch, "crc32ch">; +def CRC32CWrr : BaseCRC32<0, 0b10, 1, GPR32, int_aarch64_crc32cw, "crc32cw">; +def CRC32CXrr : BaseCRC32<1, 0b11, 1, GPR64, int_aarch64_crc32cx, "crc32cx">; + + +//===----------------------------------------------------------------------===// +// Logical instructions. +//===----------------------------------------------------------------------===// + +// (immediate) +defm ANDS : LogicalImmS<0b11, "ands", AArch64and_flag>; +defm AND : LogicalImm<0b00, "and", and>; +defm EOR : LogicalImm<0b10, "eor", xor>; +defm ORR : LogicalImm<0b01, "orr", or>; + +// FIXME: these aliases *are* canonical sometimes (when movz can't be +// used). Actually, it seems to be working right now, but putting logical_immXX +// here is a bit dodgy on the AsmParser side too. +def : InstAlias<"mov $dst, $imm", (ORRWri GPR32sp:$dst, WZR, + logical_imm32:$imm), 0>; +def : InstAlias<"mov $dst, $imm", (ORRXri GPR64sp:$dst, XZR, + logical_imm64:$imm), 0>; + + +// (register) +defm ANDS : LogicalRegS<0b11, 0, "ands", AArch64and_flag>; +defm BICS : LogicalRegS<0b11, 1, "bics", + BinOpFrag<(AArch64and_flag node:$LHS, (not node:$RHS))>>; +defm AND : LogicalReg<0b00, 0, "and", and>; +defm BIC : LogicalReg<0b00, 1, "bic", + BinOpFrag<(and node:$LHS, (not node:$RHS))>>; +defm EON : LogicalReg<0b10, 1, "eon", + BinOpFrag<(xor node:$LHS, (not node:$RHS))>>; +defm EOR : LogicalReg<0b10, 0, "eor", xor>; +defm ORN : LogicalReg<0b01, 1, "orn", + BinOpFrag<(or node:$LHS, (not node:$RHS))>>; +defm ORR : LogicalReg<0b01, 0, "orr", or>; + +def : InstAlias<"mov $dst, $src", (ORRWrs GPR32:$dst, WZR, GPR32:$src, 0), 2>; +def : InstAlias<"mov $dst, $src", (ORRXrs GPR64:$dst, XZR, GPR64:$src, 0), 2>; + +def : InstAlias<"mvn $Wd, $Wm", (ORNWrs GPR32:$Wd, WZR, GPR32:$Wm, 0), 3>; +def : InstAlias<"mvn $Xd, $Xm", (ORNXrs GPR64:$Xd, XZR, GPR64:$Xm, 0), 3>; + +def : InstAlias<"mvn $Wd, $Wm$sh", + (ORNWrs GPR32:$Wd, WZR, GPR32:$Wm, logical_shift32:$sh), 2>; +def : InstAlias<"mvn $Xd, $Xm$sh", + (ORNXrs GPR64:$Xd, XZR, GPR64:$Xm, logical_shift64:$sh), 2>; + +def : InstAlias<"tst $src1, $src2", + (ANDSWri WZR, GPR32:$src1, logical_imm32:$src2), 2>; +def : InstAlias<"tst $src1, $src2", + (ANDSXri XZR, GPR64:$src1, logical_imm64:$src2), 2>; + +def : InstAlias<"tst $src1, $src2", + (ANDSWrs WZR, GPR32:$src1, GPR32:$src2, 0), 3>; +def : InstAlias<"tst $src1, $src2", + (ANDSXrs XZR, GPR64:$src1, GPR64:$src2, 0), 3>; + +def : InstAlias<"tst $src1, $src2$sh", + (ANDSWrs WZR, GPR32:$src1, GPR32:$src2, logical_shift32:$sh), 2>; +def : InstAlias<"tst $src1, $src2$sh", + (ANDSXrs XZR, GPR64:$src1, GPR64:$src2, logical_shift64:$sh), 2>; + + +def : Pat<(not GPR32:$Wm), (ORNWrr WZR, GPR32:$Wm)>; +def : Pat<(not GPR64:$Xm), (ORNXrr XZR, GPR64:$Xm)>; + + +//===----------------------------------------------------------------------===// +// One operand data processing instructions. 
+//===----------------------------------------------------------------------===// + +defm CLS : OneOperandData<0b101, "cls">; +defm CLZ : OneOperandData<0b100, "clz", ctlz>; +defm RBIT : OneOperandData<0b000, "rbit">; +def REV16Wr : OneWRegData<0b001, "rev16", + UnOpFrag<(rotr (bswap node:$LHS), (i64 16))>>; +def REV16Xr : OneXRegData<0b001, "rev16", null_frag>; + +def : Pat<(cttz GPR32:$Rn), + (CLZWr (RBITWr GPR32:$Rn))>; +def : Pat<(cttz GPR64:$Rn), + (CLZXr (RBITXr GPR64:$Rn))>; +def : Pat<(ctlz (or (shl (xor (sra GPR32:$Rn, (i64 31)), GPR32:$Rn), (i64 1)), + (i32 1))), + (CLSWr GPR32:$Rn)>; +def : Pat<(ctlz (or (shl (xor (sra GPR64:$Rn, (i64 63)), GPR64:$Rn), (i64 1)), + (i64 1))), + (CLSXr GPR64:$Rn)>; + +// Unlike the other one operand instructions, the instructions with the "rev" +// mnemonic do *not* just different in the size bit, but actually use different +// opcode bits for the different sizes. +def REVWr : OneWRegData<0b010, "rev", bswap>; +def REVXr : OneXRegData<0b011, "rev", bswap>; +def REV32Xr : OneXRegData<0b010, "rev32", + UnOpFrag<(rotr (bswap node:$LHS), (i64 32))>>; + +// The bswap commutes with the rotr so we want a pattern for both possible +// orders. +def : Pat<(bswap (rotr GPR32:$Rn, (i64 16))), (REV16Wr GPR32:$Rn)>; +def : Pat<(bswap (rotr GPR64:$Rn, (i64 32))), (REV32Xr GPR64:$Rn)>; + +//===----------------------------------------------------------------------===// +// Bitfield immediate extraction instruction. +//===----------------------------------------------------------------------===// +let neverHasSideEffects = 1 in +defm EXTR : ExtractImm<"extr">; +def : InstAlias<"ror $dst, $src, $shift", + (EXTRWrri GPR32:$dst, GPR32:$src, GPR32:$src, imm0_31:$shift)>; +def : InstAlias<"ror $dst, $src, $shift", + (EXTRXrri GPR64:$dst, GPR64:$src, GPR64:$src, imm0_63:$shift)>; + +def : Pat<(rotr GPR32:$Rn, (i64 imm0_31:$imm)), + (EXTRWrri GPR32:$Rn, GPR32:$Rn, imm0_31:$imm)>; +def : Pat<(rotr GPR64:$Rn, (i64 imm0_63:$imm)), + (EXTRXrri GPR64:$Rn, GPR64:$Rn, imm0_63:$imm)>; + +//===----------------------------------------------------------------------===// +// Other bitfield immediate instructions. +//===----------------------------------------------------------------------===// +let neverHasSideEffects = 1 in { +defm BFM : BitfieldImmWith2RegArgs<0b01, "bfm">; +defm SBFM : BitfieldImm<0b00, "sbfm">; +defm UBFM : BitfieldImm<0b10, "ubfm">; +} + +def i32shift_a : Operand, SDNodeXFormgetZExtValue()) & 0x1f; + return CurDAG->getTargetConstant(enc, MVT::i64); +}]>; + +def i32shift_b : Operand, SDNodeXFormgetZExtValue(); + return CurDAG->getTargetConstant(enc, MVT::i64); +}]>; + +// min(7, 31 - shift_amt) +def i32shift_sext_i8 : Operand, SDNodeXFormgetZExtValue(); + enc = enc > 7 ? 7 : enc; + return CurDAG->getTargetConstant(enc, MVT::i64); +}]>; + +// min(15, 31 - shift_amt) +def i32shift_sext_i16 : Operand, SDNodeXFormgetZExtValue(); + enc = enc > 15 ? 15 : enc; + return CurDAG->getTargetConstant(enc, MVT::i64); +}]>; + +def i64shift_a : Operand, SDNodeXFormgetZExtValue()) & 0x3f; + return CurDAG->getTargetConstant(enc, MVT::i64); +}]>; + +def i64shift_b : Operand, SDNodeXFormgetZExtValue(); + return CurDAG->getTargetConstant(enc, MVT::i64); +}]>; + +// min(7, 63 - shift_amt) +def i64shift_sext_i8 : Operand, SDNodeXFormgetZExtValue(); + enc = enc > 7 ? 7 : enc; + return CurDAG->getTargetConstant(enc, MVT::i64); +}]>; + +// min(15, 63 - shift_amt) +def i64shift_sext_i16 : Operand, SDNodeXFormgetZExtValue(); + enc = enc > 15 ? 
15 : enc; + return CurDAG->getTargetConstant(enc, MVT::i64); +}]>; + +// min(31, 63 - shift_amt) +def i64shift_sext_i32 : Operand, SDNodeXFormgetZExtValue(); + enc = enc > 31 ? 31 : enc; + return CurDAG->getTargetConstant(enc, MVT::i64); +}]>; + +def : Pat<(shl GPR32:$Rn, (i64 imm0_31:$imm)), + (UBFMWri GPR32:$Rn, (i64 (i32shift_a imm0_31:$imm)), + (i64 (i32shift_b imm0_31:$imm)))>; +def : Pat<(shl GPR64:$Rn, (i64 imm0_63:$imm)), + (UBFMXri GPR64:$Rn, (i64 (i64shift_a imm0_63:$imm)), + (i64 (i64shift_b imm0_63:$imm)))>; + +let AddedComplexity = 10 in { +def : Pat<(sra GPR32:$Rn, (i64 imm0_31:$imm)), + (SBFMWri GPR32:$Rn, imm0_31:$imm, 31)>; +def : Pat<(sra GPR64:$Rn, (i64 imm0_63:$imm)), + (SBFMXri GPR64:$Rn, imm0_63:$imm, 63)>; +} + +def : InstAlias<"asr $dst, $src, $shift", + (SBFMWri GPR32:$dst, GPR32:$src, imm0_31:$shift, 31)>; +def : InstAlias<"asr $dst, $src, $shift", + (SBFMXri GPR64:$dst, GPR64:$src, imm0_63:$shift, 63)>; +def : InstAlias<"sxtb $dst, $src", (SBFMWri GPR32:$dst, GPR32:$src, 0, 7)>; +def : InstAlias<"sxtb $dst, $src", (SBFMXri GPR64:$dst, GPR64:$src, 0, 7)>; +def : InstAlias<"sxth $dst, $src", (SBFMWri GPR32:$dst, GPR32:$src, 0, 15)>; +def : InstAlias<"sxth $dst, $src", (SBFMXri GPR64:$dst, GPR64:$src, 0, 15)>; +def : InstAlias<"sxtw $dst, $src", (SBFMXri GPR64:$dst, GPR64:$src, 0, 31)>; + +def : Pat<(srl GPR32:$Rn, (i64 imm0_31:$imm)), + (UBFMWri GPR32:$Rn, imm0_31:$imm, 31)>; +def : Pat<(srl GPR64:$Rn, (i64 imm0_63:$imm)), + (UBFMXri GPR64:$Rn, imm0_63:$imm, 63)>; + +def : InstAlias<"lsr $dst, $src, $shift", + (UBFMWri GPR32:$dst, GPR32:$src, imm0_31:$shift, 31)>; +def : InstAlias<"lsr $dst, $src, $shift", + (UBFMXri GPR64:$dst, GPR64:$src, imm0_63:$shift, 63)>; +def : InstAlias<"uxtb $dst, $src", (UBFMWri GPR32:$dst, GPR32:$src, 0, 7)>; +def : InstAlias<"uxtb $dst, $src", (UBFMXri GPR64:$dst, GPR64:$src, 0, 7)>; +def : InstAlias<"uxth $dst, $src", (UBFMWri GPR32:$dst, GPR32:$src, 0, 15)>; +def : InstAlias<"uxth $dst, $src", (UBFMXri GPR64:$dst, GPR64:$src, 0, 15)>; +def : InstAlias<"uxtw $dst, $src", (UBFMXri GPR64:$dst, GPR64:$src, 0, 31)>; + +//===----------------------------------------------------------------------===// +// Conditionally set flags instructions. +//===----------------------------------------------------------------------===// +defm CCMN : CondSetFlagsImm<0, "ccmn">; +defm CCMP : CondSetFlagsImm<1, "ccmp">; + +defm CCMN : CondSetFlagsReg<0, "ccmn">; +defm CCMP : CondSetFlagsReg<1, "ccmp">; + +//===----------------------------------------------------------------------===// +// Conditional select instructions. 
+//===----------------------------------------------------------------------===// +defm CSEL : CondSelect<0, 0b00, "csel">; + +def inc : PatFrag<(ops node:$in), (add node:$in, 1)>; +defm CSINC : CondSelectOp<0, 0b01, "csinc", inc>; +defm CSINV : CondSelectOp<1, 0b00, "csinv", not>; +defm CSNEG : CondSelectOp<1, 0b01, "csneg", ineg>; + +def : Pat<(AArch64csinv GPR32:$tval, GPR32:$fval, (i32 imm:$cc), NZCV), + (CSINVWr GPR32:$tval, GPR32:$fval, (i32 imm:$cc))>; +def : Pat<(AArch64csinv GPR64:$tval, GPR64:$fval, (i32 imm:$cc), NZCV), + (CSINVXr GPR64:$tval, GPR64:$fval, (i32 imm:$cc))>; +def : Pat<(AArch64csneg GPR32:$tval, GPR32:$fval, (i32 imm:$cc), NZCV), + (CSNEGWr GPR32:$tval, GPR32:$fval, (i32 imm:$cc))>; +def : Pat<(AArch64csneg GPR64:$tval, GPR64:$fval, (i32 imm:$cc), NZCV), + (CSNEGXr GPR64:$tval, GPR64:$fval, (i32 imm:$cc))>; +def : Pat<(AArch64csinc GPR32:$tval, GPR32:$fval, (i32 imm:$cc), NZCV), + (CSINCWr GPR32:$tval, GPR32:$fval, (i32 imm:$cc))>; +def : Pat<(AArch64csinc GPR64:$tval, GPR64:$fval, (i32 imm:$cc), NZCV), + (CSINCXr GPR64:$tval, GPR64:$fval, (i32 imm:$cc))>; + +def : Pat<(AArch64csel (i32 0), (i32 1), (i32 imm:$cc), NZCV), + (CSINCWr WZR, WZR, (i32 imm:$cc))>; +def : Pat<(AArch64csel (i64 0), (i64 1), (i32 imm:$cc), NZCV), + (CSINCXr XZR, XZR, (i32 imm:$cc))>; +def : Pat<(AArch64csel (i32 0), (i32 -1), (i32 imm:$cc), NZCV), + (CSINVWr WZR, WZR, (i32 imm:$cc))>; +def : Pat<(AArch64csel (i64 0), (i64 -1), (i32 imm:$cc), NZCV), + (CSINVXr XZR, XZR, (i32 imm:$cc))>; + +// The inverse of the condition code from the alias instruction is what is used +// in the aliased instruction. The parser all ready inverts the condition code +// for these aliases. +def : InstAlias<"cset $dst, $cc", + (CSINCWr GPR32:$dst, WZR, WZR, inv_ccode:$cc)>; +def : InstAlias<"cset $dst, $cc", + (CSINCXr GPR64:$dst, XZR, XZR, inv_ccode:$cc)>; + +def : InstAlias<"csetm $dst, $cc", + (CSINVWr GPR32:$dst, WZR, WZR, inv_ccode:$cc)>; +def : InstAlias<"csetm $dst, $cc", + (CSINVXr GPR64:$dst, XZR, XZR, inv_ccode:$cc)>; + +def : InstAlias<"cinc $dst, $src, $cc", + (CSINCWr GPR32:$dst, GPR32:$src, GPR32:$src, inv_ccode:$cc)>; +def : InstAlias<"cinc $dst, $src, $cc", + (CSINCXr GPR64:$dst, GPR64:$src, GPR64:$src, inv_ccode:$cc)>; + +def : InstAlias<"cinv $dst, $src, $cc", + (CSINVWr GPR32:$dst, GPR32:$src, GPR32:$src, inv_ccode:$cc)>; +def : InstAlias<"cinv $dst, $src, $cc", + (CSINVXr GPR64:$dst, GPR64:$src, GPR64:$src, inv_ccode:$cc)>; + +def : InstAlias<"cneg $dst, $src, $cc", + (CSNEGWr GPR32:$dst, GPR32:$src, GPR32:$src, inv_ccode:$cc)>; +def : InstAlias<"cneg $dst, $src, $cc", + (CSNEGXr GPR64:$dst, GPR64:$src, GPR64:$src, inv_ccode:$cc)>; + +//===----------------------------------------------------------------------===// +// PC-relative instructions. +//===----------------------------------------------------------------------===// +let isReMaterializable = 1 in { +let neverHasSideEffects = 1, mayStore = 0, mayLoad = 0 in { +def ADR : ADRI<0, "adr", adrlabel, []>; +} // neverHasSideEffects = 1 + +def ADRP : ADRI<1, "adrp", adrplabel, + [(set GPR64:$Xd, (AArch64adrp tglobaladdr:$label))]>; +} // isReMaterializable = 1 + +// page address of a constant pool entry, block address +def : Pat<(AArch64adrp tconstpool:$cp), (ADRP tconstpool:$cp)>; +def : Pat<(AArch64adrp tblockaddress:$cp), (ADRP tblockaddress:$cp)>; + +//===----------------------------------------------------------------------===// +// Unconditional branch (register) instructions. 
+//===----------------------------------------------------------------------===// + +let isReturn = 1, isTerminator = 1, isBarrier = 1 in { +def RET : BranchReg<0b0010, "ret", []>; +def DRPS : SpecialReturn<0b0101, "drps">; +def ERET : SpecialReturn<0b0100, "eret">; +} // isReturn = 1, isTerminator = 1, isBarrier = 1 + +// Default to the LR register. +def : InstAlias<"ret", (RET LR)>; + +let isCall = 1, Defs = [LR], Uses = [SP] in { +def BLR : BranchReg<0b0001, "blr", [(AArch64call GPR64:$Rn)]>; +} // isCall + +let isBranch = 1, isTerminator = 1, isBarrier = 1, isIndirectBranch = 1 in { +def BR : BranchReg<0b0000, "br", [(brind GPR64:$Rn)]>; +} // isBranch, isTerminator, isBarrier, isIndirectBranch + +// Create a separate pseudo-instruction for codegen to use so that we don't +// flag lr as used in every function. It'll be restored before the RET by the +// epilogue if it's legitimately used. +def RET_ReallyLR : Pseudo<(outs), (ins), [(AArch64retflag)]> { + let isTerminator = 1; + let isBarrier = 1; + let isReturn = 1; +} + +// This is a directive-like pseudo-instruction. The purpose is to insert an +// R_AARCH64_TLSDESC_CALL relocation at the offset of the following instruction +// (which in the usual case is a BLR). +let hasSideEffects = 1 in +def TLSDESCCALL : Pseudo<(outs), (ins i64imm:$sym), []> { + let AsmString = ".tlsdesccall $sym"; +} + +// Pseudo-instruction representing a BLR with attached TLSDESC relocation. It +// gets expanded to two MCInsts during lowering. +let isCall = 1, Defs = [LR] in +def TLSDESC_BLR + : Pseudo<(outs), (ins GPR64:$dest, i64imm:$sym), + [(AArch64tlsdesc_call GPR64:$dest, tglobaltlsaddr:$sym)]>; + +def : Pat<(AArch64tlsdesc_call GPR64:$dest, texternalsym:$sym), + (TLSDESC_BLR GPR64:$dest, texternalsym:$sym)>; +//===----------------------------------------------------------------------===// +// Conditional branch (immediate) instruction. +//===----------------------------------------------------------------------===// +def Bcc : BranchCond; + +//===----------------------------------------------------------------------===// +// Compare-and-branch instructions. +//===----------------------------------------------------------------------===// +defm CBZ : CmpBranch<0, "cbz", AArch64cbz>; +defm CBNZ : CmpBranch<1, "cbnz", AArch64cbnz>; + +//===----------------------------------------------------------------------===// +// Test-bit-and-branch instructions. +//===----------------------------------------------------------------------===// +defm TBZ : TestBranch<0, "tbz", AArch64tbz>; +defm TBNZ : TestBranch<1, "tbnz", AArch64tbnz>; + +//===----------------------------------------------------------------------===// +// Unconditional branch (immediate) instructions. +//===----------------------------------------------------------------------===// +let isBranch = 1, isTerminator = 1, isBarrier = 1 in { +def B : BranchImm<0, "b", [(br bb:$addr)]>; +} // isBranch, isTerminator, isBarrier + +let isCall = 1, Defs = [LR], Uses = [SP] in { +def BL : CallImm<1, "bl", [(AArch64call tglobaladdr:$addr)]>; +} // isCall +def : Pat<(AArch64call texternalsym:$func), (BL texternalsym:$func)>; + +//===----------------------------------------------------------------------===// +// Exception generation instructions. 
+//===----------------------------------------------------------------------===// +def BRK : ExceptionGeneration<0b001, 0b00, "brk">; +def DCPS1 : ExceptionGeneration<0b101, 0b01, "dcps1">; +def DCPS2 : ExceptionGeneration<0b101, 0b10, "dcps2">; +def DCPS3 : ExceptionGeneration<0b101, 0b11, "dcps3">; +def HLT : ExceptionGeneration<0b010, 0b00, "hlt">; +def HVC : ExceptionGeneration<0b000, 0b10, "hvc">; +def SMC : ExceptionGeneration<0b000, 0b11, "smc">; +def SVC : ExceptionGeneration<0b000, 0b01, "svc">; + +// DCPSn defaults to an immediate operand of zero if unspecified. +def : InstAlias<"dcps1", (DCPS1 0)>; +def : InstAlias<"dcps2", (DCPS2 0)>; +def : InstAlias<"dcps3", (DCPS3 0)>; + +//===----------------------------------------------------------------------===// +// Load instructions. +//===----------------------------------------------------------------------===// + +// Pair (indexed, offset) +defm LDPW : LoadPairOffset<0b00, 0, GPR32, simm7s4, "ldp">; +defm LDPX : LoadPairOffset<0b10, 0, GPR64, simm7s8, "ldp">; +defm LDPS : LoadPairOffset<0b00, 1, FPR32, simm7s4, "ldp">; +defm LDPD : LoadPairOffset<0b01, 1, FPR64, simm7s8, "ldp">; +defm LDPQ : LoadPairOffset<0b10, 1, FPR128, simm7s16, "ldp">; + +defm LDPSW : LoadPairOffset<0b01, 0, GPR64, simm7s4, "ldpsw">; + +// Pair (pre-indexed) +def LDPWpre : LoadPairPreIdx<0b00, 0, GPR32, simm7s4, "ldp">; +def LDPXpre : LoadPairPreIdx<0b10, 0, GPR64, simm7s8, "ldp">; +def LDPSpre : LoadPairPreIdx<0b00, 1, FPR32, simm7s4, "ldp">; +def LDPDpre : LoadPairPreIdx<0b01, 1, FPR64, simm7s8, "ldp">; +def LDPQpre : LoadPairPreIdx<0b10, 1, FPR128, simm7s16, "ldp">; + +def LDPSWpre : LoadPairPreIdx<0b01, 0, GPR64, simm7s4, "ldpsw">; + +// Pair (post-indexed) +def LDPWpost : LoadPairPostIdx<0b00, 0, GPR32, simm7s4, "ldp">; +def LDPXpost : LoadPairPostIdx<0b10, 0, GPR64, simm7s8, "ldp">; +def LDPSpost : LoadPairPostIdx<0b00, 1, FPR32, simm7s4, "ldp">; +def LDPDpost : LoadPairPostIdx<0b01, 1, FPR64, simm7s8, "ldp">; +def LDPQpost : LoadPairPostIdx<0b10, 1, FPR128, simm7s16, "ldp">; + +def LDPSWpost : LoadPairPostIdx<0b01, 0, GPR64, simm7s4, "ldpsw">; + + +// Pair (no allocate) +defm LDNPW : LoadPairNoAlloc<0b00, 0, GPR32, simm7s4, "ldnp">; +defm LDNPX : LoadPairNoAlloc<0b10, 0, GPR64, simm7s8, "ldnp">; +defm LDNPS : LoadPairNoAlloc<0b00, 1, FPR32, simm7s4, "ldnp">; +defm LDNPD : LoadPairNoAlloc<0b01, 1, FPR64, simm7s8, "ldnp">; +defm LDNPQ : LoadPairNoAlloc<0b10, 1, FPR128, simm7s16, "ldnp">; + +//--- +// (register offset) +//--- + +// Integer +defm LDRBB : Load8RO<0b00, 0, 0b01, GPR32, "ldrb", i32, zextloadi8>; +defm LDRHH : Load16RO<0b01, 0, 0b01, GPR32, "ldrh", i32, zextloadi16>; +defm LDRW : Load32RO<0b10, 0, 0b01, GPR32, "ldr", i32, load>; +defm LDRX : Load64RO<0b11, 0, 0b01, GPR64, "ldr", i64, load>; + +// Floating-point +defm LDRB : Load8RO<0b00, 1, 0b01, FPR8, "ldr", untyped, load>; +defm LDRH : Load16RO<0b01, 1, 0b01, FPR16, "ldr", f16, load>; +defm LDRS : Load32RO<0b10, 1, 0b01, FPR32, "ldr", f32, load>; +defm LDRD : Load64RO<0b11, 1, 0b01, FPR64, "ldr", f64, load>; +defm LDRQ : Load128RO<0b00, 1, 0b11, FPR128, "ldr", f128, load>; + +// Load sign-extended half-word +defm LDRSHW : Load16RO<0b01, 0, 0b11, GPR32, "ldrsh", i32, sextloadi16>; +defm LDRSHX : Load16RO<0b01, 0, 0b10, GPR64, "ldrsh", i64, sextloadi16>; + +// Load sign-extended byte +defm LDRSBW : Load8RO<0b00, 0, 0b11, GPR32, "ldrsb", i32, sextloadi8>; +defm LDRSBX : Load8RO<0b00, 0, 0b10, GPR64, "ldrsb", i64, sextloadi8>; + +// Load sign-extended word +defm LDRSW : Load32RO<0b10, 0, 0b10, 
GPR64, "ldrsw", i64, sextloadi32>; + +// Pre-fetch. +defm PRFM : PrefetchRO<0b11, 0, 0b10, "prfm">; + +// For regular load, we do not have any alignment requirement. +// Thus, it is safe to directly map the vector loads with interesting +// addressing modes. +// FIXME: We could do the same for bitconvert to floating point vectors. +multiclass ScalToVecROLoadPat { + def : Pat<(VecTy (scalar_to_vector (ScalTy + (loadop (ro.Wpat GPR64sp:$Rn, GPR32:$Rm, ro.Wext:$offset))))), + (INSERT_SUBREG (VecTy (IMPLICIT_DEF)), + (LOADW GPR64sp:$Rn, GPR32:$Rm, ro.Wext:$offset), + sub)>; + + def : Pat<(VecTy (scalar_to_vector (ScalTy + (loadop (ro.Xpat GPR64sp:$Rn, GPR64:$Rm, ro.Xext:$offset))))), + (INSERT_SUBREG (VecTy (IMPLICIT_DEF)), + (LOADX GPR64sp:$Rn, GPR64:$Rm, ro.Xext:$offset), + sub)>; +} + +let AddedComplexity = 10 in { +defm : ScalToVecROLoadPat; +defm : ScalToVecROLoadPat; + +defm : ScalToVecROLoadPat; +defm : ScalToVecROLoadPat; + +defm : ScalToVecROLoadPat; +defm : ScalToVecROLoadPat; + +defm : ScalToVecROLoadPat; +defm : ScalToVecROLoadPat; + +defm : ScalToVecROLoadPat; + +defm : ScalToVecROLoadPat; + + +def : Pat <(v1i64 (scalar_to_vector (i64 + (load (ro_Windexed64 GPR64sp:$Rn, GPR32:$Rm, + ro_Wextend64:$extend))))), + (LDRDroW GPR64sp:$Rn, GPR32:$Rm, ro_Wextend64:$extend)>; + +def : Pat <(v1i64 (scalar_to_vector (i64 + (load (ro_Xindexed64 GPR64sp:$Rn, GPR64:$Rm, + ro_Xextend64:$extend))))), + (LDRDroX GPR64sp:$Rn, GPR64:$Rm, ro_Xextend64:$extend)>; +} + +// Match all load 64 bits width whose type is compatible with FPR64 +multiclass VecROLoadPat { + + def : Pat<(VecTy (load (ro.Wpat GPR64sp:$Rn, GPR32:$Rm, ro.Wext:$extend))), + (LOADW GPR64sp:$Rn, GPR32:$Rm, ro.Wext:$extend)>; + + def : Pat<(VecTy (load (ro.Xpat GPR64sp:$Rn, GPR64:$Rm, ro.Xext:$extend))), + (LOADX GPR64sp:$Rn, GPR64:$Rm, ro.Xext:$extend)>; +} + +let AddedComplexity = 10 in { +let Predicates = [IsLE] in { + // We must do vector loads with LD1 in big-endian. + defm : VecROLoadPat; + defm : VecROLoadPat; + defm : VecROLoadPat; + defm : VecROLoadPat; +} + +defm : VecROLoadPat; +defm : VecROLoadPat; + +// Match all load 128 bits width whose type is compatible with FPR128 +let Predicates = [IsLE] in { + // We must do vector loads with LD1 in big-endian. 
+ defm : VecROLoadPat; + defm : VecROLoadPat; + defm : VecROLoadPat; + defm : VecROLoadPat; + defm : VecROLoadPat; + defm : VecROLoadPat; +} +} // AddedComplexity = 10 + +// zextload -> i64 +multiclass ExtLoadTo64ROPat { + def : Pat<(i64 (loadop (ro.Wpat GPR64sp:$Rn, GPR32:$Rm, ro.Wext:$extend))), + (SUBREG_TO_REG (i64 0), + (INSTW GPR64sp:$Rn, GPR32:$Rm, ro.Wext:$extend), + sub_32)>; + + def : Pat<(i64 (loadop (ro.Xpat GPR64sp:$Rn, GPR64:$Rm, ro.Xext:$extend))), + (SUBREG_TO_REG (i64 0), + (INSTX GPR64sp:$Rn, GPR64:$Rm, ro.Xext:$extend), + sub_32)>; +} + +let AddedComplexity = 10 in { + defm : ExtLoadTo64ROPat; + defm : ExtLoadTo64ROPat; + defm : ExtLoadTo64ROPat; + + // zextloadi1 -> zextloadi8 + defm : ExtLoadTo64ROPat; + + // extload -> zextload + defm : ExtLoadTo64ROPat; + defm : ExtLoadTo64ROPat; + defm : ExtLoadTo64ROPat; + + // extloadi1 -> zextloadi8 + defm : ExtLoadTo64ROPat; +} + + +// zextload -> i64 +multiclass ExtLoadTo32ROPat { + def : Pat<(i32 (loadop (ro.Wpat GPR64sp:$Rn, GPR32:$Rm, ro.Wext:$extend))), + (INSTW GPR64sp:$Rn, GPR32:$Rm, ro.Wext:$extend)>; + + def : Pat<(i32 (loadop (ro.Xpat GPR64sp:$Rn, GPR64:$Rm, ro.Xext:$extend))), + (INSTX GPR64sp:$Rn, GPR64:$Rm, ro.Xext:$extend)>; + +} + +let AddedComplexity = 10 in { + // extload -> zextload + defm : ExtLoadTo32ROPat; + defm : ExtLoadTo32ROPat; + defm : ExtLoadTo32ROPat; + + // zextloadi1 -> zextloadi8 + defm : ExtLoadTo32ROPat; +} + +//--- +// (unsigned immediate) +//--- +defm LDRX : LoadUI<0b11, 0, 0b01, GPR64, uimm12s8, "ldr", + [(set GPR64:$Rt, + (load (am_indexed64 GPR64sp:$Rn, uimm12s8:$offset)))]>; +defm LDRW : LoadUI<0b10, 0, 0b01, GPR32, uimm12s4, "ldr", + [(set GPR32:$Rt, + (load (am_indexed32 GPR64sp:$Rn, uimm12s4:$offset)))]>; +defm LDRB : LoadUI<0b00, 1, 0b01, FPR8, uimm12s1, "ldr", + [(set FPR8:$Rt, + (load (am_indexed8 GPR64sp:$Rn, uimm12s1:$offset)))]>; +defm LDRH : LoadUI<0b01, 1, 0b01, FPR16, uimm12s2, "ldr", + [(set (f16 FPR16:$Rt), + (load (am_indexed16 GPR64sp:$Rn, uimm12s2:$offset)))]>; +defm LDRS : LoadUI<0b10, 1, 0b01, FPR32, uimm12s4, "ldr", + [(set (f32 FPR32:$Rt), + (load (am_indexed32 GPR64sp:$Rn, uimm12s4:$offset)))]>; +defm LDRD : LoadUI<0b11, 1, 0b01, FPR64, uimm12s8, "ldr", + [(set (f64 FPR64:$Rt), + (load (am_indexed64 GPR64sp:$Rn, uimm12s8:$offset)))]>; +defm LDRQ : LoadUI<0b00, 1, 0b11, FPR128, uimm12s16, "ldr", + [(set (f128 FPR128:$Rt), + (load (am_indexed128 GPR64sp:$Rn, uimm12s16:$offset)))]>; + +// For regular load, we do not have any alignment requirement. +// Thus, it is safe to directly map the vector loads with interesting +// addressing modes. +// FIXME: We could do the same for bitconvert to floating point vectors. 
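The scalar_to_vector patterns below are easier to follow with their reference semantics in mind. A minimal sketch in plain C++ (the helper name and the zero-initialisation are illustrative only; in the DAG the untouched lanes come from IMPLICIT_DEF and are undefined): the loaded byte goes straight into lane 0 of the vector register, which is what the INSERT_SUBREG of the LDRB result into the bsub subregister expresses.

  #include <cstdint>

  // Reference semantics of (v16i8 (scalar_to_vector (i32 (extloadi8 addr)))):
  // one byte is loaded and placed in lane 0; all other lanes are unspecified.
  struct V16i8 { uint8_t Lane[16]; };

  V16i8 scalarToVectorLoadByte(const uint8_t *Addr) { // hypothetical helper
    V16i8 V = {};      // stands in for IMPLICIT_DEF; the lanes are really undefined
    V.Lane[0] = *Addr; // LDRBui/LDRBro load directly into the b-subregister
    return V;
  }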
+def : Pat <(v8i8 (scalar_to_vector (i32 + (extloadi8 (am_indexed8 GPR64sp:$Rn, uimm12s1:$offset))))), + (INSERT_SUBREG (v8i8 (IMPLICIT_DEF)), + (LDRBui GPR64sp:$Rn, uimm12s1:$offset), bsub)>; +def : Pat <(v16i8 (scalar_to_vector (i32 + (extloadi8 (am_indexed8 GPR64sp:$Rn, uimm12s1:$offset))))), + (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)), + (LDRBui GPR64sp:$Rn, uimm12s1:$offset), bsub)>; +def : Pat <(v4i16 (scalar_to_vector (i32 + (extloadi16 (am_indexed16 GPR64sp:$Rn, uimm12s2:$offset))))), + (INSERT_SUBREG (v4i16 (IMPLICIT_DEF)), + (LDRHui GPR64sp:$Rn, uimm12s2:$offset), hsub)>; +def : Pat <(v8i16 (scalar_to_vector (i32 + (extloadi16 (am_indexed16 GPR64sp:$Rn, uimm12s2:$offset))))), + (INSERT_SUBREG (v8i16 (IMPLICIT_DEF)), + (LDRHui GPR64sp:$Rn, uimm12s2:$offset), hsub)>; +def : Pat <(v2i32 (scalar_to_vector (i32 + (load (am_indexed32 GPR64sp:$Rn, uimm12s4:$offset))))), + (INSERT_SUBREG (v2i32 (IMPLICIT_DEF)), + (LDRSui GPR64sp:$Rn, uimm12s4:$offset), ssub)>; +def : Pat <(v4i32 (scalar_to_vector (i32 + (load (am_indexed32 GPR64sp:$Rn, uimm12s4:$offset))))), + (INSERT_SUBREG (v4i32 (IMPLICIT_DEF)), + (LDRSui GPR64sp:$Rn, uimm12s4:$offset), ssub)>; +def : Pat <(v1i64 (scalar_to_vector (i64 + (load (am_indexed64 GPR64sp:$Rn, uimm12s8:$offset))))), + (LDRDui GPR64sp:$Rn, uimm12s8:$offset)>; +def : Pat <(v2i64 (scalar_to_vector (i64 + (load (am_indexed64 GPR64sp:$Rn, uimm12s8:$offset))))), + (INSERT_SUBREG (v2i64 (IMPLICIT_DEF)), + (LDRDui GPR64sp:$Rn, uimm12s8:$offset), dsub)>; + +// Match all load 64 bits width whose type is compatible with FPR64 +let Predicates = [IsLE] in { + // We must use LD1 to perform vector loads in big-endian. + def : Pat<(v2f32 (load (am_indexed64 GPR64sp:$Rn, uimm12s8:$offset))), + (LDRDui GPR64sp:$Rn, uimm12s8:$offset)>; + def : Pat<(v8i8 (load (am_indexed64 GPR64sp:$Rn, uimm12s8:$offset))), + (LDRDui GPR64sp:$Rn, uimm12s8:$offset)>; + def : Pat<(v4i16 (load (am_indexed64 GPR64sp:$Rn, uimm12s8:$offset))), + (LDRDui GPR64sp:$Rn, uimm12s8:$offset)>; + def : Pat<(v2i32 (load (am_indexed64 GPR64sp:$Rn, uimm12s8:$offset))), + (LDRDui GPR64sp:$Rn, uimm12s8:$offset)>; +} +def : Pat<(v1f64 (load (am_indexed64 GPR64sp:$Rn, uimm12s8:$offset))), + (LDRDui GPR64sp:$Rn, uimm12s8:$offset)>; +def : Pat<(v1i64 (load (am_indexed64 GPR64sp:$Rn, uimm12s8:$offset))), + (LDRDui GPR64sp:$Rn, uimm12s8:$offset)>; + +// Match all load 128 bits width whose type is compatible with FPR128 +let Predicates = [IsLE] in { + // We must use LD1 to perform vector loads in big-endian. 
+ def : Pat<(v4f32 (load (am_indexed128 GPR64sp:$Rn, uimm12s16:$offset))), + (LDRQui GPR64sp:$Rn, uimm12s16:$offset)>; + def : Pat<(v2f64 (load (am_indexed128 GPR64sp:$Rn, uimm12s16:$offset))), + (LDRQui GPR64sp:$Rn, uimm12s16:$offset)>; + def : Pat<(v16i8 (load (am_indexed128 GPR64sp:$Rn, uimm12s16:$offset))), + (LDRQui GPR64sp:$Rn, uimm12s16:$offset)>; + def : Pat<(v8i16 (load (am_indexed128 GPR64sp:$Rn, uimm12s16:$offset))), + (LDRQui GPR64sp:$Rn, uimm12s16:$offset)>; + def : Pat<(v4i32 (load (am_indexed128 GPR64sp:$Rn, uimm12s16:$offset))), + (LDRQui GPR64sp:$Rn, uimm12s16:$offset)>; + def : Pat<(v2i64 (load (am_indexed128 GPR64sp:$Rn, uimm12s16:$offset))), + (LDRQui GPR64sp:$Rn, uimm12s16:$offset)>; +} +def : Pat<(f128 (load (am_indexed128 GPR64sp:$Rn, uimm12s16:$offset))), + (LDRQui GPR64sp:$Rn, uimm12s16:$offset)>; + +defm LDRHH : LoadUI<0b01, 0, 0b01, GPR32, uimm12s2, "ldrh", + [(set GPR32:$Rt, + (zextloadi16 (am_indexed16 GPR64sp:$Rn, + uimm12s2:$offset)))]>; +defm LDRBB : LoadUI<0b00, 0, 0b01, GPR32, uimm12s1, "ldrb", + [(set GPR32:$Rt, + (zextloadi8 (am_indexed8 GPR64sp:$Rn, + uimm12s1:$offset)))]>; +// zextload -> i64 +def : Pat<(i64 (zextloadi8 (am_indexed8 GPR64sp:$Rn, uimm12s1:$offset))), + (SUBREG_TO_REG (i64 0), (LDRBBui GPR64sp:$Rn, uimm12s1:$offset), sub_32)>; +def : Pat<(i64 (zextloadi16 (am_indexed16 GPR64sp:$Rn, uimm12s2:$offset))), + (SUBREG_TO_REG (i64 0), (LDRHHui GPR64sp:$Rn, uimm12s2:$offset), sub_32)>; + +// zextloadi1 -> zextloadi8 +def : Pat<(i32 (zextloadi1 (am_indexed8 GPR64sp:$Rn, uimm12s1:$offset))), + (LDRBBui GPR64sp:$Rn, uimm12s1:$offset)>; +def : Pat<(i64 (zextloadi1 (am_indexed8 GPR64sp:$Rn, uimm12s1:$offset))), + (SUBREG_TO_REG (i64 0), (LDRBBui GPR64sp:$Rn, uimm12s1:$offset), sub_32)>; + +// extload -> zextload +def : Pat<(i32 (extloadi16 (am_indexed16 GPR64sp:$Rn, uimm12s2:$offset))), + (LDRHHui GPR64sp:$Rn, uimm12s2:$offset)>; +def : Pat<(i32 (extloadi8 (am_indexed8 GPR64sp:$Rn, uimm12s1:$offset))), + (LDRBBui GPR64sp:$Rn, uimm12s1:$offset)>; +def : Pat<(i32 (extloadi1 (am_indexed8 GPR64sp:$Rn, uimm12s1:$offset))), + (LDRBBui GPR64sp:$Rn, uimm12s1:$offset)>; +def : Pat<(i64 (extloadi32 (am_indexed32 GPR64sp:$Rn, uimm12s4:$offset))), + (SUBREG_TO_REG (i64 0), (LDRWui GPR64sp:$Rn, uimm12s4:$offset), sub_32)>; +def : Pat<(i64 (extloadi16 (am_indexed16 GPR64sp:$Rn, uimm12s2:$offset))), + (SUBREG_TO_REG (i64 0), (LDRHHui GPR64sp:$Rn, uimm12s2:$offset), sub_32)>; +def : Pat<(i64 (extloadi8 (am_indexed8 GPR64sp:$Rn, uimm12s1:$offset))), + (SUBREG_TO_REG (i64 0), (LDRBBui GPR64sp:$Rn, uimm12s1:$offset), sub_32)>; +def : Pat<(i64 (extloadi1 (am_indexed8 GPR64sp:$Rn, uimm12s1:$offset))), + (SUBREG_TO_REG (i64 0), (LDRBBui GPR64sp:$Rn, uimm12s1:$offset), sub_32)>; + +// load sign-extended half-word +defm LDRSHW : LoadUI<0b01, 0, 0b11, GPR32, uimm12s2, "ldrsh", + [(set GPR32:$Rt, + (sextloadi16 (am_indexed16 GPR64sp:$Rn, + uimm12s2:$offset)))]>; +defm LDRSHX : LoadUI<0b01, 0, 0b10, GPR64, uimm12s2, "ldrsh", + [(set GPR64:$Rt, + (sextloadi16 (am_indexed16 GPR64sp:$Rn, + uimm12s2:$offset)))]>; + +// load sign-extended byte +defm LDRSBW : LoadUI<0b00, 0, 0b11, GPR32, uimm12s1, "ldrsb", + [(set GPR32:$Rt, + (sextloadi8 (am_indexed8 GPR64sp:$Rn, + uimm12s1:$offset)))]>; +defm LDRSBX : LoadUI<0b00, 0, 0b10, GPR64, uimm12s1, "ldrsb", + [(set GPR64:$Rt, + (sextloadi8 (am_indexed8 GPR64sp:$Rn, + uimm12s1:$offset)))]>; + +// load sign-extended word +defm LDRSW : LoadUI<0b10, 0, 0b10, GPR64, uimm12s4, "ldrsw", + [(set GPR64:$Rt, + (sextloadi32 (am_indexed32 GPR64sp:$Rn, 
+ uimm12s4:$offset)))]>; + +// load zero-extended word +def : Pat<(i64 (zextloadi32 (am_indexed32 GPR64sp:$Rn, uimm12s4:$offset))), + (SUBREG_TO_REG (i64 0), (LDRWui GPR64sp:$Rn, uimm12s4:$offset), sub_32)>; + +// Pre-fetch. +def PRFMui : PrefetchUI<0b11, 0, 0b10, "prfm", + [(AArch64Prefetch imm:$Rt, + (am_indexed64 GPR64sp:$Rn, + uimm12s8:$offset))]>; + +def : InstAlias<"prfm $Rt, [$Rn]", (PRFMui prfop:$Rt, GPR64sp:$Rn, 0)>; + +//--- +// (literal) +def LDRWl : LoadLiteral<0b00, 0, GPR32, "ldr">; +def LDRXl : LoadLiteral<0b01, 0, GPR64, "ldr">; +def LDRSl : LoadLiteral<0b00, 1, FPR32, "ldr">; +def LDRDl : LoadLiteral<0b01, 1, FPR64, "ldr">; +def LDRQl : LoadLiteral<0b10, 1, FPR128, "ldr">; + +// load sign-extended word +def LDRSWl : LoadLiteral<0b10, 0, GPR64, "ldrsw">; + +// prefetch +def PRFMl : PrefetchLiteral<0b11, 0, "prfm", []>; +// [(AArch64Prefetch imm:$Rt, tglobaladdr:$label)]>; + +//--- +// (unscaled immediate) +defm LDURX : LoadUnscaled<0b11, 0, 0b01, GPR64, "ldur", + [(set GPR64:$Rt, + (load (am_unscaled64 GPR64sp:$Rn, simm9:$offset)))]>; +defm LDURW : LoadUnscaled<0b10, 0, 0b01, GPR32, "ldur", + [(set GPR32:$Rt, + (load (am_unscaled32 GPR64sp:$Rn, simm9:$offset)))]>; +defm LDURB : LoadUnscaled<0b00, 1, 0b01, FPR8, "ldur", + [(set FPR8:$Rt, + (load (am_unscaled8 GPR64sp:$Rn, simm9:$offset)))]>; +defm LDURH : LoadUnscaled<0b01, 1, 0b01, FPR16, "ldur", + [(set FPR16:$Rt, + (load (am_unscaled16 GPR64sp:$Rn, simm9:$offset)))]>; +defm LDURS : LoadUnscaled<0b10, 1, 0b01, FPR32, "ldur", + [(set (f32 FPR32:$Rt), + (load (am_unscaled32 GPR64sp:$Rn, simm9:$offset)))]>; +defm LDURD : LoadUnscaled<0b11, 1, 0b01, FPR64, "ldur", + [(set (f64 FPR64:$Rt), + (load (am_unscaled64 GPR64sp:$Rn, simm9:$offset)))]>; +defm LDURQ : LoadUnscaled<0b00, 1, 0b11, FPR128, "ldur", + [(set (f128 FPR128:$Rt), + (load (am_unscaled128 GPR64sp:$Rn, simm9:$offset)))]>; + +defm LDURHH + : LoadUnscaled<0b01, 0, 0b01, GPR32, "ldurh", + [(set GPR32:$Rt, + (zextloadi16 (am_unscaled16 GPR64sp:$Rn, simm9:$offset)))]>; +defm LDURBB + : LoadUnscaled<0b00, 0, 0b01, GPR32, "ldurb", + [(set GPR32:$Rt, + (zextloadi8 (am_unscaled16 GPR64sp:$Rn, simm9:$offset)))]>; + +// Match all load 64 bits width whose type is compatible with FPR64 +let Predicates = [IsLE] in { + def : Pat<(v2f32 (load (am_unscaled64 GPR64sp:$Rn, simm9:$offset))), + (LDURDi GPR64sp:$Rn, simm9:$offset)>; + def : Pat<(v2i32 (load (am_unscaled64 GPR64sp:$Rn, simm9:$offset))), + (LDURDi GPR64sp:$Rn, simm9:$offset)>; + def : Pat<(v4i16 (load (am_unscaled64 GPR64sp:$Rn, simm9:$offset))), + (LDURDi GPR64sp:$Rn, simm9:$offset)>; + def : Pat<(v8i8 (load (am_unscaled64 GPR64sp:$Rn, simm9:$offset))), + (LDURDi GPR64sp:$Rn, simm9:$offset)>; +} +def : Pat<(v1f64 (load (am_unscaled64 GPR64sp:$Rn, simm9:$offset))), + (LDURDi GPR64sp:$Rn, simm9:$offset)>; +def : Pat<(v1i64 (load (am_unscaled64 GPR64sp:$Rn, simm9:$offset))), + (LDURDi GPR64sp:$Rn, simm9:$offset)>; + +// Match all load 128 bits width whose type is compatible with FPR128 +let Predicates = [IsLE] in { + def : Pat<(v2f64 (load (am_unscaled128 GPR64sp:$Rn, simm9:$offset))), + (LDURQi GPR64sp:$Rn, simm9:$offset)>; + def : Pat<(v2i64 (load (am_unscaled128 GPR64sp:$Rn, simm9:$offset))), + (LDURQi GPR64sp:$Rn, simm9:$offset)>; + def : Pat<(v4f32 (load (am_unscaled128 GPR64sp:$Rn, simm9:$offset))), + (LDURQi GPR64sp:$Rn, simm9:$offset)>; + def : Pat<(v4i32 (load (am_unscaled128 GPR64sp:$Rn, simm9:$offset))), + (LDURQi GPR64sp:$Rn, simm9:$offset)>; + def : Pat<(v8i16 (load (am_unscaled128 GPR64sp:$Rn, simm9:$offset))), 
+ (LDURQi GPR64sp:$Rn, simm9:$offset)>; + def : Pat<(v16i8 (load (am_unscaled128 GPR64sp:$Rn, simm9:$offset))), + (LDURQi GPR64sp:$Rn, simm9:$offset)>; +} + +// anyext -> zext +def : Pat<(i32 (extloadi16 (am_unscaled16 GPR64sp:$Rn, simm9:$offset))), + (LDURHHi GPR64sp:$Rn, simm9:$offset)>; +def : Pat<(i32 (extloadi8 (am_unscaled8 GPR64sp:$Rn, simm9:$offset))), + (LDURBBi GPR64sp:$Rn, simm9:$offset)>; +def : Pat<(i32 (extloadi1 (am_unscaled8 GPR64sp:$Rn, simm9:$offset))), + (LDURBBi GPR64sp:$Rn, simm9:$offset)>; +def : Pat<(i64 (extloadi32 (am_unscaled32 GPR64sp:$Rn, simm9:$offset))), + (SUBREG_TO_REG (i64 0), (LDURWi GPR64sp:$Rn, simm9:$offset), sub_32)>; +def : Pat<(i64 (extloadi16 (am_unscaled16 GPR64sp:$Rn, simm9:$offset))), + (SUBREG_TO_REG (i64 0), (LDURHHi GPR64sp:$Rn, simm9:$offset), sub_32)>; +def : Pat<(i64 (extloadi8 (am_unscaled8 GPR64sp:$Rn, simm9:$offset))), + (SUBREG_TO_REG (i64 0), (LDURBBi GPR64sp:$Rn, simm9:$offset), sub_32)>; +def : Pat<(i64 (extloadi1 (am_unscaled8 GPR64sp:$Rn, simm9:$offset))), + (SUBREG_TO_REG (i64 0), (LDURBBi GPR64sp:$Rn, simm9:$offset), sub_32)>; +// unscaled zext +def : Pat<(i32 (zextloadi16 (am_unscaled16 GPR64sp:$Rn, simm9:$offset))), + (LDURHHi GPR64sp:$Rn, simm9:$offset)>; +def : Pat<(i32 (zextloadi8 (am_unscaled8 GPR64sp:$Rn, simm9:$offset))), + (LDURBBi GPR64sp:$Rn, simm9:$offset)>; +def : Pat<(i32 (zextloadi1 (am_unscaled8 GPR64sp:$Rn, simm9:$offset))), + (LDURBBi GPR64sp:$Rn, simm9:$offset)>; +def : Pat<(i64 (zextloadi32 (am_unscaled32 GPR64sp:$Rn, simm9:$offset))), + (SUBREG_TO_REG (i64 0), (LDURWi GPR64sp:$Rn, simm9:$offset), sub_32)>; +def : Pat<(i64 (zextloadi16 (am_unscaled16 GPR64sp:$Rn, simm9:$offset))), + (SUBREG_TO_REG (i64 0), (LDURHHi GPR64sp:$Rn, simm9:$offset), sub_32)>; +def : Pat<(i64 (zextloadi8 (am_unscaled8 GPR64sp:$Rn, simm9:$offset))), + (SUBREG_TO_REG (i64 0), (LDURBBi GPR64sp:$Rn, simm9:$offset), sub_32)>; +def : Pat<(i64 (zextloadi1 (am_unscaled8 GPR64sp:$Rn, simm9:$offset))), + (SUBREG_TO_REG (i64 0), (LDURBBi GPR64sp:$Rn, simm9:$offset), sub_32)>; + + +//--- +// LDR mnemonics fall back to LDUR for negative or unaligned offsets. + +// Define new assembler match classes as we want to only match these when +// the don't otherwise match the scaled addressing mode for LDR/STR. Don't +// associate a DiagnosticType either, as we want the diagnostic for the +// canonical form (the scaled operand) to take precedence. 
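To make the fallback concrete, here is a minimal sketch in plain C++ of the two offset forms involved (the helper names are illustrative, not the real AsmParser predicates): the scaled unsigned 12-bit immediate used by the uimm12sN operands above, and the signed 9-bit immediate used by the LDUR/STUR instructions. The SImm9OffsetFB classes defined next accept exactly the offsets that pass the second check but not the first, so the generic mnemonic still assembles for them.

  #include <cstdint>

  // Scaled form (uimm12sN): non-negative, a multiple of the access size, and
  // the scaled value must fit in an unsigned 12-bit field (0..4095).
  constexpr bool FitsScaledUImm12(int64_t Offset, int64_t AccessSize) {
    return Offset >= 0 && Offset % AccessSize == 0 && Offset / AccessSize <= 4095;
  }

  // Unscaled form (simm9): any byte offset in [-256, 255].
  constexpr bool FitsSImm9(int64_t Offset) { return Offset >= -256 && Offset <= 255; }

  // e.g. "ldr w0, [x1, #-4]" and "ldr w0, [x1, #2]" fail the scaled check for a
  // 4-byte access but pass the simm9 check, so they match the LDUR aliases below.
  static_assert(!FitsScaledUImm12(-4, 4) && FitsSImm9(-4), "negative offset");
  static_assert(!FitsScaledUImm12(2, 4) && FitsSImm9(2), "unaligned offset");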
+class SImm9OffsetOperand<int Width> : AsmOperandClass {
+ let Name = "SImm9OffsetFB" # Width;
+ let PredicateMethod = "isSImm9OffsetFB<" # Width # ">";
+ let RenderMethod = "addImmOperands";
+}
+
+def SImm9OffsetFB8Operand : SImm9OffsetOperand<8>;
+def SImm9OffsetFB16Operand : SImm9OffsetOperand<16>;
+def SImm9OffsetFB32Operand : SImm9OffsetOperand<32>;
+def SImm9OffsetFB64Operand : SImm9OffsetOperand<64>;
+def SImm9OffsetFB128Operand : SImm9OffsetOperand<128>;
+
+def simm9_offset_fb8 : Operand<i64> {
+ let ParserMatchClass = SImm9OffsetFB8Operand;
+}
+def simm9_offset_fb16 : Operand<i64> {
+ let ParserMatchClass = SImm9OffsetFB16Operand;
+}
+def simm9_offset_fb32 : Operand<i64> {
+ let ParserMatchClass = SImm9OffsetFB32Operand;
+}
+def simm9_offset_fb64 : Operand<i64> {
+ let ParserMatchClass = SImm9OffsetFB64Operand;
+}
+def simm9_offset_fb128 : Operand<i64> {
+ let ParserMatchClass = SImm9OffsetFB128Operand;
+}
+
+def : InstAlias<"ldr $Rt, [$Rn, $offset]",
+ (LDURXi GPR64:$Rt, GPR64sp:$Rn, simm9_offset_fb64:$offset), 0>;
+def : InstAlias<"ldr $Rt, [$Rn, $offset]",
+ (LDURWi GPR32:$Rt, GPR64sp:$Rn, simm9_offset_fb32:$offset), 0>;
+def : InstAlias<"ldr $Rt, [$Rn, $offset]",
+ (LDURBi FPR8:$Rt, GPR64sp:$Rn, simm9_offset_fb8:$offset), 0>;
+def : InstAlias<"ldr $Rt, [$Rn, $offset]",
+ (LDURHi FPR16:$Rt, GPR64sp:$Rn, simm9_offset_fb16:$offset), 0>;
+def : InstAlias<"ldr $Rt, [$Rn, $offset]",
+ (LDURSi FPR32:$Rt, GPR64sp:$Rn, simm9_offset_fb32:$offset), 0>;
+def : InstAlias<"ldr $Rt, [$Rn, $offset]",
+ (LDURDi FPR64:$Rt, GPR64sp:$Rn, simm9_offset_fb64:$offset), 0>;
+def : InstAlias<"ldr $Rt, [$Rn, $offset]",
+ (LDURQi FPR128:$Rt, GPR64sp:$Rn, simm9_offset_fb128:$offset), 0>;
+
+// zextload -> i64
+def : Pat<(i64 (zextloadi8 (am_unscaled8 GPR64sp:$Rn, simm9:$offset))),
+ (SUBREG_TO_REG (i64 0), (LDURBBi GPR64sp:$Rn, simm9:$offset), sub_32)>;
+def : Pat<(i64 (zextloadi16 (am_unscaled16 GPR64sp:$Rn, simm9:$offset))),
+ (SUBREG_TO_REG (i64 0), (LDURHHi GPR64sp:$Rn, simm9:$offset), sub_32)>;
+
+// load sign-extended half-word
+defm LDURSHW
+ : LoadUnscaled<0b01, 0, 0b11, GPR32, "ldursh",
+ [(set GPR32:$Rt,
+ (sextloadi16 (am_unscaled16 GPR64sp:$Rn, simm9:$offset)))]>;
+defm LDURSHX
+ : LoadUnscaled<0b01, 0, 0b10, GPR64, "ldursh",
+ [(set GPR64:$Rt,
+ (sextloadi16 (am_unscaled16 GPR64sp:$Rn, simm9:$offset)))]>;
+
+// load sign-extended byte
+defm LDURSBW
+ : LoadUnscaled<0b00, 0, 0b11, GPR32, "ldursb",
+ [(set GPR32:$Rt,
+ (sextloadi8 (am_unscaled8 GPR64sp:$Rn, simm9:$offset)))]>;
+defm LDURSBX
+ : LoadUnscaled<0b00, 0, 0b10, GPR64, "ldursb",
+ [(set GPR64:$Rt,
+ (sextloadi8 (am_unscaled8 GPR64sp:$Rn, simm9:$offset)))]>;
+
+// load sign-extended word
+defm LDURSW
+ : LoadUnscaled<0b10, 0, 0b10, GPR64, "ldursw",
+ [(set GPR64:$Rt,
+ (sextloadi32 (am_unscaled32 GPR64sp:$Rn, simm9:$offset)))]>;
+
+// zero and sign extending aliases from generic LDR* mnemonics to LDUR*.
+def : InstAlias<"ldrb $Rt, [$Rn, $offset]", + (LDURBBi GPR32:$Rt, GPR64sp:$Rn, simm9_offset_fb8:$offset), 0>; +def : InstAlias<"ldrh $Rt, [$Rn, $offset]", + (LDURHHi GPR32:$Rt, GPR64sp:$Rn, simm9_offset_fb16:$offset), 0>; +def : InstAlias<"ldrsb $Rt, [$Rn, $offset]", + (LDURSBWi GPR32:$Rt, GPR64sp:$Rn, simm9_offset_fb8:$offset), 0>; +def : InstAlias<"ldrsb $Rt, [$Rn, $offset]", + (LDURSBXi GPR64:$Rt, GPR64sp:$Rn, simm9_offset_fb8:$offset), 0>; +def : InstAlias<"ldrsh $Rt, [$Rn, $offset]", + (LDURSHWi GPR32:$Rt, GPR64sp:$Rn, simm9_offset_fb16:$offset), 0>; +def : InstAlias<"ldrsh $Rt, [$Rn, $offset]", + (LDURSHXi GPR64:$Rt, GPR64sp:$Rn, simm9_offset_fb16:$offset), 0>; +def : InstAlias<"ldrsw $Rt, [$Rn, $offset]", + (LDURSWi GPR64:$Rt, GPR64sp:$Rn, simm9_offset_fb32:$offset), 0>; + +// Pre-fetch. +defm PRFUM : PrefetchUnscaled<0b11, 0, 0b10, "prfum", + [(AArch64Prefetch imm:$Rt, + (am_unscaled64 GPR64sp:$Rn, simm9:$offset))]>; + +//--- +// (unscaled immediate, unprivileged) +defm LDTRX : LoadUnprivileged<0b11, 0, 0b01, GPR64, "ldtr">; +defm LDTRW : LoadUnprivileged<0b10, 0, 0b01, GPR32, "ldtr">; + +defm LDTRH : LoadUnprivileged<0b01, 0, 0b01, GPR32, "ldtrh">; +defm LDTRB : LoadUnprivileged<0b00, 0, 0b01, GPR32, "ldtrb">; + +// load sign-extended half-word +defm LDTRSHW : LoadUnprivileged<0b01, 0, 0b11, GPR32, "ldtrsh">; +defm LDTRSHX : LoadUnprivileged<0b01, 0, 0b10, GPR64, "ldtrsh">; + +// load sign-extended byte +defm LDTRSBW : LoadUnprivileged<0b00, 0, 0b11, GPR32, "ldtrsb">; +defm LDTRSBX : LoadUnprivileged<0b00, 0, 0b10, GPR64, "ldtrsb">; + +// load sign-extended word +defm LDTRSW : LoadUnprivileged<0b10, 0, 0b10, GPR64, "ldtrsw">; + +//--- +// (immediate pre-indexed) +def LDRWpre : LoadPreIdx<0b10, 0, 0b01, GPR32, "ldr">; +def LDRXpre : LoadPreIdx<0b11, 0, 0b01, GPR64, "ldr">; +def LDRBpre : LoadPreIdx<0b00, 1, 0b01, FPR8, "ldr">; +def LDRHpre : LoadPreIdx<0b01, 1, 0b01, FPR16, "ldr">; +def LDRSpre : LoadPreIdx<0b10, 1, 0b01, FPR32, "ldr">; +def LDRDpre : LoadPreIdx<0b11, 1, 0b01, FPR64, "ldr">; +def LDRQpre : LoadPreIdx<0b00, 1, 0b11, FPR128, "ldr">; + +// load sign-extended half-word +def LDRSHWpre : LoadPreIdx<0b01, 0, 0b11, GPR32, "ldrsh">; +def LDRSHXpre : LoadPreIdx<0b01, 0, 0b10, GPR64, "ldrsh">; + +// load sign-extended byte +def LDRSBWpre : LoadPreIdx<0b00, 0, 0b11, GPR32, "ldrsb">; +def LDRSBXpre : LoadPreIdx<0b00, 0, 0b10, GPR64, "ldrsb">; + +// load zero-extended byte +def LDRBBpre : LoadPreIdx<0b00, 0, 0b01, GPR32, "ldrb">; +def LDRHHpre : LoadPreIdx<0b01, 0, 0b01, GPR32, "ldrh">; + +// load sign-extended word +def LDRSWpre : LoadPreIdx<0b10, 0, 0b10, GPR64, "ldrsw">; + +//--- +// (immediate post-indexed) +def LDRWpost : LoadPostIdx<0b10, 0, 0b01, GPR32, "ldr">; +def LDRXpost : LoadPostIdx<0b11, 0, 0b01, GPR64, "ldr">; +def LDRBpost : LoadPostIdx<0b00, 1, 0b01, FPR8, "ldr">; +def LDRHpost : LoadPostIdx<0b01, 1, 0b01, FPR16, "ldr">; +def LDRSpost : LoadPostIdx<0b10, 1, 0b01, FPR32, "ldr">; +def LDRDpost : LoadPostIdx<0b11, 1, 0b01, FPR64, "ldr">; +def LDRQpost : LoadPostIdx<0b00, 1, 0b11, FPR128, "ldr">; + +// load sign-extended half-word +def LDRSHWpost : LoadPostIdx<0b01, 0, 0b11, GPR32, "ldrsh">; +def LDRSHXpost : LoadPostIdx<0b01, 0, 0b10, GPR64, "ldrsh">; + +// load sign-extended byte +def LDRSBWpost : LoadPostIdx<0b00, 0, 0b11, GPR32, "ldrsb">; +def LDRSBXpost : LoadPostIdx<0b00, 0, 0b10, GPR64, "ldrsb">; + +// load zero-extended byte +def LDRBBpost : LoadPostIdx<0b00, 0, 0b01, GPR32, "ldrb">; +def LDRHHpost : LoadPostIdx<0b01, 0, 0b01, GPR32, "ldrh">; + +// load 
sign-extended word +def LDRSWpost : LoadPostIdx<0b10, 0, 0b10, GPR64, "ldrsw">; + +//===----------------------------------------------------------------------===// +// Store instructions. +//===----------------------------------------------------------------------===// + +// Pair (indexed, offset) +// FIXME: Use dedicated range-checked addressing mode operand here. +defm STPW : StorePairOffset<0b00, 0, GPR32, simm7s4, "stp">; +defm STPX : StorePairOffset<0b10, 0, GPR64, simm7s8, "stp">; +defm STPS : StorePairOffset<0b00, 1, FPR32, simm7s4, "stp">; +defm STPD : StorePairOffset<0b01, 1, FPR64, simm7s8, "stp">; +defm STPQ : StorePairOffset<0b10, 1, FPR128, simm7s16, "stp">; + +// Pair (pre-indexed) +def STPWpre : StorePairPreIdx<0b00, 0, GPR32, simm7s4, "stp">; +def STPXpre : StorePairPreIdx<0b10, 0, GPR64, simm7s8, "stp">; +def STPSpre : StorePairPreIdx<0b00, 1, FPR32, simm7s4, "stp">; +def STPDpre : StorePairPreIdx<0b01, 1, FPR64, simm7s8, "stp">; +def STPQpre : StorePairPreIdx<0b10, 1, FPR128, simm7s16, "stp">; + +// Pair (pre-indexed) +def STPWpost : StorePairPostIdx<0b00, 0, GPR32, simm7s4, "stp">; +def STPXpost : StorePairPostIdx<0b10, 0, GPR64, simm7s8, "stp">; +def STPSpost : StorePairPostIdx<0b00, 1, FPR32, simm7s4, "stp">; +def STPDpost : StorePairPostIdx<0b01, 1, FPR64, simm7s8, "stp">; +def STPQpost : StorePairPostIdx<0b10, 1, FPR128, simm7s16, "stp">; + +// Pair (no allocate) +defm STNPW : StorePairNoAlloc<0b00, 0, GPR32, simm7s4, "stnp">; +defm STNPX : StorePairNoAlloc<0b10, 0, GPR64, simm7s8, "stnp">; +defm STNPS : StorePairNoAlloc<0b00, 1, FPR32, simm7s4, "stnp">; +defm STNPD : StorePairNoAlloc<0b01, 1, FPR64, simm7s8, "stnp">; +defm STNPQ : StorePairNoAlloc<0b10, 1, FPR128, simm7s16, "stnp">; + +//--- +// (Register offset) + +// Integer +defm STRBB : Store8RO< 0b00, 0, 0b00, GPR32, "strb", i32, truncstorei8>; +defm STRHH : Store16RO<0b01, 0, 0b00, GPR32, "strh", i32, truncstorei16>; +defm STRW : Store32RO<0b10, 0, 0b00, GPR32, "str", i32, store>; +defm STRX : Store64RO<0b11, 0, 0b00, GPR64, "str", i64, store>; + + +// Floating-point +defm STRB : Store8RO< 0b00, 1, 0b00, FPR8, "str", untyped, store>; +defm STRH : Store16RO<0b01, 1, 0b00, FPR16, "str", f16, store>; +defm STRS : Store32RO<0b10, 1, 0b00, FPR32, "str", f32, store>; +defm STRD : Store64RO<0b11, 1, 0b00, FPR64, "str", f64, store>; +defm STRQ : Store128RO<0b00, 1, 0b10, FPR128, "str", f128, store>; + +multiclass TruncStoreFrom64ROPat { + + def : Pat<(storeop GPR64:$Rt, + (ro.Wpat GPR64sp:$Rn, GPR32:$Rm, ro.Wext:$extend)), + (STRW (EXTRACT_SUBREG GPR64:$Rt, sub_32), + GPR64sp:$Rn, GPR32:$Rm, ro.Wext:$extend)>; + + def : Pat<(storeop GPR64:$Rt, + (ro.Xpat GPR64sp:$Rn, GPR64:$Rm, ro.Xext:$extend)), + (STRX (EXTRACT_SUBREG GPR64:$Rt, sub_32), + GPR64sp:$Rn, GPR64:$Rm, ro.Xext:$extend)>; +} + +let AddedComplexity = 10 in { + // truncstore i64 + defm : TruncStoreFrom64ROPat; + defm : TruncStoreFrom64ROPat; + defm : TruncStoreFrom64ROPat; +} + +multiclass VecROStorePat { + def : Pat<(store (VecTy FPR:$Rt), + (ro.Wpat GPR64sp:$Rn, GPR32:$Rm, ro.Wext:$extend)), + (STRW FPR:$Rt, GPR64sp:$Rn, GPR32:$Rm, ro.Wext:$extend)>; + + def : Pat<(store (VecTy FPR:$Rt), + (ro.Xpat GPR64sp:$Rn, GPR64:$Rm, ro.Xext:$extend)), + (STRX FPR:$Rt, GPR64sp:$Rn, GPR64:$Rm, ro.Xext:$extend)>; +} + +let AddedComplexity = 10 in { +// Match all store 64 bits width whose type is compatible with FPR64 +let Predicates = [IsLE] in { + // We must use ST1 to store vectors in big-endian. 
+ defm : VecROStorePat; + defm : VecROStorePat; + defm : VecROStorePat; + defm : VecROStorePat; +} + +defm : VecROStorePat; +defm : VecROStorePat; + +// Match all store 128 bits width whose type is compatible with FPR128 +let Predicates = [IsLE] in { + // We must use ST1 to store vectors in big-endian. + defm : VecROStorePat; + defm : VecROStorePat; + defm : VecROStorePat; + defm : VecROStorePat; + defm : VecROStorePat; + defm : VecROStorePat; +} +} // AddedComplexity = 10 + +//--- +// (unsigned immediate) +defm STRX : StoreUI<0b11, 0, 0b00, GPR64, uimm12s8, "str", + [(store GPR64:$Rt, + (am_indexed64 GPR64sp:$Rn, uimm12s8:$offset))]>; +defm STRW : StoreUI<0b10, 0, 0b00, GPR32, uimm12s4, "str", + [(store GPR32:$Rt, + (am_indexed32 GPR64sp:$Rn, uimm12s4:$offset))]>; +defm STRB : StoreUI<0b00, 1, 0b00, FPR8, uimm12s1, "str", + [(store FPR8:$Rt, + (am_indexed8 GPR64sp:$Rn, uimm12s1:$offset))]>; +defm STRH : StoreUI<0b01, 1, 0b00, FPR16, uimm12s2, "str", + [(store (f16 FPR16:$Rt), + (am_indexed16 GPR64sp:$Rn, uimm12s2:$offset))]>; +defm STRS : StoreUI<0b10, 1, 0b00, FPR32, uimm12s4, "str", + [(store (f32 FPR32:$Rt), + (am_indexed32 GPR64sp:$Rn, uimm12s4:$offset))]>; +defm STRD : StoreUI<0b11, 1, 0b00, FPR64, uimm12s8, "str", + [(store (f64 FPR64:$Rt), + (am_indexed64 GPR64sp:$Rn, uimm12s8:$offset))]>; +defm STRQ : StoreUI<0b00, 1, 0b10, FPR128, uimm12s16, "str", []>; + +defm STRHH : StoreUI<0b01, 0, 0b00, GPR32, uimm12s2, "strh", + [(truncstorei16 GPR32:$Rt, + (am_indexed16 GPR64sp:$Rn, + uimm12s2:$offset))]>; +defm STRBB : StoreUI<0b00, 0, 0b00, GPR32, uimm12s1, "strb", + [(truncstorei8 GPR32:$Rt, + (am_indexed8 GPR64sp:$Rn, + uimm12s1:$offset))]>; + +// Match all store 64 bits width whose type is compatible with FPR64 +let AddedComplexity = 10 in { +let Predicates = [IsLE] in { + // We must use ST1 to store vectors in big-endian. + def : Pat<(store (v2f32 FPR64:$Rt), + (am_indexed64 GPR64sp:$Rn, uimm12s8:$offset)), + (STRDui FPR64:$Rt, GPR64sp:$Rn, uimm12s8:$offset)>; + def : Pat<(store (v8i8 FPR64:$Rt), + (am_indexed64 GPR64sp:$Rn, uimm12s8:$offset)), + (STRDui FPR64:$Rt, GPR64sp:$Rn, uimm12s8:$offset)>; + def : Pat<(store (v4i16 FPR64:$Rt), + (am_indexed64 GPR64sp:$Rn, uimm12s8:$offset)), + (STRDui FPR64:$Rt, GPR64sp:$Rn, uimm12s8:$offset)>; + def : Pat<(store (v2i32 FPR64:$Rt), + (am_indexed64 GPR64sp:$Rn, uimm12s8:$offset)), + (STRDui FPR64:$Rt, GPR64sp:$Rn, uimm12s8:$offset)>; +} +def : Pat<(store (v1f64 FPR64:$Rt), + (am_indexed64 GPR64sp:$Rn, uimm12s8:$offset)), + (STRDui FPR64:$Rt, GPR64sp:$Rn, uimm12s8:$offset)>; +def : Pat<(store (v1i64 FPR64:$Rt), + (am_indexed64 GPR64sp:$Rn, uimm12s8:$offset)), + (STRDui FPR64:$Rt, GPR64sp:$Rn, uimm12s8:$offset)>; + +// Match all store 128 bits width whose type is compatible with FPR128 +let Predicates = [IsLE] in { + // We must use ST1 to store vectors in big-endian. 
+ def : Pat<(store (v4f32 FPR128:$Rt), + (am_indexed128 GPR64sp:$Rn, uimm12s16:$offset)), + (STRQui FPR128:$Rt, GPR64sp:$Rn, uimm12s16:$offset)>; + def : Pat<(store (v2f64 FPR128:$Rt), + (am_indexed128 GPR64sp:$Rn, uimm12s16:$offset)), + (STRQui FPR128:$Rt, GPR64sp:$Rn, uimm12s16:$offset)>; + def : Pat<(store (v16i8 FPR128:$Rt), + (am_indexed128 GPR64sp:$Rn, uimm12s16:$offset)), + (STRQui FPR128:$Rt, GPR64sp:$Rn, uimm12s16:$offset)>; + def : Pat<(store (v8i16 FPR128:$Rt), + (am_indexed128 GPR64sp:$Rn, uimm12s16:$offset)), + (STRQui FPR128:$Rt, GPR64sp:$Rn, uimm12s16:$offset)>; + def : Pat<(store (v4i32 FPR128:$Rt), + (am_indexed128 GPR64sp:$Rn, uimm12s16:$offset)), + (STRQui FPR128:$Rt, GPR64sp:$Rn, uimm12s16:$offset)>; + def : Pat<(store (v2i64 FPR128:$Rt), + (am_indexed128 GPR64sp:$Rn, uimm12s16:$offset)), + (STRQui FPR128:$Rt, GPR64sp:$Rn, uimm12s16:$offset)>; +} +def : Pat<(store (f128 FPR128:$Rt), + (am_indexed128 GPR64sp:$Rn, uimm12s16:$offset)), + (STRQui FPR128:$Rt, GPR64sp:$Rn, uimm12s16:$offset)>; + +// truncstore i64 +def : Pat<(truncstorei32 GPR64:$Rt, + (am_indexed32 GPR64sp:$Rn, uimm12s4:$offset)), + (STRWui (EXTRACT_SUBREG GPR64:$Rt, sub_32), GPR64sp:$Rn, uimm12s4:$offset)>; +def : Pat<(truncstorei16 GPR64:$Rt, + (am_indexed16 GPR64sp:$Rn, uimm12s2:$offset)), + (STRHHui (EXTRACT_SUBREG GPR64:$Rt, sub_32), GPR64sp:$Rn, uimm12s2:$offset)>; +def : Pat<(truncstorei8 GPR64:$Rt, (am_indexed8 GPR64sp:$Rn, uimm12s1:$offset)), + (STRBBui (EXTRACT_SUBREG GPR64:$Rt, sub_32), GPR64sp:$Rn, uimm12s1:$offset)>; + +} // AddedComplexity = 10 + +//--- +// (unscaled immediate) +defm STURX : StoreUnscaled<0b11, 0, 0b00, GPR64, "stur", + [(store GPR64:$Rt, + (am_unscaled64 GPR64sp:$Rn, simm9:$offset))]>; +defm STURW : StoreUnscaled<0b10, 0, 0b00, GPR32, "stur", + [(store GPR32:$Rt, + (am_unscaled32 GPR64sp:$Rn, simm9:$offset))]>; +defm STURB : StoreUnscaled<0b00, 1, 0b00, FPR8, "stur", + [(store FPR8:$Rt, + (am_unscaled8 GPR64sp:$Rn, simm9:$offset))]>; +defm STURH : StoreUnscaled<0b01, 1, 0b00, FPR16, "stur", + [(store (f16 FPR16:$Rt), + (am_unscaled16 GPR64sp:$Rn, simm9:$offset))]>; +defm STURS : StoreUnscaled<0b10, 1, 0b00, FPR32, "stur", + [(store (f32 FPR32:$Rt), + (am_unscaled32 GPR64sp:$Rn, simm9:$offset))]>; +defm STURD : StoreUnscaled<0b11, 1, 0b00, FPR64, "stur", + [(store (f64 FPR64:$Rt), + (am_unscaled64 GPR64sp:$Rn, simm9:$offset))]>; +defm STURQ : StoreUnscaled<0b00, 1, 0b10, FPR128, "stur", + [(store (f128 FPR128:$Rt), + (am_unscaled128 GPR64sp:$Rn, simm9:$offset))]>; +defm STURHH : StoreUnscaled<0b01, 0, 0b00, GPR32, "sturh", + [(truncstorei16 GPR32:$Rt, + (am_unscaled16 GPR64sp:$Rn, simm9:$offset))]>; +defm STURBB : StoreUnscaled<0b00, 0, 0b00, GPR32, "sturb", + [(truncstorei8 GPR32:$Rt, + (am_unscaled8 GPR64sp:$Rn, simm9:$offset))]>; + +// Match all store 64 bits width whose type is compatible with FPR64 +let Predicates = [IsLE] in { + // We must use ST1 to store vectors in big-endian. 
+ def : Pat<(store (v2f32 FPR64:$Rt), + (am_unscaled64 GPR64sp:$Rn, simm9:$offset)), + (STURDi FPR64:$Rt, GPR64sp:$Rn, simm9:$offset)>; + def : Pat<(store (v8i8 FPR64:$Rt), + (am_unscaled64 GPR64sp:$Rn, simm9:$offset)), + (STURDi FPR64:$Rt, GPR64sp:$Rn, simm9:$offset)>; + def : Pat<(store (v4i16 FPR64:$Rt), + (am_unscaled64 GPR64sp:$Rn, simm9:$offset)), + (STURDi FPR64:$Rt, GPR64sp:$Rn, simm9:$offset)>; + def : Pat<(store (v2i32 FPR64:$Rt), + (am_unscaled64 GPR64sp:$Rn, simm9:$offset)), + (STURDi FPR64:$Rt, GPR64sp:$Rn, simm9:$offset)>; +} +def : Pat<(store (v1f64 FPR64:$Rt), (am_unscaled64 GPR64sp:$Rn, simm9:$offset)), + (STURDi FPR64:$Rt, GPR64sp:$Rn, simm9:$offset)>; +def : Pat<(store (v1i64 FPR64:$Rt), (am_unscaled64 GPR64sp:$Rn, simm9:$offset)), + (STURDi FPR64:$Rt, GPR64sp:$Rn, simm9:$offset)>; + +// Match all store 128 bits width whose type is compatible with FPR128 +let Predicates = [IsLE] in { + // We must use ST1 to store vectors in big-endian. + def : Pat<(store (v4f32 FPR128:$Rt), + (am_unscaled128 GPR64sp:$Rn, simm9:$offset)), + (STURQi FPR128:$Rt, GPR64sp:$Rn, simm9:$offset)>; + def : Pat<(store (v2f64 FPR128:$Rt), + (am_unscaled128 GPR64sp:$Rn, simm9:$offset)), + (STURQi FPR128:$Rt, GPR64sp:$Rn, simm9:$offset)>; + def : Pat<(store (v16i8 FPR128:$Rt), + (am_unscaled128 GPR64sp:$Rn, simm9:$offset)), + (STURQi FPR128:$Rt, GPR64sp:$Rn, simm9:$offset)>; + def : Pat<(store (v8i16 FPR128:$Rt), + (am_unscaled128 GPR64sp:$Rn, simm9:$offset)), + (STURQi FPR128:$Rt, GPR64sp:$Rn, simm9:$offset)>; + def : Pat<(store (v4i32 FPR128:$Rt), + (am_unscaled128 GPR64sp:$Rn, simm9:$offset)), + (STURQi FPR128:$Rt, GPR64sp:$Rn, simm9:$offset)>; + def : Pat<(store (v2i64 FPR128:$Rt), + (am_unscaled128 GPR64sp:$Rn, simm9:$offset)), + (STURQi FPR128:$Rt, GPR64sp:$Rn, simm9:$offset)>; + def : Pat<(store (v2f64 FPR128:$Rt), + (am_unscaled128 GPR64sp:$Rn, simm9:$offset)), + (STURQi FPR128:$Rt, GPR64sp:$Rn, simm9:$offset)>; +} + +// unscaled i64 truncating stores +def : Pat<(truncstorei32 GPR64:$Rt, (am_unscaled32 GPR64sp:$Rn, simm9:$offset)), + (STURWi (EXTRACT_SUBREG GPR64:$Rt, sub_32), GPR64sp:$Rn, simm9:$offset)>; +def : Pat<(truncstorei16 GPR64:$Rt, (am_unscaled16 GPR64sp:$Rn, simm9:$offset)), + (STURHHi (EXTRACT_SUBREG GPR64:$Rt, sub_32), GPR64sp:$Rn, simm9:$offset)>; +def : Pat<(truncstorei8 GPR64:$Rt, (am_unscaled8 GPR64sp:$Rn, simm9:$offset)), + (STURBBi (EXTRACT_SUBREG GPR64:$Rt, sub_32), GPR64sp:$Rn, simm9:$offset)>; + +//--- +// STR mnemonics fall back to STUR for negative or unaligned offsets. 
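The store side mirrors the load side: a negative or unaligned offset cannot be encoded in the scaled STR form, so the aliases below hand it to the unscaled STUR encoding instead. A self-contained C++ illustration under the same assumptions as the load-side sketch (the helper name is again illustrative only):

  #include <cstdint>

  constexpr bool ScaledStrCanEncode(int64_t Offset, int64_t AccessSize) {
    return Offset >= 0 && Offset % AccessSize == 0 && Offset / AccessSize <= 4095;
  }
  // "str x0, [x1, #-8]" cannot use the scaled form, so it becomes STURXi;
  // "str x0, [x1, #16]" stays in the scaled STRXui form.
  static_assert(!ScaledStrCanEncode(-8, 8), "falls back to the STUR encoding");
  static_assert(ScaledStrCanEncode(16, 8), "keeps the scaled STR encoding");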
+def : InstAlias<"str $Rt, [$Rn, $offset]", + (STURXi GPR64:$Rt, GPR64sp:$Rn, simm9_offset_fb64:$offset), 0>; +def : InstAlias<"str $Rt, [$Rn, $offset]", + (STURWi GPR32:$Rt, GPR64sp:$Rn, simm9_offset_fb32:$offset), 0>; +def : InstAlias<"str $Rt, [$Rn, $offset]", + (STURBi FPR8:$Rt, GPR64sp:$Rn, simm9_offset_fb8:$offset), 0>; +def : InstAlias<"str $Rt, [$Rn, $offset]", + (STURHi FPR16:$Rt, GPR64sp:$Rn, simm9_offset_fb16:$offset), 0>; +def : InstAlias<"str $Rt, [$Rn, $offset]", + (STURSi FPR32:$Rt, GPR64sp:$Rn, simm9_offset_fb32:$offset), 0>; +def : InstAlias<"str $Rt, [$Rn, $offset]", + (STURDi FPR64:$Rt, GPR64sp:$Rn, simm9_offset_fb64:$offset), 0>; +def : InstAlias<"str $Rt, [$Rn, $offset]", + (STURQi FPR128:$Rt, GPR64sp:$Rn, simm9_offset_fb128:$offset), 0>; + +def : InstAlias<"strb $Rt, [$Rn, $offset]", + (STURBBi GPR32:$Rt, GPR64sp:$Rn, simm9_offset_fb8:$offset), 0>; +def : InstAlias<"strh $Rt, [$Rn, $offset]", + (STURHHi GPR32:$Rt, GPR64sp:$Rn, simm9_offset_fb16:$offset), 0>; + +//--- +// (unscaled immediate, unprivileged) +defm STTRW : StoreUnprivileged<0b10, 0, 0b00, GPR32, "sttr">; +defm STTRX : StoreUnprivileged<0b11, 0, 0b00, GPR64, "sttr">; + +defm STTRH : StoreUnprivileged<0b01, 0, 0b00, GPR32, "sttrh">; +defm STTRB : StoreUnprivileged<0b00, 0, 0b00, GPR32, "sttrb">; + +//--- +// (immediate pre-indexed) +def STRWpre : StorePreIdx<0b10, 0, 0b00, GPR32, "str", pre_store, i32>; +def STRXpre : StorePreIdx<0b11, 0, 0b00, GPR64, "str", pre_store, i64>; +def STRBpre : StorePreIdx<0b00, 1, 0b00, FPR8, "str", pre_store, untyped>; +def STRHpre : StorePreIdx<0b01, 1, 0b00, FPR16, "str", pre_store, f16>; +def STRSpre : StorePreIdx<0b10, 1, 0b00, FPR32, "str", pre_store, f32>; +def STRDpre : StorePreIdx<0b11, 1, 0b00, FPR64, "str", pre_store, f64>; +def STRQpre : StorePreIdx<0b00, 1, 0b10, FPR128, "str", pre_store, f128>; + +def STRBBpre : StorePreIdx<0b00, 0, 0b00, GPR32, "strb", pre_truncsti8, i32>; +def STRHHpre : StorePreIdx<0b01, 0, 0b00, GPR32, "strh", pre_truncsti16, i32>; + +// truncstore i64 +def : Pat<(pre_truncsti32 GPR64:$Rt, GPR64sp:$addr, simm9:$off), + (STRWpre (EXTRACT_SUBREG GPR64:$Rt, sub_32), GPR64sp:$addr, + simm9:$off)>; +def : Pat<(pre_truncsti16 GPR64:$Rt, GPR64sp:$addr, simm9:$off), + (STRHHpre (EXTRACT_SUBREG GPR64:$Rt, sub_32), GPR64sp:$addr, + simm9:$off)>; +def : Pat<(pre_truncsti8 GPR64:$Rt, GPR64sp:$addr, simm9:$off), + (STRBBpre (EXTRACT_SUBREG GPR64:$Rt, sub_32), GPR64sp:$addr, + simm9:$off)>; + +def : Pat<(pre_store (v8i8 FPR64:$Rt), GPR64sp:$addr, simm9:$off), + (STRDpre FPR64:$Rt, GPR64sp:$addr, simm9:$off)>; +def : Pat<(pre_store (v4i16 FPR64:$Rt), GPR64sp:$addr, simm9:$off), + (STRDpre FPR64:$Rt, GPR64sp:$addr, simm9:$off)>; +def : Pat<(pre_store (v2i32 FPR64:$Rt), GPR64sp:$addr, simm9:$off), + (STRDpre FPR64:$Rt, GPR64sp:$addr, simm9:$off)>; +def : Pat<(pre_store (v2f32 FPR64:$Rt), GPR64sp:$addr, simm9:$off), + (STRDpre FPR64:$Rt, GPR64sp:$addr, simm9:$off)>; +def : Pat<(pre_store (v1i64 FPR64:$Rt), GPR64sp:$addr, simm9:$off), + (STRDpre FPR64:$Rt, GPR64sp:$addr, simm9:$off)>; +def : Pat<(pre_store (v1f64 FPR64:$Rt), GPR64sp:$addr, simm9:$off), + (STRDpre FPR64:$Rt, GPR64sp:$addr, simm9:$off)>; + +def : Pat<(pre_store (v16i8 FPR128:$Rt), GPR64sp:$addr, simm9:$off), + (STRQpre FPR128:$Rt, GPR64sp:$addr, simm9:$off)>; +def : Pat<(pre_store (v8i16 FPR128:$Rt), GPR64sp:$addr, simm9:$off), + (STRQpre FPR128:$Rt, GPR64sp:$addr, simm9:$off)>; +def : Pat<(pre_store (v4i32 FPR128:$Rt), GPR64sp:$addr, simm9:$off), + (STRQpre FPR128:$Rt, GPR64sp:$addr, 
simm9:$off)>; +def : Pat<(pre_store (v4f32 FPR128:$Rt), GPR64sp:$addr, simm9:$off), + (STRQpre FPR128:$Rt, GPR64sp:$addr, simm9:$off)>; +def : Pat<(pre_store (v2i64 FPR128:$Rt), GPR64sp:$addr, simm9:$off), + (STRQpre FPR128:$Rt, GPR64sp:$addr, simm9:$off)>; +def : Pat<(pre_store (v2f64 FPR128:$Rt), GPR64sp:$addr, simm9:$off), + (STRQpre FPR128:$Rt, GPR64sp:$addr, simm9:$off)>; + +//--- +// (immediate post-indexed) +def STRWpost : StorePostIdx<0b10, 0, 0b00, GPR32, "str", post_store, i32>; +def STRXpost : StorePostIdx<0b11, 0, 0b00, GPR64, "str", post_store, i64>; +def STRBpost : StorePostIdx<0b00, 1, 0b00, FPR8, "str", post_store, untyped>; +def STRHpost : StorePostIdx<0b01, 1, 0b00, FPR16, "str", post_store, f16>; +def STRSpost : StorePostIdx<0b10, 1, 0b00, FPR32, "str", post_store, f32>; +def STRDpost : StorePostIdx<0b11, 1, 0b00, FPR64, "str", post_store, f64>; +def STRQpost : StorePostIdx<0b00, 1, 0b10, FPR128, "str", post_store, f128>; + +def STRBBpost : StorePostIdx<0b00, 0, 0b00, GPR32, "strb", post_truncsti8, i32>; +def STRHHpost : StorePostIdx<0b01, 0, 0b00, GPR32, "strh", post_truncsti16, i32>; + +// truncstore i64 +def : Pat<(post_truncsti32 GPR64:$Rt, GPR64sp:$addr, simm9:$off), + (STRWpost (EXTRACT_SUBREG GPR64:$Rt, sub_32), GPR64sp:$addr, + simm9:$off)>; +def : Pat<(post_truncsti16 GPR64:$Rt, GPR64sp:$addr, simm9:$off), + (STRHHpost (EXTRACT_SUBREG GPR64:$Rt, sub_32), GPR64sp:$addr, + simm9:$off)>; +def : Pat<(post_truncsti8 GPR64:$Rt, GPR64sp:$addr, simm9:$off), + (STRBBpost (EXTRACT_SUBREG GPR64:$Rt, sub_32), GPR64sp:$addr, + simm9:$off)>; + +def : Pat<(post_store (v8i8 FPR64:$Rt), GPR64sp:$addr, simm9:$off), + (STRDpost FPR64:$Rt, GPR64sp:$addr, simm9:$off)>; +def : Pat<(post_store (v4i16 FPR64:$Rt), GPR64sp:$addr, simm9:$off), + (STRDpost FPR64:$Rt, GPR64sp:$addr, simm9:$off)>; +def : Pat<(post_store (v2i32 FPR64:$Rt), GPR64sp:$addr, simm9:$off), + (STRDpost FPR64:$Rt, GPR64sp:$addr, simm9:$off)>; +def : Pat<(post_store (v2f32 FPR64:$Rt), GPR64sp:$addr, simm9:$off), + (STRDpost FPR64:$Rt, GPR64sp:$addr, simm9:$off)>; +def : Pat<(post_store (v1i64 FPR64:$Rt), GPR64sp:$addr, simm9:$off), + (STRDpost FPR64:$Rt, GPR64sp:$addr, simm9:$off)>; +def : Pat<(post_store (v1f64 FPR64:$Rt), GPR64sp:$addr, simm9:$off), + (STRDpost FPR64:$Rt, GPR64sp:$addr, simm9:$off)>; + +def : Pat<(post_store (v16i8 FPR128:$Rt), GPR64sp:$addr, simm9:$off), + (STRQpost FPR128:$Rt, GPR64sp:$addr, simm9:$off)>; +def : Pat<(post_store (v8i16 FPR128:$Rt), GPR64sp:$addr, simm9:$off), + (STRQpost FPR128:$Rt, GPR64sp:$addr, simm9:$off)>; +def : Pat<(post_store (v4i32 FPR128:$Rt), GPR64sp:$addr, simm9:$off), + (STRQpost FPR128:$Rt, GPR64sp:$addr, simm9:$off)>; +def : Pat<(post_store (v4f32 FPR128:$Rt), GPR64sp:$addr, simm9:$off), + (STRQpost FPR128:$Rt, GPR64sp:$addr, simm9:$off)>; +def : Pat<(post_store (v2i64 FPR128:$Rt), GPR64sp:$addr, simm9:$off), + (STRQpost FPR128:$Rt, GPR64sp:$addr, simm9:$off)>; +def : Pat<(post_store (v2f64 FPR128:$Rt), GPR64sp:$addr, simm9:$off), + (STRQpost FPR128:$Rt, GPR64sp:$addr, simm9:$off)>; + +//===----------------------------------------------------------------------===// +// Load/store exclusive instructions. 
+//===----------------------------------------------------------------------===// + +def LDARW : LoadAcquire <0b10, 1, 1, 0, 1, GPR32, "ldar">; +def LDARX : LoadAcquire <0b11, 1, 1, 0, 1, GPR64, "ldar">; +def LDARB : LoadAcquire <0b00, 1, 1, 0, 1, GPR32, "ldarb">; +def LDARH : LoadAcquire <0b01, 1, 1, 0, 1, GPR32, "ldarh">; + +def LDAXRW : LoadExclusive <0b10, 0, 1, 0, 1, GPR32, "ldaxr">; +def LDAXRX : LoadExclusive <0b11, 0, 1, 0, 1, GPR64, "ldaxr">; +def LDAXRB : LoadExclusive <0b00, 0, 1, 0, 1, GPR32, "ldaxrb">; +def LDAXRH : LoadExclusive <0b01, 0, 1, 0, 1, GPR32, "ldaxrh">; + +def LDXRW : LoadExclusive <0b10, 0, 1, 0, 0, GPR32, "ldxr">; +def LDXRX : LoadExclusive <0b11, 0, 1, 0, 0, GPR64, "ldxr">; +def LDXRB : LoadExclusive <0b00, 0, 1, 0, 0, GPR32, "ldxrb">; +def LDXRH : LoadExclusive <0b01, 0, 1, 0, 0, GPR32, "ldxrh">; + +def STLRW : StoreRelease <0b10, 1, 0, 0, 1, GPR32, "stlr">; +def STLRX : StoreRelease <0b11, 1, 0, 0, 1, GPR64, "stlr">; +def STLRB : StoreRelease <0b00, 1, 0, 0, 1, GPR32, "stlrb">; +def STLRH : StoreRelease <0b01, 1, 0, 0, 1, GPR32, "stlrh">; + +def STLXRW : StoreExclusive<0b10, 0, 0, 0, 1, GPR32, "stlxr">; +def STLXRX : StoreExclusive<0b11, 0, 0, 0, 1, GPR64, "stlxr">; +def STLXRB : StoreExclusive<0b00, 0, 0, 0, 1, GPR32, "stlxrb">; +def STLXRH : StoreExclusive<0b01, 0, 0, 0, 1, GPR32, "stlxrh">; + +def STXRW : StoreExclusive<0b10, 0, 0, 0, 0, GPR32, "stxr">; +def STXRX : StoreExclusive<0b11, 0, 0, 0, 0, GPR64, "stxr">; +def STXRB : StoreExclusive<0b00, 0, 0, 0, 0, GPR32, "stxrb">; +def STXRH : StoreExclusive<0b01, 0, 0, 0, 0, GPR32, "stxrh">; + +def LDAXPW : LoadExclusivePair<0b10, 0, 1, 1, 1, GPR32, "ldaxp">; +def LDAXPX : LoadExclusivePair<0b11, 0, 1, 1, 1, GPR64, "ldaxp">; + +def LDXPW : LoadExclusivePair<0b10, 0, 1, 1, 0, GPR32, "ldxp">; +def LDXPX : LoadExclusivePair<0b11, 0, 1, 1, 0, GPR64, "ldxp">; + +def STLXPW : StoreExclusivePair<0b10, 0, 0, 1, 1, GPR32, "stlxp">; +def STLXPX : StoreExclusivePair<0b11, 0, 0, 1, 1, GPR64, "stlxp">; + +def STXPW : StoreExclusivePair<0b10, 0, 0, 1, 0, GPR32, "stxp">; +def STXPX : StoreExclusivePair<0b11, 0, 0, 1, 0, GPR64, "stxp">; + +//===----------------------------------------------------------------------===// +// Scaled floating point to integer conversion instructions. 
+//===----------------------------------------------------------------------===// + +defm FCVTAS : FPToIntegerUnscaled<0b00, 0b100, "fcvtas", int_aarch64_neon_fcvtas>; +defm FCVTAU : FPToIntegerUnscaled<0b00, 0b101, "fcvtau", int_aarch64_neon_fcvtau>; +defm FCVTMS : FPToIntegerUnscaled<0b10, 0b000, "fcvtms", int_aarch64_neon_fcvtms>; +defm FCVTMU : FPToIntegerUnscaled<0b10, 0b001, "fcvtmu", int_aarch64_neon_fcvtmu>; +defm FCVTNS : FPToIntegerUnscaled<0b00, 0b000, "fcvtns", int_aarch64_neon_fcvtns>; +defm FCVTNU : FPToIntegerUnscaled<0b00, 0b001, "fcvtnu", int_aarch64_neon_fcvtnu>; +defm FCVTPS : FPToIntegerUnscaled<0b01, 0b000, "fcvtps", int_aarch64_neon_fcvtps>; +defm FCVTPU : FPToIntegerUnscaled<0b01, 0b001, "fcvtpu", int_aarch64_neon_fcvtpu>; +defm FCVTZS : FPToIntegerUnscaled<0b11, 0b000, "fcvtzs", fp_to_sint>; +defm FCVTZU : FPToIntegerUnscaled<0b11, 0b001, "fcvtzu", fp_to_uint>; +defm FCVTZS : FPToIntegerScaled<0b11, 0b000, "fcvtzs", fp_to_sint>; +defm FCVTZU : FPToIntegerScaled<0b11, 0b001, "fcvtzu", fp_to_uint>; +let isCodeGenOnly = 1 in { +defm FCVTZS_Int : FPToIntegerUnscaled<0b11, 0b000, "fcvtzs", int_aarch64_neon_fcvtzs>; +defm FCVTZU_Int : FPToIntegerUnscaled<0b11, 0b001, "fcvtzu", int_aarch64_neon_fcvtzu>; +defm FCVTZS_Int : FPToIntegerScaled<0b11, 0b000, "fcvtzs", int_aarch64_neon_fcvtzs>; +defm FCVTZU_Int : FPToIntegerScaled<0b11, 0b001, "fcvtzu", int_aarch64_neon_fcvtzu>; +} + +//===----------------------------------------------------------------------===// +// Scaled integer to floating point conversion instructions. +//===----------------------------------------------------------------------===// + +defm SCVTF : IntegerToFP<0, "scvtf", sint_to_fp>; +defm UCVTF : IntegerToFP<1, "ucvtf", uint_to_fp>; + +//===----------------------------------------------------------------------===// +// Unscaled integer to floating point conversion instruction. +//===----------------------------------------------------------------------===// + +defm FMOV : UnscaledConversion<"fmov">; + +def : Pat<(f32 (fpimm0)), (FMOVWSr WZR)>, Requires<[NoZCZ]>; +def : Pat<(f64 (fpimm0)), (FMOVXDr XZR)>, Requires<[NoZCZ]>; + +//===----------------------------------------------------------------------===// +// Floating point conversion instruction. +//===----------------------------------------------------------------------===// + +defm FCVT : FPConversion<"fcvt">; + +def : Pat<(f32_to_f16 FPR32:$Rn), + (i32 (COPY_TO_REGCLASS + (f32 (SUBREG_TO_REG (i32 0), (FCVTHSr FPR32:$Rn), hsub)), + GPR32))>; + +def FCVTSHpseudo : Pseudo<(outs FPR32:$Rd), (ins FPR32:$Rn), + [(set (f32 FPR32:$Rd), (f16_to_f32 i32:$Rn))]>; + +//===----------------------------------------------------------------------===// +// Floating point single operand instructions. +//===----------------------------------------------------------------------===// + +defm FABS : SingleOperandFPData<0b0001, "fabs", fabs>; +defm FMOV : SingleOperandFPData<0b0000, "fmov">; +defm FNEG : SingleOperandFPData<0b0010, "fneg", fneg>; +defm FRINTA : SingleOperandFPData<0b1100, "frinta", frnd>; +defm FRINTI : SingleOperandFPData<0b1111, "frinti", fnearbyint>; +defm FRINTM : SingleOperandFPData<0b1010, "frintm", ffloor>; +defm FRINTN : SingleOperandFPData<0b1000, "frintn", int_aarch64_neon_frintn>; +defm FRINTP : SingleOperandFPData<0b1001, "frintp", fceil>; + +def : Pat<(v1f64 (int_aarch64_neon_frintn (v1f64 FPR64:$Rn))), + (FRINTNDr FPR64:$Rn)>; + +// FRINTX is inserted to set the flags as required by FENV_ACCESS ON behavior +// in the C spec. 
Setting hasSideEffects ensures it is not DCE'd. +// +// TODO: We should really model the FPSR flags correctly. This is really ugly. +let hasSideEffects = 1 in { +defm FRINTX : SingleOperandFPData<0b1110, "frintx", frint>; +} + +defm FRINTZ : SingleOperandFPData<0b1011, "frintz", ftrunc>; + +let SchedRW = [WriteFDiv] in { +defm FSQRT : SingleOperandFPData<0b0011, "fsqrt", fsqrt>; +} + +//===----------------------------------------------------------------------===// +// Floating point two operand instructions. +//===----------------------------------------------------------------------===// + +defm FADD : TwoOperandFPData<0b0010, "fadd", fadd>; +let SchedRW = [WriteFDiv] in { +defm FDIV : TwoOperandFPData<0b0001, "fdiv", fdiv>; +} +defm FMAXNM : TwoOperandFPData<0b0110, "fmaxnm", int_aarch64_neon_fmaxnm>; +defm FMAX : TwoOperandFPData<0b0100, "fmax", AArch64fmax>; +defm FMINNM : TwoOperandFPData<0b0111, "fminnm", int_aarch64_neon_fminnm>; +defm FMIN : TwoOperandFPData<0b0101, "fmin", AArch64fmin>; +let SchedRW = [WriteFMul] in { +defm FMUL : TwoOperandFPData<0b0000, "fmul", fmul>; +defm FNMUL : TwoOperandFPDataNeg<0b1000, "fnmul", fmul>; +} +defm FSUB : TwoOperandFPData<0b0011, "fsub", fsub>; + +def : Pat<(v1f64 (AArch64fmax (v1f64 FPR64:$Rn), (v1f64 FPR64:$Rm))), + (FMAXDrr FPR64:$Rn, FPR64:$Rm)>; +def : Pat<(v1f64 (AArch64fmin (v1f64 FPR64:$Rn), (v1f64 FPR64:$Rm))), + (FMINDrr FPR64:$Rn, FPR64:$Rm)>; +def : Pat<(v1f64 (int_aarch64_neon_fmaxnm (v1f64 FPR64:$Rn), (v1f64 FPR64:$Rm))), + (FMAXNMDrr FPR64:$Rn, FPR64:$Rm)>; +def : Pat<(v1f64 (int_aarch64_neon_fminnm (v1f64 FPR64:$Rn), (v1f64 FPR64:$Rm))), + (FMINNMDrr FPR64:$Rn, FPR64:$Rm)>; + +//===----------------------------------------------------------------------===// +// Floating point three operand instructions. +//===----------------------------------------------------------------------===// + +defm FMADD : ThreeOperandFPData<0, 0, "fmadd", fma>; +defm FMSUB : ThreeOperandFPData<0, 1, "fmsub", + TriOpFrag<(fma node:$LHS, (fneg node:$MHS), node:$RHS)> >; +defm FNMADD : ThreeOperandFPData<1, 0, "fnmadd", + TriOpFrag<(fneg (fma node:$LHS, node:$MHS, node:$RHS))> >; +defm FNMSUB : ThreeOperandFPData<1, 1, "fnmsub", + TriOpFrag<(fma node:$LHS, node:$MHS, (fneg node:$RHS))> >; + +// The following def pats catch the case where the LHS of an FMA is negated. +// The TriOpFrag above catches the case where the middle operand is negated. + +// N.b. FMSUB etc have the accumulator at the *end* of (outs), unlike +// the NEON variant. +def : Pat<(f32 (fma (fneg FPR32:$Rn), FPR32:$Rm, FPR32:$Ra)), + (FMSUBSrrr FPR32:$Rn, FPR32:$Rm, FPR32:$Ra)>; + +def : Pat<(f64 (fma (fneg FPR64:$Rn), FPR64:$Rm, FPR64:$Ra)), + (FMSUBDrrr FPR64:$Rn, FPR64:$Rm, FPR64:$Ra)>; + +// We handled -(a + b*c) for FNMADD above, now it's time for "(-a) + (-b)*c" and +// "(-a) + b*(-c)". +def : Pat<(f32 (fma (fneg FPR32:$Rn), FPR32:$Rm, (fneg FPR32:$Ra))), + (FNMADDSrrr FPR32:$Rn, FPR32:$Rm, FPR32:$Ra)>; + +def : Pat<(f64 (fma (fneg FPR64:$Rn), FPR64:$Rm, (fneg FPR64:$Ra))), + (FNMADDDrrr FPR64:$Rn, FPR64:$Rm, FPR64:$Ra)>; + +def : Pat<(f32 (fma FPR32:$Rn, (fneg FPR32:$Rm), (fneg FPR32:$Ra))), + (FNMADDSrrr FPR32:$Rn, FPR32:$Rm, FPR32:$Ra)>; + +def : Pat<(f64 (fma FPR64:$Rn, (fneg FPR64:$Rm), (fneg FPR64:$Ra))), + (FNMADDDrrr FPR64:$Rn, FPR64:$Rm, FPR64:$Ra)>; + +//===----------------------------------------------------------------------===// +// Floating point comparison instructions. 
+//===----------------------------------------------------------------------===// + +defm FCMPE : FPComparison<1, "fcmpe">; +defm FCMP : FPComparison<0, "fcmp", AArch64fcmp>; + +//===----------------------------------------------------------------------===// +// Floating point conditional comparison instructions. +//===----------------------------------------------------------------------===// + +defm FCCMPE : FPCondComparison<1, "fccmpe">; +defm FCCMP : FPCondComparison<0, "fccmp">; + +//===----------------------------------------------------------------------===// +// Floating point conditional select instruction. +//===----------------------------------------------------------------------===// + +defm FCSEL : FPCondSelect<"fcsel">; + +// CSEL instructions providing f128 types need to be handled by a +// pseudo-instruction since the eventual code will need to introduce basic +// blocks and control flow. +def F128CSEL : Pseudo<(outs FPR128:$Rd), + (ins FPR128:$Rn, FPR128:$Rm, ccode:$cond), + [(set (f128 FPR128:$Rd), + (AArch64csel FPR128:$Rn, FPR128:$Rm, + (i32 imm:$cond), NZCV))]> { + let Uses = [NZCV]; + let usesCustomInserter = 1; +} + + +//===----------------------------------------------------------------------===// +// Floating point immediate move. +//===----------------------------------------------------------------------===// + +let isReMaterializable = 1 in { +defm FMOV : FPMoveImmediate<"fmov">; +} + +//===----------------------------------------------------------------------===// +// Advanced SIMD two vector instructions. +//===----------------------------------------------------------------------===// + +defm ABS : SIMDTwoVectorBHSD<0, 0b01011, "abs", int_aarch64_neon_abs>; +defm CLS : SIMDTwoVectorBHS<0, 0b00100, "cls", int_aarch64_neon_cls>; +defm CLZ : SIMDTwoVectorBHS<1, 0b00100, "clz", ctlz>; +defm CMEQ : SIMDCmpTwoVector<0, 0b01001, "cmeq", AArch64cmeqz>; +defm CMGE : SIMDCmpTwoVector<1, 0b01000, "cmge", AArch64cmgez>; +defm CMGT : SIMDCmpTwoVector<0, 0b01000, "cmgt", AArch64cmgtz>; +defm CMLE : SIMDCmpTwoVector<1, 0b01001, "cmle", AArch64cmlez>; +defm CMLT : SIMDCmpTwoVector<0, 0b01010, "cmlt", AArch64cmltz>; +defm CNT : SIMDTwoVectorB<0, 0b00, 0b00101, "cnt", ctpop>; +defm FABS : SIMDTwoVectorFP<0, 1, 0b01111, "fabs", fabs>; + +defm FCMEQ : SIMDFPCmpTwoVector<0, 1, 0b01101, "fcmeq", AArch64fcmeqz>; +defm FCMGE : SIMDFPCmpTwoVector<1, 1, 0b01100, "fcmge", AArch64fcmgez>; +defm FCMGT : SIMDFPCmpTwoVector<0, 1, 0b01100, "fcmgt", AArch64fcmgtz>; +defm FCMLE : SIMDFPCmpTwoVector<1, 1, 0b01101, "fcmle", AArch64fcmlez>; +defm FCMLT : SIMDFPCmpTwoVector<0, 1, 0b01110, "fcmlt", AArch64fcmltz>; +defm FCVTAS : SIMDTwoVectorFPToInt<0,0,0b11100, "fcvtas",int_aarch64_neon_fcvtas>; +defm FCVTAU : SIMDTwoVectorFPToInt<1,0,0b11100, "fcvtau",int_aarch64_neon_fcvtau>; +defm FCVTL : SIMDFPWidenTwoVector<0, 0, 0b10111, "fcvtl">; +def : Pat<(v4f32 (int_aarch64_neon_vcvthf2fp (v4i16 V64:$Rn))), + (FCVTLv4i16 V64:$Rn)>; +def : Pat<(v4f32 (int_aarch64_neon_vcvthf2fp (extract_subvector (v8i16 V128:$Rn), + (i64 4)))), + (FCVTLv8i16 V128:$Rn)>; +def : Pat<(v2f64 (fextend (v2f32 V64:$Rn))), (FCVTLv2i32 V64:$Rn)>; +def : Pat<(v2f64 (fextend (v2f32 (extract_subvector (v4f32 V128:$Rn), + (i64 2))))), + (FCVTLv4i32 V128:$Rn)>; + +defm FCVTMS : SIMDTwoVectorFPToInt<0,0,0b11011, "fcvtms",int_aarch64_neon_fcvtms>; +defm FCVTMU : SIMDTwoVectorFPToInt<1,0,0b11011, "fcvtmu",int_aarch64_neon_fcvtmu>; +defm FCVTNS : SIMDTwoVectorFPToInt<0,0,0b11010, "fcvtns",int_aarch64_neon_fcvtns>; +defm FCVTNU : 
SIMDTwoVectorFPToInt<1,0,0b11010, "fcvtnu",int_aarch64_neon_fcvtnu>; +defm FCVTN : SIMDFPNarrowTwoVector<0, 0, 0b10110, "fcvtn">; +def : Pat<(v4i16 (int_aarch64_neon_vcvtfp2hf (v4f32 V128:$Rn))), + (FCVTNv4i16 V128:$Rn)>; +def : Pat<(concat_vectors V64:$Rd, + (v4i16 (int_aarch64_neon_vcvtfp2hf (v4f32 V128:$Rn)))), + (FCVTNv8i16 (INSERT_SUBREG (IMPLICIT_DEF), V64:$Rd, dsub), V128:$Rn)>; +def : Pat<(v2f32 (fround (v2f64 V128:$Rn))), (FCVTNv2i32 V128:$Rn)>; +def : Pat<(concat_vectors V64:$Rd, (v2f32 (fround (v2f64 V128:$Rn)))), + (FCVTNv4i32 (INSERT_SUBREG (IMPLICIT_DEF), V64:$Rd, dsub), V128:$Rn)>; +defm FCVTPS : SIMDTwoVectorFPToInt<0,1,0b11010, "fcvtps",int_aarch64_neon_fcvtps>; +defm FCVTPU : SIMDTwoVectorFPToInt<1,1,0b11010, "fcvtpu",int_aarch64_neon_fcvtpu>; +defm FCVTXN : SIMDFPInexactCvtTwoVector<1, 0, 0b10110, "fcvtxn", + int_aarch64_neon_fcvtxn>; +defm FCVTZS : SIMDTwoVectorFPToInt<0, 1, 0b11011, "fcvtzs", fp_to_sint>; +defm FCVTZU : SIMDTwoVectorFPToInt<1, 1, 0b11011, "fcvtzu", fp_to_uint>; +let isCodeGenOnly = 1 in { +defm FCVTZS_Int : SIMDTwoVectorFPToInt<0, 1, 0b11011, "fcvtzs", + int_aarch64_neon_fcvtzs>; +defm FCVTZU_Int : SIMDTwoVectorFPToInt<1, 1, 0b11011, "fcvtzu", + int_aarch64_neon_fcvtzu>; +} +defm FNEG : SIMDTwoVectorFP<1, 1, 0b01111, "fneg", fneg>; +defm FRECPE : SIMDTwoVectorFP<0, 1, 0b11101, "frecpe", int_aarch64_neon_frecpe>; +defm FRINTA : SIMDTwoVectorFP<1, 0, 0b11000, "frinta", frnd>; +defm FRINTI : SIMDTwoVectorFP<1, 1, 0b11001, "frinti", fnearbyint>; +defm FRINTM : SIMDTwoVectorFP<0, 0, 0b11001, "frintm", ffloor>; +defm FRINTN : SIMDTwoVectorFP<0, 0, 0b11000, "frintn", int_aarch64_neon_frintn>; +defm FRINTP : SIMDTwoVectorFP<0, 1, 0b11000, "frintp", fceil>; +defm FRINTX : SIMDTwoVectorFP<1, 0, 0b11001, "frintx", frint>; +defm FRINTZ : SIMDTwoVectorFP<0, 1, 0b11001, "frintz", ftrunc>; +defm FRSQRTE: SIMDTwoVectorFP<1, 1, 0b11101, "frsqrte", int_aarch64_neon_frsqrte>; +defm FSQRT : SIMDTwoVectorFP<1, 1, 0b11111, "fsqrt", fsqrt>; +defm NEG : SIMDTwoVectorBHSD<1, 0b01011, "neg", + UnOpFrag<(sub immAllZerosV, node:$LHS)> >; +defm NOT : SIMDTwoVectorB<1, 0b00, 0b00101, "not", vnot>; +// Aliases for MVN -> NOT. 
+def : InstAlias<"mvn{ $Vd.8b, $Vn.8b|.8b $Vd, $Vn}", + (NOTv8i8 V64:$Vd, V64:$Vn)>; +def : InstAlias<"mvn{ $Vd.16b, $Vn.16b|.16b $Vd, $Vn}", + (NOTv16i8 V128:$Vd, V128:$Vn)>; + +def : Pat<(AArch64neg (v8i8 V64:$Rn)), (NEGv8i8 V64:$Rn)>; +def : Pat<(AArch64neg (v16i8 V128:$Rn)), (NEGv16i8 V128:$Rn)>; +def : Pat<(AArch64neg (v4i16 V64:$Rn)), (NEGv4i16 V64:$Rn)>; +def : Pat<(AArch64neg (v8i16 V128:$Rn)), (NEGv8i16 V128:$Rn)>; +def : Pat<(AArch64neg (v2i32 V64:$Rn)), (NEGv2i32 V64:$Rn)>; +def : Pat<(AArch64neg (v4i32 V128:$Rn)), (NEGv4i32 V128:$Rn)>; +def : Pat<(AArch64neg (v2i64 V128:$Rn)), (NEGv2i64 V128:$Rn)>; + +def : Pat<(AArch64not (v8i8 V64:$Rn)), (NOTv8i8 V64:$Rn)>; +def : Pat<(AArch64not (v16i8 V128:$Rn)), (NOTv16i8 V128:$Rn)>; +def : Pat<(AArch64not (v4i16 V64:$Rn)), (NOTv8i8 V64:$Rn)>; +def : Pat<(AArch64not (v8i16 V128:$Rn)), (NOTv16i8 V128:$Rn)>; +def : Pat<(AArch64not (v2i32 V64:$Rn)), (NOTv8i8 V64:$Rn)>; +def : Pat<(AArch64not (v1i64 V64:$Rn)), (NOTv8i8 V64:$Rn)>; +def : Pat<(AArch64not (v4i32 V128:$Rn)), (NOTv16i8 V128:$Rn)>; +def : Pat<(AArch64not (v2i64 V128:$Rn)), (NOTv16i8 V128:$Rn)>; + +def : Pat<(vnot (v4i16 V64:$Rn)), (NOTv8i8 V64:$Rn)>; +def : Pat<(vnot (v8i16 V128:$Rn)), (NOTv16i8 V128:$Rn)>; +def : Pat<(vnot (v2i32 V64:$Rn)), (NOTv8i8 V64:$Rn)>; +def : Pat<(vnot (v4i32 V128:$Rn)), (NOTv16i8 V128:$Rn)>; +def : Pat<(vnot (v2i64 V128:$Rn)), (NOTv16i8 V128:$Rn)>; + +defm RBIT : SIMDTwoVectorB<1, 0b01, 0b00101, "rbit", int_aarch64_neon_rbit>; +defm REV16 : SIMDTwoVectorB<0, 0b00, 0b00001, "rev16", AArch64rev16>; +defm REV32 : SIMDTwoVectorBH<1, 0b00000, "rev32", AArch64rev32>; +defm REV64 : SIMDTwoVectorBHS<0, 0b00000, "rev64", AArch64rev64>; +defm SADALP : SIMDLongTwoVectorTied<0, 0b00110, "sadalp", + BinOpFrag<(add node:$LHS, (int_aarch64_neon_saddlp node:$RHS))> >; +defm SADDLP : SIMDLongTwoVector<0, 0b00010, "saddlp", int_aarch64_neon_saddlp>; +defm SCVTF : SIMDTwoVectorIntToFP<0, 0, 0b11101, "scvtf", sint_to_fp>; +defm SHLL : SIMDVectorLShiftLongBySizeBHS; +defm SQABS : SIMDTwoVectorBHSD<0, 0b00111, "sqabs", int_aarch64_neon_sqabs>; +defm SQNEG : SIMDTwoVectorBHSD<1, 0b00111, "sqneg", int_aarch64_neon_sqneg>; +defm SQXTN : SIMDMixedTwoVector<0, 0b10100, "sqxtn", int_aarch64_neon_sqxtn>; +defm SQXTUN : SIMDMixedTwoVector<1, 0b10010, "sqxtun", int_aarch64_neon_sqxtun>; +defm SUQADD : SIMDTwoVectorBHSDTied<0, 0b00011, "suqadd",int_aarch64_neon_suqadd>; +defm UADALP : SIMDLongTwoVectorTied<1, 0b00110, "uadalp", + BinOpFrag<(add node:$LHS, (int_aarch64_neon_uaddlp node:$RHS))> >; +defm UADDLP : SIMDLongTwoVector<1, 0b00010, "uaddlp", + int_aarch64_neon_uaddlp>; +defm UCVTF : SIMDTwoVectorIntToFP<1, 0, 0b11101, "ucvtf", uint_to_fp>; +defm UQXTN : SIMDMixedTwoVector<1, 0b10100, "uqxtn", int_aarch64_neon_uqxtn>; +defm URECPE : SIMDTwoVectorS<0, 1, 0b11100, "urecpe", int_aarch64_neon_urecpe>; +defm URSQRTE: SIMDTwoVectorS<1, 1, 0b11100, "ursqrte", int_aarch64_neon_ursqrte>; +defm USQADD : SIMDTwoVectorBHSDTied<1, 0b00011, "usqadd",int_aarch64_neon_usqadd>; +defm XTN : SIMDMixedTwoVector<0, 0b10010, "xtn", trunc>; + +def : Pat<(v2f32 (AArch64rev64 V64:$Rn)), (REV64v2i32 V64:$Rn)>; +def : Pat<(v4f32 (AArch64rev64 V128:$Rn)), (REV64v4i32 V128:$Rn)>; + +// Patterns for vector long shift (by element width). These need to match all +// three of zext, sext and anyext so it's easier to pull the patterns out of the +// definition. 
+multiclass SIMDVectorLShiftLongBySizeBHSPats<SDPatternOperator ext> {
+  def : Pat<(AArch64vshl (v8i16 (ext (v8i8 V64:$Rn))), (i32 8)),
+            (SHLLv8i8 V64:$Rn)>;
+  def : Pat<(AArch64vshl (v8i16 (ext (extract_high_v16i8 V128:$Rn))), (i32 8)),
+            (SHLLv16i8 V128:$Rn)>;
+  def : Pat<(AArch64vshl (v4i32 (ext (v4i16 V64:$Rn))), (i32 16)),
+            (SHLLv4i16 V64:$Rn)>;
+  def : Pat<(AArch64vshl (v4i32 (ext (extract_high_v8i16 V128:$Rn))), (i32 16)),
+            (SHLLv8i16 V128:$Rn)>;
+  def : Pat<(AArch64vshl (v2i64 (ext (v2i32 V64:$Rn))), (i32 32)),
+            (SHLLv2i32 V64:$Rn)>;
+  def : Pat<(AArch64vshl (v2i64 (ext (extract_high_v4i32 V128:$Rn))), (i32 32)),
+            (SHLLv4i32 V128:$Rn)>;
+}
+
+defm : SIMDVectorLShiftLongBySizeBHSPats<anyext>;
+defm : SIMDVectorLShiftLongBySizeBHSPats<zext>;
+defm : SIMDVectorLShiftLongBySizeBHSPats<sext>;
+
+//===----------------------------------------------------------------------===//
+// Advanced SIMD three vector instructions.
+//===----------------------------------------------------------------------===//
+
+defm ADD : SIMDThreeSameVector<0, 0b10000, "add", add>;
+defm ADDP : SIMDThreeSameVector<0, 0b10111, "addp", int_aarch64_neon_addp>;
+defm CMEQ : SIMDThreeSameVector<1, 0b10001, "cmeq", AArch64cmeq>;
+defm CMGE : SIMDThreeSameVector<0, 0b00111, "cmge", AArch64cmge>;
+defm CMGT : SIMDThreeSameVector<0, 0b00110, "cmgt", AArch64cmgt>;
+defm CMHI : SIMDThreeSameVector<1, 0b00110, "cmhi", AArch64cmhi>;
+defm CMHS : SIMDThreeSameVector<1, 0b00111, "cmhs", AArch64cmhs>;
+defm CMTST : SIMDThreeSameVector<0, 0b10001, "cmtst", AArch64cmtst>;
+defm FABD : SIMDThreeSameVectorFP<1,1,0b11010,"fabd", int_aarch64_neon_fabd>;
+defm FACGE : SIMDThreeSameVectorFPCmp<1,0,0b11101,"facge",int_aarch64_neon_facge>;
+defm FACGT : SIMDThreeSameVectorFPCmp<1,1,0b11101,"facgt",int_aarch64_neon_facgt>;
+defm FADDP : SIMDThreeSameVectorFP<1,0,0b11010,"faddp",int_aarch64_neon_addp>;
+defm FADD : SIMDThreeSameVectorFP<0,0,0b11010,"fadd", fadd>;
+defm FCMEQ : SIMDThreeSameVectorFPCmp<0, 0, 0b11100, "fcmeq", AArch64fcmeq>;
+defm FCMGE : SIMDThreeSameVectorFPCmp<1, 0, 0b11100, "fcmge", AArch64fcmge>;
+defm FCMGT : SIMDThreeSameVectorFPCmp<1, 1, 0b11100, "fcmgt", AArch64fcmgt>;
+defm FDIV : SIMDThreeSameVectorFP<1,0,0b11111,"fdiv", fdiv>;
+defm FMAXNMP : SIMDThreeSameVectorFP<1,0,0b11000,"fmaxnmp", int_aarch64_neon_fmaxnmp>;
+defm FMAXNM : SIMDThreeSameVectorFP<0,0,0b11000,"fmaxnm", int_aarch64_neon_fmaxnm>;
+defm FMAXP : SIMDThreeSameVectorFP<1,0,0b11110,"fmaxp", int_aarch64_neon_fmaxp>;
+defm FMAX : SIMDThreeSameVectorFP<0,0,0b11110,"fmax", AArch64fmax>;
+defm FMINNMP : SIMDThreeSameVectorFP<1,1,0b11000,"fminnmp", int_aarch64_neon_fminnmp>;
+defm FMINNM : SIMDThreeSameVectorFP<0,1,0b11000,"fminnm", int_aarch64_neon_fminnm>;
+defm FMINP : SIMDThreeSameVectorFP<1,1,0b11110,"fminp", int_aarch64_neon_fminp>;
+defm FMIN : SIMDThreeSameVectorFP<0,1,0b11110,"fmin", AArch64fmin>;
+
+// NOTE: The operands of the PatFrag are reordered on FMLA/FMLS because the
+// instruction expects the addend first, while the fma intrinsic puts it last.
+defm FMLA : SIMDThreeSameVectorFPTied<0, 0, 0b11001, "fmla",
+      TriOpFrag<(fma node:$RHS, node:$MHS, node:$LHS)> >;
+defm FMLS : SIMDThreeSameVectorFPTied<0, 1, 0b11001, "fmls",
+      TriOpFrag<(fma node:$MHS, (fneg node:$RHS), node:$LHS)> >;
+
+// The following def pats catch the case where the LHS of an FMA is negated.
+// The TriOpFrag above catches the case where the middle operand is negated.
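[Editorial note: the scalar identity behind the negated-LHS patterns that follow can be sketched in C. fmaf is the standard <math.h> fused multiply-add; the helper name is illustrative:]

    #include <math.h>

    /* fma(-a, b, c) computes c - a*b with a single rounding, so it can be
       selected as a fused multiply-subtract (scalar FMSUB, vector FMLS),
       which is the case the following def : Pat entries cover. */
    float negated_lhs_fma(float a, float b, float c) {
        return fmaf(-a, b, c);
    }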
+def : Pat<(v2f32 (fma (fneg V64:$Rn), V64:$Rm, V64:$Rd)), + (FMLSv2f32 V64:$Rd, V64:$Rn, V64:$Rm)>; + +def : Pat<(v4f32 (fma (fneg V128:$Rn), V128:$Rm, V128:$Rd)), + (FMLSv4f32 V128:$Rd, V128:$Rn, V128:$Rm)>; + +def : Pat<(v2f64 (fma (fneg V128:$Rn), V128:$Rm, V128:$Rd)), + (FMLSv2f64 V128:$Rd, V128:$Rn, V128:$Rm)>; + +defm FMULX : SIMDThreeSameVectorFP<0,0,0b11011,"fmulx", int_aarch64_neon_fmulx>; +defm FMUL : SIMDThreeSameVectorFP<1,0,0b11011,"fmul", fmul>; +defm FRECPS : SIMDThreeSameVectorFP<0,0,0b11111,"frecps", int_aarch64_neon_frecps>; +defm FRSQRTS : SIMDThreeSameVectorFP<0,1,0b11111,"frsqrts", int_aarch64_neon_frsqrts>; +defm FSUB : SIMDThreeSameVectorFP<0,1,0b11010,"fsub", fsub>; +defm MLA : SIMDThreeSameVectorBHSTied<0, 0b10010, "mla", + TriOpFrag<(add node:$LHS, (mul node:$MHS, node:$RHS))> >; +defm MLS : SIMDThreeSameVectorBHSTied<1, 0b10010, "mls", + TriOpFrag<(sub node:$LHS, (mul node:$MHS, node:$RHS))> >; +defm MUL : SIMDThreeSameVectorBHS<0, 0b10011, "mul", mul>; +defm PMUL : SIMDThreeSameVectorB<1, 0b10011, "pmul", int_aarch64_neon_pmul>; +defm SABA : SIMDThreeSameVectorBHSTied<0, 0b01111, "saba", + TriOpFrag<(add node:$LHS, (int_aarch64_neon_sabd node:$MHS, node:$RHS))> >; +defm SABD : SIMDThreeSameVectorBHS<0,0b01110,"sabd", int_aarch64_neon_sabd>; +defm SHADD : SIMDThreeSameVectorBHS<0,0b00000,"shadd", int_aarch64_neon_shadd>; +defm SHSUB : SIMDThreeSameVectorBHS<0,0b00100,"shsub", int_aarch64_neon_shsub>; +defm SMAXP : SIMDThreeSameVectorBHS<0,0b10100,"smaxp", int_aarch64_neon_smaxp>; +defm SMAX : SIMDThreeSameVectorBHS<0,0b01100,"smax", int_aarch64_neon_smax>; +defm SMINP : SIMDThreeSameVectorBHS<0,0b10101,"sminp", int_aarch64_neon_sminp>; +defm SMIN : SIMDThreeSameVectorBHS<0,0b01101,"smin", int_aarch64_neon_smin>; +defm SQADD : SIMDThreeSameVector<0,0b00001,"sqadd", int_aarch64_neon_sqadd>; +defm SQDMULH : SIMDThreeSameVectorHS<0,0b10110,"sqdmulh",int_aarch64_neon_sqdmulh>; +defm SQRDMULH : SIMDThreeSameVectorHS<1,0b10110,"sqrdmulh",int_aarch64_neon_sqrdmulh>; +defm SQRSHL : SIMDThreeSameVector<0,0b01011,"sqrshl", int_aarch64_neon_sqrshl>; +defm SQSHL : SIMDThreeSameVector<0,0b01001,"sqshl", int_aarch64_neon_sqshl>; +defm SQSUB : SIMDThreeSameVector<0,0b00101,"sqsub", int_aarch64_neon_sqsub>; +defm SRHADD : SIMDThreeSameVectorBHS<0,0b00010,"srhadd",int_aarch64_neon_srhadd>; +defm SRSHL : SIMDThreeSameVector<0,0b01010,"srshl", int_aarch64_neon_srshl>; +defm SSHL : SIMDThreeSameVector<0,0b01000,"sshl", int_aarch64_neon_sshl>; +defm SUB : SIMDThreeSameVector<1,0b10000,"sub", sub>; +defm UABA : SIMDThreeSameVectorBHSTied<1, 0b01111, "uaba", + TriOpFrag<(add node:$LHS, (int_aarch64_neon_uabd node:$MHS, node:$RHS))> >; +defm UABD : SIMDThreeSameVectorBHS<1,0b01110,"uabd", int_aarch64_neon_uabd>; +defm UHADD : SIMDThreeSameVectorBHS<1,0b00000,"uhadd", int_aarch64_neon_uhadd>; +defm UHSUB : SIMDThreeSameVectorBHS<1,0b00100,"uhsub", int_aarch64_neon_uhsub>; +defm UMAXP : SIMDThreeSameVectorBHS<1,0b10100,"umaxp", int_aarch64_neon_umaxp>; +defm UMAX : SIMDThreeSameVectorBHS<1,0b01100,"umax", int_aarch64_neon_umax>; +defm UMINP : SIMDThreeSameVectorBHS<1,0b10101,"uminp", int_aarch64_neon_uminp>; +defm UMIN : SIMDThreeSameVectorBHS<1,0b01101,"umin", int_aarch64_neon_umin>; +defm UQADD : SIMDThreeSameVector<1,0b00001,"uqadd", int_aarch64_neon_uqadd>; +defm UQRSHL : SIMDThreeSameVector<1,0b01011,"uqrshl", int_aarch64_neon_uqrshl>; +defm UQSHL : SIMDThreeSameVector<1,0b01001,"uqshl", int_aarch64_neon_uqshl>; +defm UQSUB : SIMDThreeSameVector<1,0b00101,"uqsub", 
int_aarch64_neon_uqsub>; +defm URHADD : SIMDThreeSameVectorBHS<1,0b00010,"urhadd", int_aarch64_neon_urhadd>; +defm URSHL : SIMDThreeSameVector<1,0b01010,"urshl", int_aarch64_neon_urshl>; +defm USHL : SIMDThreeSameVector<1,0b01000,"ushl", int_aarch64_neon_ushl>; + +defm AND : SIMDLogicalThreeVector<0, 0b00, "and", and>; +defm BIC : SIMDLogicalThreeVector<0, 0b01, "bic", + BinOpFrag<(and node:$LHS, (vnot node:$RHS))> >; +defm BIF : SIMDLogicalThreeVector<1, 0b11, "bif">; +defm BIT : SIMDLogicalThreeVectorTied<1, 0b10, "bit", AArch64bit>; +defm BSL : SIMDLogicalThreeVectorTied<1, 0b01, "bsl", + TriOpFrag<(or (and node:$LHS, node:$MHS), (and (vnot node:$LHS), node:$RHS))>>; +defm EOR : SIMDLogicalThreeVector<1, 0b00, "eor", xor>; +defm ORN : SIMDLogicalThreeVector<0, 0b11, "orn", + BinOpFrag<(or node:$LHS, (vnot node:$RHS))> >; +defm ORR : SIMDLogicalThreeVector<0, 0b10, "orr", or>; + +def : Pat<(AArch64bsl (v8i8 V64:$Rd), V64:$Rn, V64:$Rm), + (BSLv8i8 V64:$Rd, V64:$Rn, V64:$Rm)>; +def : Pat<(AArch64bsl (v4i16 V64:$Rd), V64:$Rn, V64:$Rm), + (BSLv8i8 V64:$Rd, V64:$Rn, V64:$Rm)>; +def : Pat<(AArch64bsl (v2i32 V64:$Rd), V64:$Rn, V64:$Rm), + (BSLv8i8 V64:$Rd, V64:$Rn, V64:$Rm)>; +def : Pat<(AArch64bsl (v1i64 V64:$Rd), V64:$Rn, V64:$Rm), + (BSLv8i8 V64:$Rd, V64:$Rn, V64:$Rm)>; + +def : Pat<(AArch64bsl (v16i8 V128:$Rd), V128:$Rn, V128:$Rm), + (BSLv16i8 V128:$Rd, V128:$Rn, V128:$Rm)>; +def : Pat<(AArch64bsl (v8i16 V128:$Rd), V128:$Rn, V128:$Rm), + (BSLv16i8 V128:$Rd, V128:$Rn, V128:$Rm)>; +def : Pat<(AArch64bsl (v4i32 V128:$Rd), V128:$Rn, V128:$Rm), + (BSLv16i8 V128:$Rd, V128:$Rn, V128:$Rm)>; +def : Pat<(AArch64bsl (v2i64 V128:$Rd), V128:$Rn, V128:$Rm), + (BSLv16i8 V128:$Rd, V128:$Rn, V128:$Rm)>; + +def : InstAlias<"mov{\t$dst.16b, $src.16b|.16b\t$dst, $src}", + (ORRv16i8 V128:$dst, V128:$src, V128:$src), 1>; +def : InstAlias<"mov{\t$dst.8h, $src.8h|.8h\t$dst, $src}", + (ORRv16i8 V128:$dst, V128:$src, V128:$src), 0>; +def : InstAlias<"mov{\t$dst.4s, $src.4s|.4s\t$dst, $src}", + (ORRv16i8 V128:$dst, V128:$src, V128:$src), 0>; +def : InstAlias<"mov{\t$dst.2d, $src.2d|.2d\t$dst, $src}", + (ORRv16i8 V128:$dst, V128:$src, V128:$src), 0>; + +def : InstAlias<"mov{\t$dst.8b, $src.8b|.8b\t$dst, $src}", + (ORRv8i8 V64:$dst, V64:$src, V64:$src), 1>; +def : InstAlias<"mov{\t$dst.4h, $src.4h|.4h\t$dst, $src}", + (ORRv8i8 V64:$dst, V64:$src, V64:$src), 0>; +def : InstAlias<"mov{\t$dst.2s, $src.2s|.2s\t$dst, $src}", + (ORRv8i8 V64:$dst, V64:$src, V64:$src), 0>; +def : InstAlias<"mov{\t$dst.1d, $src.1d|.1d\t$dst, $src}", + (ORRv8i8 V64:$dst, V64:$src, V64:$src), 0>; + +def : InstAlias<"{cmls\t$dst.8b, $src1.8b, $src2.8b" # + "|cmls.8b\t$dst, $src1, $src2}", + (CMHSv8i8 V64:$dst, V64:$src2, V64:$src1), 0>; +def : InstAlias<"{cmls\t$dst.16b, $src1.16b, $src2.16b" # + "|cmls.16b\t$dst, $src1, $src2}", + (CMHSv16i8 V128:$dst, V128:$src2, V128:$src1), 0>; +def : InstAlias<"{cmls\t$dst.4h, $src1.4h, $src2.4h" # + "|cmls.4h\t$dst, $src1, $src2}", + (CMHSv4i16 V64:$dst, V64:$src2, V64:$src1), 0>; +def : InstAlias<"{cmls\t$dst.8h, $src1.8h, $src2.8h" # + "|cmls.8h\t$dst, $src1, $src2}", + (CMHSv8i16 V128:$dst, V128:$src2, V128:$src1), 0>; +def : InstAlias<"{cmls\t$dst.2s, $src1.2s, $src2.2s" # + "|cmls.2s\t$dst, $src1, $src2}", + (CMHSv2i32 V64:$dst, V64:$src2, V64:$src1), 0>; +def : InstAlias<"{cmls\t$dst.4s, $src1.4s, $src2.4s" # + "|cmls.4s\t$dst, $src1, $src2}", + (CMHSv4i32 V128:$dst, V128:$src2, V128:$src1), 0>; +def : InstAlias<"{cmls\t$dst.2d, $src1.2d, $src2.2d" # + "|cmls.2d\t$dst, $src1, $src2}", + (CMHSv2i64 
V128:$dst, V128:$src2, V128:$src1), 0>; + +def : InstAlias<"{cmlo\t$dst.8b, $src1.8b, $src2.8b" # + "|cmlo.8b\t$dst, $src1, $src2}", + (CMHIv8i8 V64:$dst, V64:$src2, V64:$src1), 0>; +def : InstAlias<"{cmlo\t$dst.16b, $src1.16b, $src2.16b" # + "|cmlo.16b\t$dst, $src1, $src2}", + (CMHIv16i8 V128:$dst, V128:$src2, V128:$src1), 0>; +def : InstAlias<"{cmlo\t$dst.4h, $src1.4h, $src2.4h" # + "|cmlo.4h\t$dst, $src1, $src2}", + (CMHIv4i16 V64:$dst, V64:$src2, V64:$src1), 0>; +def : InstAlias<"{cmlo\t$dst.8h, $src1.8h, $src2.8h" # + "|cmlo.8h\t$dst, $src1, $src2}", + (CMHIv8i16 V128:$dst, V128:$src2, V128:$src1), 0>; +def : InstAlias<"{cmlo\t$dst.2s, $src1.2s, $src2.2s" # + "|cmlo.2s\t$dst, $src1, $src2}", + (CMHIv2i32 V64:$dst, V64:$src2, V64:$src1), 0>; +def : InstAlias<"{cmlo\t$dst.4s, $src1.4s, $src2.4s" # + "|cmlo.4s\t$dst, $src1, $src2}", + (CMHIv4i32 V128:$dst, V128:$src2, V128:$src1), 0>; +def : InstAlias<"{cmlo\t$dst.2d, $src1.2d, $src2.2d" # + "|cmlo.2d\t$dst, $src1, $src2}", + (CMHIv2i64 V128:$dst, V128:$src2, V128:$src1), 0>; + +def : InstAlias<"{cmle\t$dst.8b, $src1.8b, $src2.8b" # + "|cmle.8b\t$dst, $src1, $src2}", + (CMGEv8i8 V64:$dst, V64:$src2, V64:$src1), 0>; +def : InstAlias<"{cmle\t$dst.16b, $src1.16b, $src2.16b" # + "|cmle.16b\t$dst, $src1, $src2}", + (CMGEv16i8 V128:$dst, V128:$src2, V128:$src1), 0>; +def : InstAlias<"{cmle\t$dst.4h, $src1.4h, $src2.4h" # + "|cmle.4h\t$dst, $src1, $src2}", + (CMGEv4i16 V64:$dst, V64:$src2, V64:$src1), 0>; +def : InstAlias<"{cmle\t$dst.8h, $src1.8h, $src2.8h" # + "|cmle.8h\t$dst, $src1, $src2}", + (CMGEv8i16 V128:$dst, V128:$src2, V128:$src1), 0>; +def : InstAlias<"{cmle\t$dst.2s, $src1.2s, $src2.2s" # + "|cmle.2s\t$dst, $src1, $src2}", + (CMGEv2i32 V64:$dst, V64:$src2, V64:$src1), 0>; +def : InstAlias<"{cmle\t$dst.4s, $src1.4s, $src2.4s" # + "|cmle.4s\t$dst, $src1, $src2}", + (CMGEv4i32 V128:$dst, V128:$src2, V128:$src1), 0>; +def : InstAlias<"{cmle\t$dst.2d, $src1.2d, $src2.2d" # + "|cmle.2d\t$dst, $src1, $src2}", + (CMGEv2i64 V128:$dst, V128:$src2, V128:$src1), 0>; + +def : InstAlias<"{cmlt\t$dst.8b, $src1.8b, $src2.8b" # + "|cmlt.8b\t$dst, $src1, $src2}", + (CMGTv8i8 V64:$dst, V64:$src2, V64:$src1), 0>; +def : InstAlias<"{cmlt\t$dst.16b, $src1.16b, $src2.16b" # + "|cmlt.16b\t$dst, $src1, $src2}", + (CMGTv16i8 V128:$dst, V128:$src2, V128:$src1), 0>; +def : InstAlias<"{cmlt\t$dst.4h, $src1.4h, $src2.4h" # + "|cmlt.4h\t$dst, $src1, $src2}", + (CMGTv4i16 V64:$dst, V64:$src2, V64:$src1), 0>; +def : InstAlias<"{cmlt\t$dst.8h, $src1.8h, $src2.8h" # + "|cmlt.8h\t$dst, $src1, $src2}", + (CMGTv8i16 V128:$dst, V128:$src2, V128:$src1), 0>; +def : InstAlias<"{cmlt\t$dst.2s, $src1.2s, $src2.2s" # + "|cmlt.2s\t$dst, $src1, $src2}", + (CMGTv2i32 V64:$dst, V64:$src2, V64:$src1), 0>; +def : InstAlias<"{cmlt\t$dst.4s, $src1.4s, $src2.4s" # + "|cmlt.4s\t$dst, $src1, $src2}", + (CMGTv4i32 V128:$dst, V128:$src2, V128:$src1), 0>; +def : InstAlias<"{cmlt\t$dst.2d, $src1.2d, $src2.2d" # + "|cmlt.2d\t$dst, $src1, $src2}", + (CMGTv2i64 V128:$dst, V128:$src2, V128:$src1), 0>; + +def : InstAlias<"{fcmle\t$dst.2s, $src1.2s, $src2.2s" # + "|fcmle.2s\t$dst, $src1, $src2}", + (FCMGEv2f32 V64:$dst, V64:$src2, V64:$src1), 0>; +def : InstAlias<"{fcmle\t$dst.4s, $src1.4s, $src2.4s" # + "|fcmle.4s\t$dst, $src1, $src2}", + (FCMGEv4f32 V128:$dst, V128:$src2, V128:$src1), 0>; +def : InstAlias<"{fcmle\t$dst.2d, $src1.2d, $src2.2d" # + "|fcmle.2d\t$dst, $src1, $src2}", + (FCMGEv2f64 V128:$dst, V128:$src2, V128:$src1), 0>; + +def : InstAlias<"{fcmlt\t$dst.2s, $src1.2s, $src2.2s" # + 
"|fcmlt.2s\t$dst, $src1, $src2}", + (FCMGTv2f32 V64:$dst, V64:$src2, V64:$src1), 0>; +def : InstAlias<"{fcmlt\t$dst.4s, $src1.4s, $src2.4s" # + "|fcmlt.4s\t$dst, $src1, $src2}", + (FCMGTv4f32 V128:$dst, V128:$src2, V128:$src1), 0>; +def : InstAlias<"{fcmlt\t$dst.2d, $src1.2d, $src2.2d" # + "|fcmlt.2d\t$dst, $src1, $src2}", + (FCMGTv2f64 V128:$dst, V128:$src2, V128:$src1), 0>; + +def : InstAlias<"{facle\t$dst.2s, $src1.2s, $src2.2s" # + "|facle.2s\t$dst, $src1, $src2}", + (FACGEv2f32 V64:$dst, V64:$src2, V64:$src1), 0>; +def : InstAlias<"{facle\t$dst.4s, $src1.4s, $src2.4s" # + "|facle.4s\t$dst, $src1, $src2}", + (FACGEv4f32 V128:$dst, V128:$src2, V128:$src1), 0>; +def : InstAlias<"{facle\t$dst.2d, $src1.2d, $src2.2d" # + "|facle.2d\t$dst, $src1, $src2}", + (FACGEv2f64 V128:$dst, V128:$src2, V128:$src1), 0>; + +def : InstAlias<"{faclt\t$dst.2s, $src1.2s, $src2.2s" # + "|faclt.2s\t$dst, $src1, $src2}", + (FACGTv2f32 V64:$dst, V64:$src2, V64:$src1), 0>; +def : InstAlias<"{faclt\t$dst.4s, $src1.4s, $src2.4s" # + "|faclt.4s\t$dst, $src1, $src2}", + (FACGTv4f32 V128:$dst, V128:$src2, V128:$src1), 0>; +def : InstAlias<"{faclt\t$dst.2d, $src1.2d, $src2.2d" # + "|faclt.2d\t$dst, $src1, $src2}", + (FACGTv2f64 V128:$dst, V128:$src2, V128:$src1), 0>; + +//===----------------------------------------------------------------------===// +// Advanced SIMD three scalar instructions. +//===----------------------------------------------------------------------===// + +defm ADD : SIMDThreeScalarD<0, 0b10000, "add", add>; +defm CMEQ : SIMDThreeScalarD<1, 0b10001, "cmeq", AArch64cmeq>; +defm CMGE : SIMDThreeScalarD<0, 0b00111, "cmge", AArch64cmge>; +defm CMGT : SIMDThreeScalarD<0, 0b00110, "cmgt", AArch64cmgt>; +defm CMHI : SIMDThreeScalarD<1, 0b00110, "cmhi", AArch64cmhi>; +defm CMHS : SIMDThreeScalarD<1, 0b00111, "cmhs", AArch64cmhs>; +defm CMTST : SIMDThreeScalarD<0, 0b10001, "cmtst", AArch64cmtst>; +defm FABD : SIMDThreeScalarSD<1, 1, 0b11010, "fabd", int_aarch64_sisd_fabd>; +def : Pat<(v1f64 (int_aarch64_neon_fabd (v1f64 FPR64:$Rn), (v1f64 FPR64:$Rm))), + (FABD64 FPR64:$Rn, FPR64:$Rm)>; +defm FACGE : SIMDThreeScalarFPCmp<1, 0, 0b11101, "facge", + int_aarch64_neon_facge>; +defm FACGT : SIMDThreeScalarFPCmp<1, 1, 0b11101, "facgt", + int_aarch64_neon_facgt>; +defm FCMEQ : SIMDThreeScalarFPCmp<0, 0, 0b11100, "fcmeq", AArch64fcmeq>; +defm FCMGE : SIMDThreeScalarFPCmp<1, 0, 0b11100, "fcmge", AArch64fcmge>; +defm FCMGT : SIMDThreeScalarFPCmp<1, 1, 0b11100, "fcmgt", AArch64fcmgt>; +defm FMULX : SIMDThreeScalarSD<0, 0, 0b11011, "fmulx", int_aarch64_neon_fmulx>; +defm FRECPS : SIMDThreeScalarSD<0, 0, 0b11111, "frecps", int_aarch64_neon_frecps>; +defm FRSQRTS : SIMDThreeScalarSD<0, 1, 0b11111, "frsqrts", int_aarch64_neon_frsqrts>; +defm SQADD : SIMDThreeScalarBHSD<0, 0b00001, "sqadd", int_aarch64_neon_sqadd>; +defm SQDMULH : SIMDThreeScalarHS< 0, 0b10110, "sqdmulh", int_aarch64_neon_sqdmulh>; +defm SQRDMULH : SIMDThreeScalarHS< 1, 0b10110, "sqrdmulh", int_aarch64_neon_sqrdmulh>; +defm SQRSHL : SIMDThreeScalarBHSD<0, 0b01011, "sqrshl",int_aarch64_neon_sqrshl>; +defm SQSHL : SIMDThreeScalarBHSD<0, 0b01001, "sqshl", int_aarch64_neon_sqshl>; +defm SQSUB : SIMDThreeScalarBHSD<0, 0b00101, "sqsub", int_aarch64_neon_sqsub>; +defm SRSHL : SIMDThreeScalarD< 0, 0b01010, "srshl", int_aarch64_neon_srshl>; +defm SSHL : SIMDThreeScalarD< 0, 0b01000, "sshl", int_aarch64_neon_sshl>; +defm SUB : SIMDThreeScalarD< 1, 0b10000, "sub", sub>; +defm UQADD : SIMDThreeScalarBHSD<1, 0b00001, "uqadd", int_aarch64_neon_uqadd>; +defm UQRSHL : 
SIMDThreeScalarBHSD<1, 0b01011, "uqrshl",int_aarch64_neon_uqrshl>; +defm UQSHL : SIMDThreeScalarBHSD<1, 0b01001, "uqshl", int_aarch64_neon_uqshl>; +defm UQSUB : SIMDThreeScalarBHSD<1, 0b00101, "uqsub", int_aarch64_neon_uqsub>; +defm URSHL : SIMDThreeScalarD< 1, 0b01010, "urshl", int_aarch64_neon_urshl>; +defm USHL : SIMDThreeScalarD< 1, 0b01000, "ushl", int_aarch64_neon_ushl>; + +def : InstAlias<"cmls $dst, $src1, $src2", + (CMHSv1i64 FPR64:$dst, FPR64:$src2, FPR64:$src1), 0>; +def : InstAlias<"cmle $dst, $src1, $src2", + (CMGEv1i64 FPR64:$dst, FPR64:$src2, FPR64:$src1), 0>; +def : InstAlias<"cmlo $dst, $src1, $src2", + (CMHIv1i64 FPR64:$dst, FPR64:$src2, FPR64:$src1), 0>; +def : InstAlias<"cmlt $dst, $src1, $src2", + (CMGTv1i64 FPR64:$dst, FPR64:$src2, FPR64:$src1), 0>; +def : InstAlias<"fcmle $dst, $src1, $src2", + (FCMGE32 FPR32:$dst, FPR32:$src2, FPR32:$src1), 0>; +def : InstAlias<"fcmle $dst, $src1, $src2", + (FCMGE64 FPR64:$dst, FPR64:$src2, FPR64:$src1), 0>; +def : InstAlias<"fcmlt $dst, $src1, $src2", + (FCMGT32 FPR32:$dst, FPR32:$src2, FPR32:$src1), 0>; +def : InstAlias<"fcmlt $dst, $src1, $src2", + (FCMGT64 FPR64:$dst, FPR64:$src2, FPR64:$src1), 0>; +def : InstAlias<"facle $dst, $src1, $src2", + (FACGE32 FPR32:$dst, FPR32:$src2, FPR32:$src1), 0>; +def : InstAlias<"facle $dst, $src1, $src2", + (FACGE64 FPR64:$dst, FPR64:$src2, FPR64:$src1), 0>; +def : InstAlias<"faclt $dst, $src1, $src2", + (FACGT32 FPR32:$dst, FPR32:$src2, FPR32:$src1), 0>; +def : InstAlias<"faclt $dst, $src1, $src2", + (FACGT64 FPR64:$dst, FPR64:$src2, FPR64:$src1), 0>; + +//===----------------------------------------------------------------------===// +// Advanced SIMD three scalar instructions (mixed operands). +//===----------------------------------------------------------------------===// +defm SQDMULL : SIMDThreeScalarMixedHS<0, 0b11010, "sqdmull", + int_aarch64_neon_sqdmulls_scalar>; +defm SQDMLAL : SIMDThreeScalarMixedTiedHS<0, 0b10010, "sqdmlal">; +defm SQDMLSL : SIMDThreeScalarMixedTiedHS<0, 0b10110, "sqdmlsl">; + +def : Pat<(i64 (int_aarch64_neon_sqadd (i64 FPR64:$Rd), + (i64 (int_aarch64_neon_sqdmulls_scalar (i32 FPR32:$Rn), + (i32 FPR32:$Rm))))), + (SQDMLALi32 FPR64:$Rd, FPR32:$Rn, FPR32:$Rm)>; +def : Pat<(i64 (int_aarch64_neon_sqsub (i64 FPR64:$Rd), + (i64 (int_aarch64_neon_sqdmulls_scalar (i32 FPR32:$Rn), + (i32 FPR32:$Rm))))), + (SQDMLSLi32 FPR64:$Rd, FPR32:$Rn, FPR32:$Rm)>; + +//===----------------------------------------------------------------------===// +// Advanced SIMD two scalar instructions. 
+//===----------------------------------------------------------------------===// + +defm ABS : SIMDTwoScalarD< 0, 0b01011, "abs", int_aarch64_neon_abs>; +defm CMEQ : SIMDCmpTwoScalarD< 0, 0b01001, "cmeq", AArch64cmeqz>; +defm CMGE : SIMDCmpTwoScalarD< 1, 0b01000, "cmge", AArch64cmgez>; +defm CMGT : SIMDCmpTwoScalarD< 0, 0b01000, "cmgt", AArch64cmgtz>; +defm CMLE : SIMDCmpTwoScalarD< 1, 0b01001, "cmle", AArch64cmlez>; +defm CMLT : SIMDCmpTwoScalarD< 0, 0b01010, "cmlt", AArch64cmltz>; +defm FCMEQ : SIMDCmpTwoScalarSD<0, 1, 0b01101, "fcmeq", AArch64fcmeqz>; +defm FCMGE : SIMDCmpTwoScalarSD<1, 1, 0b01100, "fcmge", AArch64fcmgez>; +defm FCMGT : SIMDCmpTwoScalarSD<0, 1, 0b01100, "fcmgt", AArch64fcmgtz>; +defm FCMLE : SIMDCmpTwoScalarSD<1, 1, 0b01101, "fcmle", AArch64fcmlez>; +defm FCMLT : SIMDCmpTwoScalarSD<0, 1, 0b01110, "fcmlt", AArch64fcmltz>; +defm FCVTAS : SIMDTwoScalarSD< 0, 0, 0b11100, "fcvtas">; +defm FCVTAU : SIMDTwoScalarSD< 1, 0, 0b11100, "fcvtau">; +defm FCVTMS : SIMDTwoScalarSD< 0, 0, 0b11011, "fcvtms">; +defm FCVTMU : SIMDTwoScalarSD< 1, 0, 0b11011, "fcvtmu">; +defm FCVTNS : SIMDTwoScalarSD< 0, 0, 0b11010, "fcvtns">; +defm FCVTNU : SIMDTwoScalarSD< 1, 0, 0b11010, "fcvtnu">; +defm FCVTPS : SIMDTwoScalarSD< 0, 1, 0b11010, "fcvtps">; +defm FCVTPU : SIMDTwoScalarSD< 1, 1, 0b11010, "fcvtpu">; +def FCVTXNv1i64 : SIMDInexactCvtTwoScalar<0b10110, "fcvtxn">; +defm FCVTZS : SIMDTwoScalarSD< 0, 1, 0b11011, "fcvtzs">; +defm FCVTZU : SIMDTwoScalarSD< 1, 1, 0b11011, "fcvtzu">; +defm FRECPE : SIMDTwoScalarSD< 0, 1, 0b11101, "frecpe">; +defm FRECPX : SIMDTwoScalarSD< 0, 1, 0b11111, "frecpx">; +defm FRSQRTE : SIMDTwoScalarSD< 1, 1, 0b11101, "frsqrte">; +defm NEG : SIMDTwoScalarD< 1, 0b01011, "neg", + UnOpFrag<(sub immAllZerosV, node:$LHS)> >; +defm SCVTF : SIMDTwoScalarCVTSD< 0, 0, 0b11101, "scvtf", AArch64sitof>; +defm SQABS : SIMDTwoScalarBHSD< 0, 0b00111, "sqabs", int_aarch64_neon_sqabs>; +defm SQNEG : SIMDTwoScalarBHSD< 1, 0b00111, "sqneg", int_aarch64_neon_sqneg>; +defm SQXTN : SIMDTwoScalarMixedBHS< 0, 0b10100, "sqxtn", int_aarch64_neon_scalar_sqxtn>; +defm SQXTUN : SIMDTwoScalarMixedBHS< 1, 0b10010, "sqxtun", int_aarch64_neon_scalar_sqxtun>; +defm SUQADD : SIMDTwoScalarBHSDTied< 0, 0b00011, "suqadd", + int_aarch64_neon_suqadd>; +defm UCVTF : SIMDTwoScalarCVTSD< 1, 0, 0b11101, "ucvtf", AArch64uitof>; +defm UQXTN : SIMDTwoScalarMixedBHS<1, 0b10100, "uqxtn", int_aarch64_neon_scalar_uqxtn>; +defm USQADD : SIMDTwoScalarBHSDTied< 1, 0b00011, "usqadd", + int_aarch64_neon_usqadd>; + +def : Pat<(AArch64neg (v1i64 V64:$Rn)), (NEGv1i64 V64:$Rn)>; + +def : Pat<(v1i64 (int_aarch64_neon_fcvtas (v1f64 FPR64:$Rn))), + (FCVTASv1i64 FPR64:$Rn)>; +def : Pat<(v1i64 (int_aarch64_neon_fcvtau (v1f64 FPR64:$Rn))), + (FCVTAUv1i64 FPR64:$Rn)>; +def : Pat<(v1i64 (int_aarch64_neon_fcvtms (v1f64 FPR64:$Rn))), + (FCVTMSv1i64 FPR64:$Rn)>; +def : Pat<(v1i64 (int_aarch64_neon_fcvtmu (v1f64 FPR64:$Rn))), + (FCVTMUv1i64 FPR64:$Rn)>; +def : Pat<(v1i64 (int_aarch64_neon_fcvtns (v1f64 FPR64:$Rn))), + (FCVTNSv1i64 FPR64:$Rn)>; +def : Pat<(v1i64 (int_aarch64_neon_fcvtnu (v1f64 FPR64:$Rn))), + (FCVTNUv1i64 FPR64:$Rn)>; +def : Pat<(v1i64 (int_aarch64_neon_fcvtps (v1f64 FPR64:$Rn))), + (FCVTPSv1i64 FPR64:$Rn)>; +def : Pat<(v1i64 (int_aarch64_neon_fcvtpu (v1f64 FPR64:$Rn))), + (FCVTPUv1i64 FPR64:$Rn)>; + +def : Pat<(f32 (int_aarch64_neon_frecpe (f32 FPR32:$Rn))), + (FRECPEv1i32 FPR32:$Rn)>; +def : Pat<(f64 (int_aarch64_neon_frecpe (f64 FPR64:$Rn))), + (FRECPEv1i64 FPR64:$Rn)>; +def : Pat<(v1f64 (int_aarch64_neon_frecpe (v1f64 
FPR64:$Rn))), + (FRECPEv1i64 FPR64:$Rn)>; + +def : Pat<(f32 (int_aarch64_neon_frecpx (f32 FPR32:$Rn))), + (FRECPXv1i32 FPR32:$Rn)>; +def : Pat<(f64 (int_aarch64_neon_frecpx (f64 FPR64:$Rn))), + (FRECPXv1i64 FPR64:$Rn)>; + +def : Pat<(f32 (int_aarch64_neon_frsqrte (f32 FPR32:$Rn))), + (FRSQRTEv1i32 FPR32:$Rn)>; +def : Pat<(f64 (int_aarch64_neon_frsqrte (f64 FPR64:$Rn))), + (FRSQRTEv1i64 FPR64:$Rn)>; +def : Pat<(v1f64 (int_aarch64_neon_frsqrte (v1f64 FPR64:$Rn))), + (FRSQRTEv1i64 FPR64:$Rn)>; + +// If an integer is about to be converted to a floating point value, +// just load it on the floating point unit. +// Here are the patterns for 8 and 16-bits to float. +// 8-bits -> float. +multiclass UIntToFPROLoadPat { + def : Pat<(DstTy (uint_to_fp (SrcTy + (loadop (ro.Wpat GPR64sp:$Rn, GPR32:$Rm, + ro.Wext:$extend))))), + (UCVTF (INSERT_SUBREG (DstTy (IMPLICIT_DEF)), + (LDRW GPR64sp:$Rn, GPR32:$Rm, ro.Wext:$extend), + sub))>; + + def : Pat<(DstTy (uint_to_fp (SrcTy + (loadop (ro.Xpat GPR64sp:$Rn, GPR64:$Rm, + ro.Wext:$extend))))), + (UCVTF (INSERT_SUBREG (DstTy (IMPLICIT_DEF)), + (LDRX GPR64sp:$Rn, GPR64:$Rm, ro.Xext:$extend), + sub))>; +} + +defm : UIntToFPROLoadPat; +def : Pat <(f32 (uint_to_fp (i32 + (zextloadi8 (am_indexed8 GPR64sp:$Rn, uimm12s1:$offset))))), + (UCVTFv1i32 (INSERT_SUBREG (f32 (IMPLICIT_DEF)), + (LDRBui GPR64sp:$Rn, uimm12s1:$offset), bsub))>; +def : Pat <(f32 (uint_to_fp (i32 + (zextloadi8 (am_unscaled8 GPR64sp:$Rn, simm9:$offset))))), + (UCVTFv1i32 (INSERT_SUBREG (f32 (IMPLICIT_DEF)), + (LDURBi GPR64sp:$Rn, simm9:$offset), bsub))>; +// 16-bits -> float. +defm : UIntToFPROLoadPat; +def : Pat <(f32 (uint_to_fp (i32 + (zextloadi16 (am_indexed16 GPR64sp:$Rn, uimm12s2:$offset))))), + (UCVTFv1i32 (INSERT_SUBREG (f32 (IMPLICIT_DEF)), + (LDRHui GPR64sp:$Rn, uimm12s2:$offset), hsub))>; +def : Pat <(f32 (uint_to_fp (i32 + (zextloadi16 (am_unscaled16 GPR64sp:$Rn, simm9:$offset))))), + (UCVTFv1i32 (INSERT_SUBREG (f32 (IMPLICIT_DEF)), + (LDURHi GPR64sp:$Rn, simm9:$offset), hsub))>; +// 32-bits are handled in target specific dag combine: +// performIntToFpCombine. +// 64-bits integer to 32-bits floating point, not possible with +// UCVTF on floating point registers (both source and destination +// must have the same size). + +// Here are the patterns for 8, 16, 32, and 64-bits to double. +// 8-bits -> double. +defm : UIntToFPROLoadPat; +def : Pat <(f64 (uint_to_fp (i32 + (zextloadi8 (am_indexed8 GPR64sp:$Rn, uimm12s1:$offset))))), + (UCVTFv1i64 (INSERT_SUBREG (f64 (IMPLICIT_DEF)), + (LDRBui GPR64sp:$Rn, uimm12s1:$offset), bsub))>; +def : Pat <(f64 (uint_to_fp (i32 + (zextloadi8 (am_unscaled8 GPR64sp:$Rn, simm9:$offset))))), + (UCVTFv1i64 (INSERT_SUBREG (f64 (IMPLICIT_DEF)), + (LDURBi GPR64sp:$Rn, simm9:$offset), bsub))>; +// 16-bits -> double. +defm : UIntToFPROLoadPat; +def : Pat <(f64 (uint_to_fp (i32 + (zextloadi16 (am_indexed16 GPR64sp:$Rn, uimm12s2:$offset))))), + (UCVTFv1i64 (INSERT_SUBREG (f64 (IMPLICIT_DEF)), + (LDRHui GPR64sp:$Rn, uimm12s2:$offset), hsub))>; +def : Pat <(f64 (uint_to_fp (i32 + (zextloadi16 (am_unscaled16 GPR64sp:$Rn, simm9:$offset))))), + (UCVTFv1i64 (INSERT_SUBREG (f64 (IMPLICIT_DEF)), + (LDURHi GPR64sp:$Rn, simm9:$offset), hsub))>; +// 32-bits -> double. 
+defm : UIntToFPROLoadPat; +def : Pat <(f64 (uint_to_fp (i32 + (load (am_indexed32 GPR64sp:$Rn, uimm12s4:$offset))))), + (UCVTFv1i64 (INSERT_SUBREG (f64 (IMPLICIT_DEF)), + (LDRSui GPR64sp:$Rn, uimm12s4:$offset), ssub))>; +def : Pat <(f64 (uint_to_fp (i32 + (load (am_unscaled32 GPR64sp:$Rn, simm9:$offset))))), + (UCVTFv1i64 (INSERT_SUBREG (f64 (IMPLICIT_DEF)), + (LDURSi GPR64sp:$Rn, simm9:$offset), ssub))>; +// 64-bits -> double are handled in target specific dag combine: +// performIntToFpCombine. + +//===----------------------------------------------------------------------===// +// Advanced SIMD three different-sized vector instructions. +//===----------------------------------------------------------------------===// + +defm ADDHN : SIMDNarrowThreeVectorBHS<0,0b0100,"addhn", int_aarch64_neon_addhn>; +defm SUBHN : SIMDNarrowThreeVectorBHS<0,0b0110,"subhn", int_aarch64_neon_subhn>; +defm RADDHN : SIMDNarrowThreeVectorBHS<1,0b0100,"raddhn",int_aarch64_neon_raddhn>; +defm RSUBHN : SIMDNarrowThreeVectorBHS<1,0b0110,"rsubhn",int_aarch64_neon_rsubhn>; +defm PMULL : SIMDDifferentThreeVectorBD<0,0b1110,"pmull",int_aarch64_neon_pmull>; +defm SABAL : SIMDLongThreeVectorTiedBHSabal<0,0b0101,"sabal", + int_aarch64_neon_sabd>; +defm SABDL : SIMDLongThreeVectorBHSabdl<0, 0b0111, "sabdl", + int_aarch64_neon_sabd>; +defm SADDL : SIMDLongThreeVectorBHS< 0, 0b0000, "saddl", + BinOpFrag<(add (sext node:$LHS), (sext node:$RHS))>>; +defm SADDW : SIMDWideThreeVectorBHS< 0, 0b0001, "saddw", + BinOpFrag<(add node:$LHS, (sext node:$RHS))>>; +defm SMLAL : SIMDLongThreeVectorTiedBHS<0, 0b1000, "smlal", + TriOpFrag<(add node:$LHS, (int_aarch64_neon_smull node:$MHS, node:$RHS))>>; +defm SMLSL : SIMDLongThreeVectorTiedBHS<0, 0b1010, "smlsl", + TriOpFrag<(sub node:$LHS, (int_aarch64_neon_smull node:$MHS, node:$RHS))>>; +defm SMULL : SIMDLongThreeVectorBHS<0, 0b1100, "smull", int_aarch64_neon_smull>; +defm SQDMLAL : SIMDLongThreeVectorSQDMLXTiedHS<0, 0b1001, "sqdmlal", + int_aarch64_neon_sqadd>; +defm SQDMLSL : SIMDLongThreeVectorSQDMLXTiedHS<0, 0b1011, "sqdmlsl", + int_aarch64_neon_sqsub>; +defm SQDMULL : SIMDLongThreeVectorHS<0, 0b1101, "sqdmull", + int_aarch64_neon_sqdmull>; +defm SSUBL : SIMDLongThreeVectorBHS<0, 0b0010, "ssubl", + BinOpFrag<(sub (sext node:$LHS), (sext node:$RHS))>>; +defm SSUBW : SIMDWideThreeVectorBHS<0, 0b0011, "ssubw", + BinOpFrag<(sub node:$LHS, (sext node:$RHS))>>; +defm UABAL : SIMDLongThreeVectorTiedBHSabal<1, 0b0101, "uabal", + int_aarch64_neon_uabd>; +defm UABDL : SIMDLongThreeVectorBHSabdl<1, 0b0111, "uabdl", + int_aarch64_neon_uabd>; +defm UADDL : SIMDLongThreeVectorBHS<1, 0b0000, "uaddl", + BinOpFrag<(add (zext node:$LHS), (zext node:$RHS))>>; +defm UADDW : SIMDWideThreeVectorBHS<1, 0b0001, "uaddw", + BinOpFrag<(add node:$LHS, (zext node:$RHS))>>; +defm UMLAL : SIMDLongThreeVectorTiedBHS<1, 0b1000, "umlal", + TriOpFrag<(add node:$LHS, (int_aarch64_neon_umull node:$MHS, node:$RHS))>>; +defm UMLSL : SIMDLongThreeVectorTiedBHS<1, 0b1010, "umlsl", + TriOpFrag<(sub node:$LHS, (int_aarch64_neon_umull node:$MHS, node:$RHS))>>; +defm UMULL : SIMDLongThreeVectorBHS<1, 0b1100, "umull", int_aarch64_neon_umull>; +defm USUBL : SIMDLongThreeVectorBHS<1, 0b0010, "usubl", + BinOpFrag<(sub (zext node:$LHS), (zext node:$RHS))>>; +defm USUBW : SIMDWideThreeVectorBHS< 1, 0b0011, "usubw", + BinOpFrag<(sub node:$LHS, (zext node:$RHS))>>; + +// Patterns for 64-bit pmull +def : Pat<(int_aarch64_neon_pmull64 V64:$Rn, V64:$Rm), + (PMULLv1i64 V64:$Rn, V64:$Rm)>; +def : Pat<(int_aarch64_neon_pmull64 
(vector_extract (v2i64 V128:$Rn), (i64 1)), + (vector_extract (v2i64 V128:$Rm), (i64 1))), + (PMULLv2i64 V128:$Rn, V128:$Rm)>; + +// CodeGen patterns for addhn and subhn instructions, which can actually be +// written in LLVM IR without too much difficulty. + +// ADDHN +def : Pat<(v8i8 (trunc (v8i16 (AArch64vlshr (add V128:$Rn, V128:$Rm), (i32 8))))), + (ADDHNv8i16_v8i8 V128:$Rn, V128:$Rm)>; +def : Pat<(v4i16 (trunc (v4i32 (AArch64vlshr (add V128:$Rn, V128:$Rm), + (i32 16))))), + (ADDHNv4i32_v4i16 V128:$Rn, V128:$Rm)>; +def : Pat<(v2i32 (trunc (v2i64 (AArch64vlshr (add V128:$Rn, V128:$Rm), + (i32 32))))), + (ADDHNv2i64_v2i32 V128:$Rn, V128:$Rm)>; +def : Pat<(concat_vectors (v8i8 V64:$Rd), + (trunc (v8i16 (AArch64vlshr (add V128:$Rn, V128:$Rm), + (i32 8))))), + (ADDHNv8i16_v16i8 (SUBREG_TO_REG (i32 0), V64:$Rd, dsub), + V128:$Rn, V128:$Rm)>; +def : Pat<(concat_vectors (v4i16 V64:$Rd), + (trunc (v4i32 (AArch64vlshr (add V128:$Rn, V128:$Rm), + (i32 16))))), + (ADDHNv4i32_v8i16 (SUBREG_TO_REG (i32 0), V64:$Rd, dsub), + V128:$Rn, V128:$Rm)>; +def : Pat<(concat_vectors (v2i32 V64:$Rd), + (trunc (v2i64 (AArch64vlshr (add V128:$Rn, V128:$Rm), + (i32 32))))), + (ADDHNv2i64_v4i32 (SUBREG_TO_REG (i32 0), V64:$Rd, dsub), + V128:$Rn, V128:$Rm)>; + +// SUBHN +def : Pat<(v8i8 (trunc (v8i16 (AArch64vlshr (sub V128:$Rn, V128:$Rm), (i32 8))))), + (SUBHNv8i16_v8i8 V128:$Rn, V128:$Rm)>; +def : Pat<(v4i16 (trunc (v4i32 (AArch64vlshr (sub V128:$Rn, V128:$Rm), + (i32 16))))), + (SUBHNv4i32_v4i16 V128:$Rn, V128:$Rm)>; +def : Pat<(v2i32 (trunc (v2i64 (AArch64vlshr (sub V128:$Rn, V128:$Rm), + (i32 32))))), + (SUBHNv2i64_v2i32 V128:$Rn, V128:$Rm)>; +def : Pat<(concat_vectors (v8i8 V64:$Rd), + (trunc (v8i16 (AArch64vlshr (sub V128:$Rn, V128:$Rm), + (i32 8))))), + (SUBHNv8i16_v16i8 (SUBREG_TO_REG (i32 0), V64:$Rd, dsub), + V128:$Rn, V128:$Rm)>; +def : Pat<(concat_vectors (v4i16 V64:$Rd), + (trunc (v4i32 (AArch64vlshr (sub V128:$Rn, V128:$Rm), + (i32 16))))), + (SUBHNv4i32_v8i16 (SUBREG_TO_REG (i32 0), V64:$Rd, dsub), + V128:$Rn, V128:$Rm)>; +def : Pat<(concat_vectors (v2i32 V64:$Rd), + (trunc (v2i64 (AArch64vlshr (sub V128:$Rn, V128:$Rm), + (i32 32))))), + (SUBHNv2i64_v4i32 (SUBREG_TO_REG (i32 0), V64:$Rd, dsub), + V128:$Rn, V128:$Rm)>; + +//---------------------------------------------------------------------------- +// AdvSIMD bitwise extract from vector instruction. +//---------------------------------------------------------------------------- + +defm EXT : SIMDBitwiseExtract<"ext">; + +def : Pat<(v4i16 (AArch64ext V64:$Rn, V64:$Rm, (i32 imm:$imm))), + (EXTv8i8 V64:$Rn, V64:$Rm, imm:$imm)>; +def : Pat<(v8i16 (AArch64ext V128:$Rn, V128:$Rm, (i32 imm:$imm))), + (EXTv16i8 V128:$Rn, V128:$Rm, imm:$imm)>; +def : Pat<(v2i32 (AArch64ext V64:$Rn, V64:$Rm, (i32 imm:$imm))), + (EXTv8i8 V64:$Rn, V64:$Rm, imm:$imm)>; +def : Pat<(v2f32 (AArch64ext V64:$Rn, V64:$Rm, (i32 imm:$imm))), + (EXTv8i8 V64:$Rn, V64:$Rm, imm:$imm)>; +def : Pat<(v4i32 (AArch64ext V128:$Rn, V128:$Rm, (i32 imm:$imm))), + (EXTv16i8 V128:$Rn, V128:$Rm, imm:$imm)>; +def : Pat<(v4f32 (AArch64ext V128:$Rn, V128:$Rm, (i32 imm:$imm))), + (EXTv16i8 V128:$Rn, V128:$Rm, imm:$imm)>; +def : Pat<(v2i64 (AArch64ext V128:$Rn, V128:$Rm, (i32 imm:$imm))), + (EXTv16i8 V128:$Rn, V128:$Rm, imm:$imm)>; +def : Pat<(v2f64 (AArch64ext V128:$Rn, V128:$Rm, (i32 imm:$imm))), + (EXTv16i8 V128:$Rn, V128:$Rm, imm:$imm)>; + +// We use EXT to handle extract_subvector to copy the upper 64-bits of a +// 128-bit vector. 
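[Editorial note: the upper-half copy described in the comment above is what the ACLE vget_high_* intrinsics expose. A minimal sketch, assuming <arm_neon.h>; the function name is made up:]

    #include <arm_neon.h>

    /* Extracting the upper 64 bits of a 128-bit vector; with the patterns
       that follow this can be selected as a single
       "ext v0.16b, v1.16b, v1.16b, #8" rather than a trip through memory. */
    uint8x8_t upper_half(uint8x16_t v) {
        return vget_high_u8(v);
    }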
+def : Pat<(v8i8 (extract_subvector V128:$Rn, (i64 8))), + (EXTRACT_SUBREG (EXTv16i8 V128:$Rn, V128:$Rn, 8), dsub)>; +def : Pat<(v4i16 (extract_subvector V128:$Rn, (i64 4))), + (EXTRACT_SUBREG (EXTv16i8 V128:$Rn, V128:$Rn, 8), dsub)>; +def : Pat<(v2i32 (extract_subvector V128:$Rn, (i64 2))), + (EXTRACT_SUBREG (EXTv16i8 V128:$Rn, V128:$Rn, 8), dsub)>; +def : Pat<(v1i64 (extract_subvector V128:$Rn, (i64 1))), + (EXTRACT_SUBREG (EXTv16i8 V128:$Rn, V128:$Rn, 8), dsub)>; +def : Pat<(v2f32 (extract_subvector V128:$Rn, (i64 2))), + (EXTRACT_SUBREG (EXTv16i8 V128:$Rn, V128:$Rn, 8), dsub)>; +def : Pat<(v1f64 (extract_subvector V128:$Rn, (i64 1))), + (EXTRACT_SUBREG (EXTv16i8 V128:$Rn, V128:$Rn, 8), dsub)>; + + +//---------------------------------------------------------------------------- +// AdvSIMD zip vector +//---------------------------------------------------------------------------- + +defm TRN1 : SIMDZipVector<0b010, "trn1", AArch64trn1>; +defm TRN2 : SIMDZipVector<0b110, "trn2", AArch64trn2>; +defm UZP1 : SIMDZipVector<0b001, "uzp1", AArch64uzp1>; +defm UZP2 : SIMDZipVector<0b101, "uzp2", AArch64uzp2>; +defm ZIP1 : SIMDZipVector<0b011, "zip1", AArch64zip1>; +defm ZIP2 : SIMDZipVector<0b111, "zip2", AArch64zip2>; + +//---------------------------------------------------------------------------- +// AdvSIMD TBL/TBX instructions +//---------------------------------------------------------------------------- + +defm TBL : SIMDTableLookup< 0, "tbl">; +defm TBX : SIMDTableLookupTied<1, "tbx">; + +def : Pat<(v8i8 (int_aarch64_neon_tbl1 (v16i8 VecListOne128:$Rn), (v8i8 V64:$Ri))), + (TBLv8i8One VecListOne128:$Rn, V64:$Ri)>; +def : Pat<(v16i8 (int_aarch64_neon_tbl1 (v16i8 V128:$Ri), (v16i8 V128:$Rn))), + (TBLv16i8One V128:$Ri, V128:$Rn)>; + +def : Pat<(v8i8 (int_aarch64_neon_tbx1 (v8i8 V64:$Rd), + (v16i8 VecListOne128:$Rn), (v8i8 V64:$Ri))), + (TBXv8i8One V64:$Rd, VecListOne128:$Rn, V64:$Ri)>; +def : Pat<(v16i8 (int_aarch64_neon_tbx1 (v16i8 V128:$Rd), + (v16i8 V128:$Ri), (v16i8 V128:$Rn))), + (TBXv16i8One V128:$Rd, V128:$Ri, V128:$Rn)>; + + +//---------------------------------------------------------------------------- +// AdvSIMD scalar CPY instruction +//---------------------------------------------------------------------------- + +defm CPY : SIMDScalarCPY<"cpy">; + +//---------------------------------------------------------------------------- +// AdvSIMD scalar pairwise instructions +//---------------------------------------------------------------------------- + +defm ADDP : SIMDPairwiseScalarD<0, 0b11011, "addp">; +defm FADDP : SIMDPairwiseScalarSD<1, 0, 0b01101, "faddp">; +defm FMAXNMP : SIMDPairwiseScalarSD<1, 0, 0b01100, "fmaxnmp">; +defm FMAXP : SIMDPairwiseScalarSD<1, 0, 0b01111, "fmaxp">; +defm FMINNMP : SIMDPairwiseScalarSD<1, 1, 0b01100, "fminnmp">; +defm FMINP : SIMDPairwiseScalarSD<1, 1, 0b01111, "fminp">; +def : Pat<(i64 (int_aarch64_neon_saddv (v2i64 V128:$Rn))), + (ADDPv2i64p V128:$Rn)>; +def : Pat<(i64 (int_aarch64_neon_uaddv (v2i64 V128:$Rn))), + (ADDPv2i64p V128:$Rn)>; +def : Pat<(f32 (int_aarch64_neon_faddv (v2f32 V64:$Rn))), + (FADDPv2i32p V64:$Rn)>; +def : Pat<(f32 (int_aarch64_neon_faddv (v4f32 V128:$Rn))), + (FADDPv2i32p (EXTRACT_SUBREG (FADDPv4f32 V128:$Rn, V128:$Rn), dsub))>; +def : Pat<(f64 (int_aarch64_neon_faddv (v2f64 V128:$Rn))), + (FADDPv2i64p V128:$Rn)>; +def : Pat<(f32 (int_aarch64_neon_fmaxnmv (v2f32 V64:$Rn))), + (FMAXNMPv2i32p V64:$Rn)>; +def : Pat<(f64 (int_aarch64_neon_fmaxnmv (v2f64 V128:$Rn))), + (FMAXNMPv2i64p V128:$Rn)>; +def : Pat<(f32 
(int_aarch64_neon_fmaxv (v2f32 V64:$Rn))), + (FMAXPv2i32p V64:$Rn)>; +def : Pat<(f64 (int_aarch64_neon_fmaxv (v2f64 V128:$Rn))), + (FMAXPv2i64p V128:$Rn)>; +def : Pat<(f32 (int_aarch64_neon_fminnmv (v2f32 V64:$Rn))), + (FMINNMPv2i32p V64:$Rn)>; +def : Pat<(f64 (int_aarch64_neon_fminnmv (v2f64 V128:$Rn))), + (FMINNMPv2i64p V128:$Rn)>; +def : Pat<(f32 (int_aarch64_neon_fminv (v2f32 V64:$Rn))), + (FMINPv2i32p V64:$Rn)>; +def : Pat<(f64 (int_aarch64_neon_fminv (v2f64 V128:$Rn))), + (FMINPv2i64p V128:$Rn)>; + +//---------------------------------------------------------------------------- +// AdvSIMD INS/DUP instructions +//---------------------------------------------------------------------------- + +def DUPv8i8gpr : SIMDDupFromMain<0, 0b00001, ".8b", v8i8, V64, GPR32>; +def DUPv16i8gpr : SIMDDupFromMain<1, 0b00001, ".16b", v16i8, V128, GPR32>; +def DUPv4i16gpr : SIMDDupFromMain<0, 0b00010, ".4h", v4i16, V64, GPR32>; +def DUPv8i16gpr : SIMDDupFromMain<1, 0b00010, ".8h", v8i16, V128, GPR32>; +def DUPv2i32gpr : SIMDDupFromMain<0, 0b00100, ".2s", v2i32, V64, GPR32>; +def DUPv4i32gpr : SIMDDupFromMain<1, 0b00100, ".4s", v4i32, V128, GPR32>; +def DUPv2i64gpr : SIMDDupFromMain<1, 0b01000, ".2d", v2i64, V128, GPR64>; + +def DUPv2i64lane : SIMDDup64FromElement; +def DUPv2i32lane : SIMDDup32FromElement<0, ".2s", v2i32, V64>; +def DUPv4i32lane : SIMDDup32FromElement<1, ".4s", v4i32, V128>; +def DUPv4i16lane : SIMDDup16FromElement<0, ".4h", v4i16, V64>; +def DUPv8i16lane : SIMDDup16FromElement<1, ".8h", v8i16, V128>; +def DUPv8i8lane : SIMDDup8FromElement <0, ".8b", v8i8, V64>; +def DUPv16i8lane : SIMDDup8FromElement <1, ".16b", v16i8, V128>; + +def : Pat<(v2f32 (AArch64dup (f32 FPR32:$Rn))), + (v2f32 (DUPv2i32lane + (INSERT_SUBREG (v4i32 (IMPLICIT_DEF)), FPR32:$Rn, ssub), + (i64 0)))>; +def : Pat<(v4f32 (AArch64dup (f32 FPR32:$Rn))), + (v4f32 (DUPv4i32lane + (INSERT_SUBREG (v4i32 (IMPLICIT_DEF)), FPR32:$Rn, ssub), + (i64 0)))>; +def : Pat<(v2f64 (AArch64dup (f64 FPR64:$Rn))), + (v2f64 (DUPv2i64lane + (INSERT_SUBREG (v4i32 (IMPLICIT_DEF)), FPR64:$Rn, dsub), + (i64 0)))>; + +def : Pat<(v2f32 (AArch64duplane32 (v4f32 V128:$Rn), VectorIndexS:$imm)), + (DUPv2i32lane V128:$Rn, VectorIndexS:$imm)>; +def : Pat<(v4f32 (AArch64duplane32 (v4f32 V128:$Rn), VectorIndexS:$imm)), + (DUPv4i32lane V128:$Rn, VectorIndexS:$imm)>; +def : Pat<(v2f64 (AArch64duplane64 (v2f64 V128:$Rn), VectorIndexD:$imm)), + (DUPv2i64lane V128:$Rn, VectorIndexD:$imm)>; + +// If there's an (AArch64dup (vector_extract ...) ...), we can use a duplane +// instruction even if the types don't match: we just have to remap the lane +// carefully. N.b. this trick only applies to truncations. 
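[Editorial note: a hedged C illustration of the lane remapping described above, assuming <arm_neon.h>; the function name is made up. Broadcasting the truncated low byte of a 32-bit lane is the same as broadcasting the corresponding byte lane, which is what the VecIndex_x4 transform below computes:]

    #include <arm_neon.h>

    /* Truncation keeps the low-order byte of the 32-bit element, so
       duplicating (uint8_t)lane[1] of a .4s vector is the same as
       duplicating byte lane 1*4 = 4 of the .16b view. */
    uint8x16_t dup_truncated_lane1(uint32x4_t v) {
        uint8_t b = (uint8_t)vgetq_lane_u32(v, 1);
        return vdupq_n_u8(b);
    }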
+def VecIndex_x2 : SDNodeXForm<imm, [{
+  return CurDAG->getTargetConstant(2 * N->getZExtValue(), MVT::i64);
+}]>;
+def VecIndex_x4 : SDNodeXForm<imm, [{
+  return CurDAG->getTargetConstant(4 * N->getZExtValue(), MVT::i64);
+}]>;
+def VecIndex_x8 : SDNodeXForm<imm, [{
+  return CurDAG->getTargetConstant(8 * N->getZExtValue(), MVT::i64);
+}]>;
+
+multiclass DUPWithTruncPats<ValueType ResVT, ValueType Src64VT, ValueType Src128VT,
+                            ValueType ScalVT, Instruction DUP, SDNodeXForm IdxXFORM> {
+  def : Pat<(ResVT (AArch64dup (ScalVT (vector_extract (Src128VT V128:$Rn),
+                                                        imm:$idx)))),
+            (DUP V128:$Rn, (IdxXFORM imm:$idx))>;
+
+  def : Pat<(ResVT (AArch64dup (ScalVT (vector_extract (Src64VT V64:$Rn),
+                                                        imm:$idx)))),
+            (DUP (SUBREG_TO_REG (i64 0), V64:$Rn, dsub), (IdxXFORM imm:$idx))>;
+}
+
+defm : DUPWithTruncPats<v8i8,  v4i16, v8i16, i32, DUPv8i8lane,  VecIndex_x2>;
+defm : DUPWithTruncPats<v8i8,  v2i32, v4i32, i32, DUPv8i8lane,  VecIndex_x4>;
+defm : DUPWithTruncPats<v4i16, v2i32, v4i32, i32, DUPv4i16lane, VecIndex_x2>;
+
+defm : DUPWithTruncPats<v16i8, v4i16, v8i16, i32, DUPv16i8lane, VecIndex_x2>;
+defm : DUPWithTruncPats<v16i8, v2i32, v4i32, i32, DUPv16i8lane, VecIndex_x4>;
+defm : DUPWithTruncPats<v8i16, v2i32, v4i32, i32, DUPv8i16lane, VecIndex_x2>;
+
+multiclass DUPWithTrunci64Pats<ValueType ResVT, Instruction DUP,
+                               SDNodeXForm IdxXFORM> {
+  def : Pat<(ResVT (AArch64dup (i32 (trunc (vector_extract (v2i64 V128:$Rn),
+                                                            imm:$idx))))),
+            (DUP V128:$Rn, (IdxXFORM imm:$idx))>;
+
+  def : Pat<(ResVT (AArch64dup (i32 (trunc (vector_extract (v1i64 V64:$Rn),
+                                                            imm:$idx))))),
+            (DUP (SUBREG_TO_REG (i64 0), V64:$Rn, dsub), (IdxXFORM imm:$idx))>;
+}
+
+defm : DUPWithTrunci64Pats<v8i8,  DUPv8i8lane,  VecIndex_x8>;
+defm : DUPWithTrunci64Pats<v4i16, DUPv4i16lane, VecIndex_x4>;
+defm : DUPWithTrunci64Pats<v2i32, DUPv2i32lane, VecIndex_x2>;
+
+defm : DUPWithTrunci64Pats<v16i8, DUPv16i8lane, VecIndex_x8>;
+defm : DUPWithTrunci64Pats<v8i16, DUPv8i16lane, VecIndex_x4>;
+defm : DUPWithTrunci64Pats<v4i32, DUPv4i32lane, VecIndex_x2>;
+
+// SMOV and UMOV definitions, with some extra patterns for convenience
+defm SMOV : SMov;
+defm UMOV : UMov;
+
+def : Pat<(sext_inreg (vector_extract (v16i8 V128:$Rn), VectorIndexB:$idx), i8),
+          (i32 (SMOVvi8to32 V128:$Rn, VectorIndexB:$idx))>;
+def : Pat<(sext_inreg (vector_extract (v16i8 V128:$Rn), VectorIndexB:$idx), i8),
+          (i64 (SMOVvi8to64 V128:$Rn, VectorIndexB:$idx))>;
+def : Pat<(sext_inreg (vector_extract (v8i16 V128:$Rn), VectorIndexH:$idx),i16),
+          (i32 (SMOVvi16to32 V128:$Rn, VectorIndexH:$idx))>;
+def : Pat<(sext_inreg (vector_extract (v8i16 V128:$Rn), VectorIndexH:$idx),i16),
+          (i64 (SMOVvi16to64 V128:$Rn, VectorIndexH:$idx))>;
+def : Pat<(sext_inreg (vector_extract (v8i16 V128:$Rn), VectorIndexH:$idx),i16),
+          (i32 (SMOVvi16to32 V128:$Rn, VectorIndexH:$idx))>;
+def : Pat<(sext (i32 (vector_extract (v4i32 V128:$Rn), VectorIndexS:$idx))),
+          (i64 (SMOVvi32to64 V128:$Rn, VectorIndexS:$idx))>;
+
+// Extracting i8 or i16 elements will have the zero-extend transformed to
+// an 'and' mask by type legalization since neither i8 nor i16 are legal types
+// for AArch64. Match these patterns here since UMOV already zeroes out the high
+// bits of the destination register.
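[Editorial note: the zero-extension behaviour relied on by the following 'and' patterns can be seen from a small C example, assuming <arm_neon.h> and <stdint.h>; the function name is made up:]

    #include <arm_neon.h>
    #include <stdint.h>

    /* Reading byte lane 3 into a 32-bit integer: "umov w0, v0.b[3]" already
       clears the upper bits of w0, so the zero-extend (legalized to an
       'and' with 0xff) costs nothing extra. */
    uint32_t read_byte_lane3(uint8x16_t v) {
        return vgetq_lane_u8(v, 3);
    }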
+def : Pat<(and (vector_extract (v16i8 V128:$Rn), VectorIndexB:$idx), + (i32 0xff)), + (i32 (UMOVvi8 V128:$Rn, VectorIndexB:$idx))>; +def : Pat<(and (vector_extract (v8i16 V128:$Rn), VectorIndexH:$idx), + (i32 0xffff)), + (i32 (UMOVvi16 V128:$Rn, VectorIndexH:$idx))>; + +defm INS : SIMDIns; + +def : Pat<(v16i8 (scalar_to_vector GPR32:$Rn)), + (SUBREG_TO_REG (i32 0), + (f32 (COPY_TO_REGCLASS GPR32:$Rn, FPR32)), ssub)>; +def : Pat<(v8i8 (scalar_to_vector GPR32:$Rn)), + (SUBREG_TO_REG (i32 0), + (f32 (COPY_TO_REGCLASS GPR32:$Rn, FPR32)), ssub)>; + +def : Pat<(v8i16 (scalar_to_vector GPR32:$Rn)), + (SUBREG_TO_REG (i32 0), + (f32 (COPY_TO_REGCLASS GPR32:$Rn, FPR32)), ssub)>; +def : Pat<(v4i16 (scalar_to_vector GPR32:$Rn)), + (SUBREG_TO_REG (i32 0), + (f32 (COPY_TO_REGCLASS GPR32:$Rn, FPR32)), ssub)>; + +def : Pat<(v2i32 (scalar_to_vector (i32 FPR32:$Rn))), + (v2i32 (INSERT_SUBREG (v2i32 (IMPLICIT_DEF)), + (i32 FPR32:$Rn), ssub))>; +def : Pat<(v4i32 (scalar_to_vector (i32 FPR32:$Rn))), + (v4i32 (INSERT_SUBREG (v4i32 (IMPLICIT_DEF)), + (i32 FPR32:$Rn), ssub))>; +def : Pat<(v2i64 (scalar_to_vector (i64 FPR64:$Rn))), + (v2i64 (INSERT_SUBREG (v2i64 (IMPLICIT_DEF)), + (i64 FPR64:$Rn), dsub))>; + +def : Pat<(v4f32 (scalar_to_vector (f32 FPR32:$Rn))), + (INSERT_SUBREG (v4f32 (IMPLICIT_DEF)), FPR32:$Rn, ssub)>; +def : Pat<(v2f32 (scalar_to_vector (f32 FPR32:$Rn))), + (INSERT_SUBREG (v2f32 (IMPLICIT_DEF)), FPR32:$Rn, ssub)>; +def : Pat<(v2f64 (scalar_to_vector (f64 FPR64:$Rn))), + (INSERT_SUBREG (v2f64 (IMPLICIT_DEF)), FPR64:$Rn, dsub)>; + +def : Pat<(v2f32 (vector_insert (v2f32 V64:$Rn), + (f32 FPR32:$Rm), (i64 VectorIndexS:$imm))), + (EXTRACT_SUBREG + (INSvi32lane + (v4f32 (INSERT_SUBREG (v4f32 (IMPLICIT_DEF)), V64:$Rn, dsub)), + VectorIndexS:$imm, + (v4f32 (INSERT_SUBREG (v4f32 (IMPLICIT_DEF)), FPR32:$Rm, ssub)), + (i64 0)), + dsub)>; +def : Pat<(v4f32 (vector_insert (v4f32 V128:$Rn), + (f32 FPR32:$Rm), (i64 VectorIndexS:$imm))), + (INSvi32lane + V128:$Rn, VectorIndexS:$imm, + (v4f32 (INSERT_SUBREG (v4f32 (IMPLICIT_DEF)), FPR32:$Rm, ssub)), + (i64 0))>; +def : Pat<(v2f64 (vector_insert (v2f64 V128:$Rn), + (f64 FPR64:$Rm), (i64 VectorIndexD:$imm))), + (INSvi64lane + V128:$Rn, VectorIndexD:$imm, + (v2f64 (INSERT_SUBREG (v2f64 (IMPLICIT_DEF)), FPR64:$Rm, dsub)), + (i64 0))>; + +// Copy an element at a constant index in one vector into a constant indexed +// element of another. 
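[Editorial note: the lane-to-lane copy described above corresponds to the ACLE vcopy intrinsics. A minimal sketch, assuming <arm_neon.h>; the function name is made up:]

    #include <arm_neon.h>

    /* Copying lane 2 of src into lane 0 of dst; with the vcopy_lane patterns
       below this should become a single "ins v0.s[0], v1.s[2]". */
    uint32x4_t copy_lane2_to_lane0(uint32x4_t dst, uint32x4_t src) {
        return vcopyq_laneq_u32(dst, 0, src, 2);
    }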
+// FIXME refactor to a shared class/dev parameterized on vector type, vector +// index type and INS extension +def : Pat<(v16i8 (int_aarch64_neon_vcopy_lane + (v16i8 V128:$Vd), VectorIndexB:$idx, (v16i8 V128:$Vs), + VectorIndexB:$idx2)), + (v16i8 (INSvi8lane + V128:$Vd, VectorIndexB:$idx, V128:$Vs, VectorIndexB:$idx2) + )>; +def : Pat<(v8i16 (int_aarch64_neon_vcopy_lane + (v8i16 V128:$Vd), VectorIndexH:$idx, (v8i16 V128:$Vs), + VectorIndexH:$idx2)), + (v8i16 (INSvi16lane + V128:$Vd, VectorIndexH:$idx, V128:$Vs, VectorIndexH:$idx2) + )>; +def : Pat<(v4i32 (int_aarch64_neon_vcopy_lane + (v4i32 V128:$Vd), VectorIndexS:$idx, (v4i32 V128:$Vs), + VectorIndexS:$idx2)), + (v4i32 (INSvi32lane + V128:$Vd, VectorIndexS:$idx, V128:$Vs, VectorIndexS:$idx2) + )>; +def : Pat<(v2i64 (int_aarch64_neon_vcopy_lane + (v2i64 V128:$Vd), VectorIndexD:$idx, (v2i64 V128:$Vs), + VectorIndexD:$idx2)), + (v2i64 (INSvi64lane + V128:$Vd, VectorIndexD:$idx, V128:$Vs, VectorIndexD:$idx2) + )>; + +multiclass Neon_INS_elt_pattern { + def : Pat<(VT128 (vector_insert V128:$src, + (VTScal (vector_extract (VT128 V128:$Rn), imm:$Immn)), + imm:$Immd)), + (INS V128:$src, imm:$Immd, V128:$Rn, imm:$Immn)>; + + def : Pat<(VT128 (vector_insert V128:$src, + (VTScal (vector_extract (VT64 V64:$Rn), imm:$Immn)), + imm:$Immd)), + (INS V128:$src, imm:$Immd, + (SUBREG_TO_REG (i64 0), V64:$Rn, dsub), imm:$Immn)>; + + def : Pat<(VT64 (vector_insert V64:$src, + (VTScal (vector_extract (VT128 V128:$Rn), imm:$Immn)), + imm:$Immd)), + (EXTRACT_SUBREG (INS (SUBREG_TO_REG (i64 0), V64:$src, dsub), + imm:$Immd, V128:$Rn, imm:$Immn), + dsub)>; + + def : Pat<(VT64 (vector_insert V64:$src, + (VTScal (vector_extract (VT64 V64:$Rn), imm:$Immn)), + imm:$Immd)), + (EXTRACT_SUBREG + (INS (SUBREG_TO_REG (i64 0), V64:$src, dsub), imm:$Immd, + (SUBREG_TO_REG (i64 0), V64:$Rn, dsub), imm:$Immn), + dsub)>; +} + +defm : Neon_INS_elt_pattern; +defm : Neon_INS_elt_pattern; +defm : Neon_INS_elt_pattern; +defm : Neon_INS_elt_pattern; +defm : Neon_INS_elt_pattern; +defm : Neon_INS_elt_pattern; + + +// Floating point vector extractions are codegen'd as either a sequence of +// subregister extractions, possibly fed by an INS if the lane number is +// anything other than zero. +def : Pat<(vector_extract (v2f64 V128:$Rn), 0), + (f64 (EXTRACT_SUBREG V128:$Rn, dsub))>; +def : Pat<(vector_extract (v4f32 V128:$Rn), 0), + (f32 (EXTRACT_SUBREG V128:$Rn, ssub))>; +def : Pat<(vector_extract (v2f64 V128:$Rn), VectorIndexD:$idx), + (f64 (EXTRACT_SUBREG + (INSvi64lane (v2f64 (IMPLICIT_DEF)), 0, + V128:$Rn, VectorIndexD:$idx), + dsub))>; +def : Pat<(vector_extract (v4f32 V128:$Rn), VectorIndexS:$idx), + (f32 (EXTRACT_SUBREG + (INSvi32lane (v4f32 (IMPLICIT_DEF)), 0, + V128:$Rn, VectorIndexS:$idx), + ssub))>; + +// All concat_vectors operations are canonicalised to act on i64 vectors for +// AArch64. In the general case we need an instruction, which had just as well be +// INS. 
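To make the canonicalisation note above concrete (the ConcatPat/ConcatUndefPat classes it refers to follow below), a hedged C++ sketch; the expected instruction is indicative only:

    #include <arm_neon.h>

    // Joining two 64-bit halves: expected to lower to one "mov v0.d[1], v1.d[0]"
    // (INSvi64lane), since concat_vectors is treated as acting on i64 lanes.
    int32x4_t join(int32x2_t lo, int32x2_t hi) { return vcombine_s32(lo, hi); }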
+class ConcatPat + : Pat<(DstTy (concat_vectors (SrcTy V64:$Rd), V64:$Rn)), + (INSvi64lane (INSERT_SUBREG (IMPLICIT_DEF), V64:$Rd, dsub), 1, + (INSERT_SUBREG (IMPLICIT_DEF), V64:$Rn, dsub), 0)>; + +def : ConcatPat; +def : ConcatPat; +def : ConcatPat; +def : ConcatPat; +def : ConcatPat; +def : ConcatPat; + +// If the high lanes are undef, though, we can just ignore them: +class ConcatUndefPat + : Pat<(DstTy (concat_vectors (SrcTy V64:$Rn), undef)), + (INSERT_SUBREG (IMPLICIT_DEF), V64:$Rn, dsub)>; + +def : ConcatUndefPat; +def : ConcatUndefPat; +def : ConcatUndefPat; +def : ConcatUndefPat; +def : ConcatUndefPat; +def : ConcatUndefPat; + +//---------------------------------------------------------------------------- +// AdvSIMD across lanes instructions +//---------------------------------------------------------------------------- + +defm ADDV : SIMDAcrossLanesBHS<0, 0b11011, "addv">; +defm SMAXV : SIMDAcrossLanesBHS<0, 0b01010, "smaxv">; +defm SMINV : SIMDAcrossLanesBHS<0, 0b11010, "sminv">; +defm UMAXV : SIMDAcrossLanesBHS<1, 0b01010, "umaxv">; +defm UMINV : SIMDAcrossLanesBHS<1, 0b11010, "uminv">; +defm SADDLV : SIMDAcrossLanesHSD<0, 0b00011, "saddlv">; +defm UADDLV : SIMDAcrossLanesHSD<1, 0b00011, "uaddlv">; +defm FMAXNMV : SIMDAcrossLanesS<0b01100, 0, "fmaxnmv", int_aarch64_neon_fmaxnmv>; +defm FMAXV : SIMDAcrossLanesS<0b01111, 0, "fmaxv", int_aarch64_neon_fmaxv>; +defm FMINNMV : SIMDAcrossLanesS<0b01100, 1, "fminnmv", int_aarch64_neon_fminnmv>; +defm FMINV : SIMDAcrossLanesS<0b01111, 1, "fminv", int_aarch64_neon_fminv>; + +multiclass SIMDAcrossLanesSignedIntrinsic { +// If there is a sign extension after this intrinsic, consume it as smov already +// performed it + def : Pat<(i32 (sext_inreg (i32 (intOp (v8i8 V64:$Rn))), i8)), + (i32 (SMOVvi8to32 + (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)), + (!cast(!strconcat(baseOpc, "v8i8v")) V64:$Rn), bsub), + (i64 0)))>; + def : Pat<(i32 (intOp (v8i8 V64:$Rn))), + (i32 (SMOVvi8to32 + (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)), + (!cast(!strconcat(baseOpc, "v8i8v")) V64:$Rn), bsub), + (i64 0)))>; +// If there is a sign extension after this intrinsic, consume it as smov already +// performed it +def : Pat<(i32 (sext_inreg (i32 (intOp (v16i8 V128:$Rn))), i8)), + (i32 (SMOVvi8to32 + (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)), + (!cast(!strconcat(baseOpc, "v16i8v")) V128:$Rn), bsub), + (i64 0)))>; +def : Pat<(i32 (intOp (v16i8 V128:$Rn))), + (i32 (SMOVvi8to32 + (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)), + (!cast(!strconcat(baseOpc, "v16i8v")) V128:$Rn), bsub), + (i64 0)))>; +// If there is a sign extension after this intrinsic, consume it as smov already +// performed it +def : Pat<(i32 (sext_inreg (i32 (intOp (v4i16 V64:$Rn))), i16)), + (i32 (SMOVvi16to32 + (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)), + (!cast(!strconcat(baseOpc, "v4i16v")) V64:$Rn), hsub), + (i64 0)))>; +def : Pat<(i32 (intOp (v4i16 V64:$Rn))), + (i32 (SMOVvi16to32 + (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)), + (!cast(!strconcat(baseOpc, "v4i16v")) V64:$Rn), hsub), + (i64 0)))>; +// If there is a sign extension after this intrinsic, consume it as smov already +// performed it +def : Pat<(i32 (sext_inreg (i32 (intOp (v8i16 V128:$Rn))), i16)), + (i32 (SMOVvi16to32 + (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)), + (!cast(!strconcat(baseOpc, "v8i16v")) V128:$Rn), hsub), + (i64 0)))>; +def : Pat<(i32 (intOp (v8i16 V128:$Rn))), + (i32 (SMOVvi16to32 + (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)), + (!cast(!strconcat(baseOpc, "v8i16v")) V128:$Rn), hsub), + (i64 0)))>; + +def : Pat<(i32 (intOp (v4i32 V128:$Rn))), + (i32 
(EXTRACT_SUBREG + (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)), + (!cast(!strconcat(baseOpc, "v4i32v")) V128:$Rn), ssub), + ssub))>; +} + +multiclass SIMDAcrossLanesUnsignedIntrinsic { +// If there is a masking operation keeping only what has been actually +// generated, consume it. + def : Pat<(i32 (and (i32 (intOp (v8i8 V64:$Rn))), maski8_or_more)), + (i32 (EXTRACT_SUBREG + (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)), + (!cast(!strconcat(baseOpc, "v8i8v")) V64:$Rn), bsub), + ssub))>; + def : Pat<(i32 (intOp (v8i8 V64:$Rn))), + (i32 (EXTRACT_SUBREG + (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)), + (!cast(!strconcat(baseOpc, "v8i8v")) V64:$Rn), bsub), + ssub))>; +// If there is a masking operation keeping only what has been actually +// generated, consume it. +def : Pat<(i32 (and (i32 (intOp (v16i8 V128:$Rn))), maski8_or_more)), + (i32 (EXTRACT_SUBREG + (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)), + (!cast(!strconcat(baseOpc, "v16i8v")) V128:$Rn), bsub), + ssub))>; +def : Pat<(i32 (intOp (v16i8 V128:$Rn))), + (i32 (EXTRACT_SUBREG + (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)), + (!cast(!strconcat(baseOpc, "v16i8v")) V128:$Rn), bsub), + ssub))>; + +// If there is a masking operation keeping only what has been actually +// generated, consume it. +def : Pat<(i32 (and (i32 (intOp (v4i16 V64:$Rn))), maski16_or_more)), + (i32 (EXTRACT_SUBREG + (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)), + (!cast(!strconcat(baseOpc, "v4i16v")) V64:$Rn), hsub), + ssub))>; +def : Pat<(i32 (intOp (v4i16 V64:$Rn))), + (i32 (EXTRACT_SUBREG + (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)), + (!cast(!strconcat(baseOpc, "v4i16v")) V64:$Rn), hsub), + ssub))>; +// If there is a masking operation keeping only what has been actually +// generated, consume it. +def : Pat<(i32 (and (i32 (intOp (v8i16 V128:$Rn))), maski16_or_more)), + (i32 (EXTRACT_SUBREG + (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)), + (!cast(!strconcat(baseOpc, "v8i16v")) V128:$Rn), hsub), + ssub))>; +def : Pat<(i32 (intOp (v8i16 V128:$Rn))), + (i32 (EXTRACT_SUBREG + (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)), + (!cast(!strconcat(baseOpc, "v8i16v")) V128:$Rn), hsub), + ssub))>; + +def : Pat<(i32 (intOp (v4i32 V128:$Rn))), + (i32 (EXTRACT_SUBREG + (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)), + (!cast(!strconcat(baseOpc, "v4i32v")) V128:$Rn), ssub), + ssub))>; + +} + +multiclass SIMDAcrossLanesSignedLongIntrinsic { + def : Pat<(i32 (intOp (v8i8 V64:$Rn))), + (i32 (SMOVvi16to32 + (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)), + (!cast(!strconcat(baseOpc, "v8i8v")) V64:$Rn), hsub), + (i64 0)))>; +def : Pat<(i32 (intOp (v16i8 V128:$Rn))), + (i32 (SMOVvi16to32 + (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)), + (!cast(!strconcat(baseOpc, "v16i8v")) V128:$Rn), hsub), + (i64 0)))>; + +def : Pat<(i32 (intOp (v4i16 V64:$Rn))), + (i32 (EXTRACT_SUBREG + (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)), + (!cast(!strconcat(baseOpc, "v4i16v")) V64:$Rn), ssub), + ssub))>; +def : Pat<(i32 (intOp (v8i16 V128:$Rn))), + (i32 (EXTRACT_SUBREG + (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)), + (!cast(!strconcat(baseOpc, "v8i16v")) V128:$Rn), ssub), + ssub))>; + +def : Pat<(i64 (intOp (v4i32 V128:$Rn))), + (i64 (EXTRACT_SUBREG + (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)), + (!cast(!strconcat(baseOpc, "v4i32v")) V128:$Rn), dsub), + dsub))>; +} + +multiclass SIMDAcrossLanesUnsignedLongIntrinsic { + def : Pat<(i32 (intOp (v8i8 V64:$Rn))), + (i32 (EXTRACT_SUBREG + (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)), + (!cast(!strconcat(baseOpc, "v8i8v")) V64:$Rn), hsub), + ssub))>; +def : Pat<(i32 (intOp (v16i8 V128:$Rn))), + (i32 (EXTRACT_SUBREG + (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)), + 
(!cast(!strconcat(baseOpc, "v16i8v")) V128:$Rn), hsub), + ssub))>; + +def : Pat<(i32 (intOp (v4i16 V64:$Rn))), + (i32 (EXTRACT_SUBREG + (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)), + (!cast(!strconcat(baseOpc, "v4i16v")) V64:$Rn), ssub), + ssub))>; +def : Pat<(i32 (intOp (v8i16 V128:$Rn))), + (i32 (EXTRACT_SUBREG + (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)), + (!cast(!strconcat(baseOpc, "v8i16v")) V128:$Rn), ssub), + ssub))>; + +def : Pat<(i64 (intOp (v4i32 V128:$Rn))), + (i64 (EXTRACT_SUBREG + (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)), + (!cast(!strconcat(baseOpc, "v4i32v")) V128:$Rn), dsub), + dsub))>; +} + +defm : SIMDAcrossLanesSignedIntrinsic<"ADDV", int_aarch64_neon_saddv>; +// vaddv_[su]32 is special; -> ADDP Vd.2S,Vn.2S,Vm.2S; return Vd.s[0];Vn==Vm +def : Pat<(i32 (int_aarch64_neon_saddv (v2i32 V64:$Rn))), + (EXTRACT_SUBREG (ADDPv2i32 V64:$Rn, V64:$Rn), ssub)>; + +defm : SIMDAcrossLanesUnsignedIntrinsic<"ADDV", int_aarch64_neon_uaddv>; +// vaddv_[su]32 is special; -> ADDP Vd.2S,Vn.2S,Vm.2S; return Vd.s[0];Vn==Vm +def : Pat<(i32 (int_aarch64_neon_uaddv (v2i32 V64:$Rn))), + (EXTRACT_SUBREG (ADDPv2i32 V64:$Rn, V64:$Rn), ssub)>; + +defm : SIMDAcrossLanesSignedIntrinsic<"SMAXV", int_aarch64_neon_smaxv>; +def : Pat<(i32 (int_aarch64_neon_smaxv (v2i32 V64:$Rn))), + (EXTRACT_SUBREG (SMAXPv2i32 V64:$Rn, V64:$Rn), ssub)>; + +defm : SIMDAcrossLanesSignedIntrinsic<"SMINV", int_aarch64_neon_sminv>; +def : Pat<(i32 (int_aarch64_neon_sminv (v2i32 V64:$Rn))), + (EXTRACT_SUBREG (SMINPv2i32 V64:$Rn, V64:$Rn), ssub)>; + +defm : SIMDAcrossLanesUnsignedIntrinsic<"UMAXV", int_aarch64_neon_umaxv>; +def : Pat<(i32 (int_aarch64_neon_umaxv (v2i32 V64:$Rn))), + (EXTRACT_SUBREG (UMAXPv2i32 V64:$Rn, V64:$Rn), ssub)>; + +defm : SIMDAcrossLanesUnsignedIntrinsic<"UMINV", int_aarch64_neon_uminv>; +def : Pat<(i32 (int_aarch64_neon_uminv (v2i32 V64:$Rn))), + (EXTRACT_SUBREG (UMINPv2i32 V64:$Rn, V64:$Rn), ssub)>; + +defm : SIMDAcrossLanesSignedLongIntrinsic<"SADDLV", int_aarch64_neon_saddlv>; +defm : SIMDAcrossLanesUnsignedLongIntrinsic<"UADDLV", int_aarch64_neon_uaddlv>; + +// The vaddlv_s32 intrinsic gets mapped to SADDLP. +def : Pat<(i64 (int_aarch64_neon_saddlv (v2i32 V64:$Rn))), + (i64 (EXTRACT_SUBREG + (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)), + (SADDLPv2i32_v1i64 V64:$Rn), dsub), + dsub))>; +// The vaddlv_u32 intrinsic gets mapped to UADDLP. 
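A short C++/ACLE illustration of the two-lane special cases called out above (the UADDLV pattern itself follows below); names and exact codegen are indicative only:

    #include <arm_neon.h>
    #include <cstdint>

    // Only two lanes, so a pairwise "addp v0.2s, v0.2s, v0.2s" plus a scalar
    // move is expected rather than a full across-lanes ADDV.
    int32_t sum_2s(int32x2_t v) { return vaddv_s32(v); }

    // Long form: expected to map onto UADDLP (pairwise long add).
    uint64_t sumlong_2s(uint32x2_t v) { return vaddlv_u32(v); }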
+def : Pat<(i64 (int_aarch64_neon_uaddlv (v2i32 V64:$Rn))), + (i64 (EXTRACT_SUBREG + (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)), + (UADDLPv2i32_v1i64 V64:$Rn), dsub), + dsub))>; + +//------------------------------------------------------------------------------ +// AdvSIMD modified immediate instructions +//------------------------------------------------------------------------------ + +// AdvSIMD BIC +defm BIC : SIMDModifiedImmVectorShiftTied<1, 0b11, 0b01, "bic", AArch64bici>; +// AdvSIMD ORR +defm ORR : SIMDModifiedImmVectorShiftTied<0, 0b11, 0b01, "orr", AArch64orri>; + +def : InstAlias<"bic $Vd.4h, $imm", (BICv4i16 V64:$Vd, imm0_255:$imm, 0)>; +def : InstAlias<"bic $Vd.8h, $imm", (BICv8i16 V128:$Vd, imm0_255:$imm, 0)>; +def : InstAlias<"bic $Vd.2s, $imm", (BICv2i32 V64:$Vd, imm0_255:$imm, 0)>; +def : InstAlias<"bic $Vd.4s, $imm", (BICv4i32 V128:$Vd, imm0_255:$imm, 0)>; + +def : InstAlias<"bic.4h $Vd, $imm", (BICv4i16 V64:$Vd, imm0_255:$imm, 0), 0>; +def : InstAlias<"bic.8h $Vd, $imm", (BICv8i16 V128:$Vd, imm0_255:$imm, 0), 0>; +def : InstAlias<"bic.2s $Vd, $imm", (BICv2i32 V64:$Vd, imm0_255:$imm, 0), 0>; +def : InstAlias<"bic.4s $Vd, $imm", (BICv4i32 V128:$Vd, imm0_255:$imm, 0), 0>; + +def : InstAlias<"orr $Vd.4h, $imm", (ORRv4i16 V64:$Vd, imm0_255:$imm, 0)>; +def : InstAlias<"orr $Vd.8h, $imm", (ORRv8i16 V128:$Vd, imm0_255:$imm, 0)>; +def : InstAlias<"orr $Vd.2s, $imm", (ORRv2i32 V64:$Vd, imm0_255:$imm, 0)>; +def : InstAlias<"orr $Vd.4s, $imm", (ORRv4i32 V128:$Vd, imm0_255:$imm, 0)>; + +def : InstAlias<"orr.4h $Vd, $imm", (ORRv4i16 V64:$Vd, imm0_255:$imm, 0), 0>; +def : InstAlias<"orr.8h $Vd, $imm", (ORRv8i16 V128:$Vd, imm0_255:$imm, 0), 0>; +def : InstAlias<"orr.2s $Vd, $imm", (ORRv2i32 V64:$Vd, imm0_255:$imm, 0), 0>; +def : InstAlias<"orr.4s $Vd, $imm", (ORRv4i32 V128:$Vd, imm0_255:$imm, 0), 0>; + +// AdvSIMD FMOV +def FMOVv2f64_ns : SIMDModifiedImmVectorNoShift<1, 1, 0b1111, V128, fpimm8, + "fmov", ".2d", + [(set (v2f64 V128:$Rd), (AArch64fmov imm0_255:$imm8))]>; +def FMOVv2f32_ns : SIMDModifiedImmVectorNoShift<0, 0, 0b1111, V64, fpimm8, + "fmov", ".2s", + [(set (v2f32 V64:$Rd), (AArch64fmov imm0_255:$imm8))]>; +def FMOVv4f32_ns : SIMDModifiedImmVectorNoShift<1, 0, 0b1111, V128, fpimm8, + "fmov", ".4s", + [(set (v4f32 V128:$Rd), (AArch64fmov imm0_255:$imm8))]>; + +// AdvSIMD MOVI + +// EDIT byte mask: scalar +let isReMaterializable = 1, isAsCheapAsAMove = 1 in +def MOVID : SIMDModifiedImmScalarNoShift<0, 1, 0b1110, "movi", + [(set FPR64:$Rd, simdimmtype10:$imm8)]>; +// The movi_edit node has the immediate value already encoded, so we use +// a plain imm0_255 here. +def : Pat<(f64 (AArch64movi_edit imm0_255:$shift)), + (MOVID imm0_255:$shift)>; + +def : Pat<(v1i64 immAllZerosV), (MOVID (i32 0))>; +def : Pat<(v2i32 immAllZerosV), (MOVID (i32 0))>; +def : Pat<(v4i16 immAllZerosV), (MOVID (i32 0))>; +def : Pat<(v8i8 immAllZerosV), (MOVID (i32 0))>; + +def : Pat<(v1i64 immAllOnesV), (MOVID (i32 255))>; +def : Pat<(v2i32 immAllOnesV), (MOVID (i32 255))>; +def : Pat<(v4i16 immAllOnesV), (MOVID (i32 255))>; +def : Pat<(v8i8 immAllOnesV), (MOVID (i32 255))>; + +// EDIT byte mask: 2d + +// The movi_edit node has the immediate value already encoded, so we use +// a plain imm0_255 in the pattern +let isReMaterializable = 1, isAsCheapAsAMove = 1 in +def MOVIv2d_ns : SIMDModifiedImmVectorNoShift<1, 1, 0b1110, V128, + simdimmtype10, + "movi", ".2d", + [(set (v2i64 V128:$Rd), (AArch64movi_edit imm0_255:$imm8))]>; + + +// Use movi.2d to materialize 0.0 if the HW does zero-cycle zeroing. 
+// Complexity is added to break a tie with a plain MOVI. +let AddedComplexity = 1 in { +def : Pat<(f32 fpimm0), + (f32 (EXTRACT_SUBREG (v2i64 (MOVIv2d_ns (i32 0))), ssub))>, + Requires<[HasZCZ]>; +def : Pat<(f64 fpimm0), + (f64 (EXTRACT_SUBREG (v2i64 (MOVIv2d_ns (i32 0))), dsub))>, + Requires<[HasZCZ]>; +} + +def : Pat<(v2i64 immAllZerosV), (MOVIv2d_ns (i32 0))>; +def : Pat<(v4i32 immAllZerosV), (MOVIv2d_ns (i32 0))>; +def : Pat<(v8i16 immAllZerosV), (MOVIv2d_ns (i32 0))>; +def : Pat<(v16i8 immAllZerosV), (MOVIv2d_ns (i32 0))>; + +def : Pat<(v2i64 immAllOnesV), (MOVIv2d_ns (i32 255))>; +def : Pat<(v4i32 immAllOnesV), (MOVIv2d_ns (i32 255))>; +def : Pat<(v8i16 immAllOnesV), (MOVIv2d_ns (i32 255))>; +def : Pat<(v16i8 immAllOnesV), (MOVIv2d_ns (i32 255))>; + +def : Pat<(v2f64 (AArch64dup (f64 fpimm0))), (MOVIv2d_ns (i32 0))>; +def : Pat<(v4f32 (AArch64dup (f32 fpimm0))), (MOVIv2d_ns (i32 0))>; + +// EDIT per word & halfword: 2s, 4h, 4s, & 8h +defm MOVI : SIMDModifiedImmVectorShift<0, 0b10, 0b00, "movi">; + +def : InstAlias<"movi $Vd.4h, $imm", (MOVIv4i16 V64:$Vd, imm0_255:$imm, 0), 0>; +def : InstAlias<"movi $Vd.8h, $imm", (MOVIv8i16 V128:$Vd, imm0_255:$imm, 0), 0>; +def : InstAlias<"movi $Vd.2s, $imm", (MOVIv2i32 V64:$Vd, imm0_255:$imm, 0), 0>; +def : InstAlias<"movi $Vd.4s, $imm", (MOVIv4i32 V128:$Vd, imm0_255:$imm, 0), 0>; + +def : InstAlias<"movi.4h $Vd, $imm", (MOVIv4i16 V64:$Vd, imm0_255:$imm, 0), 0>; +def : InstAlias<"movi.8h $Vd, $imm", (MOVIv8i16 V128:$Vd, imm0_255:$imm, 0), 0>; +def : InstAlias<"movi.2s $Vd, $imm", (MOVIv2i32 V64:$Vd, imm0_255:$imm, 0), 0>; +def : InstAlias<"movi.4s $Vd, $imm", (MOVIv4i32 V128:$Vd, imm0_255:$imm, 0), 0>; + +def : Pat<(v2i32 (AArch64movi_shift imm0_255:$imm8, (i32 imm:$shift))), + (MOVIv2i32 imm0_255:$imm8, imm:$shift)>; +def : Pat<(v4i32 (AArch64movi_shift imm0_255:$imm8, (i32 imm:$shift))), + (MOVIv4i32 imm0_255:$imm8, imm:$shift)>; +def : Pat<(v4i16 (AArch64movi_shift imm0_255:$imm8, (i32 imm:$shift))), + (MOVIv4i16 imm0_255:$imm8, imm:$shift)>; +def : Pat<(v8i16 (AArch64movi_shift imm0_255:$imm8, (i32 imm:$shift))), + (MOVIv8i16 imm0_255:$imm8, imm:$shift)>; + +// EDIT per word: 2s & 4s with MSL shifter +def MOVIv2s_msl : SIMDModifiedImmMoveMSL<0, 0, {1,1,0,?}, V64, "movi", ".2s", + [(set (v2i32 V64:$Rd), + (AArch64movi_msl imm0_255:$imm8, (i32 imm:$shift)))]>; +def MOVIv4s_msl : SIMDModifiedImmMoveMSL<1, 0, {1,1,0,?}, V128, "movi", ".4s", + [(set (v4i32 V128:$Rd), + (AArch64movi_msl imm0_255:$imm8, (i32 imm:$shift)))]>; + +// Per byte: 8b & 16b +def MOVIv8b_ns : SIMDModifiedImmVectorNoShift<0, 0, 0b1110, V64, imm0_255, + "movi", ".8b", + [(set (v8i8 V64:$Rd), (AArch64movi imm0_255:$imm8))]>; +def MOVIv16b_ns : SIMDModifiedImmVectorNoShift<1, 0, 0b1110, V128, imm0_255, + "movi", ".16b", + [(set (v16i8 V128:$Rd), (AArch64movi imm0_255:$imm8))]>; + +// AdvSIMD MVNI + +// EDIT per word & halfword: 2s, 4h, 4s, & 8h +defm MVNI : SIMDModifiedImmVectorShift<1, 0b10, 0b00, "mvni">; + +def : InstAlias<"mvni $Vd.4h, $imm", (MVNIv4i16 V64:$Vd, imm0_255:$imm, 0), 0>; +def : InstAlias<"mvni $Vd.8h, $imm", (MVNIv8i16 V128:$Vd, imm0_255:$imm, 0), 0>; +def : InstAlias<"mvni $Vd.2s, $imm", (MVNIv2i32 V64:$Vd, imm0_255:$imm, 0), 0>; +def : InstAlias<"mvni $Vd.4s, $imm", (MVNIv4i32 V128:$Vd, imm0_255:$imm, 0), 0>; + +def : InstAlias<"mvni.4h $Vd, $imm", (MVNIv4i16 V64:$Vd, imm0_255:$imm, 0), 0>; +def : InstAlias<"mvni.8h $Vd, $imm", (MVNIv8i16 V128:$Vd, imm0_255:$imm, 0), 0>; +def : InstAlias<"mvni.2s $Vd, $imm", (MVNIv2i32 V64:$Vd, imm0_255:$imm, 0), 0>; +def : 
InstAlias<"mvni.4s $Vd, $imm", (MVNIv4i32 V128:$Vd, imm0_255:$imm, 0), 0>; + +def : Pat<(v2i32 (AArch64mvni_shift imm0_255:$imm8, (i32 imm:$shift))), + (MVNIv2i32 imm0_255:$imm8, imm:$shift)>; +def : Pat<(v4i32 (AArch64mvni_shift imm0_255:$imm8, (i32 imm:$shift))), + (MVNIv4i32 imm0_255:$imm8, imm:$shift)>; +def : Pat<(v4i16 (AArch64mvni_shift imm0_255:$imm8, (i32 imm:$shift))), + (MVNIv4i16 imm0_255:$imm8, imm:$shift)>; +def : Pat<(v8i16 (AArch64mvni_shift imm0_255:$imm8, (i32 imm:$shift))), + (MVNIv8i16 imm0_255:$imm8, imm:$shift)>; + +// EDIT per word: 2s & 4s with MSL shifter +def MVNIv2s_msl : SIMDModifiedImmMoveMSL<0, 1, {1,1,0,?}, V64, "mvni", ".2s", + [(set (v2i32 V64:$Rd), + (AArch64mvni_msl imm0_255:$imm8, (i32 imm:$shift)))]>; +def MVNIv4s_msl : SIMDModifiedImmMoveMSL<1, 1, {1,1,0,?}, V128, "mvni", ".4s", + [(set (v4i32 V128:$Rd), + (AArch64mvni_msl imm0_255:$imm8, (i32 imm:$shift)))]>; + +//---------------------------------------------------------------------------- +// AdvSIMD indexed element +//---------------------------------------------------------------------------- + +let neverHasSideEffects = 1 in { + defm FMLA : SIMDFPIndexedSDTied<0, 0b0001, "fmla">; + defm FMLS : SIMDFPIndexedSDTied<0, 0b0101, "fmls">; +} + +// NOTE: Operands are reordered in the FMLA/FMLS PatFrags because the +// instruction expects the addend first, while the intrinsic expects it last. + +// On the other hand, there are quite a few valid combinatorial options due to +// the commutativity of multiplication and the fact that (-x) * y = x * (-y). +defm : SIMDFPIndexedSDTiedPatterns<"FMLA", + TriOpFrag<(fma node:$RHS, node:$MHS, node:$LHS)>>; +defm : SIMDFPIndexedSDTiedPatterns<"FMLA", + TriOpFrag<(fma node:$MHS, node:$RHS, node:$LHS)>>; + +defm : SIMDFPIndexedSDTiedPatterns<"FMLS", + TriOpFrag<(fma node:$MHS, (fneg node:$RHS), node:$LHS)> >; +defm : SIMDFPIndexedSDTiedPatterns<"FMLS", + TriOpFrag<(fma node:$RHS, (fneg node:$MHS), node:$LHS)> >; +defm : SIMDFPIndexedSDTiedPatterns<"FMLS", + TriOpFrag<(fma (fneg node:$RHS), node:$MHS, node:$LHS)> >; +defm : SIMDFPIndexedSDTiedPatterns<"FMLS", + TriOpFrag<(fma (fneg node:$MHS), node:$RHS, node:$LHS)> >; + +multiclass FMLSIndexedAfterNegPatterns { + // 3 variants for the .2s version: DUPLANE from 128-bit, DUPLANE from 64-bit + // and DUP scalar. + def : Pat<(v2f32 (OpNode (v2f32 V64:$Rd), (v2f32 V64:$Rn), + (AArch64duplane32 (v4f32 (fneg V128:$Rm)), + VectorIndexS:$idx))), + (FMLSv2i32_indexed V64:$Rd, V64:$Rn, V128:$Rm, VectorIndexS:$idx)>; + def : Pat<(v2f32 (OpNode (v2f32 V64:$Rd), (v2f32 V64:$Rn), + (v2f32 (AArch64duplane32 + (v4f32 (insert_subvector undef, + (v2f32 (fneg V64:$Rm)), + (i32 0))), + VectorIndexS:$idx)))), + (FMLSv2i32_indexed V64:$Rd, V64:$Rn, + (SUBREG_TO_REG (i32 0), V64:$Rm, dsub), + VectorIndexS:$idx)>; + def : Pat<(v2f32 (OpNode (v2f32 V64:$Rd), (v2f32 V64:$Rn), + (AArch64dup (f32 (fneg FPR32Op:$Rm))))), + (FMLSv2i32_indexed V64:$Rd, V64:$Rn, + (SUBREG_TO_REG (i32 0), FPR32Op:$Rm, ssub), (i64 0))>; + + // 3 variants for the .4s version: DUPLANE from 128-bit, DUPLANE from 64-bit + // and DUP scalar. 
+ def : Pat<(v4f32 (OpNode (v4f32 V128:$Rd), (v4f32 V128:$Rn), + (AArch64duplane32 (v4f32 (fneg V128:$Rm)), + VectorIndexS:$idx))), + (FMLSv4i32_indexed V128:$Rd, V128:$Rn, V128:$Rm, + VectorIndexS:$idx)>; + def : Pat<(v4f32 (OpNode (v4f32 V128:$Rd), (v4f32 V128:$Rn), + (v4f32 (AArch64duplane32 + (v4f32 (insert_subvector undef, + (v2f32 (fneg V64:$Rm)), + (i32 0))), + VectorIndexS:$idx)))), + (FMLSv4i32_indexed V128:$Rd, V128:$Rn, + (SUBREG_TO_REG (i32 0), V64:$Rm, dsub), + VectorIndexS:$idx)>; + def : Pat<(v4f32 (OpNode (v4f32 V128:$Rd), (v4f32 V128:$Rn), + (AArch64dup (f32 (fneg FPR32Op:$Rm))))), + (FMLSv4i32_indexed V128:$Rd, V128:$Rn, + (SUBREG_TO_REG (i32 0), FPR32Op:$Rm, ssub), (i64 0))>; + + // 2 variants for the .2d version: DUPLANE from 128-bit, and DUP scalar + // (DUPLANE from 64-bit would be trivial). + def : Pat<(v2f64 (OpNode (v2f64 V128:$Rd), (v2f64 V128:$Rn), + (AArch64duplane64 (v2f64 (fneg V128:$Rm)), + VectorIndexD:$idx))), + (FMLSv2i64_indexed + V128:$Rd, V128:$Rn, V128:$Rm, VectorIndexS:$idx)>; + def : Pat<(v2f64 (OpNode (v2f64 V128:$Rd), (v2f64 V128:$Rn), + (AArch64dup (f64 (fneg FPR64Op:$Rm))))), + (FMLSv2i64_indexed V128:$Rd, V128:$Rn, + (SUBREG_TO_REG (i32 0), FPR64Op:$Rm, dsub), (i64 0))>; + + // 2 variants for 32-bit scalar version: extract from .2s or from .4s + def : Pat<(f32 (OpNode (f32 FPR32:$Rd), (f32 FPR32:$Rn), + (vector_extract (v4f32 (fneg V128:$Rm)), + VectorIndexS:$idx))), + (FMLSv1i32_indexed FPR32:$Rd, FPR32:$Rn, + V128:$Rm, VectorIndexS:$idx)>; + def : Pat<(f32 (OpNode (f32 FPR32:$Rd), (f32 FPR32:$Rn), + (vector_extract (v2f32 (fneg V64:$Rm)), + VectorIndexS:$idx))), + (FMLSv1i32_indexed FPR32:$Rd, FPR32:$Rn, + (SUBREG_TO_REG (i32 0), V64:$Rm, dsub), VectorIndexS:$idx)>; + + // 1 variant for 64-bit scalar version: extract from .1d or from .2d + def : Pat<(f64 (OpNode (f64 FPR64:$Rd), (f64 FPR64:$Rn), + (vector_extract (v2f64 (fneg V128:$Rm)), + VectorIndexS:$idx))), + (FMLSv1i64_indexed FPR64:$Rd, FPR64:$Rn, + V128:$Rm, VectorIndexS:$idx)>; +} + +defm : FMLSIndexedAfterNegPatterns< + TriOpFrag<(fma node:$RHS, node:$MHS, node:$LHS)> >; +defm : FMLSIndexedAfterNegPatterns< + TriOpFrag<(fma node:$MHS, node:$RHS, node:$LHS)> >; + +defm FMULX : SIMDFPIndexedSD<1, 0b1001, "fmulx", int_aarch64_neon_fmulx>; +defm FMUL : SIMDFPIndexedSD<0, 0b1001, "fmul", fmul>; + +def : Pat<(v2f32 (fmul V64:$Rn, (AArch64dup (f32 FPR32:$Rm)))), + (FMULv2i32_indexed V64:$Rn, + (INSERT_SUBREG (v4i32 (IMPLICIT_DEF)), FPR32:$Rm, ssub), + (i64 0))>; +def : Pat<(v4f32 (fmul V128:$Rn, (AArch64dup (f32 FPR32:$Rm)))), + (FMULv4i32_indexed V128:$Rn, + (INSERT_SUBREG (v4i32 (IMPLICIT_DEF)), FPR32:$Rm, ssub), + (i64 0))>; +def : Pat<(v2f64 (fmul V128:$Rn, (AArch64dup (f64 FPR64:$Rm)))), + (FMULv2i64_indexed V128:$Rn, + (INSERT_SUBREG (v4i32 (IMPLICIT_DEF)), FPR64:$Rm, dsub), + (i64 0))>; + +defm SQDMULH : SIMDIndexedHS<0, 0b1100, "sqdmulh", int_aarch64_neon_sqdmulh>; +defm SQRDMULH : SIMDIndexedHS<0, 0b1101, "sqrdmulh", int_aarch64_neon_sqrdmulh>; +defm MLA : SIMDVectorIndexedHSTied<1, 0b0000, "mla", + TriOpFrag<(add node:$LHS, (mul node:$MHS, node:$RHS))>>; +defm MLS : SIMDVectorIndexedHSTied<1, 0b0100, "mls", + TriOpFrag<(sub node:$LHS, (mul node:$MHS, node:$RHS))>>; +defm MUL : SIMDVectorIndexedHS<0, 0b1000, "mul", mul>; +defm SMLAL : SIMDVectorIndexedLongSDTied<0, 0b0010, "smlal", + TriOpFrag<(add node:$LHS, (int_aarch64_neon_smull node:$MHS, node:$RHS))>>; +defm SMLSL : SIMDVectorIndexedLongSDTied<0, 0b0110, "smlsl", + TriOpFrag<(sub node:$LHS, (int_aarch64_neon_smull node:$MHS, 
node:$RHS))>>; +defm SMULL : SIMDVectorIndexedLongSD<0, 0b1010, "smull", + int_aarch64_neon_smull>; +defm SQDMLAL : SIMDIndexedLongSQDMLXSDTied<0, 0b0011, "sqdmlal", + int_aarch64_neon_sqadd>; +defm SQDMLSL : SIMDIndexedLongSQDMLXSDTied<0, 0b0111, "sqdmlsl", + int_aarch64_neon_sqsub>; +defm SQDMULL : SIMDIndexedLongSD<0, 0b1011, "sqdmull", int_aarch64_neon_sqdmull>; +defm UMLAL : SIMDVectorIndexedLongSDTied<1, 0b0010, "umlal", + TriOpFrag<(add node:$LHS, (int_aarch64_neon_umull node:$MHS, node:$RHS))>>; +defm UMLSL : SIMDVectorIndexedLongSDTied<1, 0b0110, "umlsl", + TriOpFrag<(sub node:$LHS, (int_aarch64_neon_umull node:$MHS, node:$RHS))>>; +defm UMULL : SIMDVectorIndexedLongSD<1, 0b1010, "umull", + int_aarch64_neon_umull>; + +// A scalar sqdmull with the second operand being a vector lane can be +// handled directly with the indexed instruction encoding. +def : Pat<(int_aarch64_neon_sqdmulls_scalar (i32 FPR32:$Rn), + (vector_extract (v4i32 V128:$Vm), + VectorIndexS:$idx)), + (SQDMULLv1i64_indexed FPR32:$Rn, V128:$Vm, VectorIndexS:$idx)>; + +//---------------------------------------------------------------------------- +// AdvSIMD scalar shift instructions +//---------------------------------------------------------------------------- +defm FCVTZS : SIMDScalarRShiftSD<0, 0b11111, "fcvtzs">; +defm FCVTZU : SIMDScalarRShiftSD<1, 0b11111, "fcvtzu">; +defm SCVTF : SIMDScalarRShiftSD<0, 0b11100, "scvtf">; +defm UCVTF : SIMDScalarRShiftSD<1, 0b11100, "ucvtf">; +// Codegen patterns for the above. We don't put these directly on the +// instructions because TableGen's type inference can't handle the truth. +// Having the same base pattern for fp <--> int totally freaks it out. +def : Pat<(int_aarch64_neon_vcvtfp2fxs FPR32:$Rn, vecshiftR32:$imm), + (FCVTZSs FPR32:$Rn, vecshiftR32:$imm)>; +def : Pat<(int_aarch64_neon_vcvtfp2fxu FPR32:$Rn, vecshiftR32:$imm), + (FCVTZUs FPR32:$Rn, vecshiftR32:$imm)>; +def : Pat<(i64 (int_aarch64_neon_vcvtfp2fxs (f64 FPR64:$Rn), vecshiftR64:$imm)), + (FCVTZSd FPR64:$Rn, vecshiftR64:$imm)>; +def : Pat<(i64 (int_aarch64_neon_vcvtfp2fxu (f64 FPR64:$Rn), vecshiftR64:$imm)), + (FCVTZUd FPR64:$Rn, vecshiftR64:$imm)>; +def : Pat<(v1i64 (int_aarch64_neon_vcvtfp2fxs (v1f64 FPR64:$Rn), + vecshiftR64:$imm)), + (FCVTZSd FPR64:$Rn, vecshiftR64:$imm)>; +def : Pat<(v1i64 (int_aarch64_neon_vcvtfp2fxu (v1f64 FPR64:$Rn), + vecshiftR64:$imm)), + (FCVTZUd FPR64:$Rn, vecshiftR64:$imm)>; +def : Pat<(int_aarch64_neon_vcvtfxs2fp FPR32:$Rn, vecshiftR32:$imm), + (SCVTFs FPR32:$Rn, vecshiftR32:$imm)>; +def : Pat<(int_aarch64_neon_vcvtfxu2fp FPR32:$Rn, vecshiftR32:$imm), + (UCVTFs FPR32:$Rn, vecshiftR32:$imm)>; +def : Pat<(f64 (int_aarch64_neon_vcvtfxs2fp (i64 FPR64:$Rn), vecshiftR64:$imm)), + (SCVTFd FPR64:$Rn, vecshiftR64:$imm)>; +def : Pat<(f64 (int_aarch64_neon_vcvtfxu2fp (i64 FPR64:$Rn), vecshiftR64:$imm)), + (UCVTFd FPR64:$Rn, vecshiftR64:$imm)>; +def : Pat<(v1f64 (int_aarch64_neon_vcvtfxs2fp (v1i64 FPR64:$Rn), + vecshiftR64:$imm)), + (SCVTFd FPR64:$Rn, vecshiftR64:$imm)>; +def : Pat<(v1f64 (int_aarch64_neon_vcvtfxu2fp (v1i64 FPR64:$Rn), + vecshiftR64:$imm)), + (UCVTFd FPR64:$Rn, vecshiftR64:$imm)>; + +defm SHL : SIMDScalarLShiftD< 0, 0b01010, "shl", AArch64vshl>; +defm SLI : SIMDScalarLShiftDTied<1, 0b01010, "sli">; +defm SQRSHRN : SIMDScalarRShiftBHS< 0, 0b10011, "sqrshrn", + int_aarch64_neon_sqrshrn>; +defm SQRSHRUN : SIMDScalarRShiftBHS< 1, 0b10001, "sqrshrun", + int_aarch64_neon_sqrshrun>; +defm SQSHLU : SIMDScalarLShiftBHSD<1, 0b01100, "sqshlu", AArch64sqshlui>; +defm SQSHL : 
SIMDScalarLShiftBHSD<0, 0b01110, "sqshl", AArch64sqshli>; +defm SQSHRN : SIMDScalarRShiftBHS< 0, 0b10010, "sqshrn", + int_aarch64_neon_sqshrn>; +defm SQSHRUN : SIMDScalarRShiftBHS< 1, 0b10000, "sqshrun", + int_aarch64_neon_sqshrun>; +defm SRI : SIMDScalarRShiftDTied< 1, 0b01000, "sri">; +defm SRSHR : SIMDScalarRShiftD< 0, 0b00100, "srshr", AArch64srshri>; +defm SRSRA : SIMDScalarRShiftDTied< 0, 0b00110, "srsra", + TriOpFrag<(add node:$LHS, + (AArch64srshri node:$MHS, node:$RHS))>>; +defm SSHR : SIMDScalarRShiftD< 0, 0b00000, "sshr", AArch64vashr>; +defm SSRA : SIMDScalarRShiftDTied< 0, 0b00010, "ssra", + TriOpFrag<(add node:$LHS, + (AArch64vashr node:$MHS, node:$RHS))>>; +defm UQRSHRN : SIMDScalarRShiftBHS< 1, 0b10011, "uqrshrn", + int_aarch64_neon_uqrshrn>; +defm UQSHL : SIMDScalarLShiftBHSD<1, 0b01110, "uqshl", AArch64uqshli>; +defm UQSHRN : SIMDScalarRShiftBHS< 1, 0b10010, "uqshrn", + int_aarch64_neon_uqshrn>; +defm URSHR : SIMDScalarRShiftD< 1, 0b00100, "urshr", AArch64urshri>; +defm URSRA : SIMDScalarRShiftDTied< 1, 0b00110, "ursra", + TriOpFrag<(add node:$LHS, + (AArch64urshri node:$MHS, node:$RHS))>>; +defm USHR : SIMDScalarRShiftD< 1, 0b00000, "ushr", AArch64vlshr>; +defm USRA : SIMDScalarRShiftDTied< 1, 0b00010, "usra", + TriOpFrag<(add node:$LHS, + (AArch64vlshr node:$MHS, node:$RHS))>>; + +//---------------------------------------------------------------------------- +// AdvSIMD vector shift instructions +//---------------------------------------------------------------------------- +defm FCVTZS:SIMDVectorRShiftSD<0, 0b11111, "fcvtzs", int_aarch64_neon_vcvtfp2fxs>; +defm FCVTZU:SIMDVectorRShiftSD<1, 0b11111, "fcvtzu", int_aarch64_neon_vcvtfp2fxu>; +defm SCVTF: SIMDVectorRShiftSDToFP<0, 0b11100, "scvtf", + int_aarch64_neon_vcvtfxs2fp>; +defm RSHRN : SIMDVectorRShiftNarrowBHS<0, 0b10001, "rshrn", + int_aarch64_neon_rshrn>; +defm SHL : SIMDVectorLShiftBHSD<0, 0b01010, "shl", AArch64vshl>; +defm SHRN : SIMDVectorRShiftNarrowBHS<0, 0b10000, "shrn", + BinOpFrag<(trunc (AArch64vashr node:$LHS, node:$RHS))>>; +defm SLI : SIMDVectorLShiftBHSDTied<1, 0b01010, "sli", int_aarch64_neon_vsli>; +def : Pat<(v1i64 (int_aarch64_neon_vsli (v1i64 FPR64:$Rd), (v1i64 FPR64:$Rn), + (i32 vecshiftL64:$imm))), + (SLId FPR64:$Rd, FPR64:$Rn, vecshiftL64:$imm)>; +defm SQRSHRN : SIMDVectorRShiftNarrowBHS<0, 0b10011, "sqrshrn", + int_aarch64_neon_sqrshrn>; +defm SQRSHRUN: SIMDVectorRShiftNarrowBHS<1, 0b10001, "sqrshrun", + int_aarch64_neon_sqrshrun>; +defm SQSHLU : SIMDVectorLShiftBHSD<1, 0b01100, "sqshlu", AArch64sqshlui>; +defm SQSHL : SIMDVectorLShiftBHSD<0, 0b01110, "sqshl", AArch64sqshli>; +defm SQSHRN : SIMDVectorRShiftNarrowBHS<0, 0b10010, "sqshrn", + int_aarch64_neon_sqshrn>; +defm SQSHRUN : SIMDVectorRShiftNarrowBHS<1, 0b10000, "sqshrun", + int_aarch64_neon_sqshrun>; +defm SRI : SIMDVectorRShiftBHSDTied<1, 0b01000, "sri", int_aarch64_neon_vsri>; +def : Pat<(v1i64 (int_aarch64_neon_vsri (v1i64 FPR64:$Rd), (v1i64 FPR64:$Rn), + (i32 vecshiftR64:$imm))), + (SRId FPR64:$Rd, FPR64:$Rn, vecshiftR64:$imm)>; +defm SRSHR : SIMDVectorRShiftBHSD<0, 0b00100, "srshr", AArch64srshri>; +defm SRSRA : SIMDVectorRShiftBHSDTied<0, 0b00110, "srsra", + TriOpFrag<(add node:$LHS, + (AArch64srshri node:$MHS, node:$RHS))> >; +defm SSHLL : SIMDVectorLShiftLongBHSD<0, 0b10100, "sshll", + BinOpFrag<(AArch64vshl (sext node:$LHS), node:$RHS)>>; + +defm SSHR : SIMDVectorRShiftBHSD<0, 0b00000, "sshr", AArch64vashr>; +defm SSRA : SIMDVectorRShiftBHSDTied<0, 0b00010, "ssra", + TriOpFrag<(add node:$LHS, (AArch64vashr node:$MHS, 
node:$RHS))>>; +defm UCVTF : SIMDVectorRShiftSDToFP<1, 0b11100, "ucvtf", + int_aarch64_neon_vcvtfxu2fp>; +defm UQRSHRN : SIMDVectorRShiftNarrowBHS<1, 0b10011, "uqrshrn", + int_aarch64_neon_uqrshrn>; +defm UQSHL : SIMDVectorLShiftBHSD<1, 0b01110, "uqshl", AArch64uqshli>; +defm UQSHRN : SIMDVectorRShiftNarrowBHS<1, 0b10010, "uqshrn", + int_aarch64_neon_uqshrn>; +defm URSHR : SIMDVectorRShiftBHSD<1, 0b00100, "urshr", AArch64urshri>; +defm URSRA : SIMDVectorRShiftBHSDTied<1, 0b00110, "ursra", + TriOpFrag<(add node:$LHS, + (AArch64urshri node:$MHS, node:$RHS))> >; +defm USHLL : SIMDVectorLShiftLongBHSD<1, 0b10100, "ushll", + BinOpFrag<(AArch64vshl (zext node:$LHS), node:$RHS)>>; +defm USHR : SIMDVectorRShiftBHSD<1, 0b00000, "ushr", AArch64vlshr>; +defm USRA : SIMDVectorRShiftBHSDTied<1, 0b00010, "usra", + TriOpFrag<(add node:$LHS, (AArch64vlshr node:$MHS, node:$RHS))> >; + +// SHRN patterns for when a logical right shift was used instead of arithmetic +// (the immediate guarantees no sign bits actually end up in the result so it +// doesn't matter). +def : Pat<(v8i8 (trunc (AArch64vlshr (v8i16 V128:$Rn), vecshiftR16Narrow:$imm))), + (SHRNv8i8_shift V128:$Rn, vecshiftR16Narrow:$imm)>; +def : Pat<(v4i16 (trunc (AArch64vlshr (v4i32 V128:$Rn), vecshiftR32Narrow:$imm))), + (SHRNv4i16_shift V128:$Rn, vecshiftR32Narrow:$imm)>; +def : Pat<(v2i32 (trunc (AArch64vlshr (v2i64 V128:$Rn), vecshiftR64Narrow:$imm))), + (SHRNv2i32_shift V128:$Rn, vecshiftR64Narrow:$imm)>; + +def : Pat<(v16i8 (concat_vectors (v8i8 V64:$Rd), + (trunc (AArch64vlshr (v8i16 V128:$Rn), + vecshiftR16Narrow:$imm)))), + (SHRNv16i8_shift (INSERT_SUBREG (IMPLICIT_DEF), V64:$Rd, dsub), + V128:$Rn, vecshiftR16Narrow:$imm)>; +def : Pat<(v8i16 (concat_vectors (v4i16 V64:$Rd), + (trunc (AArch64vlshr (v4i32 V128:$Rn), + vecshiftR32Narrow:$imm)))), + (SHRNv8i16_shift (INSERT_SUBREG (IMPLICIT_DEF), V64:$Rd, dsub), + V128:$Rn, vecshiftR32Narrow:$imm)>; +def : Pat<(v4i32 (concat_vectors (v2i32 V64:$Rd), + (trunc (AArch64vlshr (v2i64 V128:$Rn), + vecshiftR64Narrow:$imm)))), + (SHRNv4i32_shift (INSERT_SUBREG (IMPLICIT_DEF), V64:$Rd, dsub), + V128:$Rn, vecshiftR32Narrow:$imm)>; + +// Vector sign and zero extensions are implemented with SSHLL and USSHLL. +// Anyexts are implemented as zexts. +def : Pat<(v8i16 (sext (v8i8 V64:$Rn))), (SSHLLv8i8_shift V64:$Rn, (i32 0))>; +def : Pat<(v8i16 (zext (v8i8 V64:$Rn))), (USHLLv8i8_shift V64:$Rn, (i32 0))>; +def : Pat<(v8i16 (anyext (v8i8 V64:$Rn))), (USHLLv8i8_shift V64:$Rn, (i32 0))>; +def : Pat<(v4i32 (sext (v4i16 V64:$Rn))), (SSHLLv4i16_shift V64:$Rn, (i32 0))>; +def : Pat<(v4i32 (zext (v4i16 V64:$Rn))), (USHLLv4i16_shift V64:$Rn, (i32 0))>; +def : Pat<(v4i32 (anyext (v4i16 V64:$Rn))), (USHLLv4i16_shift V64:$Rn, (i32 0))>; +def : Pat<(v2i64 (sext (v2i32 V64:$Rn))), (SSHLLv2i32_shift V64:$Rn, (i32 0))>; +def : Pat<(v2i64 (zext (v2i32 V64:$Rn))), (USHLLv2i32_shift V64:$Rn, (i32 0))>; +def : Pat<(v2i64 (anyext (v2i32 V64:$Rn))), (USHLLv2i32_shift V64:$Rn, (i32 0))>; +// Also match an extend from the upper half of a 128 bit source register. 
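For reference, a hedged C++ sketch of the widening conversions these extension patterns cover (the upper-half variants mentioned in the last comment are matched just below); function names are invented and the instructions are expectations only:

    #include <arm_neon.h>

    // sext v8i8 -> v8i16: expected "sshll v0.8h, v0.8b, #0" (alias sxtl).
    int16x8_t widen_lo(int8x8_t v) { return vmovl_s8(v); }

    // Same, but from the upper half of a 128-bit source register:
    // expected "sshll2 v0.4s, v0.8h, #0" (alias sxtl2).
    int32x4_t widen_hi(int16x8_t v) { return vmovl_high_s16(v); }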
+def : Pat<(v8i16 (anyext (v8i8 (extract_subvector V128:$Rn, (i64 8)) ))), + (USHLLv16i8_shift V128:$Rn, (i32 0))>; +def : Pat<(v8i16 (zext (v8i8 (extract_subvector V128:$Rn, (i64 8)) ))), + (USHLLv16i8_shift V128:$Rn, (i32 0))>; +def : Pat<(v8i16 (sext (v8i8 (extract_subvector V128:$Rn, (i64 8)) ))), + (SSHLLv16i8_shift V128:$Rn, (i32 0))>; +def : Pat<(v4i32 (anyext (v4i16 (extract_subvector V128:$Rn, (i64 4)) ))), + (USHLLv8i16_shift V128:$Rn, (i32 0))>; +def : Pat<(v4i32 (zext (v4i16 (extract_subvector V128:$Rn, (i64 4)) ))), + (USHLLv8i16_shift V128:$Rn, (i32 0))>; +def : Pat<(v4i32 (sext (v4i16 (extract_subvector V128:$Rn, (i64 4)) ))), + (SSHLLv8i16_shift V128:$Rn, (i32 0))>; +def : Pat<(v2i64 (anyext (v2i32 (extract_subvector V128:$Rn, (i64 2)) ))), + (USHLLv4i32_shift V128:$Rn, (i32 0))>; +def : Pat<(v2i64 (zext (v2i32 (extract_subvector V128:$Rn, (i64 2)) ))), + (USHLLv4i32_shift V128:$Rn, (i32 0))>; +def : Pat<(v2i64 (sext (v2i32 (extract_subvector V128:$Rn, (i64 2)) ))), + (SSHLLv4i32_shift V128:$Rn, (i32 0))>; + +// Vector shift sxtl aliases +def : InstAlias<"sxtl.8h $dst, $src1", + (SSHLLv8i8_shift V128:$dst, V64:$src1, 0)>; +def : InstAlias<"sxtl $dst.8h, $src1.8b", + (SSHLLv8i8_shift V128:$dst, V64:$src1, 0)>; +def : InstAlias<"sxtl.4s $dst, $src1", + (SSHLLv4i16_shift V128:$dst, V64:$src1, 0)>; +def : InstAlias<"sxtl $dst.4s, $src1.4h", + (SSHLLv4i16_shift V128:$dst, V64:$src1, 0)>; +def : InstAlias<"sxtl.2d $dst, $src1", + (SSHLLv2i32_shift V128:$dst, V64:$src1, 0)>; +def : InstAlias<"sxtl $dst.2d, $src1.2s", + (SSHLLv2i32_shift V128:$dst, V64:$src1, 0)>; + +// Vector shift sxtl2 aliases +def : InstAlias<"sxtl2.8h $dst, $src1", + (SSHLLv16i8_shift V128:$dst, V128:$src1, 0)>; +def : InstAlias<"sxtl2 $dst.8h, $src1.16b", + (SSHLLv16i8_shift V128:$dst, V128:$src1, 0)>; +def : InstAlias<"sxtl2.4s $dst, $src1", + (SSHLLv8i16_shift V128:$dst, V128:$src1, 0)>; +def : InstAlias<"sxtl2 $dst.4s, $src1.8h", + (SSHLLv8i16_shift V128:$dst, V128:$src1, 0)>; +def : InstAlias<"sxtl2.2d $dst, $src1", + (SSHLLv4i32_shift V128:$dst, V128:$src1, 0)>; +def : InstAlias<"sxtl2 $dst.2d, $src1.4s", + (SSHLLv4i32_shift V128:$dst, V128:$src1, 0)>; + +// Vector shift uxtl aliases +def : InstAlias<"uxtl.8h $dst, $src1", + (USHLLv8i8_shift V128:$dst, V64:$src1, 0)>; +def : InstAlias<"uxtl $dst.8h, $src1.8b", + (USHLLv8i8_shift V128:$dst, V64:$src1, 0)>; +def : InstAlias<"uxtl.4s $dst, $src1", + (USHLLv4i16_shift V128:$dst, V64:$src1, 0)>; +def : InstAlias<"uxtl $dst.4s, $src1.4h", + (USHLLv4i16_shift V128:$dst, V64:$src1, 0)>; +def : InstAlias<"uxtl.2d $dst, $src1", + (USHLLv2i32_shift V128:$dst, V64:$src1, 0)>; +def : InstAlias<"uxtl $dst.2d, $src1.2s", + (USHLLv2i32_shift V128:$dst, V64:$src1, 0)>; + +// Vector shift uxtl2 aliases +def : InstAlias<"uxtl2.8h $dst, $src1", + (USHLLv16i8_shift V128:$dst, V128:$src1, 0)>; +def : InstAlias<"uxtl2 $dst.8h, $src1.16b", + (USHLLv16i8_shift V128:$dst, V128:$src1, 0)>; +def : InstAlias<"uxtl2.4s $dst, $src1", + (USHLLv8i16_shift V128:$dst, V128:$src1, 0)>; +def : InstAlias<"uxtl2 $dst.4s, $src1.8h", + (USHLLv8i16_shift V128:$dst, V128:$src1, 0)>; +def : InstAlias<"uxtl2.2d $dst, $src1", + (USHLLv4i32_shift V128:$dst, V128:$src1, 0)>; +def : InstAlias<"uxtl2 $dst.2d, $src1.4s", + (USHLLv4i32_shift V128:$dst, V128:$src1, 0)>; + +// If an integer is about to be converted to a floating point value, +// just load it on the floating point unit. +// These patterns are more complex because floating point loads do not +// support sign extension. 
+// The sign extension has to be explicitly added and is only supported for
+// one step: byte-to-half, half-to-word, word-to-doubleword.
+// SCVTF GPR -> FPR is 9 cycles.
+// SCVTF FPR -> FPR is 4 cycles.
+// (sign extension with lengthen) SXTL FPR -> FPR is 2 cycles.
+// Therefore, we can do 2 sign extensions and one SCVTF FPR -> FPR
+// and still be faster.
+// However, this is not good for code size.
+// 8-bits -> float. 2 sizes step-up.
+class SExtLoadi8CVTf32Pat<dag addrmode, dag INST>
+  : Pat<(f32 (sint_to_fp (i32 (sextloadi8 addrmode)))),
+        (SCVTFv1i32 (f32 (EXTRACT_SUBREG
+                            (SSHLLv4i16_shift
+                              (f64
+                                (EXTRACT_SUBREG
+                                  (SSHLLv8i8_shift
+                                    (INSERT_SUBREG (f64 (IMPLICIT_DEF)),
+                                        INST,
+                                        bsub),
+                                    0),
+                                  dsub)),
+                               0),
+                             ssub)))>, Requires<[NotForCodeSize]>;
+
+def : SExtLoadi8CVTf32Pat<(ro8.Wpat GPR64sp:$Rn, GPR32:$Rm, ro8.Wext:$ext),
+                          (LDRBroW GPR64sp:$Rn, GPR32:$Rm, ro8.Wext:$ext)>;
+def : SExtLoadi8CVTf32Pat<(ro8.Xpat GPR64sp:$Rn, GPR64:$Rm, ro8.Xext:$ext),
+                          (LDRBroX GPR64sp:$Rn, GPR64:$Rm, ro8.Xext:$ext)>;
+def : SExtLoadi8CVTf32Pat<(am_indexed8 GPR64sp:$Rn, uimm12s1:$offset),
+                          (LDRBui GPR64sp:$Rn, uimm12s1:$offset)>;
+def : SExtLoadi8CVTf32Pat<(am_unscaled8 GPR64sp:$Rn, simm9:$offset),
+                          (LDURBi GPR64sp:$Rn, simm9:$offset)>;
+
+// 16-bits -> float. 1 size step-up.
+class SExtLoadi16CVTf32Pat<dag addrmode, dag INST>
+  : Pat<(f32 (sint_to_fp (i32 (sextloadi16 addrmode)))),
+        (SCVTFv1i32 (f32 (EXTRACT_SUBREG
+                            (SSHLLv4i16_shift
+                                (INSERT_SUBREG (f64 (IMPLICIT_DEF)),
+                                  INST,
+                                  hsub),
+                                0),
+                            ssub)))>, Requires<[NotForCodeSize]>;
+
+def : SExtLoadi16CVTf32Pat<(ro16.Wpat GPR64sp:$Rn, GPR32:$Rm, ro16.Wext:$ext),
+                           (LDRHroW GPR64sp:$Rn, GPR32:$Rm, ro16.Wext:$ext)>;
+def : SExtLoadi16CVTf32Pat<(ro16.Xpat GPR64sp:$Rn, GPR64:$Rm, ro16.Xext:$ext),
+                           (LDRHroX GPR64sp:$Rn, GPR64:$Rm, ro16.Xext:$ext)>;
+def : SExtLoadi16CVTf32Pat<(am_indexed16 GPR64sp:$Rn, uimm12s2:$offset),
+                           (LDRHui GPR64sp:$Rn, uimm12s2:$offset)>;
+def : SExtLoadi16CVTf32Pat<(am_unscaled16 GPR64sp:$Rn, simm9:$offset),
+                           (LDURHi GPR64sp:$Rn, simm9:$offset)>;
+
+// 32-bits to 32-bits are handled in target specific dag combine:
+// performIntToFpCombine.
+// 64-bits integer to 32-bits floating point, not possible with
+// SCVTF on floating point registers (both source and destination
+// must have the same size).
+
+// Here are the patterns for 8, 16, 32, and 64-bits to double.
+// 8-bits -> double. 3 size step-up: give up.
+// 16-bits -> double. 2 size step.
+class SExtLoadi16CVTf64Pat<dag addrmode, dag INST>
+  : Pat <(f64 (sint_to_fp (i32 (sextloadi16 addrmode)))),
+         (SCVTFv1i64 (f64 (EXTRACT_SUBREG
+                             (SSHLLv2i32_shift
+                                (f64
+                                  (EXTRACT_SUBREG
+                                    (SSHLLv4i16_shift
+                                      (INSERT_SUBREG (f64 (IMPLICIT_DEF)),
+                                        INST,
+                                        hsub),
+                                      0),
+                                    dsub)),
+                               0),
+                             dsub)))>, Requires<[NotForCodeSize]>;
+
+def : SExtLoadi16CVTf64Pat<(ro16.Wpat GPR64sp:$Rn, GPR32:$Rm, ro16.Wext:$ext),
+                           (LDRHroW GPR64sp:$Rn, GPR32:$Rm, ro16.Wext:$ext)>;
+def : SExtLoadi16CVTf64Pat<(ro16.Xpat GPR64sp:$Rn, GPR64:$Rm, ro16.Xext:$ext),
+                           (LDRHroX GPR64sp:$Rn, GPR64:$Rm, ro16.Xext:$ext)>;
+def : SExtLoadi16CVTf64Pat<(am_indexed16 GPR64sp:$Rn, uimm12s2:$offset),
+                           (LDRHui GPR64sp:$Rn, uimm12s2:$offset)>;
+def : SExtLoadi16CVTf64Pat<(am_unscaled16 GPR64sp:$Rn, simm9:$offset),
+                           (LDURHi GPR64sp:$Rn, simm9:$offset)>;
+// 32-bits -> double. 1 size step-up.
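A hedged C++ sketch of the loads these patterns target (the 32-bit-to-double class follows below); the instruction sequences are expectations when not optimising for size, not guarantees, and the function names are invented:

    #include <cstdint>

    // 8-bit -> float: expected to stay on the FP/SIMD side, roughly
    // ldr b0; sshll; sshll; scvtf - avoiding a GPR->FPR transfer.
    float  f32_from_i8(const int8_t *p)   { return static_cast<float>(*p); }
    // 16-bit -> float: one widening step, then scvtf.
    float  f32_from_i16(const int16_t *p) { return static_cast<float>(*p); }
    // 16-bit -> double: two widening steps, then scvtf.
    double f64_from_i16(const int16_t *p) { return static_cast<double>(*p); }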
+class SExtLoadi32CVTf64Pat + : Pat <(f64 (sint_to_fp (i32 (load addrmode)))), + (SCVTFv1i64 (f64 (EXTRACT_SUBREG + (SSHLLv2i32_shift + (INSERT_SUBREG (f64 (IMPLICIT_DEF)), + INST, + ssub), + 0), + dsub)))>, Requires<[NotForCodeSize]>; + +def : SExtLoadi32CVTf64Pat<(ro32.Wpat GPR64sp:$Rn, GPR32:$Rm, ro32.Wext:$ext), + (LDRSroW GPR64sp:$Rn, GPR32:$Rm, ro32.Wext:$ext)>; +def : SExtLoadi32CVTf64Pat<(ro32.Xpat GPR64sp:$Rn, GPR64:$Rm, ro32.Xext:$ext), + (LDRSroX GPR64sp:$Rn, GPR64:$Rm, ro32.Xext:$ext)>; +def : SExtLoadi32CVTf64Pat<(am_indexed32 GPR64sp:$Rn, uimm12s4:$offset), + (LDRSui GPR64sp:$Rn, uimm12s4:$offset)>; +def : SExtLoadi32CVTf64Pat<(am_unscaled32 GPR64sp:$Rn, simm9:$offset), + (LDURSi GPR64sp:$Rn, simm9:$offset)>; + +// 64-bits -> double are handled in target specific dag combine: +// performIntToFpCombine. + + +//---------------------------------------------------------------------------- +// AdvSIMD Load-Store Structure +//---------------------------------------------------------------------------- +defm LD1 : SIMDLd1Multiple<"ld1">; +defm LD2 : SIMDLd2Multiple<"ld2">; +defm LD3 : SIMDLd3Multiple<"ld3">; +defm LD4 : SIMDLd4Multiple<"ld4">; + +defm ST1 : SIMDSt1Multiple<"st1">; +defm ST2 : SIMDSt2Multiple<"st2">; +defm ST3 : SIMDSt3Multiple<"st3">; +defm ST4 : SIMDSt4Multiple<"st4">; + +class Ld1Pat + : Pat<(ty (load GPR64sp:$Rn)), (INST GPR64sp:$Rn)>; + +def : Ld1Pat; +def : Ld1Pat; +def : Ld1Pat; +def : Ld1Pat; +def : Ld1Pat; +def : Ld1Pat; +def : Ld1Pat; +def : Ld1Pat; + +class St1Pat + : Pat<(store ty:$Vt, GPR64sp:$Rn), + (INST ty:$Vt, GPR64sp:$Rn)>; + +def : St1Pat; +def : St1Pat; +def : St1Pat; +def : St1Pat; +def : St1Pat; +def : St1Pat; +def : St1Pat; +def : St1Pat; + +//--- +// Single-element +//--- + +defm LD1R : SIMDLdR<0, 0b110, 0, "ld1r", "One", 1, 2, 4, 8>; +defm LD2R : SIMDLdR<1, 0b110, 0, "ld2r", "Two", 2, 4, 8, 16>; +defm LD3R : SIMDLdR<0, 0b111, 0, "ld3r", "Three", 3, 6, 12, 24>; +defm LD4R : SIMDLdR<1, 0b111, 0, "ld4r", "Four", 4, 8, 16, 32>; +let mayLoad = 1, neverHasSideEffects = 1 in { +defm LD1 : SIMDLdSingleBTied<0, 0b000, "ld1", VecListOneb, GPR64pi1>; +defm LD1 : SIMDLdSingleHTied<0, 0b010, 0, "ld1", VecListOneh, GPR64pi2>; +defm LD1 : SIMDLdSingleSTied<0, 0b100, 0b00, "ld1", VecListOnes, GPR64pi4>; +defm LD1 : SIMDLdSingleDTied<0, 0b100, 0b01, "ld1", VecListOned, GPR64pi8>; +defm LD2 : SIMDLdSingleBTied<1, 0b000, "ld2", VecListTwob, GPR64pi2>; +defm LD2 : SIMDLdSingleHTied<1, 0b010, 0, "ld2", VecListTwoh, GPR64pi4>; +defm LD2 : SIMDLdSingleSTied<1, 0b100, 0b00, "ld2", VecListTwos, GPR64pi8>; +defm LD2 : SIMDLdSingleDTied<1, 0b100, 0b01, "ld2", VecListTwod, GPR64pi16>; +defm LD3 : SIMDLdSingleBTied<0, 0b001, "ld3", VecListThreeb, GPR64pi3>; +defm LD3 : SIMDLdSingleHTied<0, 0b011, 0, "ld3", VecListThreeh, GPR64pi6>; +defm LD3 : SIMDLdSingleSTied<0, 0b101, 0b00, "ld3", VecListThrees, GPR64pi12>; +defm LD3 : SIMDLdSingleDTied<0, 0b101, 0b01, "ld3", VecListThreed, GPR64pi24>; +defm LD4 : SIMDLdSingleBTied<1, 0b001, "ld4", VecListFourb, GPR64pi4>; +defm LD4 : SIMDLdSingleHTied<1, 0b011, 0, "ld4", VecListFourh, GPR64pi8>; +defm LD4 : SIMDLdSingleSTied<1, 0b101, 0b00, "ld4", VecListFours, GPR64pi16>; +defm LD4 : SIMDLdSingleDTied<1, 0b101, 0b01, "ld4", VecListFourd, GPR64pi32>; +} + +def : Pat<(v8i8 (AArch64dup (i32 (extloadi8 GPR64sp:$Rn)))), + (LD1Rv8b GPR64sp:$Rn)>; +def : Pat<(v16i8 (AArch64dup (i32 (extloadi8 GPR64sp:$Rn)))), + (LD1Rv16b GPR64sp:$Rn)>; +def : Pat<(v4i16 (AArch64dup (i32 (extloadi16 GPR64sp:$Rn)))), + (LD1Rv4h GPR64sp:$Rn)>; +def : 
Pat<(v8i16 (AArch64dup (i32 (extloadi16 GPR64sp:$Rn)))), + (LD1Rv8h GPR64sp:$Rn)>; +def : Pat<(v2i32 (AArch64dup (i32 (load GPR64sp:$Rn)))), + (LD1Rv2s GPR64sp:$Rn)>; +def : Pat<(v4i32 (AArch64dup (i32 (load GPR64sp:$Rn)))), + (LD1Rv4s GPR64sp:$Rn)>; +def : Pat<(v2i64 (AArch64dup (i64 (load GPR64sp:$Rn)))), + (LD1Rv2d GPR64sp:$Rn)>; +def : Pat<(v1i64 (AArch64dup (i64 (load GPR64sp:$Rn)))), + (LD1Rv1d GPR64sp:$Rn)>; +// Grab the floating point version too +def : Pat<(v2f32 (AArch64dup (f32 (load GPR64sp:$Rn)))), + (LD1Rv2s GPR64sp:$Rn)>; +def : Pat<(v4f32 (AArch64dup (f32 (load GPR64sp:$Rn)))), + (LD1Rv4s GPR64sp:$Rn)>; +def : Pat<(v2f64 (AArch64dup (f64 (load GPR64sp:$Rn)))), + (LD1Rv2d GPR64sp:$Rn)>; +def : Pat<(v1f64 (AArch64dup (f64 (load GPR64sp:$Rn)))), + (LD1Rv1d GPR64sp:$Rn)>; + +class Ld1Lane128Pat + : Pat<(vector_insert (VTy VecListOne128:$Rd), + (STy (scalar_load GPR64sp:$Rn)), VecIndex:$idx), + (LD1 VecListOne128:$Rd, VecIndex:$idx, GPR64sp:$Rn)>; + +def : Ld1Lane128Pat; +def : Ld1Lane128Pat; +def : Ld1Lane128Pat; +def : Ld1Lane128Pat; +def : Ld1Lane128Pat; +def : Ld1Lane128Pat; + +class Ld1Lane64Pat + : Pat<(vector_insert (VTy VecListOne64:$Rd), + (STy (scalar_load GPR64sp:$Rn)), VecIndex:$idx), + (EXTRACT_SUBREG + (LD1 (SUBREG_TO_REG (i32 0), VecListOne64:$Rd, dsub), + VecIndex:$idx, GPR64sp:$Rn), + dsub)>; + +def : Ld1Lane64Pat; +def : Ld1Lane64Pat; +def : Ld1Lane64Pat; +def : Ld1Lane64Pat; + + +defm LD1 : SIMDLdSt1SingleAliases<"ld1">; +defm LD2 : SIMDLdSt2SingleAliases<"ld2">; +defm LD3 : SIMDLdSt3SingleAliases<"ld3">; +defm LD4 : SIMDLdSt4SingleAliases<"ld4">; + +// Stores +defm ST1 : SIMDStSingleB<0, 0b000, "st1", VecListOneb, GPR64pi1>; +defm ST1 : SIMDStSingleH<0, 0b010, 0, "st1", VecListOneh, GPR64pi2>; +defm ST1 : SIMDStSingleS<0, 0b100, 0b00, "st1", VecListOnes, GPR64pi4>; +defm ST1 : SIMDStSingleD<0, 0b100, 0b01, "st1", VecListOned, GPR64pi8>; + +let AddedComplexity = 15 in +class St1Lane128Pat + : Pat<(scalar_store + (STy (vector_extract (VTy VecListOne128:$Vt), VecIndex:$idx)), + GPR64sp:$Rn), + (ST1 VecListOne128:$Vt, VecIndex:$idx, GPR64sp:$Rn)>; + +def : St1Lane128Pat; +def : St1Lane128Pat; +def : St1Lane128Pat; +def : St1Lane128Pat; +def : St1Lane128Pat; +def : St1Lane128Pat; + +let AddedComplexity = 15 in +class St1Lane64Pat + : Pat<(scalar_store + (STy (vector_extract (VTy VecListOne64:$Vt), VecIndex:$idx)), + GPR64sp:$Rn), + (ST1 (SUBREG_TO_REG (i32 0), VecListOne64:$Vt, dsub), + VecIndex:$idx, GPR64sp:$Rn)>; + +def : St1Lane64Pat; +def : St1Lane64Pat; +def : St1Lane64Pat; +def : St1Lane64Pat; + +multiclass St1LanePost64Pat { + def : Pat<(scalar_store + (STy (vector_extract (VTy VecListOne64:$Vt), VecIndex:$idx)), + GPR64sp:$Rn, offset), + (ST1 (SUBREG_TO_REG (i32 0), VecListOne64:$Vt, dsub), + VecIndex:$idx, GPR64sp:$Rn, XZR)>; + + def : Pat<(scalar_store + (STy (vector_extract (VTy VecListOne64:$Vt), VecIndex:$idx)), + GPR64sp:$Rn, GPR64:$Rm), + (ST1 (SUBREG_TO_REG (i32 0), VecListOne64:$Vt, dsub), + VecIndex:$idx, GPR64sp:$Rn, $Rm)>; +} + +defm : St1LanePost64Pat; +defm : St1LanePost64Pat; +defm : St1LanePost64Pat; +defm : St1LanePost64Pat; +defm : St1LanePost64Pat; +defm : St1LanePost64Pat; + +multiclass St1LanePost128Pat { + def : Pat<(scalar_store + (STy (vector_extract (VTy VecListOne128:$Vt), VecIndex:$idx)), + GPR64sp:$Rn, offset), + (ST1 VecListOne128:$Vt, VecIndex:$idx, GPR64sp:$Rn, XZR)>; + + def : Pat<(scalar_store + (STy (vector_extract (VTy VecListOne128:$Vt), VecIndex:$idx)), + GPR64sp:$Rn, GPR64:$Rm), + (ST1 VecListOne128:$Vt, 
VecIndex:$idx, GPR64sp:$Rn, $Rm)>; +} + +defm : St1LanePost128Pat; +defm : St1LanePost128Pat; +defm : St1LanePost128Pat; +defm : St1LanePost128Pat; +defm : St1LanePost128Pat; +defm : St1LanePost128Pat; + +let mayStore = 1, neverHasSideEffects = 1 in { +defm ST2 : SIMDStSingleB<1, 0b000, "st2", VecListTwob, GPR64pi2>; +defm ST2 : SIMDStSingleH<1, 0b010, 0, "st2", VecListTwoh, GPR64pi4>; +defm ST2 : SIMDStSingleS<1, 0b100, 0b00, "st2", VecListTwos, GPR64pi8>; +defm ST2 : SIMDStSingleD<1, 0b100, 0b01, "st2", VecListTwod, GPR64pi16>; +defm ST3 : SIMDStSingleB<0, 0b001, "st3", VecListThreeb, GPR64pi3>; +defm ST3 : SIMDStSingleH<0, 0b011, 0, "st3", VecListThreeh, GPR64pi6>; +defm ST3 : SIMDStSingleS<0, 0b101, 0b00, "st3", VecListThrees, GPR64pi12>; +defm ST3 : SIMDStSingleD<0, 0b101, 0b01, "st3", VecListThreed, GPR64pi24>; +defm ST4 : SIMDStSingleB<1, 0b001, "st4", VecListFourb, GPR64pi4>; +defm ST4 : SIMDStSingleH<1, 0b011, 0, "st4", VecListFourh, GPR64pi8>; +defm ST4 : SIMDStSingleS<1, 0b101, 0b00, "st4", VecListFours, GPR64pi16>; +defm ST4 : SIMDStSingleD<1, 0b101, 0b01, "st4", VecListFourd, GPR64pi32>; +} + +defm ST1 : SIMDLdSt1SingleAliases<"st1">; +defm ST2 : SIMDLdSt2SingleAliases<"st2">; +defm ST3 : SIMDLdSt3SingleAliases<"st3">; +defm ST4 : SIMDLdSt4SingleAliases<"st4">; + +//---------------------------------------------------------------------------- +// Crypto extensions +//---------------------------------------------------------------------------- + +def AESErr : AESTiedInst<0b0100, "aese", int_aarch64_crypto_aese>; +def AESDrr : AESTiedInst<0b0101, "aesd", int_aarch64_crypto_aesd>; +def AESMCrr : AESInst< 0b0110, "aesmc", int_aarch64_crypto_aesmc>; +def AESIMCrr : AESInst< 0b0111, "aesimc", int_aarch64_crypto_aesimc>; + +def SHA1Crrr : SHATiedInstQSV<0b000, "sha1c", int_aarch64_crypto_sha1c>; +def SHA1Prrr : SHATiedInstQSV<0b001, "sha1p", int_aarch64_crypto_sha1p>; +def SHA1Mrrr : SHATiedInstQSV<0b010, "sha1m", int_aarch64_crypto_sha1m>; +def SHA1SU0rrr : SHATiedInstVVV<0b011, "sha1su0", int_aarch64_crypto_sha1su0>; +def SHA256Hrrr : SHATiedInstQQV<0b100, "sha256h", int_aarch64_crypto_sha256h>; +def SHA256H2rrr : SHATiedInstQQV<0b101, "sha256h2",int_aarch64_crypto_sha256h2>; +def SHA256SU1rrr :SHATiedInstVVV<0b110, "sha256su1",int_aarch64_crypto_sha256su1>; + +def SHA1Hrr : SHAInstSS< 0b0000, "sha1h", int_aarch64_crypto_sha1h>; +def SHA1SU1rr : SHATiedInstVV<0b0001, "sha1su1", int_aarch64_crypto_sha1su1>; +def SHA256SU0rr : SHATiedInstVV<0b0010, "sha256su0",int_aarch64_crypto_sha256su0>; + +//---------------------------------------------------------------------------- +// Compiler-pseudos +//---------------------------------------------------------------------------- +// FIXME: Like for X86, these should go in their own separate .td file. + +// Any instruction that defines a 32-bit result leaves the high half of the +// register. Truncate can be lowered to EXTRACT_SUBREG. CopyFromReg may +// be copying from a truncate. But any other 32-bit operation will zero-extend +// up to 64 bits. +// FIXME: X86 also checks for CMOV here. Do we need something similar? +def def32 : PatLeaf<(i32 GPR32:$src), [{ + return N->getOpcode() != ISD::TRUNCATE && + N->getOpcode() != TargetOpcode::EXTRACT_SUBREG && + N->getOpcode() != ISD::CopyFromReg; +}]>; + +// In the case of a 32-bit def that is known to implicitly zero-extend, +// we can use a SUBREG_TO_REG. 
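A small C++ example of the implicit zero-extension the def32 PatLeaf relies on (the SUBREG_TO_REG pattern follows below); hypothetical function, expected codegen only:

    #include <cstdint>

    // "add w0, w0, w1" already clears bits 63:32, so the i32 -> i64 zext is
    // expected to be a free SUBREG_TO_REG rather than an extra UBFM/UXTW.
    uint64_t zext_of_add(uint32_t a, uint32_t b) { return static_cast<uint64_t>(a + b); }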
+def : Pat<(i64 (zext def32:$src)), (SUBREG_TO_REG (i64 0), GPR32:$src, sub_32)>; + +// For an anyext, we don't care what the high bits are, so we can perform an +// INSERT_SUBREF into an IMPLICIT_DEF. +def : Pat<(i64 (anyext GPR32:$src)), + (INSERT_SUBREG (i64 (IMPLICIT_DEF)), GPR32:$src, sub_32)>; + +// When we need to explicitly zero-extend, we use an unsigned bitfield move +// instruction (UBFM) on the enclosing super-reg. +def : Pat<(i64 (zext GPR32:$src)), + (UBFMXri (INSERT_SUBREG (i64 (IMPLICIT_DEF)), GPR32:$src, sub_32), 0, 31)>; + +// To sign extend, we use a signed bitfield move instruction (SBFM) on the +// containing super-reg. +def : Pat<(i64 (sext GPR32:$src)), + (SBFMXri (INSERT_SUBREG (i64 (IMPLICIT_DEF)), GPR32:$src, sub_32), 0, 31)>; +def : Pat<(i64 (sext_inreg GPR64:$src, i32)), (SBFMXri GPR64:$src, 0, 31)>; +def : Pat<(i64 (sext_inreg GPR64:$src, i16)), (SBFMXri GPR64:$src, 0, 15)>; +def : Pat<(i64 (sext_inreg GPR64:$src, i8)), (SBFMXri GPR64:$src, 0, 7)>; +def : Pat<(i64 (sext_inreg GPR64:$src, i1)), (SBFMXri GPR64:$src, 0, 0)>; +def : Pat<(i32 (sext_inreg GPR32:$src, i16)), (SBFMWri GPR32:$src, 0, 15)>; +def : Pat<(i32 (sext_inreg GPR32:$src, i8)), (SBFMWri GPR32:$src, 0, 7)>; +def : Pat<(i32 (sext_inreg GPR32:$src, i1)), (SBFMWri GPR32:$src, 0, 0)>; + +def : Pat<(shl (sext_inreg GPR32:$Rn, i8), (i64 imm0_31:$imm)), + (SBFMWri GPR32:$Rn, (i64 (i32shift_a imm0_31:$imm)), + (i64 (i32shift_sext_i8 imm0_31:$imm)))>; +def : Pat<(shl (sext_inreg GPR64:$Rn, i8), (i64 imm0_63:$imm)), + (SBFMXri GPR64:$Rn, (i64 (i64shift_a imm0_63:$imm)), + (i64 (i64shift_sext_i8 imm0_63:$imm)))>; + +def : Pat<(shl (sext_inreg GPR32:$Rn, i16), (i64 imm0_31:$imm)), + (SBFMWri GPR32:$Rn, (i64 (i32shift_a imm0_31:$imm)), + (i64 (i32shift_sext_i16 imm0_31:$imm)))>; +def : Pat<(shl (sext_inreg GPR64:$Rn, i16), (i64 imm0_63:$imm)), + (SBFMXri GPR64:$Rn, (i64 (i64shift_a imm0_63:$imm)), + (i64 (i64shift_sext_i16 imm0_63:$imm)))>; + +def : Pat<(shl (i64 (sext GPR32:$Rn)), (i64 imm0_63:$imm)), + (SBFMXri (INSERT_SUBREG (i64 (IMPLICIT_DEF)), GPR32:$Rn, sub_32), + (i64 (i64shift_a imm0_63:$imm)), + (i64 (i64shift_sext_i32 imm0_63:$imm)))>; + +// sra patterns have an AddedComplexity of 10, so make sure we have a higher +// AddedComplexity for the following patterns since we want to match sext + sra +// patterns before we attempt to match a single sra node. +let AddedComplexity = 20 in { +// We support all sext + sra combinations which preserve at least one bit of the +// original value which is to be sign extended. E.g. we support shifts up to +// bitwidth-1 bits. +def : Pat<(sra (sext_inreg GPR32:$Rn, i8), (i64 imm0_7:$imm)), + (SBFMWri GPR32:$Rn, (i64 imm0_7:$imm), 7)>; +def : Pat<(sra (sext_inreg GPR64:$Rn, i8), (i64 imm0_7:$imm)), + (SBFMXri GPR64:$Rn, (i64 imm0_7:$imm), 7)>; + +def : Pat<(sra (sext_inreg GPR32:$Rn, i16), (i64 imm0_15:$imm)), + (SBFMWri GPR32:$Rn, (i64 imm0_15:$imm), 15)>; +def : Pat<(sra (sext_inreg GPR64:$Rn, i16), (i64 imm0_15:$imm)), + (SBFMXri GPR64:$Rn, (i64 imm0_15:$imm), 15)>; + +def : Pat<(sra (i64 (sext GPR32:$Rn)), (i64 imm0_31:$imm)), + (SBFMXri (INSERT_SUBREG (i64 (IMPLICIT_DEF)), GPR32:$Rn, sub_32), + (i64 imm0_31:$imm), 31)>; +} // AddedComplexity = 20 + +// To truncate, we can simply extract from a subregister. +def : Pat<(i32 (trunc GPR64sp:$src)), + (i32 (EXTRACT_SUBREG GPR64sp:$src, sub_32))>; + +// __builtin_trap() uses the BRK instruction on AArch64. +def : Pat<(trap), (BRK 1)>; + +// Conversions within AdvSIMD types in the same register size are free. 
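Two hedged C++ one-liners for the bitfield-move and truncation patterns above (the big-endian bitconvert discussion that starts in the last comment continues below); expected codegen only:

    #include <cstdint>

    // sext of an i8 followed by a shift: the patterns above fold both into a
    // single SBFM, expected to print as one "sbfiz"-style instruction.
    int64_t sext8_shl3(int32_t x) { return static_cast<int64_t>(static_cast<int8_t>(x)) << 3; }

    // Truncation is only a subregister read; no instruction is expected.
    int32_t trunc_to_i32(int64_t x) { return static_cast<int32_t>(x); }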
+// But because we need a consistent lane ordering, in big endian many +// conversions require one or more REV instructions. +// +// Consider a simple memory load followed by a bitconvert then a store. +// v0 = load v2i32 +// v1 = BITCAST v2i32 v0 to v4i16 +// store v4i16 v2 +// +// In big endian mode every memory access has an implicit byte swap. LDR and +// STR do a 64-bit byte swap, whereas LD1/ST1 do a byte swap per lane - that +// is, they treat the vector as a sequence of elements to be byte-swapped. +// The two pairs of instructions are fundamentally incompatible. We've decided +// to use LD1/ST1 only to simplify compiler implementation. +// +// LD1/ST1 perform the equivalent of a sequence of LDR/STR + REV. This makes +// the original code sequence: +// v0 = load v2i32 +// v1 = REV v2i32 (implicit) +// v2 = BITCAST v2i32 v1 to v4i16 +// v3 = REV v4i16 v2 (implicit) +// store v4i16 v3 +// +// But this is now broken - the value stored is different to the value loaded +// due to lane reordering. To fix this, on every BITCAST we must perform two +// other REVs: +// v0 = load v2i32 +// v1 = REV v2i32 (implicit) +// v2 = REV v2i32 +// v3 = BITCAST v2i32 v2 to v4i16 +// v4 = REV v4i16 +// v5 = REV v4i16 v4 (implicit) +// store v4i16 v5 +// +// This means an extra two instructions, but actually in most cases the two REV +// instructions can be combined into one. For example: +// (REV64_2s (REV64_4h X)) === (REV32_4h X) +// +// There is also no 128-bit REV instruction. This must be synthesized with an +// EXT instruction. +// +// Most bitconverts require some sort of conversion. The only exceptions are: +// a) Identity conversions - vNfX <-> vNiX +// b) Single-lane-to-scalar - v1fX <-> fX or v1iX <-> iX +// + +let Predicates = [IsLE] in { +def : Pat<(v8i8 (bitconvert GPR64:$Xn)), (COPY_TO_REGCLASS GPR64:$Xn, FPR64)>; +def : Pat<(v4i16 (bitconvert GPR64:$Xn)), (COPY_TO_REGCLASS GPR64:$Xn, FPR64)>; +def : Pat<(v2i32 (bitconvert GPR64:$Xn)), (COPY_TO_REGCLASS GPR64:$Xn, FPR64)>; +def : Pat<(v2f32 (bitconvert GPR64:$Xn)), (COPY_TO_REGCLASS GPR64:$Xn, FPR64)>; + +def : Pat<(i64 (bitconvert (v8i8 V64:$Vn))), + (COPY_TO_REGCLASS V64:$Vn, GPR64)>; +def : Pat<(i64 (bitconvert (v4i16 V64:$Vn))), + (COPY_TO_REGCLASS V64:$Vn, GPR64)>; +def : Pat<(i64 (bitconvert (v2i32 V64:$Vn))), + (COPY_TO_REGCLASS V64:$Vn, GPR64)>; +def : Pat<(i64 (bitconvert (v2f32 V64:$Vn))), + (COPY_TO_REGCLASS V64:$Vn, GPR64)>; +def : Pat<(i64 (bitconvert (v1f64 V64:$Vn))), + (COPY_TO_REGCLASS V64:$Vn, GPR64)>; +} +let Predicates = [IsBE] in { +def : Pat<(v8i8 (bitconvert GPR64:$Xn)), + (REV64v8i8 (COPY_TO_REGCLASS GPR64:$Xn, FPR64))>; +def : Pat<(v4i16 (bitconvert GPR64:$Xn)), + (REV64v4i16 (COPY_TO_REGCLASS GPR64:$Xn, FPR64))>; +def : Pat<(v2i32 (bitconvert GPR64:$Xn)), + (REV64v2i32 (COPY_TO_REGCLASS GPR64:$Xn, FPR64))>; +def : Pat<(v2f32 (bitconvert GPR64:$Xn)), + (REV64v2i32 (COPY_TO_REGCLASS GPR64:$Xn, FPR64))>; + +def : Pat<(i64 (bitconvert (v8i8 V64:$Vn))), + (REV64v8i8 (COPY_TO_REGCLASS V64:$Vn, GPR64))>; +def : Pat<(i64 (bitconvert (v4i16 V64:$Vn))), + (REV64v4i16 (COPY_TO_REGCLASS V64:$Vn, GPR64))>; +def : Pat<(i64 (bitconvert (v2i32 V64:$Vn))), + (REV64v2i32 (COPY_TO_REGCLASS V64:$Vn, GPR64))>; +def : Pat<(i64 (bitconvert (v2f32 V64:$Vn))), + (REV64v2i32 (COPY_TO_REGCLASS V64:$Vn, GPR64))>; +} +def : Pat<(v1i64 (bitconvert GPR64:$Xn)), (COPY_TO_REGCLASS GPR64:$Xn, FPR64)>; +def : Pat<(v1f64 (bitconvert GPR64:$Xn)), (COPY_TO_REGCLASS GPR64:$Xn, FPR64)>; +def : Pat<(i64 (bitconvert (v1i64 V64:$Vn))), + (COPY_TO_REGCLASS 
V64:$Vn, GPR64)>; +def : Pat<(v1i64 (scalar_to_vector GPR64:$Xn)), + (COPY_TO_REGCLASS GPR64:$Xn, FPR64)>; +def : Pat<(v1f64 (scalar_to_vector GPR64:$Xn)), + (COPY_TO_REGCLASS GPR64:$Xn, FPR64)>; +def : Pat<(v1f64 (scalar_to_vector (f64 FPR64:$Xn))), (v1f64 FPR64:$Xn)>; + +def : Pat<(f32 (bitconvert (i32 GPR32:$Xn))), + (COPY_TO_REGCLASS GPR32:$Xn, FPR32)>; +def : Pat<(i32 (bitconvert (f32 FPR32:$Xn))), + (COPY_TO_REGCLASS FPR32:$Xn, GPR32)>; +def : Pat<(f64 (bitconvert (i64 GPR64:$Xn))), + (COPY_TO_REGCLASS GPR64:$Xn, FPR64)>; +def : Pat<(i64 (bitconvert (f64 FPR64:$Xn))), + (COPY_TO_REGCLASS FPR64:$Xn, GPR64)>; +def : Pat<(i64 (bitconvert (v1f64 V64:$Vn))), + (COPY_TO_REGCLASS V64:$Vn, GPR64)>; + +let Predicates = [IsLE] in { +def : Pat<(v1i64 (bitconvert (v2i32 FPR64:$src))), (v1i64 FPR64:$src)>; +def : Pat<(v1i64 (bitconvert (v4i16 FPR64:$src))), (v1i64 FPR64:$src)>; +def : Pat<(v1i64 (bitconvert (v8i8 FPR64:$src))), (v1i64 FPR64:$src)>; +def : Pat<(v1i64 (bitconvert (v2f32 FPR64:$src))), (v1i64 FPR64:$src)>; +} +let Predicates = [IsBE] in { +def : Pat<(v1i64 (bitconvert (v2i32 FPR64:$src))), + (v1i64 (REV64v2i32 FPR64:$src))>; +def : Pat<(v1i64 (bitconvert (v4i16 FPR64:$src))), + (v1i64 (REV64v4i16 FPR64:$src))>; +def : Pat<(v1i64 (bitconvert (v8i8 FPR64:$src))), + (v1i64 (REV64v8i8 FPR64:$src))>; +def : Pat<(v1i64 (bitconvert (v2f32 FPR64:$src))), + (v1i64 (REV64v2i32 FPR64:$src))>; +} +def : Pat<(v1i64 (bitconvert (v1f64 FPR64:$src))), (v1i64 FPR64:$src)>; +def : Pat<(v1i64 (bitconvert (f64 FPR64:$src))), (v1i64 FPR64:$src)>; + +let Predicates = [IsLE] in { +def : Pat<(v2i32 (bitconvert (v1i64 FPR64:$src))), (v2i32 FPR64:$src)>; +def : Pat<(v2i32 (bitconvert (v4i16 FPR64:$src))), (v2i32 FPR64:$src)>; +def : Pat<(v2i32 (bitconvert (v8i8 FPR64:$src))), (v2i32 FPR64:$src)>; +def : Pat<(v2i32 (bitconvert (f64 FPR64:$src))), (v2i32 FPR64:$src)>; +def : Pat<(v2i32 (bitconvert (v1f64 FPR64:$src))), (v2i32 FPR64:$src)>; +} +let Predicates = [IsBE] in { +def : Pat<(v2i32 (bitconvert (v1i64 FPR64:$src))), + (v2i32 (REV64v2i32 FPR64:$src))>; +def : Pat<(v2i32 (bitconvert (v4i16 FPR64:$src))), + (v2i32 (REV32v4i16 FPR64:$src))>; +def : Pat<(v2i32 (bitconvert (v8i8 FPR64:$src))), + (v2i32 (REV32v8i8 FPR64:$src))>; +def : Pat<(v2i32 (bitconvert (f64 FPR64:$src))), + (v2i32 (REV64v2i32 FPR64:$src))>; +def : Pat<(v2i32 (bitconvert (v1f64 FPR64:$src))), + (v2i32 (REV64v2i32 FPR64:$src))>; +} +def : Pat<(v2i32 (bitconvert (v2f32 FPR64:$src))), (v2i32 FPR64:$src)>; + +let Predicates = [IsLE] in { +def : Pat<(v4i16 (bitconvert (v1i64 FPR64:$src))), (v4i16 FPR64:$src)>; +def : Pat<(v4i16 (bitconvert (v2i32 FPR64:$src))), (v4i16 FPR64:$src)>; +def : Pat<(v4i16 (bitconvert (v8i8 FPR64:$src))), (v4i16 FPR64:$src)>; +def : Pat<(v4i16 (bitconvert (f64 FPR64:$src))), (v4i16 FPR64:$src)>; +def : Pat<(v4i16 (bitconvert (v2f32 FPR64:$src))), (v4i16 FPR64:$src)>; +def : Pat<(v4i16 (bitconvert (v1f64 FPR64:$src))), (v4i16 FPR64:$src)>; +} +let Predicates = [IsBE] in { +def : Pat<(v4i16 (bitconvert (v1i64 FPR64:$src))), + (v4i16 (REV64v4i16 FPR64:$src))>; +def : Pat<(v4i16 (bitconvert (v2i32 FPR64:$src))), + (v4i16 (REV32v4i16 FPR64:$src))>; +def : Pat<(v4i16 (bitconvert (v8i8 FPR64:$src))), + (v4i16 (REV16v8i8 FPR64:$src))>; +def : Pat<(v4i16 (bitconvert (f64 FPR64:$src))), + (v4i16 (REV64v4i16 FPR64:$src))>; +def : Pat<(v4i16 (bitconvert (v2f32 FPR64:$src))), + (v4i16 (REV32v4i16 FPR64:$src))>; +def : Pat<(v4i16 (bitconvert (v1f64 FPR64:$src))), + (v4i16 (REV64v4i16 FPR64:$src))>; +} + +let Predicates = [IsLE] in 
{ +def : Pat<(v8i8 (bitconvert (v1i64 FPR64:$src))), (v8i8 FPR64:$src)>; +def : Pat<(v8i8 (bitconvert (v2i32 FPR64:$src))), (v8i8 FPR64:$src)>; +def : Pat<(v8i8 (bitconvert (v4i16 FPR64:$src))), (v8i8 FPR64:$src)>; +def : Pat<(v8i8 (bitconvert (f64 FPR64:$src))), (v8i8 FPR64:$src)>; +def : Pat<(v8i8 (bitconvert (v2f32 FPR64:$src))), (v8i8 FPR64:$src)>; +def : Pat<(v8i8 (bitconvert (v1f64 FPR64:$src))), (v8i8 FPR64:$src)>; +} +let Predicates = [IsBE] in { +def : Pat<(v8i8 (bitconvert (v1i64 FPR64:$src))), + (v8i8 (REV64v8i8 FPR64:$src))>; +def : Pat<(v8i8 (bitconvert (v2i32 FPR64:$src))), + (v8i8 (REV32v8i8 FPR64:$src))>; +def : Pat<(v8i8 (bitconvert (v4i16 FPR64:$src))), + (v8i8 (REV16v8i8 FPR64:$src))>; +def : Pat<(v8i8 (bitconvert (f64 FPR64:$src))), + (v8i8 (REV64v8i8 FPR64:$src))>; +def : Pat<(v8i8 (bitconvert (v2f32 FPR64:$src))), + (v8i8 (REV32v8i8 FPR64:$src))>; +def : Pat<(v8i8 (bitconvert (v1f64 FPR64:$src))), + (v8i8 (REV64v8i8 FPR64:$src))>; +} + +let Predicates = [IsLE] in { +def : Pat<(f64 (bitconvert (v2i32 FPR64:$src))), (f64 FPR64:$src)>; +def : Pat<(f64 (bitconvert (v4i16 FPR64:$src))), (f64 FPR64:$src)>; +def : Pat<(f64 (bitconvert (v2f32 FPR64:$src))), (f64 FPR64:$src)>; +def : Pat<(f64 (bitconvert (v8i8 FPR64:$src))), (f64 FPR64:$src)>; +} +let Predicates = [IsBE] in { +def : Pat<(f64 (bitconvert (v2i32 FPR64:$src))), + (f64 (REV64v2i32 FPR64:$src))>; +def : Pat<(f64 (bitconvert (v4i16 FPR64:$src))), + (f64 (REV64v4i16 FPR64:$src))>; +def : Pat<(f64 (bitconvert (v2f32 FPR64:$src))), + (f64 (REV64v2i32 FPR64:$src))>; +def : Pat<(f64 (bitconvert (v8i8 FPR64:$src))), + (f64 (REV64v8i8 FPR64:$src))>; +} +def : Pat<(f64 (bitconvert (v1i64 FPR64:$src))), (f64 FPR64:$src)>; +def : Pat<(f64 (bitconvert (v1f64 FPR64:$src))), (f64 FPR64:$src)>; + +let Predicates = [IsLE] in { +def : Pat<(v1f64 (bitconvert (v2i32 FPR64:$src))), (v1f64 FPR64:$src)>; +def : Pat<(v1f64 (bitconvert (v4i16 FPR64:$src))), (v1f64 FPR64:$src)>; +def : Pat<(v1f64 (bitconvert (v8i8 FPR64:$src))), (v1f64 FPR64:$src)>; +def : Pat<(v1f64 (bitconvert (v2f32 FPR64:$src))), (v1f64 FPR64:$src)>; +} +let Predicates = [IsBE] in { +def : Pat<(v1f64 (bitconvert (v2i32 FPR64:$src))), + (v1f64 (REV64v2i32 FPR64:$src))>; +def : Pat<(v1f64 (bitconvert (v4i16 FPR64:$src))), + (v1f64 (REV64v4i16 FPR64:$src))>; +def : Pat<(v1f64 (bitconvert (v8i8 FPR64:$src))), + (v1f64 (REV64v8i8 FPR64:$src))>; +def : Pat<(v1f64 (bitconvert (v2f32 FPR64:$src))), + (v1f64 (REV64v2i32 FPR64:$src))>; +} +def : Pat<(v1f64 (bitconvert (v1i64 FPR64:$src))), (v1f64 FPR64:$src)>; +def : Pat<(v1f64 (bitconvert (f64 FPR64:$src))), (v1f64 FPR64:$src)>; + +let Predicates = [IsLE] in { +def : Pat<(v2f32 (bitconvert (v1i64 FPR64:$src))), (v2f32 FPR64:$src)>; +def : Pat<(v2f32 (bitconvert (v4i16 FPR64:$src))), (v2f32 FPR64:$src)>; +def : Pat<(v2f32 (bitconvert (v8i8 FPR64:$src))), (v2f32 FPR64:$src)>; +def : Pat<(v2f32 (bitconvert (v1f64 FPR64:$src))), (v2f32 FPR64:$src)>; +def : Pat<(v2f32 (bitconvert (f64 FPR64:$src))), (v2f32 FPR64:$src)>; +} +let Predicates = [IsBE] in { +def : Pat<(v2f32 (bitconvert (v1i64 FPR64:$src))), + (v2f32 (REV64v2i32 FPR64:$src))>; +def : Pat<(v2f32 (bitconvert (v4i16 FPR64:$src))), + (v2f32 (REV32v4i16 FPR64:$src))>; +def : Pat<(v2f32 (bitconvert (v8i8 FPR64:$src))), + (v2f32 (REV32v8i8 FPR64:$src))>; +def : Pat<(v2f32 (bitconvert (v1f64 FPR64:$src))), + (v2f32 (REV64v2i32 FPR64:$src))>; +def : Pat<(v2f32 (bitconvert (f64 FPR64:$src))), + (v2f32 (REV64v2i32 FPR64:$src))>; +} +def : Pat<(v2f32 (bitconvert (v2i32 
FPR64:$src))), (v2f32 FPR64:$src)>; + +let Predicates = [IsLE] in { +def : Pat<(f128 (bitconvert (v2i64 FPR128:$src))), (f128 FPR128:$src)>; +def : Pat<(f128 (bitconvert (v4i32 FPR128:$src))), (f128 FPR128:$src)>; +def : Pat<(f128 (bitconvert (v8i16 FPR128:$src))), (f128 FPR128:$src)>; +def : Pat<(f128 (bitconvert (v2f64 FPR128:$src))), (f128 FPR128:$src)>; +def : Pat<(f128 (bitconvert (v4f32 FPR128:$src))), (f128 FPR128:$src)>; +def : Pat<(f128 (bitconvert (v16i8 FPR128:$src))), (f128 FPR128:$src)>; +} +let Predicates = [IsBE] in { +def : Pat<(f128 (bitconvert (v2i64 FPR128:$src))), + (f128 (EXTv16i8 FPR128:$src, FPR128:$src, (i32 8)))>; +def : Pat<(f128 (bitconvert (v4i32 FPR128:$src))), + (f128 (EXTv16i8 (REV64v4i32 FPR128:$src), + (REV64v4i32 FPR128:$src), (i32 8)))>; +def : Pat<(f128 (bitconvert (v8i16 FPR128:$src))), + (f128 (EXTv16i8 (REV64v8i16 FPR128:$src), + (REV64v8i16 FPR128:$src), (i32 8)))>; +def : Pat<(f128 (bitconvert (v2f64 FPR128:$src))), + (f128 (EXTv16i8 FPR128:$src, FPR128:$src, (i32 8)))>; +def : Pat<(f128 (bitconvert (v4f32 FPR128:$src))), + (f128 (EXTv16i8 (REV64v4i32 FPR128:$src), + (REV64v4i32 FPR128:$src), (i32 8)))>; +def : Pat<(f128 (bitconvert (v16i8 FPR128:$src))), + (f128 (EXTv16i8 (REV64v16i8 FPR128:$src), + (REV64v16i8 FPR128:$src), (i32 8)))>; +} + +let Predicates = [IsLE] in { +def : Pat<(v2f64 (bitconvert (f128 FPR128:$src))), (v2f64 FPR128:$src)>; +def : Pat<(v2f64 (bitconvert (v4i32 FPR128:$src))), (v2f64 FPR128:$src)>; +def : Pat<(v2f64 (bitconvert (v8i16 FPR128:$src))), (v2f64 FPR128:$src)>; +def : Pat<(v2f64 (bitconvert (v16i8 FPR128:$src))), (v2f64 FPR128:$src)>; +def : Pat<(v2f64 (bitconvert (v4f32 FPR128:$src))), (v2f64 FPR128:$src)>; +} +let Predicates = [IsBE] in { +def : Pat<(v2f64 (bitconvert (f128 FPR128:$src))), + (v2f64 (EXTv16i8 FPR128:$src, + FPR128:$src, (i32 8)))>; +def : Pat<(v2f64 (bitconvert (v4i32 FPR128:$src))), + (v2f64 (REV64v4i32 FPR128:$src))>; +def : Pat<(v2f64 (bitconvert (v8i16 FPR128:$src))), + (v2f64 (REV64v8i16 FPR128:$src))>; +def : Pat<(v2f64 (bitconvert (v16i8 FPR128:$src))), + (v2f64 (REV64v16i8 FPR128:$src))>; +def : Pat<(v2f64 (bitconvert (v4f32 FPR128:$src))), + (v2f64 (REV64v4i32 FPR128:$src))>; +} +def : Pat<(v2f64 (bitconvert (v2i64 FPR128:$src))), (v2f64 FPR128:$src)>; + +let Predicates = [IsLE] in { +def : Pat<(v4f32 (bitconvert (f128 FPR128:$src))), (v4f32 FPR128:$src)>; +def : Pat<(v4f32 (bitconvert (v8i16 FPR128:$src))), (v4f32 FPR128:$src)>; +def : Pat<(v4f32 (bitconvert (v16i8 FPR128:$src))), (v4f32 FPR128:$src)>; +def : Pat<(v4f32 (bitconvert (v2i64 FPR128:$src))), (v4f32 FPR128:$src)>; +def : Pat<(v4f32 (bitconvert (v2f64 FPR128:$src))), (v4f32 FPR128:$src)>; +} +let Predicates = [IsBE] in { +def : Pat<(v4f32 (bitconvert (f128 FPR128:$src))), + (v4f32 (EXTv16i8 (REV64v4i32 FPR128:$src), + (REV64v4i32 FPR128:$src), (i32 8)))>; +def : Pat<(v4f32 (bitconvert (v8i16 FPR128:$src))), + (v4f32 (REV32v8i16 FPR128:$src))>; +def : Pat<(v4f32 (bitconvert (v16i8 FPR128:$src))), + (v4f32 (REV32v16i8 FPR128:$src))>; +def : Pat<(v4f32 (bitconvert (v2i64 FPR128:$src))), + (v4f32 (REV64v4i32 FPR128:$src))>; +def : Pat<(v4f32 (bitconvert (v2f64 FPR128:$src))), + (v4f32 (REV64v4i32 FPR128:$src))>; +} +def : Pat<(v4f32 (bitconvert (v4i32 FPR128:$src))), (v4f32 FPR128:$src)>; + +let Predicates = [IsLE] in { +def : Pat<(v2i64 (bitconvert (f128 FPR128:$src))), (v2i64 FPR128:$src)>; +def : Pat<(v2i64 (bitconvert (v4i32 FPR128:$src))), (v2i64 FPR128:$src)>; +def : Pat<(v2i64 (bitconvert (v8i16 FPR128:$src))), (v2i64 
FPR128:$src)>; +def : Pat<(v2i64 (bitconvert (v16i8 FPR128:$src))), (v2i64 FPR128:$src)>; +def : Pat<(v2i64 (bitconvert (v4f32 FPR128:$src))), (v2i64 FPR128:$src)>; +} +let Predicates = [IsBE] in { +def : Pat<(v2i64 (bitconvert (f128 FPR128:$src))), + (v2i64 (EXTv16i8 FPR128:$src, + FPR128:$src, (i32 8)))>; +def : Pat<(v2i64 (bitconvert (v4i32 FPR128:$src))), + (v2i64 (REV64v4i32 FPR128:$src))>; +def : Pat<(v2i64 (bitconvert (v8i16 FPR128:$src))), + (v2i64 (REV64v8i16 FPR128:$src))>; +def : Pat<(v2i64 (bitconvert (v16i8 FPR128:$src))), + (v2i64 (REV64v16i8 FPR128:$src))>; +def : Pat<(v2i64 (bitconvert (v4f32 FPR128:$src))), + (v2i64 (REV64v4i32 FPR128:$src))>; +} +def : Pat<(v2i64 (bitconvert (v2f64 FPR128:$src))), (v2i64 FPR128:$src)>; + +let Predicates = [IsLE] in { +def : Pat<(v4i32 (bitconvert (f128 FPR128:$src))), (v4i32 FPR128:$src)>; +def : Pat<(v4i32 (bitconvert (v2i64 FPR128:$src))), (v4i32 FPR128:$src)>; +def : Pat<(v4i32 (bitconvert (v8i16 FPR128:$src))), (v4i32 FPR128:$src)>; +def : Pat<(v4i32 (bitconvert (v16i8 FPR128:$src))), (v4i32 FPR128:$src)>; +def : Pat<(v4i32 (bitconvert (v2f64 FPR128:$src))), (v4i32 FPR128:$src)>; +} +let Predicates = [IsBE] in { +def : Pat<(v4i32 (bitconvert (f128 FPR128:$src))), + (v4i32 (EXTv16i8 (REV64v4i32 FPR128:$src), + (REV64v4i32 FPR128:$src), + (i32 8)))>; +def : Pat<(v4i32 (bitconvert (v2i64 FPR128:$src))), + (v4i32 (REV64v4i32 FPR128:$src))>; +def : Pat<(v4i32 (bitconvert (v8i16 FPR128:$src))), + (v4i32 (REV32v8i16 FPR128:$src))>; +def : Pat<(v4i32 (bitconvert (v16i8 FPR128:$src))), + (v4i32 (REV32v16i8 FPR128:$src))>; +def : Pat<(v4i32 (bitconvert (v2f64 FPR128:$src))), + (v4i32 (REV64v4i32 FPR128:$src))>; +} +def : Pat<(v4i32 (bitconvert (v4f32 FPR128:$src))), (v4i32 FPR128:$src)>; + +let Predicates = [IsLE] in { +def : Pat<(v8i16 (bitconvert (f128 FPR128:$src))), (v8i16 FPR128:$src)>; +def : Pat<(v8i16 (bitconvert (v2i64 FPR128:$src))), (v8i16 FPR128:$src)>; +def : Pat<(v8i16 (bitconvert (v4i32 FPR128:$src))), (v8i16 FPR128:$src)>; +def : Pat<(v8i16 (bitconvert (v16i8 FPR128:$src))), (v8i16 FPR128:$src)>; +def : Pat<(v8i16 (bitconvert (v2f64 FPR128:$src))), (v8i16 FPR128:$src)>; +def : Pat<(v8i16 (bitconvert (v4f32 FPR128:$src))), (v8i16 FPR128:$src)>; +} +let Predicates = [IsBE] in { +def : Pat<(v8i16 (bitconvert (f128 FPR128:$src))), + (v8i16 (EXTv16i8 (REV64v8i16 FPR128:$src), + (REV64v8i16 FPR128:$src), + (i32 8)))>; +def : Pat<(v8i16 (bitconvert (v2i64 FPR128:$src))), + (v8i16 (REV64v8i16 FPR128:$src))>; +def : Pat<(v8i16 (bitconvert (v4i32 FPR128:$src))), + (v8i16 (REV32v8i16 FPR128:$src))>; +def : Pat<(v8i16 (bitconvert (v16i8 FPR128:$src))), + (v8i16 (REV16v16i8 FPR128:$src))>; +def : Pat<(v8i16 (bitconvert (v2f64 FPR128:$src))), + (v8i16 (REV64v8i16 FPR128:$src))>; +def : Pat<(v8i16 (bitconvert (v4f32 FPR128:$src))), + (v8i16 (REV32v8i16 FPR128:$src))>; +} + +let Predicates = [IsLE] in { +def : Pat<(v16i8 (bitconvert (f128 FPR128:$src))), (v16i8 FPR128:$src)>; +def : Pat<(v16i8 (bitconvert (v2i64 FPR128:$src))), (v16i8 FPR128:$src)>; +def : Pat<(v16i8 (bitconvert (v4i32 FPR128:$src))), (v16i8 FPR128:$src)>; +def : Pat<(v16i8 (bitconvert (v8i16 FPR128:$src))), (v16i8 FPR128:$src)>; +def : Pat<(v16i8 (bitconvert (v2f64 FPR128:$src))), (v16i8 FPR128:$src)>; +def : Pat<(v16i8 (bitconvert (v4f32 FPR128:$src))), (v16i8 FPR128:$src)>; +} +let Predicates = [IsBE] in { +def : Pat<(v16i8 (bitconvert (f128 FPR128:$src))), + (v16i8 (EXTv16i8 (REV64v16i8 FPR128:$src), + (REV64v16i8 FPR128:$src), + (i32 8)))>; +def : Pat<(v16i8 (bitconvert 
(v2i64 FPR128:$src))), + (v16i8 (REV64v16i8 FPR128:$src))>; +def : Pat<(v16i8 (bitconvert (v4i32 FPR128:$src))), + (v16i8 (REV32v16i8 FPR128:$src))>; +def : Pat<(v16i8 (bitconvert (v8i16 FPR128:$src))), + (v16i8 (REV16v16i8 FPR128:$src))>; +def : Pat<(v16i8 (bitconvert (v2f64 FPR128:$src))), + (v16i8 (REV64v16i8 FPR128:$src))>; +def : Pat<(v16i8 (bitconvert (v4f32 FPR128:$src))), + (v16i8 (REV32v16i8 FPR128:$src))>; +} + +def : Pat<(v8i8 (extract_subvector (v16i8 FPR128:$Rn), (i64 1))), + (EXTRACT_SUBREG (DUPv2i64lane FPR128:$Rn, 1), dsub)>; +def : Pat<(v4i16 (extract_subvector (v8i16 FPR128:$Rn), (i64 1))), + (EXTRACT_SUBREG (DUPv2i64lane FPR128:$Rn, 1), dsub)>; +def : Pat<(v2i32 (extract_subvector (v4i32 FPR128:$Rn), (i64 1))), + (EXTRACT_SUBREG (DUPv2i64lane FPR128:$Rn, 1), dsub)>; +def : Pat<(v1i64 (extract_subvector (v2i64 FPR128:$Rn), (i64 1))), + (EXTRACT_SUBREG (DUPv2i64lane FPR128:$Rn, 1), dsub)>; + +// A 64-bit subvector insert to the first 128-bit vector position +// is a subregister copy that needs no instruction. +def : Pat<(insert_subvector undef, (v1i64 FPR64:$src), (i32 0)), + (INSERT_SUBREG (v2i64 (IMPLICIT_DEF)), FPR64:$src, dsub)>; +def : Pat<(insert_subvector undef, (v1f64 FPR64:$src), (i32 0)), + (INSERT_SUBREG (v2f64 (IMPLICIT_DEF)), FPR64:$src, dsub)>; +def : Pat<(insert_subvector undef, (v2i32 FPR64:$src), (i32 0)), + (INSERT_SUBREG (v4i32 (IMPLICIT_DEF)), FPR64:$src, dsub)>; +def : Pat<(insert_subvector undef, (v2f32 FPR64:$src), (i32 0)), + (INSERT_SUBREG (v4f32 (IMPLICIT_DEF)), FPR64:$src, dsub)>; +def : Pat<(insert_subvector undef, (v4i16 FPR64:$src), (i32 0)), + (INSERT_SUBREG (v8i16 (IMPLICIT_DEF)), FPR64:$src, dsub)>; +def : Pat<(insert_subvector undef, (v8i8 FPR64:$src), (i32 0)), + (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)), FPR64:$src, dsub)>; + +// Use pair-wise add instructions when summing up the lanes for v2f64, v2i64 +// or v2f32. +def : Pat<(i64 (add (vector_extract (v2i64 FPR128:$Rn), (i64 0)), + (vector_extract (v2i64 FPR128:$Rn), (i64 1)))), + (i64 (ADDPv2i64p (v2i64 FPR128:$Rn)))>; +def : Pat<(f64 (fadd (vector_extract (v2f64 FPR128:$Rn), (i64 0)), + (vector_extract (v2f64 FPR128:$Rn), (i64 1)))), + (f64 (FADDPv2i64p (v2f64 FPR128:$Rn)))>; + // vector_extract on 64-bit vectors gets promoted to a 128 bit vector, + // so we match on v4f32 here, not v2f32. This will also catch adding + // the low two lanes of a true v4f32 vector. +def : Pat<(fadd (vector_extract (v4f32 FPR128:$Rn), (i64 0)), + (vector_extract (v4f32 FPR128:$Rn), (i64 1))), + (f32 (FADDPv2i32p (EXTRACT_SUBREG FPR128:$Rn, dsub)))>; + +// Scalar 64-bit shifts in FPR64 registers. +def : Pat<(i64 (int_aarch64_neon_sshl (i64 FPR64:$Rn), (i64 FPR64:$Rm))), + (SSHLv1i64 FPR64:$Rn, FPR64:$Rm)>; +def : Pat<(i64 (int_aarch64_neon_ushl (i64 FPR64:$Rn), (i64 FPR64:$Rm))), + (USHLv1i64 FPR64:$Rn, FPR64:$Rm)>; +def : Pat<(i64 (int_aarch64_neon_srshl (i64 FPR64:$Rn), (i64 FPR64:$Rm))), + (SRSHLv1i64 FPR64:$Rn, FPR64:$Rm)>; +def : Pat<(i64 (int_aarch64_neon_urshl (i64 FPR64:$Rn), (i64 FPR64:$Rm))), + (URSHLv1i64 FPR64:$Rn, FPR64:$Rm)>; + +// Tail call return handling. These are all compiler pseudo-instructions, +// so no encoding information or anything like that. 
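+// For illustration (a sketch, not from the patch): these pseudos cover sibling
+// calls such as
+//   int callee(int);
+//   int caller(int x) { return callee(x + 1); }
+// where, when tail call optimisation applies, the call is emitted as a plain
+// "b callee"; TCRETURNdi handles the direct (symbol) case and TCRETURNri the
+// case where the target address is in a register.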
+let isCall = 1, isTerminator = 1, isReturn = 1, isBarrier = 1, Uses = [SP] in {
+ def TCRETURNdi : Pseudo<(outs), (ins i64imm:$dst, i32imm:$FPDiff),[]>;
+ def TCRETURNri : Pseudo<(outs), (ins tcGPR64:$dst, i32imm:$FPDiff), []>;
+}
+
+def : Pat<(AArch64tcret tcGPR64:$dst, (i32 timm:$FPDiff)),
+          (TCRETURNri tcGPR64:$dst, imm:$FPDiff)>;
+def : Pat<(AArch64tcret tglobaladdr:$dst, (i32 timm:$FPDiff)),
+          (TCRETURNdi texternalsym:$dst, imm:$FPDiff)>;
+def : Pat<(AArch64tcret texternalsym:$dst, (i32 timm:$FPDiff)),
+          (TCRETURNdi texternalsym:$dst, imm:$FPDiff)>;
+
+include "AArch64InstrAtomics.td"
diff --git a/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp b/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp
new file mode 100644
index 00000000000..e7454be125b
--- /dev/null
+++ b/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp
@@ -0,0 +1,942 @@
+//=- AArch64LoadStoreOptimizer.cpp - AArch64 load/store opt. pass -*- C++ -*-=//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains a pass that performs load / store related peephole
+// optimizations. This pass should be run after register allocation.
+//
+//===----------------------------------------------------------------------===//
+
+#include "AArch64InstrInfo.h"
+#include "MCTargetDesc/AArch64AddressingModes.h"
+#include "llvm/ADT/BitVector.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/ADT/Statistic.h"
+using namespace llvm;
+
+#define DEBUG_TYPE "aarch64-ldst-opt"
+
+/// AArch64LoadStoreOpt - Post-register allocation pass to combine
+/// load / store instructions to form ldp / stp instructions.
+
+STATISTIC(NumPairCreated, "Number of load/store pair instructions generated");
+STATISTIC(NumPostFolded, "Number of post-index updates folded");
+STATISTIC(NumPreFolded, "Number of pre-index updates folded");
+STATISTIC(NumUnscaledPairCreated,
+          "Number of load/store from unscaled generated");
+
+static cl::opt<unsigned> ScanLimit("aarch64-load-store-scan-limit", cl::init(20),
+                                   cl::Hidden);
+
+// Placeholder while testing unscaled load/store combining
+static cl::opt<bool>
+EnableAArch64UnscaledMemOp("aarch64-unscaled-mem-op", cl::Hidden,
+                           cl::desc("Allow AArch64 unscaled load/store combining"),
+                           cl::init(true));
+
+namespace {
+struct AArch64LoadStoreOpt : public MachineFunctionPass {
+  static char ID;
+  AArch64LoadStoreOpt() : MachineFunctionPass(ID) {}
+
+  const AArch64InstrInfo *TII;
+  const TargetRegisterInfo *TRI;
+
+  // Scan the instructions looking for a load/store that can be combined
+  // with the current instruction into a load/store pair.
+  // Return the matching instruction if one is found, else MBB->end().
+  // If a matching instruction is found, mergeForward is set to true if the
+  // merge is to remove the first instruction and replace the second with
+  // a pair-wise insn, and false if the reverse is true.
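+  // For illustration (a sketch, not from the original patch): given
+  //   ldr x3, [x2]        <- current instruction I
+  //   add x5, x4, x3
+  //   ldr x6, [x2, #8]    <- iterator returned, mergeForward == false
+  // the two loads can later be rewritten as "ldp x3, x6, [x2]" at the
+  // position of the first load.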
+ MachineBasicBlock::iterator findMatchingInsn(MachineBasicBlock::iterator I, + bool &mergeForward, + unsigned Limit); + // Merge the two instructions indicated into a single pair-wise instruction. + // If mergeForward is true, erase the first instruction and fold its + // operation into the second. If false, the reverse. Return the instruction + // following the first instruction (which may change during processing). + MachineBasicBlock::iterator + mergePairedInsns(MachineBasicBlock::iterator I, + MachineBasicBlock::iterator Paired, bool mergeForward); + + // Scan the instruction list to find a base register update that can + // be combined with the current instruction (a load or store) using + // pre or post indexed addressing with writeback. Scan forwards. + MachineBasicBlock::iterator + findMatchingUpdateInsnForward(MachineBasicBlock::iterator I, unsigned Limit, + int Value); + + // Scan the instruction list to find a base register update that can + // be combined with the current instruction (a load or store) using + // pre or post indexed addressing with writeback. Scan backwards. + MachineBasicBlock::iterator + findMatchingUpdateInsnBackward(MachineBasicBlock::iterator I, unsigned Limit); + + // Merge a pre-index base register update into a ld/st instruction. + MachineBasicBlock::iterator + mergePreIdxUpdateInsn(MachineBasicBlock::iterator I, + MachineBasicBlock::iterator Update); + + // Merge a post-index base register update into a ld/st instruction. + MachineBasicBlock::iterator + mergePostIdxUpdateInsn(MachineBasicBlock::iterator I, + MachineBasicBlock::iterator Update); + + bool optimizeBlock(MachineBasicBlock &MBB); + + bool runOnMachineFunction(MachineFunction &Fn) override; + + const char *getPassName() const override { + return "AArch64 load / store optimization pass"; + } + +private: + int getMemSize(MachineInstr *MemMI); +}; +char AArch64LoadStoreOpt::ID = 0; +} + +static bool isUnscaledLdst(unsigned Opc) { + switch (Opc) { + default: + return false; + case AArch64::STURSi: + return true; + case AArch64::STURDi: + return true; + case AArch64::STURQi: + return true; + case AArch64::STURWi: + return true; + case AArch64::STURXi: + return true; + case AArch64::LDURSi: + return true; + case AArch64::LDURDi: + return true; + case AArch64::LDURQi: + return true; + case AArch64::LDURWi: + return true; + case AArch64::LDURXi: + return true; + } +} + +// Size in bytes of the data moved by an unscaled load or store +int AArch64LoadStoreOpt::getMemSize(MachineInstr *MemMI) { + switch (MemMI->getOpcode()) { + default: + llvm_unreachable("Opcode has has unknown size!"); + case AArch64::STRSui: + case AArch64::STURSi: + return 4; + case AArch64::STRDui: + case AArch64::STURDi: + return 8; + case AArch64::STRQui: + case AArch64::STURQi: + return 16; + case AArch64::STRWui: + case AArch64::STURWi: + return 4; + case AArch64::STRXui: + case AArch64::STURXi: + return 8; + case AArch64::LDRSui: + case AArch64::LDURSi: + return 4; + case AArch64::LDRDui: + case AArch64::LDURDi: + return 8; + case AArch64::LDRQui: + case AArch64::LDURQi: + return 16; + case AArch64::LDRWui: + case AArch64::LDURWi: + return 4; + case AArch64::LDRXui: + case AArch64::LDURXi: + return 8; + } +} + +static unsigned getMatchingPairOpcode(unsigned Opc) { + switch (Opc) { + default: + llvm_unreachable("Opcode has no pairwise equivalent!"); + case AArch64::STRSui: + case AArch64::STURSi: + return AArch64::STPSi; + case AArch64::STRDui: + case AArch64::STURDi: + return AArch64::STPDi; + case AArch64::STRQui: + case 
AArch64::STURQi: + return AArch64::STPQi; + case AArch64::STRWui: + case AArch64::STURWi: + return AArch64::STPWi; + case AArch64::STRXui: + case AArch64::STURXi: + return AArch64::STPXi; + case AArch64::LDRSui: + case AArch64::LDURSi: + return AArch64::LDPSi; + case AArch64::LDRDui: + case AArch64::LDURDi: + return AArch64::LDPDi; + case AArch64::LDRQui: + case AArch64::LDURQi: + return AArch64::LDPQi; + case AArch64::LDRWui: + case AArch64::LDURWi: + return AArch64::LDPWi; + case AArch64::LDRXui: + case AArch64::LDURXi: + return AArch64::LDPXi; + } +} + +static unsigned getPreIndexedOpcode(unsigned Opc) { + switch (Opc) { + default: + llvm_unreachable("Opcode has no pre-indexed equivalent!"); + case AArch64::STRSui: return AArch64::STRSpre; + case AArch64::STRDui: return AArch64::STRDpre; + case AArch64::STRQui: return AArch64::STRQpre; + case AArch64::STRWui: return AArch64::STRWpre; + case AArch64::STRXui: return AArch64::STRXpre; + case AArch64::LDRSui: return AArch64::LDRSpre; + case AArch64::LDRDui: return AArch64::LDRDpre; + case AArch64::LDRQui: return AArch64::LDRQpre; + case AArch64::LDRWui: return AArch64::LDRWpre; + case AArch64::LDRXui: return AArch64::LDRXpre; + } +} + +static unsigned getPostIndexedOpcode(unsigned Opc) { + switch (Opc) { + default: + llvm_unreachable("Opcode has no post-indexed wise equivalent!"); + case AArch64::STRSui: + return AArch64::STRSpost; + case AArch64::STRDui: + return AArch64::STRDpost; + case AArch64::STRQui: + return AArch64::STRQpost; + case AArch64::STRWui: + return AArch64::STRWpost; + case AArch64::STRXui: + return AArch64::STRXpost; + case AArch64::LDRSui: + return AArch64::LDRSpost; + case AArch64::LDRDui: + return AArch64::LDRDpost; + case AArch64::LDRQui: + return AArch64::LDRQpost; + case AArch64::LDRWui: + return AArch64::LDRWpost; + case AArch64::LDRXui: + return AArch64::LDRXpost; + } +} + +MachineBasicBlock::iterator +AArch64LoadStoreOpt::mergePairedInsns(MachineBasicBlock::iterator I, + MachineBasicBlock::iterator Paired, + bool mergeForward) { + MachineBasicBlock::iterator NextI = I; + ++NextI; + // If NextI is the second of the two instructions to be merged, we need + // to skip one further. Either way we merge will invalidate the iterator, + // and we don't need to scan the new instruction, as it's a pairwise + // instruction, which we're not considering for further action anyway. + if (NextI == Paired) + ++NextI; + + bool IsUnscaled = isUnscaledLdst(I->getOpcode()); + int OffsetStride = + IsUnscaled && EnableAArch64UnscaledMemOp ? getMemSize(I) : 1; + + unsigned NewOpc = getMatchingPairOpcode(I->getOpcode()); + // Insert our new paired instruction after whichever of the paired + // instructions mergeForward indicates. + MachineBasicBlock::iterator InsertionPoint = mergeForward ? Paired : I; + // Also based on mergeForward is from where we copy the base register operand + // so we get the flags compatible with the input code. + MachineOperand &BaseRegOp = + mergeForward ? Paired->getOperand(1) : I->getOperand(1); + + // Which register is Rt and which is Rt2 depends on the offset order. + MachineInstr *RtMI, *Rt2MI; + if (I->getOperand(2).getImm() == + Paired->getOperand(2).getImm() + OffsetStride) { + RtMI = Paired; + Rt2MI = I; + } else { + RtMI = I; + Rt2MI = Paired; + } + // Handle Unscaled + int OffsetImm = RtMI->getOperand(2).getImm(); + if (IsUnscaled && EnableAArch64UnscaledMemOp) + OffsetImm /= OffsetStride; + + // Construct the new instruction. 
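+  // As a concrete sketch (illustrative only): pairing
+  //   ldr w1, [x0, #4]
+  //   ldr w2, [x0, #8]
+  // builds "ldp w1, w2, [x0, #4]", with Rt/Rt2 ordered by ascending offset as
+  // computed above.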
+ MachineInstrBuilder MIB = BuildMI(*I->getParent(), InsertionPoint, + I->getDebugLoc(), TII->get(NewOpc)) + .addOperand(RtMI->getOperand(0)) + .addOperand(Rt2MI->getOperand(0)) + .addOperand(BaseRegOp) + .addImm(OffsetImm); + (void)MIB; + + // FIXME: Do we need/want to copy the mem operands from the source + // instructions? Probably. What uses them after this? + + DEBUG(dbgs() << "Creating pair load/store. Replacing instructions:\n "); + DEBUG(I->print(dbgs())); + DEBUG(dbgs() << " "); + DEBUG(Paired->print(dbgs())); + DEBUG(dbgs() << " with instruction:\n "); + DEBUG(((MachineInstr *)MIB)->print(dbgs())); + DEBUG(dbgs() << "\n"); + + // Erase the old instructions. + I->eraseFromParent(); + Paired->eraseFromParent(); + + return NextI; +} + +/// trackRegDefsUses - Remember what registers the specified instruction uses +/// and modifies. +static void trackRegDefsUses(MachineInstr *MI, BitVector &ModifiedRegs, + BitVector &UsedRegs, + const TargetRegisterInfo *TRI) { + for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) { + MachineOperand &MO = MI->getOperand(i); + if (MO.isRegMask()) + ModifiedRegs.setBitsNotInMask(MO.getRegMask()); + + if (!MO.isReg()) + continue; + unsigned Reg = MO.getReg(); + if (MO.isDef()) { + for (MCRegAliasIterator AI(Reg, TRI, true); AI.isValid(); ++AI) + ModifiedRegs.set(*AI); + } else { + assert(MO.isUse() && "Reg operand not a def and not a use?!?"); + for (MCRegAliasIterator AI(Reg, TRI, true); AI.isValid(); ++AI) + UsedRegs.set(*AI); + } + } +} + +static bool inBoundsForPair(bool IsUnscaled, int Offset, int OffsetStride) { + if (!IsUnscaled && (Offset > 63 || Offset < -64)) + return false; + if (IsUnscaled) { + // Convert the byte-offset used by unscaled into an "element" offset used + // by the scaled pair load/store instructions. + int elemOffset = Offset / OffsetStride; + if (elemOffset > 63 || elemOffset < -64) + return false; + } + return true; +} + +// Do alignment, specialized to power of 2 and for signed ints, +// avoiding having to do a C-style cast from uint_64t to int when +// using RoundUpToAlignment from include/llvm/Support/MathExtras.h. +// FIXME: Move this function to include/MathExtras.h? +static int alignTo(int Num, int PowOf2) { + return (Num + PowOf2 - 1) & ~(PowOf2 - 1); +} + +/// findMatchingInsn - Scan the instructions looking for a load/store that can +/// be combined with the current instruction into a load/store pair. +MachineBasicBlock::iterator +AArch64LoadStoreOpt::findMatchingInsn(MachineBasicBlock::iterator I, + bool &mergeForward, unsigned Limit) { + MachineBasicBlock::iterator E = I->getParent()->end(); + MachineBasicBlock::iterator MBBI = I; + MachineInstr *FirstMI = I; + ++MBBI; + + int Opc = FirstMI->getOpcode(); + bool mayLoad = FirstMI->mayLoad(); + bool IsUnscaled = isUnscaledLdst(Opc); + unsigned Reg = FirstMI->getOperand(0).getReg(); + unsigned BaseReg = FirstMI->getOperand(1).getReg(); + int Offset = FirstMI->getOperand(2).getImm(); + + // Early exit if the first instruction modifies the base register. + // e.g., ldr x0, [x0] + // Early exit if the offset if not possible to match. (6 bits of positive + // range, plus allow an extra one in case we find a later insn that matches + // with Offset-1 + if (FirstMI->modifiesRegister(BaseReg, TRI)) + return E; + int OffsetStride = + IsUnscaled && EnableAArch64UnscaledMemOp ? 
getMemSize(FirstMI) : 1; + if (!inBoundsForPair(IsUnscaled, Offset, OffsetStride)) + return E; + + // Track which registers have been modified and used between the first insn + // (inclusive) and the second insn. + BitVector ModifiedRegs, UsedRegs; + ModifiedRegs.resize(TRI->getNumRegs()); + UsedRegs.resize(TRI->getNumRegs()); + for (unsigned Count = 0; MBBI != E && Count < Limit; ++MBBI) { + MachineInstr *MI = MBBI; + // Skip DBG_VALUE instructions. Otherwise debug info can affect the + // optimization by changing how far we scan. + if (MI->isDebugValue()) + continue; + + // Now that we know this is a real instruction, count it. + ++Count; + + if (Opc == MI->getOpcode() && MI->getOperand(2).isImm()) { + // If we've found another instruction with the same opcode, check to see + // if the base and offset are compatible with our starting instruction. + // These instructions all have scaled immediate operands, so we just + // check for +1/-1. Make sure to check the new instruction offset is + // actually an immediate and not a symbolic reference destined for + // a relocation. + // + // Pairwise instructions have a 7-bit signed offset field. Single insns + // have a 12-bit unsigned offset field. To be a valid combine, the + // final offset must be in range. + unsigned MIBaseReg = MI->getOperand(1).getReg(); + int MIOffset = MI->getOperand(2).getImm(); + if (BaseReg == MIBaseReg && ((Offset == MIOffset + OffsetStride) || + (Offset + OffsetStride == MIOffset))) { + int MinOffset = Offset < MIOffset ? Offset : MIOffset; + // If this is a volatile load/store that otherwise matched, stop looking + // as something is going on that we don't have enough information to + // safely transform. Similarly, stop if we see a hint to avoid pairs. + if (MI->hasOrderedMemoryRef() || TII->isLdStPairSuppressed(MI)) + return E; + // If the resultant immediate offset of merging these instructions + // is out of range for a pairwise instruction, bail and keep looking. + bool MIIsUnscaled = isUnscaledLdst(MI->getOpcode()); + if (!inBoundsForPair(MIIsUnscaled, MinOffset, OffsetStride)) { + trackRegDefsUses(MI, ModifiedRegs, UsedRegs, TRI); + continue; + } + // If the alignment requirements of the paired (scaled) instruction + // can't express the offset of the unscaled input, bail and keep + // looking. + if (IsUnscaled && EnableAArch64UnscaledMemOp && + (alignTo(MinOffset, OffsetStride) != MinOffset)) { + trackRegDefsUses(MI, ModifiedRegs, UsedRegs, TRI); + continue; + } + // If the destination register of the loads is the same register, bail + // and keep looking. A load-pair instruction with both destination + // registers the same is UNPREDICTABLE and will result in an exception. + if (mayLoad && Reg == MI->getOperand(0).getReg()) { + trackRegDefsUses(MI, ModifiedRegs, UsedRegs, TRI); + continue; + } + + // If the Rt of the second instruction was not modified or used between + // the two instructions, we can combine the second into the first. + if (!ModifiedRegs[MI->getOperand(0).getReg()] && + !UsedRegs[MI->getOperand(0).getReg()]) { + mergeForward = false; + return MBBI; + } + + // Likewise, if the Rt of the first instruction is not modified or used + // between the two instructions, we can combine the first into the + // second. + if (!ModifiedRegs[FirstMI->getOperand(0).getReg()] && + !UsedRegs[FirstMI->getOperand(0).getReg()]) { + mergeForward = true; + return MBBI; + } + // Unable to combine these instructions due to interference in between. + // Keep looking. 
+ } + } + + // If the instruction wasn't a matching load or store, but does (or can) + // modify memory, stop searching, as we don't have alias analysis or + // anything like that to tell us whether the access is tromping on the + // locations we care about. The big one we want to catch is calls. + // + // FIXME: Theoretically, we can do better than that for SP and FP based + // references since we can effectively know where those are touching. It's + // unclear if it's worth the extra code, though. Most paired instructions + // will be sequential, perhaps with a few intervening non-memory related + // instructions. + if (MI->mayStore() || MI->isCall()) + return E; + // Likewise, if we're matching a store instruction, we don't want to + // move across a load, as it may be reading the same location. + if (FirstMI->mayStore() && MI->mayLoad()) + return E; + + // Update modified / uses register lists. + trackRegDefsUses(MI, ModifiedRegs, UsedRegs, TRI); + + // Otherwise, if the base register is modified, we have no match, so + // return early. + if (ModifiedRegs[BaseReg]) + return E; + } + return E; +} + +MachineBasicBlock::iterator +AArch64LoadStoreOpt::mergePreIdxUpdateInsn(MachineBasicBlock::iterator I, + MachineBasicBlock::iterator Update) { + assert((Update->getOpcode() == AArch64::ADDXri || + Update->getOpcode() == AArch64::SUBXri) && + "Unexpected base register update instruction to merge!"); + MachineBasicBlock::iterator NextI = I; + // Return the instruction following the merged instruction, which is + // the instruction following our unmerged load. Unless that's the add/sub + // instruction we're merging, in which case it's the one after that. + if (++NextI == Update) + ++NextI; + + int Value = Update->getOperand(2).getImm(); + assert(AArch64_AM::getShiftValue(Update->getOperand(3).getImm()) == 0 && + "Can't merge 1 << 12 offset into pre-indexed load / store"); + if (Update->getOpcode() == AArch64::SUBXri) + Value = -Value; + + unsigned NewOpc = getPreIndexedOpcode(I->getOpcode()); + MachineInstrBuilder MIB = + BuildMI(*I->getParent(), I, I->getDebugLoc(), TII->get(NewOpc)) + .addOperand(Update->getOperand(0)) + .addOperand(I->getOperand(0)) + .addOperand(I->getOperand(1)) + .addImm(Value); + (void)MIB; + + DEBUG(dbgs() << "Creating pre-indexed load/store."); + DEBUG(dbgs() << " Replacing instructions:\n "); + DEBUG(I->print(dbgs())); + DEBUG(dbgs() << " "); + DEBUG(Update->print(dbgs())); + DEBUG(dbgs() << " with instruction:\n "); + DEBUG(((MachineInstr *)MIB)->print(dbgs())); + DEBUG(dbgs() << "\n"); + + // Erase the old instructions for the block. + I->eraseFromParent(); + Update->eraseFromParent(); + + return NextI; +} + +MachineBasicBlock::iterator AArch64LoadStoreOpt::mergePostIdxUpdateInsn( + MachineBasicBlock::iterator I, MachineBasicBlock::iterator Update) { + assert((Update->getOpcode() == AArch64::ADDXri || + Update->getOpcode() == AArch64::SUBXri) && + "Unexpected base register update instruction to merge!"); + MachineBasicBlock::iterator NextI = I; + // Return the instruction following the merged instruction, which is + // the instruction following our unmerged load. Unless that's the add/sub + // instruction we're merging, in which case it's the one after that. 
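+  // Sketch of the rewrite this routine performs (cf. the example in
+  // optimizeBlock):
+  //   ldr x0, [x2]
+  //   add x2, x2, #4
+  // becomes the post-indexed form
+  //   ldr x0, [x2], #4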
+ if (++NextI == Update) + ++NextI; + + int Value = Update->getOperand(2).getImm(); + assert(AArch64_AM::getShiftValue(Update->getOperand(3).getImm()) == 0 && + "Can't merge 1 << 12 offset into post-indexed load / store"); + if (Update->getOpcode() == AArch64::SUBXri) + Value = -Value; + + unsigned NewOpc = getPostIndexedOpcode(I->getOpcode()); + MachineInstrBuilder MIB = + BuildMI(*I->getParent(), I, I->getDebugLoc(), TII->get(NewOpc)) + .addOperand(Update->getOperand(0)) + .addOperand(I->getOperand(0)) + .addOperand(I->getOperand(1)) + .addImm(Value); + (void)MIB; + + DEBUG(dbgs() << "Creating post-indexed load/store."); + DEBUG(dbgs() << " Replacing instructions:\n "); + DEBUG(I->print(dbgs())); + DEBUG(dbgs() << " "); + DEBUG(Update->print(dbgs())); + DEBUG(dbgs() << " with instruction:\n "); + DEBUG(((MachineInstr *)MIB)->print(dbgs())); + DEBUG(dbgs() << "\n"); + + // Erase the old instructions for the block. + I->eraseFromParent(); + Update->eraseFromParent(); + + return NextI; +} + +static bool isMatchingUpdateInsn(MachineInstr *MI, unsigned BaseReg, + int Offset) { + switch (MI->getOpcode()) { + default: + break; + case AArch64::SUBXri: + // Negate the offset for a SUB instruction. + Offset *= -1; + // FALLTHROUGH + case AArch64::ADDXri: + // Make sure it's a vanilla immediate operand, not a relocation or + // anything else we can't handle. + if (!MI->getOperand(2).isImm()) + break; + // Watch out for 1 << 12 shifted value. + if (AArch64_AM::getShiftValue(MI->getOperand(3).getImm())) + break; + // If the instruction has the base register as source and dest and the + // immediate will fit in a signed 9-bit integer, then we have a match. + if (MI->getOperand(0).getReg() == BaseReg && + MI->getOperand(1).getReg() == BaseReg && + MI->getOperand(2).getImm() <= 255 && + MI->getOperand(2).getImm() >= -256) { + // If we have a non-zero Offset, we check that it matches the amount + // we're adding to the register. + if (!Offset || Offset == MI->getOperand(2).getImm()) + return true; + } + break; + } + return false; +} + +MachineBasicBlock::iterator AArch64LoadStoreOpt::findMatchingUpdateInsnForward( + MachineBasicBlock::iterator I, unsigned Limit, int Value) { + MachineBasicBlock::iterator E = I->getParent()->end(); + MachineInstr *MemMI = I; + MachineBasicBlock::iterator MBBI = I; + const MachineFunction &MF = *MemMI->getParent()->getParent(); + + unsigned DestReg = MemMI->getOperand(0).getReg(); + unsigned BaseReg = MemMI->getOperand(1).getReg(); + int Offset = MemMI->getOperand(2).getImm() * + TII->getRegClass(MemMI->getDesc(), 0, TRI, MF)->getSize(); + + // If the base register overlaps the destination register, we can't + // merge the update. + if (DestReg == BaseReg || TRI->isSubRegister(BaseReg, DestReg)) + return E; + + // Scan forward looking for post-index opportunities. + // Updating instructions can't be formed if the memory insn already + // has an offset other than the value we're looking for. + if (Offset != Value) + return E; + + // Track which registers have been modified and used between the first insn + // (inclusive) and the second insn. + BitVector ModifiedRegs, UsedRegs; + ModifiedRegs.resize(TRI->getNumRegs()); + UsedRegs.resize(TRI->getNumRegs()); + ++MBBI; + for (unsigned Count = 0; MBBI != E; ++MBBI) { + MachineInstr *MI = MBBI; + // Skip DBG_VALUE instructions. Otherwise debug info can affect the + // optimization by changing how far we scan. + if (MI->isDebugValue()) + continue; + + // Now that we know this is a real instruction, count it. 
+ ++Count; + + // If we found a match, return it. + if (isMatchingUpdateInsn(MI, BaseReg, Value)) + return MBBI; + + // Update the status of what the instruction clobbered and used. + trackRegDefsUses(MI, ModifiedRegs, UsedRegs, TRI); + + // Otherwise, if the base register is used or modified, we have no match, so + // return early. + if (ModifiedRegs[BaseReg] || UsedRegs[BaseReg]) + return E; + } + return E; +} + +MachineBasicBlock::iterator AArch64LoadStoreOpt::findMatchingUpdateInsnBackward( + MachineBasicBlock::iterator I, unsigned Limit) { + MachineBasicBlock::iterator B = I->getParent()->begin(); + MachineBasicBlock::iterator E = I->getParent()->end(); + MachineInstr *MemMI = I; + MachineBasicBlock::iterator MBBI = I; + const MachineFunction &MF = *MemMI->getParent()->getParent(); + + unsigned DestReg = MemMI->getOperand(0).getReg(); + unsigned BaseReg = MemMI->getOperand(1).getReg(); + int Offset = MemMI->getOperand(2).getImm(); + unsigned RegSize = TII->getRegClass(MemMI->getDesc(), 0, TRI, MF)->getSize(); + + // If the load/store is the first instruction in the block, there's obviously + // not any matching update. Ditto if the memory offset isn't zero. + if (MBBI == B || Offset != 0) + return E; + // If the base register overlaps the destination register, we can't + // merge the update. + if (DestReg == BaseReg || TRI->isSubRegister(BaseReg, DestReg)) + return E; + + // Track which registers have been modified and used between the first insn + // (inclusive) and the second insn. + BitVector ModifiedRegs, UsedRegs; + ModifiedRegs.resize(TRI->getNumRegs()); + UsedRegs.resize(TRI->getNumRegs()); + --MBBI; + for (unsigned Count = 0; MBBI != B; --MBBI) { + MachineInstr *MI = MBBI; + // Skip DBG_VALUE instructions. Otherwise debug info can affect the + // optimization by changing how far we scan. + if (MI->isDebugValue()) + continue; + + // Now that we know this is a real instruction, count it. + ++Count; + + // If we found a match, return it. + if (isMatchingUpdateInsn(MI, BaseReg, RegSize)) + return MBBI; + + // Update the status of what the instruction clobbered and used. + trackRegDefsUses(MI, ModifiedRegs, UsedRegs, TRI); + + // Otherwise, if the base register is used or modified, we have no match, so + // return early. + if (ModifiedRegs[BaseReg] || UsedRegs[BaseReg]) + return E; + } + return E; +} + +bool AArch64LoadStoreOpt::optimizeBlock(MachineBasicBlock &MBB) { + bool Modified = false; + // Two tranformations to do here: + // 1) Find loads and stores that can be merged into a single load or store + // pair instruction. + // e.g., + // ldr x0, [x2] + // ldr x1, [x2, #8] + // ; becomes + // ldp x0, x1, [x2] + // 2) Find base register updates that can be merged into the load or store + // as a base-reg writeback. + // e.g., + // ldr x0, [x2] + // add x2, x2, #4 + // ; becomes + // ldr x0, [x2], #4 + + for (MachineBasicBlock::iterator MBBI = MBB.begin(), E = MBB.end(); + MBBI != E;) { + MachineInstr *MI = MBBI; + switch (MI->getOpcode()) { + default: + // Just move on to the next instruction. 
+ ++MBBI; + break; + case AArch64::STRSui: + case AArch64::STRDui: + case AArch64::STRQui: + case AArch64::STRXui: + case AArch64::STRWui: + case AArch64::LDRSui: + case AArch64::LDRDui: + case AArch64::LDRQui: + case AArch64::LDRXui: + case AArch64::LDRWui: + // do the unscaled versions as well + case AArch64::STURSi: + case AArch64::STURDi: + case AArch64::STURQi: + case AArch64::STURWi: + case AArch64::STURXi: + case AArch64::LDURSi: + case AArch64::LDURDi: + case AArch64::LDURQi: + case AArch64::LDURWi: + case AArch64::LDURXi: { + // If this is a volatile load/store, don't mess with it. + if (MI->hasOrderedMemoryRef()) { + ++MBBI; + break; + } + // Make sure this is a reg+imm (as opposed to an address reloc). + if (!MI->getOperand(2).isImm()) { + ++MBBI; + break; + } + // Check if this load/store has a hint to avoid pair formation. + // MachineMemOperands hints are set by the AArch64StorePairSuppress pass. + if (TII->isLdStPairSuppressed(MI)) { + ++MBBI; + break; + } + // Look ahead up to ScanLimit instructions for a pairable instruction. + bool mergeForward = false; + MachineBasicBlock::iterator Paired = + findMatchingInsn(MBBI, mergeForward, ScanLimit); + if (Paired != E) { + // Merge the loads into a pair. Keeping the iterator straight is a + // pain, so we let the merge routine tell us what the next instruction + // is after it's done mucking about. + MBBI = mergePairedInsns(MBBI, Paired, mergeForward); + + Modified = true; + ++NumPairCreated; + if (isUnscaledLdst(MI->getOpcode())) + ++NumUnscaledPairCreated; + break; + } + ++MBBI; + break; + } + // FIXME: Do the other instructions. + } + } + + for (MachineBasicBlock::iterator MBBI = MBB.begin(), E = MBB.end(); + MBBI != E;) { + MachineInstr *MI = MBBI; + // Do update merging. It's simpler to keep this separate from the above + // switch, though not strictly necessary. + int Opc = MI->getOpcode(); + switch (Opc) { + default: + // Just move on to the next instruction. + ++MBBI; + break; + case AArch64::STRSui: + case AArch64::STRDui: + case AArch64::STRQui: + case AArch64::STRXui: + case AArch64::STRWui: + case AArch64::LDRSui: + case AArch64::LDRDui: + case AArch64::LDRQui: + case AArch64::LDRXui: + case AArch64::LDRWui: + // do the unscaled versions as well + case AArch64::STURSi: + case AArch64::STURDi: + case AArch64::STURQi: + case AArch64::STURWi: + case AArch64::STURXi: + case AArch64::LDURSi: + case AArch64::LDURDi: + case AArch64::LDURQi: + case AArch64::LDURWi: + case AArch64::LDURXi: { + // Make sure this is a reg+imm (as opposed to an address reloc). + if (!MI->getOperand(2).isImm()) { + ++MBBI; + break; + } + // Look ahead up to ScanLimit instructions for a mergable instruction. + MachineBasicBlock::iterator Update = + findMatchingUpdateInsnForward(MBBI, ScanLimit, 0); + if (Update != E) { + // Merge the update into the ld/st. + MBBI = mergePostIdxUpdateInsn(MBBI, Update); + Modified = true; + ++NumPostFolded; + break; + } + // Don't know how to handle pre/post-index versions, so move to the next + // instruction. + if (isUnscaledLdst(Opc)) { + ++MBBI; + break; + } + + // Look back to try to find a pre-index instruction. For example, + // add x0, x0, #8 + // ldr x1, [x0] + // merged into: + // ldr x1, [x0, #8]! + Update = findMatchingUpdateInsnBackward(MBBI, ScanLimit); + if (Update != E) { + // Merge the update into the ld/st. + MBBI = mergePreIdxUpdateInsn(MBBI, Update); + Modified = true; + ++NumPreFolded; + break; + } + + // Look forward to try to find a post-index instruction. 
For example,
+      // ldr x1, [x0, #64]
+      // add x0, x0, #64
+      // merged into:
+      // ldr x1, [x0, #64]!
+
+      // The immediate in the load/store is scaled by the size of the register
+      // being loaded. The immediate in the add we're looking for,
+      // however, is not, so adjust here.
+      int Value = MI->getOperand(2).getImm() *
+                  TII->getRegClass(MI->getDesc(), 0, TRI, *(MBB.getParent()))
+                      ->getSize();
+      Update = findMatchingUpdateInsnForward(MBBI, ScanLimit, Value);
+      if (Update != E) {
+        // Merge the update into the ld/st.
+        MBBI = mergePreIdxUpdateInsn(MBBI, Update);
+        Modified = true;
+        ++NumPreFolded;
+        break;
+      }
+
+      // Nothing found. Just move to the next instruction.
+      ++MBBI;
+      break;
+    }
+    // FIXME: Do the other instructions.
+    }
+  }
+
+  return Modified;
+}
+
+bool AArch64LoadStoreOpt::runOnMachineFunction(MachineFunction &Fn) {
+  const TargetMachine &TM = Fn.getTarget();
+  TII = static_cast<const AArch64InstrInfo *>(TM.getInstrInfo());
+  TRI = TM.getRegisterInfo();
+
+  bool Modified = false;
+  for (auto &MBB : Fn)
+    Modified |= optimizeBlock(MBB);
+
+  return Modified;
+}
+
+// FIXME: Do we need/want a pre-alloc pass like ARM has to try to keep
+// loads and stores near one another?
+
+/// createAArch64LoadStoreOptimizationPass - returns an instance of the
+/// load / store optimization pass.
+FunctionPass *llvm::createAArch64LoadStoreOptimizationPass() {
+  return new AArch64LoadStoreOpt();
+}
diff --git a/lib/Target/AArch64/AArch64MCInstLower.cpp b/lib/Target/AArch64/AArch64MCInstLower.cpp
new file mode 100644
index 00000000000..ab6d37532a7
--- /dev/null
+++ b/lib/Target/AArch64/AArch64MCInstLower.cpp
@@ -0,0 +1,202 @@
+//==-- AArch64MCInstLower.cpp - Convert AArch64 MachineInstr to an MCInst --==//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains code to lower AArch64 MachineInstrs to their corresponding
+// MCInst records.
+//
+//===----------------------------------------------------------------------===//
+
+#include "AArch64MCInstLower.h"
+#include "MCTargetDesc/AArch64MCExpr.h"
+#include "Utils/AArch64BaseInfo.h"
+#include "llvm/CodeGen/AsmPrinter.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/IR/Mangler.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCInst.h"
+#include "llvm/Support/CodeGen.h"
+#include "llvm/Target/TargetMachine.h"
+using namespace llvm;
+
+AArch64MCInstLower::AArch64MCInstLower(MCContext &ctx, Mangler &mang,
+                                       AsmPrinter &printer)
+    : Ctx(ctx), Printer(printer), TargetTriple(printer.getTargetTriple()) {}
+
+MCSymbol *
+AArch64MCInstLower::GetGlobalAddressSymbol(const MachineOperand &MO) const {
+  return Printer.getSymbol(MO.getGlobal());
+}
+
+MCSymbol *
+AArch64MCInstLower::GetExternalSymbolSymbol(const MachineOperand &MO) const {
+  return Printer.GetExternalSymbolSymbol(MO.getSymbolName());
+}
+
+MCOperand AArch64MCInstLower::lowerSymbolOperandDarwin(const MachineOperand &MO,
+                                                       MCSymbol *Sym) const {
+  // FIXME: We would like an efficient form for this, so we don't have to do a
+  // lot of extra uniquing.
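+  // For illustration (a sketch): a Darwin GOT access such as
+  //   adrp x0, _var@GOTPAGE
+  //   ldr  x0, [x0, _var@GOTPAGEOFF]
+  // corresponds to MO_GOT|MO_PAGE -> VK_GOTPAGE and MO_GOT|MO_PAGEOFF ->
+  // VK_GOTPAGEOFF in the mapping below.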
+ MCSymbolRefExpr::VariantKind RefKind = MCSymbolRefExpr::VK_None; + if ((MO.getTargetFlags() & AArch64II::MO_GOT) != 0) { + if ((MO.getTargetFlags() & AArch64II::MO_FRAGMENT) == AArch64II::MO_PAGE) + RefKind = MCSymbolRefExpr::VK_GOTPAGE; + else if ((MO.getTargetFlags() & AArch64II::MO_FRAGMENT) == + AArch64II::MO_PAGEOFF) + RefKind = MCSymbolRefExpr::VK_GOTPAGEOFF; + else + assert(0 && "Unexpected target flags with MO_GOT on GV operand"); + } else if ((MO.getTargetFlags() & AArch64II::MO_TLS) != 0) { + if ((MO.getTargetFlags() & AArch64II::MO_FRAGMENT) == AArch64II::MO_PAGE) + RefKind = MCSymbolRefExpr::VK_TLVPPAGE; + else if ((MO.getTargetFlags() & AArch64II::MO_FRAGMENT) == + AArch64II::MO_PAGEOFF) + RefKind = MCSymbolRefExpr::VK_TLVPPAGEOFF; + else + llvm_unreachable("Unexpected target flags with MO_TLS on GV operand"); + } else { + if ((MO.getTargetFlags() & AArch64II::MO_FRAGMENT) == AArch64II::MO_PAGE) + RefKind = MCSymbolRefExpr::VK_PAGE; + else if ((MO.getTargetFlags() & AArch64II::MO_FRAGMENT) == + AArch64II::MO_PAGEOFF) + RefKind = MCSymbolRefExpr::VK_PAGEOFF; + } + const MCExpr *Expr = MCSymbolRefExpr::Create(Sym, RefKind, Ctx); + if (!MO.isJTI() && MO.getOffset()) + Expr = MCBinaryExpr::CreateAdd( + Expr, MCConstantExpr::Create(MO.getOffset(), Ctx), Ctx); + return MCOperand::CreateExpr(Expr); +} + +MCOperand AArch64MCInstLower::lowerSymbolOperandELF(const MachineOperand &MO, + MCSymbol *Sym) const { + uint32_t RefFlags = 0; + + if (MO.getTargetFlags() & AArch64II::MO_GOT) + RefFlags |= AArch64MCExpr::VK_GOT; + else if (MO.getTargetFlags() & AArch64II::MO_TLS) { + TLSModel::Model Model; + if (MO.isGlobal()) { + const GlobalValue *GV = MO.getGlobal(); + Model = Printer.TM.getTLSModel(GV); + } else { + assert(MO.isSymbol() && + StringRef(MO.getSymbolName()) == "_TLS_MODULE_BASE_" && + "unexpected external TLS symbol"); + Model = TLSModel::GeneralDynamic; + } + switch (Model) { + case TLSModel::InitialExec: + RefFlags |= AArch64MCExpr::VK_GOTTPREL; + break; + case TLSModel::LocalExec: + RefFlags |= AArch64MCExpr::VK_TPREL; + break; + case TLSModel::LocalDynamic: + RefFlags |= AArch64MCExpr::VK_DTPREL; + break; + case TLSModel::GeneralDynamic: + RefFlags |= AArch64MCExpr::VK_TLSDESC; + break; + } + } else { + // No modifier means this is a generic reference, classified as absolute for + // the cases where it matters (:abs_g0: etc). 
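+    // For example (a sketch): a large-code-model materialisation such as
+    //   movz x0, #:abs_g3:var
+    //   movk x0, #:abs_g2_nc:var
+    // combines this VK_ABS with the VK_G3 / VK_G2|VK_NC bits set below.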
+ RefFlags |= AArch64MCExpr::VK_ABS; + } + + if ((MO.getTargetFlags() & AArch64II::MO_FRAGMENT) == AArch64II::MO_PAGE) + RefFlags |= AArch64MCExpr::VK_PAGE; + else if ((MO.getTargetFlags() & AArch64II::MO_FRAGMENT) == + AArch64II::MO_PAGEOFF) + RefFlags |= AArch64MCExpr::VK_PAGEOFF; + else if ((MO.getTargetFlags() & AArch64II::MO_FRAGMENT) == AArch64II::MO_G3) + RefFlags |= AArch64MCExpr::VK_G3; + else if ((MO.getTargetFlags() & AArch64II::MO_FRAGMENT) == AArch64II::MO_G2) + RefFlags |= AArch64MCExpr::VK_G2; + else if ((MO.getTargetFlags() & AArch64II::MO_FRAGMENT) == AArch64II::MO_G1) + RefFlags |= AArch64MCExpr::VK_G1; + else if ((MO.getTargetFlags() & AArch64II::MO_FRAGMENT) == AArch64II::MO_G0) + RefFlags |= AArch64MCExpr::VK_G0; + + if (MO.getTargetFlags() & AArch64II::MO_NC) + RefFlags |= AArch64MCExpr::VK_NC; + + const MCExpr *Expr = + MCSymbolRefExpr::Create(Sym, MCSymbolRefExpr::VK_None, Ctx); + if (!MO.isJTI() && MO.getOffset()) + Expr = MCBinaryExpr::CreateAdd( + Expr, MCConstantExpr::Create(MO.getOffset(), Ctx), Ctx); + + AArch64MCExpr::VariantKind RefKind; + RefKind = static_cast(RefFlags); + Expr = AArch64MCExpr::Create(Expr, RefKind, Ctx); + + return MCOperand::CreateExpr(Expr); +} + +MCOperand AArch64MCInstLower::LowerSymbolOperand(const MachineOperand &MO, + MCSymbol *Sym) const { + if (TargetTriple.isOSDarwin()) + return lowerSymbolOperandDarwin(MO, Sym); + + assert(TargetTriple.isOSBinFormatELF() && "Expect Darwin or ELF target"); + return lowerSymbolOperandELF(MO, Sym); +} + +bool AArch64MCInstLower::lowerOperand(const MachineOperand &MO, + MCOperand &MCOp) const { + switch (MO.getType()) { + default: + assert(0 && "unknown operand type"); + case MachineOperand::MO_Register: + // Ignore all implicit register operands. + if (MO.isImplicit()) + return false; + MCOp = MCOperand::CreateReg(MO.getReg()); + break; + case MachineOperand::MO_RegisterMask: + // Regmasks are like implicit defs. + return false; + case MachineOperand::MO_Immediate: + MCOp = MCOperand::CreateImm(MO.getImm()); + break; + case MachineOperand::MO_MachineBasicBlock: + MCOp = MCOperand::CreateExpr( + MCSymbolRefExpr::Create(MO.getMBB()->getSymbol(), Ctx)); + break; + case MachineOperand::MO_GlobalAddress: + MCOp = LowerSymbolOperand(MO, GetGlobalAddressSymbol(MO)); + break; + case MachineOperand::MO_ExternalSymbol: + MCOp = LowerSymbolOperand(MO, GetExternalSymbolSymbol(MO)); + break; + case MachineOperand::MO_JumpTableIndex: + MCOp = LowerSymbolOperand(MO, Printer.GetJTISymbol(MO.getIndex())); + break; + case MachineOperand::MO_ConstantPoolIndex: + MCOp = LowerSymbolOperand(MO, Printer.GetCPISymbol(MO.getIndex())); + break; + case MachineOperand::MO_BlockAddress: + MCOp = LowerSymbolOperand( + MO, Printer.GetBlockAddressSymbol(MO.getBlockAddress())); + break; + } + return true; +} + +void AArch64MCInstLower::Lower(const MachineInstr *MI, MCInst &OutMI) const { + OutMI.setOpcode(MI->getOpcode()); + + for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) { + MCOperand MCOp; + if (lowerOperand(MI->getOperand(i), MCOp)) + OutMI.addOperand(MCOp); + } +} diff --git a/lib/Target/AArch64/AArch64MCInstLower.h b/lib/Target/AArch64/AArch64MCInstLower.h new file mode 100644 index 00000000000..ba50ba9e2fe --- /dev/null +++ b/lib/Target/AArch64/AArch64MCInstLower.h @@ -0,0 +1,52 @@ +//===-- AArch64MCInstLower.h - Lower MachineInstr to MCInst ---------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. 
See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// + +#ifndef AArch64_MCINSTLOWER_H +#define AArch64_MCINSTLOWER_H + +#include "llvm/ADT/Triple.h" +#include "llvm/Support/Compiler.h" + +namespace llvm { +class AsmPrinter; +class MCAsmInfo; +class MCContext; +class MCInst; +class MCOperand; +class MCSymbol; +class MachineInstr; +class MachineModuleInfoMachO; +class MachineOperand; +class Mangler; + +/// AArch64MCInstLower - This class is used to lower an MachineInstr +/// into an MCInst. +class LLVM_LIBRARY_VISIBILITY AArch64MCInstLower { + MCContext &Ctx; + AsmPrinter &Printer; + Triple TargetTriple; + +public: + AArch64MCInstLower(MCContext &ctx, Mangler &mang, AsmPrinter &printer); + + bool lowerOperand(const MachineOperand &MO, MCOperand &MCOp) const; + void Lower(const MachineInstr *MI, MCInst &OutMI) const; + + MCOperand lowerSymbolOperandDarwin(const MachineOperand &MO, + MCSymbol *Sym) const; + MCOperand lowerSymbolOperandELF(const MachineOperand &MO, + MCSymbol *Sym) const; + MCOperand LowerSymbolOperand(const MachineOperand &MO, MCSymbol *Sym) const; + + MCSymbol *GetGlobalAddressSymbol(const MachineOperand &MO) const; + MCSymbol *GetExternalSymbolSymbol(const MachineOperand &MO) const; +}; +} + +#endif diff --git a/lib/Target/AArch64/AArch64MachineFunctionInfo.h b/lib/Target/AArch64/AArch64MachineFunctionInfo.h new file mode 100644 index 00000000000..7c257ba9116 --- /dev/null +++ b/lib/Target/AArch64/AArch64MachineFunctionInfo.h @@ -0,0 +1,163 @@ +//=- AArch64MachineFuctionInfo.h - AArch64 machine function info --*- C++ -*-=// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file declares AArch64-specific per-machine-function information. +// +//===----------------------------------------------------------------------===// + +#ifndef AArch64MACHINEFUNCTIONINFO_H +#define AArch64MACHINEFUNCTIONINFO_H + +#include "llvm/ADT/SmallPtrSet.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/CodeGen/MachineFunction.h" +#include "llvm/MC/MCLinkerOptimizationHint.h" + +namespace llvm { + +/// AArch64FunctionInfo - This class is derived from MachineFunctionInfo and +/// contains private AArch64-specific information for each MachineFunction. +class AArch64FunctionInfo : public MachineFunctionInfo { + + /// Number of bytes of arguments this function has on the stack. If the callee + /// is expected to restore the argument stack this should be a multiple of 16, + /// all usable during a tail call. + /// + /// The alternative would forbid tail call optimisation in some cases: if we + /// want to transfer control from a function with 8-bytes of stack-argument + /// space to a function with 16-bytes then misalignment of this value would + /// make a stack adjustment necessary, which could not be undone by the + /// callee. + unsigned BytesInStackArgArea; + + /// The number of bytes to restore to deallocate space for incoming + /// arguments. Canonically 0 in the C calling convention, but non-zero when + /// callee is expected to pop the args. + unsigned ArgumentStackToRestore; + + /// HasStackFrame - True if this function has a stack frame. Set by + /// processFunctionBeforeCalleeSavedScan(). + bool HasStackFrame; + + /// \brief Amount of stack frame size, not including callee-saved registers. 
+ unsigned LocalStackSize; + + /// \brief Number of TLS accesses using the special (combinable) + /// _TLS_MODULE_BASE_ symbol. + unsigned NumLocalDynamicTLSAccesses; + + /// \brief FrameIndex for start of varargs area for arguments passed on the + /// stack. + int VarArgsStackIndex; + + /// \brief FrameIndex for start of varargs area for arguments passed in + /// general purpose registers. + int VarArgsGPRIndex; + + /// \brief Size of the varargs area for arguments passed in general purpose + /// registers. + unsigned VarArgsGPRSize; + + /// \brief FrameIndex for start of varargs area for arguments passed in + /// floating-point registers. + int VarArgsFPRIndex; + + /// \brief Size of the varargs area for arguments passed in floating-point + /// registers. + unsigned VarArgsFPRSize; + +public: + AArch64FunctionInfo() + : BytesInStackArgArea(0), ArgumentStackToRestore(0), HasStackFrame(false), + NumLocalDynamicTLSAccesses(0), VarArgsStackIndex(0), VarArgsGPRIndex(0), + VarArgsGPRSize(0), VarArgsFPRIndex(0), VarArgsFPRSize(0) {} + + explicit AArch64FunctionInfo(MachineFunction &MF) + : BytesInStackArgArea(0), ArgumentStackToRestore(0), HasStackFrame(false), + NumLocalDynamicTLSAccesses(0), VarArgsStackIndex(0), VarArgsGPRIndex(0), + VarArgsGPRSize(0), VarArgsFPRIndex(0), VarArgsFPRSize(0) { + (void)MF; + } + + unsigned getBytesInStackArgArea() const { return BytesInStackArgArea; } + void setBytesInStackArgArea(unsigned bytes) { BytesInStackArgArea = bytes; } + + unsigned getArgumentStackToRestore() const { return ArgumentStackToRestore; } + void setArgumentStackToRestore(unsigned bytes) { + ArgumentStackToRestore = bytes; + } + + bool hasStackFrame() const { return HasStackFrame; } + void setHasStackFrame(bool s) { HasStackFrame = s; } + + void setLocalStackSize(unsigned Size) { LocalStackSize = Size; } + unsigned getLocalStackSize() const { return LocalStackSize; } + + void incNumLocalDynamicTLSAccesses() { ++NumLocalDynamicTLSAccesses; } + unsigned getNumLocalDynamicTLSAccesses() const { + return NumLocalDynamicTLSAccesses; + } + + int getVarArgsStackIndex() const { return VarArgsStackIndex; } + void setVarArgsStackIndex(int Index) { VarArgsStackIndex = Index; } + + int getVarArgsGPRIndex() const { return VarArgsGPRIndex; } + void setVarArgsGPRIndex(int Index) { VarArgsGPRIndex = Index; } + + unsigned getVarArgsGPRSize() const { return VarArgsGPRSize; } + void setVarArgsGPRSize(unsigned Size) { VarArgsGPRSize = Size; } + + int getVarArgsFPRIndex() const { return VarArgsFPRIndex; } + void setVarArgsFPRIndex(int Index) { VarArgsFPRIndex = Index; } + + unsigned getVarArgsFPRSize() const { return VarArgsFPRSize; } + void setVarArgsFPRSize(unsigned Size) { VarArgsFPRSize = Size; } + + typedef SmallPtrSet SetOfInstructions; + + const SetOfInstructions &getLOHRelated() const { return LOHRelated; } + + // Shortcuts for LOH related types. + class MILOHDirective { + MCLOHType Kind; + + /// Arguments of this directive. Order matters. + SmallVector Args; + + public: + typedef SmallVectorImpl LOHArgs; + + MILOHDirective(MCLOHType Kind, const LOHArgs &Args) + : Kind(Kind), Args(Args.begin(), Args.end()) { + assert(isValidMCLOHType(Kind) && "Invalid LOH directive type!"); + } + + MCLOHType getKind() const { return Kind; } + const LOHArgs &getArgs() const { return Args; } + }; + + typedef MILOHDirective::LOHArgs MILOHArgs; + typedef SmallVector MILOHContainer; + + const MILOHContainer &getLOHContainer() const { return LOHContainerSet; } + + /// Add a LOH directive of this @p Kind and this @p Args. 
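///
/// Illustrative usage (editorial sketch, not from this commit; `Adrp`, `Add`
/// and `FuncInfo` are hypothetical names for the ADRP/ADD pair being recorded
/// and for the function's AArch64FunctionInfo):
/// \code
///   SmallVector<const MachineInstr *, 2> Args;
///   Args.push_back(Adrp);
///   Args.push_back(Add);
///   FuncInfo->addLOHDirective(MCLOH_AdrpAdd, Args);
/// \endcode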
+ void addLOHDirective(MCLOHType Kind, const MILOHArgs &Args) { + LOHContainerSet.push_back(MILOHDirective(Kind, Args)); + LOHRelated.insert(Args.begin(), Args.end()); + } + +private: + // Hold the lists of LOHs. + MILOHContainer LOHContainerSet; + SetOfInstructions LOHRelated; +}; +} // End llvm namespace + +#endif // AArch64MACHINEFUNCTIONINFO_H diff --git a/lib/Target/AArch64/AArch64PerfectShuffle.h b/lib/Target/AArch64/AArch64PerfectShuffle.h new file mode 100644 index 00000000000..b22fa2424d5 --- /dev/null +++ b/lib/Target/AArch64/AArch64PerfectShuffle.h @@ -0,0 +1,6586 @@ +//===-- AArch64PerfectShuffle.h - AdvSIMD Perfect Shuffle Table -----------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file, which was autogenerated by llvm-PerfectShuffle, contains data +// for the optimal way to build a perfect shuffle using AdvSIMD instructions. +// +//===----------------------------------------------------------------------===// + +// 31 entries have cost 0 +// 242 entries have cost 1 +// 1447 entries have cost 2 +// 3602 entries have cost 3 +// 1237 entries have cost 4 +// 2 entries have cost 5 + +// This table is 6561*4 = 26244 bytes in size. +static const unsigned PerfectShuffleTable[6561+1] = { + 135053414U, // <0,0,0,0>: Cost 1 vdup0 LHS + 1543503974U, // <0,0,0,1>: Cost 2 vext2 <0,0,0,0>, LHS + 2618572962U, // <0,0,0,2>: Cost 3 vext2 <0,2,0,0>, <0,2,0,0> + 2568054923U, // <0,0,0,3>: Cost 3 vext1 <3,0,0,0>, <3,0,0,0> + 1476398390U, // <0,0,0,4>: Cost 2 vext1 <0,0,0,0>, RHS + 2550140624U, // <0,0,0,5>: Cost 3 vext1 <0,0,0,0>, <5,1,7,3> + 2550141434U, // <0,0,0,6>: Cost 3 vext1 <0,0,0,0>, <6,2,7,3> + 2591945711U, // <0,0,0,7>: Cost 3 vext1 <7,0,0,0>, <7,0,0,0> + 135053414U, // <0,0,0,u>: Cost 1 vdup0 LHS + 2886516736U, // <0,0,1,0>: Cost 3 vzipl LHS, <0,0,0,0> + 1812775014U, // <0,0,1,1>: Cost 2 vzipl LHS, LHS + 1618133094U, // <0,0,1,2>: Cost 2 vext3 <1,2,3,0>, LHS + 2625209292U, // <0,0,1,3>: Cost 3 vext2 <1,3,0,0>, <1,3,0,0> + 2886558034U, // <0,0,1,4>: Cost 3 vzipl LHS, <0,4,1,5> + 2617246864U, // <0,0,1,5>: Cost 3 vext2 <0,0,0,0>, <1,5,3,7> + 3659723031U, // <0,0,1,6>: Cost 4 vext1 <6,0,0,1>, <6,0,0,1> + 2591953904U, // <0,0,1,7>: Cost 3 vext1 <7,0,0,1>, <7,0,0,1> + 1812775581U, // <0,0,1,u>: Cost 2 vzipl LHS, LHS + 3020734464U, // <0,0,2,0>: Cost 3 vtrnl LHS, <0,0,0,0> + 3020734474U, // <0,0,2,1>: Cost 3 vtrnl LHS, <0,0,1,1> + 1946992742U, // <0,0,2,2>: Cost 2 vtrnl LHS, LHS + 2631181989U, // <0,0,2,3>: Cost 3 vext2 <2,3,0,0>, <2,3,0,0> + 3020734668U, // <0,0,2,4>: Cost 3 vtrnl LHS, <0,2,4,6> + 3826550569U, // <0,0,2,5>: Cost 4 vuzpl <0,2,0,2>, <2,4,5,6> + 2617247674U, // <0,0,2,6>: Cost 3 vext2 <0,0,0,0>, <2,6,3,7> + 2591962097U, // <0,0,2,7>: Cost 3 vext1 <7,0,0,2>, <7,0,0,2> + 1946992796U, // <0,0,2,u>: Cost 2 vtrnl LHS, LHS + 2635163787U, // <0,0,3,0>: Cost 3 vext2 <3,0,0,0>, <3,0,0,0> + 2686419196U, // <0,0,3,1>: Cost 3 vext3 <0,3,1,0>, <0,3,1,0> + 2686492933U, // <0,0,3,2>: Cost 3 vext3 <0,3,2,0>, <0,3,2,0> + 2617248156U, // <0,0,3,3>: Cost 3 vext2 <0,0,0,0>, <3,3,3,3> + 2617248258U, // <0,0,3,4>: Cost 3 vext2 <0,0,0,0>, <3,4,5,6> + 3826551298U, // <0,0,3,5>: Cost 4 vuzpl <0,2,0,2>, <3,4,5,6> + 3690990200U, // <0,0,3,6>: Cost 4 vext2 <0,0,0,0>, <3,6,0,7> + 3713551042U, // <0,0,3,7>: Cost 4 vext2 <3,7,0,0>, <3,7,0,0> + 2635163787U, // <0,0,3,u>: Cost 3 vext2 
<3,0,0,0>, <3,0,0,0> + 2617248658U, // <0,0,4,0>: Cost 3 vext2 <0,0,0,0>, <4,0,5,1> + 2888450150U, // <0,0,4,1>: Cost 3 vzipl <0,4,1,5>, LHS + 3021570150U, // <0,0,4,2>: Cost 3 vtrnl <0,2,4,6>, LHS + 3641829519U, // <0,0,4,3>: Cost 4 vext1 <3,0,0,4>, <3,0,0,4> + 3021570252U, // <0,0,4,4>: Cost 3 vtrnl <0,2,4,6>, <0,2,4,6> + 1543507254U, // <0,0,4,5>: Cost 2 vext2 <0,0,0,0>, RHS + 2752810294U, // <0,0,4,6>: Cost 3 vuzpl <0,2,0,2>, RHS + 3786998152U, // <0,0,4,7>: Cost 4 vext3 <4,7,5,0>, <0,4,7,5> + 1543507497U, // <0,0,4,u>: Cost 2 vext2 <0,0,0,0>, RHS + 2684354972U, // <0,0,5,0>: Cost 3 vext3 <0,0,0,0>, <0,5,0,7> + 2617249488U, // <0,0,5,1>: Cost 3 vext2 <0,0,0,0>, <5,1,7,3> + 3765617070U, // <0,0,5,2>: Cost 4 vext3 <1,2,3,0>, <0,5,2,7> + 3635865780U, // <0,0,5,3>: Cost 4 vext1 <2,0,0,5>, <3,0,4,5> + 2617249734U, // <0,0,5,4>: Cost 3 vext2 <0,0,0,0>, <5,4,7,6> + 2617249796U, // <0,0,5,5>: Cost 3 vext2 <0,0,0,0>, <5,5,5,5> + 2718712274U, // <0,0,5,6>: Cost 3 vext3 <5,6,7,0>, <0,5,6,7> + 2617249960U, // <0,0,5,7>: Cost 3 vext2 <0,0,0,0>, <5,7,5,7> + 2720039396U, // <0,0,5,u>: Cost 3 vext3 <5,u,7,0>, <0,5,u,7> + 2684355053U, // <0,0,6,0>: Cost 3 vext3 <0,0,0,0>, <0,6,0,7> + 3963609190U, // <0,0,6,1>: Cost 4 vzipl <0,6,2,7>, LHS + 2617250298U, // <0,0,6,2>: Cost 3 vext2 <0,0,0,0>, <6,2,7,3> + 3796435464U, // <0,0,6,3>: Cost 4 vext3 <6,3,7,0>, <0,6,3,7> + 3659762998U, // <0,0,6,4>: Cost 4 vext1 <6,0,0,6>, RHS + 3659763810U, // <0,0,6,5>: Cost 4 vext1 <6,0,0,6>, <5,6,7,0> + 2617250616U, // <0,0,6,6>: Cost 3 vext2 <0,0,0,0>, <6,6,6,6> + 2657727309U, // <0,0,6,7>: Cost 3 vext2 <6,7,0,0>, <6,7,0,0> + 2658390942U, // <0,0,6,u>: Cost 3 vext2 <6,u,0,0>, <6,u,0,0> + 2659054575U, // <0,0,7,0>: Cost 3 vext2 <7,0,0,0>, <7,0,0,0> + 3635880854U, // <0,0,7,1>: Cost 4 vext1 <2,0,0,7>, <1,2,3,0> + 3635881401U, // <0,0,7,2>: Cost 4 vext1 <2,0,0,7>, <2,0,0,7> + 3734787298U, // <0,0,7,3>: Cost 4 vext2 <7,3,0,0>, <7,3,0,0> + 2617251174U, // <0,0,7,4>: Cost 3 vext2 <0,0,0,0>, <7,4,5,6> + 3659772002U, // <0,0,7,5>: Cost 4 vext1 <6,0,0,7>, <5,6,7,0> + 3659772189U, // <0,0,7,6>: Cost 4 vext1 <6,0,0,7>, <6,0,0,7> + 2617251436U, // <0,0,7,7>: Cost 3 vext2 <0,0,0,0>, <7,7,7,7> + 2659054575U, // <0,0,7,u>: Cost 3 vext2 <7,0,0,0>, <7,0,0,0> + 135053414U, // <0,0,u,0>: Cost 1 vdup0 LHS + 1817419878U, // <0,0,u,1>: Cost 2 vzipl LHS, LHS + 1947435110U, // <0,0,u,2>: Cost 2 vtrnl LHS, LHS + 2568120467U, // <0,0,u,3>: Cost 3 vext1 <3,0,0,u>, <3,0,0,u> + 1476463926U, // <0,0,u,4>: Cost 2 vext1 <0,0,0,u>, RHS + 1543510170U, // <0,0,u,5>: Cost 2 vext2 <0,0,0,0>, RHS + 2752813210U, // <0,0,u,6>: Cost 3 vuzpl <0,2,0,2>, RHS + 2592011255U, // <0,0,u,7>: Cost 3 vext1 <7,0,0,u>, <7,0,0,u> + 135053414U, // <0,0,u,u>: Cost 1 vdup0 LHS + 2618581002U, // <0,1,0,0>: Cost 3 vext2 <0,2,0,1>, <0,0,1,1> + 1557446758U, // <0,1,0,1>: Cost 2 vext2 <2,3,0,1>, LHS + 2618581155U, // <0,1,0,2>: Cost 3 vext2 <0,2,0,1>, <0,2,0,1> + 2690548468U, // <0,1,0,3>: Cost 3 vext3 <1,0,3,0>, <1,0,3,0> + 2626543954U, // <0,1,0,4>: Cost 3 vext2 <1,5,0,1>, <0,4,1,5> + 4094985216U, // <0,1,0,5>: Cost 4 vtrnl <0,2,0,2>, <1,3,5,7> + 2592019278U, // <0,1,0,6>: Cost 3 vext1 <7,0,1,0>, <6,7,0,1> + 2592019448U, // <0,1,0,7>: Cost 3 vext1 <7,0,1,0>, <7,0,1,0> + 1557447325U, // <0,1,0,u>: Cost 2 vext2 <2,3,0,1>, LHS + 1476476938U, // <0,1,1,0>: Cost 2 vext1 <0,0,1,1>, <0,0,1,1> + 2886517556U, // <0,1,1,1>: Cost 3 vzipl LHS, <1,1,1,1> + 2886517654U, // <0,1,1,2>: Cost 3 vzipl LHS, <1,2,3,0> + 2886517720U, // <0,1,1,3>: Cost 3 vzipl LHS, <1,3,1,3> + 1476480310U, // <0,1,1,4>: 
Cost 2 vext1 <0,0,1,1>, RHS + 2886558864U, // <0,1,1,5>: Cost 3 vzipl LHS, <1,5,3,7> + 2550223354U, // <0,1,1,6>: Cost 3 vext1 <0,0,1,1>, <6,2,7,3> + 2550223856U, // <0,1,1,7>: Cost 3 vext1 <0,0,1,1>, <7,0,0,1> + 1476482862U, // <0,1,1,u>: Cost 2 vext1 <0,0,1,1>, LHS + 1494401126U, // <0,1,2,0>: Cost 2 vext1 <3,0,1,2>, LHS + 3020735284U, // <0,1,2,1>: Cost 3 vtrnl LHS, <1,1,1,1> + 2562172349U, // <0,1,2,2>: Cost 3 vext1 <2,0,1,2>, <2,0,1,2> + 835584U, // <0,1,2,3>: Cost 0 copy LHS + 1494404406U, // <0,1,2,4>: Cost 2 vext1 <3,0,1,2>, RHS + 3020735488U, // <0,1,2,5>: Cost 3 vtrnl LHS, <1,3,5,7> + 2631190458U, // <0,1,2,6>: Cost 3 vext2 <2,3,0,1>, <2,6,3,7> + 1518294010U, // <0,1,2,7>: Cost 2 vext1 <7,0,1,2>, <7,0,1,2> + 835584U, // <0,1,2,u>: Cost 0 copy LHS + 2692318156U, // <0,1,3,0>: Cost 3 vext3 <1,3,0,0>, <1,3,0,0> + 2691875800U, // <0,1,3,1>: Cost 3 vext3 <1,2,3,0>, <1,3,1,3> + 2691875806U, // <0,1,3,2>: Cost 3 vext3 <1,2,3,0>, <1,3,2,0> + 2692539367U, // <0,1,3,3>: Cost 3 vext3 <1,3,3,0>, <1,3,3,0> + 2562182454U, // <0,1,3,4>: Cost 3 vext1 <2,0,1,3>, RHS + 2691875840U, // <0,1,3,5>: Cost 3 vext3 <1,2,3,0>, <1,3,5,7> + 2692760578U, // <0,1,3,6>: Cost 3 vext3 <1,3,6,0>, <1,3,6,0> + 2639817411U, // <0,1,3,7>: Cost 3 vext2 <3,7,0,1>, <3,7,0,1> + 2691875863U, // <0,1,3,u>: Cost 3 vext3 <1,2,3,0>, <1,3,u,3> + 2568159334U, // <0,1,4,0>: Cost 3 vext1 <3,0,1,4>, LHS + 4095312692U, // <0,1,4,1>: Cost 4 vtrnl <0,2,4,6>, <1,1,1,1> + 2568160934U, // <0,1,4,2>: Cost 3 vext1 <3,0,1,4>, <2,3,0,1> + 2568161432U, // <0,1,4,3>: Cost 3 vext1 <3,0,1,4>, <3,0,1,4> + 2568162614U, // <0,1,4,4>: Cost 3 vext1 <3,0,1,4>, RHS + 1557450038U, // <0,1,4,5>: Cost 2 vext2 <2,3,0,1>, RHS + 2754235702U, // <0,1,4,6>: Cost 3 vuzpl <0,4,1,5>, RHS + 2592052220U, // <0,1,4,7>: Cost 3 vext1 <7,0,1,4>, <7,0,1,4> + 1557450281U, // <0,1,4,u>: Cost 2 vext2 <2,3,0,1>, RHS + 3765617775U, // <0,1,5,0>: Cost 4 vext3 <1,2,3,0>, <1,5,0,1> + 2647781007U, // <0,1,5,1>: Cost 3 vext2 <5,1,0,1>, <5,1,0,1> + 3704934138U, // <0,1,5,2>: Cost 4 vext2 <2,3,0,1>, <5,2,3,0> + 2691875984U, // <0,1,5,3>: Cost 3 vext3 <1,2,3,0>, <1,5,3,7> + 2657734598U, // <0,1,5,4>: Cost 3 vext2 <6,7,0,1>, <5,4,7,6> + 2650435539U, // <0,1,5,5>: Cost 3 vext2 <5,5,0,1>, <5,5,0,1> + 2651099172U, // <0,1,5,6>: Cost 3 vext2 <5,6,0,1>, <5,6,0,1> + 2651762805U, // <0,1,5,7>: Cost 3 vext2 <5,7,0,1>, <5,7,0,1> + 2691876029U, // <0,1,5,u>: Cost 3 vext3 <1,2,3,0>, <1,5,u,7> + 2592063590U, // <0,1,6,0>: Cost 3 vext1 <7,0,1,6>, LHS + 3765617871U, // <0,1,6,1>: Cost 4 vext3 <1,2,3,0>, <1,6,1,7> + 2654417337U, // <0,1,6,2>: Cost 3 vext2 <6,2,0,1>, <6,2,0,1> + 3765617889U, // <0,1,6,3>: Cost 4 vext3 <1,2,3,0>, <1,6,3,7> + 2592066870U, // <0,1,6,4>: Cost 3 vext1 <7,0,1,6>, RHS + 3765617907U, // <0,1,6,5>: Cost 4 vext3 <1,2,3,0>, <1,6,5,7> + 2657071869U, // <0,1,6,6>: Cost 3 vext2 <6,6,0,1>, <6,6,0,1> + 1583993678U, // <0,1,6,7>: Cost 2 vext2 <6,7,0,1>, <6,7,0,1> + 1584657311U, // <0,1,6,u>: Cost 2 vext2 <6,u,0,1>, <6,u,0,1> + 2657735672U, // <0,1,7,0>: Cost 3 vext2 <6,7,0,1>, <7,0,1,0> + 2657735808U, // <0,1,7,1>: Cost 3 vext2 <6,7,0,1>, <7,1,7,1> + 2631193772U, // <0,1,7,2>: Cost 3 vext2 <2,3,0,1>, <7,2,3,0> + 2661053667U, // <0,1,7,3>: Cost 3 vext2 <7,3,0,1>, <7,3,0,1> + 2657736038U, // <0,1,7,4>: Cost 3 vext2 <6,7,0,1>, <7,4,5,6> + 3721524621U, // <0,1,7,5>: Cost 4 vext2 <5,1,0,1>, <7,5,1,0> + 2657736158U, // <0,1,7,6>: Cost 3 vext2 <6,7,0,1>, <7,6,1,0> + 2657736300U, // <0,1,7,7>: Cost 3 vext2 <6,7,0,1>, <7,7,7,7> + 2657736322U, // <0,1,7,u>: Cost 3 vext2 <6,7,0,1>, <7,u,1,2> 
+ 1494450278U, // <0,1,u,0>: Cost 2 vext1 <3,0,1,u>, LHS + 1557452590U, // <0,1,u,1>: Cost 2 vext2 <2,3,0,1>, LHS + 2754238254U, // <0,1,u,2>: Cost 3 vuzpl <0,4,1,5>, LHS + 835584U, // <0,1,u,3>: Cost 0 copy LHS + 1494453558U, // <0,1,u,4>: Cost 2 vext1 <3,0,1,u>, RHS + 1557452954U, // <0,1,u,5>: Cost 2 vext2 <2,3,0,1>, RHS + 2754238618U, // <0,1,u,6>: Cost 3 vuzpl <0,4,1,5>, RHS + 1518343168U, // <0,1,u,7>: Cost 2 vext1 <7,0,1,u>, <7,0,1,u> + 835584U, // <0,1,u,u>: Cost 0 copy LHS + 2752299008U, // <0,2,0,0>: Cost 3 vuzpl LHS, <0,0,0,0> + 1544847462U, // <0,2,0,1>: Cost 2 vext2 <0,2,0,2>, LHS + 1678557286U, // <0,2,0,2>: Cost 2 vuzpl LHS, LHS + 2696521165U, // <0,2,0,3>: Cost 3 vext3 <2,0,3,0>, <2,0,3,0> + 2752340172U, // <0,2,0,4>: Cost 3 vuzpl LHS, <0,2,4,6> + 2691876326U, // <0,2,0,5>: Cost 3 vext3 <1,2,3,0>, <2,0,5,7> + 2618589695U, // <0,2,0,6>: Cost 3 vext2 <0,2,0,2>, <0,6,2,7> + 2592093185U, // <0,2,0,7>: Cost 3 vext1 <7,0,2,0>, <7,0,2,0> + 1678557340U, // <0,2,0,u>: Cost 2 vuzpl LHS, LHS + 2618589942U, // <0,2,1,0>: Cost 3 vext2 <0,2,0,2>, <1,0,3,2> + 2752299828U, // <0,2,1,1>: Cost 3 vuzpl LHS, <1,1,1,1> + 2886518376U, // <0,2,1,2>: Cost 3 vzipl LHS, <2,2,2,2> + 2752299766U, // <0,2,1,3>: Cost 3 vuzpl LHS, <1,0,3,2> + 2550295862U, // <0,2,1,4>: Cost 3 vext1 <0,0,2,1>, RHS + 2752340992U, // <0,2,1,5>: Cost 3 vuzpl LHS, <1,3,5,7> + 2886559674U, // <0,2,1,6>: Cost 3 vzipl LHS, <2,6,3,7> + 3934208106U, // <0,2,1,7>: Cost 4 vuzpr <7,0,1,2>, <0,1,2,7> + 2752340771U, // <0,2,1,u>: Cost 3 vuzpl LHS, <1,0,u,2> + 1476558868U, // <0,2,2,0>: Cost 2 vext1 <0,0,2,2>, <0,0,2,2> + 2226628029U, // <0,2,2,1>: Cost 3 vrev <2,0,1,2> + 2752300648U, // <0,2,2,2>: Cost 3 vuzpl LHS, <2,2,2,2> + 3020736114U, // <0,2,2,3>: Cost 3 vtrnl LHS, <2,2,3,3> + 1476562230U, // <0,2,2,4>: Cost 2 vext1 <0,0,2,2>, RHS + 2550304464U, // <0,2,2,5>: Cost 3 vext1 <0,0,2,2>, <5,1,7,3> + 2618591162U, // <0,2,2,6>: Cost 3 vext2 <0,2,0,2>, <2,6,3,7> + 2550305777U, // <0,2,2,7>: Cost 3 vext1 <0,0,2,2>, <7,0,0,2> + 1476564782U, // <0,2,2,u>: Cost 2 vext1 <0,0,2,2>, LHS + 2618591382U, // <0,2,3,0>: Cost 3 vext2 <0,2,0,2>, <3,0,1,2> + 2752301206U, // <0,2,3,1>: Cost 3 vuzpl LHS, <3,0,1,2> + 3826043121U, // <0,2,3,2>: Cost 4 vuzpl LHS, <3,1,2,3> + 2752301468U, // <0,2,3,3>: Cost 3 vuzpl LHS, <3,3,3,3> + 2618591746U, // <0,2,3,4>: Cost 3 vext2 <0,2,0,2>, <3,4,5,6> + 2752301570U, // <0,2,3,5>: Cost 3 vuzpl LHS, <3,4,5,6> + 3830688102U, // <0,2,3,6>: Cost 4 vuzpl LHS, <3,2,6,3> + 2698807012U, // <0,2,3,7>: Cost 3 vext3 <2,3,7,0>, <2,3,7,0> + 2752301269U, // <0,2,3,u>: Cost 3 vuzpl LHS, <3,0,u,2> + 2562261094U, // <0,2,4,0>: Cost 3 vext1 <2,0,2,4>, LHS + 4095313828U, // <0,2,4,1>: Cost 4 vtrnl <0,2,4,6>, <2,6,1,3> + 2226718152U, // <0,2,4,2>: Cost 3 vrev <2,0,2,4> + 2568235169U, // <0,2,4,3>: Cost 3 vext1 <3,0,2,4>, <3,0,2,4> + 2562264374U, // <0,2,4,4>: Cost 3 vext1 <2,0,2,4>, RHS + 1544850742U, // <0,2,4,5>: Cost 2 vext2 <0,2,0,2>, RHS + 1678560566U, // <0,2,4,6>: Cost 2 vuzpl LHS, RHS + 2592125957U, // <0,2,4,7>: Cost 3 vext1 <7,0,2,4>, <7,0,2,4> + 1678560584U, // <0,2,4,u>: Cost 2 vuzpl LHS, RHS + 2691876686U, // <0,2,5,0>: Cost 3 vext3 <1,2,3,0>, <2,5,0,7> + 2618592976U, // <0,2,5,1>: Cost 3 vext2 <0,2,0,2>, <5,1,7,3> + 3765618528U, // <0,2,5,2>: Cost 4 vext3 <1,2,3,0>, <2,5,2,7> + 3765618536U, // <0,2,5,3>: Cost 4 vext3 <1,2,3,0>, <2,5,3,6> + 2618593222U, // <0,2,5,4>: Cost 3 vext2 <0,2,0,2>, <5,4,7,6> + 2752303108U, // <0,2,5,5>: Cost 3 vuzpl LHS, <5,5,5,5> + 2618593378U, // <0,2,5,6>: Cost 3 vext2 <0,2,0,2>, <5,6,7,0> + 
2824785206U, // <0,2,5,7>: Cost 3 vuzpr <1,0,3,2>, RHS + 2824785207U, // <0,2,5,u>: Cost 3 vuzpr <1,0,3,2>, RHS + 2752303950U, // <0,2,6,0>: Cost 3 vuzpl LHS, <6,7,0,1> + 3830690081U, // <0,2,6,1>: Cost 4 vuzpl LHS, <6,0,1,2> + 2618593786U, // <0,2,6,2>: Cost 3 vext2 <0,2,0,2>, <6,2,7,3> + 2691876794U, // <0,2,6,3>: Cost 3 vext3 <1,2,3,0>, <2,6,3,7> + 2752303990U, // <0,2,6,4>: Cost 3 vuzpl LHS, <6,7,4,5> + 3830690445U, // <0,2,6,5>: Cost 4 vuzpl LHS, <6,4,5,6> + 2752303928U, // <0,2,6,6>: Cost 3 vuzpl LHS, <6,6,6,6> + 2657743695U, // <0,2,6,7>: Cost 3 vext2 <6,7,0,2>, <6,7,0,2> + 2691876839U, // <0,2,6,u>: Cost 3 vext3 <1,2,3,0>, <2,6,u,7> + 2659070961U, // <0,2,7,0>: Cost 3 vext2 <7,0,0,2>, <7,0,0,2> + 2659734594U, // <0,2,7,1>: Cost 3 vext2 <7,1,0,2>, <7,1,0,2> + 3734140051U, // <0,2,7,2>: Cost 4 vext2 <7,2,0,2>, <7,2,0,2> + 2701166596U, // <0,2,7,3>: Cost 3 vext3 <2,7,3,0>, <2,7,3,0> + 2662389094U, // <0,2,7,4>: Cost 3 vext2 <7,5,0,2>, <7,4,5,6> + 2662389126U, // <0,2,7,5>: Cost 3 vext2 <7,5,0,2>, <7,5,0,2> + 3736794583U, // <0,2,7,6>: Cost 4 vext2 <7,6,0,2>, <7,6,0,2> + 2752304748U, // <0,2,7,7>: Cost 3 vuzpl LHS, <7,7,7,7> + 2659070961U, // <0,2,7,u>: Cost 3 vext2 <7,0,0,2>, <7,0,0,2> + 1476608026U, // <0,2,u,0>: Cost 2 vext1 <0,0,2,u>, <0,0,2,u> + 1544853294U, // <0,2,u,1>: Cost 2 vext2 <0,2,0,2>, LHS + 1678563118U, // <0,2,u,2>: Cost 2 vuzpl LHS, LHS + 3021178482U, // <0,2,u,3>: Cost 3 vtrnl LHS, <2,2,3,3> + 1476611382U, // <0,2,u,4>: Cost 2 vext1 <0,0,2,u>, RHS + 1544853658U, // <0,2,u,5>: Cost 2 vext2 <0,2,0,2>, RHS + 1678563482U, // <0,2,u,6>: Cost 2 vuzpl LHS, RHS + 2824785449U, // <0,2,u,7>: Cost 3 vuzpr <1,0,3,2>, RHS + 1678563172U, // <0,2,u,u>: Cost 2 vuzpl LHS, LHS + 2556329984U, // <0,3,0,0>: Cost 3 vext1 <1,0,3,0>, <0,0,0,0> + 2686421142U, // <0,3,0,1>: Cost 3 vext3 <0,3,1,0>, <3,0,1,2> + 2562303437U, // <0,3,0,2>: Cost 3 vext1 <2,0,3,0>, <2,0,3,0> + 4094986652U, // <0,3,0,3>: Cost 4 vtrnl <0,2,0,2>, <3,3,3,3> + 2556333366U, // <0,3,0,4>: Cost 3 vext1 <1,0,3,0>, RHS + 4094986754U, // <0,3,0,5>: Cost 4 vtrnl <0,2,0,2>, <3,4,5,6> + 3798796488U, // <0,3,0,6>: Cost 4 vext3 <6,7,3,0>, <3,0,6,7> + 3776530634U, // <0,3,0,7>: Cost 4 vext3 <3,0,7,0>, <3,0,7,0> + 2556335918U, // <0,3,0,u>: Cost 3 vext1 <1,0,3,0>, LHS + 2886518934U, // <0,3,1,0>: Cost 3 vzipl LHS, <3,0,1,2> + 2556338933U, // <0,3,1,1>: Cost 3 vext1 <1,0,3,1>, <1,0,3,1> + 2691877105U, // <0,3,1,2>: Cost 3 vext3 <1,2,3,0>, <3,1,2,3> + 2886519196U, // <0,3,1,3>: Cost 3 vzipl LHS, <3,3,3,3> + 2886519298U, // <0,3,1,4>: Cost 3 vzipl LHS, <3,4,5,6> + 4095740418U, // <0,3,1,5>: Cost 4 vtrnl <0,3,1,4>, <3,4,5,6> + 3659944242U, // <0,3,1,6>: Cost 4 vext1 <6,0,3,1>, <6,0,3,1> + 3769600286U, // <0,3,1,7>: Cost 4 vext3 <1,u,3,0>, <3,1,7,3> + 2886519582U, // <0,3,1,u>: Cost 3 vzipl LHS, <3,u,1,2> + 1482604646U, // <0,3,2,0>: Cost 2 vext1 <1,0,3,2>, LHS + 1482605302U, // <0,3,2,1>: Cost 2 vext1 <1,0,3,2>, <1,0,3,2> + 2556348008U, // <0,3,2,2>: Cost 3 vext1 <1,0,3,2>, <2,2,2,2> + 3020736924U, // <0,3,2,3>: Cost 3 vtrnl LHS, <3,3,3,3> + 1482607926U, // <0,3,2,4>: Cost 2 vext1 <1,0,3,2>, RHS + 3020737026U, // <0,3,2,5>: Cost 3 vtrnl LHS, <3,4,5,6> + 2598154746U, // <0,3,2,6>: Cost 3 vext1 , <6,2,7,3> + 2598155258U, // <0,3,2,7>: Cost 3 vext1 , <7,0,1,2> + 1482610478U, // <0,3,2,u>: Cost 2 vext1 <1,0,3,2>, LHS + 3692341398U, // <0,3,3,0>: Cost 4 vext2 <0,2,0,3>, <3,0,1,2> + 2635851999U, // <0,3,3,1>: Cost 3 vext2 <3,1,0,3>, <3,1,0,3> + 3636069840U, // <0,3,3,2>: Cost 4 vext1 <2,0,3,3>, <2,0,3,3> + 2691877276U, // <0,3,3,3>: Cost 3 
vext3 <1,2,3,0>, <3,3,3,3> + 3961522690U, // <0,3,3,4>: Cost 4 vzipl <0,3,1,4>, <3,4,5,6> + 3826797058U, // <0,3,3,5>: Cost 4 vuzpl <0,2,3,5>, <3,4,5,6> + 3703622282U, // <0,3,3,6>: Cost 4 vext2 <2,1,0,3>, <3,6,2,7> + 3769600452U, // <0,3,3,7>: Cost 4 vext3 <1,u,3,0>, <3,3,7,7> + 2640497430U, // <0,3,3,u>: Cost 3 vext2 <3,u,0,3>, <3,u,0,3> + 3962194070U, // <0,3,4,0>: Cost 4 vzipl <0,4,1,5>, <3,0,1,2> + 2232617112U, // <0,3,4,1>: Cost 3 vrev <3,0,1,4> + 2232690849U, // <0,3,4,2>: Cost 3 vrev <3,0,2,4> + 4095314332U, // <0,3,4,3>: Cost 4 vtrnl <0,2,4,6>, <3,3,3,3> + 3962194434U, // <0,3,4,4>: Cost 4 vzipl <0,4,1,5>, <3,4,5,6> + 2691877378U, // <0,3,4,5>: Cost 3 vext3 <1,2,3,0>, <3,4,5,6> + 3826765110U, // <0,3,4,6>: Cost 4 vuzpl <0,2,3,1>, RHS + 3665941518U, // <0,3,4,7>: Cost 4 vext1 <7,0,3,4>, <7,0,3,4> + 2691877405U, // <0,3,4,u>: Cost 3 vext3 <1,2,3,0>, <3,4,u,6> + 3630112870U, // <0,3,5,0>: Cost 4 vext1 <1,0,3,5>, LHS + 3630113526U, // <0,3,5,1>: Cost 4 vext1 <1,0,3,5>, <1,0,3,2> + 4035199734U, // <0,3,5,2>: Cost 4 vzipr <1,4,0,5>, <1,0,3,2> + 3769600578U, // <0,3,5,3>: Cost 4 vext3 <1,u,3,0>, <3,5,3,7> + 2232846516U, // <0,3,5,4>: Cost 3 vrev <3,0,4,5> + 3779037780U, // <0,3,5,5>: Cost 4 vext3 <3,4,5,0>, <3,5,5,7> + 2718714461U, // <0,3,5,6>: Cost 3 vext3 <5,6,7,0>, <3,5,6,7> + 2706106975U, // <0,3,5,7>: Cost 3 vext3 <3,5,7,0>, <3,5,7,0> + 2233141464U, // <0,3,5,u>: Cost 3 vrev <3,0,u,5> + 2691877496U, // <0,3,6,0>: Cost 3 vext3 <1,2,3,0>, <3,6,0,7> + 3727511914U, // <0,3,6,1>: Cost 4 vext2 <6,1,0,3>, <6,1,0,3> + 3765619338U, // <0,3,6,2>: Cost 4 vext3 <1,2,3,0>, <3,6,2,7> + 3765619347U, // <0,3,6,3>: Cost 4 vext3 <1,2,3,0>, <3,6,3,7> + 3765987996U, // <0,3,6,4>: Cost 4 vext3 <1,2,u,0>, <3,6,4,7> + 3306670270U, // <0,3,6,5>: Cost 4 vrev <3,0,5,6> + 3792456365U, // <0,3,6,6>: Cost 4 vext3 <5,6,7,0>, <3,6,6,6> + 2706770608U, // <0,3,6,7>: Cost 3 vext3 <3,6,7,0>, <3,6,7,0> + 2706844345U, // <0,3,6,u>: Cost 3 vext3 <3,6,u,0>, <3,6,u,0> + 3769600707U, // <0,3,7,0>: Cost 4 vext3 <1,u,3,0>, <3,7,0,1> + 2659742787U, // <0,3,7,1>: Cost 3 vext2 <7,1,0,3>, <7,1,0,3> + 3636102612U, // <0,3,7,2>: Cost 4 vext1 <2,0,3,7>, <2,0,3,7> + 3769600740U, // <0,3,7,3>: Cost 4 vext3 <1,u,3,0>, <3,7,3,7> + 3769600747U, // <0,3,7,4>: Cost 4 vext3 <1,u,3,0>, <3,7,4,5> + 3769600758U, // <0,3,7,5>: Cost 4 vext3 <1,u,3,0>, <3,7,5,7> + 3659993400U, // <0,3,7,6>: Cost 4 vext1 <6,0,3,7>, <6,0,3,7> + 3781176065U, // <0,3,7,7>: Cost 4 vext3 <3,7,7,0>, <3,7,7,0> + 2664388218U, // <0,3,7,u>: Cost 3 vext2 <7,u,0,3>, <7,u,0,3> + 1482653798U, // <0,3,u,0>: Cost 2 vext1 <1,0,3,u>, LHS + 1482654460U, // <0,3,u,1>: Cost 2 vext1 <1,0,3,u>, <1,0,3,u> + 2556397160U, // <0,3,u,2>: Cost 3 vext1 <1,0,3,u>, <2,2,2,2> + 3021179292U, // <0,3,u,3>: Cost 3 vtrnl LHS, <3,3,3,3> + 1482657078U, // <0,3,u,4>: Cost 2 vext1 <1,0,3,u>, RHS + 3021179394U, // <0,3,u,5>: Cost 3 vtrnl LHS, <3,4,5,6> + 2598203898U, // <0,3,u,6>: Cost 3 vext1 , <6,2,7,3> + 2708097874U, // <0,3,u,7>: Cost 3 vext3 <3,u,7,0>, <3,u,7,0> + 1482659630U, // <0,3,u,u>: Cost 2 vext1 <1,0,3,u>, LHS + 2617278468U, // <0,4,0,0>: Cost 3 vext2 <0,0,0,4>, <0,0,0,4> + 2618605670U, // <0,4,0,1>: Cost 3 vext2 <0,2,0,4>, LHS + 2618605734U, // <0,4,0,2>: Cost 3 vext2 <0,2,0,4>, <0,2,0,4> + 3642091695U, // <0,4,0,3>: Cost 4 vext1 <3,0,4,0>, <3,0,4,0> + 2753134796U, // <0,4,0,4>: Cost 3 vuzpl <0,2,4,6>, <0,2,4,6> + 2718714770U, // <0,4,0,5>: Cost 3 vext3 <5,6,7,0>, <4,0,5,1> + 3021245750U, // <0,4,0,6>: Cost 3 vtrnl <0,2,0,2>, RHS + 3665982483U, // <0,4,0,7>: Cost 4 vext1 <7,0,4,0>, 
<7,0,4,0> + 3021245768U, // <0,4,0,u>: Cost 3 vtrnl <0,2,0,2>, RHS + 2568355942U, // <0,4,1,0>: Cost 3 vext1 <3,0,4,1>, LHS + 3692348212U, // <0,4,1,1>: Cost 4 vext2 <0,2,0,4>, <1,1,1,1> + 3692348310U, // <0,4,1,2>: Cost 4 vext2 <0,2,0,4>, <1,2,3,0> + 2568358064U, // <0,4,1,3>: Cost 3 vext1 <3,0,4,1>, <3,0,4,1> + 2568359222U, // <0,4,1,4>: Cost 3 vext1 <3,0,4,1>, RHS + 1812778294U, // <0,4,1,5>: Cost 2 vzipl LHS, RHS + 3022671158U, // <0,4,1,6>: Cost 3 vtrnl <0,4,1,5>, RHS + 2592248852U, // <0,4,1,7>: Cost 3 vext1 <7,0,4,1>, <7,0,4,1> + 1812778537U, // <0,4,1,u>: Cost 2 vzipl LHS, RHS + 2568364134U, // <0,4,2,0>: Cost 3 vext1 <3,0,4,2>, LHS + 2238573423U, // <0,4,2,1>: Cost 3 vrev <4,0,1,2> + 3692349032U, // <0,4,2,2>: Cost 4 vext2 <0,2,0,4>, <2,2,2,2> + 2631214761U, // <0,4,2,3>: Cost 3 vext2 <2,3,0,4>, <2,3,0,4> + 2568367414U, // <0,4,2,4>: Cost 3 vext1 <3,0,4,2>, RHS + 2887028022U, // <0,4,2,5>: Cost 3 vzipl <0,2,0,2>, RHS + 1946996022U, // <0,4,2,6>: Cost 2 vtrnl LHS, RHS + 2592257045U, // <0,4,2,7>: Cost 3 vext1 <7,0,4,2>, <7,0,4,2> + 1946996040U, // <0,4,2,u>: Cost 2 vtrnl LHS, RHS + 3692349590U, // <0,4,3,0>: Cost 4 vext2 <0,2,0,4>, <3,0,1,2> + 3826878614U, // <0,4,3,1>: Cost 4 vuzpl <0,2,4,6>, <3,0,1,2> + 3826878625U, // <0,4,3,2>: Cost 4 vuzpl <0,2,4,6>, <3,0,2,4> + 3692349852U, // <0,4,3,3>: Cost 4 vext2 <0,2,0,4>, <3,3,3,3> + 3692349954U, // <0,4,3,4>: Cost 4 vext2 <0,2,0,4>, <3,4,5,6> + 3826878978U, // <0,4,3,5>: Cost 4 vuzpl <0,2,4,6>, <3,4,5,6> + 4095200566U, // <0,4,3,6>: Cost 4 vtrnl <0,2,3,1>, RHS + 3713583814U, // <0,4,3,7>: Cost 4 vext2 <3,7,0,4>, <3,7,0,4> + 3692350238U, // <0,4,3,u>: Cost 4 vext2 <0,2,0,4>, <3,u,1,2> + 2550464552U, // <0,4,4,0>: Cost 3 vext1 <0,0,4,4>, <0,0,4,4> + 3962194914U, // <0,4,4,1>: Cost 4 vzipl <0,4,1,5>, <4,1,5,0> + 3693677631U, // <0,4,4,2>: Cost 4 vext2 <0,4,0,4>, <4,2,6,3> + 3642124467U, // <0,4,4,3>: Cost 4 vext1 <3,0,4,4>, <3,0,4,4> + 2718715088U, // <0,4,4,4>: Cost 3 vext3 <5,6,7,0>, <4,4,4,4> + 2618608950U, // <0,4,4,5>: Cost 3 vext2 <0,2,0,4>, RHS + 2753137974U, // <0,4,4,6>: Cost 3 vuzpl <0,2,4,6>, RHS + 3666015255U, // <0,4,4,7>: Cost 4 vext1 <7,0,4,4>, <7,0,4,4> + 2618609193U, // <0,4,4,u>: Cost 3 vext2 <0,2,0,4>, RHS + 2568388710U, // <0,4,5,0>: Cost 3 vext1 <3,0,4,5>, LHS + 2568389526U, // <0,4,5,1>: Cost 3 vext1 <3,0,4,5>, <1,2,3,0> + 3636159963U, // <0,4,5,2>: Cost 4 vext1 <2,0,4,5>, <2,0,4,5> + 2568390836U, // <0,4,5,3>: Cost 3 vext1 <3,0,4,5>, <3,0,4,5> + 2568391990U, // <0,4,5,4>: Cost 3 vext1 <3,0,4,5>, RHS + 2718715180U, // <0,4,5,5>: Cost 3 vext3 <5,6,7,0>, <4,5,5,6> + 1618136374U, // <0,4,5,6>: Cost 2 vext3 <1,2,3,0>, RHS + 2592281624U, // <0,4,5,7>: Cost 3 vext1 <7,0,4,5>, <7,0,4,5> + 1618136392U, // <0,4,5,u>: Cost 2 vext3 <1,2,3,0>, RHS + 2550480938U, // <0,4,6,0>: Cost 3 vext1 <0,0,4,6>, <0,0,4,6> + 3826880801U, // <0,4,6,1>: Cost 4 vuzpl <0,2,4,6>, <6,0,1,2> + 2562426332U, // <0,4,6,2>: Cost 3 vext1 <2,0,4,6>, <2,0,4,6> + 3786190181U, // <0,4,6,3>: Cost 4 vext3 <4,6,3,0>, <4,6,3,0> + 2718715252U, // <0,4,6,4>: Cost 3 vext3 <5,6,7,0>, <4,6,4,6> + 3826881165U, // <0,4,6,5>: Cost 4 vuzpl <0,2,4,6>, <6,4,5,6> + 2712669568U, // <0,4,6,6>: Cost 3 vext3 <4,6,6,0>, <4,6,6,0> + 2657760081U, // <0,4,6,7>: Cost 3 vext2 <6,7,0,4>, <6,7,0,4> + 2718715284U, // <0,4,6,u>: Cost 3 vext3 <5,6,7,0>, <4,6,u,2> + 3654090854U, // <0,4,7,0>: Cost 4 vext1 <5,0,4,7>, LHS + 3934229326U, // <0,4,7,1>: Cost 4 vuzpr <7,0,1,4>, <6,7,0,1> + 3734156437U, // <0,4,7,2>: Cost 4 vext2 <7,2,0,4>, <7,2,0,4> + 3734820070U, // <0,4,7,3>: Cost 4 vext2 
<7,3,0,4>, <7,3,0,4> + 3654094134U, // <0,4,7,4>: Cost 4 vext1 <5,0,4,7>, RHS + 2713259464U, // <0,4,7,5>: Cost 3 vext3 <4,7,5,0>, <4,7,5,0> + 2713333201U, // <0,4,7,6>: Cost 3 vext3 <4,7,6,0>, <4,7,6,0> + 3654095866U, // <0,4,7,7>: Cost 4 vext1 <5,0,4,7>, <7,0,1,2> + 2713259464U, // <0,4,7,u>: Cost 3 vext3 <4,7,5,0>, <4,7,5,0> + 2568413286U, // <0,4,u,0>: Cost 3 vext1 <3,0,4,u>, LHS + 2618611502U, // <0,4,u,1>: Cost 3 vext2 <0,2,0,4>, LHS + 2753140526U, // <0,4,u,2>: Cost 3 vuzpl <0,2,4,6>, LHS + 2568415415U, // <0,4,u,3>: Cost 3 vext1 <3,0,4,u>, <3,0,4,u> + 2568416566U, // <0,4,u,4>: Cost 3 vext1 <3,0,4,u>, RHS + 1817423158U, // <0,4,u,5>: Cost 2 vzipl LHS, RHS + 1947438390U, // <0,4,u,6>: Cost 2 vtrnl LHS, RHS + 2592306203U, // <0,4,u,7>: Cost 3 vext1 <7,0,4,u>, <7,0,4,u> + 1947438408U, // <0,4,u,u>: Cost 2 vtrnl LHS, RHS + 3630219264U, // <0,5,0,0>: Cost 4 vext1 <1,0,5,0>, <0,0,0,0> + 2625912934U, // <0,5,0,1>: Cost 3 vext2 <1,4,0,5>, LHS + 3692355748U, // <0,5,0,2>: Cost 4 vext2 <0,2,0,5>, <0,2,0,2> + 3693019384U, // <0,5,0,3>: Cost 4 vext2 <0,3,0,5>, <0,3,0,5> + 3630222646U, // <0,5,0,4>: Cost 4 vext1 <1,0,5,0>, RHS + 3699655062U, // <0,5,0,5>: Cost 4 vext2 <1,4,0,5>, <0,5,0,1> + 2718715508U, // <0,5,0,6>: Cost 3 vext3 <5,6,7,0>, <5,0,6,1> + 3087011126U, // <0,5,0,7>: Cost 3 vtrnr <0,0,0,0>, RHS + 2625913501U, // <0,5,0,u>: Cost 3 vext2 <1,4,0,5>, LHS + 1500659814U, // <0,5,1,0>: Cost 2 vext1 <4,0,5,1>, LHS + 2886520528U, // <0,5,1,1>: Cost 3 vzipl LHS, <5,1,7,3> + 2574403176U, // <0,5,1,2>: Cost 3 vext1 <4,0,5,1>, <2,2,2,2> + 2574403734U, // <0,5,1,3>: Cost 3 vext1 <4,0,5,1>, <3,0,1,2> + 1500662674U, // <0,5,1,4>: Cost 2 vext1 <4,0,5,1>, <4,0,5,1> + 2886520836U, // <0,5,1,5>: Cost 3 vzipl LHS, <5,5,5,5> + 2886520930U, // <0,5,1,6>: Cost 3 vzipl LHS, <5,6,7,0> + 2718715600U, // <0,5,1,7>: Cost 3 vext3 <5,6,7,0>, <5,1,7,3> + 1500665646U, // <0,5,1,u>: Cost 2 vext1 <4,0,5,1>, LHS + 2556493926U, // <0,5,2,0>: Cost 3 vext1 <1,0,5,2>, LHS + 2244546120U, // <0,5,2,1>: Cost 3 vrev <5,0,1,2> + 3692357256U, // <0,5,2,2>: Cost 4 vext2 <0,2,0,5>, <2,2,5,7> + 2568439994U, // <0,5,2,3>: Cost 3 vext1 <3,0,5,2>, <3,0,5,2> + 2556497206U, // <0,5,2,4>: Cost 3 vext1 <1,0,5,2>, RHS + 3020738564U, // <0,5,2,5>: Cost 3 vtrnl LHS, <5,5,5,5> + 4027877161U, // <0,5,2,6>: Cost 4 vzipr <0,2,0,2>, <2,4,5,6> + 3093220662U, // <0,5,2,7>: Cost 3 vtrnr <1,0,3,2>, RHS + 3093220663U, // <0,5,2,u>: Cost 3 vtrnr <1,0,3,2>, RHS + 3699656854U, // <0,5,3,0>: Cost 4 vext2 <1,4,0,5>, <3,0,1,2> + 3699656927U, // <0,5,3,1>: Cost 4 vext2 <1,4,0,5>, <3,1,0,3> + 3699657006U, // <0,5,3,2>: Cost 4 vext2 <1,4,0,5>, <3,2,0,1> + 3699657116U, // <0,5,3,3>: Cost 4 vext2 <1,4,0,5>, <3,3,3,3> + 2637859284U, // <0,5,3,4>: Cost 3 vext2 <3,4,0,5>, <3,4,0,5> + 3790319453U, // <0,5,3,5>: Cost 4 vext3 <5,3,5,0>, <5,3,5,0> + 3699657354U, // <0,5,3,6>: Cost 4 vext2 <1,4,0,5>, <3,6,2,7> + 2716725103U, // <0,5,3,7>: Cost 3 vext3 <5,3,7,0>, <5,3,7,0> + 2716798840U, // <0,5,3,u>: Cost 3 vext3 <5,3,u,0>, <5,3,u,0> + 2661747602U, // <0,5,4,0>: Cost 3 vext2 <7,4,0,5>, <4,0,5,1> + 3630252810U, // <0,5,4,1>: Cost 4 vext1 <1,0,5,4>, <1,0,5,4> + 3636225507U, // <0,5,4,2>: Cost 4 vext1 <2,0,5,4>, <2,0,5,4> + 3716910172U, // <0,5,4,3>: Cost 4 vext2 <4,3,0,5>, <4,3,0,5> + 3962195892U, // <0,5,4,4>: Cost 4 vzipl <0,4,1,5>, <5,4,5,6> + 2625916214U, // <0,5,4,5>: Cost 3 vext2 <1,4,0,5>, RHS + 3718901071U, // <0,5,4,6>: Cost 4 vext2 <4,6,0,5>, <4,6,0,5> + 2718715846U, // <0,5,4,7>: Cost 3 vext3 <5,6,7,0>, <5,4,7,6> + 2625916457U, // <0,5,4,u>: Cost 3 vext2 
<1,4,0,5>, RHS + 3791278034U, // <0,5,5,0>: Cost 4 vext3 <5,5,0,0>, <5,5,0,0> + 3791351771U, // <0,5,5,1>: Cost 4 vext3 <5,5,1,0>, <5,5,1,0> + 3318386260U, // <0,5,5,2>: Cost 4 vrev <5,0,2,5> + 3791499245U, // <0,5,5,3>: Cost 4 vext3 <5,5,3,0>, <5,5,3,0> + 3318533734U, // <0,5,5,4>: Cost 4 vrev <5,0,4,5> + 2718715908U, // <0,5,5,5>: Cost 3 vext3 <5,6,7,0>, <5,5,5,5> + 2657767522U, // <0,5,5,6>: Cost 3 vext2 <6,7,0,5>, <5,6,7,0> + 2718715928U, // <0,5,5,7>: Cost 3 vext3 <5,6,7,0>, <5,5,7,7> + 2718715937U, // <0,5,5,u>: Cost 3 vext3 <5,6,7,0>, <5,5,u,7> + 2592358502U, // <0,5,6,0>: Cost 3 vext1 <7,0,5,6>, LHS + 3792015404U, // <0,5,6,1>: Cost 4 vext3 <5,6,1,0>, <5,6,1,0> + 3731509754U, // <0,5,6,2>: Cost 4 vext2 <6,7,0,5>, <6,2,7,3> + 3785748546U, // <0,5,6,3>: Cost 4 vext3 <4,5,6,0>, <5,6,3,4> + 2592361782U, // <0,5,6,4>: Cost 3 vext1 <7,0,5,6>, RHS + 2592362594U, // <0,5,6,5>: Cost 3 vext1 <7,0,5,6>, <5,6,7,0> + 3785748576U, // <0,5,6,6>: Cost 4 vext3 <4,5,6,0>, <5,6,6,7> + 1644974178U, // <0,5,6,7>: Cost 2 vext3 <5,6,7,0>, <5,6,7,0> + 1645047915U, // <0,5,6,u>: Cost 2 vext3 <5,6,u,0>, <5,6,u,0> + 2562506854U, // <0,5,7,0>: Cost 3 vext1 <2,0,5,7>, LHS + 2562507670U, // <0,5,7,1>: Cost 3 vext1 <2,0,5,7>, <1,2,3,0> + 2562508262U, // <0,5,7,2>: Cost 3 vext1 <2,0,5,7>, <2,0,5,7> + 3636250774U, // <0,5,7,3>: Cost 4 vext1 <2,0,5,7>, <3,0,1,2> + 2562510134U, // <0,5,7,4>: Cost 3 vext1 <2,0,5,7>, RHS + 2718716072U, // <0,5,7,5>: Cost 3 vext3 <5,6,7,0>, <5,7,5,7> + 2718716074U, // <0,5,7,6>: Cost 3 vext3 <5,6,7,0>, <5,7,6,0> + 2719379635U, // <0,5,7,7>: Cost 3 vext3 <5,7,7,0>, <5,7,7,0> + 2562512686U, // <0,5,7,u>: Cost 3 vext1 <2,0,5,7>, LHS + 1500717158U, // <0,5,u,0>: Cost 2 vext1 <4,0,5,u>, LHS + 2625918766U, // <0,5,u,1>: Cost 3 vext2 <1,4,0,5>, LHS + 2719674583U, // <0,5,u,2>: Cost 3 vext3 <5,u,2,0>, <5,u,2,0> + 2568489152U, // <0,5,u,3>: Cost 3 vext1 <3,0,5,u>, <3,0,5,u> + 1500720025U, // <0,5,u,4>: Cost 2 vext1 <4,0,5,u>, <4,0,5,u> + 2625919130U, // <0,5,u,5>: Cost 3 vext2 <1,4,0,5>, RHS + 2586407243U, // <0,5,u,6>: Cost 3 vext1 <6,0,5,u>, <6,0,5,u> + 1646301444U, // <0,5,u,7>: Cost 2 vext3 <5,u,7,0>, <5,u,7,0> + 1646375181U, // <0,5,u,u>: Cost 2 vext3 <5,u,u,0>, <5,u,u,0> + 2586411110U, // <0,6,0,0>: Cost 3 vext1 <6,0,6,0>, LHS + 2619949158U, // <0,6,0,1>: Cost 3 vext2 <0,4,0,6>, LHS + 2619949220U, // <0,6,0,2>: Cost 3 vext2 <0,4,0,6>, <0,2,0,2> + 3785748789U, // <0,6,0,3>: Cost 4 vext3 <4,5,6,0>, <6,0,3,4> + 2619949386U, // <0,6,0,4>: Cost 3 vext2 <0,4,0,6>, <0,4,0,6> + 2586415202U, // <0,6,0,5>: Cost 3 vext1 <6,0,6,0>, <5,6,7,0> + 2586415436U, // <0,6,0,6>: Cost 3 vext1 <6,0,6,0>, <6,0,6,0> + 2952793398U, // <0,6,0,7>: Cost 3 vzipr <0,0,0,0>, RHS + 2619949725U, // <0,6,0,u>: Cost 3 vext2 <0,4,0,6>, LHS + 2562531430U, // <0,6,1,0>: Cost 3 vext1 <2,0,6,1>, LHS + 3693691700U, // <0,6,1,1>: Cost 4 vext2 <0,4,0,6>, <1,1,1,1> + 2886521338U, // <0,6,1,2>: Cost 3 vzipl LHS, <6,2,7,3> + 3693691864U, // <0,6,1,3>: Cost 4 vext2 <0,4,0,6>, <1,3,1,3> + 2562534710U, // <0,6,1,4>: Cost 3 vext1 <2,0,6,1>, RHS + 2580450932U, // <0,6,1,5>: Cost 3 vext1 <5,0,6,1>, <5,0,6,1> + 2886521656U, // <0,6,1,6>: Cost 3 vzipl LHS, <6,6,6,6> + 2966736182U, // <0,6,1,7>: Cost 3 vzipr <2,3,0,1>, RHS + 2966736183U, // <0,6,1,u>: Cost 3 vzipr <2,3,0,1>, RHS + 1500741734U, // <0,6,2,0>: Cost 2 vext1 <4,0,6,2>, LHS + 2250518817U, // <0,6,2,1>: Cost 3 vrev <6,0,1,2> + 2574485096U, // <0,6,2,2>: Cost 3 vext1 <4,0,6,2>, <2,2,2,2> + 2631894694U, // <0,6,2,3>: Cost 3 vext2 <2,4,0,6>, <2,3,0,1> + 1500744604U, // <0,6,2,4>: Cost 2 
vext1 <4,0,6,2>, <4,0,6,2> + 2574487248U, // <0,6,2,5>: Cost 3 vext1 <4,0,6,2>, <5,1,7,3> + 3020739384U, // <0,6,2,6>: Cost 3 vtrnl LHS, <6,6,6,6> + 2954136886U, // <0,6,2,7>: Cost 3 vzipr <0,2,0,2>, RHS + 1500747566U, // <0,6,2,u>: Cost 2 vext1 <4,0,6,2>, LHS + 3693693078U, // <0,6,3,0>: Cost 4 vext2 <0,4,0,6>, <3,0,1,2> + 3705637136U, // <0,6,3,1>: Cost 4 vext2 <2,4,0,6>, <3,1,5,7> + 3705637192U, // <0,6,3,2>: Cost 4 vext2 <2,4,0,6>, <3,2,3,0> + 3693693340U, // <0,6,3,3>: Cost 4 vext2 <0,4,0,6>, <3,3,3,3> + 2637867477U, // <0,6,3,4>: Cost 3 vext2 <3,4,0,6>, <3,4,0,6> + 3705637424U, // <0,6,3,5>: Cost 4 vext2 <2,4,0,6>, <3,5,1,7> + 3666154056U, // <0,6,3,6>: Cost 4 vext1 <7,0,6,3>, <6,3,7,0> + 2722697800U, // <0,6,3,7>: Cost 3 vext3 <6,3,7,0>, <6,3,7,0> + 2722771537U, // <0,6,3,u>: Cost 3 vext3 <6,3,u,0>, <6,3,u,0> + 2562556006U, // <0,6,4,0>: Cost 3 vext1 <2,0,6,4>, LHS + 4095316257U, // <0,6,4,1>: Cost 4 vtrnl <0,2,4,6>, <6,0,1,2> + 2562557420U, // <0,6,4,2>: Cost 3 vext1 <2,0,6,4>, <2,0,6,4> + 3636299926U, // <0,6,4,3>: Cost 4 vext1 <2,0,6,4>, <3,0,1,2> + 2562559286U, // <0,6,4,4>: Cost 3 vext1 <2,0,6,4>, RHS + 2619952438U, // <0,6,4,5>: Cost 3 vext2 <0,4,0,6>, RHS + 2723287696U, // <0,6,4,6>: Cost 3 vext3 <6,4,6,0>, <6,4,6,0> + 4027895094U, // <0,6,4,7>: Cost 4 vzipr <0,2,0,4>, RHS + 2619952681U, // <0,6,4,u>: Cost 3 vext2 <0,4,0,6>, RHS + 2718716594U, // <0,6,5,0>: Cost 3 vext3 <5,6,7,0>, <6,5,0,7> + 3648250774U, // <0,6,5,1>: Cost 4 vext1 <4,0,6,5>, <1,2,3,0> + 3792458436U, // <0,6,5,2>: Cost 4 vext3 <5,6,7,0>, <6,5,2,7> + 3705638767U, // <0,6,5,3>: Cost 5 vext2 <2,4,0,6>, <5,3,7,0> + 3648252831U, // <0,6,5,4>: Cost 4 vext1 <4,0,6,5>, <4,0,6,5> + 3797619416U, // <0,6,5,5>: Cost 4 vext3 <6,5,5,0>, <6,5,5,0> + 3792458472U, // <0,6,5,6>: Cost 4 vext3 <5,6,7,0>, <6,5,6,7> + 4035202358U, // <0,6,5,7>: Cost 4 vzipr <1,4,0,5>, RHS + 2718716594U, // <0,6,5,u>: Cost 3 vext3 <5,6,7,0>, <6,5,0,7> + 3786412796U, // <0,6,6,0>: Cost 4 vext3 <4,6,6,0>, <6,6,0,0> + 3792458504U, // <0,6,6,1>: Cost 4 vext3 <5,6,7,0>, <6,6,1,3> + 3728200126U, // <0,6,6,2>: Cost 4 vext2 <6,2,0,6>, <6,2,0,6> + 3798135575U, // <0,6,6,3>: Cost 4 vext3 <6,6,3,0>, <6,6,3,0> + 3786412836U, // <0,6,6,4>: Cost 4 vext3 <4,6,6,0>, <6,6,4,4> + 3792458543U, // <0,6,6,5>: Cost 4 vext3 <5,6,7,0>, <6,6,5,6> + 2718716728U, // <0,6,6,6>: Cost 3 vext3 <5,6,7,0>, <6,6,6,6> + 2718716738U, // <0,6,6,7>: Cost 3 vext3 <5,6,7,0>, <6,6,7,7> + 2718716747U, // <0,6,6,u>: Cost 3 vext3 <5,6,7,0>, <6,6,u,7> + 2718716750U, // <0,6,7,0>: Cost 3 vext3 <5,6,7,0>, <6,7,0,1> + 2724909910U, // <0,6,7,1>: Cost 3 vext3 <6,7,1,0>, <6,7,1,0> + 3636323823U, // <0,6,7,2>: Cost 4 vext1 <2,0,6,7>, <2,0,6,7> + 2725057384U, // <0,6,7,3>: Cost 3 vext3 <6,7,3,0>, <6,7,3,0> + 2718716790U, // <0,6,7,4>: Cost 3 vext3 <5,6,7,0>, <6,7,4,5> + 2718716800U, // <0,6,7,5>: Cost 3 vext3 <5,6,7,0>, <6,7,5,6> + 3792458629U, // <0,6,7,6>: Cost 4 vext3 <5,6,7,0>, <6,7,6,2> + 2725352332U, // <0,6,7,7>: Cost 3 vext3 <6,7,7,0>, <6,7,7,0> + 2718716822U, // <0,6,7,u>: Cost 3 vext3 <5,6,7,0>, <6,7,u,1> + 1500790886U, // <0,6,u,0>: Cost 2 vext1 <4,0,6,u>, LHS + 2619954990U, // <0,6,u,1>: Cost 3 vext2 <0,4,0,6>, LHS + 2562590192U, // <0,6,u,2>: Cost 3 vext1 <2,0,6,u>, <2,0,6,u> + 2725721017U, // <0,6,u,3>: Cost 3 vext3 <6,u,3,0>, <6,u,3,0> + 1500793762U, // <0,6,u,4>: Cost 2 vext1 <4,0,6,u>, <4,0,6,u> + 2619955354U, // <0,6,u,5>: Cost 3 vext2 <0,4,0,6>, RHS + 2725942228U, // <0,6,u,6>: Cost 3 vext3 <6,u,6,0>, <6,u,6,0> + 2954186038U, // <0,6,u,7>: Cost 3 vzipr <0,2,0,u>, RHS + 1500796718U, 
// <0,6,u,u>: Cost 2 vext1 <4,0,6,u>, LHS + 2256401391U, // <0,7,0,0>: Cost 3 vrev <7,0,0,0> + 2632564838U, // <0,7,0,1>: Cost 3 vext2 <2,5,0,7>, LHS + 2256548865U, // <0,7,0,2>: Cost 3 vrev <7,0,2,0> + 3700998396U, // <0,7,0,3>: Cost 4 vext2 <1,6,0,7>, <0,3,1,0> + 2718716952U, // <0,7,0,4>: Cost 3 vext3 <5,6,7,0>, <7,0,4,5> + 2718716962U, // <0,7,0,5>: Cost 3 vext3 <5,6,7,0>, <7,0,5,6> + 2621284845U, // <0,7,0,6>: Cost 3 vext2 <0,6,0,7>, <0,6,0,7> + 3904685542U, // <0,7,0,7>: Cost 4 vuzpr <2,0,5,7>, <2,0,5,7> + 2632565405U, // <0,7,0,u>: Cost 3 vext2 <2,5,0,7>, LHS + 2256409584U, // <0,7,1,0>: Cost 3 vrev <7,0,0,1> + 3706307380U, // <0,7,1,1>: Cost 4 vext2 <2,5,0,7>, <1,1,1,1> + 2632565654U, // <0,7,1,2>: Cost 3 vext2 <2,5,0,7>, <1,2,3,0> + 3769603168U, // <0,7,1,3>: Cost 4 vext3 <1,u,3,0>, <7,1,3,5> + 2256704532U, // <0,7,1,4>: Cost 3 vrev <7,0,4,1> + 3769603184U, // <0,7,1,5>: Cost 4 vext3 <1,u,3,0>, <7,1,5,3> + 3700999366U, // <0,7,1,6>: Cost 4 vext2 <1,6,0,7>, <1,6,0,7> + 2886522476U, // <0,7,1,7>: Cost 3 vzipl LHS, <7,7,7,7> + 2256999480U, // <0,7,1,u>: Cost 3 vrev <7,0,u,1> + 2586501222U, // <0,7,2,0>: Cost 3 vext1 <6,0,7,2>, LHS + 1182749690U, // <0,7,2,1>: Cost 2 vrev <7,0,1,2> + 3636356595U, // <0,7,2,2>: Cost 4 vext1 <2,0,7,2>, <2,0,7,2> + 2727711916U, // <0,7,2,3>: Cost 3 vext3 <7,2,3,0>, <7,2,3,0> + 2586504502U, // <0,7,2,4>: Cost 3 vext1 <6,0,7,2>, RHS + 2632566606U, // <0,7,2,5>: Cost 3 vext2 <2,5,0,7>, <2,5,0,7> + 2586505559U, // <0,7,2,6>: Cost 3 vext1 <6,0,7,2>, <6,0,7,2> + 3020740204U, // <0,7,2,7>: Cost 3 vtrnl LHS, <7,7,7,7> + 1183265849U, // <0,7,2,u>: Cost 2 vrev <7,0,u,2> + 3701000342U, // <0,7,3,0>: Cost 4 vext2 <1,6,0,7>, <3,0,1,2> + 3706308849U, // <0,7,3,1>: Cost 4 vext2 <2,5,0,7>, <3,1,2,3> + 3330315268U, // <0,7,3,2>: Cost 4 vrev <7,0,2,3> + 3706309020U, // <0,7,3,3>: Cost 4 vext2 <2,5,0,7>, <3,3,3,3> + 3706309122U, // <0,7,3,4>: Cost 4 vext2 <2,5,0,7>, <3,4,5,6> + 3712281127U, // <0,7,3,5>: Cost 4 vext2 <3,5,0,7>, <3,5,0,7> + 2639202936U, // <0,7,3,6>: Cost 3 vext2 <3,6,0,7>, <3,6,0,7> + 3802412321U, // <0,7,3,7>: Cost 4 vext3 <7,3,7,0>, <7,3,7,0> + 2640530202U, // <0,7,3,u>: Cost 3 vext2 <3,u,0,7>, <3,u,0,7> + 3654287462U, // <0,7,4,0>: Cost 4 vext1 <5,0,7,4>, LHS + 2256507900U, // <0,7,4,1>: Cost 3 vrev <7,0,1,4> + 2256581637U, // <0,7,4,2>: Cost 3 vrev <7,0,2,4> + 3660262008U, // <0,7,4,3>: Cost 4 vext1 <6,0,7,4>, <3,6,0,7> + 3786413405U, // <0,7,4,4>: Cost 4 vext3 <4,6,6,0>, <7,4,4,6> + 2632568118U, // <0,7,4,5>: Cost 3 vext2 <2,5,0,7>, RHS + 3718917457U, // <0,7,4,6>: Cost 4 vext2 <4,6,0,7>, <4,6,0,7> + 3787003255U, // <0,7,4,7>: Cost 4 vext3 <4,7,5,0>, <7,4,7,5> + 2632568361U, // <0,7,4,u>: Cost 3 vext2 <2,5,0,7>, RHS + 3706310268U, // <0,7,5,0>: Cost 4 vext2 <2,5,0,7>, <5,0,7,0> + 3792459156U, // <0,7,5,1>: Cost 4 vext3 <5,6,7,0>, <7,5,1,7> + 3330331654U, // <0,7,5,2>: Cost 4 vrev <7,0,2,5> + 3722899255U, // <0,7,5,3>: Cost 4 vext2 <5,3,0,7>, <5,3,0,7> + 2256737304U, // <0,7,5,4>: Cost 3 vrev <7,0,4,5> + 3724226521U, // <0,7,5,5>: Cost 4 vext2 <5,5,0,7>, <5,5,0,7> + 2718717377U, // <0,7,5,6>: Cost 3 vext3 <5,6,7,0>, <7,5,6,7> + 2729997763U, // <0,7,5,7>: Cost 3 vext3 <7,5,7,0>, <7,5,7,0> + 2720044499U, // <0,7,5,u>: Cost 3 vext3 <5,u,7,0>, <7,5,u,7> + 3712946517U, // <0,7,6,0>: Cost 4 vext2 <3,6,0,7>, <6,0,7,0> + 2256524286U, // <0,7,6,1>: Cost 3 vrev <7,0,1,6> + 3792459246U, // <0,7,6,2>: Cost 4 vext3 <5,6,7,0>, <7,6,2,7> + 3796440567U, // <0,7,6,3>: Cost 4 vext3 <6,3,7,0>, <7,6,3,7> + 3654307126U, // <0,7,6,4>: Cost 4 vext1 <5,0,7,6>, RHS + 
2656457394U, // <0,7,6,5>: Cost 3 vext2 <6,5,0,7>, <6,5,0,7> + 3792459281U, // <0,7,6,6>: Cost 4 vext3 <5,6,7,0>, <7,6,6,6> + 2730661396U, // <0,7,6,7>: Cost 3 vext3 <7,6,7,0>, <7,6,7,0> + 2658448293U, // <0,7,6,u>: Cost 3 vext2 <6,u,0,7>, <6,u,0,7> + 3787003431U, // <0,7,7,0>: Cost 4 vext3 <4,7,5,0>, <7,7,0,1> + 3654312854U, // <0,7,7,1>: Cost 4 vext1 <5,0,7,7>, <1,2,3,0> + 3654313446U, // <0,7,7,2>: Cost 4 vext1 <5,0,7,7>, <2,0,5,7> + 3804771905U, // <0,7,7,3>: Cost 4 vext3 <7,7,3,0>, <7,7,3,0> + 3654315318U, // <0,7,7,4>: Cost 4 vext1 <5,0,7,7>, RHS + 3654315651U, // <0,7,7,5>: Cost 4 vext1 <5,0,7,7>, <5,0,7,7> + 3660288348U, // <0,7,7,6>: Cost 4 vext1 <6,0,7,7>, <6,0,7,7> + 2718717548U, // <0,7,7,7>: Cost 3 vext3 <5,6,7,0>, <7,7,7,7> + 2664420990U, // <0,7,7,u>: Cost 3 vext2 <7,u,0,7>, <7,u,0,7> + 2256466935U, // <0,7,u,0>: Cost 3 vrev <7,0,0,u> + 1182798848U, // <0,7,u,1>: Cost 2 vrev <7,0,1,u> + 2256614409U, // <0,7,u,2>: Cost 3 vrev <7,0,2,u> + 2731693714U, // <0,7,u,3>: Cost 3 vext3 <7,u,3,0>, <7,u,3,0> + 2256761883U, // <0,7,u,4>: Cost 3 vrev <7,0,4,u> + 2632571034U, // <0,7,u,5>: Cost 3 vext2 <2,5,0,7>, RHS + 2669066421U, // <0,7,u,6>: Cost 3 vext2 , + 2731988662U, // <0,7,u,7>: Cost 3 vext3 <7,u,7,0>, <7,u,7,0> + 1183315007U, // <0,7,u,u>: Cost 2 vrev <7,0,u,u> + 135053414U, // <0,u,0,0>: Cost 1 vdup0 LHS + 1544896614U, // <0,u,0,1>: Cost 2 vext2 <0,2,0,u>, LHS + 1678999654U, // <0,u,0,2>: Cost 2 vuzpl LHS, LHS + 2691880677U, // <0,u,0,3>: Cost 3 vext3 <1,2,3,0>, + 1476988214U, // <0,u,0,4>: Cost 2 vext1 <0,0,u,0>, RHS + 2718791419U, // <0,u,0,5>: Cost 3 vext3 <5,6,u,0>, + 3021248666U, // <0,u,0,6>: Cost 3 vtrnl <0,2,0,2>, RHS + 2592535607U, // <0,u,0,7>: Cost 3 vext1 <7,0,u,0>, <7,0,u,0> + 135053414U, // <0,u,0,u>: Cost 1 vdup0 LHS + 1476993097U, // <0,u,1,0>: Cost 2 vext1 <0,0,u,1>, <0,0,u,1> + 1812780846U, // <0,u,1,1>: Cost 2 vzipl LHS, LHS + 1618138926U, // <0,u,1,2>: Cost 2 vext3 <1,2,3,0>, LHS + 2752742134U, // <0,u,1,3>: Cost 3 vuzpl LHS, <1,0,3,2> + 1476996406U, // <0,u,1,4>: Cost 2 vext1 <0,0,u,1>, RHS + 1812781210U, // <0,u,1,5>: Cost 2 vzipl LHS, RHS + 2887006416U, // <0,u,1,6>: Cost 3 vzipl LHS, + 2966736200U, // <0,u,1,7>: Cost 3 vzipr <2,3,0,1>, RHS + 1812781413U, // <0,u,1,u>: Cost 2 vzipl LHS, LHS + 1482973286U, // <0,u,2,0>: Cost 2 vext1 <1,0,u,2>, LHS + 1482973987U, // <0,u,2,1>: Cost 2 vext1 <1,0,u,2>, <1,0,u,2> + 1946998574U, // <0,u,2,2>: Cost 2 vtrnl LHS, LHS + 835584U, // <0,u,2,3>: Cost 0 copy LHS + 1482976566U, // <0,u,2,4>: Cost 2 vext1 <1,0,u,2>, RHS + 3020781631U, // <0,u,2,5>: Cost 3 vtrnl LHS, + 1946998938U, // <0,u,2,6>: Cost 2 vtrnl LHS, RHS + 1518810169U, // <0,u,2,7>: Cost 2 vext1 <7,0,u,2>, <7,0,u,2> + 835584U, // <0,u,2,u>: Cost 0 copy LHS + 2618640534U, // <0,u,3,0>: Cost 3 vext2 <0,2,0,u>, <3,0,1,2> + 2752743574U, // <0,u,3,1>: Cost 3 vuzpl LHS, <3,0,1,2> + 2636556597U, // <0,u,3,2>: Cost 3 vext2 <3,2,0,u>, <3,2,0,u> + 2752743836U, // <0,u,3,3>: Cost 3 vuzpl LHS, <3,3,3,3> + 2618640898U, // <0,u,3,4>: Cost 3 vext2 <0,2,0,u>, <3,4,5,6> + 2752743938U, // <0,u,3,5>: Cost 3 vuzpl LHS, <3,4,5,6> + 2639202936U, // <0,u,3,6>: Cost 3 vext2 <3,6,0,7>, <3,6,0,7> + 2639874762U, // <0,u,3,7>: Cost 3 vext2 <3,7,0,u>, <3,7,0,u> + 2752743637U, // <0,u,3,u>: Cost 3 vuzpl LHS, <3,0,u,2> + 2562703462U, // <0,u,4,0>: Cost 3 vext1 <2,0,u,4>, LHS + 2888455982U, // <0,u,4,1>: Cost 3 vzipl <0,4,1,5>, LHS + 3021575982U, // <0,u,4,2>: Cost 3 vtrnl <0,2,4,6>, LHS + 2568677591U, // <0,u,4,3>: Cost 3 vext1 <3,0,u,4>, <3,0,u,4> + 2562706742U, // <0,u,4,4>: Cost 3 vext1 
<2,0,u,4>, RHS + 1544899894U, // <0,u,4,5>: Cost 2 vext2 <0,2,0,u>, RHS + 1679002934U, // <0,u,4,6>: Cost 2 vuzpl LHS, RHS + 2718718033U, // <0,u,4,7>: Cost 3 vext3 <5,6,7,0>, + 1679002952U, // <0,u,4,u>: Cost 2 vuzpl LHS, RHS + 2568683622U, // <0,u,5,0>: Cost 3 vext1 <3,0,u,5>, LHS + 2568684438U, // <0,u,5,1>: Cost 3 vext1 <3,0,u,5>, <1,2,3,0> + 3765622902U, // <0,u,5,2>: Cost 4 vext3 <1,2,3,0>, + 2691881087U, // <0,u,5,3>: Cost 3 vext3 <1,2,3,0>, + 2568686902U, // <0,u,5,4>: Cost 3 vext1 <3,0,u,5>, RHS + 2650492890U, // <0,u,5,5>: Cost 3 vext2 <5,5,0,u>, <5,5,0,u> + 1618139290U, // <0,u,5,6>: Cost 2 vext3 <1,2,3,0>, RHS + 2824834358U, // <0,u,5,7>: Cost 3 vuzpr <1,0,3,u>, RHS + 1618139308U, // <0,u,5,u>: Cost 2 vext3 <1,2,3,0>, RHS + 2592579686U, // <0,u,6,0>: Cost 3 vext1 <7,0,u,6>, LHS + 2262496983U, // <0,u,6,1>: Cost 3 vrev + 2654474688U, // <0,u,6,2>: Cost 3 vext2 <6,2,0,u>, <6,2,0,u> + 2691881168U, // <0,u,6,3>: Cost 3 vext3 <1,2,3,0>, + 2592582966U, // <0,u,6,4>: Cost 3 vext1 <7,0,u,6>, RHS + 2656465587U, // <0,u,6,5>: Cost 3 vext2 <6,5,0,u>, <6,5,0,u> + 2657129220U, // <0,u,6,6>: Cost 3 vext2 <6,6,0,u>, <6,6,0,u> + 1584051029U, // <0,u,6,7>: Cost 2 vext2 <6,7,0,u>, <6,7,0,u> + 1584714662U, // <0,u,6,u>: Cost 2 vext2 <6,u,0,u>, <6,u,0,u> + 2562728038U, // <0,u,7,0>: Cost 3 vext1 <2,0,u,7>, LHS + 2562728854U, // <0,u,7,1>: Cost 3 vext1 <2,0,u,7>, <1,2,3,0> + 2562729473U, // <0,u,7,2>: Cost 3 vext1 <2,0,u,7>, <2,0,u,7> + 2661111018U, // <0,u,7,3>: Cost 3 vext2 <7,3,0,u>, <7,3,0,u> + 2562731318U, // <0,u,7,4>: Cost 3 vext1 <2,0,u,7>, RHS + 2718718258U, // <0,u,7,5>: Cost 3 vext3 <5,6,7,0>, + 2586620261U, // <0,u,7,6>: Cost 3 vext1 <6,0,u,7>, <6,0,u,7> + 2657793644U, // <0,u,7,7>: Cost 3 vext2 <6,7,0,u>, <7,7,7,7> + 2562733870U, // <0,u,7,u>: Cost 3 vext1 <2,0,u,7>, LHS + 135053414U, // <0,u,u,0>: Cost 1 vdup0 LHS + 1544902446U, // <0,u,u,1>: Cost 2 vext2 <0,2,0,u>, LHS + 1679005486U, // <0,u,u,2>: Cost 2 vuzpl LHS, LHS + 835584U, // <0,u,u,3>: Cost 0 copy LHS + 1483025718U, // <0,u,u,4>: Cost 2 vext1 <1,0,u,u>, RHS + 1544902810U, // <0,u,u,5>: Cost 2 vext2 <0,2,0,u>, RHS + 1679005850U, // <0,u,u,6>: Cost 2 vuzpl LHS, RHS + 1518859327U, // <0,u,u,7>: Cost 2 vext1 <7,0,u,u>, <7,0,u,u> + 835584U, // <0,u,u,u>: Cost 0 copy LHS + 2689744896U, // <1,0,0,0>: Cost 3 vext3 <0,u,1,1>, <0,0,0,0> + 1610694666U, // <1,0,0,1>: Cost 2 vext3 <0,0,1,1>, <0,0,1,1> + 2689744916U, // <1,0,0,2>: Cost 3 vext3 <0,u,1,1>, <0,0,2,2> + 2619310332U, // <1,0,0,3>: Cost 3 vext2 <0,3,1,0>, <0,3,1,0> + 2684657701U, // <1,0,0,4>: Cost 3 vext3 <0,0,4,1>, <0,0,4,1> + 2620637598U, // <1,0,0,5>: Cost 3 vext2 <0,5,1,0>, <0,5,1,0> + 3708977654U, // <1,0,0,6>: Cost 4 vext2 <3,0,1,0>, <0,6,1,7> + 3666351168U, // <1,0,0,7>: Cost 4 vext1 <7,1,0,0>, <7,1,0,0> + 1611210825U, // <1,0,0,u>: Cost 2 vext3 <0,0,u,1>, <0,0,u,1> + 2556780646U, // <1,0,1,0>: Cost 3 vext1 <1,1,0,1>, LHS + 2556781355U, // <1,0,1,1>: Cost 3 vext1 <1,1,0,1>, <1,1,0,1> + 1616003174U, // <1,0,1,2>: Cost 2 vext3 <0,u,1,1>, LHS + 3693052888U, // <1,0,1,3>: Cost 4 vext2 <0,3,1,0>, <1,3,1,3> + 2556783926U, // <1,0,1,4>: Cost 3 vext1 <1,1,0,1>, RHS + 2580672143U, // <1,0,1,5>: Cost 3 vext1 <5,1,0,1>, <5,1,0,1> + 2724839566U, // <1,0,1,6>: Cost 3 vext3 <6,7,0,1>, <0,1,6,7> + 3654415354U, // <1,0,1,7>: Cost 4 vext1 <5,1,0,1>, <7,0,1,2> + 1616003228U, // <1,0,1,u>: Cost 2 vext3 <0,u,1,1>, LHS + 2685690019U, // <1,0,2,0>: Cost 3 vext3 <0,2,0,1>, <0,2,0,1> + 2685763756U, // <1,0,2,1>: Cost 3 vext3 <0,2,1,1>, <0,2,1,1> + 2698297524U, // <1,0,2,2>: Cost 3 vext3 
<2,3,0,1>, <0,2,2,0> + 2685911230U, // <1,0,2,3>: Cost 3 vext3 <0,2,3,1>, <0,2,3,1> + 2689745100U, // <1,0,2,4>: Cost 3 vext3 <0,u,1,1>, <0,2,4,6> + 3764814038U, // <1,0,2,5>: Cost 4 vext3 <1,1,1,1>, <0,2,5,7> + 2724839640U, // <1,0,2,6>: Cost 3 vext3 <6,7,0,1>, <0,2,6,0> + 2592625658U, // <1,0,2,7>: Cost 3 vext1 <7,1,0,2>, <7,0,1,2> + 2686279915U, // <1,0,2,u>: Cost 3 vext3 <0,2,u,1>, <0,2,u,1> + 3087843328U, // <1,0,3,0>: Cost 3 vtrnr LHS, <0,0,0,0> + 3087843338U, // <1,0,3,1>: Cost 3 vtrnr LHS, <0,0,1,1> + 67944550U, // <1,0,3,2>: Cost 1 vrev LHS + 2568743135U, // <1,0,3,3>: Cost 3 vext1 <3,1,0,3>, <3,1,0,3> + 2562772278U, // <1,0,3,4>: Cost 3 vext1 <2,1,0,3>, RHS + 4099850454U, // <1,0,3,5>: Cost 4 vtrnl <1,0,3,2>, <0,2,5,7> + 3704998538U, // <1,0,3,6>: Cost 4 vext2 <2,3,1,0>, <3,6,2,7> + 2592633923U, // <1,0,3,7>: Cost 3 vext1 <7,1,0,3>, <7,1,0,3> + 68386972U, // <1,0,3,u>: Cost 1 vrev LHS + 2620640146U, // <1,0,4,0>: Cost 3 vext2 <0,5,1,0>, <4,0,5,1> + 2689745234U, // <1,0,4,1>: Cost 3 vext3 <0,u,1,1>, <0,4,1,5> + 2689745244U, // <1,0,4,2>: Cost 3 vext3 <0,u,1,1>, <0,4,2,6> + 3760980320U, // <1,0,4,3>: Cost 4 vext3 <0,4,3,1>, <0,4,3,1> + 3761054057U, // <1,0,4,4>: Cost 4 vext3 <0,4,4,1>, <0,4,4,1> + 2619313462U, // <1,0,4,5>: Cost 3 vext2 <0,3,1,0>, RHS + 3761201531U, // <1,0,4,6>: Cost 4 vext3 <0,4,6,1>, <0,4,6,1> + 3666383940U, // <1,0,4,7>: Cost 4 vext1 <7,1,0,4>, <7,1,0,4> + 2619313705U, // <1,0,4,u>: Cost 3 vext2 <0,3,1,0>, RHS + 4029300736U, // <1,0,5,0>: Cost 4 vzipr <0,4,1,5>, <0,0,0,0> + 2895249510U, // <1,0,5,1>: Cost 3 vzipl <1,5,3,7>, LHS + 3028287590U, // <1,0,5,2>: Cost 3 vtrnl <1,3,5,7>, LHS + 3642501345U, // <1,0,5,3>: Cost 4 vext1 <3,1,0,5>, <3,1,0,5> + 2215592058U, // <1,0,5,4>: Cost 3 vrev <0,1,4,5> + 3724242907U, // <1,0,5,5>: Cost 4 vext2 <5,5,1,0>, <5,5,1,0> + 3724906540U, // <1,0,5,6>: Cost 4 vext2 <5,6,1,0>, <5,6,1,0> + 3911118134U, // <1,0,5,7>: Cost 4 vuzpr <3,1,3,0>, RHS + 3028287644U, // <1,0,5,u>: Cost 3 vtrnl <1,3,5,7>, LHS + 3762086375U, // <1,0,6,0>: Cost 4 vext3 <0,6,0,1>, <0,6,0,1> + 2698297846U, // <1,0,6,1>: Cost 3 vext3 <2,3,0,1>, <0,6,1,7> + 3760022015U, // <1,0,6,2>: Cost 4 vext3 <0,2,u,1>, <0,6,2,7> + 3642509538U, // <1,0,6,3>: Cost 4 vext1 <3,1,0,6>, <3,1,0,6> + 3762381323U, // <1,0,6,4>: Cost 4 vext3 <0,6,4,1>, <0,6,4,1> + 3730215604U, // <1,0,6,5>: Cost 4 vext2 <6,5,1,0>, <6,5,1,0> + 3730879237U, // <1,0,6,6>: Cost 4 vext2 <6,6,1,0>, <6,6,1,0> + 2657801046U, // <1,0,6,7>: Cost 3 vext2 <6,7,1,0>, <6,7,1,0> + 2658464679U, // <1,0,6,u>: Cost 3 vext2 <6,u,1,0>, <6,u,1,0> + 2659128312U, // <1,0,7,0>: Cost 3 vext2 <7,0,1,0>, <7,0,1,0> + 4047898278U, // <1,0,7,1>: Cost 4 vzipr <3,5,1,7>, <2,3,0,1> + 2215460970U, // <1,0,7,2>: Cost 3 vrev <0,1,2,7> + 3734861035U, // <1,0,7,3>: Cost 4 vext2 <7,3,1,0>, <7,3,1,0> + 3731543398U, // <1,0,7,4>: Cost 4 vext2 <6,7,1,0>, <7,4,5,6> + 3736188301U, // <1,0,7,5>: Cost 4 vext2 <7,5,1,0>, <7,5,1,0> + 2663110110U, // <1,0,7,6>: Cost 3 vext2 <7,6,1,0>, <7,6,1,0> + 3731543660U, // <1,0,7,7>: Cost 4 vext2 <6,7,1,0>, <7,7,7,7> + 2664437376U, // <1,0,7,u>: Cost 3 vext2 <7,u,1,0>, <7,u,1,0> + 3087884288U, // <1,0,u,0>: Cost 3 vtrnr LHS, <0,0,0,0> + 1616003730U, // <1,0,u,1>: Cost 2 vext3 <0,u,1,1>, <0,u,1,1> + 67985515U, // <1,0,u,2>: Cost 1 vrev LHS + 2689893028U, // <1,0,u,3>: Cost 3 vext3 <0,u,3,1>, <0,u,3,1> + 2689745586U, // <1,0,u,4>: Cost 3 vext3 <0,u,1,1>, <0,u,4,6> + 2619316378U, // <1,0,u,5>: Cost 3 vext2 <0,3,1,0>, RHS + 2669082807U, // <1,0,u,6>: Cost 3 vext2 , + 2592674888U, // <1,0,u,7>: Cost 3 vext1 
<7,1,0,u>, <7,1,0,u> + 68427937U, // <1,0,u,u>: Cost 1 vrev LHS + 1543585802U, // <1,1,0,0>: Cost 2 vext2 <0,0,1,1>, <0,0,1,1> + 1548894310U, // <1,1,0,1>: Cost 2 vext2 <0,u,1,1>, LHS + 2618654892U, // <1,1,0,2>: Cost 3 vext2 <0,2,1,1>, <0,2,1,1> + 2689745654U, // <1,1,0,3>: Cost 3 vext3 <0,u,1,1>, <1,0,3,2> + 2622636370U, // <1,1,0,4>: Cost 3 vext2 <0,u,1,1>, <0,4,1,5> + 2620645791U, // <1,1,0,5>: Cost 3 vext2 <0,5,1,1>, <0,5,1,1> + 3696378367U, // <1,1,0,6>: Cost 4 vext2 <0,u,1,1>, <0,6,2,7> + 3666424905U, // <1,1,0,7>: Cost 4 vext1 <7,1,1,0>, <7,1,1,0> + 1548894866U, // <1,1,0,u>: Cost 2 vext2 <0,u,1,1>, <0,u,1,1> + 1483112550U, // <1,1,1,0>: Cost 2 vext1 <1,1,1,1>, LHS + 202162278U, // <1,1,1,1>: Cost 1 vdup1 LHS + 2622636950U, // <1,1,1,2>: Cost 3 vext2 <0,u,1,1>, <1,2,3,0> + 2622637016U, // <1,1,1,3>: Cost 3 vext2 <0,u,1,1>, <1,3,1,3> + 1483115830U, // <1,1,1,4>: Cost 2 vext1 <1,1,1,1>, RHS + 2622637200U, // <1,1,1,5>: Cost 3 vext2 <0,u,1,1>, <1,5,3,7> + 2622637263U, // <1,1,1,6>: Cost 3 vext2 <0,u,1,1>, <1,6,1,7> + 2592691274U, // <1,1,1,7>: Cost 3 vext1 <7,1,1,1>, <7,1,1,1> + 202162278U, // <1,1,1,u>: Cost 1 vdup1 LHS + 2550890588U, // <1,1,2,0>: Cost 3 vext1 <0,1,1,2>, <0,1,1,2> + 2617329183U, // <1,1,2,1>: Cost 3 vext2 <0,0,1,1>, <2,1,3,1> + 2622637672U, // <1,1,2,2>: Cost 3 vext2 <0,u,1,1>, <2,2,2,2> + 2622637734U, // <1,1,2,3>: Cost 3 vext2 <0,u,1,1>, <2,3,0,1> + 2550893878U, // <1,1,2,4>: Cost 3 vext1 <0,1,1,2>, RHS + 3696379744U, // <1,1,2,5>: Cost 4 vext2 <0,u,1,1>, <2,5,2,7> + 2622638010U, // <1,1,2,6>: Cost 3 vext2 <0,u,1,1>, <2,6,3,7> + 3804554170U, // <1,1,2,7>: Cost 4 vext3 <7,7,0,1>, <1,2,7,0> + 2622638139U, // <1,1,2,u>: Cost 3 vext2 <0,u,1,1>, <2,u,0,1> + 2622638230U, // <1,1,3,0>: Cost 3 vext2 <0,u,1,1>, <3,0,1,2> + 3087844148U, // <1,1,3,1>: Cost 3 vtrnr LHS, <1,1,1,1> + 4161585244U, // <1,1,3,2>: Cost 4 vtrnr LHS, <0,1,1,2> + 2014101606U, // <1,1,3,3>: Cost 2 vtrnr LHS, LHS + 2622638594U, // <1,1,3,4>: Cost 3 vext2 <0,u,1,1>, <3,4,5,6> + 2689745920U, // <1,1,3,5>: Cost 3 vext3 <0,u,1,1>, <1,3,5,7> + 3763487753U, // <1,1,3,6>: Cost 4 vext3 <0,u,1,1>, <1,3,6,7> + 2592707660U, // <1,1,3,7>: Cost 3 vext1 <7,1,1,3>, <7,1,1,3> + 2014101611U, // <1,1,3,u>: Cost 2 vtrnr LHS, LHS + 2556878950U, // <1,1,4,0>: Cost 3 vext1 <1,1,1,4>, LHS + 2221335351U, // <1,1,4,1>: Cost 3 vrev <1,1,1,4> + 3696380988U, // <1,1,4,2>: Cost 4 vext2 <0,u,1,1>, <4,2,6,0> + 3763487805U, // <1,1,4,3>: Cost 4 vext3 <0,u,1,1>, <1,4,3,5> + 2556882230U, // <1,1,4,4>: Cost 3 vext1 <1,1,1,4>, RHS + 1548897590U, // <1,1,4,5>: Cost 2 vext2 <0,u,1,1>, RHS + 2758184246U, // <1,1,4,6>: Cost 3 vuzpl <1,1,1,1>, RHS + 3666457677U, // <1,1,4,7>: Cost 4 vext1 <7,1,1,4>, <7,1,1,4> + 1548897833U, // <1,1,4,u>: Cost 2 vext2 <0,u,1,1>, RHS + 2693653615U, // <1,1,5,0>: Cost 3 vext3 <1,5,0,1>, <1,5,0,1> + 2617331408U, // <1,1,5,1>: Cost 3 vext2 <0,0,1,1>, <5,1,7,3> + 4029302934U, // <1,1,5,2>: Cost 4 vzipr <0,4,1,5>, <3,0,1,2> + 2689746064U, // <1,1,5,3>: Cost 3 vext3 <0,u,1,1>, <1,5,3,7> + 2221564755U, // <1,1,5,4>: Cost 3 vrev <1,1,4,5> + 2955559250U, // <1,1,5,5>: Cost 3 vzipr <0,4,1,5>, <0,4,1,5> + 2617331810U, // <1,1,5,6>: Cost 3 vext2 <0,0,1,1>, <5,6,7,0> + 2825293110U, // <1,1,5,7>: Cost 3 vuzpr <1,1,1,1>, RHS + 2689746109U, // <1,1,5,u>: Cost 3 vext3 <0,u,1,1>, <1,5,u,7> + 3696382241U, // <1,1,6,0>: Cost 4 vext2 <0,u,1,1>, <6,0,1,2> + 2689746127U, // <1,1,6,1>: Cost 3 vext3 <0,u,1,1>, <1,6,1,7> + 2617332218U, // <1,1,6,2>: Cost 3 vext2 <0,0,1,1>, <6,2,7,3> + 3763487969U, // <1,1,6,3>: Cost 4 vext3 <0,u,1,1>, 
<1,6,3,7> + 3696382605U, // <1,1,6,4>: Cost 4 vext2 <0,u,1,1>, <6,4,5,6> + 4029309266U, // <1,1,6,5>: Cost 4 vzipr <0,4,1,6>, <0,4,1,5> + 2617332536U, // <1,1,6,6>: Cost 3 vext2 <0,0,1,1>, <6,6,6,6> + 2724840702U, // <1,1,6,7>: Cost 3 vext3 <6,7,0,1>, <1,6,7,0> + 2725504263U, // <1,1,6,u>: Cost 3 vext3 <6,u,0,1>, <1,6,u,0> + 2617332720U, // <1,1,7,0>: Cost 3 vext2 <0,0,1,1>, <7,0,0,1> + 2659800138U, // <1,1,7,1>: Cost 3 vext2 <7,1,1,1>, <7,1,1,1> + 3691074717U, // <1,1,7,2>: Cost 4 vext2 <0,0,1,1>, <7,2,1,3> + 4167811174U, // <1,1,7,3>: Cost 4 vtrnr <1,1,5,7>, LHS + 2617333094U, // <1,1,7,4>: Cost 3 vext2 <0,0,1,1>, <7,4,5,6> + 3295396702U, // <1,1,7,5>: Cost 4 vrev <1,1,5,7> + 3803891014U, // <1,1,7,6>: Cost 4 vext3 <7,6,0,1>, <1,7,6,0> + 2617333356U, // <1,1,7,7>: Cost 3 vext2 <0,0,1,1>, <7,7,7,7> + 2659800138U, // <1,1,7,u>: Cost 3 vext2 <7,1,1,1>, <7,1,1,1> + 1483112550U, // <1,1,u,0>: Cost 2 vext1 <1,1,1,1>, LHS + 202162278U, // <1,1,u,1>: Cost 1 vdup1 LHS + 2622642056U, // <1,1,u,2>: Cost 3 vext2 <0,u,1,1>, + 2014142566U, // <1,1,u,3>: Cost 2 vtrnr LHS, LHS + 1483115830U, // <1,1,u,4>: Cost 2 vext1 <1,1,1,1>, RHS + 1548900506U, // <1,1,u,5>: Cost 2 vext2 <0,u,1,1>, RHS + 2622642384U, // <1,1,u,6>: Cost 3 vext2 <0,u,1,1>, + 2825293353U, // <1,1,u,7>: Cost 3 vuzpr <1,1,1,1>, RHS + 202162278U, // <1,1,u,u>: Cost 1 vdup1 LHS + 2635251712U, // <1,2,0,0>: Cost 3 vext2 <3,0,1,2>, <0,0,0,0> + 1561509990U, // <1,2,0,1>: Cost 2 vext2 <3,0,1,2>, LHS + 2618663085U, // <1,2,0,2>: Cost 3 vext2 <0,2,1,2>, <0,2,1,2> + 2696529358U, // <1,2,0,3>: Cost 3 vext3 <2,0,3,1>, <2,0,3,1> + 2635252050U, // <1,2,0,4>: Cost 3 vext2 <3,0,1,2>, <0,4,1,5> + 3769533926U, // <1,2,0,5>: Cost 4 vext3 <1,u,2,1>, <2,0,5,7> + 2621317617U, // <1,2,0,6>: Cost 3 vext2 <0,6,1,2>, <0,6,1,2> + 2659140170U, // <1,2,0,7>: Cost 3 vext2 <7,0,1,2>, <0,7,2,1> + 1561510557U, // <1,2,0,u>: Cost 2 vext2 <3,0,1,2>, LHS + 2623308516U, // <1,2,1,0>: Cost 3 vext2 <1,0,1,2>, <1,0,1,2> + 2635252532U, // <1,2,1,1>: Cost 3 vext2 <3,0,1,2>, <1,1,1,1> + 2631271318U, // <1,2,1,2>: Cost 3 vext2 <2,3,1,2>, <1,2,3,0> + 2958180454U, // <1,2,1,3>: Cost 3 vzipr <0,u,1,1>, LHS + 2550959414U, // <1,2,1,4>: Cost 3 vext1 <0,1,2,1>, RHS + 2635252880U, // <1,2,1,5>: Cost 3 vext2 <3,0,1,2>, <1,5,3,7> + 2635252952U, // <1,2,1,6>: Cost 3 vext2 <3,0,1,2>, <1,6,2,7> + 3732882731U, // <1,2,1,7>: Cost 4 vext2 <7,0,1,2>, <1,7,3,0> + 2958180459U, // <1,2,1,u>: Cost 3 vzipr <0,u,1,1>, LHS + 2629281213U, // <1,2,2,0>: Cost 3 vext2 <2,0,1,2>, <2,0,1,2> + 2635253280U, // <1,2,2,1>: Cost 3 vext2 <3,0,1,2>, <2,1,3,2> + 2618664552U, // <1,2,2,2>: Cost 3 vext2 <0,2,1,2>, <2,2,2,2> + 2689746546U, // <1,2,2,3>: Cost 3 vext3 <0,u,1,1>, <2,2,3,3> + 3764815485U, // <1,2,2,4>: Cost 4 vext3 <1,1,1,1>, <2,2,4,5> + 3760023176U, // <1,2,2,5>: Cost 4 vext3 <0,2,u,1>, <2,2,5,7> + 2635253690U, // <1,2,2,6>: Cost 3 vext2 <3,0,1,2>, <2,6,3,7> + 2659141610U, // <1,2,2,7>: Cost 3 vext2 <7,0,1,2>, <2,7,0,1> + 2689746591U, // <1,2,2,u>: Cost 3 vext3 <0,u,1,1>, <2,2,u,3> + 403488870U, // <1,2,3,0>: Cost 1 vext1 LHS, LHS + 1477231350U, // <1,2,3,1>: Cost 2 vext1 LHS, <1,0,3,2> + 1477232232U, // <1,2,3,2>: Cost 2 vext1 LHS, <2,2,2,2> + 1477233052U, // <1,2,3,3>: Cost 2 vext1 LHS, <3,3,3,3> + 403492150U, // <1,2,3,4>: Cost 1 vext1 LHS, RHS + 1525010128U, // <1,2,3,5>: Cost 2 vext1 LHS, <5,1,7,3> + 1525010938U, // <1,2,3,6>: Cost 2 vext1 LHS, <6,2,7,3> + 1525011450U, // <1,2,3,7>: Cost 2 vext1 LHS, <7,0,1,2> + 403494702U, // <1,2,3,u>: Cost 1 vext1 LHS, LHS + 2641226607U, // <1,2,4,0>: Cost 3 vext2 
<4,0,1,2>, <4,0,1,2> + 3624723446U, // <1,2,4,1>: Cost 4 vext1 <0,1,2,4>, <1,3,4,6> + 3301123609U, // <1,2,4,2>: Cost 4 vrev <2,1,2,4> + 2598759198U, // <1,2,4,3>: Cost 3 vext1 , <3,u,1,2> + 2659142864U, // <1,2,4,4>: Cost 3 vext2 <7,0,1,2>, <4,4,4,4> + 1561513270U, // <1,2,4,5>: Cost 2 vext2 <3,0,1,2>, RHS + 2659143028U, // <1,2,4,6>: Cost 3 vext2 <7,0,1,2>, <4,6,4,6> + 2659143112U, // <1,2,4,7>: Cost 3 vext2 <7,0,1,2>, <4,7,5,0> + 1561513513U, // <1,2,4,u>: Cost 2 vext2 <3,0,1,2>, RHS + 2550988902U, // <1,2,5,0>: Cost 3 vext1 <0,1,2,5>, LHS + 2550989824U, // <1,2,5,1>: Cost 3 vext1 <0,1,2,5>, <1,3,5,7> + 3624732264U, // <1,2,5,2>: Cost 4 vext1 <0,1,2,5>, <2,2,2,2> + 2955559014U, // <1,2,5,3>: Cost 3 vzipr <0,4,1,5>, LHS + 2550992182U, // <1,2,5,4>: Cost 3 vext1 <0,1,2,5>, RHS + 2659143684U, // <1,2,5,5>: Cost 3 vext2 <7,0,1,2>, <5,5,5,5> + 2659143778U, // <1,2,5,6>: Cost 3 vext2 <7,0,1,2>, <5,6,7,0> + 2659143848U, // <1,2,5,7>: Cost 3 vext2 <7,0,1,2>, <5,7,5,7> + 2550994734U, // <1,2,5,u>: Cost 3 vext1 <0,1,2,5>, LHS + 2700289945U, // <1,2,6,0>: Cost 3 vext3 <2,6,0,1>, <2,6,0,1> + 2635256232U, // <1,2,6,1>: Cost 3 vext2 <3,0,1,2>, <6,1,7,2> + 2659144186U, // <1,2,6,2>: Cost 3 vext2 <7,0,1,2>, <6,2,7,3> + 2689746874U, // <1,2,6,3>: Cost 3 vext3 <0,u,1,1>, <2,6,3,7> + 3763488705U, // <1,2,6,4>: Cost 4 vext3 <0,u,1,1>, <2,6,4,5> + 3763488716U, // <1,2,6,5>: Cost 4 vext3 <0,u,1,1>, <2,6,5,7> + 2659144504U, // <1,2,6,6>: Cost 3 vext2 <7,0,1,2>, <6,6,6,6> + 2657817432U, // <1,2,6,7>: Cost 3 vext2 <6,7,1,2>, <6,7,1,2> + 2689746919U, // <1,2,6,u>: Cost 3 vext3 <0,u,1,1>, <2,6,u,7> + 1585402874U, // <1,2,7,0>: Cost 2 vext2 <7,0,1,2>, <7,0,1,2> + 2659144770U, // <1,2,7,1>: Cost 3 vext2 <7,0,1,2>, <7,1,0,2> + 3708998858U, // <1,2,7,2>: Cost 4 vext2 <3,0,1,2>, <7,2,6,3> + 2635257059U, // <1,2,7,3>: Cost 3 vext2 <3,0,1,2>, <7,3,0,1> + 2659145062U, // <1,2,7,4>: Cost 3 vext2 <7,0,1,2>, <7,4,5,6> + 3732886916U, // <1,2,7,5>: Cost 4 vext2 <7,0,1,2>, <7,5,0,0> + 3732886998U, // <1,2,7,6>: Cost 4 vext2 <7,0,1,2>, <7,6,0,1> + 2659145255U, // <1,2,7,7>: Cost 3 vext2 <7,0,1,2>, <7,7,0,1> + 1590711938U, // <1,2,7,u>: Cost 2 vext2 <7,u,1,2>, <7,u,1,2> + 403529835U, // <1,2,u,0>: Cost 1 vext1 LHS, LHS + 1477272310U, // <1,2,u,1>: Cost 2 vext1 LHS, <1,0,3,2> + 1477273192U, // <1,2,u,2>: Cost 2 vext1 LHS, <2,2,2,2> + 1477273750U, // <1,2,u,3>: Cost 2 vext1 LHS, <3,0,1,2> + 403533110U, // <1,2,u,4>: Cost 1 vext1 LHS, RHS + 1561516186U, // <1,2,u,5>: Cost 2 vext2 <3,0,1,2>, RHS + 1525051898U, // <1,2,u,6>: Cost 2 vext1 LHS, <6,2,7,3> + 1525052410U, // <1,2,u,7>: Cost 2 vext1 LHS, <7,0,1,2> + 403535662U, // <1,2,u,u>: Cost 1 vext1 LHS, LHS + 2819407872U, // <1,3,0,0>: Cost 3 vuzpr LHS, <0,0,0,0> + 1551564902U, // <1,3,0,1>: Cost 2 vext2 <1,3,1,3>, LHS + 2819408630U, // <1,3,0,2>: Cost 3 vuzpr LHS, <1,0,3,2> + 2619334911U, // <1,3,0,3>: Cost 3 vext2 <0,3,1,3>, <0,3,1,3> + 2625306962U, // <1,3,0,4>: Cost 3 vext2 <1,3,1,3>, <0,4,1,5> + 3832725879U, // <1,3,0,5>: Cost 4 vuzpl <1,2,3,0>, <0,4,5,6> + 3699048959U, // <1,3,0,6>: Cost 4 vext2 <1,3,1,3>, <0,6,2,7> + 3776538827U, // <1,3,0,7>: Cost 4 vext3 <3,0,7,1>, <3,0,7,1> + 1551565469U, // <1,3,0,u>: Cost 2 vext2 <1,3,1,3>, LHS + 2618671862U, // <1,3,1,0>: Cost 3 vext2 <0,2,1,3>, <1,0,3,2> + 2819408692U, // <1,3,1,1>: Cost 3 vuzpr LHS, <1,1,1,1> + 2624643975U, // <1,3,1,2>: Cost 3 vext2 <1,2,1,3>, <1,2,1,3> + 1745666150U, // <1,3,1,3>: Cost 2 vuzpr LHS, LHS + 2557005110U, // <1,3,1,4>: Cost 3 vext1 <1,1,3,1>, RHS + 2625307792U, // <1,3,1,5>: Cost 3 vext2 <1,3,1,3>, 
<1,5,3,7> + 3698386127U, // <1,3,1,6>: Cost 4 vext2 <1,2,1,3>, <1,6,1,7> + 2592838748U, // <1,3,1,7>: Cost 3 vext1 <7,1,3,1>, <7,1,3,1> + 1745666155U, // <1,3,1,u>: Cost 2 vuzpr LHS, LHS + 2819408790U, // <1,3,2,0>: Cost 3 vuzpr LHS, <1,2,3,0> + 2625308193U, // <1,3,2,1>: Cost 3 vext2 <1,3,1,3>, <2,1,3,3> + 2819408036U, // <1,3,2,2>: Cost 3 vuzpr LHS, <0,2,0,2> + 2819851890U, // <1,3,2,3>: Cost 3 vuzpr LHS, <2,2,3,3> + 2819408794U, // <1,3,2,4>: Cost 3 vuzpr LHS, <1,2,3,4> + 3893149890U, // <1,3,2,5>: Cost 4 vuzpr LHS, <0,2,3,5> + 2819408076U, // <1,3,2,6>: Cost 3 vuzpr LHS, <0,2,4,6> + 3772041583U, // <1,3,2,7>: Cost 4 vext3 <2,3,0,1>, <3,2,7,3> + 2819408042U, // <1,3,2,u>: Cost 3 vuzpr LHS, <0,2,0,u> + 1483276390U, // <1,3,3,0>: Cost 2 vext1 <1,1,3,3>, LHS + 1483277128U, // <1,3,3,1>: Cost 2 vext1 <1,1,3,3>, <1,1,3,3> + 2557019752U, // <1,3,3,2>: Cost 3 vext1 <1,1,3,3>, <2,2,2,2> + 2819408856U, // <1,3,3,3>: Cost 3 vuzpr LHS, <1,3,1,3> + 1483279670U, // <1,3,3,4>: Cost 2 vext1 <1,1,3,3>, RHS + 2819409614U, // <1,3,3,5>: Cost 3 vuzpr LHS, <2,3,4,5> + 2598826490U, // <1,3,3,6>: Cost 3 vext1 , <6,2,7,3> + 3087844352U, // <1,3,3,7>: Cost 3 vtrnr LHS, <1,3,5,7> + 1483282222U, // <1,3,3,u>: Cost 2 vext1 <1,1,3,3>, LHS + 2568970342U, // <1,3,4,0>: Cost 3 vext1 <3,1,3,4>, LHS + 2568971224U, // <1,3,4,1>: Cost 3 vext1 <3,1,3,4>, <1,3,1,3> + 3832761290U, // <1,3,4,2>: Cost 4 vuzpl <1,2,3,4>, <4,1,2,3> + 2233428219U, // <1,3,4,3>: Cost 3 vrev <3,1,3,4> + 2568973622U, // <1,3,4,4>: Cost 3 vext1 <3,1,3,4>, RHS + 1551568182U, // <1,3,4,5>: Cost 2 vext2 <1,3,1,3>, RHS + 2819410434U, // <1,3,4,6>: Cost 3 vuzpr LHS, <3,4,5,6> + 3666605151U, // <1,3,4,7>: Cost 4 vext1 <7,1,3,4>, <7,1,3,4> + 1551568425U, // <1,3,4,u>: Cost 2 vext2 <1,3,1,3>, RHS + 2563006566U, // <1,3,5,0>: Cost 3 vext1 <2,1,3,5>, LHS + 2568979456U, // <1,3,5,1>: Cost 3 vext1 <3,1,3,5>, <1,3,5,7> + 2563008035U, // <1,3,5,2>: Cost 3 vext1 <2,1,3,5>, <2,1,3,5> + 2233436412U, // <1,3,5,3>: Cost 3 vrev <3,1,3,5> + 2563009846U, // <1,3,5,4>: Cost 3 vext1 <2,1,3,5>, RHS + 2867187716U, // <1,3,5,5>: Cost 3 vuzpr LHS, <5,5,5,5> + 2655834214U, // <1,3,5,6>: Cost 3 vext2 <6,4,1,3>, <5,6,7,4> + 1745669430U, // <1,3,5,7>: Cost 2 vuzpr LHS, RHS + 1745669431U, // <1,3,5,u>: Cost 2 vuzpr LHS, RHS + 2867187810U, // <1,3,6,0>: Cost 3 vuzpr LHS, <5,6,7,0> + 3699052931U, // <1,3,6,1>: Cost 4 vext2 <1,3,1,3>, <6,1,3,1> + 2654507460U, // <1,3,6,2>: Cost 3 vext2 <6,2,1,3>, <6,2,1,3> + 3766291091U, // <1,3,6,3>: Cost 4 vext3 <1,3,3,1>, <3,6,3,7> + 2655834726U, // <1,3,6,4>: Cost 3 vext2 <6,4,1,3>, <6,4,1,3> + 3923384562U, // <1,3,6,5>: Cost 4 vuzpr <5,1,7,3>, + 2657161992U, // <1,3,6,6>: Cost 3 vext2 <6,6,1,3>, <6,6,1,3> + 2819852218U, // <1,3,6,7>: Cost 3 vuzpr LHS, <2,6,3,7> + 2819852219U, // <1,3,6,u>: Cost 3 vuzpr LHS, <2,6,3,u> + 2706926275U, // <1,3,7,0>: Cost 3 vext3 <3,7,0,1>, <3,7,0,1> + 2659816524U, // <1,3,7,1>: Cost 3 vext2 <7,1,1,3>, <7,1,1,3> + 3636766245U, // <1,3,7,2>: Cost 4 vext1 <2,1,3,7>, <2,1,3,7> + 2867187903U, // <1,3,7,3>: Cost 3 vuzpr LHS, <5,7,u,3> + 2625312102U, // <1,3,7,4>: Cost 3 vext2 <1,3,1,3>, <7,4,5,6> + 2867188598U, // <1,3,7,5>: Cost 3 vuzpr LHS, <6,7,4,5> + 3728250344U, // <1,3,7,6>: Cost 4 vext2 <6,2,1,3>, <7,6,2,1> + 2867187880U, // <1,3,7,7>: Cost 3 vuzpr LHS, <5,7,5,7> + 2707516171U, // <1,3,7,u>: Cost 3 vext3 <3,7,u,1>, <3,7,u,1> + 1483317350U, // <1,3,u,0>: Cost 2 vext1 <1,1,3,u>, LHS + 1483318093U, // <1,3,u,1>: Cost 2 vext1 <1,1,3,u>, <1,1,3,u> + 2819410718U, // <1,3,u,2>: Cost 3 vuzpr LHS, <3,u,1,2> + 1745666717U, // 
<1,3,u,3>: Cost 2 vuzpr LHS, LHS + 1483320630U, // <1,3,u,4>: Cost 2 vext1 <1,1,3,u>, RHS + 1551571098U, // <1,3,u,5>: Cost 2 vext2 <1,3,1,3>, RHS + 2819410758U, // <1,3,u,6>: Cost 3 vuzpr LHS, <3,u,5,6> + 1745669673U, // <1,3,u,7>: Cost 2 vuzpr LHS, RHS + 1745666722U, // <1,3,u,u>: Cost 2 vuzpr LHS, LHS + 2617352205U, // <1,4,0,0>: Cost 3 vext2 <0,0,1,4>, <0,0,1,4> + 2619342950U, // <1,4,0,1>: Cost 3 vext2 <0,3,1,4>, LHS + 3692421295U, // <1,4,0,2>: Cost 4 vext2 <0,2,1,4>, <0,2,1,4> + 2619343104U, // <1,4,0,3>: Cost 3 vext2 <0,3,1,4>, <0,3,1,4> + 2617352530U, // <1,4,0,4>: Cost 3 vext2 <0,0,1,4>, <0,4,1,5> + 1634880402U, // <1,4,0,5>: Cost 2 vext3 <4,0,5,1>, <4,0,5,1> + 2713930652U, // <1,4,0,6>: Cost 3 vext3 <4,u,5,1>, <4,0,6,2> + 3732898396U, // <1,4,0,7>: Cost 4 vext2 <7,0,1,4>, <0,7,4,1> + 1635101613U, // <1,4,0,u>: Cost 2 vext3 <4,0,u,1>, <4,0,u,1> + 3693085430U, // <1,4,1,0>: Cost 4 vext2 <0,3,1,4>, <1,0,3,2> + 2623988535U, // <1,4,1,1>: Cost 3 vext2 <1,1,1,4>, <1,1,1,4> + 3693085590U, // <1,4,1,2>: Cost 4 vext2 <0,3,1,4>, <1,2,3,0> + 3692422134U, // <1,4,1,3>: Cost 4 vext2 <0,2,1,4>, <1,3,4,6> + 3693085726U, // <1,4,1,4>: Cost 4 vext2 <0,3,1,4>, <1,4,0,1> + 2892401974U, // <1,4,1,5>: Cost 3 vzipl <1,1,1,1>, RHS + 3026619702U, // <1,4,1,6>: Cost 3 vtrnl <1,1,1,1>, RHS + 3800206324U, // <1,4,1,7>: Cost 4 vext3 <7,0,4,1>, <4,1,7,0> + 2892402217U, // <1,4,1,u>: Cost 3 vzipl <1,1,1,1>, RHS + 3966978927U, // <1,4,2,0>: Cost 4 vzipl <1,2,3,4>, <4,0,1,2> + 3966979018U, // <1,4,2,1>: Cost 4 vzipl <1,2,3,4>, <4,1,2,3> + 3693086312U, // <1,4,2,2>: Cost 4 vext2 <0,3,1,4>, <2,2,2,2> + 2635269798U, // <1,4,2,3>: Cost 3 vext2 <3,0,1,4>, <2,3,0,1> + 3966979280U, // <1,4,2,4>: Cost 4 vzipl <1,2,3,4>, <4,4,4,4> + 2893204790U, // <1,4,2,5>: Cost 3 vzipl <1,2,3,0>, RHS + 3693086650U, // <1,4,2,6>: Cost 4 vext2 <0,3,1,4>, <2,6,3,7> + 3666662502U, // <1,4,2,7>: Cost 4 vext1 <7,1,4,2>, <7,1,4,2> + 2893205033U, // <1,4,2,u>: Cost 3 vzipl <1,2,3,0>, RHS + 2563063910U, // <1,4,3,0>: Cost 3 vext1 <2,1,4,3>, LHS + 2563064730U, // <1,4,3,1>: Cost 3 vext1 <2,1,4,3>, <1,2,3,4> + 2563065386U, // <1,4,3,2>: Cost 3 vext1 <2,1,4,3>, <2,1,4,3> + 3693087132U, // <1,4,3,3>: Cost 4 vext2 <0,3,1,4>, <3,3,3,3> + 2619345410U, // <1,4,3,4>: Cost 3 vext2 <0,3,1,4>, <3,4,5,6> + 3087843666U, // <1,4,3,5>: Cost 3 vtrnr LHS, <0,4,1,5> + 3087843676U, // <1,4,3,6>: Cost 3 vtrnr LHS, <0,4,2,6> + 3666670695U, // <1,4,3,7>: Cost 4 vext1 <7,1,4,3>, <7,1,4,3> + 3087843669U, // <1,4,3,u>: Cost 3 vtrnr LHS, <0,4,1,u> + 2620672914U, // <1,4,4,0>: Cost 3 vext2 <0,5,1,4>, <4,0,5,1> + 3630842706U, // <1,4,4,1>: Cost 4 vext1 <1,1,4,4>, <1,1,4,4> + 3313069003U, // <1,4,4,2>: Cost 4 vrev <4,1,2,4> + 3642788100U, // <1,4,4,3>: Cost 4 vext1 <3,1,4,4>, <3,1,4,4> + 2713930960U, // <1,4,4,4>: Cost 3 vext3 <4,u,5,1>, <4,4,4,4> + 2619346230U, // <1,4,4,5>: Cost 3 vext2 <0,3,1,4>, RHS + 2713930980U, // <1,4,4,6>: Cost 3 vext3 <4,u,5,1>, <4,4,6,6> + 3736882642U, // <1,4,4,7>: Cost 4 vext2 <7,6,1,4>, <4,7,6,1> + 2619346473U, // <1,4,4,u>: Cost 3 vext2 <0,3,1,4>, RHS + 2557108326U, // <1,4,5,0>: Cost 3 vext1 <1,1,4,5>, LHS + 2557109075U, // <1,4,5,1>: Cost 3 vext1 <1,1,4,5>, <1,1,4,5> + 2598913774U, // <1,4,5,2>: Cost 3 vext1 , <2,3,u,1> + 3630852246U, // <1,4,5,3>: Cost 4 vext1 <1,1,4,5>, <3,0,1,2> + 2557111606U, // <1,4,5,4>: Cost 3 vext1 <1,1,4,5>, RHS + 2895252790U, // <1,4,5,5>: Cost 3 vzipl <1,5,3,7>, RHS + 1616006454U, // <1,4,5,6>: Cost 2 vext3 <0,u,1,1>, RHS + 3899059510U, // <1,4,5,7>: Cost 4 vuzpr <1,1,1,4>, RHS + 1616006472U, // <1,4,5,u>: 
Cost 2 vext3 <0,u,1,1>, RHS + 2557116518U, // <1,4,6,0>: Cost 3 vext1 <1,1,4,6>, LHS + 2557117236U, // <1,4,6,1>: Cost 3 vext1 <1,1,4,6>, <1,1,1,1> + 3630859880U, // <1,4,6,2>: Cost 4 vext1 <1,1,4,6>, <2,2,2,2> + 2569062550U, // <1,4,6,3>: Cost 3 vext1 <3,1,4,6>, <3,0,1,2> + 2557119798U, // <1,4,6,4>: Cost 3 vext1 <1,1,4,6>, RHS + 3763490174U, // <1,4,6,5>: Cost 4 vext3 <0,u,1,1>, <4,6,5,7> + 3763490183U, // <1,4,6,6>: Cost 4 vext3 <0,u,1,1>, <4,6,6,7> + 2712751498U, // <1,4,6,7>: Cost 3 vext3 <4,6,7,1>, <4,6,7,1> + 2557122350U, // <1,4,6,u>: Cost 3 vext1 <1,1,4,6>, LHS + 2659161084U, // <1,4,7,0>: Cost 3 vext2 <7,0,1,4>, <7,0,1,4> + 3732903040U, // <1,4,7,1>: Cost 4 vext2 <7,0,1,4>, <7,1,7,1> + 3734230174U, // <1,4,7,2>: Cost 4 vext2 <7,2,1,4>, <7,2,1,4> + 3734893807U, // <1,4,7,3>: Cost 4 vext2 <7,3,1,4>, <7,3,1,4> + 3660729654U, // <1,4,7,4>: Cost 4 vext1 <6,1,4,7>, RHS + 3786493384U, // <1,4,7,5>: Cost 4 vext3 <4,6,7,1>, <4,7,5,0> + 2713341394U, // <1,4,7,6>: Cost 3 vext3 <4,7,6,1>, <4,7,6,1> + 3660731386U, // <1,4,7,7>: Cost 4 vext1 <6,1,4,7>, <7,0,1,2> + 2664470148U, // <1,4,7,u>: Cost 3 vext2 <7,u,1,4>, <7,u,1,4> + 2557132902U, // <1,4,u,0>: Cost 3 vext1 <1,1,4,u>, LHS + 2619348782U, // <1,4,u,1>: Cost 3 vext2 <0,3,1,4>, LHS + 2563106351U, // <1,4,u,2>: Cost 3 vext1 <2,1,4,u>, <2,1,4,u> + 2713783816U, // <1,4,u,3>: Cost 3 vext3 <4,u,3,1>, <4,u,3,1> + 2622666815U, // <1,4,u,4>: Cost 3 vext2 <0,u,1,4>, + 1640189466U, // <1,4,u,5>: Cost 2 vext3 <4,u,5,1>, <4,u,5,1> + 1616006697U, // <1,4,u,6>: Cost 2 vext3 <0,u,1,1>, RHS + 2712751498U, // <1,4,u,7>: Cost 3 vext3 <4,6,7,1>, <4,6,7,1> + 1616006715U, // <1,4,u,u>: Cost 2 vext3 <0,u,1,1>, RHS + 2620014592U, // <1,5,0,0>: Cost 3 vext2 <0,4,1,5>, <0,0,0,0> + 1546272870U, // <1,5,0,1>: Cost 2 vext2 <0,4,1,5>, LHS + 2618687664U, // <1,5,0,2>: Cost 3 vext2 <0,2,1,5>, <0,2,1,5> + 3693093120U, // <1,5,0,3>: Cost 4 vext2 <0,3,1,5>, <0,3,1,4> + 1546273106U, // <1,5,0,4>: Cost 2 vext2 <0,4,1,5>, <0,4,1,5> + 2620678563U, // <1,5,0,5>: Cost 3 vext2 <0,5,1,5>, <0,5,1,5> + 2714668660U, // <1,5,0,6>: Cost 3 vext3 <5,0,6,1>, <5,0,6,1> + 3772042877U, // <1,5,0,7>: Cost 4 vext3 <2,3,0,1>, <5,0,7,1> + 1546273437U, // <1,5,0,u>: Cost 2 vext2 <0,4,1,5>, LHS + 2620015350U, // <1,5,1,0>: Cost 3 vext2 <0,4,1,5>, <1,0,3,2> + 2620015412U, // <1,5,1,1>: Cost 3 vext2 <0,4,1,5>, <1,1,1,1> + 2620015510U, // <1,5,1,2>: Cost 3 vext2 <0,4,1,5>, <1,2,3,0> + 2618688512U, // <1,5,1,3>: Cost 3 vext2 <0,2,1,5>, <1,3,5,7> + 2620015677U, // <1,5,1,4>: Cost 3 vext2 <0,4,1,5>, <1,4,3,5> + 2620015727U, // <1,5,1,5>: Cost 3 vext2 <0,4,1,5>, <1,5,0,1> + 2620015859U, // <1,5,1,6>: Cost 3 vext2 <0,4,1,5>, <1,6,5,7> + 3093728566U, // <1,5,1,7>: Cost 3 vtrnr <1,1,1,1>, RHS + 2620015981U, // <1,5,1,u>: Cost 3 vext2 <0,4,1,5>, <1,u,1,3> + 3692430816U, // <1,5,2,0>: Cost 4 vext2 <0,2,1,5>, <2,0,5,1> + 2620016163U, // <1,5,2,1>: Cost 3 vext2 <0,4,1,5>, <2,1,3,5> + 2620016232U, // <1,5,2,2>: Cost 3 vext2 <0,4,1,5>, <2,2,2,2> + 2620016294U, // <1,5,2,3>: Cost 3 vext2 <0,4,1,5>, <2,3,0,1> + 3693758221U, // <1,5,2,4>: Cost 4 vext2 <0,4,1,5>, <2,4,2,5> + 3692431209U, // <1,5,2,5>: Cost 4 vext2 <0,2,1,5>, <2,5,3,7> + 2620016570U, // <1,5,2,6>: Cost 3 vext2 <0,4,1,5>, <2,6,3,7> + 4173598006U, // <1,5,2,7>: Cost 4 vtrnr <2,1,3,2>, RHS + 2620016699U, // <1,5,2,u>: Cost 3 vext2 <0,4,1,5>, <2,u,0,1> + 2620016790U, // <1,5,3,0>: Cost 3 vext2 <0,4,1,5>, <3,0,1,2> + 2569110672U, // <1,5,3,1>: Cost 3 vext1 <3,1,5,3>, <1,5,3,7> + 3693758785U, // <1,5,3,2>: Cost 4 vext2 <0,4,1,5>, <3,2,2,2> + 2620017052U, // 
<1,5,3,3>: Cost 3 vext2 <0,4,1,5>, <3,3,3,3> + 2620017154U, // <1,5,3,4>: Cost 3 vext2 <0,4,1,5>, <3,4,5,6> + 3135623172U, // <1,5,3,5>: Cost 3 vtrnr LHS, <5,5,5,5> + 4161587048U, // <1,5,3,6>: Cost 4 vtrnr LHS, <2,5,3,6> + 2014104886U, // <1,5,3,7>: Cost 2 vtrnr LHS, RHS + 2014104887U, // <1,5,3,u>: Cost 2 vtrnr LHS, RHS + 2620017554U, // <1,5,4,0>: Cost 3 vext2 <0,4,1,5>, <4,0,5,1> + 2620017634U, // <1,5,4,1>: Cost 3 vext2 <0,4,1,5>, <4,1,5,0> + 3693759551U, // <1,5,4,2>: Cost 4 vext2 <0,4,1,5>, <4,2,6,3> + 3642861837U, // <1,5,4,3>: Cost 4 vext1 <3,1,5,4>, <3,1,5,4> + 2575092710U, // <1,5,4,4>: Cost 3 vext1 <4,1,5,4>, <4,1,5,4> + 1546276150U, // <1,5,4,5>: Cost 2 vext2 <0,4,1,5>, RHS + 2759855414U, // <1,5,4,6>: Cost 3 vuzpl <1,3,5,7>, RHS + 2713931718U, // <1,5,4,7>: Cost 3 vext3 <4,u,5,1>, <5,4,7,6> + 1546276393U, // <1,5,4,u>: Cost 2 vext2 <0,4,1,5>, RHS + 2557182054U, // <1,5,5,0>: Cost 3 vext1 <1,1,5,5>, LHS + 2557182812U, // <1,5,5,1>: Cost 3 vext1 <1,1,5,5>, <1,1,5,5> + 3630925347U, // <1,5,5,2>: Cost 4 vext1 <1,1,5,5>, <2,1,3,5> + 4029301675U, // <1,5,5,3>: Cost 4 vzipr <0,4,1,5>, <1,2,5,3> + 2557185334U, // <1,5,5,4>: Cost 3 vext1 <1,1,5,5>, RHS + 2713931780U, // <1,5,5,5>: Cost 3 vext3 <4,u,5,1>, <5,5,5,5> + 2667794530U, // <1,5,5,6>: Cost 3 vext2 , <5,6,7,0> + 2713931800U, // <1,5,5,7>: Cost 3 vext3 <4,u,5,1>, <5,5,7,7> + 2557187886U, // <1,5,5,u>: Cost 3 vext1 <1,1,5,5>, LHS + 2718208036U, // <1,5,6,0>: Cost 3 vext3 <5,6,0,1>, <5,6,0,1> + 2620019115U, // <1,5,6,1>: Cost 3 vext2 <0,4,1,5>, <6,1,7,5> + 2667794938U, // <1,5,6,2>: Cost 3 vext2 , <6,2,7,3> + 3787673666U, // <1,5,6,3>: Cost 4 vext3 <4,u,5,1>, <5,6,3,4> + 3693761165U, // <1,5,6,4>: Cost 4 vext2 <0,4,1,5>, <6,4,5,6> + 3319279297U, // <1,5,6,5>: Cost 4 vrev <5,1,5,6> + 2667795256U, // <1,5,6,6>: Cost 3 vext2 , <6,6,6,6> + 2713931874U, // <1,5,6,7>: Cost 3 vext3 <4,u,5,1>, <5,6,7,0> + 2713931883U, // <1,5,6,u>: Cost 3 vext3 <4,u,5,1>, <5,6,u,0> + 2557198438U, // <1,5,7,0>: Cost 3 vext1 <1,1,5,7>, LHS + 2557199156U, // <1,5,7,1>: Cost 3 vext1 <1,1,5,7>, <1,1,1,1> + 2569143974U, // <1,5,7,2>: Cost 3 vext1 <3,1,5,7>, <2,3,0,1> + 2569144592U, // <1,5,7,3>: Cost 3 vext1 <3,1,5,7>, <3,1,5,7> + 2557201718U, // <1,5,7,4>: Cost 3 vext1 <1,1,5,7>, RHS + 2713931944U, // <1,5,7,5>: Cost 3 vext3 <4,u,5,1>, <5,7,5,7> + 3787673770U, // <1,5,7,6>: Cost 4 vext3 <4,u,5,1>, <5,7,6,0> + 2719387828U, // <1,5,7,7>: Cost 3 vext3 <5,7,7,1>, <5,7,7,1> + 2557204270U, // <1,5,7,u>: Cost 3 vext1 <1,1,5,7>, LHS + 2620020435U, // <1,5,u,0>: Cost 3 vext2 <0,4,1,5>, + 1546278702U, // <1,5,u,1>: Cost 2 vext2 <0,4,1,5>, LHS + 2620020616U, // <1,5,u,2>: Cost 3 vext2 <0,4,1,5>, + 2620020668U, // <1,5,u,3>: Cost 3 vext2 <0,4,1,5>, + 1594054682U, // <1,5,u,4>: Cost 2 vext2 , + 1546279066U, // <1,5,u,5>: Cost 2 vext2 <0,4,1,5>, RHS + 2620020944U, // <1,5,u,6>: Cost 3 vext2 <0,4,1,5>, + 2014145846U, // <1,5,u,7>: Cost 2 vtrnr LHS, RHS + 2014145847U, // <1,5,u,u>: Cost 2 vtrnr LHS, RHS + 3692437504U, // <1,6,0,0>: Cost 4 vext2 <0,2,1,6>, <0,0,0,0> + 2618695782U, // <1,6,0,1>: Cost 3 vext2 <0,2,1,6>, LHS + 2618695857U, // <1,6,0,2>: Cost 3 vext2 <0,2,1,6>, <0,2,1,6> + 3794161970U, // <1,6,0,3>: Cost 4 vext3 <6,0,3,1>, <6,0,3,1> + 2620023122U, // <1,6,0,4>: Cost 3 vext2 <0,4,1,6>, <0,4,1,5> + 2620686756U, // <1,6,0,5>: Cost 3 vext2 <0,5,1,6>, <0,5,1,6> + 2621350389U, // <1,6,0,6>: Cost 3 vext2 <0,6,1,6>, <0,6,1,6> + 4028599606U, // <1,6,0,7>: Cost 4 vzipr <0,3,1,0>, RHS + 2618696349U, // <1,6,0,u>: Cost 3 vext2 <0,2,1,6>, LHS + 3692438262U, // <1,6,1,0>: Cost 
4 vext2 <0,2,1,6>, <1,0,3,2> + 2625995572U, // <1,6,1,1>: Cost 3 vext2 <1,4,1,6>, <1,1,1,1> + 3692438422U, // <1,6,1,2>: Cost 4 vext2 <0,2,1,6>, <1,2,3,0> + 3692438488U, // <1,6,1,3>: Cost 4 vext2 <0,2,1,6>, <1,3,1,3> + 2625995820U, // <1,6,1,4>: Cost 3 vext2 <1,4,1,6>, <1,4,1,6> + 3692438672U, // <1,6,1,5>: Cost 4 vext2 <0,2,1,6>, <1,5,3,7> + 3692438720U, // <1,6,1,6>: Cost 4 vext2 <0,2,1,6>, <1,6,0,1> + 2958183734U, // <1,6,1,7>: Cost 3 vzipr <0,u,1,1>, RHS + 2958183735U, // <1,6,1,u>: Cost 3 vzipr <0,u,1,1>, RHS + 2721526201U, // <1,6,2,0>: Cost 3 vext3 <6,2,0,1>, <6,2,0,1> + 3692439097U, // <1,6,2,1>: Cost 4 vext2 <0,2,1,6>, <2,1,6,0> + 3692439144U, // <1,6,2,2>: Cost 4 vext2 <0,2,1,6>, <2,2,2,2> + 3692439206U, // <1,6,2,3>: Cost 4 vext2 <0,2,1,6>, <2,3,0,1> + 3636948278U, // <1,6,2,4>: Cost 4 vext1 <2,1,6,2>, RHS + 3787674092U, // <1,6,2,5>: Cost 4 vext3 <4,u,5,1>, <6,2,5,7> + 2618697658U, // <1,6,2,6>: Cost 3 vext2 <0,2,1,6>, <2,6,3,7> + 2970799414U, // <1,6,2,7>: Cost 3 vzipr <3,0,1,2>, RHS + 2970799415U, // <1,6,2,u>: Cost 3 vzipr <3,0,1,2>, RHS + 2563211366U, // <1,6,3,0>: Cost 3 vext1 <2,1,6,3>, LHS + 3699738854U, // <1,6,3,1>: Cost 4 vext2 <1,4,1,6>, <3,1,1,1> + 2563212860U, // <1,6,3,2>: Cost 3 vext1 <2,1,6,3>, <2,1,6,3> + 3692439964U, // <1,6,3,3>: Cost 4 vext2 <0,2,1,6>, <3,3,3,3> + 2563214646U, // <1,6,3,4>: Cost 3 vext1 <2,1,6,3>, RHS + 4191820018U, // <1,6,3,5>: Cost 4 vtrnr <5,1,7,3>, + 2587103648U, // <1,6,3,6>: Cost 3 vext1 <6,1,6,3>, <6,1,6,3> + 3087845306U, // <1,6,3,7>: Cost 3 vtrnr LHS, <2,6,3,7> + 3087845307U, // <1,6,3,u>: Cost 3 vtrnr LHS, <2,6,3,u> + 3693767570U, // <1,6,4,0>: Cost 4 vext2 <0,4,1,6>, <4,0,5,1> + 3693767650U, // <1,6,4,1>: Cost 4 vext2 <0,4,1,6>, <4,1,5,0> + 3636962877U, // <1,6,4,2>: Cost 4 vext1 <2,1,6,4>, <2,1,6,4> + 3325088134U, // <1,6,4,3>: Cost 4 vrev <6,1,3,4> + 3693767898U, // <1,6,4,4>: Cost 4 vext2 <0,4,1,6>, <4,4,5,5> + 2618699062U, // <1,6,4,5>: Cost 3 vext2 <0,2,1,6>, RHS + 3833670966U, // <1,6,4,6>: Cost 4 vuzpl <1,3,6,7>, RHS + 4028632374U, // <1,6,4,7>: Cost 4 vzipr <0,3,1,4>, RHS + 2618699305U, // <1,6,4,u>: Cost 3 vext2 <0,2,1,6>, RHS + 3693768264U, // <1,6,5,0>: Cost 4 vext2 <0,4,1,6>, <5,0,1,2> + 3630998373U, // <1,6,5,1>: Cost 4 vext1 <1,1,6,5>, <1,1,6,5> + 3636971070U, // <1,6,5,2>: Cost 4 vext1 <2,1,6,5>, <2,1,6,5> + 3642943767U, // <1,6,5,3>: Cost 4 vext1 <3,1,6,5>, <3,1,6,5> + 3693768628U, // <1,6,5,4>: Cost 4 vext2 <0,4,1,6>, <5,4,5,6> + 3732918276U, // <1,6,5,5>: Cost 4 vext2 <7,0,1,6>, <5,5,5,5> + 2620690530U, // <1,6,5,6>: Cost 3 vext2 <0,5,1,6>, <5,6,7,0> + 2955562294U, // <1,6,5,7>: Cost 3 vzipr <0,4,1,5>, RHS + 2955562295U, // <1,6,5,u>: Cost 3 vzipr <0,4,1,5>, RHS + 2724180733U, // <1,6,6,0>: Cost 3 vext3 <6,6,0,1>, <6,6,0,1> + 3631006566U, // <1,6,6,1>: Cost 4 vext1 <1,1,6,6>, <1,1,6,6> + 3631007674U, // <1,6,6,2>: Cost 4 vext1 <1,1,6,6>, <2,6,3,7> + 3692442184U, // <1,6,6,3>: Cost 4 vext2 <0,2,1,6>, <6,3,7,0> + 3631009078U, // <1,6,6,4>: Cost 4 vext1 <1,1,6,6>, RHS + 3787674416U, // <1,6,6,5>: Cost 4 vext3 <4,u,5,1>, <6,6,5,7> + 2713932600U, // <1,6,6,6>: Cost 3 vext3 <4,u,5,1>, <6,6,6,6> + 2713932610U, // <1,6,6,7>: Cost 3 vext3 <4,u,5,1>, <6,6,7,7> + 2713932619U, // <1,6,6,u>: Cost 3 vext3 <4,u,5,1>, <6,6,u,7> + 1651102542U, // <1,6,7,0>: Cost 2 vext3 <6,7,0,1>, <6,7,0,1> + 2724918103U, // <1,6,7,1>: Cost 3 vext3 <6,7,1,1>, <6,7,1,1> + 2698302306U, // <1,6,7,2>: Cost 3 vext3 <2,3,0,1>, <6,7,2,3> + 3642960153U, // <1,6,7,3>: Cost 4 vext1 <3,1,6,7>, <3,1,6,7> + 2713932662U, // <1,6,7,4>: Cost 3 vext3 <4,u,5,1>, 
<6,7,4,5> + 2725213051U, // <1,6,7,5>: Cost 3 vext3 <6,7,5,1>, <6,7,5,1> + 2724844426U, // <1,6,7,6>: Cost 3 vext3 <6,7,0,1>, <6,7,6,7> + 4035956022U, // <1,6,7,7>: Cost 4 vzipr <1,5,1,7>, RHS + 1651692438U, // <1,6,7,u>: Cost 2 vext3 <6,7,u,1>, <6,7,u,1> + 1651766175U, // <1,6,u,0>: Cost 2 vext3 <6,u,0,1>, <6,u,0,1> + 2618701614U, // <1,6,u,1>: Cost 3 vext2 <0,2,1,6>, LHS + 3135663508U, // <1,6,u,2>: Cost 3 vtrnr LHS, <4,6,u,2> + 3692443580U, // <1,6,u,3>: Cost 4 vext2 <0,2,1,6>, + 2713932743U, // <1,6,u,4>: Cost 3 vext3 <4,u,5,1>, <6,u,4,5> + 2618701978U, // <1,6,u,5>: Cost 3 vext2 <0,2,1,6>, RHS + 2622683344U, // <1,6,u,6>: Cost 3 vext2 <0,u,1,6>, + 3087886266U, // <1,6,u,7>: Cost 3 vtrnr LHS, <2,6,3,7> + 1652356071U, // <1,6,u,u>: Cost 2 vext3 <6,u,u,1>, <6,u,u,1> + 2726171632U, // <1,7,0,0>: Cost 3 vext3 <7,0,0,1>, <7,0,0,1> + 2626666598U, // <1,7,0,1>: Cost 3 vext2 <1,5,1,7>, LHS + 3695100067U, // <1,7,0,2>: Cost 4 vext2 <0,6,1,7>, <0,2,0,1> + 3707044102U, // <1,7,0,3>: Cost 4 vext2 <2,6,1,7>, <0,3,2,1> + 2726466580U, // <1,7,0,4>: Cost 3 vext3 <7,0,4,1>, <7,0,4,1> + 3654921933U, // <1,7,0,5>: Cost 4 vext1 <5,1,7,0>, <5,1,7,0> + 2621358582U, // <1,7,0,6>: Cost 3 vext2 <0,6,1,7>, <0,6,1,7> + 2622022215U, // <1,7,0,7>: Cost 3 vext2 <0,7,1,7>, <0,7,1,7> + 2626667165U, // <1,7,0,u>: Cost 3 vext2 <1,5,1,7>, LHS + 2593128550U, // <1,7,1,0>: Cost 3 vext1 <7,1,7,1>, LHS + 2626667316U, // <1,7,1,1>: Cost 3 vext2 <1,5,1,7>, <1,1,1,1> + 3700409238U, // <1,7,1,2>: Cost 4 vext2 <1,5,1,7>, <1,2,3,0> + 2257294428U, // <1,7,1,3>: Cost 3 vrev <7,1,3,1> + 2593131830U, // <1,7,1,4>: Cost 3 vext1 <7,1,7,1>, RHS + 2626667646U, // <1,7,1,5>: Cost 3 vext2 <1,5,1,7>, <1,5,1,7> + 2627331279U, // <1,7,1,6>: Cost 3 vext2 <1,6,1,7>, <1,6,1,7> + 2593133696U, // <1,7,1,7>: Cost 3 vext1 <7,1,7,1>, <7,1,7,1> + 2628658545U, // <1,7,1,u>: Cost 3 vext2 <1,u,1,7>, <1,u,1,7> + 2587164774U, // <1,7,2,0>: Cost 3 vext1 <6,1,7,2>, LHS + 3701073445U, // <1,7,2,1>: Cost 4 vext2 <1,6,1,7>, <2,1,3,7> + 3700409960U, // <1,7,2,2>: Cost 4 vext2 <1,5,1,7>, <2,2,2,2> + 2638612134U, // <1,7,2,3>: Cost 3 vext2 <3,5,1,7>, <2,3,0,1> + 2587168054U, // <1,7,2,4>: Cost 3 vext1 <6,1,7,2>, RHS + 3706382167U, // <1,7,2,5>: Cost 4 vext2 <2,5,1,7>, <2,5,1,7> + 2587169192U, // <1,7,2,6>: Cost 3 vext1 <6,1,7,2>, <6,1,7,2> + 3660911610U, // <1,7,2,7>: Cost 4 vext1 <6,1,7,2>, <7,0,1,2> + 2587170606U, // <1,7,2,u>: Cost 3 vext1 <6,1,7,2>, LHS + 1507459174U, // <1,7,3,0>: Cost 2 vext1 <5,1,7,3>, LHS + 2569257984U, // <1,7,3,1>: Cost 3 vext1 <3,1,7,3>, <1,3,5,7> + 2581202536U, // <1,7,3,2>: Cost 3 vext1 <5,1,7,3>, <2,2,2,2> + 2569259294U, // <1,7,3,3>: Cost 3 vext1 <3,1,7,3>, <3,1,7,3> + 1507462454U, // <1,7,3,4>: Cost 2 vext1 <5,1,7,3>, RHS + 1507462864U, // <1,7,3,5>: Cost 2 vext1 <5,1,7,3>, <5,1,7,3> + 2581205498U, // <1,7,3,6>: Cost 3 vext1 <5,1,7,3>, <6,2,7,3> + 2581206010U, // <1,7,3,7>: Cost 3 vext1 <5,1,7,3>, <7,0,1,2> + 1507465006U, // <1,7,3,u>: Cost 2 vext1 <5,1,7,3>, LHS + 2728826164U, // <1,7,4,0>: Cost 3 vext3 <7,4,0,1>, <7,4,0,1> + 3654951732U, // <1,7,4,1>: Cost 4 vext1 <5,1,7,4>, <1,1,1,1> + 3330987094U, // <1,7,4,2>: Cost 4 vrev <7,1,2,4> + 3331060831U, // <1,7,4,3>: Cost 4 vrev <7,1,3,4> + 3787674971U, // <1,7,4,4>: Cost 4 vext3 <4,u,5,1>, <7,4,4,4> + 2626669878U, // <1,7,4,5>: Cost 3 vext2 <1,5,1,7>, RHS + 3785979241U, // <1,7,4,6>: Cost 4 vext3 <4,6,0,1>, <7,4,6,0> + 3787085176U, // <1,7,4,7>: Cost 4 vext3 <4,7,6,1>, <7,4,7,6> + 2626670121U, // <1,7,4,u>: Cost 3 vext2 <1,5,1,7>, RHS + 2569273446U, // <1,7,5,0>: Cost 3 vext1 
<3,1,7,5>, LHS + 2569274368U, // <1,7,5,1>: Cost 3 vext1 <3,1,7,5>, <1,3,5,7> + 3643016808U, // <1,7,5,2>: Cost 4 vext1 <3,1,7,5>, <2,2,2,2> + 2569275680U, // <1,7,5,3>: Cost 3 vext1 <3,1,7,5>, <3,1,7,5> + 2569276726U, // <1,7,5,4>: Cost 3 vext1 <3,1,7,5>, RHS + 4102034790U, // <1,7,5,5>: Cost 4 vtrnl <1,3,5,7>, <7,4,5,6> + 2651222067U, // <1,7,5,6>: Cost 3 vext2 <5,6,1,7>, <5,6,1,7> + 3899378998U, // <1,7,5,7>: Cost 4 vuzpr <1,1,5,7>, RHS + 2569279278U, // <1,7,5,u>: Cost 3 vext1 <3,1,7,5>, LHS + 2730153430U, // <1,7,6,0>: Cost 3 vext3 <7,6,0,1>, <7,6,0,1> + 2724845022U, // <1,7,6,1>: Cost 3 vext3 <6,7,0,1>, <7,6,1,0> + 3643025338U, // <1,7,6,2>: Cost 4 vext1 <3,1,7,6>, <2,6,3,7> + 3643025697U, // <1,7,6,3>: Cost 4 vext1 <3,1,7,6>, <3,1,7,6> + 3643026742U, // <1,7,6,4>: Cost 4 vext1 <3,1,7,6>, RHS + 3654971091U, // <1,7,6,5>: Cost 4 vext1 <5,1,7,6>, <5,1,7,6> + 3787675153U, // <1,7,6,6>: Cost 4 vext3 <4,u,5,1>, <7,6,6,6> + 2724845076U, // <1,7,6,7>: Cost 3 vext3 <6,7,0,1>, <7,6,7,0> + 2725508637U, // <1,7,6,u>: Cost 3 vext3 <6,u,0,1>, <7,6,u,0> + 2730817063U, // <1,7,7,0>: Cost 3 vext3 <7,7,0,1>, <7,7,0,1> + 3631088436U, // <1,7,7,1>: Cost 4 vext1 <1,1,7,7>, <1,1,1,1> + 3660949158U, // <1,7,7,2>: Cost 4 vext1 <6,1,7,7>, <2,3,0,1> + 3801904705U, // <1,7,7,3>: Cost 4 vext3 <7,3,0,1>, <7,7,3,0> + 3631090998U, // <1,7,7,4>: Cost 4 vext1 <1,1,7,7>, RHS + 2662503828U, // <1,7,7,5>: Cost 3 vext2 <7,5,1,7>, <7,5,1,7> + 3660951981U, // <1,7,7,6>: Cost 4 vext1 <6,1,7,7>, <6,1,7,7> + 2713933420U, // <1,7,7,7>: Cost 3 vext3 <4,u,5,1>, <7,7,7,7> + 2731406959U, // <1,7,7,u>: Cost 3 vext3 <7,7,u,1>, <7,7,u,1> + 1507500134U, // <1,7,u,0>: Cost 2 vext1 <5,1,7,u>, LHS + 2626672430U, // <1,7,u,1>: Cost 3 vext2 <1,5,1,7>, LHS + 2581243496U, // <1,7,u,2>: Cost 3 vext1 <5,1,7,u>, <2,2,2,2> + 2569300259U, // <1,7,u,3>: Cost 3 vext1 <3,1,7,u>, <3,1,7,u> + 1507503414U, // <1,7,u,4>: Cost 2 vext1 <5,1,7,u>, RHS + 1507503829U, // <1,7,u,5>: Cost 2 vext1 <5,1,7,u>, <5,1,7,u> + 2581246458U, // <1,7,u,6>: Cost 3 vext1 <5,1,7,u>, <6,2,7,3> + 2581246970U, // <1,7,u,7>: Cost 3 vext1 <5,1,7,u>, <7,0,1,2> + 1507505966U, // <1,7,u,u>: Cost 2 vext1 <5,1,7,u>, LHS + 1543643153U, // <1,u,0,0>: Cost 2 vext2 <0,0,1,u>, <0,0,1,u> + 1546297446U, // <1,u,0,1>: Cost 2 vext2 <0,4,1,u>, LHS + 2819448852U, // <1,u,0,2>: Cost 3 vuzpr LHS, <0,0,2,2> + 2619375876U, // <1,u,0,3>: Cost 3 vext2 <0,3,1,u>, <0,3,1,u> + 1546297685U, // <1,u,0,4>: Cost 2 vext2 <0,4,1,u>, <0,4,1,u> + 1658771190U, // <1,u,0,5>: Cost 2 vext3 , + 2736789248U, // <1,u,0,6>: Cost 3 vext3 , + 2659189376U, // <1,u,0,7>: Cost 3 vext2 <7,0,1,u>, <0,7,u,1> + 1546298013U, // <1,u,0,u>: Cost 2 vext2 <0,4,1,u>, LHS + 1483112550U, // <1,u,1,0>: Cost 2 vext1 <1,1,1,1>, LHS + 202162278U, // <1,u,1,1>: Cost 1 vdup1 LHS + 1616009006U, // <1,u,1,2>: Cost 2 vext3 <0,u,1,1>, LHS + 1745707110U, // <1,u,1,3>: Cost 2 vuzpr LHS, LHS + 1483115830U, // <1,u,1,4>: Cost 2 vext1 <1,1,1,1>, RHS + 2620040336U, // <1,u,1,5>: Cost 3 vext2 <0,4,1,u>, <1,5,3,7> + 3026622618U, // <1,u,1,6>: Cost 3 vtrnl <1,1,1,1>, RHS + 2958183752U, // <1,u,1,7>: Cost 3 vzipr <0,u,1,1>, RHS + 202162278U, // <1,u,1,u>: Cost 1 vdup1 LHS + 2819449750U, // <1,u,2,0>: Cost 3 vuzpr LHS, <1,2,3,0> + 2893207342U, // <1,u,2,1>: Cost 3 vzipl <1,2,3,0>, LHS + 2819448996U, // <1,u,2,2>: Cost 3 vuzpr LHS, <0,2,0,2> + 2819450482U, // <1,u,2,3>: Cost 3 vuzpr LHS, <2,2,3,3> + 2819449754U, // <1,u,2,4>: Cost 3 vuzpr LHS, <1,2,3,4> + 2893207706U, // <1,u,2,5>: Cost 3 vzipl <1,2,3,0>, RHS + 2819449036U, // <1,u,2,6>: Cost 3 vuzpr LHS, 
<0,2,4,6> + 2970799432U, // <1,u,2,7>: Cost 3 vzipr <3,0,1,2>, RHS + 2819449002U, // <1,u,2,u>: Cost 3 vuzpr LHS, <0,2,0,u> + 403931292U, // <1,u,3,0>: Cost 1 vext1 LHS, LHS + 1477673718U, // <1,u,3,1>: Cost 2 vext1 LHS, <1,0,3,2> + 115726126U, // <1,u,3,2>: Cost 1 vrev LHS + 2014102173U, // <1,u,3,3>: Cost 2 vtrnr LHS, LHS + 403934518U, // <1,u,3,4>: Cost 1 vext1 LHS, RHS + 1507536601U, // <1,u,3,5>: Cost 2 vext1 <5,1,u,3>, <5,1,u,3> + 1525453306U, // <1,u,3,6>: Cost 2 vext1 LHS, <6,2,7,3> + 2014105129U, // <1,u,3,7>: Cost 2 vtrnr LHS, RHS + 403937070U, // <1,u,3,u>: Cost 1 vext1 LHS, LHS + 2620042157U, // <1,u,4,0>: Cost 3 vext2 <0,4,1,u>, <4,0,u,1> + 2620042237U, // <1,u,4,1>: Cost 3 vext2 <0,4,1,u>, <4,1,u,0> + 2263217967U, // <1,u,4,2>: Cost 3 vrev + 2569341224U, // <1,u,4,3>: Cost 3 vext1 <3,1,u,4>, <3,1,u,4> + 2569342262U, // <1,u,4,4>: Cost 3 vext1 <3,1,u,4>, RHS + 1546300726U, // <1,u,4,5>: Cost 2 vext2 <0,4,1,u>, RHS + 2819449180U, // <1,u,4,6>: Cost 3 vuzpr LHS, <0,4,2,6> + 2724845649U, // <1,u,4,7>: Cost 3 vext3 <6,7,0,1>, + 1546300969U, // <1,u,4,u>: Cost 2 vext2 <0,4,1,u>, RHS + 2551431270U, // <1,u,5,0>: Cost 3 vext1 <0,1,u,5>, LHS + 2551432192U, // <1,u,5,1>: Cost 3 vext1 <0,1,u,5>, <1,3,5,7> + 3028293422U, // <1,u,5,2>: Cost 3 vtrnl <1,3,5,7>, LHS + 2955559068U, // <1,u,5,3>: Cost 3 vzipr <0,4,1,5>, LHS + 2551434550U, // <1,u,5,4>: Cost 3 vext1 <0,1,u,5>, RHS + 2895255706U, // <1,u,5,5>: Cost 3 vzipl <1,5,3,7>, RHS + 1616009370U, // <1,u,5,6>: Cost 2 vext3 <0,u,1,1>, RHS + 1745710390U, // <1,u,5,7>: Cost 2 vuzpr LHS, RHS + 1745710391U, // <1,u,5,u>: Cost 2 vuzpr LHS, RHS + 2653221159U, // <1,u,6,0>: Cost 3 vext2 <6,0,1,u>, <6,0,1,u> + 2725509303U, // <1,u,6,1>: Cost 3 vext3 <6,u,0,1>, + 2659193338U, // <1,u,6,2>: Cost 3 vext2 <7,0,1,u>, <6,2,7,3> + 2689751248U, // <1,u,6,3>: Cost 3 vext3 <0,u,1,1>, + 2867228774U, // <1,u,6,4>: Cost 3 vuzpr LHS, <5,6,7,4> + 3764820194U, // <1,u,6,5>: Cost 4 vext3 <1,1,1,1>, + 2657202957U, // <1,u,6,6>: Cost 3 vext2 <6,6,1,u>, <6,6,1,u> + 2819450810U, // <1,u,6,7>: Cost 3 vuzpr LHS, <2,6,3,7> + 2819450811U, // <1,u,6,u>: Cost 3 vuzpr LHS, <2,6,3,u> + 1585452032U, // <1,u,7,0>: Cost 2 vext2 <7,0,1,u>, <7,0,1,u> + 2557420340U, // <1,u,7,1>: Cost 3 vext1 <1,1,u,7>, <1,1,1,1> + 2569365158U, // <1,u,7,2>: Cost 3 vext1 <3,1,u,7>, <2,3,0,1> + 2569365803U, // <1,u,7,3>: Cost 3 vext1 <3,1,u,7>, <3,1,u,7> + 2557422902U, // <1,u,7,4>: Cost 3 vext1 <1,1,u,7>, RHS + 2662512021U, // <1,u,7,5>: Cost 3 vext2 <7,5,1,u>, <7,5,1,u> + 2724845884U, // <1,u,7,6>: Cost 3 vext3 <6,7,0,1>, + 2659194476U, // <1,u,7,7>: Cost 3 vext2 <7,0,1,u>, <7,7,7,7> + 1590761096U, // <1,u,7,u>: Cost 2 vext2 <7,u,1,u>, <7,u,1,u> + 403972257U, // <1,u,u,0>: Cost 1 vext1 LHS, LHS + 202162278U, // <1,u,u,1>: Cost 1 vdup1 LHS + 115767091U, // <1,u,u,2>: Cost 1 vrev LHS + 1745707677U, // <1,u,u,3>: Cost 2 vuzpr LHS, LHS + 403975478U, // <1,u,u,4>: Cost 1 vext1 LHS, RHS + 1546303642U, // <1,u,u,5>: Cost 2 vext2 <0,4,1,u>, RHS + 1616009613U, // <1,u,u,6>: Cost 2 vext3 <0,u,1,1>, RHS + 1745710633U, // <1,u,u,7>: Cost 2 vuzpr LHS, RHS + 403978030U, // <1,u,u,u>: Cost 1 vext1 LHS, LHS + 2551463936U, // <2,0,0,0>: Cost 3 vext1 <0,2,0,0>, <0,0,0,0> + 2685698058U, // <2,0,0,1>: Cost 3 vext3 <0,2,0,2>, <0,0,1,1> + 1610776596U, // <2,0,0,2>: Cost 2 vext3 <0,0,2,2>, <0,0,2,2> + 2619384069U, // <2,0,0,3>: Cost 3 vext2 <0,3,2,0>, <0,3,2,0> + 2551467318U, // <2,0,0,4>: Cost 3 vext1 <0,2,0,0>, RHS + 3899836596U, // <2,0,0,5>: Cost 4 vuzpr <1,2,3,0>, <3,0,4,5> + 2621374968U, // <2,0,0,6>: Cost 3 vext2 
<0,6,2,0>, <0,6,2,0> + 4168271334U, // <2,0,0,7>: Cost 4 vtrnr <1,2,3,0>, <2,0,5,7> + 1611219018U, // <2,0,0,u>: Cost 2 vext3 <0,0,u,2>, <0,0,u,2> + 2551472138U, // <2,0,1,0>: Cost 3 vext1 <0,2,0,1>, <0,0,1,1> + 2690564186U, // <2,0,1,1>: Cost 3 vext3 <1,0,3,2>, <0,1,1,0> + 1611956326U, // <2,0,1,2>: Cost 2 vext3 <0,2,0,2>, LHS + 2826092646U, // <2,0,1,3>: Cost 3 vuzpr <1,2,3,0>, LHS + 2551475510U, // <2,0,1,4>: Cost 3 vext1 <0,2,0,1>, RHS + 3692463248U, // <2,0,1,5>: Cost 4 vext2 <0,2,2,0>, <1,5,3,7> + 2587308473U, // <2,0,1,6>: Cost 3 vext1 <6,2,0,1>, <6,2,0,1> + 3661050874U, // <2,0,1,7>: Cost 4 vext1 <6,2,0,1>, <7,0,1,2> + 1611956380U, // <2,0,1,u>: Cost 2 vext3 <0,2,0,2>, LHS + 1477738598U, // <2,0,2,0>: Cost 2 vext1 <0,2,0,2>, LHS + 2551481078U, // <2,0,2,1>: Cost 3 vext1 <0,2,0,2>, <1,0,3,2> + 2551481796U, // <2,0,2,2>: Cost 3 vext1 <0,2,0,2>, <2,0,2,0> + 2551482518U, // <2,0,2,3>: Cost 3 vext1 <0,2,0,2>, <3,0,1,2> + 1477741878U, // <2,0,2,4>: Cost 2 vext1 <0,2,0,2>, RHS + 2551484112U, // <2,0,2,5>: Cost 3 vext1 <0,2,0,2>, <5,1,7,3> + 2551484759U, // <2,0,2,6>: Cost 3 vext1 <0,2,0,2>, <6,0,7,2> + 2551485434U, // <2,0,2,7>: Cost 3 vext1 <0,2,0,2>, <7,0,1,2> + 1477744430U, // <2,0,2,u>: Cost 2 vext1 <0,2,0,2>, LHS + 2953625600U, // <2,0,3,0>: Cost 3 vzipr LHS, <0,0,0,0> + 2953627302U, // <2,0,3,1>: Cost 3 vzipr LHS, <2,3,0,1> + 2953625764U, // <2,0,3,2>: Cost 3 vzipr LHS, <0,2,0,2> + 4027369695U, // <2,0,3,3>: Cost 4 vzipr LHS, <3,1,0,3> + 3625233718U, // <2,0,3,4>: Cost 4 vext1 <0,2,0,3>, RHS + 3899836110U, // <2,0,3,5>: Cost 4 vuzpr <1,2,3,0>, <2,3,4,5> + 4032012618U, // <2,0,3,6>: Cost 4 vzipr LHS, <0,4,0,6> + 3899835392U, // <2,0,3,7>: Cost 4 vuzpr <1,2,3,0>, <1,3,5,7> + 2953625770U, // <2,0,3,u>: Cost 3 vzipr LHS, <0,2,0,u> + 2551496806U, // <2,0,4,0>: Cost 3 vext1 <0,2,0,4>, LHS + 2685698386U, // <2,0,4,1>: Cost 3 vext3 <0,2,0,2>, <0,4,1,5> + 2685698396U, // <2,0,4,2>: Cost 3 vext3 <0,2,0,2>, <0,4,2,6> + 3625240726U, // <2,0,4,3>: Cost 4 vext1 <0,2,0,4>, <3,0,1,2> + 2551500086U, // <2,0,4,4>: Cost 3 vext1 <0,2,0,4>, RHS + 2618723638U, // <2,0,4,5>: Cost 3 vext2 <0,2,2,0>, RHS + 2765409590U, // <2,0,4,6>: Cost 3 vuzpl <2,3,0,1>, RHS + 3799990664U, // <2,0,4,7>: Cost 4 vext3 <7,0,1,2>, <0,4,7,5> + 2685698450U, // <2,0,4,u>: Cost 3 vext3 <0,2,0,2>, <0,4,u,6> + 3625246822U, // <2,0,5,0>: Cost 4 vext1 <0,2,0,5>, LHS + 3289776304U, // <2,0,5,1>: Cost 4 vrev <0,2,1,5> + 2690564526U, // <2,0,5,2>: Cost 3 vext3 <1,0,3,2>, <0,5,2,7> + 3289923778U, // <2,0,5,3>: Cost 4 vrev <0,2,3,5> + 2216255691U, // <2,0,5,4>: Cost 3 vrev <0,2,4,5> + 3726307332U, // <2,0,5,5>: Cost 4 vext2 <5,u,2,0>, <5,5,5,5> + 3726307426U, // <2,0,5,6>: Cost 4 vext2 <5,u,2,0>, <5,6,7,0> + 2826095926U, // <2,0,5,7>: Cost 3 vuzpr <1,2,3,0>, RHS + 2216550639U, // <2,0,5,u>: Cost 3 vrev <0,2,u,5> + 4162420736U, // <2,0,6,0>: Cost 4 vtrnr <0,2,4,6>, <0,0,0,0> + 2901885030U, // <2,0,6,1>: Cost 3 vzipl <2,6,3,7>, LHS + 2685698559U, // <2,0,6,2>: Cost 3 vext3 <0,2,0,2>, <0,6,2,7> + 3643173171U, // <2,0,6,3>: Cost 4 vext1 <3,2,0,6>, <3,2,0,6> + 2216263884U, // <2,0,6,4>: Cost 3 vrev <0,2,4,6> + 3730289341U, // <2,0,6,5>: Cost 4 vext2 <6,5,2,0>, <6,5,2,0> + 3726308152U, // <2,0,6,6>: Cost 4 vext2 <5,u,2,0>, <6,6,6,6> + 3899836346U, // <2,0,6,7>: Cost 4 vuzpr <1,2,3,0>, <2,6,3,7> + 2216558832U, // <2,0,6,u>: Cost 3 vrev <0,2,u,6> + 2659202049U, // <2,0,7,0>: Cost 3 vext2 <7,0,2,0>, <7,0,2,0> + 3726308437U, // <2,0,7,1>: Cost 4 vext2 <5,u,2,0>, <7,1,2,3> + 2726249034U, // <2,0,7,2>: Cost 3 vext3 <7,0,1,2>, <0,7,2,1> + 3734934772U, 
// <2,0,7,3>: Cost 4 vext2 <7,3,2,0>, <7,3,2,0> + 3726308710U, // <2,0,7,4>: Cost 4 vext2 <5,u,2,0>, <7,4,5,6> + 3726308814U, // <2,0,7,5>: Cost 4 vext2 <5,u,2,0>, <7,5,u,2> + 3736925671U, // <2,0,7,6>: Cost 4 vext2 <7,6,2,0>, <7,6,2,0> + 3726308972U, // <2,0,7,7>: Cost 4 vext2 <5,u,2,0>, <7,7,7,7> + 2659202049U, // <2,0,7,u>: Cost 3 vext2 <7,0,2,0>, <7,0,2,0> + 1477787750U, // <2,0,u,0>: Cost 2 vext1 <0,2,0,u>, LHS + 2953668262U, // <2,0,u,1>: Cost 3 vzipr LHS, <2,3,0,1> + 1611956893U, // <2,0,u,2>: Cost 2 vext3 <0,2,0,2>, LHS + 2551531670U, // <2,0,u,3>: Cost 3 vext1 <0,2,0,u>, <3,0,1,2> + 1477791030U, // <2,0,u,4>: Cost 2 vext1 <0,2,0,u>, RHS + 2618726554U, // <2,0,u,5>: Cost 3 vext2 <0,2,2,0>, RHS + 2765412506U, // <2,0,u,6>: Cost 3 vuzpl <2,3,0,1>, RHS + 2826096169U, // <2,0,u,7>: Cost 3 vuzpr <1,2,3,0>, RHS + 1611956947U, // <2,0,u,u>: Cost 2 vext3 <0,2,0,2>, LHS + 2569453670U, // <2,1,0,0>: Cost 3 vext1 <3,2,1,0>, LHS + 2619392102U, // <2,1,0,1>: Cost 3 vext2 <0,3,2,1>, LHS + 3759440619U, // <2,1,0,2>: Cost 4 vext3 <0,2,0,2>, <1,0,2,0> + 1616823030U, // <2,1,0,3>: Cost 2 vext3 <1,0,3,2>, <1,0,3,2> + 2569456950U, // <2,1,0,4>: Cost 3 vext1 <3,2,1,0>, RHS + 2690712328U, // <2,1,0,5>: Cost 3 vext3 <1,0,5,2>, <1,0,5,2> + 3661115841U, // <2,1,0,6>: Cost 4 vext1 <6,2,1,0>, <6,2,1,0> + 2622046794U, // <2,1,0,7>: Cost 3 vext2 <0,7,2,1>, <0,7,2,1> + 1617191715U, // <2,1,0,u>: Cost 2 vext3 <1,0,u,2>, <1,0,u,2> + 2551545958U, // <2,1,1,0>: Cost 3 vext1 <0,2,1,1>, LHS + 2685698868U, // <2,1,1,1>: Cost 3 vext3 <0,2,0,2>, <1,1,1,1> + 2628682646U, // <2,1,1,2>: Cost 3 vext2 <1,u,2,1>, <1,2,3,0> + 2685698888U, // <2,1,1,3>: Cost 3 vext3 <0,2,0,2>, <1,1,3,3> + 2551549238U, // <2,1,1,4>: Cost 3 vext1 <0,2,1,1>, RHS + 3693134992U, // <2,1,1,5>: Cost 4 vext2 <0,3,2,1>, <1,5,3,7> + 3661124034U, // <2,1,1,6>: Cost 4 vext1 <6,2,1,1>, <6,2,1,1> + 3625292794U, // <2,1,1,7>: Cost 4 vext1 <0,2,1,1>, <7,0,1,2> + 2685698933U, // <2,1,1,u>: Cost 3 vext3 <0,2,0,2>, <1,1,u,3> + 2551554150U, // <2,1,2,0>: Cost 3 vext1 <0,2,1,2>, LHS + 3893649571U, // <2,1,2,1>: Cost 4 vuzpr <0,2,0,1>, <0,2,0,1> + 2551555688U, // <2,1,2,2>: Cost 3 vext1 <0,2,1,2>, <2,2,2,2> + 2685698966U, // <2,1,2,3>: Cost 3 vext3 <0,2,0,2>, <1,2,3,0> + 2551557430U, // <2,1,2,4>: Cost 3 vext1 <0,2,1,2>, RHS + 3763422123U, // <2,1,2,5>: Cost 4 vext3 <0,u,0,2>, <1,2,5,3> + 3693135802U, // <2,1,2,6>: Cost 4 vext2 <0,3,2,1>, <2,6,3,7> + 2726249402U, // <2,1,2,7>: Cost 3 vext3 <7,0,1,2>, <1,2,7,0> + 2685699011U, // <2,1,2,u>: Cost 3 vext3 <0,2,0,2>, <1,2,u,0> + 2551562342U, // <2,1,3,0>: Cost 3 vext1 <0,2,1,3>, LHS + 2953625610U, // <2,1,3,1>: Cost 3 vzipr LHS, <0,0,1,1> + 2953627798U, // <2,1,3,2>: Cost 3 vzipr LHS, <3,0,1,2> + 2953626584U, // <2,1,3,3>: Cost 3 vzipr LHS, <1,3,1,3> + 2551565622U, // <2,1,3,4>: Cost 3 vext1 <0,2,1,3>, RHS + 2953625938U, // <2,1,3,5>: Cost 3 vzipr LHS, <0,4,1,5> + 2587398596U, // <2,1,3,6>: Cost 3 vext1 <6,2,1,3>, <6,2,1,3> + 4032013519U, // <2,1,3,7>: Cost 4 vzipr LHS, <1,6,1,7> + 2953625617U, // <2,1,3,u>: Cost 3 vzipr LHS, <0,0,1,u> + 2690565154U, // <2,1,4,0>: Cost 3 vext3 <1,0,3,2>, <1,4,0,5> + 3625313270U, // <2,1,4,1>: Cost 4 vext1 <0,2,1,4>, <1,3,4,6> + 3771532340U, // <2,1,4,2>: Cost 4 vext3 <2,2,2,2>, <1,4,2,5> + 1148404634U, // <2,1,4,3>: Cost 2 vrev <1,2,3,4> + 3625315638U, // <2,1,4,4>: Cost 4 vext1 <0,2,1,4>, RHS + 2619395382U, // <2,1,4,5>: Cost 3 vext2 <0,3,2,1>, RHS + 3837242678U, // <2,1,4,6>: Cost 4 vuzpl <2,0,1,2>, RHS + 3799991394U, // <2,1,4,7>: Cost 4 vext3 <7,0,1,2>, <1,4,7,6> + 1148773319U, // 
<2,1,4,u>: Cost 2 vrev <1,2,u,4> + 2551578726U, // <2,1,5,0>: Cost 3 vext1 <0,2,1,5>, LHS + 2551579648U, // <2,1,5,1>: Cost 3 vext1 <0,2,1,5>, <1,3,5,7> + 3625321952U, // <2,1,5,2>: Cost 4 vext1 <0,2,1,5>, <2,0,5,1> + 2685699216U, // <2,1,5,3>: Cost 3 vext3 <0,2,0,2>, <1,5,3,7> + 2551582006U, // <2,1,5,4>: Cost 3 vext1 <0,2,1,5>, RHS + 3740913668U, // <2,1,5,5>: Cost 4 vext2 , <5,5,5,5> + 3661156806U, // <2,1,5,6>: Cost 4 vext1 <6,2,1,5>, <6,2,1,5> + 3893652790U, // <2,1,5,7>: Cost 4 vuzpr <0,2,0,1>, RHS + 2685699261U, // <2,1,5,u>: Cost 3 vext3 <0,2,0,2>, <1,5,u,7> + 2551586918U, // <2,1,6,0>: Cost 3 vext1 <0,2,1,6>, LHS + 3625329398U, // <2,1,6,1>: Cost 4 vext1 <0,2,1,6>, <1,0,3,2> + 2551588794U, // <2,1,6,2>: Cost 3 vext1 <0,2,1,6>, <2,6,3,7> + 3088679014U, // <2,1,6,3>: Cost 3 vtrnr <0,2,4,6>, LHS + 2551590198U, // <2,1,6,4>: Cost 3 vext1 <0,2,1,6>, RHS + 4029382994U, // <2,1,6,5>: Cost 4 vzipr <0,4,2,6>, <0,4,1,5> + 3625333560U, // <2,1,6,6>: Cost 4 vext1 <0,2,1,6>, <6,6,6,6> + 3731624800U, // <2,1,6,7>: Cost 4 vext2 <6,7,2,1>, <6,7,2,1> + 2551592750U, // <2,1,6,u>: Cost 3 vext1 <0,2,1,6>, LHS + 2622051322U, // <2,1,7,0>: Cost 3 vext2 <0,7,2,1>, <7,0,1,2> + 3733615699U, // <2,1,7,1>: Cost 4 vext2 <7,1,2,1>, <7,1,2,1> + 3795125538U, // <2,1,7,2>: Cost 4 vext3 <6,1,7,2>, <1,7,2,0> + 2222171037U, // <2,1,7,3>: Cost 3 vrev <1,2,3,7> + 3740915046U, // <2,1,7,4>: Cost 4 vext2 , <7,4,5,6> + 3296060335U, // <2,1,7,5>: Cost 4 vrev <1,2,5,7> + 3736933864U, // <2,1,7,6>: Cost 4 vext2 <7,6,2,1>, <7,6,2,1> + 3805300055U, // <2,1,7,7>: Cost 4 vext3 <7,u,1,2>, <1,7,7,u> + 2669827714U, // <2,1,7,u>: Cost 3 vext2 , <7,u,1,2> + 2551603302U, // <2,1,u,0>: Cost 3 vext1 <0,2,1,u>, LHS + 2953666570U, // <2,1,u,1>: Cost 3 vzipr LHS, <0,0,1,1> + 2953668758U, // <2,1,u,2>: Cost 3 vzipr LHS, <3,0,1,2> + 1148437406U, // <2,1,u,3>: Cost 2 vrev <1,2,3,u> + 2551606582U, // <2,1,u,4>: Cost 3 vext1 <0,2,1,u>, RHS + 2953666898U, // <2,1,u,5>: Cost 3 vzipr LHS, <0,4,1,5> + 2587398596U, // <2,1,u,6>: Cost 3 vext1 <6,2,1,3>, <6,2,1,3> + 2669828370U, // <2,1,u,7>: Cost 3 vext2 , + 1148806091U, // <2,1,u,u>: Cost 2 vrev <1,2,u,u> + 1543667732U, // <2,2,0,0>: Cost 2 vext2 <0,0,2,2>, <0,0,2,2> + 1548976230U, // <2,2,0,1>: Cost 2 vext2 <0,u,2,2>, LHS + 2685699524U, // <2,2,0,2>: Cost 3 vext3 <0,2,0,2>, <2,0,2,0> + 2685699535U, // <2,2,0,3>: Cost 3 vext3 <0,2,0,2>, <2,0,3,2> + 2551614774U, // <2,2,0,4>: Cost 3 vext1 <0,2,2,0>, RHS + 3704422830U, // <2,2,0,5>: Cost 4 vext2 <2,2,2,2>, <0,5,2,7> + 3893657642U, // <2,2,0,6>: Cost 4 vuzpr <0,2,0,2>, <0,0,4,6> + 3770574323U, // <2,2,0,7>: Cost 4 vext3 <2,0,7,2>, <2,0,7,2> + 1548976796U, // <2,2,0,u>: Cost 2 vext2 <0,u,2,2>, <0,u,2,2> + 2622718710U, // <2,2,1,0>: Cost 3 vext2 <0,u,2,2>, <1,0,3,2> + 2622718772U, // <2,2,1,1>: Cost 3 vext2 <0,u,2,2>, <1,1,1,1> + 2622718870U, // <2,2,1,2>: Cost 3 vext2 <0,u,2,2>, <1,2,3,0> + 2819915878U, // <2,2,1,3>: Cost 3 vuzpr <0,2,0,2>, LHS + 3625364790U, // <2,2,1,4>: Cost 4 vext1 <0,2,2,1>, RHS + 2622719120U, // <2,2,1,5>: Cost 3 vext2 <0,u,2,2>, <1,5,3,7> + 3760031292U, // <2,2,1,6>: Cost 4 vext3 <0,2,u,2>, <2,1,6,3> + 3667170468U, // <2,2,1,7>: Cost 4 vext1 <7,2,2,1>, <7,2,2,1> + 2819915883U, // <2,2,1,u>: Cost 3 vuzpr <0,2,0,2>, LHS + 1489829990U, // <2,2,2,0>: Cost 2 vext1 <2,2,2,2>, LHS + 2563572470U, // <2,2,2,1>: Cost 3 vext1 <2,2,2,2>, <1,0,3,2> + 269271142U, // <2,2,2,2>: Cost 1 vdup2 LHS + 2685699698U, // <2,2,2,3>: Cost 3 vext3 <0,2,0,2>, <2,2,3,3> + 1489833270U, // <2,2,2,4>: Cost 2 vext1 <2,2,2,2>, RHS + 2685699720U, // <2,2,2,5>: 
Cost 3 vext3 <0,2,0,2>, <2,2,5,7> + 2622719930U, // <2,2,2,6>: Cost 3 vext2 <0,u,2,2>, <2,6,3,7> + 2593436837U, // <2,2,2,7>: Cost 3 vext1 <7,2,2,2>, <7,2,2,2> + 269271142U, // <2,2,2,u>: Cost 1 vdup2 LHS + 2685699750U, // <2,2,3,0>: Cost 3 vext3 <0,2,0,2>, <2,3,0,1> + 2690565806U, // <2,2,3,1>: Cost 3 vext3 <1,0,3,2>, <2,3,1,0> + 2953627240U, // <2,2,3,2>: Cost 3 vzipr LHS, <2,2,2,2> + 1879883878U, // <2,2,3,3>: Cost 2 vzipr LHS, LHS + 2685699790U, // <2,2,3,4>: Cost 3 vext3 <0,2,0,2>, <2,3,4,5> + 3893659342U, // <2,2,3,5>: Cost 4 vuzpr <0,2,0,2>, <2,3,4,5> + 2958270812U, // <2,2,3,6>: Cost 3 vzipr LHS, <0,4,2,6> + 2593445030U, // <2,2,3,7>: Cost 3 vext1 <7,2,2,3>, <7,2,2,3> + 1879883883U, // <2,2,3,u>: Cost 2 vzipr LHS, LHS + 2551644262U, // <2,2,4,0>: Cost 3 vext1 <0,2,2,4>, LHS + 3625386742U, // <2,2,4,1>: Cost 4 vext1 <0,2,2,4>, <1,0,3,2> + 2551645902U, // <2,2,4,2>: Cost 3 vext1 <0,2,2,4>, <2,3,4,5> + 3759441686U, // <2,2,4,3>: Cost 4 vext3 <0,2,0,2>, <2,4,3,5> + 2551647542U, // <2,2,4,4>: Cost 3 vext1 <0,2,2,4>, RHS + 1548979510U, // <2,2,4,5>: Cost 2 vext2 <0,u,2,2>, RHS + 2764901686U, // <2,2,4,6>: Cost 3 vuzpl <2,2,2,2>, RHS + 3667195047U, // <2,2,4,7>: Cost 4 vext1 <7,2,2,4>, <7,2,2,4> + 1548979753U, // <2,2,4,u>: Cost 2 vext2 <0,u,2,2>, RHS + 3696463432U, // <2,2,5,0>: Cost 4 vext2 <0,u,2,2>, <5,0,1,2> + 2617413328U, // <2,2,5,1>: Cost 3 vext2 <0,0,2,2>, <5,1,7,3> + 2685699936U, // <2,2,5,2>: Cost 3 vext3 <0,2,0,2>, <2,5,2,7> + 4027383910U, // <2,2,5,3>: Cost 4 vzipr <0,1,2,5>, LHS + 2228201085U, // <2,2,5,4>: Cost 3 vrev <2,2,4,5> + 2617413636U, // <2,2,5,5>: Cost 3 vext2 <0,0,2,2>, <5,5,5,5> + 2617413730U, // <2,2,5,6>: Cost 3 vext2 <0,0,2,2>, <5,6,7,0> + 2819919158U, // <2,2,5,7>: Cost 3 vuzpr <0,2,0,2>, RHS + 2819919159U, // <2,2,5,u>: Cost 3 vuzpr <0,2,0,2>, RHS + 3625402554U, // <2,2,6,0>: Cost 4 vext1 <0,2,2,6>, <0,2,2,6> + 3760031652U, // <2,2,6,1>: Cost 4 vext3 <0,2,u,2>, <2,6,1,3> + 2617414138U, // <2,2,6,2>: Cost 3 vext2 <0,0,2,2>, <6,2,7,3> + 2685700026U, // <2,2,6,3>: Cost 3 vext3 <0,2,0,2>, <2,6,3,7> + 3625405750U, // <2,2,6,4>: Cost 4 vext1 <0,2,2,6>, RHS + 3760031692U, // <2,2,6,5>: Cost 4 vext3 <0,2,u,2>, <2,6,5,7> + 3088679116U, // <2,2,6,6>: Cost 3 vtrnr <0,2,4,6>, <0,2,4,6> + 2657891169U, // <2,2,6,7>: Cost 3 vext2 <6,7,2,2>, <6,7,2,2> + 2685700071U, // <2,2,6,u>: Cost 3 vext3 <0,2,0,2>, <2,6,u,7> + 2726250474U, // <2,2,7,0>: Cost 3 vext3 <7,0,1,2>, <2,7,0,1> + 3704427616U, // <2,2,7,1>: Cost 4 vext2 <2,2,2,2>, <7,1,3,5> + 2660545701U, // <2,2,7,2>: Cost 3 vext2 <7,2,2,2>, <7,2,2,2> + 4030718054U, // <2,2,7,3>: Cost 4 vzipr <0,6,2,7>, LHS + 2617415014U, // <2,2,7,4>: Cost 3 vext2 <0,0,2,2>, <7,4,5,6> + 3302033032U, // <2,2,7,5>: Cost 4 vrev <2,2,5,7> + 3661246929U, // <2,2,7,6>: Cost 4 vext1 <6,2,2,7>, <6,2,2,7> + 2617415276U, // <2,2,7,7>: Cost 3 vext2 <0,0,2,2>, <7,7,7,7> + 2731558962U, // <2,2,7,u>: Cost 3 vext3 <7,u,1,2>, <2,7,u,1> + 1489829990U, // <2,2,u,0>: Cost 2 vext1 <2,2,2,2>, LHS + 1548982062U, // <2,2,u,1>: Cost 2 vext2 <0,u,2,2>, LHS + 269271142U, // <2,2,u,2>: Cost 1 vdup2 LHS + 1879924838U, // <2,2,u,3>: Cost 2 vzipr LHS, LHS + 1489833270U, // <2,2,u,4>: Cost 2 vext1 <2,2,2,2>, RHS + 1548982426U, // <2,2,u,5>: Cost 2 vext2 <0,u,2,2>, RHS + 2953666908U, // <2,2,u,6>: Cost 3 vzipr LHS, <0,4,2,6> + 2819919401U, // <2,2,u,7>: Cost 3 vuzpr <0,2,0,2>, RHS + 269271142U, // <2,2,u,u>: Cost 1 vdup2 LHS + 1544339456U, // <2,3,0,0>: Cost 2 vext2 LHS, <0,0,0,0> + 470597734U, // <2,3,0,1>: Cost 1 vext2 LHS, LHS + 1548984484U, // <2,3,0,2>: Cost 2 vext2 
LHS, <0,2,0,2> + 2619408648U, // <2,3,0,3>: Cost 3 vext2 <0,3,2,3>, <0,3,2,3> + 1548984658U, // <2,3,0,4>: Cost 2 vext2 LHS, <0,4,1,5> + 2665857454U, // <2,3,0,5>: Cost 3 vext2 LHS, <0,5,2,7> + 2622726655U, // <2,3,0,6>: Cost 3 vext2 LHS, <0,6,2,7> + 2593494188U, // <2,3,0,7>: Cost 3 vext1 <7,2,3,0>, <7,2,3,0> + 470598301U, // <2,3,0,u>: Cost 1 vext2 LHS, LHS + 1544340214U, // <2,3,1,0>: Cost 2 vext2 LHS, <1,0,3,2> + 1544340276U, // <2,3,1,1>: Cost 2 vext2 LHS, <1,1,1,1> + 1544340374U, // <2,3,1,2>: Cost 2 vext2 LHS, <1,2,3,0> + 1548985304U, // <2,3,1,3>: Cost 2 vext2 LHS, <1,3,1,3> + 2551696694U, // <2,3,1,4>: Cost 3 vext1 <0,2,3,1>, RHS + 1548985488U, // <2,3,1,5>: Cost 2 vext2 LHS, <1,5,3,7> + 2622727375U, // <2,3,1,6>: Cost 3 vext2 LHS, <1,6,1,7> + 2665858347U, // <2,3,1,7>: Cost 3 vext2 LHS, <1,7,3,0> + 1548985709U, // <2,3,1,u>: Cost 2 vext2 LHS, <1,u,1,3> + 2622727613U, // <2,3,2,0>: Cost 3 vext2 LHS, <2,0,1,2> + 2622727711U, // <2,3,2,1>: Cost 3 vext2 LHS, <2,1,3,1> + 1544341096U, // <2,3,2,2>: Cost 2 vext2 LHS, <2,2,2,2> + 1544341158U, // <2,3,2,3>: Cost 2 vext2 LHS, <2,3,0,1> + 2622727958U, // <2,3,2,4>: Cost 3 vext2 LHS, <2,4,3,5> + 2622728032U, // <2,3,2,5>: Cost 3 vext2 LHS, <2,5,2,7> + 1548986298U, // <2,3,2,6>: Cost 2 vext2 LHS, <2,6,3,7> + 2665859050U, // <2,3,2,7>: Cost 3 vext2 LHS, <2,7,0,1> + 1548986427U, // <2,3,2,u>: Cost 2 vext2 LHS, <2,u,0,1> + 1548986518U, // <2,3,3,0>: Cost 2 vext2 LHS, <3,0,1,2> + 2622728415U, // <2,3,3,1>: Cost 3 vext2 LHS, <3,1,0,3> + 1489913458U, // <2,3,3,2>: Cost 2 vext1 <2,2,3,3>, <2,2,3,3> + 1544341916U, // <2,3,3,3>: Cost 2 vext2 LHS, <3,3,3,3> + 1548986882U, // <2,3,3,4>: Cost 2 vext2 LHS, <3,4,5,6> + 2665859632U, // <2,3,3,5>: Cost 3 vext2 LHS, <3,5,1,7> + 2234304870U, // <2,3,3,6>: Cost 3 vrev <3,2,6,3> + 2958271632U, // <2,3,3,7>: Cost 3 vzipr LHS, <1,5,3,7> + 1548987166U, // <2,3,3,u>: Cost 2 vext2 LHS, <3,u,1,2> + 1483948134U, // <2,3,4,0>: Cost 2 vext1 <1,2,3,4>, LHS + 1483948954U, // <2,3,4,1>: Cost 2 vext1 <1,2,3,4>, <1,2,3,4> + 2622729276U, // <2,3,4,2>: Cost 3 vext2 LHS, <4,2,6,0> + 2557692054U, // <2,3,4,3>: Cost 3 vext1 <1,2,3,4>, <3,0,1,2> + 1483951414U, // <2,3,4,4>: Cost 2 vext1 <1,2,3,4>, RHS + 470601014U, // <2,3,4,5>: Cost 1 vext2 LHS, RHS + 1592118644U, // <2,3,4,6>: Cost 2 vext2 LHS, <4,6,4,6> + 2593526960U, // <2,3,4,7>: Cost 3 vext1 <7,2,3,4>, <7,2,3,4> + 470601257U, // <2,3,4,u>: Cost 1 vext2 LHS, RHS + 2551726182U, // <2,3,5,0>: Cost 3 vext1 <0,2,3,5>, LHS + 1592118992U, // <2,3,5,1>: Cost 2 vext2 LHS, <5,1,7,3> + 2665860862U, // <2,3,5,2>: Cost 3 vext2 LHS, <5,2,3,4> + 2551728642U, // <2,3,5,3>: Cost 3 vext1 <0,2,3,5>, <3,4,5,6> + 1592119238U, // <2,3,5,4>: Cost 2 vext2 LHS, <5,4,7,6> + 1592119300U, // <2,3,5,5>: Cost 2 vext2 LHS, <5,5,5,5> + 1592119394U, // <2,3,5,6>: Cost 2 vext2 LHS, <5,6,7,0> + 1592119464U, // <2,3,5,7>: Cost 2 vext2 LHS, <5,7,5,7> + 1592119545U, // <2,3,5,u>: Cost 2 vext2 LHS, <5,u,5,7> + 2622730529U, // <2,3,6,0>: Cost 3 vext2 LHS, <6,0,1,2> + 2557707164U, // <2,3,6,1>: Cost 3 vext1 <1,2,3,6>, <1,2,3,6> + 1592119802U, // <2,3,6,2>: Cost 2 vext2 LHS, <6,2,7,3> + 2665861682U, // <2,3,6,3>: Cost 3 vext2 LHS, <6,3,4,5> + 2622730893U, // <2,3,6,4>: Cost 3 vext2 LHS, <6,4,5,6> + 2665861810U, // <2,3,6,5>: Cost 3 vext2 LHS, <6,5,0,7> + 1592120120U, // <2,3,6,6>: Cost 2 vext2 LHS, <6,6,6,6> + 1592120142U, // <2,3,6,7>: Cost 2 vext2 LHS, <6,7,0,1> + 1592120223U, // <2,3,6,u>: Cost 2 vext2 LHS, <6,u,0,1> + 1592120314U, // <2,3,7,0>: Cost 2 vext2 LHS, <7,0,1,2> + 2659890261U, // <2,3,7,1>: Cost 3 vext2 
<7,1,2,3>, <7,1,2,3> + 2660553894U, // <2,3,7,2>: Cost 3 vext2 <7,2,2,3>, <7,2,2,3> + 2665862371U, // <2,3,7,3>: Cost 3 vext2 LHS, <7,3,0,1> + 1592120678U, // <2,3,7,4>: Cost 2 vext2 LHS, <7,4,5,6> + 2665862534U, // <2,3,7,5>: Cost 3 vext2 LHS, <7,5,0,2> + 2665862614U, // <2,3,7,6>: Cost 3 vext2 LHS, <7,6,0,1> + 1592120940U, // <2,3,7,7>: Cost 2 vext2 LHS, <7,7,7,7> + 1592120962U, // <2,3,7,u>: Cost 2 vext2 LHS, <7,u,1,2> + 1548990163U, // <2,3,u,0>: Cost 2 vext2 LHS, + 470603566U, // <2,3,u,1>: Cost 1 vext2 LHS, LHS + 1548990341U, // <2,3,u,2>: Cost 2 vext2 LHS, + 1548990396U, // <2,3,u,3>: Cost 2 vext2 LHS, + 1548990527U, // <2,3,u,4>: Cost 2 vext2 LHS, + 470603930U, // <2,3,u,5>: Cost 1 vext2 LHS, RHS + 1548990672U, // <2,3,u,6>: Cost 2 vext2 LHS, + 1592121600U, // <2,3,u,7>: Cost 2 vext2 LHS, + 470604133U, // <2,3,u,u>: Cost 1 vext2 LHS, LHS + 2617425942U, // <2,4,0,0>: Cost 3 vext2 <0,0,2,4>, <0,0,2,4> + 2618753126U, // <2,4,0,1>: Cost 3 vext2 <0,2,2,4>, LHS + 2618753208U, // <2,4,0,2>: Cost 3 vext2 <0,2,2,4>, <0,2,2,4> + 2619416841U, // <2,4,0,3>: Cost 3 vext2 <0,3,2,4>, <0,3,2,4> + 2587593628U, // <2,4,0,4>: Cost 3 vext1 <6,2,4,0>, <4,0,6,2> + 2712832914U, // <2,4,0,5>: Cost 3 vext3 <4,6,u,2>, <4,0,5,1> + 1634962332U, // <2,4,0,6>: Cost 2 vext3 <4,0,6,2>, <4,0,6,2> + 3799993252U, // <2,4,0,7>: Cost 4 vext3 <7,0,1,2>, <4,0,7,1> + 1634962332U, // <2,4,0,u>: Cost 2 vext3 <4,0,6,2>, <4,0,6,2> + 2619417334U, // <2,4,1,0>: Cost 3 vext2 <0,3,2,4>, <1,0,3,2> + 3692495668U, // <2,4,1,1>: Cost 4 vext2 <0,2,2,4>, <1,1,1,1> + 2625389466U, // <2,4,1,2>: Cost 3 vext2 <1,3,2,4>, <1,2,3,4> + 2826125414U, // <2,4,1,3>: Cost 3 vuzpr <1,2,3,4>, LHS + 3699794995U, // <2,4,1,4>: Cost 4 vext2 <1,4,2,4>, <1,4,2,4> + 3692496016U, // <2,4,1,5>: Cost 4 vext2 <0,2,2,4>, <1,5,3,7> + 3763424238U, // <2,4,1,6>: Cost 4 vext3 <0,u,0,2>, <4,1,6,3> + 3667317942U, // <2,4,1,7>: Cost 4 vext1 <7,2,4,1>, <7,2,4,1> + 2826125419U, // <2,4,1,u>: Cost 3 vuzpr <1,2,3,4>, LHS + 2629371336U, // <2,4,2,0>: Cost 3 vext2 <2,0,2,4>, <2,0,2,4> + 3699131946U, // <2,4,2,1>: Cost 4 vext2 <1,3,2,4>, <2,1,4,3> + 2630698602U, // <2,4,2,2>: Cost 3 vext2 <2,2,2,4>, <2,2,2,4> + 2618754766U, // <2,4,2,3>: Cost 3 vext2 <0,2,2,4>, <2,3,4,5> + 2826126234U, // <2,4,2,4>: Cost 3 vuzpr <1,2,3,4>, <1,2,3,4> + 2899119414U, // <2,4,2,5>: Cost 3 vzipl <2,2,2,2>, RHS + 3033337142U, // <2,4,2,6>: Cost 3 vtrnl <2,2,2,2>, RHS + 3800214597U, // <2,4,2,7>: Cost 4 vext3 <7,0,4,2>, <4,2,7,0> + 2899119657U, // <2,4,2,u>: Cost 3 vzipl <2,2,2,2>, RHS + 2635344033U, // <2,4,3,0>: Cost 3 vext2 <3,0,2,4>, <3,0,2,4> + 4032012325U, // <2,4,3,1>: Cost 4 vzipr LHS, <0,0,4,1> + 3692497228U, // <2,4,3,2>: Cost 4 vext2 <0,2,2,4>, <3,2,3,4> + 3692497308U, // <2,4,3,3>: Cost 4 vext2 <0,2,2,4>, <3,3,3,3> + 3001404624U, // <2,4,3,4>: Cost 3 vzipr LHS, <4,4,4,4> + 2953627342U, // <2,4,3,5>: Cost 3 vzipr LHS, <2,3,4,5> + 2953625804U, // <2,4,3,6>: Cost 3 vzipr LHS, <0,2,4,6> + 3899868160U, // <2,4,3,7>: Cost 4 vuzpr <1,2,3,4>, <1,3,5,7> + 2953625806U, // <2,4,3,u>: Cost 3 vzipr LHS, <0,2,4,u> + 2710916266U, // <2,4,4,0>: Cost 3 vext3 <4,4,0,2>, <4,4,0,2> + 3899869648U, // <2,4,4,1>: Cost 4 vuzpr <1,2,3,4>, <3,4,0,1> + 3899869658U, // <2,4,4,2>: Cost 4 vuzpr <1,2,3,4>, <3,4,1,2> + 3899868930U, // <2,4,4,3>: Cost 4 vuzpr <1,2,3,4>, <2,4,1,3> + 2712833232U, // <2,4,4,4>: Cost 3 vext3 <4,6,u,2>, <4,4,4,4> + 2618756406U, // <2,4,4,5>: Cost 3 vext2 <0,2,2,4>, RHS + 2765737270U, // <2,4,4,6>: Cost 3 vuzpl <2,3,4,5>, RHS + 4168304426U, // <2,4,4,7>: Cost 4 vtrnr <1,2,3,4>, <2,4,5,7> + 
2618756649U, // <2,4,4,u>: Cost 3 vext2 <0,2,2,4>, RHS + 2551800011U, // <2,4,5,0>: Cost 3 vext1 <0,2,4,5>, <0,2,4,5> + 2569716470U, // <2,4,5,1>: Cost 3 vext1 <3,2,4,5>, <1,0,3,2> + 2563745405U, // <2,4,5,2>: Cost 3 vext1 <2,2,4,5>, <2,2,4,5> + 2569718102U, // <2,4,5,3>: Cost 3 vext1 <3,2,4,5>, <3,2,4,5> + 2551803190U, // <2,4,5,4>: Cost 3 vext1 <0,2,4,5>, RHS + 3625545732U, // <2,4,5,5>: Cost 4 vext1 <0,2,4,5>, <5,5,5,5> + 1611959606U, // <2,4,5,6>: Cost 2 vext3 <0,2,0,2>, RHS + 2826128694U, // <2,4,5,7>: Cost 3 vuzpr <1,2,3,4>, RHS + 1611959624U, // <2,4,5,u>: Cost 2 vext3 <0,2,0,2>, RHS + 1478066278U, // <2,4,6,0>: Cost 2 vext1 <0,2,4,6>, LHS + 2551808758U, // <2,4,6,1>: Cost 3 vext1 <0,2,4,6>, <1,0,3,2> + 2551809516U, // <2,4,6,2>: Cost 3 vext1 <0,2,4,6>, <2,0,6,4> + 2551810198U, // <2,4,6,3>: Cost 3 vext1 <0,2,4,6>, <3,0,1,2> + 1478069558U, // <2,4,6,4>: Cost 2 vext1 <0,2,4,6>, RHS + 2901888310U, // <2,4,6,5>: Cost 3 vzipl <2,6,3,7>, RHS + 2551812920U, // <2,4,6,6>: Cost 3 vext1 <0,2,4,6>, <6,6,6,6> + 2726251914U, // <2,4,6,7>: Cost 3 vext3 <7,0,1,2>, <4,6,7,1> + 1478072110U, // <2,4,6,u>: Cost 2 vext1 <0,2,4,6>, LHS + 2659234821U, // <2,4,7,0>: Cost 3 vext2 <7,0,2,4>, <7,0,2,4> + 3786722726U, // <2,4,7,1>: Cost 4 vext3 <4,7,1,2>, <4,7,1,2> + 3734303911U, // <2,4,7,2>: Cost 4 vext2 <7,2,2,4>, <7,2,2,4> + 3734967544U, // <2,4,7,3>: Cost 4 vext2 <7,3,2,4>, <7,3,2,4> + 3727005030U, // <2,4,7,4>: Cost 4 vext2 <6,0,2,4>, <7,4,5,6> + 2726251976U, // <2,4,7,5>: Cost 3 vext3 <7,0,1,2>, <4,7,5,0> + 2726251986U, // <2,4,7,6>: Cost 3 vext3 <7,0,1,2>, <4,7,6,1> + 3727005292U, // <2,4,7,7>: Cost 4 vext2 <6,0,2,4>, <7,7,7,7> + 2659234821U, // <2,4,7,u>: Cost 3 vext2 <7,0,2,4>, <7,0,2,4> + 1478082662U, // <2,4,u,0>: Cost 2 vext1 <0,2,4,u>, LHS + 2618758958U, // <2,4,u,1>: Cost 3 vext2 <0,2,2,4>, LHS + 2551826024U, // <2,4,u,2>: Cost 3 vext1 <0,2,4,u>, <2,2,2,2> + 2551826582U, // <2,4,u,3>: Cost 3 vext1 <0,2,4,u>, <3,0,1,2> + 1478085942U, // <2,4,u,4>: Cost 2 vext1 <0,2,4,u>, RHS + 2953668302U, // <2,4,u,5>: Cost 3 vzipr LHS, <2,3,4,5> + 1611959849U, // <2,4,u,6>: Cost 2 vext3 <0,2,0,2>, RHS + 2826128937U, // <2,4,u,7>: Cost 3 vuzpr <1,2,3,4>, RHS + 1611959867U, // <2,4,u,u>: Cost 2 vext3 <0,2,0,2>, RHS + 3691839488U, // <2,5,0,0>: Cost 4 vext2 <0,1,2,5>, <0,0,0,0> + 2618097766U, // <2,5,0,1>: Cost 3 vext2 <0,1,2,5>, LHS + 2620088484U, // <2,5,0,2>: Cost 3 vext2 <0,4,2,5>, <0,2,0,2> + 2619425034U, // <2,5,0,3>: Cost 3 vext2 <0,3,2,5>, <0,3,2,5> + 2620088667U, // <2,5,0,4>: Cost 3 vext2 <0,4,2,5>, <0,4,2,5> + 2620752300U, // <2,5,0,5>: Cost 3 vext2 <0,5,2,5>, <0,5,2,5> + 3693830655U, // <2,5,0,6>: Cost 4 vext2 <0,4,2,5>, <0,6,2,7> + 3094531382U, // <2,5,0,7>: Cost 3 vtrnr <1,2,3,0>, RHS + 2618098333U, // <2,5,0,u>: Cost 3 vext2 <0,1,2,5>, LHS + 3691840246U, // <2,5,1,0>: Cost 4 vext2 <0,1,2,5>, <1,0,3,2> + 3691840308U, // <2,5,1,1>: Cost 4 vext2 <0,1,2,5>, <1,1,1,1> + 2626061206U, // <2,5,1,2>: Cost 3 vext2 <1,4,2,5>, <1,2,3,0> + 2618098688U, // <2,5,1,3>: Cost 3 vext2 <0,1,2,5>, <1,3,5,7> + 2626061364U, // <2,5,1,4>: Cost 3 vext2 <1,4,2,5>, <1,4,2,5> + 3691840656U, // <2,5,1,5>: Cost 4 vext2 <0,1,2,5>, <1,5,3,7> + 3789082310U, // <2,5,1,6>: Cost 4 vext3 <5,1,6,2>, <5,1,6,2> + 2712833744U, // <2,5,1,7>: Cost 3 vext3 <4,6,u,2>, <5,1,7,3> + 2628715896U, // <2,5,1,u>: Cost 3 vext2 <1,u,2,5>, <1,u,2,5> + 3693831613U, // <2,5,2,0>: Cost 4 vext2 <0,4,2,5>, <2,0,1,2> + 4026698642U, // <2,5,2,1>: Cost 4 vzipr <0,0,2,2>, <4,0,5,1> + 2632033896U, // <2,5,2,2>: Cost 3 vext2 <2,4,2,5>, <2,2,2,2> + 3691841190U, 
// <2,5,2,3>: Cost 4 vext2 <0,1,2,5>, <2,3,0,1> + 2632034061U, // <2,5,2,4>: Cost 3 vext2 <2,4,2,5>, <2,4,2,5> + 3691841352U, // <2,5,2,5>: Cost 4 vext2 <0,1,2,5>, <2,5,0,1> + 3691841466U, // <2,5,2,6>: Cost 4 vext2 <0,1,2,5>, <2,6,3,7> + 3088354614U, // <2,5,2,7>: Cost 3 vtrnr <0,2,0,2>, RHS + 3088354615U, // <2,5,2,u>: Cost 3 vtrnr <0,2,0,2>, RHS + 2557829222U, // <2,5,3,0>: Cost 3 vext1 <1,2,5,3>, LHS + 2557830059U, // <2,5,3,1>: Cost 3 vext1 <1,2,5,3>, <1,2,5,3> + 2575746766U, // <2,5,3,2>: Cost 3 vext1 <4,2,5,3>, <2,3,4,5> + 3691841948U, // <2,5,3,3>: Cost 4 vext2 <0,1,2,5>, <3,3,3,3> + 2619427330U, // <2,5,3,4>: Cost 3 vext2 <0,3,2,5>, <3,4,5,6> + 2581720847U, // <2,5,3,5>: Cost 3 vext1 <5,2,5,3>, <5,2,5,3> + 2953628162U, // <2,5,3,6>: Cost 3 vzipr LHS, <3,4,5,6> + 2953626624U, // <2,5,3,7>: Cost 3 vzipr LHS, <1,3,5,7> + 2953626625U, // <2,5,3,u>: Cost 3 vzipr LHS, <1,3,5,u> + 2569781350U, // <2,5,4,0>: Cost 3 vext1 <3,2,5,4>, LHS + 3631580076U, // <2,5,4,1>: Cost 4 vext1 <1,2,5,4>, <1,2,5,4> + 2569782990U, // <2,5,4,2>: Cost 3 vext1 <3,2,5,4>, <2,3,4,5> + 2569783646U, // <2,5,4,3>: Cost 3 vext1 <3,2,5,4>, <3,2,5,4> + 2569784630U, // <2,5,4,4>: Cost 3 vext1 <3,2,5,4>, RHS + 2618101046U, // <2,5,4,5>: Cost 3 vext2 <0,1,2,5>, RHS + 3893905922U, // <2,5,4,6>: Cost 4 vuzpr <0,2,3,5>, <3,4,5,6> + 3094564150U, // <2,5,4,7>: Cost 3 vtrnr <1,2,3,4>, RHS + 2618101289U, // <2,5,4,u>: Cost 3 vext2 <0,1,2,5>, RHS + 2551873638U, // <2,5,5,0>: Cost 3 vext1 <0,2,5,5>, LHS + 3637560320U, // <2,5,5,1>: Cost 4 vext1 <2,2,5,5>, <1,3,5,7> + 3637560966U, // <2,5,5,2>: Cost 4 vext1 <2,2,5,5>, <2,2,5,5> + 3723030343U, // <2,5,5,3>: Cost 4 vext2 <5,3,2,5>, <5,3,2,5> + 2551876918U, // <2,5,5,4>: Cost 3 vext1 <0,2,5,5>, RHS + 2712834052U, // <2,5,5,5>: Cost 3 vext3 <4,6,u,2>, <5,5,5,5> + 4028713474U, // <2,5,5,6>: Cost 4 vzipr <0,3,2,5>, <3,4,5,6> + 2712834072U, // <2,5,5,7>: Cost 3 vext3 <4,6,u,2>, <5,5,7,7> + 2712834081U, // <2,5,5,u>: Cost 3 vext3 <4,6,u,2>, <5,5,u,7> + 2575769702U, // <2,5,6,0>: Cost 3 vext1 <4,2,5,6>, LHS + 3631596462U, // <2,5,6,1>: Cost 4 vext1 <1,2,5,6>, <1,2,5,6> + 2655924730U, // <2,5,6,2>: Cost 3 vext2 <6,4,2,5>, <6,2,7,3> + 3643541856U, // <2,5,6,3>: Cost 4 vext1 <3,2,5,6>, <3,2,5,6> + 2655924849U, // <2,5,6,4>: Cost 3 vext2 <6,4,2,5>, <6,4,2,5> + 3787755607U, // <2,5,6,5>: Cost 4 vext3 <4,u,6,2>, <5,6,5,7> + 4029385218U, // <2,5,6,6>: Cost 4 vzipr <0,4,2,6>, <3,4,5,6> + 3088682294U, // <2,5,6,7>: Cost 3 vtrnr <0,2,4,6>, RHS + 3088682295U, // <2,5,6,u>: Cost 3 vtrnr <0,2,4,6>, RHS + 2563833958U, // <2,5,7,0>: Cost 3 vext1 <2,2,5,7>, LHS + 2551890678U, // <2,5,7,1>: Cost 3 vext1 <0,2,5,7>, <1,0,3,2> + 2563835528U, // <2,5,7,2>: Cost 3 vext1 <2,2,5,7>, <2,2,5,7> + 3637577878U, // <2,5,7,3>: Cost 4 vext1 <2,2,5,7>, <3,0,1,2> + 2563837238U, // <2,5,7,4>: Cost 3 vext1 <2,2,5,7>, RHS + 2712834216U, // <2,5,7,5>: Cost 3 vext3 <4,6,u,2>, <5,7,5,7> + 2712834220U, // <2,5,7,6>: Cost 3 vext3 <4,6,u,2>, <5,7,6,2> + 4174449974U, // <2,5,7,7>: Cost 4 vtrnr <2,2,5,7>, RHS + 2563839790U, // <2,5,7,u>: Cost 3 vext1 <2,2,5,7>, LHS + 2563842150U, // <2,5,u,0>: Cost 3 vext1 <2,2,5,u>, LHS + 2618103598U, // <2,5,u,1>: Cost 3 vext2 <0,1,2,5>, LHS + 2563843721U, // <2,5,u,2>: Cost 3 vext1 <2,2,5,u>, <2,2,5,u> + 2569816418U, // <2,5,u,3>: Cost 3 vext1 <3,2,5,u>, <3,2,5,u> + 2622748735U, // <2,5,u,4>: Cost 3 vext2 <0,u,2,5>, + 2618103962U, // <2,5,u,5>: Cost 3 vext2 <0,1,2,5>, RHS + 2953669122U, // <2,5,u,6>: Cost 3 vzipr LHS, <3,4,5,6> + 2953667584U, // <2,5,u,7>: Cost 3 vzipr LHS, <1,3,5,7> + 
2618104165U, // <2,5,u,u>: Cost 3 vext2 <0,1,2,5>, LHS + 2620096512U, // <2,6,0,0>: Cost 3 vext2 <0,4,2,6>, <0,0,0,0> + 1546354790U, // <2,6,0,1>: Cost 2 vext2 <0,4,2,6>, LHS + 2620096676U, // <2,6,0,2>: Cost 3 vext2 <0,4,2,6>, <0,2,0,2> + 3693838588U, // <2,6,0,3>: Cost 4 vext2 <0,4,2,6>, <0,3,1,0> + 1546355036U, // <2,6,0,4>: Cost 2 vext2 <0,4,2,6>, <0,4,2,6> + 3694502317U, // <2,6,0,5>: Cost 4 vext2 <0,5,2,6>, <0,5,2,6> + 2551911246U, // <2,6,0,6>: Cost 3 vext1 <0,2,6,0>, <6,7,0,1> + 2720723287U, // <2,6,0,7>: Cost 3 vext3 <6,0,7,2>, <6,0,7,2> + 1546355357U, // <2,6,0,u>: Cost 2 vext2 <0,4,2,6>, LHS + 2620097270U, // <2,6,1,0>: Cost 3 vext2 <0,4,2,6>, <1,0,3,2> + 2620097332U, // <2,6,1,1>: Cost 3 vext2 <0,4,2,6>, <1,1,1,1> + 2620097430U, // <2,6,1,2>: Cost 3 vext2 <0,4,2,6>, <1,2,3,0> + 2820243558U, // <2,6,1,3>: Cost 3 vuzpr <0,2,4,6>, LHS + 2620097598U, // <2,6,1,4>: Cost 3 vext2 <0,4,2,6>, <1,4,3,6> + 2620097680U, // <2,6,1,5>: Cost 3 vext2 <0,4,2,6>, <1,5,3,7> + 3693839585U, // <2,6,1,6>: Cost 4 vext2 <0,4,2,6>, <1,6,3,7> + 2721386920U, // <2,6,1,7>: Cost 3 vext3 <6,1,7,2>, <6,1,7,2> + 2820243563U, // <2,6,1,u>: Cost 3 vuzpr <0,2,4,6>, LHS + 2714014137U, // <2,6,2,0>: Cost 3 vext3 <4,u,6,2>, <6,2,0,1> + 2712834500U, // <2,6,2,1>: Cost 3 vext3 <4,6,u,2>, <6,2,1,3> + 2620098152U, // <2,6,2,2>: Cost 3 vext2 <0,4,2,6>, <2,2,2,2> + 2620098214U, // <2,6,2,3>: Cost 3 vext2 <0,4,2,6>, <2,3,0,1> + 2632042254U, // <2,6,2,4>: Cost 3 vext2 <2,4,2,6>, <2,4,2,6> + 2712834540U, // <2,6,2,5>: Cost 3 vext3 <4,6,u,2>, <6,2,5,7> + 2820243660U, // <2,6,2,6>: Cost 3 vuzpr <0,2,4,6>, <0,2,4,6> + 2958265654U, // <2,6,2,7>: Cost 3 vzipr <0,u,2,2>, RHS + 2620098619U, // <2,6,2,u>: Cost 3 vext2 <0,4,2,6>, <2,u,0,1> + 2620098710U, // <2,6,3,0>: Cost 3 vext2 <0,4,2,6>, <3,0,1,2> + 3893986982U, // <2,6,3,1>: Cost 4 vuzpr <0,2,4,6>, <2,3,0,1> + 2569848762U, // <2,6,3,2>: Cost 3 vext1 <3,2,6,3>, <2,6,3,7> + 2620098972U, // <2,6,3,3>: Cost 3 vext2 <0,4,2,6>, <3,3,3,3> + 2620099074U, // <2,6,3,4>: Cost 3 vext2 <0,4,2,6>, <3,4,5,6> + 3893987022U, // <2,6,3,5>: Cost 4 vuzpr <0,2,4,6>, <2,3,4,5> + 3001404644U, // <2,6,3,6>: Cost 3 vzipr LHS, <4,4,6,6> + 1879887158U, // <2,6,3,7>: Cost 2 vzipr LHS, RHS + 1879887159U, // <2,6,3,u>: Cost 2 vzipr LHS, RHS + 2620099484U, // <2,6,4,0>: Cost 3 vext2 <0,4,2,6>, <4,0,6,2> + 2620099566U, // <2,6,4,1>: Cost 3 vext2 <0,4,2,6>, <4,1,6,3> + 2620099644U, // <2,6,4,2>: Cost 3 vext2 <0,4,2,6>, <4,2,6,0> + 3643599207U, // <2,6,4,3>: Cost 4 vext1 <3,2,6,4>, <3,2,6,4> + 2575830080U, // <2,6,4,4>: Cost 3 vext1 <4,2,6,4>, <4,2,6,4> + 1546358070U, // <2,6,4,5>: Cost 2 vext2 <0,4,2,6>, RHS + 2667875700U, // <2,6,4,6>: Cost 3 vext2 , <4,6,4,6> + 4028042550U, // <2,6,4,7>: Cost 4 vzipr <0,2,2,4>, RHS + 1546358313U, // <2,6,4,u>: Cost 2 vext2 <0,4,2,6>, RHS + 3693841992U, // <2,6,5,0>: Cost 4 vext2 <0,4,2,6>, <5,0,1,2> + 2667876048U, // <2,6,5,1>: Cost 3 vext2 , <5,1,7,3> + 2712834756U, // <2,6,5,2>: Cost 3 vext3 <4,6,u,2>, <6,5,2,7> + 3643607400U, // <2,6,5,3>: Cost 4 vext1 <3,2,6,5>, <3,2,6,5> + 2252091873U, // <2,6,5,4>: Cost 3 vrev <6,2,4,5> + 2667876356U, // <2,6,5,5>: Cost 3 vext2 , <5,5,5,5> + 2667876450U, // <2,6,5,6>: Cost 3 vext2 , <5,6,7,0> + 2820246838U, // <2,6,5,7>: Cost 3 vuzpr <0,2,4,6>, RHS + 2820246839U, // <2,6,5,u>: Cost 3 vuzpr <0,2,4,6>, RHS + 2563899494U, // <2,6,6,0>: Cost 3 vext1 <2,2,6,6>, LHS + 3893988683U, // <2,6,6,1>: Cost 4 vuzpr <0,2,4,6>, <4,6,0,1> + 2563901072U, // <2,6,6,2>: Cost 3 vext1 <2,2,6,6>, <2,2,6,6> + 3893987236U, // <2,6,6,3>: Cost 4 vuzpr <0,2,4,6>, 
<2,6,1,3> + 2563902774U, // <2,6,6,4>: Cost 3 vext1 <2,2,6,6>, RHS + 3893988723U, // <2,6,6,5>: Cost 4 vuzpr <0,2,4,6>, <4,6,4,5> + 2712834872U, // <2,6,6,6>: Cost 3 vext3 <4,6,u,2>, <6,6,6,6> + 2955644214U, // <2,6,6,7>: Cost 3 vzipr <0,4,2,6>, RHS + 2955644215U, // <2,6,6,u>: Cost 3 vzipr <0,4,2,6>, RHS + 2712834894U, // <2,6,7,0>: Cost 3 vext3 <4,6,u,2>, <6,7,0,1> + 2724926296U, // <2,6,7,1>: Cost 3 vext3 <6,7,1,2>, <6,7,1,2> + 2725000033U, // <2,6,7,2>: Cost 3 vext3 <6,7,2,2>, <6,7,2,2> + 2702365544U, // <2,6,7,3>: Cost 3 vext3 <3,0,1,2>, <6,7,3,0> + 2712834934U, // <2,6,7,4>: Cost 3 vext3 <4,6,u,2>, <6,7,4,5> + 3776107393U, // <2,6,7,5>: Cost 4 vext3 <3,0,1,2>, <6,7,5,7> + 2725294981U, // <2,6,7,6>: Cost 3 vext3 <6,7,6,2>, <6,7,6,2> + 2726253452U, // <2,6,7,7>: Cost 3 vext3 <7,0,1,2>, <6,7,7,0> + 2712834966U, // <2,6,7,u>: Cost 3 vext3 <4,6,u,2>, <6,7,u,1> + 2620102355U, // <2,6,u,0>: Cost 3 vext2 <0,4,2,6>, + 1546360622U, // <2,6,u,1>: Cost 2 vext2 <0,4,2,6>, LHS + 2620102536U, // <2,6,u,2>: Cost 3 vext2 <0,4,2,6>, + 2820244125U, // <2,6,u,3>: Cost 3 vuzpr <0,2,4,6>, LHS + 1594136612U, // <2,6,u,4>: Cost 2 vext2 , + 1546360986U, // <2,6,u,5>: Cost 2 vext2 <0,4,2,6>, RHS + 2620102864U, // <2,6,u,6>: Cost 3 vext2 <0,4,2,6>, + 1879928118U, // <2,6,u,7>: Cost 2 vzipr LHS, RHS + 1879928119U, // <2,6,u,u>: Cost 2 vzipr LHS, RHS + 2726179825U, // <2,7,0,0>: Cost 3 vext3 <7,0,0,2>, <7,0,0,2> + 1652511738U, // <2,7,0,1>: Cost 2 vext3 <7,0,1,2>, <7,0,1,2> + 2621431972U, // <2,7,0,2>: Cost 3 vext2 <0,6,2,7>, <0,2,0,2> + 2257949868U, // <2,7,0,3>: Cost 3 vrev <7,2,3,0> + 2726474773U, // <2,7,0,4>: Cost 3 vext3 <7,0,4,2>, <7,0,4,2> + 2620768686U, // <2,7,0,5>: Cost 3 vext2 <0,5,2,7>, <0,5,2,7> + 2621432319U, // <2,7,0,6>: Cost 3 vext2 <0,6,2,7>, <0,6,2,7> + 2599760953U, // <2,7,0,7>: Cost 3 vext1 , <7,0,u,2> + 1653027897U, // <2,7,0,u>: Cost 2 vext3 <7,0,u,2>, <7,0,u,2> + 2639348470U, // <2,7,1,0>: Cost 3 vext2 <3,6,2,7>, <1,0,3,2> + 3695174452U, // <2,7,1,1>: Cost 4 vext2 <0,6,2,7>, <1,1,1,1> + 3695174550U, // <2,7,1,2>: Cost 4 vext2 <0,6,2,7>, <1,2,3,0> + 3694511104U, // <2,7,1,3>: Cost 4 vext2 <0,5,2,7>, <1,3,5,7> + 3713090594U, // <2,7,1,4>: Cost 4 vext2 <3,6,2,7>, <1,4,0,5> + 3693184144U, // <2,7,1,5>: Cost 4 vext2 <0,3,2,7>, <1,5,3,7> + 2627405016U, // <2,7,1,6>: Cost 3 vext2 <1,6,2,7>, <1,6,2,7> + 3799995519U, // <2,7,1,7>: Cost 4 vext3 <7,0,1,2>, <7,1,7,0> + 2639348470U, // <2,7,1,u>: Cost 3 vext2 <3,6,2,7>, <1,0,3,2> + 3695175101U, // <2,7,2,0>: Cost 4 vext2 <0,6,2,7>, <2,0,1,2> + 3643655168U, // <2,7,2,1>: Cost 4 vext1 <3,2,7,2>, <1,3,5,7> + 2257892517U, // <2,7,2,2>: Cost 3 vrev <7,2,2,2> + 3695175334U, // <2,7,2,3>: Cost 4 vext2 <0,6,2,7>, <2,3,0,1> + 3695175465U, // <2,7,2,4>: Cost 4 vext2 <0,6,2,7>, <2,4,5,6> + 2632714080U, // <2,7,2,5>: Cost 3 vext2 <2,5,2,7>, <2,5,2,7> + 2633377713U, // <2,7,2,6>: Cost 3 vext2 <2,6,2,7>, <2,6,2,7> + 3695175658U, // <2,7,2,7>: Cost 4 vext2 <0,6,2,7>, <2,7,0,1> + 2634704979U, // <2,7,2,u>: Cost 3 vext2 <2,u,2,7>, <2,u,2,7> + 1514094694U, // <2,7,3,0>: Cost 2 vext1 <6,2,7,3>, LHS + 2569921680U, // <2,7,3,1>: Cost 3 vext1 <3,2,7,3>, <1,5,3,7> + 2587838056U, // <2,7,3,2>: Cost 3 vext1 <6,2,7,3>, <2,2,2,2> + 2569922927U, // <2,7,3,3>: Cost 3 vext1 <3,2,7,3>, <3,2,7,3> + 1514097974U, // <2,7,3,4>: Cost 2 vext1 <6,2,7,3>, RHS + 2581868321U, // <2,7,3,5>: Cost 3 vext1 <5,2,7,3>, <5,2,7,3> + 1514099194U, // <2,7,3,6>: Cost 2 vext1 <6,2,7,3>, <6,2,7,3> + 2587841530U, // <2,7,3,7>: Cost 3 vext1 <6,2,7,3>, <7,0,1,2> + 1514100526U, // <2,7,3,u>: Cost 2 vext1 
<6,2,7,3>, LHS + 2708706617U, // <2,7,4,0>: Cost 3 vext3 <4,0,6,2>, <7,4,0,6> + 3649643418U, // <2,7,4,1>: Cost 4 vext1 <4,2,7,4>, <1,2,3,4> + 3649644330U, // <2,7,4,2>: Cost 4 vext1 <4,2,7,4>, <2,4,5,7> + 2257982640U, // <2,7,4,3>: Cost 3 vrev <7,2,3,4> + 3649645641U, // <2,7,4,4>: Cost 4 vext1 <4,2,7,4>, <4,2,7,4> + 2621435190U, // <2,7,4,5>: Cost 3 vext2 <0,6,2,7>, RHS + 2712835441U, // <2,7,4,6>: Cost 3 vext3 <4,6,u,2>, <7,4,6,u> + 3799995762U, // <2,7,4,7>: Cost 4 vext3 <7,0,1,2>, <7,4,7,0> + 2621435433U, // <2,7,4,u>: Cost 3 vext2 <0,6,2,7>, RHS + 2729497990U, // <2,7,5,0>: Cost 3 vext3 <7,5,0,2>, <7,5,0,2> + 3643679744U, // <2,7,5,1>: Cost 4 vext1 <3,2,7,5>, <1,3,5,7> + 3637708424U, // <2,7,5,2>: Cost 4 vext1 <2,2,7,5>, <2,2,5,7> + 3643681137U, // <2,7,5,3>: Cost 4 vext1 <3,2,7,5>, <3,2,7,5> + 2599800118U, // <2,7,5,4>: Cost 3 vext1 , RHS + 3786577334U, // <2,7,5,5>: Cost 4 vext3 <4,6,u,2>, <7,5,5,5> + 3786577345U, // <2,7,5,6>: Cost 4 vext3 <4,6,u,2>, <7,5,6,7> + 2599802214U, // <2,7,5,7>: Cost 3 vext1 , <7,4,5,6> + 2599802670U, // <2,7,5,u>: Cost 3 vext1 , LHS + 2581889126U, // <2,7,6,0>: Cost 3 vext1 <5,2,7,6>, LHS + 3643687936U, // <2,7,6,1>: Cost 4 vext1 <3,2,7,6>, <1,3,5,7> + 2663240186U, // <2,7,6,2>: Cost 3 vext2 <7,6,2,7>, <6,2,7,3> + 3643689330U, // <2,7,6,3>: Cost 4 vext1 <3,2,7,6>, <3,2,7,6> + 2581892406U, // <2,7,6,4>: Cost 3 vext1 <5,2,7,6>, RHS + 2581892900U, // <2,7,6,5>: Cost 3 vext1 <5,2,7,6>, <5,2,7,6> + 2587865597U, // <2,7,6,6>: Cost 3 vext1 <6,2,7,6>, <6,2,7,6> + 3786577428U, // <2,7,6,7>: Cost 4 vext3 <4,6,u,2>, <7,6,7,0> + 2581894958U, // <2,7,6,u>: Cost 3 vext1 <5,2,7,6>, LHS + 2726254119U, // <2,7,7,0>: Cost 3 vext3 <7,0,1,2>, <7,7,0,1> + 3804640817U, // <2,7,7,1>: Cost 4 vext3 <7,7,1,2>, <7,7,1,2> + 3637724826U, // <2,7,7,2>: Cost 4 vext1 <2,2,7,7>, <2,2,7,7> + 3734992123U, // <2,7,7,3>: Cost 4 vext2 <7,3,2,7>, <7,3,2,7> + 2552040758U, // <2,7,7,4>: Cost 3 vext1 <0,2,7,7>, RHS + 3799995992U, // <2,7,7,5>: Cost 4 vext3 <7,0,1,2>, <7,7,5,5> + 2663241198U, // <2,7,7,6>: Cost 3 vext2 <7,6,2,7>, <7,6,2,7> + 2712835692U, // <2,7,7,7>: Cost 3 vext3 <4,6,u,2>, <7,7,7,7> + 2731562607U, // <2,7,7,u>: Cost 3 vext3 <7,u,1,2>, <7,7,u,1> + 1514135654U, // <2,7,u,0>: Cost 2 vext1 <6,2,7,u>, LHS + 1657820802U, // <2,7,u,1>: Cost 2 vext3 <7,u,1,2>, <7,u,1,2> + 2587879016U, // <2,7,u,2>: Cost 3 vext1 <6,2,7,u>, <2,2,2,2> + 2569963892U, // <2,7,u,3>: Cost 3 vext1 <3,2,7,u>, <3,2,7,u> + 1514138934U, // <2,7,u,4>: Cost 2 vext1 <6,2,7,u>, RHS + 2621438106U, // <2,7,u,5>: Cost 3 vext2 <0,6,2,7>, RHS + 1514140159U, // <2,7,u,6>: Cost 2 vext1 <6,2,7,u>, <6,2,7,u> + 2587882490U, // <2,7,u,7>: Cost 3 vext1 <6,2,7,u>, <7,0,1,2> + 1514141486U, // <2,7,u,u>: Cost 2 vext1 <6,2,7,u>, LHS + 1544380416U, // <2,u,0,0>: Cost 2 vext2 LHS, <0,0,0,0> + 470638699U, // <2,u,0,1>: Cost 1 vext2 LHS, LHS + 1544380580U, // <2,u,0,2>: Cost 2 vext2 LHS, <0,2,0,2> + 1658631909U, // <2,u,0,3>: Cost 2 vext3 , + 1544380754U, // <2,u,0,4>: Cost 2 vext2 LHS, <0,4,1,5> + 2665898414U, // <2,u,0,5>: Cost 3 vext2 LHS, <0,5,2,7> + 1658853120U, // <2,u,0,6>: Cost 2 vext3 , + 3094531625U, // <2,u,0,7>: Cost 3 vtrnr <1,2,3,0>, RHS + 470639261U, // <2,u,0,u>: Cost 1 vext2 LHS, LHS + 1544381174U, // <2,u,1,0>: Cost 2 vext2 LHS, <1,0,3,2> + 1544381236U, // <2,u,1,1>: Cost 2 vext2 LHS, <1,1,1,1> + 1544381334U, // <2,u,1,2>: Cost 2 vext2 LHS, <1,2,3,0> + 1544381400U, // <2,u,1,3>: Cost 2 vext2 LHS, <1,3,1,3> + 2618123325U, // <2,u,1,4>: Cost 3 vext2 LHS, <1,4,3,5> + 1544381584U, // <2,u,1,5>: Cost 2 vext2 LHS, <1,5,3,7> 
+ 2618123489U, // <2,u,1,6>: Cost 3 vext2 LHS, <1,6,3,7> + 2726254427U, // <2,u,1,7>: Cost 3 vext3 <7,0,1,2>, + 1544381823U, // <2,u,1,u>: Cost 2 vext2 LHS, <1,u,3,3> + 1478328422U, // <2,u,2,0>: Cost 2 vext1 <0,2,u,2>, LHS + 2618123807U, // <2,u,2,1>: Cost 3 vext2 LHS, <2,1,3,1> + 269271142U, // <2,u,2,2>: Cost 1 vdup2 LHS + 1544382118U, // <2,u,2,3>: Cost 2 vext2 LHS, <2,3,0,1> + 1478331702U, // <2,u,2,4>: Cost 2 vext1 <0,2,u,2>, RHS + 2618124136U, // <2,u,2,5>: Cost 3 vext2 LHS, <2,5,3,6> + 1544382394U, // <2,u,2,6>: Cost 2 vext2 LHS, <2,6,3,7> + 3088354857U, // <2,u,2,7>: Cost 3 vtrnr <0,2,0,2>, RHS + 269271142U, // <2,u,2,u>: Cost 1 vdup2 LHS + 1544382614U, // <2,u,3,0>: Cost 2 vext2 LHS, <3,0,1,2> + 2953627374U, // <2,u,3,1>: Cost 3 vzipr LHS, <2,3,u,1> + 1490282143U, // <2,u,3,2>: Cost 2 vext1 <2,2,u,3>, <2,2,u,3> + 1879883932U, // <2,u,3,3>: Cost 2 vzipr LHS, LHS + 1544382978U, // <2,u,3,4>: Cost 2 vext2 LHS, <3,4,5,6> + 2953627378U, // <2,u,3,5>: Cost 3 vzipr LHS, <2,3,u,5> + 1514172931U, // <2,u,3,6>: Cost 2 vext1 <6,2,u,3>, <6,2,u,3> + 1879887176U, // <2,u,3,7>: Cost 2 vzipr LHS, RHS + 1879883937U, // <2,u,3,u>: Cost 2 vzipr LHS, LHS + 1484316774U, // <2,u,4,0>: Cost 2 vext1 <1,2,u,4>, LHS + 1484317639U, // <2,u,4,1>: Cost 2 vext1 <1,2,u,4>, <1,2,u,4> + 2552088270U, // <2,u,4,2>: Cost 3 vext1 <0,2,u,4>, <2,3,4,5> + 1190213513U, // <2,u,4,3>: Cost 2 vrev + 1484320054U, // <2,u,4,4>: Cost 2 vext1 <1,2,u,4>, RHS + 470641974U, // <2,u,4,5>: Cost 1 vext2 LHS, RHS + 1592159604U, // <2,u,4,6>: Cost 2 vext2 LHS, <4,6,4,6> + 3094564393U, // <2,u,4,7>: Cost 3 vtrnr <1,2,3,4>, RHS + 470642217U, // <2,u,4,u>: Cost 1 vext2 LHS, RHS + 2552094959U, // <2,u,5,0>: Cost 3 vext1 <0,2,u,5>, <0,2,u,5> + 1592159952U, // <2,u,5,1>: Cost 2 vext2 LHS, <5,1,7,3> + 2564040353U, // <2,u,5,2>: Cost 3 vext1 <2,2,u,5>, <2,2,u,5> + 2690275455U, // <2,u,5,3>: Cost 3 vext3 <0,u,u,2>, + 1592160198U, // <2,u,5,4>: Cost 2 vext2 LHS, <5,4,7,6> + 1592160260U, // <2,u,5,5>: Cost 2 vext2 LHS, <5,5,5,5> + 1611962522U, // <2,u,5,6>: Cost 2 vext3 <0,2,0,2>, RHS + 1592160424U, // <2,u,5,7>: Cost 2 vext2 LHS, <5,7,5,7> + 1611962540U, // <2,u,5,u>: Cost 2 vext3 <0,2,0,2>, RHS + 1478361190U, // <2,u,6,0>: Cost 2 vext1 <0,2,u,6>, LHS + 2552103670U, // <2,u,6,1>: Cost 3 vext1 <0,2,u,6>, <1,0,3,2> + 1592160762U, // <2,u,6,2>: Cost 2 vext2 LHS, <6,2,7,3> + 2685704400U, // <2,u,6,3>: Cost 3 vext3 <0,2,0,2>, + 1478364470U, // <2,u,6,4>: Cost 2 vext1 <0,2,u,6>, RHS + 2901891226U, // <2,u,6,5>: Cost 3 vzipl <2,6,3,7>, RHS + 1592161080U, // <2,u,6,6>: Cost 2 vext2 LHS, <6,6,6,6> + 1592161102U, // <2,u,6,7>: Cost 2 vext2 LHS, <6,7,0,1> + 1478367022U, // <2,u,6,u>: Cost 2 vext1 <0,2,u,6>, LHS + 1592161274U, // <2,u,7,0>: Cost 2 vext2 LHS, <7,0,1,2> + 2659931226U, // <2,u,7,1>: Cost 3 vext2 <7,1,2,u>, <7,1,2,u> + 2564056739U, // <2,u,7,2>: Cost 3 vext1 <2,2,u,7>, <2,2,u,7> + 2665903331U, // <2,u,7,3>: Cost 3 vext2 LHS, <7,3,0,1> + 1592161638U, // <2,u,7,4>: Cost 2 vext2 LHS, <7,4,5,6> + 2665903494U, // <2,u,7,5>: Cost 3 vext2 LHS, <7,5,0,2> + 2587947527U, // <2,u,7,6>: Cost 3 vext1 <6,2,u,7>, <6,2,u,7> + 1592161900U, // <2,u,7,7>: Cost 2 vext2 LHS, <7,7,7,7> + 1592161922U, // <2,u,7,u>: Cost 2 vext2 LHS, <7,u,1,2> + 1478377574U, // <2,u,u,0>: Cost 2 vext1 <0,2,u,u>, LHS + 470644526U, // <2,u,u,1>: Cost 1 vext2 LHS, LHS + 269271142U, // <2,u,u,2>: Cost 1 vdup2 LHS + 1879924892U, // <2,u,u,3>: Cost 2 vzipr LHS, LHS + 1478380854U, // <2,u,u,4>: Cost 2 vext1 <0,2,u,u>, RHS + 470644890U, // <2,u,u,5>: Cost 1 vext2 LHS, RHS + 1611962765U, // 
<2,u,u,6>: Cost 2 vext3 <0,2,0,2>, RHS + 1879928136U, // <2,u,u,7>: Cost 2 vzipr LHS, RHS + 470645093U, // <2,u,u,u>: Cost 1 vext2 LHS, LHS + 1611448320U, // <3,0,0,0>: Cost 2 vext3 LHS, <0,0,0,0> + 1611890698U, // <3,0,0,1>: Cost 2 vext3 LHS, <0,0,1,1> + 1611890708U, // <3,0,0,2>: Cost 2 vext3 LHS, <0,0,2,2> + 3763576860U, // <3,0,0,3>: Cost 4 vext3 LHS, <0,0,3,1> + 2689835045U, // <3,0,0,4>: Cost 3 vext3 LHS, <0,0,4,1> + 3698508206U, // <3,0,0,5>: Cost 4 vext2 <1,2,3,0>, <0,5,2,7> + 3763576887U, // <3,0,0,6>: Cost 4 vext3 LHS, <0,0,6,1> + 3667678434U, // <3,0,0,7>: Cost 4 vext1 <7,3,0,0>, <7,3,0,0> + 1616093258U, // <3,0,0,u>: Cost 2 vext3 LHS, <0,0,u,2> + 1490337894U, // <3,0,1,0>: Cost 2 vext1 <2,3,0,1>, LHS + 2685632602U, // <3,0,1,1>: Cost 3 vext3 LHS, <0,1,1,0> + 537706598U, // <3,0,1,2>: Cost 1 vext3 LHS, LHS + 2624766936U, // <3,0,1,3>: Cost 3 vext2 <1,2,3,0>, <1,3,1,3> + 1490341174U, // <3,0,1,4>: Cost 2 vext1 <2,3,0,1>, RHS + 2624767120U, // <3,0,1,5>: Cost 3 vext2 <1,2,3,0>, <1,5,3,7> + 2732966030U, // <3,0,1,6>: Cost 3 vext3 LHS, <0,1,6,7> + 2593944803U, // <3,0,1,7>: Cost 3 vext1 <7,3,0,1>, <7,3,0,1> + 537706652U, // <3,0,1,u>: Cost 1 vext3 LHS, LHS + 1611890852U, // <3,0,2,0>: Cost 2 vext3 LHS, <0,2,0,2> + 2685632684U, // <3,0,2,1>: Cost 3 vext3 LHS, <0,2,1,1> + 2685632692U, // <3,0,2,2>: Cost 3 vext3 LHS, <0,2,2,0> + 2685632702U, // <3,0,2,3>: Cost 3 vext3 LHS, <0,2,3,1> + 1611890892U, // <3,0,2,4>: Cost 2 vext3 LHS, <0,2,4,6> + 2732966102U, // <3,0,2,5>: Cost 3 vext3 LHS, <0,2,5,7> + 2624767930U, // <3,0,2,6>: Cost 3 vext2 <1,2,3,0>, <2,6,3,7> + 2685632744U, // <3,0,2,7>: Cost 3 vext3 LHS, <0,2,7,7> + 1611890924U, // <3,0,2,u>: Cost 2 vext3 LHS, <0,2,u,2> + 2624768150U, // <3,0,3,0>: Cost 3 vext2 <1,2,3,0>, <3,0,1,2> + 2685632764U, // <3,0,3,1>: Cost 3 vext3 LHS, <0,3,1,0> + 2685632774U, // <3,0,3,2>: Cost 3 vext3 LHS, <0,3,2,1> + 2624768412U, // <3,0,3,3>: Cost 3 vext2 <1,2,3,0>, <3,3,3,3> + 2624768514U, // <3,0,3,4>: Cost 3 vext2 <1,2,3,0>, <3,4,5,6> + 3702491714U, // <3,0,3,5>: Cost 4 vext2 <1,u,3,0>, <3,5,3,7> + 2624768632U, // <3,0,3,6>: Cost 3 vext2 <1,2,3,0>, <3,6,0,7> + 3702491843U, // <3,0,3,7>: Cost 4 vext2 <1,u,3,0>, <3,7,0,1> + 2686959934U, // <3,0,3,u>: Cost 3 vext3 <0,3,u,3>, <0,3,u,3> + 2689835336U, // <3,0,4,0>: Cost 3 vext3 LHS, <0,4,0,4> + 1611891026U, // <3,0,4,1>: Cost 2 vext3 LHS, <0,4,1,5> + 1611891036U, // <3,0,4,2>: Cost 2 vext3 LHS, <0,4,2,6> + 3763577184U, // <3,0,4,3>: Cost 4 vext3 LHS, <0,4,3,1> + 2689835374U, // <3,0,4,4>: Cost 3 vext3 LHS, <0,4,4,6> + 1551027510U, // <3,0,4,5>: Cost 2 vext2 <1,2,3,0>, RHS + 2666573172U, // <3,0,4,6>: Cost 3 vext2 , <4,6,4,6> + 3667711206U, // <3,0,4,7>: Cost 4 vext1 <7,3,0,4>, <7,3,0,4> + 1616093586U, // <3,0,4,u>: Cost 2 vext3 LHS, <0,4,u,6> + 2685190556U, // <3,0,5,0>: Cost 3 vext3 LHS, <0,5,0,7> + 2666573520U, // <3,0,5,1>: Cost 3 vext2 , <5,1,7,3> + 3040886886U, // <3,0,5,2>: Cost 3 vtrnl <3,4,5,6>, LHS + 3625912834U, // <3,0,5,3>: Cost 4 vext1 <0,3,0,5>, <3,4,5,6> + 2666573766U, // <3,0,5,4>: Cost 3 vext2 , <5,4,7,6> + 2666573828U, // <3,0,5,5>: Cost 3 vext2 , <5,5,5,5> + 2732966354U, // <3,0,5,6>: Cost 3 vext3 LHS, <0,5,6,7> + 2666573992U, // <3,0,5,7>: Cost 3 vext2 , <5,7,5,7> + 3040886940U, // <3,0,5,u>: Cost 3 vtrnl <3,4,5,6>, LHS + 2685190637U, // <3,0,6,0>: Cost 3 vext3 LHS, <0,6,0,7> + 2732966390U, // <3,0,6,1>: Cost 3 vext3 LHS, <0,6,1,7> + 2689835519U, // <3,0,6,2>: Cost 3 vext3 LHS, <0,6,2,7> + 3667724438U, // <3,0,6,3>: Cost 4 vext1 <7,3,0,6>, <3,0,1,2> + 3763577355U, // <3,0,6,4>: Cost 4 vext3 
LHS, <0,6,4,1> + 3806708243U, // <3,0,6,5>: Cost 4 vext3 LHS, <0,6,5,0> + 2666574648U, // <3,0,6,6>: Cost 3 vext2 , <6,6,6,6> + 2657948520U, // <3,0,6,7>: Cost 3 vext2 <6,7,3,0>, <6,7,3,0> + 2689835573U, // <3,0,6,u>: Cost 3 vext3 LHS, <0,6,u,7> + 2666574842U, // <3,0,7,0>: Cost 3 vext2 , <7,0,1,2> + 2685633095U, // <3,0,7,1>: Cost 3 vext3 LHS, <0,7,1,7> + 2660603052U, // <3,0,7,2>: Cost 3 vext2 <7,2,3,0>, <7,2,3,0> + 3643844997U, // <3,0,7,3>: Cost 4 vext1 <3,3,0,7>, <3,3,0,7> + 2666575206U, // <3,0,7,4>: Cost 3 vext2 , <7,4,5,6> + 3655790391U, // <3,0,7,5>: Cost 4 vext1 <5,3,0,7>, <5,3,0,7> + 3731690968U, // <3,0,7,6>: Cost 4 vext2 <6,7,3,0>, <7,6,0,3> + 2666575468U, // <3,0,7,7>: Cost 3 vext2 , <7,7,7,7> + 2664584850U, // <3,0,7,u>: Cost 3 vext2 <7,u,3,0>, <7,u,3,0> + 1616093834U, // <3,0,u,0>: Cost 2 vext3 LHS, <0,u,0,2> + 1611891346U, // <3,0,u,1>: Cost 2 vext3 LHS, <0,u,1,1> + 537707165U, // <3,0,u,2>: Cost 1 vext3 LHS, LHS + 2689835684U, // <3,0,u,3>: Cost 3 vext3 LHS, <0,u,3,1> + 1616093874U, // <3,0,u,4>: Cost 2 vext3 LHS, <0,u,4,6> + 1551030426U, // <3,0,u,5>: Cost 2 vext2 <1,2,3,0>, RHS + 2624772304U, // <3,0,u,6>: Cost 3 vext2 <1,2,3,0>, + 2594002154U, // <3,0,u,7>: Cost 3 vext1 <7,3,0,u>, <7,3,0,u> + 537707219U, // <3,0,u,u>: Cost 1 vext3 LHS, LHS + 2552201318U, // <3,1,0,0>: Cost 3 vext1 <0,3,1,0>, LHS + 2618802278U, // <3,1,0,1>: Cost 3 vext2 <0,2,3,1>, LHS + 2618802366U, // <3,1,0,2>: Cost 3 vext2 <0,2,3,1>, <0,2,3,1> + 1611449078U, // <3,1,0,3>: Cost 2 vext3 LHS, <1,0,3,2> + 2552204598U, // <3,1,0,4>: Cost 3 vext1 <0,3,1,0>, RHS + 2732966663U, // <3,1,0,5>: Cost 3 vext3 LHS, <1,0,5,1> + 3906258396U, // <3,1,0,6>: Cost 4 vuzpr <2,3,0,1>, <2,0,4,6> + 3667752171U, // <3,1,0,7>: Cost 4 vext1 <7,3,1,0>, <7,3,1,0> + 1611891491U, // <3,1,0,u>: Cost 2 vext3 LHS, <1,0,u,2> + 2689835819U, // <3,1,1,0>: Cost 3 vext3 LHS, <1,1,0,1> + 1611449140U, // <3,1,1,1>: Cost 2 vext3 LHS, <1,1,1,1> + 2624775063U, // <3,1,1,2>: Cost 3 vext2 <1,2,3,1>, <1,2,3,1> + 1611891528U, // <3,1,1,3>: Cost 2 vext3 LHS, <1,1,3,3> + 2689835859U, // <3,1,1,4>: Cost 3 vext3 LHS, <1,1,4,5> + 2689835868U, // <3,1,1,5>: Cost 3 vext3 LHS, <1,1,5,5> + 3763577701U, // <3,1,1,6>: Cost 4 vext3 LHS, <1,1,6,5> + 3765273452U, // <3,1,1,7>: Cost 4 vext3 <1,1,7,3>, <1,1,7,3> + 1611891573U, // <3,1,1,u>: Cost 2 vext3 LHS, <1,1,u,3> + 2629420494U, // <3,1,2,0>: Cost 3 vext2 <2,0,3,1>, <2,0,3,1> + 2689835911U, // <3,1,2,1>: Cost 3 vext3 LHS, <1,2,1,3> + 2564163248U, // <3,1,2,2>: Cost 3 vext1 <2,3,1,2>, <2,3,1,2> + 1611449238U, // <3,1,2,3>: Cost 2 vext3 LHS, <1,2,3,0> + 2564164918U, // <3,1,2,4>: Cost 3 vext1 <2,3,1,2>, RHS + 2689835947U, // <3,1,2,5>: Cost 3 vext3 LHS, <1,2,5,3> + 3692545978U, // <3,1,2,6>: Cost 4 vext2 <0,2,3,1>, <2,6,3,7> + 2732966842U, // <3,1,2,7>: Cost 3 vext3 LHS, <1,2,7,0> + 1611891651U, // <3,1,2,u>: Cost 2 vext3 LHS, <1,2,u,0> + 1484456038U, // <3,1,3,0>: Cost 2 vext1 <1,3,1,3>, LHS + 1611891672U, // <3,1,3,1>: Cost 2 vext3 LHS, <1,3,1,3> + 2685633502U, // <3,1,3,2>: Cost 3 vext3 LHS, <1,3,2,0> + 2685633512U, // <3,1,3,3>: Cost 3 vext3 LHS, <1,3,3,1> + 1484459318U, // <3,1,3,4>: Cost 2 vext1 <1,3,1,3>, RHS + 1611891712U, // <3,1,3,5>: Cost 2 vext3 LHS, <1,3,5,7> + 2689836041U, // <3,1,3,6>: Cost 3 vext3 LHS, <1,3,6,7> + 2733409294U, // <3,1,3,7>: Cost 3 vext3 LHS, <1,3,7,3> + 1611891735U, // <3,1,3,u>: Cost 2 vext3 LHS, <1,3,u,3> + 2552234086U, // <3,1,4,0>: Cost 3 vext1 <0,3,1,4>, LHS + 2732966955U, // <3,1,4,1>: Cost 3 vext3 LHS, <1,4,1,5> + 2732966964U, // <3,1,4,2>: Cost 3 vext3 LHS, <1,4,2,5> + 
2685633597U, // <3,1,4,3>: Cost 3 vext3 LHS, <1,4,3,5> + 2552237366U, // <3,1,4,4>: Cost 3 vext1 <0,3,1,4>, RHS + 2618805558U, // <3,1,4,5>: Cost 3 vext2 <0,2,3,1>, RHS + 2769472822U, // <3,1,4,6>: Cost 3 vuzpl <3,0,1,2>, RHS + 3667784943U, // <3,1,4,7>: Cost 4 vext1 <7,3,1,4>, <7,3,1,4> + 2685633642U, // <3,1,4,u>: Cost 3 vext3 LHS, <1,4,u,5> + 2689836143U, // <3,1,5,0>: Cost 3 vext3 LHS, <1,5,0,1> + 2564187280U, // <3,1,5,1>: Cost 3 vext1 <2,3,1,5>, <1,5,3,7> + 2564187827U, // <3,1,5,2>: Cost 3 vext1 <2,3,1,5>, <2,3,1,5> + 1611891856U, // <3,1,5,3>: Cost 2 vext3 LHS, <1,5,3,7> + 2689836183U, // <3,1,5,4>: Cost 3 vext3 LHS, <1,5,4,5> + 3759375522U, // <3,1,5,5>: Cost 4 vext3 LHS, <1,5,5,7> + 3720417378U, // <3,1,5,6>: Cost 4 vext2 <4,u,3,1>, <5,6,7,0> + 2832518454U, // <3,1,5,7>: Cost 3 vuzpr <2,3,0,1>, RHS + 1611891901U, // <3,1,5,u>: Cost 2 vext3 LHS, <1,5,u,7> + 3763578048U, // <3,1,6,0>: Cost 4 vext3 LHS, <1,6,0,1> + 2689836239U, // <3,1,6,1>: Cost 3 vext3 LHS, <1,6,1,7> + 2732967128U, // <3,1,6,2>: Cost 3 vext3 LHS, <1,6,2,7> + 2685633761U, // <3,1,6,3>: Cost 3 vext3 LHS, <1,6,3,7> + 3763578088U, // <3,1,6,4>: Cost 4 vext3 LHS, <1,6,4,5> + 2689836275U, // <3,1,6,5>: Cost 3 vext3 LHS, <1,6,5,7> + 3763578108U, // <3,1,6,6>: Cost 4 vext3 LHS, <1,6,6,7> + 2732967166U, // <3,1,6,7>: Cost 3 vext3 LHS, <1,6,7,0> + 2685633806U, // <3,1,6,u>: Cost 3 vext3 LHS, <1,6,u,7> + 3631972454U, // <3,1,7,0>: Cost 4 vext1 <1,3,1,7>, LHS + 2659947612U, // <3,1,7,1>: Cost 3 vext2 <7,1,3,1>, <7,1,3,1> + 4036102294U, // <3,1,7,2>: Cost 4 vzipr <1,5,3,7>, <3,0,1,2> + 3095396454U, // <3,1,7,3>: Cost 3 vtrnr <1,3,5,7>, LHS + 3631975734U, // <3,1,7,4>: Cost 4 vext1 <1,3,1,7>, RHS + 2222982144U, // <3,1,7,5>: Cost 3 vrev <1,3,5,7> + 3296797705U, // <3,1,7,6>: Cost 4 vrev <1,3,6,7> + 3720418924U, // <3,1,7,7>: Cost 4 vext2 <4,u,3,1>, <7,7,7,7> + 3095396459U, // <3,1,7,u>: Cost 3 vtrnr <1,3,5,7>, LHS + 1484496998U, // <3,1,u,0>: Cost 2 vext1 <1,3,1,u>, LHS + 1611892077U, // <3,1,u,1>: Cost 2 vext3 LHS, <1,u,1,3> + 2685633907U, // <3,1,u,2>: Cost 3 vext3 LHS, <1,u,2,0> + 1611892092U, // <3,1,u,3>: Cost 2 vext3 LHS, <1,u,3,0> + 1484500278U, // <3,1,u,4>: Cost 2 vext1 <1,3,1,u>, RHS + 1611892117U, // <3,1,u,5>: Cost 2 vext3 LHS, <1,u,5,7> + 2685633950U, // <3,1,u,6>: Cost 3 vext3 LHS, <1,u,6,7> + 2832518697U, // <3,1,u,7>: Cost 3 vuzpr <2,3,0,1>, RHS + 1611892140U, // <3,1,u,u>: Cost 2 vext3 LHS, <1,u,u,3> + 2623455232U, // <3,2,0,0>: Cost 3 vext2 <1,0,3,2>, <0,0,0,0> + 1549713510U, // <3,2,0,1>: Cost 2 vext2 <1,0,3,2>, LHS + 2689836484U, // <3,2,0,2>: Cost 3 vext3 LHS, <2,0,2,0> + 2685633997U, // <3,2,0,3>: Cost 3 vext3 LHS, <2,0,3,0> + 2623455570U, // <3,2,0,4>: Cost 3 vext2 <1,0,3,2>, <0,4,1,5> + 2732967398U, // <3,2,0,5>: Cost 3 vext3 LHS, <2,0,5,7> + 2689836524U, // <3,2,0,6>: Cost 3 vext3 LHS, <2,0,6,4> + 2229044964U, // <3,2,0,7>: Cost 3 vrev <2,3,7,0> + 1549714077U, // <3,2,0,u>: Cost 2 vext2 <1,0,3,2>, LHS + 1549714166U, // <3,2,1,0>: Cost 2 vext2 <1,0,3,2>, <1,0,3,2> + 2623456052U, // <3,2,1,1>: Cost 3 vext2 <1,0,3,2>, <1,1,1,1> + 2623456150U, // <3,2,1,2>: Cost 3 vext2 <1,0,3,2>, <1,2,3,0> + 2685634079U, // <3,2,1,3>: Cost 3 vext3 LHS, <2,1,3,1> + 2552286518U, // <3,2,1,4>: Cost 3 vext1 <0,3,2,1>, RHS + 2623456400U, // <3,2,1,5>: Cost 3 vext2 <1,0,3,2>, <1,5,3,7> + 2689836604U, // <3,2,1,6>: Cost 3 vext3 LHS, <2,1,6,3> + 3667834101U, // <3,2,1,7>: Cost 4 vext1 <7,3,2,1>, <7,3,2,1> + 1155385070U, // <3,2,1,u>: Cost 2 vrev <2,3,u,1> + 2689836629U, // <3,2,2,0>: Cost 3 vext3 LHS, <2,2,0,1> + 2689836640U, // 
<3,2,2,1>: Cost 3 vext3 LHS, <2,2,1,3> + 1611449960U, // <3,2,2,2>: Cost 2 vext3 LHS, <2,2,2,2> + 1611892338U, // <3,2,2,3>: Cost 2 vext3 LHS, <2,2,3,3> + 2689836669U, // <3,2,2,4>: Cost 3 vext3 LHS, <2,2,4,5> + 2689836680U, // <3,2,2,5>: Cost 3 vext3 LHS, <2,2,5,7> + 2689836688U, // <3,2,2,6>: Cost 3 vext3 LHS, <2,2,6,6> + 3763578518U, // <3,2,2,7>: Cost 4 vext3 LHS, <2,2,7,3> + 1611892383U, // <3,2,2,u>: Cost 2 vext3 LHS, <2,2,u,3> + 1611450022U, // <3,2,3,0>: Cost 2 vext3 LHS, <2,3,0,1> + 2685191854U, // <3,2,3,1>: Cost 3 vext3 LHS, <2,3,1,0> + 2685191865U, // <3,2,3,2>: Cost 3 vext3 LHS, <2,3,2,2> + 2685191875U, // <3,2,3,3>: Cost 3 vext3 LHS, <2,3,3,3> + 1611450062U, // <3,2,3,4>: Cost 2 vext3 LHS, <2,3,4,5> + 2732967635U, // <3,2,3,5>: Cost 3 vext3 LHS, <2,3,5,1> + 2732967645U, // <3,2,3,6>: Cost 3 vext3 LHS, <2,3,6,2> + 2732967652U, // <3,2,3,7>: Cost 3 vext3 LHS, <2,3,7,0> + 1611450094U, // <3,2,3,u>: Cost 2 vext3 LHS, <2,3,u,1> + 2558279782U, // <3,2,4,0>: Cost 3 vext1 <1,3,2,4>, LHS + 2558280602U, // <3,2,4,1>: Cost 3 vext1 <1,3,2,4>, <1,2,3,4> + 2732967692U, // <3,2,4,2>: Cost 3 vext3 LHS, <2,4,2,4> + 2685634326U, // <3,2,4,3>: Cost 3 vext3 LHS, <2,4,3,5> + 2558283062U, // <3,2,4,4>: Cost 3 vext1 <1,3,2,4>, RHS + 1549716790U, // <3,2,4,5>: Cost 2 vext2 <1,0,3,2>, RHS + 2689836844U, // <3,2,4,6>: Cost 3 vext3 LHS, <2,4,6,0> + 2229077736U, // <3,2,4,7>: Cost 3 vrev <2,3,7,4> + 1549717033U, // <3,2,4,u>: Cost 2 vext2 <1,0,3,2>, RHS + 2552316006U, // <3,2,5,0>: Cost 3 vext1 <0,3,2,5>, LHS + 2228643507U, // <3,2,5,1>: Cost 3 vrev <2,3,1,5> + 2689836896U, // <3,2,5,2>: Cost 3 vext3 LHS, <2,5,2,7> + 2685634408U, // <3,2,5,3>: Cost 3 vext3 LHS, <2,5,3,6> + 1155122894U, // <3,2,5,4>: Cost 2 vrev <2,3,4,5> + 2665263108U, // <3,2,5,5>: Cost 3 vext2 , <5,5,5,5> + 2689836932U, // <3,2,5,6>: Cost 3 vext3 LHS, <2,5,6,7> + 2665263272U, // <3,2,5,7>: Cost 3 vext2 , <5,7,5,7> + 1155417842U, // <3,2,5,u>: Cost 2 vrev <2,3,u,5> + 2689836953U, // <3,2,6,0>: Cost 3 vext3 LHS, <2,6,0,1> + 2689836964U, // <3,2,6,1>: Cost 3 vext3 LHS, <2,6,1,3> + 2689836976U, // <3,2,6,2>: Cost 3 vext3 LHS, <2,6,2,6> + 1611892666U, // <3,2,6,3>: Cost 2 vext3 LHS, <2,6,3,7> + 2689836993U, // <3,2,6,4>: Cost 3 vext3 LHS, <2,6,4,5> + 2689837004U, // <3,2,6,5>: Cost 3 vext3 LHS, <2,6,5,7> + 2689837013U, // <3,2,6,6>: Cost 3 vext3 LHS, <2,6,6,7> + 2665263950U, // <3,2,6,7>: Cost 3 vext2 , <6,7,0,1> + 1611892711U, // <3,2,6,u>: Cost 2 vext3 LHS, <2,6,u,7> + 2665264122U, // <3,2,7,0>: Cost 3 vext2 , <7,0,1,2> + 2623460419U, // <3,2,7,1>: Cost 3 vext2 <1,0,3,2>, <7,1,0,3> + 4169138340U, // <3,2,7,2>: Cost 4 vtrnr <1,3,5,7>, <0,2,0,2> + 2962358374U, // <3,2,7,3>: Cost 3 vzipr <1,5,3,7>, LHS + 2665264486U, // <3,2,7,4>: Cost 3 vext2 , <7,4,5,6> + 2228954841U, // <3,2,7,5>: Cost 3 vrev <2,3,5,7> + 2229028578U, // <3,2,7,6>: Cost 3 vrev <2,3,6,7> + 2665264748U, // <3,2,7,7>: Cost 3 vext2 , <7,7,7,7> + 2962358379U, // <3,2,7,u>: Cost 3 vzipr <1,5,3,7>, LHS + 1611892795U, // <3,2,u,0>: Cost 2 vext3 LHS, <2,u,0,1> + 1549719342U, // <3,2,u,1>: Cost 2 vext2 <1,0,3,2>, LHS + 1611449960U, // <3,2,u,2>: Cost 2 vext3 LHS, <2,2,2,2> + 1611892824U, // <3,2,u,3>: Cost 2 vext3 LHS, <2,u,3,3> + 1611892835U, // <3,2,u,4>: Cost 2 vext3 LHS, <2,u,4,5> + 1549719706U, // <3,2,u,5>: Cost 2 vext2 <1,0,3,2>, RHS + 2689837168U, // <3,2,u,6>: Cost 3 vext3 LHS, <2,u,6,0> + 2665265408U, // <3,2,u,7>: Cost 3 vext2 , + 1611892867U, // <3,2,u,u>: Cost 2 vext3 LHS, <2,u,u,1> + 2685192331U, // <3,3,0,0>: Cost 3 vext3 LHS, <3,0,0,0> + 1611450518U, // <3,3,0,1>: 
Cost 2 vext3 LHS, <3,0,1,2> + 2685634717U, // <3,3,0,2>: Cost 3 vext3 LHS, <3,0,2,0> + 2564294806U, // <3,3,0,3>: Cost 3 vext1 <2,3,3,0>, <3,0,1,2> + 2685634736U, // <3,3,0,4>: Cost 3 vext3 LHS, <3,0,4,1> + 2732968122U, // <3,3,0,5>: Cost 3 vext3 LHS, <3,0,5,2> + 3763579075U, // <3,3,0,6>: Cost 4 vext3 LHS, <3,0,6,2> + 4034053264U, // <3,3,0,7>: Cost 4 vzipr <1,2,3,0>, <1,5,3,7> + 1611450581U, // <3,3,0,u>: Cost 2 vext3 LHS, <3,0,u,2> + 2685192415U, // <3,3,1,0>: Cost 3 vext3 LHS, <3,1,0,3> + 1550385992U, // <3,3,1,1>: Cost 2 vext2 <1,1,3,3>, <1,1,3,3> + 2685192433U, // <3,3,1,2>: Cost 3 vext3 LHS, <3,1,2,3> + 2685634808U, // <3,3,1,3>: Cost 3 vext3 LHS, <3,1,3,1> + 2558332214U, // <3,3,1,4>: Cost 3 vext1 <1,3,3,1>, RHS + 2685634828U, // <3,3,1,5>: Cost 3 vext3 LHS, <3,1,5,3> + 3759376661U, // <3,3,1,6>: Cost 4 vext3 LHS, <3,1,6,3> + 2703477022U, // <3,3,1,7>: Cost 3 vext3 <3,1,7,3>, <3,1,7,3> + 1555031423U, // <3,3,1,u>: Cost 2 vext2 <1,u,3,3>, <1,u,3,3> + 2564309094U, // <3,3,2,0>: Cost 3 vext1 <2,3,3,2>, LHS + 2630100513U, // <3,3,2,1>: Cost 3 vext2 <2,1,3,3>, <2,1,3,3> + 1557022322U, // <3,3,2,2>: Cost 2 vext2 <2,2,3,3>, <2,2,3,3> + 2685192520U, // <3,3,2,3>: Cost 3 vext3 LHS, <3,2,3,0> + 2564312374U, // <3,3,2,4>: Cost 3 vext1 <2,3,3,2>, RHS + 2732968286U, // <3,3,2,5>: Cost 3 vext3 LHS, <3,2,5,4> + 2685634918U, // <3,3,2,6>: Cost 3 vext3 LHS, <3,2,6,3> + 2704140655U, // <3,3,2,7>: Cost 3 vext3 <3,2,7,3>, <3,2,7,3> + 1561004120U, // <3,3,2,u>: Cost 2 vext2 <2,u,3,3>, <2,u,3,3> + 1496547430U, // <3,3,3,0>: Cost 2 vext1 <3,3,3,3>, LHS + 2624129256U, // <3,3,3,1>: Cost 3 vext2 <1,1,3,3>, <3,1,1,3> + 2630764866U, // <3,3,3,2>: Cost 3 vext2 <2,2,3,3>, <3,2,2,3> + 336380006U, // <3,3,3,3>: Cost 1 vdup3 LHS + 1496550710U, // <3,3,3,4>: Cost 2 vext1 <3,3,3,3>, RHS + 2732968368U, // <3,3,3,5>: Cost 3 vext3 LHS, <3,3,5,5> + 2624129683U, // <3,3,3,6>: Cost 3 vext2 <1,1,3,3>, <3,6,3,7> + 2594182400U, // <3,3,3,7>: Cost 3 vext1 <7,3,3,3>, <7,3,3,3> + 336380006U, // <3,3,3,u>: Cost 1 vdup3 LHS + 2558353510U, // <3,3,4,0>: Cost 3 vext1 <1,3,3,4>, LHS + 2558354411U, // <3,3,4,1>: Cost 3 vext1 <1,3,3,4>, <1,3,3,4> + 2564327108U, // <3,3,4,2>: Cost 3 vext1 <2,3,3,4>, <2,3,3,4> + 2564327938U, // <3,3,4,3>: Cost 3 vext1 <2,3,3,4>, <3,4,5,6> + 2960343962U, // <3,3,4,4>: Cost 3 vzipr <1,2,3,4>, <1,2,3,4> + 1611893250U, // <3,3,4,5>: Cost 2 vext3 LHS, <3,4,5,6> + 2771619126U, // <3,3,4,6>: Cost 3 vuzpl <3,3,3,3>, RHS + 4034086032U, // <3,3,4,7>: Cost 4 vzipr <1,2,3,4>, <1,5,3,7> + 1611893277U, // <3,3,4,u>: Cost 2 vext3 LHS, <3,4,u,6> + 2558361702U, // <3,3,5,0>: Cost 3 vext1 <1,3,3,5>, LHS + 2558362604U, // <3,3,5,1>: Cost 3 vext1 <1,3,3,5>, <1,3,3,5> + 2558363342U, // <3,3,5,2>: Cost 3 vext1 <1,3,3,5>, <2,3,4,5> + 2732968512U, // <3,3,5,3>: Cost 3 vext3 LHS, <3,5,3,5> + 2558364982U, // <3,3,5,4>: Cost 3 vext1 <1,3,3,5>, RHS + 3101279950U, // <3,3,5,5>: Cost 3 vtrnr <2,3,4,5>, <2,3,4,5> + 2665934946U, // <3,3,5,6>: Cost 3 vext2 , <5,6,7,0> + 2826636598U, // <3,3,5,7>: Cost 3 vuzpr <1,3,1,3>, RHS + 2826636599U, // <3,3,5,u>: Cost 3 vuzpr <1,3,1,3>, RHS + 2732968568U, // <3,3,6,0>: Cost 3 vext3 LHS, <3,6,0,7> + 3763579521U, // <3,3,6,1>: Cost 4 vext3 LHS, <3,6,1,7> + 2732968586U, // <3,3,6,2>: Cost 3 vext3 LHS, <3,6,2,7> + 2732968595U, // <3,3,6,3>: Cost 3 vext3 LHS, <3,6,3,7> + 2732968604U, // <3,3,6,4>: Cost 3 vext3 LHS, <3,6,4,7> + 3763579557U, // <3,3,6,5>: Cost 4 vext3 LHS, <3,6,5,7> + 2732968621U, // <3,3,6,6>: Cost 3 vext3 LHS, <3,6,6,6> + 2657973099U, // <3,3,6,7>: Cost 3 vext2 <6,7,3,3>, <6,7,3,3> + 
2658636732U, // <3,3,6,u>: Cost 3 vext2 <6,u,3,3>, <6,u,3,3> + 2558378086U, // <3,3,7,0>: Cost 3 vext1 <1,3,3,7>, LHS + 2558378990U, // <3,3,7,1>: Cost 3 vext1 <1,3,3,7>, <1,3,3,7> + 2564351687U, // <3,3,7,2>: Cost 3 vext1 <2,3,3,7>, <2,3,3,7> + 2661291264U, // <3,3,7,3>: Cost 3 vext2 <7,3,3,3>, <7,3,3,3> + 2558381366U, // <3,3,7,4>: Cost 3 vext1 <1,3,3,7>, RHS + 2732968694U, // <3,3,7,5>: Cost 3 vext3 LHS, <3,7,5,7> + 3781126907U, // <3,3,7,6>: Cost 4 vext3 <3,7,6,3>, <3,7,6,3> + 3095397376U, // <3,3,7,7>: Cost 3 vtrnr <1,3,5,7>, <1,3,5,7> + 2558383918U, // <3,3,7,u>: Cost 3 vext1 <1,3,3,7>, LHS + 1496547430U, // <3,3,u,0>: Cost 2 vext1 <3,3,3,3>, LHS + 1611893534U, // <3,3,u,1>: Cost 2 vext3 LHS, <3,u,1,2> + 1592858504U, // <3,3,u,2>: Cost 2 vext2 , + 336380006U, // <3,3,u,3>: Cost 1 vdup3 LHS + 1496550710U, // <3,3,u,4>: Cost 2 vext1 <3,3,3,3>, RHS + 1611893574U, // <3,3,u,5>: Cost 2 vext3 LHS, <3,u,5,6> + 2690280268U, // <3,3,u,6>: Cost 3 vext3 LHS, <3,u,6,3> + 2826636841U, // <3,3,u,7>: Cost 3 vuzpr <1,3,1,3>, RHS + 336380006U, // <3,3,u,u>: Cost 1 vdup3 LHS + 2624798720U, // <3,4,0,0>: Cost 3 vext2 <1,2,3,4>, <0,0,0,0> + 1551056998U, // <3,4,0,1>: Cost 2 vext2 <1,2,3,4>, LHS + 2624798884U, // <3,4,0,2>: Cost 3 vext2 <1,2,3,4>, <0,2,0,2> + 3693232384U, // <3,4,0,3>: Cost 4 vext2 <0,3,3,4>, <0,3,1,4> + 2624799058U, // <3,4,0,4>: Cost 3 vext2 <1,2,3,4>, <0,4,1,5> + 1659227026U, // <3,4,0,5>: Cost 2 vext3 LHS, <4,0,5,1> + 1659227036U, // <3,4,0,6>: Cost 2 vext3 LHS, <4,0,6,2> + 3667973382U, // <3,4,0,7>: Cost 4 vext1 <7,3,4,0>, <7,3,4,0> + 1551057565U, // <3,4,0,u>: Cost 2 vext2 <1,2,3,4>, LHS + 2624799478U, // <3,4,1,0>: Cost 3 vext2 <1,2,3,4>, <1,0,3,2> + 2624799540U, // <3,4,1,1>: Cost 3 vext2 <1,2,3,4>, <1,1,1,1> + 1551057818U, // <3,4,1,2>: Cost 2 vext2 <1,2,3,4>, <1,2,3,4> + 2624799704U, // <3,4,1,3>: Cost 3 vext2 <1,2,3,4>, <1,3,1,3> + 2564377910U, // <3,4,1,4>: Cost 3 vext1 <2,3,4,1>, RHS + 2689838050U, // <3,4,1,5>: Cost 3 vext3 LHS, <4,1,5,0> + 2689838062U, // <3,4,1,6>: Cost 3 vext3 LHS, <4,1,6,3> + 2628117807U, // <3,4,1,7>: Cost 3 vext2 <1,7,3,4>, <1,7,3,4> + 1555039616U, // <3,4,1,u>: Cost 2 vext2 <1,u,3,4>, <1,u,3,4> + 3626180710U, // <3,4,2,0>: Cost 4 vext1 <0,3,4,2>, LHS + 2624800298U, // <3,4,2,1>: Cost 3 vext2 <1,2,3,4>, <2,1,4,3> + 2624800360U, // <3,4,2,2>: Cost 3 vext2 <1,2,3,4>, <2,2,2,2> + 2624800422U, // <3,4,2,3>: Cost 3 vext2 <1,2,3,4>, <2,3,0,1> + 2624800514U, // <3,4,2,4>: Cost 3 vext2 <1,2,3,4>, <2,4,1,3> + 2709965878U, // <3,4,2,5>: Cost 3 vext3 <4,2,5,3>, <4,2,5,3> + 2689838140U, // <3,4,2,6>: Cost 3 vext3 LHS, <4,2,6,0> + 2634090504U, // <3,4,2,7>: Cost 3 vext2 <2,7,3,4>, <2,7,3,4> + 2689838158U, // <3,4,2,u>: Cost 3 vext3 LHS, <4,2,u,0> + 2624800918U, // <3,4,3,0>: Cost 3 vext2 <1,2,3,4>, <3,0,1,2> + 2636081403U, // <3,4,3,1>: Cost 3 vext2 <3,1,3,4>, <3,1,3,4> + 2636745036U, // <3,4,3,2>: Cost 3 vext2 <3,2,3,4>, <3,2,3,4> + 2624801180U, // <3,4,3,3>: Cost 3 vext2 <1,2,3,4>, <3,3,3,3> + 2624801232U, // <3,4,3,4>: Cost 3 vext2 <1,2,3,4>, <3,4,0,1> + 2905836854U, // <3,4,3,5>: Cost 3 vzipl <3,3,3,3>, RHS + 3040054582U, // <3,4,3,6>: Cost 3 vtrnl <3,3,3,3>, RHS + 3702524611U, // <3,4,3,7>: Cost 4 vext2 <1,u,3,4>, <3,7,0,1> + 2624801566U, // <3,4,3,u>: Cost 3 vext2 <1,2,3,4>, <3,u,1,2> + 2564399206U, // <3,4,4,0>: Cost 3 vext1 <2,3,4,4>, LHS + 2564400026U, // <3,4,4,1>: Cost 3 vext1 <2,3,4,4>, <1,2,3,4> + 2564400845U, // <3,4,4,2>: Cost 3 vext1 <2,3,4,4>, <2,3,4,4> + 2570373542U, // <3,4,4,3>: Cost 3 vext1 <3,3,4,4>, <3,3,4,4> + 1659227344U, // <3,4,4,4>: 
Cost 2 vext3 LHS, <4,4,4,4> + 1551060278U, // <3,4,4,5>: Cost 2 vext2 <1,2,3,4>, RHS + 1659227364U, // <3,4,4,6>: Cost 2 vext3 LHS, <4,4,6,6> + 3668006154U, // <3,4,4,7>: Cost 4 vext1 <7,3,4,4>, <7,3,4,4> + 1551060521U, // <3,4,4,u>: Cost 2 vext2 <1,2,3,4>, RHS + 1490665574U, // <3,4,5,0>: Cost 2 vext1 <2,3,4,5>, LHS + 2689838341U, // <3,4,5,1>: Cost 3 vext3 LHS, <4,5,1,3> + 1490667214U, // <3,4,5,2>: Cost 2 vext1 <2,3,4,5>, <2,3,4,5> + 2564409494U, // <3,4,5,3>: Cost 3 vext1 <2,3,4,5>, <3,0,1,2> + 1490668854U, // <3,4,5,4>: Cost 2 vext1 <2,3,4,5>, RHS + 2689838381U, // <3,4,5,5>: Cost 3 vext3 LHS, <4,5,5,7> + 537709878U, // <3,4,5,6>: Cost 1 vext3 LHS, RHS + 2594272523U, // <3,4,5,7>: Cost 3 vext1 <7,3,4,5>, <7,3,4,5> + 537709896U, // <3,4,5,u>: Cost 1 vext3 LHS, RHS + 2689838411U, // <3,4,6,0>: Cost 3 vext3 LHS, <4,6,0,1> + 2558444534U, // <3,4,6,1>: Cost 3 vext1 <1,3,4,6>, <1,3,4,6> + 2666607098U, // <3,4,6,2>: Cost 3 vext2 , <6,2,7,3> + 2558446082U, // <3,4,6,3>: Cost 3 vext1 <1,3,4,6>, <3,4,5,6> + 1659227508U, // <3,4,6,4>: Cost 2 vext3 LHS, <4,6,4,6> + 2689838462U, // <3,4,6,5>: Cost 3 vext3 LHS, <4,6,5,7> + 2689838471U, // <3,4,6,6>: Cost 3 vext3 LHS, <4,6,6,7> + 2657981292U, // <3,4,6,7>: Cost 3 vext2 <6,7,3,4>, <6,7,3,4> + 1659227540U, // <3,4,6,u>: Cost 2 vext3 LHS, <4,6,u,2> + 2666607610U, // <3,4,7,0>: Cost 3 vext2 , <7,0,1,2> + 3702527072U, // <3,4,7,1>: Cost 4 vext2 <1,u,3,4>, <7,1,3,5> + 2660635824U, // <3,4,7,2>: Cost 3 vext2 <7,2,3,4>, <7,2,3,4> + 3644139945U, // <3,4,7,3>: Cost 4 vext1 <3,3,4,7>, <3,3,4,7> + 2666607974U, // <3,4,7,4>: Cost 3 vext2 , <7,4,5,6> + 2732969416U, // <3,4,7,5>: Cost 3 vext3 LHS, <4,7,5,0> + 2732969425U, // <3,4,7,6>: Cost 3 vext3 LHS, <4,7,6,0> + 2666608236U, // <3,4,7,7>: Cost 3 vext2 , <7,7,7,7> + 2664617622U, // <3,4,7,u>: Cost 3 vext2 <7,u,3,4>, <7,u,3,4> + 1490690150U, // <3,4,u,0>: Cost 2 vext1 <2,3,4,u>, LHS + 1551062830U, // <3,4,u,1>: Cost 2 vext2 <1,2,3,4>, LHS + 1490691793U, // <3,4,u,2>: Cost 2 vext1 <2,3,4,u>, <2,3,4,u> + 2624804796U, // <3,4,u,3>: Cost 3 vext2 <1,2,3,4>, + 1490693430U, // <3,4,u,4>: Cost 2 vext1 <2,3,4,u>, RHS + 1551063194U, // <3,4,u,5>: Cost 2 vext2 <1,2,3,4>, RHS + 537710121U, // <3,4,u,6>: Cost 1 vext3 LHS, RHS + 2594297102U, // <3,4,u,7>: Cost 3 vext1 <7,3,4,u>, <7,3,4,u> + 537710139U, // <3,4,u,u>: Cost 1 vext3 LHS, RHS + 3692576768U, // <3,5,0,0>: Cost 4 vext2 <0,2,3,5>, <0,0,0,0> + 2618835046U, // <3,5,0,1>: Cost 3 vext2 <0,2,3,5>, LHS + 2618835138U, // <3,5,0,2>: Cost 3 vext2 <0,2,3,5>, <0,2,3,5> + 3692577024U, // <3,5,0,3>: Cost 4 vext2 <0,2,3,5>, <0,3,1,4> + 2689838690U, // <3,5,0,4>: Cost 3 vext3 LHS, <5,0,4,1> + 2732969579U, // <3,5,0,5>: Cost 3 vext3 LHS, <5,0,5,1> + 2732969588U, // <3,5,0,6>: Cost 3 vext3 LHS, <5,0,6,1> + 2246963055U, // <3,5,0,7>: Cost 3 vrev <5,3,7,0> + 2618835613U, // <3,5,0,u>: Cost 3 vext2 <0,2,3,5>, LHS + 2594308198U, // <3,5,1,0>: Cost 3 vext1 <7,3,5,1>, LHS + 3692577588U, // <3,5,1,1>: Cost 4 vext2 <0,2,3,5>, <1,1,1,1> + 2624807835U, // <3,5,1,2>: Cost 3 vext2 <1,2,3,5>, <1,2,3,5> + 2625471468U, // <3,5,1,3>: Cost 3 vext2 <1,3,3,5>, <1,3,3,5> + 2626135101U, // <3,5,1,4>: Cost 3 vext2 <1,4,3,5>, <1,4,3,5> + 2594311888U, // <3,5,1,5>: Cost 3 vext1 <7,3,5,1>, <5,1,7,3> + 3699877107U, // <3,5,1,6>: Cost 4 vext2 <1,4,3,5>, <1,6,5,7> + 1641680592U, // <3,5,1,7>: Cost 2 vext3 <5,1,7,3>, <5,1,7,3> + 1641754329U, // <3,5,1,u>: Cost 2 vext3 <5,1,u,3>, <5,1,u,3> + 3692578274U, // <3,5,2,0>: Cost 4 vext2 <0,2,3,5>, <2,0,5,3> + 2630116899U, // <3,5,2,1>: Cost 3 vext2 <2,1,3,5>, <2,1,3,5> 
+ 3692578408U, // <3,5,2,2>: Cost 4 vext2 <0,2,3,5>, <2,2,2,2> + 2625472206U, // <3,5,2,3>: Cost 3 vext2 <1,3,3,5>, <2,3,4,5> + 2632107798U, // <3,5,2,4>: Cost 3 vext2 <2,4,3,5>, <2,4,3,5> + 2715938575U, // <3,5,2,5>: Cost 3 vext3 <5,2,5,3>, <5,2,5,3> + 3692578746U, // <3,5,2,6>: Cost 4 vext2 <0,2,3,5>, <2,6,3,7> + 2716086049U, // <3,5,2,7>: Cost 3 vext3 <5,2,7,3>, <5,2,7,3> + 2634762330U, // <3,5,2,u>: Cost 3 vext2 <2,u,3,5>, <2,u,3,5> + 3692578966U, // <3,5,3,0>: Cost 4 vext2 <0,2,3,5>, <3,0,1,2> + 2636089596U, // <3,5,3,1>: Cost 3 vext2 <3,1,3,5>, <3,1,3,5> + 3699214668U, // <3,5,3,2>: Cost 4 vext2 <1,3,3,5>, <3,2,3,4> + 2638080412U, // <3,5,3,3>: Cost 3 vext2 <3,4,3,5>, <3,3,3,3> + 2618837506U, // <3,5,3,4>: Cost 3 vext2 <0,2,3,5>, <3,4,5,6> + 2832844494U, // <3,5,3,5>: Cost 3 vuzpr <2,3,4,5>, <2,3,4,5> + 4033415682U, // <3,5,3,6>: Cost 4 vzipr <1,1,3,3>, <3,4,5,6> + 3095072054U, // <3,5,3,7>: Cost 3 vtrnr <1,3,1,3>, RHS + 3095072055U, // <3,5,3,u>: Cost 3 vtrnr <1,3,1,3>, RHS + 2600304742U, // <3,5,4,0>: Cost 3 vext1 , LHS + 3763580815U, // <3,5,4,1>: Cost 4 vext3 LHS, <5,4,1,5> + 2564474582U, // <3,5,4,2>: Cost 3 vext1 <2,3,5,4>, <2,3,5,4> + 3699879044U, // <3,5,4,3>: Cost 4 vext2 <1,4,3,5>, <4,3,5,0> + 2600308022U, // <3,5,4,4>: Cost 3 vext1 , RHS + 2618838326U, // <3,5,4,5>: Cost 3 vext2 <0,2,3,5>, RHS + 2772454710U, // <3,5,4,6>: Cost 3 vuzpl <3,4,5,6>, RHS + 1659228102U, // <3,5,4,7>: Cost 2 vext3 LHS, <5,4,7,6> + 1659228111U, // <3,5,4,u>: Cost 2 vext3 LHS, <5,4,u,6> + 2570453094U, // <3,5,5,0>: Cost 3 vext1 <3,3,5,5>, LHS + 2624810704U, // <3,5,5,1>: Cost 3 vext2 <1,2,3,5>, <5,1,7,3> + 2570454734U, // <3,5,5,2>: Cost 3 vext1 <3,3,5,5>, <2,3,4,5> + 2570455472U, // <3,5,5,3>: Cost 3 vext1 <3,3,5,5>, <3,3,5,5> + 2570456374U, // <3,5,5,4>: Cost 3 vext1 <3,3,5,5>, RHS + 1659228164U, // <3,5,5,5>: Cost 2 vext3 LHS, <5,5,5,5> + 2732969998U, // <3,5,5,6>: Cost 3 vext3 LHS, <5,5,6,6> + 1659228184U, // <3,5,5,7>: Cost 2 vext3 LHS, <5,5,7,7> + 1659228193U, // <3,5,5,u>: Cost 2 vext3 LHS, <5,5,u,7> + 2732970020U, // <3,5,6,0>: Cost 3 vext3 LHS, <5,6,0,1> + 2732970035U, // <3,5,6,1>: Cost 3 vext3 LHS, <5,6,1,7> + 2564490968U, // <3,5,6,2>: Cost 3 vext1 <2,3,5,6>, <2,3,5,6> + 2732970050U, // <3,5,6,3>: Cost 3 vext3 LHS, <5,6,3,4> + 2732970060U, // <3,5,6,4>: Cost 3 vext3 LHS, <5,6,4,5> + 2732970071U, // <3,5,6,5>: Cost 3 vext3 LHS, <5,6,5,7> + 2732970080U, // <3,5,6,6>: Cost 3 vext3 LHS, <5,6,6,7> + 1659228258U, // <3,5,6,7>: Cost 2 vext3 LHS, <5,6,7,0> + 1659228267U, // <3,5,6,u>: Cost 2 vext3 LHS, <5,6,u,0> + 1484783718U, // <3,5,7,0>: Cost 2 vext1 <1,3,5,7>, LHS + 1484784640U, // <3,5,7,1>: Cost 2 vext1 <1,3,5,7>, <1,3,5,7> + 2558527080U, // <3,5,7,2>: Cost 3 vext1 <1,3,5,7>, <2,2,2,2> + 2558527638U, // <3,5,7,3>: Cost 3 vext1 <1,3,5,7>, <3,0,1,2> + 1484786998U, // <3,5,7,4>: Cost 2 vext1 <1,3,5,7>, RHS + 1659228328U, // <3,5,7,5>: Cost 2 vext3 LHS, <5,7,5,7> + 2732970154U, // <3,5,7,6>: Cost 3 vext3 LHS, <5,7,6,0> + 2558531180U, // <3,5,7,7>: Cost 3 vext1 <1,3,5,7>, <7,7,7,7> + 1484789550U, // <3,5,7,u>: Cost 2 vext1 <1,3,5,7>, LHS + 1484791910U, // <3,5,u,0>: Cost 2 vext1 <1,3,5,u>, LHS + 1484792833U, // <3,5,u,1>: Cost 2 vext1 <1,3,5,u>, <1,3,5,u> + 2558535272U, // <3,5,u,2>: Cost 3 vext1 <1,3,5,u>, <2,2,2,2> + 2558535830U, // <3,5,u,3>: Cost 3 vext1 <1,3,5,u>, <3,0,1,2> + 1484795190U, // <3,5,u,4>: Cost 2 vext1 <1,3,5,u>, RHS + 1659228409U, // <3,5,u,5>: Cost 2 vext3 LHS, <5,u,5,7> + 2772457626U, // <3,5,u,6>: Cost 3 vuzpl <3,4,5,6>, RHS + 1646326023U, // <3,5,u,7>: Cost 2 vext3 
<5,u,7,3>, <5,u,7,3> + 1484797742U, // <3,5,u,u>: Cost 2 vext1 <1,3,5,u>, LHS + 2558541926U, // <3,6,0,0>: Cost 3 vext1 <1,3,6,0>, LHS + 2689839393U, // <3,6,0,1>: Cost 3 vext3 LHS, <6,0,1,2> + 2689839404U, // <3,6,0,2>: Cost 3 vext3 LHS, <6,0,2,4> + 3706519808U, // <3,6,0,3>: Cost 4 vext2 <2,5,3,6>, <0,3,1,4> + 2689839420U, // <3,6,0,4>: Cost 3 vext3 LHS, <6,0,4,2> + 2732970314U, // <3,6,0,5>: Cost 3 vext3 LHS, <6,0,5,7> + 2732970316U, // <3,6,0,6>: Cost 3 vext3 LHS, <6,0,6,0> + 2960313654U, // <3,6,0,7>: Cost 3 vzipr <1,2,3,0>, RHS + 2689839456U, // <3,6,0,u>: Cost 3 vext3 LHS, <6,0,u,2> + 3763581290U, // <3,6,1,0>: Cost 4 vext3 LHS, <6,1,0,3> + 3763581297U, // <3,6,1,1>: Cost 4 vext3 LHS, <6,1,1,1> + 2624816028U, // <3,6,1,2>: Cost 3 vext2 <1,2,3,6>, <1,2,3,6> + 3763581315U, // <3,6,1,3>: Cost 4 vext3 LHS, <6,1,3,1> + 2626143294U, // <3,6,1,4>: Cost 3 vext2 <1,4,3,6>, <1,4,3,6> + 3763581335U, // <3,6,1,5>: Cost 4 vext3 LHS, <6,1,5,3> + 2721321376U, // <3,6,1,6>: Cost 3 vext3 <6,1,6,3>, <6,1,6,3> + 2721395113U, // <3,6,1,7>: Cost 3 vext3 <6,1,7,3>, <6,1,7,3> + 2628797826U, // <3,6,1,u>: Cost 3 vext2 <1,u,3,6>, <1,u,3,6> + 2594390118U, // <3,6,2,0>: Cost 3 vext1 <7,3,6,2>, LHS + 2721616324U, // <3,6,2,1>: Cost 3 vext3 <6,2,1,3>, <6,2,1,3> + 2630788725U, // <3,6,2,2>: Cost 3 vext2 <2,2,3,6>, <2,2,3,6> + 3763581395U, // <3,6,2,3>: Cost 4 vext3 LHS, <6,2,3,0> + 2632115991U, // <3,6,2,4>: Cost 3 vext2 <2,4,3,6>, <2,4,3,6> + 2632779624U, // <3,6,2,5>: Cost 3 vext2 <2,5,3,6>, <2,5,3,6> + 2594394618U, // <3,6,2,6>: Cost 3 vext1 <7,3,6,2>, <6,2,7,3> + 1648316922U, // <3,6,2,7>: Cost 2 vext3 <6,2,7,3>, <6,2,7,3> + 1648390659U, // <3,6,2,u>: Cost 2 vext3 <6,2,u,3>, <6,2,u,3> + 3693914262U, // <3,6,3,0>: Cost 4 vext2 <0,4,3,6>, <3,0,1,2> + 3638281176U, // <3,6,3,1>: Cost 4 vext1 <2,3,6,3>, <1,3,1,3> + 3696568678U, // <3,6,3,2>: Cost 4 vext2 <0,u,3,6>, <3,2,6,3> + 2638088604U, // <3,6,3,3>: Cost 3 vext2 <3,4,3,6>, <3,3,3,3> + 2632780290U, // <3,6,3,4>: Cost 3 vext2 <2,5,3,6>, <3,4,5,6> + 3712494145U, // <3,6,3,5>: Cost 4 vext2 <3,5,3,6>, <3,5,3,6> + 3698559612U, // <3,6,3,6>: Cost 4 vext2 <1,2,3,6>, <3,6,1,2> + 2959674678U, // <3,6,3,7>: Cost 3 vzipr <1,1,3,3>, RHS + 2959674679U, // <3,6,3,u>: Cost 3 vzipr <1,1,3,3>, RHS + 3763581536U, // <3,6,4,0>: Cost 4 vext3 LHS, <6,4,0,6> + 2722943590U, // <3,6,4,1>: Cost 3 vext3 <6,4,1,3>, <6,4,1,3> + 2732970609U, // <3,6,4,2>: Cost 3 vext3 LHS, <6,4,2,5> + 3698560147U, // <3,6,4,3>: Cost 4 vext2 <1,2,3,6>, <4,3,6,6> + 2732970628U, // <3,6,4,4>: Cost 3 vext3 LHS, <6,4,4,6> + 2689839757U, // <3,6,4,5>: Cost 3 vext3 LHS, <6,4,5,6> + 2732970640U, // <3,6,4,6>: Cost 3 vext3 LHS, <6,4,6,0> + 2960346422U, // <3,6,4,7>: Cost 3 vzipr <1,2,3,4>, RHS + 2689839784U, // <3,6,4,u>: Cost 3 vext3 LHS, <6,4,u,6> + 2576498790U, // <3,6,5,0>: Cost 3 vext1 <4,3,6,5>, LHS + 3650241270U, // <3,6,5,1>: Cost 4 vext1 <4,3,6,5>, <1,0,3,2> + 2732970692U, // <3,6,5,2>: Cost 3 vext3 LHS, <6,5,2,7> + 2576501250U, // <3,6,5,3>: Cost 3 vext1 <4,3,6,5>, <3,4,5,6> + 2576501906U, // <3,6,5,4>: Cost 3 vext1 <4,3,6,5>, <4,3,6,5> + 3650244622U, // <3,6,5,5>: Cost 4 vext1 <4,3,6,5>, <5,5,6,6> + 4114633528U, // <3,6,5,6>: Cost 4 vtrnl <3,4,5,6>, <6,6,6,6> + 2732970735U, // <3,6,5,7>: Cost 3 vext3 LHS, <6,5,7,5> + 2576504622U, // <3,6,5,u>: Cost 3 vext1 <4,3,6,5>, LHS + 2732970749U, // <3,6,6,0>: Cost 3 vext3 LHS, <6,6,0,1> + 2724270856U, // <3,6,6,1>: Cost 3 vext3 <6,6,1,3>, <6,6,1,3> + 2624819706U, // <3,6,6,2>: Cost 3 vext2 <1,2,3,6>, <6,2,7,3> + 3656223234U, // <3,6,6,3>: Cost 4 vext1 <5,3,6,6>, 
<3,4,5,6> + 2732970788U, // <3,6,6,4>: Cost 3 vext3 LHS, <6,6,4,4> + 2732970800U, // <3,6,6,5>: Cost 3 vext3 LHS, <6,6,5,7> + 1659228984U, // <3,6,6,6>: Cost 2 vext3 LHS, <6,6,6,6> + 1659228994U, // <3,6,6,7>: Cost 2 vext3 LHS, <6,6,7,7> + 1659229003U, // <3,6,6,u>: Cost 2 vext3 LHS, <6,6,u,7> + 1659229006U, // <3,6,7,0>: Cost 2 vext3 LHS, <6,7,0,1> + 2558600201U, // <3,6,7,1>: Cost 3 vext1 <1,3,6,7>, <1,3,6,7> + 2558601146U, // <3,6,7,2>: Cost 3 vext1 <1,3,6,7>, <2,6,3,7> + 2725081963U, // <3,6,7,3>: Cost 3 vext3 <6,7,3,3>, <6,7,3,3> + 1659229046U, // <3,6,7,4>: Cost 2 vext3 LHS, <6,7,4,5> + 2715423611U, // <3,6,7,5>: Cost 3 vext3 <5,1,7,3>, <6,7,5,1> + 2722059141U, // <3,6,7,6>: Cost 3 vext3 <6,2,7,3>, <6,7,6,2> + 2962361654U, // <3,6,7,7>: Cost 3 vzipr <1,5,3,7>, RHS + 1659229078U, // <3,6,7,u>: Cost 2 vext3 LHS, <6,7,u,1> + 1659229087U, // <3,6,u,0>: Cost 2 vext3 LHS, <6,u,0,1> + 2689840041U, // <3,6,u,1>: Cost 3 vext3 LHS, <6,u,1,2> + 2558609339U, // <3,6,u,2>: Cost 3 vext1 <1,3,6,u>, <2,6,3,u> + 2576525853U, // <3,6,u,3>: Cost 3 vext1 <4,3,6,u>, <3,4,u,6> + 1659229127U, // <3,6,u,4>: Cost 2 vext3 LHS, <6,u,4,5> + 2689840081U, // <3,6,u,5>: Cost 3 vext3 LHS, <6,u,5,6> + 1659228984U, // <3,6,u,6>: Cost 2 vext3 LHS, <6,6,6,6> + 1652298720U, // <3,6,u,7>: Cost 2 vext3 <6,u,7,3>, <6,u,7,3> + 1659229159U, // <3,6,u,u>: Cost 2 vext3 LHS, <6,u,u,1> + 2626813952U, // <3,7,0,0>: Cost 3 vext2 <1,5,3,7>, <0,0,0,0> + 1553072230U, // <3,7,0,1>: Cost 2 vext2 <1,5,3,7>, LHS + 2626814116U, // <3,7,0,2>: Cost 3 vext2 <1,5,3,7>, <0,2,0,2> + 3700556028U, // <3,7,0,3>: Cost 4 vext2 <1,5,3,7>, <0,3,1,0> + 2626814290U, // <3,7,0,4>: Cost 3 vext2 <1,5,3,7>, <0,4,1,5> + 2582507375U, // <3,7,0,5>: Cost 3 vext1 <5,3,7,0>, <5,3,7,0> + 2588480072U, // <3,7,0,6>: Cost 3 vext1 <6,3,7,0>, <6,3,7,0> + 2732971055U, // <3,7,0,7>: Cost 3 vext3 LHS, <7,0,7,1> + 1553072797U, // <3,7,0,u>: Cost 2 vext2 <1,5,3,7>, LHS + 2626814710U, // <3,7,1,0>: Cost 3 vext2 <1,5,3,7>, <1,0,3,2> + 2626814772U, // <3,7,1,1>: Cost 3 vext2 <1,5,3,7>, <1,1,1,1> + 2626814870U, // <3,7,1,2>: Cost 3 vext2 <1,5,3,7>, <1,2,3,0> + 2625487854U, // <3,7,1,3>: Cost 3 vext2 <1,3,3,7>, <1,3,3,7> + 2582514998U, // <3,7,1,4>: Cost 3 vext1 <5,3,7,1>, RHS + 1553073296U, // <3,7,1,5>: Cost 2 vext2 <1,5,3,7>, <1,5,3,7> + 2627478753U, // <3,7,1,6>: Cost 3 vext2 <1,6,3,7>, <1,6,3,7> + 2727367810U, // <3,7,1,7>: Cost 3 vext3 <7,1,7,3>, <7,1,7,3> + 1555064195U, // <3,7,1,u>: Cost 2 vext2 <1,u,3,7>, <1,u,3,7> + 2588491878U, // <3,7,2,0>: Cost 3 vext1 <6,3,7,2>, LHS + 3700557318U, // <3,7,2,1>: Cost 4 vext2 <1,5,3,7>, <2,1,0,3> + 2626815592U, // <3,7,2,2>: Cost 3 vext2 <1,5,3,7>, <2,2,2,2> + 2626815654U, // <3,7,2,3>: Cost 3 vext2 <1,5,3,7>, <2,3,0,1> + 2588495158U, // <3,7,2,4>: Cost 3 vext1 <6,3,7,2>, RHS + 2632787817U, // <3,7,2,5>: Cost 3 vext2 <2,5,3,7>, <2,5,3,7> + 1559709626U, // <3,7,2,6>: Cost 2 vext2 <2,6,3,7>, <2,6,3,7> + 2728031443U, // <3,7,2,7>: Cost 3 vext3 <7,2,7,3>, <7,2,7,3> + 1561036892U, // <3,7,2,u>: Cost 2 vext2 <2,u,3,7>, <2,u,3,7> + 2626816150U, // <3,7,3,0>: Cost 3 vext2 <1,5,3,7>, <3,0,1,2> + 2626816268U, // <3,7,3,1>: Cost 3 vext2 <1,5,3,7>, <3,1,5,3> + 2633451878U, // <3,7,3,2>: Cost 3 vext2 <2,6,3,7>, <3,2,6,3> + 2626816412U, // <3,7,3,3>: Cost 3 vext2 <1,5,3,7>, <3,3,3,3> + 2626816514U, // <3,7,3,4>: Cost 3 vext2 <1,5,3,7>, <3,4,5,6> + 2638760514U, // <3,7,3,5>: Cost 3 vext2 <3,5,3,7>, <3,5,3,7> + 2639424147U, // <3,7,3,6>: Cost 3 vext2 <3,6,3,7>, <3,6,3,7> + 2826961920U, // <3,7,3,7>: Cost 3 vuzpr <1,3,5,7>, <1,3,5,7> + 2626816798U, 
// <3,7,3,u>: Cost 3 vext2 <1,5,3,7>, <3,u,1,2> + 2582536294U, // <3,7,4,0>: Cost 3 vext1 <5,3,7,4>, LHS + 2582537360U, // <3,7,4,1>: Cost 3 vext1 <5,3,7,4>, <1,5,3,7> + 2588510138U, // <3,7,4,2>: Cost 3 vext1 <6,3,7,4>, <2,6,3,7> + 3700558996U, // <3,7,4,3>: Cost 4 vext2 <1,5,3,7>, <4,3,6,7> + 2582539574U, // <3,7,4,4>: Cost 3 vext1 <5,3,7,4>, RHS + 1553075510U, // <3,7,4,5>: Cost 2 vext2 <1,5,3,7>, RHS + 2588512844U, // <3,7,4,6>: Cost 3 vext1 <6,3,7,4>, <6,3,7,4> + 2564625766U, // <3,7,4,7>: Cost 3 vext1 <2,3,7,4>, <7,4,5,6> + 1553075753U, // <3,7,4,u>: Cost 2 vext2 <1,5,3,7>, RHS + 2732971398U, // <3,7,5,0>: Cost 3 vext3 LHS, <7,5,0,2> + 2626817744U, // <3,7,5,1>: Cost 3 vext2 <1,5,3,7>, <5,1,7,3> + 3700559649U, // <3,7,5,2>: Cost 4 vext2 <1,5,3,7>, <5,2,7,3> + 2626817903U, // <3,7,5,3>: Cost 3 vext2 <1,5,3,7>, <5,3,7,0> + 2258728203U, // <3,7,5,4>: Cost 3 vrev <7,3,4,5> + 2732971446U, // <3,7,5,5>: Cost 3 vext3 LHS, <7,5,5,5> + 2732971457U, // <3,7,5,6>: Cost 3 vext3 LHS, <7,5,6,7> + 2826964278U, // <3,7,5,7>: Cost 3 vuzpr <1,3,5,7>, RHS + 2826964279U, // <3,7,5,u>: Cost 3 vuzpr <1,3,5,7>, RHS + 2732971478U, // <3,7,6,0>: Cost 3 vext3 LHS, <7,6,0,1> + 2732971486U, // <3,7,6,1>: Cost 3 vext3 LHS, <7,6,1,0> + 2633454074U, // <3,7,6,2>: Cost 3 vext2 <2,6,3,7>, <6,2,7,3> + 2633454152U, // <3,7,6,3>: Cost 3 vext2 <2,6,3,7>, <6,3,7,0> + 2732971518U, // <3,7,6,4>: Cost 3 vext3 LHS, <7,6,4,5> + 2732971526U, // <3,7,6,5>: Cost 3 vext3 LHS, <7,6,5,4> + 2732971537U, // <3,7,6,6>: Cost 3 vext3 LHS, <7,6,6,6> + 2732971540U, // <3,7,6,7>: Cost 3 vext3 LHS, <7,6,7,0> + 2726041124U, // <3,7,6,u>: Cost 3 vext3 <6,u,7,3>, <7,6,u,7> + 2570616934U, // <3,7,7,0>: Cost 3 vext1 <3,3,7,7>, LHS + 2570617856U, // <3,7,7,1>: Cost 3 vext1 <3,3,7,7>, <1,3,5,7> + 2564646635U, // <3,7,7,2>: Cost 3 vext1 <2,3,7,7>, <2,3,7,7> + 2570619332U, // <3,7,7,3>: Cost 3 vext1 <3,3,7,7>, <3,3,7,7> + 2570620214U, // <3,7,7,4>: Cost 3 vext1 <3,3,7,7>, RHS + 2582564726U, // <3,7,7,5>: Cost 3 vext1 <5,3,7,7>, <5,3,7,7> + 2588537423U, // <3,7,7,6>: Cost 3 vext1 <6,3,7,7>, <6,3,7,7> + 1659229804U, // <3,7,7,7>: Cost 2 vext3 LHS, <7,7,7,7> + 1659229804U, // <3,7,7,u>: Cost 2 vext3 LHS, <7,7,7,7> + 2626819795U, // <3,7,u,0>: Cost 3 vext2 <1,5,3,7>, + 1553078062U, // <3,7,u,1>: Cost 2 vext2 <1,5,3,7>, LHS + 2626819973U, // <3,7,u,2>: Cost 3 vext2 <1,5,3,7>, + 2826961565U, // <3,7,u,3>: Cost 3 vuzpr <1,3,5,7>, LHS + 2626820159U, // <3,7,u,4>: Cost 3 vext2 <1,5,3,7>, + 1553078426U, // <3,7,u,5>: Cost 2 vext2 <1,5,3,7>, RHS + 1595545808U, // <3,7,u,6>: Cost 2 vext2 , + 1659229804U, // <3,7,u,7>: Cost 2 vext3 LHS, <7,7,7,7> + 1553078629U, // <3,7,u,u>: Cost 2 vext2 <1,5,3,7>, LHS + 1611448320U, // <3,u,0,0>: Cost 2 vext3 LHS, <0,0,0,0> + 1611896531U, // <3,u,0,1>: Cost 2 vext3 LHS, + 1659672284U, // <3,u,0,2>: Cost 2 vext3 LHS, + 1616099045U, // <3,u,0,3>: Cost 2 vext3 LHS, + 2685638381U, // <3,u,0,4>: Cost 3 vext3 LHS, + 1663874806U, // <3,u,0,5>: Cost 2 vext3 LHS, + 1663874816U, // <3,u,0,6>: Cost 2 vext3 LHS, + 2960313672U, // <3,u,0,7>: Cost 3 vzipr <1,2,3,0>, RHS + 1611896594U, // <3,u,0,u>: Cost 2 vext3 LHS, + 1549763324U, // <3,u,1,0>: Cost 2 vext2 <1,0,3,u>, <1,0,3,u> + 1550426957U, // <3,u,1,1>: Cost 2 vext2 <1,1,3,u>, <1,1,3,u> + 537712430U, // <3,u,1,2>: Cost 1 vext3 LHS, LHS + 1616541495U, // <3,u,1,3>: Cost 2 vext3 LHS, + 1490930998U, // <3,u,1,4>: Cost 2 vext1 <2,3,u,1>, RHS + 1553081489U, // <3,u,1,5>: Cost 2 vext2 <1,5,3,u>, <1,5,3,u> + 2627486946U, // <3,u,1,6>: Cost 3 vext2 <1,6,3,u>, <1,6,3,u> + 1659230043U, // 
<3,u,1,7>: Cost 2 vext3 LHS, + 537712484U, // <3,u,1,u>: Cost 1 vext3 LHS, LHS + 1611890852U, // <3,u,2,0>: Cost 2 vext3 LHS, <0,2,0,2> + 2624833102U, // <3,u,2,1>: Cost 3 vext2 <1,2,3,u>, <2,1,u,3> + 1557063287U, // <3,u,2,2>: Cost 2 vext2 <2,2,3,u>, <2,2,3,u> + 1616099205U, // <3,u,2,3>: Cost 2 vext3 LHS, + 1611890892U, // <3,u,2,4>: Cost 2 vext3 LHS, <0,2,4,6> + 2689841054U, // <3,u,2,5>: Cost 3 vext3 LHS, + 1559717819U, // <3,u,2,6>: Cost 2 vext2 <2,6,3,u>, <2,6,3,u> + 1659230124U, // <3,u,2,7>: Cost 2 vext3 LHS, + 1616541618U, // <3,u,2,u>: Cost 2 vext3 LHS, + 1611896764U, // <3,u,3,0>: Cost 2 vext3 LHS, + 1484973079U, // <3,u,3,1>: Cost 2 vext1 <1,3,u,3>, <1,3,u,3> + 2685638607U, // <3,u,3,2>: Cost 3 vext3 LHS, + 336380006U, // <3,u,3,3>: Cost 1 vdup3 LHS + 1611896804U, // <3,u,3,4>: Cost 2 vext3 LHS, + 1616541679U, // <3,u,3,5>: Cost 2 vext3 LHS, + 2690283512U, // <3,u,3,6>: Cost 3 vext3 LHS, + 2959674696U, // <3,u,3,7>: Cost 3 vzipr <1,1,3,3>, RHS + 336380006U, // <3,u,3,u>: Cost 1 vdup3 LHS + 2558722150U, // <3,u,4,0>: Cost 3 vext1 <1,3,u,4>, LHS + 1659672602U, // <3,u,4,1>: Cost 2 vext3 LHS, + 1659672612U, // <3,u,4,2>: Cost 2 vext3 LHS, + 2689841196U, // <3,u,4,3>: Cost 3 vext3 LHS, + 1659227344U, // <3,u,4,4>: Cost 2 vext3 LHS, <4,4,4,4> + 1611896895U, // <3,u,4,5>: Cost 2 vext3 LHS, + 1663875144U, // <3,u,4,6>: Cost 2 vext3 LHS, + 1659230289U, // <3,u,4,7>: Cost 2 vext3 LHS, + 1611896922U, // <3,u,4,u>: Cost 2 vext3 LHS, + 1490960486U, // <3,u,5,0>: Cost 2 vext1 <2,3,u,5>, LHS + 2689841261U, // <3,u,5,1>: Cost 3 vext3 LHS, + 1490962162U, // <3,u,5,2>: Cost 2 vext1 <2,3,u,5>, <2,3,u,5> + 1616541823U, // <3,u,5,3>: Cost 2 vext3 LHS, + 1490963766U, // <3,u,5,4>: Cost 2 vext1 <2,3,u,5>, RHS + 1659228164U, // <3,u,5,5>: Cost 2 vext3 LHS, <5,5,5,5> + 537712794U, // <3,u,5,6>: Cost 1 vext3 LHS, RHS + 1659230371U, // <3,u,5,7>: Cost 2 vext3 LHS, + 537712812U, // <3,u,5,u>: Cost 1 vext3 LHS, RHS + 2689841327U, // <3,u,6,0>: Cost 3 vext3 LHS, + 2558739482U, // <3,u,6,1>: Cost 3 vext1 <1,3,u,6>, <1,3,u,6> + 2689841351U, // <3,u,6,2>: Cost 3 vext3 LHS, + 1616099536U, // <3,u,6,3>: Cost 2 vext3 LHS, + 1659227508U, // <3,u,6,4>: Cost 2 vext3 LHS, <4,6,4,6> + 2690283746U, // <3,u,6,5>: Cost 3 vext3 LHS, + 1659228984U, // <3,u,6,6>: Cost 2 vext3 LHS, <6,6,6,6> + 1659230445U, // <3,u,6,7>: Cost 2 vext3 LHS, + 1616099581U, // <3,u,6,u>: Cost 2 vext3 LHS, + 1485004902U, // <3,u,7,0>: Cost 2 vext1 <1,3,u,7>, LHS + 1485005851U, // <3,u,7,1>: Cost 2 vext1 <1,3,u,7>, <1,3,u,7> + 2558748264U, // <3,u,7,2>: Cost 3 vext1 <1,3,u,7>, <2,2,2,2> + 3095397021U, // <3,u,7,3>: Cost 3 vtrnr <1,3,5,7>, LHS + 1485008182U, // <3,u,7,4>: Cost 2 vext1 <1,3,u,7>, RHS + 1659228328U, // <3,u,7,5>: Cost 2 vext3 LHS, <5,7,5,7> + 2722060599U, // <3,u,7,6>: Cost 3 vext3 <6,2,7,3>, + 1659229804U, // <3,u,7,7>: Cost 2 vext3 LHS, <7,7,7,7> + 1485010734U, // <3,u,7,u>: Cost 2 vext1 <1,3,u,7>, LHS + 1616099665U, // <3,u,u,0>: Cost 2 vext3 LHS, + 1611897179U, // <3,u,u,1>: Cost 2 vext3 LHS, + 537712997U, // <3,u,u,2>: Cost 1 vext3 LHS, LHS + 336380006U, // <3,u,u,3>: Cost 1 vdup3 LHS + 1616099705U, // <3,u,u,4>: Cost 2 vext3 LHS, + 1611897219U, // <3,u,u,5>: Cost 2 vext3 LHS, + 537713037U, // <3,u,u,6>: Cost 1 vext3 LHS, RHS + 1659230607U, // <3,u,u,7>: Cost 2 vext3 LHS, + 537713051U, // <3,u,u,u>: Cost 1 vext3 LHS, LHS + 2691907584U, // <4,0,0,0>: Cost 3 vext3 <1,2,3,4>, <0,0,0,0> + 2691907594U, // <4,0,0,1>: Cost 3 vext3 <1,2,3,4>, <0,0,1,1> + 2691907604U, // <4,0,0,2>: Cost 3 vext3 <1,2,3,4>, <0,0,2,2> + 3709862144U, // 
<4,0,0,3>: Cost 4 vext2 <3,1,4,0>, <0,3,1,4> + 2684682280U, // <4,0,0,4>: Cost 3 vext3 <0,0,4,4>, <0,0,4,4> + 3694600633U, // <4,0,0,5>: Cost 4 vext2 <0,5,4,0>, <0,5,4,0> + 3291431290U, // <4,0,0,6>: Cost 4 vrev <0,4,6,0> + 3668342067U, // <4,0,0,7>: Cost 4 vext1 <7,4,0,0>, <7,4,0,0> + 2691907657U, // <4,0,0,u>: Cost 3 vext3 <1,2,3,4>, <0,0,u,1> + 2570715238U, // <4,0,1,0>: Cost 3 vext1 <3,4,0,1>, LHS + 2570716058U, // <4,0,1,1>: Cost 3 vext1 <3,4,0,1>, <1,2,3,4> + 1618165862U, // <4,0,1,2>: Cost 2 vext3 <1,2,3,4>, LHS + 2570717648U, // <4,0,1,3>: Cost 3 vext1 <3,4,0,1>, <3,4,0,1> + 2570718518U, // <4,0,1,4>: Cost 3 vext1 <3,4,0,1>, RHS + 2594607206U, // <4,0,1,5>: Cost 3 vext1 <7,4,0,1>, <5,6,7,4> + 3662377563U, // <4,0,1,6>: Cost 4 vext1 <6,4,0,1>, <6,4,0,1> + 2594608436U, // <4,0,1,7>: Cost 3 vext1 <7,4,0,1>, <7,4,0,1> + 1618165916U, // <4,0,1,u>: Cost 2 vext3 <1,2,3,4>, LHS + 2685714598U, // <4,0,2,0>: Cost 3 vext3 <0,2,0,4>, <0,2,0,4> + 3759530159U, // <4,0,2,1>: Cost 4 vext3 <0,2,1,4>, <0,2,1,4> + 2685862072U, // <4,0,2,2>: Cost 3 vext3 <0,2,2,4>, <0,2,2,4> + 2631476937U, // <4,0,2,3>: Cost 3 vext2 <2,3,4,0>, <2,3,4,0> + 2685714636U, // <4,0,2,4>: Cost 3 vext3 <0,2,0,4>, <0,2,4,6> + 3765649622U, // <4,0,2,5>: Cost 4 vext3 <1,2,3,4>, <0,2,5,7> + 2686157020U, // <4,0,2,6>: Cost 3 vext3 <0,2,6,4>, <0,2,6,4> + 3668358453U, // <4,0,2,7>: Cost 4 vext1 <7,4,0,2>, <7,4,0,2> + 2686304494U, // <4,0,2,u>: Cost 3 vext3 <0,2,u,4>, <0,2,u,4> + 3632529510U, // <4,0,3,0>: Cost 4 vext1 <1,4,0,3>, LHS + 2686451968U, // <4,0,3,1>: Cost 3 vext3 <0,3,1,4>, <0,3,1,4> + 2686525705U, // <4,0,3,2>: Cost 3 vext3 <0,3,2,4>, <0,3,2,4> + 3760341266U, // <4,0,3,3>: Cost 4 vext3 <0,3,3,4>, <0,3,3,4> + 3632532790U, // <4,0,3,4>: Cost 4 vext1 <1,4,0,3>, RHS + 3913254606U, // <4,0,3,5>: Cost 4 vuzpr <3,4,5,0>, <2,3,4,5> + 3705219740U, // <4,0,3,6>: Cost 4 vext2 <2,3,4,0>, <3,6,4,7> + 3713845990U, // <4,0,3,7>: Cost 4 vext2 <3,7,4,0>, <3,7,4,0> + 2686451968U, // <4,0,3,u>: Cost 3 vext3 <0,3,1,4>, <0,3,1,4> + 2552823910U, // <4,0,4,0>: Cost 3 vext1 <0,4,0,4>, LHS + 2691907922U, // <4,0,4,1>: Cost 3 vext3 <1,2,3,4>, <0,4,1,5> + 2691907932U, // <4,0,4,2>: Cost 3 vext3 <1,2,3,4>, <0,4,2,6> + 3626567830U, // <4,0,4,3>: Cost 4 vext1 <0,4,0,4>, <3,0,1,2> + 2552827190U, // <4,0,4,4>: Cost 3 vext1 <0,4,0,4>, RHS + 2631478582U, // <4,0,4,5>: Cost 3 vext2 <2,3,4,0>, RHS + 3626570017U, // <4,0,4,6>: Cost 4 vext1 <0,4,0,4>, <6,0,1,2> + 3668374839U, // <4,0,4,7>: Cost 4 vext1 <7,4,0,4>, <7,4,0,4> + 2552829742U, // <4,0,4,u>: Cost 3 vext1 <0,4,0,4>, LHS + 2558804070U, // <4,0,5,0>: Cost 3 vext1 <1,4,0,5>, LHS + 1839644774U, // <4,0,5,1>: Cost 2 vzipl RHS, LHS + 2913386660U, // <4,0,5,2>: Cost 3 vzipl RHS, <0,2,0,2> + 2570750420U, // <4,0,5,3>: Cost 3 vext1 <3,4,0,5>, <3,4,0,5> + 2558807350U, // <4,0,5,4>: Cost 3 vext1 <1,4,0,5>, RHS + 3987128750U, // <4,0,5,5>: Cost 4 vzipl RHS, <0,5,2,7> + 3987128822U, // <4,0,5,6>: Cost 4 vzipl RHS, <0,6,1,7> + 2594641208U, // <4,0,5,7>: Cost 3 vext1 <7,4,0,5>, <7,4,0,5> + 1839645341U, // <4,0,5,u>: Cost 2 vzipl RHS, LHS + 2552840294U, // <4,0,6,0>: Cost 3 vext1 <0,4,0,6>, LHS + 3047604234U, // <4,0,6,1>: Cost 3 vtrnl RHS, <0,0,1,1> + 1973862502U, // <4,0,6,2>: Cost 2 vtrnl RHS, LHS + 2570758613U, // <4,0,6,3>: Cost 3 vext1 <3,4,0,6>, <3,4,0,6> + 2552843574U, // <4,0,6,4>: Cost 3 vext1 <0,4,0,6>, RHS + 2217664887U, // <4,0,6,5>: Cost 3 vrev <0,4,5,6> + 3662418528U, // <4,0,6,6>: Cost 4 vext1 <6,4,0,6>, <6,4,0,6> + 2658022257U, // <4,0,6,7>: Cost 3 vext2 <6,7,4,0>, <6,7,4,0> + 1973862556U, // 
<4,0,6,u>: Cost 2 vtrnl RHS, LHS + 3731764218U, // <4,0,7,0>: Cost 4 vext2 <6,7,4,0>, <7,0,1,2> + 3988324454U, // <4,0,7,1>: Cost 4 vzipl <4,7,5,0>, LHS + 4122034278U, // <4,0,7,2>: Cost 4 vtrnl <4,6,7,1>, LHS + 3735082246U, // <4,0,7,3>: Cost 4 vext2 <7,3,4,0>, <7,3,4,0> + 3731764536U, // <4,0,7,4>: Cost 4 vext2 <6,7,4,0>, <7,4,0,5> + 3937145718U, // <4,0,7,5>: Cost 4 vuzpr <7,4,5,0>, <6,7,4,5> + 3737073145U, // <4,0,7,6>: Cost 4 vext2 <7,6,4,0>, <7,6,4,0> + 3731764844U, // <4,0,7,7>: Cost 4 vext2 <6,7,4,0>, <7,7,7,7> + 4122034332U, // <4,0,7,u>: Cost 4 vtrnl <4,6,7,1>, LHS + 2552856678U, // <4,0,u,0>: Cost 3 vext1 <0,4,0,u>, LHS + 1841635430U, // <4,0,u,1>: Cost 2 vzipl RHS, LHS + 1618166429U, // <4,0,u,2>: Cost 2 vext3 <1,2,3,4>, LHS + 2570774999U, // <4,0,u,3>: Cost 3 vext1 <3,4,0,u>, <3,4,0,u> + 2552859958U, // <4,0,u,4>: Cost 3 vext1 <0,4,0,u>, RHS + 2631481498U, // <4,0,u,5>: Cost 3 vext2 <2,3,4,0>, RHS + 2686157020U, // <4,0,u,6>: Cost 3 vext3 <0,2,6,4>, <0,2,6,4> + 2594665787U, // <4,0,u,7>: Cost 3 vext1 <7,4,0,u>, <7,4,0,u> + 1618166483U, // <4,0,u,u>: Cost 2 vext3 <1,2,3,4>, LHS + 2617548837U, // <4,1,0,0>: Cost 3 vext2 <0,0,4,1>, <0,0,4,1> + 2622857318U, // <4,1,0,1>: Cost 3 vext2 <0,u,4,1>, LHS + 3693281484U, // <4,1,0,2>: Cost 4 vext2 <0,3,4,1>, <0,2,4,6> + 2691908342U, // <4,1,0,3>: Cost 3 vext3 <1,2,3,4>, <1,0,3,2> + 2622857554U, // <4,1,0,4>: Cost 3 vext2 <0,u,4,1>, <0,4,1,5> + 3764470538U, // <4,1,0,5>: Cost 4 vext3 <1,0,5,4>, <1,0,5,4> + 3695272459U, // <4,1,0,6>: Cost 4 vext2 <0,6,4,1>, <0,6,4,1> + 3733094980U, // <4,1,0,7>: Cost 4 vext2 <7,0,4,1>, <0,7,1,4> + 2622857885U, // <4,1,0,u>: Cost 3 vext2 <0,u,4,1>, LHS + 3696599798U, // <4,1,1,0>: Cost 4 vext2 <0,u,4,1>, <1,0,3,2> + 2691097399U, // <4,1,1,1>: Cost 3 vext3 <1,1,1,4>, <1,1,1,4> + 2631484314U, // <4,1,1,2>: Cost 3 vext2 <2,3,4,1>, <1,2,3,4> + 2691908424U, // <4,1,1,3>: Cost 3 vext3 <1,2,3,4>, <1,1,3,3> + 3696600125U, // <4,1,1,4>: Cost 4 vext2 <0,u,4,1>, <1,4,3,5> + 3696600175U, // <4,1,1,5>: Cost 4 vext2 <0,u,4,1>, <1,5,0,1> + 3696600307U, // <4,1,1,6>: Cost 4 vext2 <0,u,4,1>, <1,6,5,7> + 3668423997U, // <4,1,1,7>: Cost 4 vext1 <7,4,1,1>, <7,4,1,1> + 2691908469U, // <4,1,1,u>: Cost 3 vext3 <1,2,3,4>, <1,1,u,3> + 2570797158U, // <4,1,2,0>: Cost 3 vext1 <3,4,1,2>, LHS + 2570797978U, // <4,1,2,1>: Cost 3 vext1 <3,4,1,2>, <1,2,3,4> + 3696600680U, // <4,1,2,2>: Cost 4 vext2 <0,u,4,1>, <2,2,2,2> + 1618166682U, // <4,1,2,3>: Cost 2 vext3 <1,2,3,4>, <1,2,3,4> + 2570800438U, // <4,1,2,4>: Cost 3 vext1 <3,4,1,2>, RHS + 3765650347U, // <4,1,2,5>: Cost 4 vext3 <1,2,3,4>, <1,2,5,3> + 3696601018U, // <4,1,2,6>: Cost 4 vext2 <0,u,4,1>, <2,6,3,7> + 3668432190U, // <4,1,2,7>: Cost 4 vext1 <7,4,1,2>, <7,4,1,2> + 1618535367U, // <4,1,2,u>: Cost 2 vext3 <1,2,u,4>, <1,2,u,4> + 2564833382U, // <4,1,3,0>: Cost 3 vext1 <2,4,1,3>, LHS + 2691908568U, // <4,1,3,1>: Cost 3 vext3 <1,2,3,4>, <1,3,1,3> + 2691908578U, // <4,1,3,2>: Cost 3 vext3 <1,2,3,4>, <1,3,2,4> + 2692572139U, // <4,1,3,3>: Cost 3 vext3 <1,3,3,4>, <1,3,3,4> + 2564836662U, // <4,1,3,4>: Cost 3 vext1 <2,4,1,3>, RHS + 2691908608U, // <4,1,3,5>: Cost 3 vext3 <1,2,3,4>, <1,3,5,7> + 2588725862U, // <4,1,3,6>: Cost 3 vext1 <6,4,1,3>, <6,4,1,3> + 3662468090U, // <4,1,3,7>: Cost 4 vext1 <6,4,1,3>, <7,0,1,2> + 2691908631U, // <4,1,3,u>: Cost 3 vext3 <1,2,3,4>, <1,3,u,3> + 3760194590U, // <4,1,4,0>: Cost 4 vext3 <0,3,1,4>, <1,4,0,1> + 3693947874U, // <4,1,4,1>: Cost 4 vext2 <0,4,4,1>, <4,1,5,0> + 3765650484U, // <4,1,4,2>: Cost 4 vext3 <1,2,3,4>, <1,4,2,5> + 3113877606U, // 
<4,1,4,3>: Cost 3 vtrnr <4,4,4,4>, LHS + 3760194630U, // <4,1,4,4>: Cost 4 vext3 <0,3,1,4>, <1,4,4,5> + 2622860598U, // <4,1,4,5>: Cost 3 vext2 <0,u,4,1>, RHS + 3297436759U, // <4,1,4,6>: Cost 4 vrev <1,4,6,4> + 3800007772U, // <4,1,4,7>: Cost 4 vext3 <7,0,1,4>, <1,4,7,0> + 2622860841U, // <4,1,4,u>: Cost 3 vext2 <0,u,4,1>, RHS + 1479164006U, // <4,1,5,0>: Cost 2 vext1 <0,4,1,5>, LHS + 2552906486U, // <4,1,5,1>: Cost 3 vext1 <0,4,1,5>, <1,0,3,2> + 2552907299U, // <4,1,5,2>: Cost 3 vext1 <0,4,1,5>, <2,1,3,5> + 2552907926U, // <4,1,5,3>: Cost 3 vext1 <0,4,1,5>, <3,0,1,2> + 1479167286U, // <4,1,5,4>: Cost 2 vext1 <0,4,1,5>, RHS + 2913387664U, // <4,1,5,5>: Cost 3 vzipl RHS, <1,5,3,7> + 2600686074U, // <4,1,5,6>: Cost 3 vext1 , <6,2,7,3> + 2600686586U, // <4,1,5,7>: Cost 3 vext1 , <7,0,1,2> + 1479169838U, // <4,1,5,u>: Cost 2 vext1 <0,4,1,5>, LHS + 2552914022U, // <4,1,6,0>: Cost 3 vext1 <0,4,1,6>, LHS + 2558886708U, // <4,1,6,1>: Cost 3 vext1 <1,4,1,6>, <1,1,1,1> + 4028205206U, // <4,1,6,2>: Cost 4 vzipr <0,2,4,6>, <3,0,1,2> + 3089858662U, // <4,1,6,3>: Cost 3 vtrnr <0,4,2,6>, LHS + 2552917302U, // <4,1,6,4>: Cost 3 vext1 <0,4,1,6>, RHS + 2223637584U, // <4,1,6,5>: Cost 3 vrev <1,4,5,6> + 4121347081U, // <4,1,6,6>: Cost 4 vtrnl RHS, <1,3,6,7> + 3721155406U, // <4,1,6,7>: Cost 4 vext2 <5,0,4,1>, <6,7,0,1> + 2552919854U, // <4,1,6,u>: Cost 3 vext1 <0,4,1,6>, LHS + 2659357716U, // <4,1,7,0>: Cost 3 vext2 <7,0,4,1>, <7,0,4,1> + 3733763173U, // <4,1,7,1>: Cost 4 vext2 <7,1,4,1>, <7,1,4,1> + 3734426806U, // <4,1,7,2>: Cost 4 vext2 <7,2,4,1>, <7,2,4,1> + 2695226671U, // <4,1,7,3>: Cost 3 vext3 <1,7,3,4>, <1,7,3,4> + 3721155942U, // <4,1,7,4>: Cost 4 vext2 <5,0,4,1>, <7,4,5,6> + 3721155976U, // <4,1,7,5>: Cost 4 vext2 <5,0,4,1>, <7,5,0,4> + 3662500458U, // <4,1,7,6>: Cost 4 vext1 <6,4,1,7>, <6,4,1,7> + 3721156204U, // <4,1,7,7>: Cost 4 vext2 <5,0,4,1>, <7,7,7,7> + 2659357716U, // <4,1,7,u>: Cost 3 vext2 <7,0,4,1>, <7,0,4,1> + 1479188582U, // <4,1,u,0>: Cost 2 vext1 <0,4,1,u>, LHS + 2552931062U, // <4,1,u,1>: Cost 3 vext1 <0,4,1,u>, <1,0,3,2> + 2552931944U, // <4,1,u,2>: Cost 3 vext1 <0,4,1,u>, <2,2,2,2> + 1622148480U, // <4,1,u,3>: Cost 2 vext3 <1,u,3,4>, <1,u,3,4> + 1479191862U, // <4,1,u,4>: Cost 2 vext1 <0,4,1,u>, RHS + 2622863514U, // <4,1,u,5>: Cost 3 vext2 <0,u,4,1>, RHS + 2588725862U, // <4,1,u,6>: Cost 3 vext1 <6,4,1,3>, <6,4,1,3> + 2600686586U, // <4,1,u,7>: Cost 3 vext1 , <7,0,1,2> + 1479194414U, // <4,1,u,u>: Cost 2 vext1 <0,4,1,u>, LHS + 2617557030U, // <4,2,0,0>: Cost 3 vext2 <0,0,4,2>, <0,0,4,2> + 2622865510U, // <4,2,0,1>: Cost 3 vext2 <0,u,4,2>, LHS + 2622865612U, // <4,2,0,2>: Cost 3 vext2 <0,u,4,2>, <0,2,4,6> + 3693289753U, // <4,2,0,3>: Cost 4 vext2 <0,3,4,2>, <0,3,4,2> + 2635473244U, // <4,2,0,4>: Cost 3 vext2 <3,0,4,2>, <0,4,2,6> + 3765650918U, // <4,2,0,5>: Cost 4 vext3 <1,2,3,4>, <2,0,5,7> + 2696775148U, // <4,2,0,6>: Cost 3 vext3 <2,0,6,4>, <2,0,6,4> + 3695944285U, // <4,2,0,7>: Cost 4 vext2 <0,7,4,2>, <0,7,4,2> + 2622866077U, // <4,2,0,u>: Cost 3 vext2 <0,u,4,2>, LHS + 3696607990U, // <4,2,1,0>: Cost 4 vext2 <0,u,4,2>, <1,0,3,2> + 3696608052U, // <4,2,1,1>: Cost 4 vext2 <0,u,4,2>, <1,1,1,1> + 3696608150U, // <4,2,1,2>: Cost 4 vext2 <0,u,4,2>, <1,2,3,0> + 3895574630U, // <4,2,1,3>: Cost 4 vuzpr <0,4,u,2>, LHS + 2691909162U, // <4,2,1,4>: Cost 3 vext3 <1,2,3,4>, <2,1,4,3> + 3696608400U, // <4,2,1,5>: Cost 4 vext2 <0,u,4,2>, <1,5,3,7> + 3760784956U, // <4,2,1,6>: Cost 4 vext3 <0,4,0,4>, <2,1,6,3> + 3773908549U, // <4,2,1,7>: Cost 5 vext3 <2,5,7,4>, <2,1,7,3> + 2691909162U, // 
<4,2,1,u>: Cost 3 vext3 <1,2,3,4>, <2,1,4,3> + 3696608748U, // <4,2,2,0>: Cost 4 vext2 <0,u,4,2>, <2,0,6,4> + 3696608828U, // <4,2,2,1>: Cost 4 vext2 <0,u,4,2>, <2,1,6,3> + 2691909224U, // <4,2,2,2>: Cost 3 vext3 <1,2,3,4>, <2,2,2,2> + 2691909234U, // <4,2,2,3>: Cost 3 vext3 <1,2,3,4>, <2,2,3,3> + 3759605368U, // <4,2,2,4>: Cost 4 vext3 <0,2,2,4>, <2,2,4,0> + 3696609156U, // <4,2,2,5>: Cost 4 vext2 <0,u,4,2>, <2,5,6,7> + 3760785040U, // <4,2,2,6>: Cost 4 vext3 <0,4,0,4>, <2,2,6,6> + 3668505927U, // <4,2,2,7>: Cost 4 vext1 <7,4,2,2>, <7,4,2,2> + 2691909279U, // <4,2,2,u>: Cost 3 vext3 <1,2,3,4>, <2,2,u,3> + 2691909286U, // <4,2,3,0>: Cost 3 vext3 <1,2,3,4>, <2,3,0,1> + 3764840111U, // <4,2,3,1>: Cost 4 vext3 <1,1,1,4>, <2,3,1,1> + 3765651129U, // <4,2,3,2>: Cost 4 vext3 <1,2,3,4>, <2,3,2,2> + 2698544836U, // <4,2,3,3>: Cost 3 vext3 <2,3,3,4>, <2,3,3,4> + 2685863630U, // <4,2,3,4>: Cost 3 vext3 <0,2,2,4>, <2,3,4,5> + 2698692310U, // <4,2,3,5>: Cost 3 vext3 <2,3,5,4>, <2,3,5,4> + 3772507871U, // <4,2,3,6>: Cost 4 vext3 <2,3,6,4>, <2,3,6,4> + 2698839784U, // <4,2,3,7>: Cost 3 vext3 <2,3,7,4>, <2,3,7,4> + 2691909358U, // <4,2,3,u>: Cost 3 vext3 <1,2,3,4>, <2,3,u,1> + 2564915302U, // <4,2,4,0>: Cost 3 vext1 <2,4,2,4>, LHS + 2564916122U, // <4,2,4,1>: Cost 3 vext1 <2,4,2,4>, <1,2,3,4> + 2564917004U, // <4,2,4,2>: Cost 3 vext1 <2,4,2,4>, <2,4,2,4> + 2699208469U, // <4,2,4,3>: Cost 3 vext3 <2,4,3,4>, <2,4,3,4> + 2564918582U, // <4,2,4,4>: Cost 3 vext1 <2,4,2,4>, RHS + 2622868790U, // <4,2,4,5>: Cost 3 vext2 <0,u,4,2>, RHS + 2229667632U, // <4,2,4,6>: Cost 3 vrev <2,4,6,4> + 3800082229U, // <4,2,4,7>: Cost 4 vext3 <7,0,2,4>, <2,4,7,0> + 2622869033U, // <4,2,4,u>: Cost 3 vext2 <0,u,4,2>, RHS + 2552979558U, // <4,2,5,0>: Cost 3 vext1 <0,4,2,5>, LHS + 2558952342U, // <4,2,5,1>: Cost 3 vext1 <1,4,2,5>, <1,2,3,0> + 2564925032U, // <4,2,5,2>: Cost 3 vext1 <2,4,2,5>, <2,2,2,2> + 2967060582U, // <4,2,5,3>: Cost 3 vzipr <2,3,4,5>, LHS + 2552982838U, // <4,2,5,4>: Cost 3 vext1 <0,4,2,5>, RHS + 3987130190U, // <4,2,5,5>: Cost 4 vzipl RHS, <2,5,0,7> + 2913388474U, // <4,2,5,6>: Cost 3 vzipl RHS, <2,6,3,7> + 3895577910U, // <4,2,5,7>: Cost 4 vuzpr <0,4,u,2>, RHS + 2552985390U, // <4,2,5,u>: Cost 3 vext1 <0,4,2,5>, LHS + 1479245926U, // <4,2,6,0>: Cost 2 vext1 <0,4,2,6>, LHS + 2552988406U, // <4,2,6,1>: Cost 3 vext1 <0,4,2,6>, <1,0,3,2> + 2552989288U, // <4,2,6,2>: Cost 3 vext1 <0,4,2,6>, <2,2,2,2> + 2954461286U, // <4,2,6,3>: Cost 3 vzipr <0,2,4,6>, LHS + 1479249206U, // <4,2,6,4>: Cost 2 vext1 <0,4,2,6>, RHS + 2229610281U, // <4,2,6,5>: Cost 3 vrev <2,4,5,6> + 2600767994U, // <4,2,6,6>: Cost 3 vext1 , <6,2,7,3> + 2600768506U, // <4,2,6,7>: Cost 3 vext1 , <7,0,1,2> + 1479251758U, // <4,2,6,u>: Cost 2 vext1 <0,4,2,6>, LHS + 2659365909U, // <4,2,7,0>: Cost 3 vext2 <7,0,4,2>, <7,0,4,2> + 3733771366U, // <4,2,7,1>: Cost 4 vext2 <7,1,4,2>, <7,1,4,2> + 3734434999U, // <4,2,7,2>: Cost 4 vext2 <7,2,4,2>, <7,2,4,2> + 2701199368U, // <4,2,7,3>: Cost 3 vext3 <2,7,3,4>, <2,7,3,4> + 4175774618U, // <4,2,7,4>: Cost 4 vtrnr <2,4,5,7>, <1,2,3,4> + 3303360298U, // <4,2,7,5>: Cost 4 vrev <2,4,5,7> + 3727136217U, // <4,2,7,6>: Cost 4 vext2 <6,0,4,2>, <7,6,0,4> + 3727136364U, // <4,2,7,7>: Cost 4 vext2 <6,0,4,2>, <7,7,7,7> + 2659365909U, // <4,2,7,u>: Cost 3 vext2 <7,0,4,2>, <7,0,4,2> + 1479262310U, // <4,2,u,0>: Cost 2 vext1 <0,4,2,u>, LHS + 2553004790U, // <4,2,u,1>: Cost 3 vext1 <0,4,2,u>, <1,0,3,2> + 2553005672U, // <4,2,u,2>: Cost 3 vext1 <0,4,2,u>, <2,2,2,2> + 2954477670U, // <4,2,u,3>: Cost 3 vzipr <0,2,4,u>, LHS + 
1479265590U, // <4,2,u,4>: Cost 2 vext1 <0,4,2,u>, RHS + 2622871706U, // <4,2,u,5>: Cost 3 vext2 <0,u,4,2>, RHS + 2229700404U, // <4,2,u,6>: Cost 3 vrev <2,4,6,u> + 2600784890U, // <4,2,u,7>: Cost 3 vext1 , <7,0,1,2> + 1479268142U, // <4,2,u,u>: Cost 2 vext1 <0,4,2,u>, LHS + 3765651595U, // <4,3,0,0>: Cost 4 vext3 <1,2,3,4>, <3,0,0,0> + 2691909782U, // <4,3,0,1>: Cost 3 vext3 <1,2,3,4>, <3,0,1,2> + 2702452897U, // <4,3,0,2>: Cost 3 vext3 <3,0,2,4>, <3,0,2,4> + 3693297946U, // <4,3,0,3>: Cost 4 vext2 <0,3,4,3>, <0,3,4,3> + 3760711856U, // <4,3,0,4>: Cost 4 vext3 <0,3,u,4>, <3,0,4,1> + 2235533820U, // <4,3,0,5>: Cost 3 vrev <3,4,5,0> + 3309349381U, // <4,3,0,6>: Cost 4 vrev <3,4,6,0> + 3668563278U, // <4,3,0,7>: Cost 4 vext1 <7,4,3,0>, <7,4,3,0> + 2691909845U, // <4,3,0,u>: Cost 3 vext3 <1,2,3,4>, <3,0,u,2> + 2235173328U, // <4,3,1,0>: Cost 3 vrev <3,4,0,1> + 3764840678U, // <4,3,1,1>: Cost 4 vext3 <1,1,1,4>, <3,1,1,1> + 2630173594U, // <4,3,1,2>: Cost 3 vext2 <2,1,4,3>, <1,2,3,4> + 2703190267U, // <4,3,1,3>: Cost 3 vext3 <3,1,3,4>, <3,1,3,4> + 3760195840U, // <4,3,1,4>: Cost 4 vext3 <0,3,1,4>, <3,1,4,0> + 3765651724U, // <4,3,1,5>: Cost 4 vext3 <1,2,3,4>, <3,1,5,3> + 3309357574U, // <4,3,1,6>: Cost 4 vrev <3,4,6,1> + 3769633054U, // <4,3,1,7>: Cost 4 vext3 <1,u,3,4>, <3,1,7,3> + 2703558952U, // <4,3,1,u>: Cost 3 vext3 <3,1,u,4>, <3,1,u,4> + 3626770534U, // <4,3,2,0>: Cost 4 vext1 <0,4,3,2>, LHS + 2630174250U, // <4,3,2,1>: Cost 3 vext2 <2,1,4,3>, <2,1,4,3> + 3765651777U, // <4,3,2,2>: Cost 4 vext3 <1,2,3,4>, <3,2,2,2> + 2703853900U, // <4,3,2,3>: Cost 3 vext3 <3,2,3,4>, <3,2,3,4> + 3626773814U, // <4,3,2,4>: Cost 4 vext1 <0,4,3,2>, RHS + 2704001374U, // <4,3,2,5>: Cost 3 vext3 <3,2,5,4>, <3,2,5,4> + 3765651814U, // <4,3,2,6>: Cost 4 vext3 <1,2,3,4>, <3,2,6,3> + 3769633135U, // <4,3,2,7>: Cost 4 vext3 <1,u,3,4>, <3,2,7,3> + 2634819681U, // <4,3,2,u>: Cost 3 vext2 <2,u,4,3>, <2,u,4,3> + 3765651839U, // <4,3,3,0>: Cost 4 vext3 <1,2,3,4>, <3,3,0,1> + 3765651848U, // <4,3,3,1>: Cost 4 vext3 <1,2,3,4>, <3,3,1,1> + 3710552404U, // <4,3,3,2>: Cost 4 vext2 <3,2,4,3>, <3,2,4,3> + 2691910044U, // <4,3,3,3>: Cost 3 vext3 <1,2,3,4>, <3,3,3,3> + 2704591270U, // <4,3,3,4>: Cost 3 vext3 <3,3,4,4>, <3,3,4,4> + 3769633202U, // <4,3,3,5>: Cost 4 vext3 <1,u,3,4>, <3,3,5,7> + 3703917212U, // <4,3,3,6>: Cost 4 vext2 <2,1,4,3>, <3,6,4,7> + 3769633220U, // <4,3,3,7>: Cost 4 vext3 <1,u,3,4>, <3,3,7,7> + 2691910044U, // <4,3,3,u>: Cost 3 vext3 <1,2,3,4>, <3,3,3,3> + 2691910096U, // <4,3,4,0>: Cost 3 vext3 <1,2,3,4>, <3,4,0,1> + 2691910106U, // <4,3,4,1>: Cost 3 vext3 <1,2,3,4>, <3,4,1,2> + 2564990741U, // <4,3,4,2>: Cost 3 vext1 <2,4,3,4>, <2,4,3,4> + 3765651946U, // <4,3,4,3>: Cost 4 vext3 <1,2,3,4>, <3,4,3,0> + 2691910136U, // <4,3,4,4>: Cost 3 vext3 <1,2,3,4>, <3,4,4,5> + 2686454274U, // <4,3,4,5>: Cost 3 vext3 <0,3,1,4>, <3,4,5,6> + 2235640329U, // <4,3,4,6>: Cost 3 vrev <3,4,6,4> + 3801483792U, // <4,3,4,7>: Cost 4 vext3 <7,2,3,4>, <3,4,7,2> + 2691910168U, // <4,3,4,u>: Cost 3 vext3 <1,2,3,4>, <3,4,u,1> + 2559025254U, // <4,3,5,0>: Cost 3 vext1 <1,4,3,5>, LHS + 2559026237U, // <4,3,5,1>: Cost 3 vext1 <1,4,3,5>, <1,4,3,5> + 2564998862U, // <4,3,5,2>: Cost 3 vext1 <2,4,3,5>, <2,3,4,5> + 2570971548U, // <4,3,5,3>: Cost 3 vext1 <3,4,3,5>, <3,3,3,3> + 2559028534U, // <4,3,5,4>: Cost 3 vext1 <1,4,3,5>, RHS + 4163519477U, // <4,3,5,5>: Cost 4 vtrnr <0,4,1,5>, <1,3,4,5> + 3309390346U, // <4,3,5,6>: Cost 4 vrev <3,4,6,5> + 2706139747U, // <4,3,5,7>: Cost 3 vext3 <3,5,7,4>, <3,5,7,4> + 2559031086U, // <4,3,5,u>: Cost 3 
vext1 <1,4,3,5>, LHS + 2559033446U, // <4,3,6,0>: Cost 3 vext1 <1,4,3,6>, LHS + 2559034430U, // <4,3,6,1>: Cost 3 vext1 <1,4,3,6>, <1,4,3,6> + 2565007127U, // <4,3,6,2>: Cost 3 vext1 <2,4,3,6>, <2,4,3,6> + 2570979740U, // <4,3,6,3>: Cost 3 vext1 <3,4,3,6>, <3,3,3,3> + 2559036726U, // <4,3,6,4>: Cost 3 vext1 <1,4,3,6>, RHS + 1161841154U, // <4,3,6,5>: Cost 2 vrev <3,4,5,6> + 4028203932U, // <4,3,6,6>: Cost 4 vzipr <0,2,4,6>, <1,2,3,6> + 2706803380U, // <4,3,6,7>: Cost 3 vext3 <3,6,7,4>, <3,6,7,4> + 1162062365U, // <4,3,6,u>: Cost 2 vrev <3,4,u,6> + 3769633475U, // <4,3,7,0>: Cost 4 vext3 <1,u,3,4>, <3,7,0,1> + 3769633488U, // <4,3,7,1>: Cost 4 vext3 <1,u,3,4>, <3,7,1,5> + 3638757144U, // <4,3,7,2>: Cost 4 vext1 <2,4,3,7>, <2,4,3,7> + 3769633508U, // <4,3,7,3>: Cost 4 vext3 <1,u,3,4>, <3,7,3,7> + 3769633515U, // <4,3,7,4>: Cost 4 vext3 <1,u,3,4>, <3,7,4,5> + 3769633526U, // <4,3,7,5>: Cost 4 vext3 <1,u,3,4>, <3,7,5,7> + 3662647932U, // <4,3,7,6>: Cost 4 vext1 <6,4,3,7>, <6,4,3,7> + 3781208837U, // <4,3,7,7>: Cost 4 vext3 <3,7,7,4>, <3,7,7,4> + 3769633547U, // <4,3,7,u>: Cost 4 vext3 <1,u,3,4>, <3,7,u,1> + 2559049830U, // <4,3,u,0>: Cost 3 vext1 <1,4,3,u>, LHS + 2691910430U, // <4,3,u,1>: Cost 3 vext3 <1,2,3,4>, <3,u,1,2> + 2565023513U, // <4,3,u,2>: Cost 3 vext1 <2,4,3,u>, <2,4,3,u> + 2707835698U, // <4,3,u,3>: Cost 3 vext3 <3,u,3,4>, <3,u,3,4> + 2559053110U, // <4,3,u,4>: Cost 3 vext1 <1,4,3,u>, RHS + 1161857540U, // <4,3,u,5>: Cost 2 vrev <3,4,5,u> + 2235673101U, // <4,3,u,6>: Cost 3 vrev <3,4,6,u> + 2708130646U, // <4,3,u,7>: Cost 3 vext3 <3,u,7,4>, <3,u,7,4> + 1162078751U, // <4,3,u,u>: Cost 2 vrev <3,4,u,u> + 2617573416U, // <4,4,0,0>: Cost 3 vext2 <0,0,4,4>, <0,0,4,4> + 1570373734U, // <4,4,0,1>: Cost 2 vext2 <4,4,4,4>, LHS + 2779676774U, // <4,4,0,2>: Cost 3 vuzpl <4,6,4,6>, LHS + 3760196480U, // <4,4,0,3>: Cost 4 vext3 <0,3,1,4>, <4,0,3,1> + 2576977100U, // <4,4,0,4>: Cost 3 vext1 <4,4,4,0>, <4,4,4,0> + 2718747538U, // <4,4,0,5>: Cost 3 vext3 <5,6,7,4>, <4,0,5,1> + 2718747548U, // <4,4,0,6>: Cost 3 vext3 <5,6,7,4>, <4,0,6,2> + 3668637015U, // <4,4,0,7>: Cost 4 vext1 <7,4,4,0>, <7,4,4,0> + 1570374301U, // <4,4,0,u>: Cost 2 vext2 <4,4,4,4>, LHS + 2644116214U, // <4,4,1,0>: Cost 3 vext2 <4,4,4,4>, <1,0,3,2> + 2644116276U, // <4,4,1,1>: Cost 3 vext2 <4,4,4,4>, <1,1,1,1> + 2691910602U, // <4,4,1,2>: Cost 3 vext3 <1,2,3,4>, <4,1,2,3> + 2644116440U, // <4,4,1,3>: Cost 3 vext2 <4,4,4,4>, <1,3,1,3> + 2711227356U, // <4,4,1,4>: Cost 3 vext3 <4,4,4,4>, <4,1,4,3> + 2709310438U, // <4,4,1,5>: Cost 3 vext3 <4,1,5,4>, <4,1,5,4> + 3765652462U, // <4,4,1,6>: Cost 4 vext3 <1,2,3,4>, <4,1,6,3> + 3768970231U, // <4,4,1,7>: Cost 4 vext3 <1,7,3,4>, <4,1,7,3> + 2695891968U, // <4,4,1,u>: Cost 3 vext3 <1,u,3,4>, <4,1,u,3> + 3703260634U, // <4,4,2,0>: Cost 4 vext2 <2,0,4,4>, <2,0,4,4> + 3765652499U, // <4,4,2,1>: Cost 4 vext3 <1,2,3,4>, <4,2,1,4> + 2644117096U, // <4,4,2,2>: Cost 3 vext2 <4,4,4,4>, <2,2,2,2> + 2631509709U, // <4,4,2,3>: Cost 3 vext2 <2,3,4,4>, <2,3,4,4> + 2644117269U, // <4,4,2,4>: Cost 3 vext2 <4,4,4,4>, <2,4,3,4> + 3705251698U, // <4,4,2,5>: Cost 4 vext2 <2,3,4,4>, <2,5,4,7> + 2710047808U, // <4,4,2,6>: Cost 3 vext3 <4,2,6,4>, <4,2,6,4> + 3783863369U, // <4,4,2,7>: Cost 4 vext3 <4,2,7,4>, <4,2,7,4> + 2634827874U, // <4,4,2,u>: Cost 3 vext2 <2,u,4,4>, <2,u,4,4> + 2644117654U, // <4,4,3,0>: Cost 3 vext2 <4,4,4,4>, <3,0,1,2> + 3638797210U, // <4,4,3,1>: Cost 4 vext1 <2,4,4,3>, <1,2,3,4> + 3638798082U, // <4,4,3,2>: Cost 4 vext1 <2,4,4,3>, <2,4,1,3> + 2637482406U, // <4,4,3,3>: Cost 3 vext2 
<3,3,4,4>, <3,3,4,4> + 2638146039U, // <4,4,3,4>: Cost 3 vext2 <3,4,4,4>, <3,4,4,4> + 3913287374U, // <4,4,3,5>: Cost 4 vuzpr <3,4,5,4>, <2,3,4,5> + 3765652625U, // <4,4,3,6>: Cost 4 vext3 <1,2,3,4>, <4,3,6,4> + 3713878762U, // <4,4,3,7>: Cost 4 vext2 <3,7,4,4>, <3,7,4,4> + 2637482406U, // <4,4,3,u>: Cost 3 vext2 <3,3,4,4>, <3,3,4,4> + 1503264870U, // <4,4,4,0>: Cost 2 vext1 <4,4,4,4>, LHS + 2577007514U, // <4,4,4,1>: Cost 3 vext1 <4,4,4,4>, <1,2,3,4> + 2577008232U, // <4,4,4,2>: Cost 3 vext1 <4,4,4,4>, <2,2,2,2> + 2571037175U, // <4,4,4,3>: Cost 3 vext1 <3,4,4,4>, <3,4,4,4> + 161926454U, // <4,4,4,4>: Cost 1 vdup0 RHS + 1570377014U, // <4,4,4,5>: Cost 2 vext2 <4,4,4,4>, RHS + 2779680054U, // <4,4,4,6>: Cost 3 vuzpl <4,6,4,6>, RHS + 2594927963U, // <4,4,4,7>: Cost 3 vext1 <7,4,4,4>, <7,4,4,4> + 161926454U, // <4,4,4,u>: Cost 1 vdup0 RHS + 2571042918U, // <4,4,5,0>: Cost 3 vext1 <3,4,4,5>, LHS + 2571043738U, // <4,4,5,1>: Cost 3 vext1 <3,4,4,5>, <1,2,3,4> + 3638814495U, // <4,4,5,2>: Cost 4 vext1 <2,4,4,5>, <2,4,4,5> + 2571045368U, // <4,4,5,3>: Cost 3 vext1 <3,4,4,5>, <3,4,4,5> + 2571046198U, // <4,4,5,4>: Cost 3 vext1 <3,4,4,5>, RHS + 1839648054U, // <4,4,5,5>: Cost 2 vzipl RHS, RHS + 1618169142U, // <4,4,5,6>: Cost 2 vext3 <1,2,3,4>, RHS + 2594936156U, // <4,4,5,7>: Cost 3 vext1 <7,4,4,5>, <7,4,4,5> + 1618169160U, // <4,4,5,u>: Cost 2 vext3 <1,2,3,4>, RHS + 2553135206U, // <4,4,6,0>: Cost 3 vext1 <0,4,4,6>, LHS + 3626877686U, // <4,4,6,1>: Cost 4 vext1 <0,4,4,6>, <1,0,3,2> + 2565080782U, // <4,4,6,2>: Cost 3 vext1 <2,4,4,6>, <2,3,4,5> + 2571053561U, // <4,4,6,3>: Cost 3 vext1 <3,4,4,6>, <3,4,4,6> + 2553138486U, // <4,4,6,4>: Cost 3 vext1 <0,4,4,6>, RHS + 2241555675U, // <4,4,6,5>: Cost 3 vrev <4,4,5,6> + 1973865782U, // <4,4,6,6>: Cost 2 vtrnl RHS, RHS + 2658055029U, // <4,4,6,7>: Cost 3 vext2 <6,7,4,4>, <6,7,4,4> + 1973865800U, // <4,4,6,u>: Cost 2 vtrnl RHS, RHS + 2644120570U, // <4,4,7,0>: Cost 3 vext2 <4,4,4,4>, <7,0,1,2> + 3638829978U, // <4,4,7,1>: Cost 4 vext1 <2,4,4,7>, <1,2,3,4> + 3638830881U, // <4,4,7,2>: Cost 4 vext1 <2,4,4,7>, <2,4,4,7> + 3735115018U, // <4,4,7,3>: Cost 4 vext2 <7,3,4,4>, <7,3,4,4> + 2662036827U, // <4,4,7,4>: Cost 3 vext2 <7,4,4,4>, <7,4,4,4> + 2713292236U, // <4,4,7,5>: Cost 3 vext3 <4,7,5,4>, <4,7,5,4> + 2713365973U, // <4,4,7,6>: Cost 3 vext3 <4,7,6,4>, <4,7,6,4> + 2644121196U, // <4,4,7,7>: Cost 3 vext2 <4,4,4,4>, <7,7,7,7> + 2662036827U, // <4,4,7,u>: Cost 3 vext2 <7,4,4,4>, <7,4,4,4> + 1503297638U, // <4,4,u,0>: Cost 2 vext1 <4,4,4,u>, LHS + 1570379566U, // <4,4,u,1>: Cost 2 vext2 <4,4,4,4>, LHS + 2779682606U, // <4,4,u,2>: Cost 3 vuzpl <4,6,4,6>, LHS + 2571069947U, // <4,4,u,3>: Cost 3 vext1 <3,4,4,u>, <3,4,4,u> + 161926454U, // <4,4,u,4>: Cost 1 vdup0 RHS + 1841638710U, // <4,4,u,5>: Cost 2 vzipl RHS, RHS + 1618169385U, // <4,4,u,6>: Cost 2 vext3 <1,2,3,4>, RHS + 2594960735U, // <4,4,u,7>: Cost 3 vext1 <7,4,4,u>, <7,4,4,u> + 161926454U, // <4,4,u,u>: Cost 1 vdup0 RHS + 2631516160U, // <4,5,0,0>: Cost 3 vext2 <2,3,4,5>, <0,0,0,0> + 1557774438U, // <4,5,0,1>: Cost 2 vext2 <2,3,4,5>, LHS + 2618908875U, // <4,5,0,2>: Cost 3 vext2 <0,2,4,5>, <0,2,4,5> + 2571078140U, // <4,5,0,3>: Cost 3 vext1 <3,4,5,0>, <3,4,5,0> + 2626871634U, // <4,5,0,4>: Cost 3 vext2 <1,5,4,5>, <0,4,1,5> + 3705258414U, // <4,5,0,5>: Cost 4 vext2 <2,3,4,5>, <0,5,2,7> + 2594968438U, // <4,5,0,6>: Cost 3 vext1 <7,4,5,0>, <6,7,4,5> + 2594968928U, // <4,5,0,7>: Cost 3 vext1 <7,4,5,0>, <7,4,5,0> + 1557775005U, // <4,5,0,u>: Cost 2 vext2 <2,3,4,5>, LHS + 2631516918U, // <4,5,1,0>: Cost 3 
vext2 <2,3,4,5>, <1,0,3,2> + 2624217939U, // <4,5,1,1>: Cost 3 vext2 <1,1,4,5>, <1,1,4,5> + 2631517078U, // <4,5,1,2>: Cost 3 vext2 <2,3,4,5>, <1,2,3,0> + 2821341286U, // <4,5,1,3>: Cost 3 vuzpr <0,4,1,5>, LHS + 3895086054U, // <4,5,1,4>: Cost 4 vuzpr <0,4,1,5>, <4,1,5,4> + 2626872471U, // <4,5,1,5>: Cost 3 vext2 <1,5,4,5>, <1,5,4,5> + 3895083131U, // <4,5,1,6>: Cost 4 vuzpr <0,4,1,5>, <0,1,4,6> + 2718748368U, // <4,5,1,7>: Cost 3 vext3 <5,6,7,4>, <5,1,7,3> + 2821341291U, // <4,5,1,u>: Cost 3 vuzpr <0,4,1,5>, LHS + 2571092070U, // <4,5,2,0>: Cost 3 vext1 <3,4,5,2>, LHS + 3699287585U, // <4,5,2,1>: Cost 4 vext2 <1,3,4,5>, <2,1,3,3> + 2630854269U, // <4,5,2,2>: Cost 3 vext2 <2,2,4,5>, <2,2,4,5> + 1557776078U, // <4,5,2,3>: Cost 2 vext2 <2,3,4,5>, <2,3,4,5> + 2631517974U, // <4,5,2,4>: Cost 3 vext2 <2,3,4,5>, <2,4,3,5> + 3692652384U, // <4,5,2,5>: Cost 4 vext2 <0,2,4,5>, <2,5,2,7> + 2631518138U, // <4,5,2,6>: Cost 3 vext2 <2,3,4,5>, <2,6,3,7> + 4164013366U, // <4,5,2,7>: Cost 4 vtrnr <0,4,u,2>, RHS + 1561094243U, // <4,5,2,u>: Cost 2 vext2 <2,u,4,5>, <2,u,4,5> + 2631518358U, // <4,5,3,0>: Cost 3 vext2 <2,3,4,5>, <3,0,1,2> + 3895084710U, // <4,5,3,1>: Cost 4 vuzpr <0,4,1,5>, <2,3,0,1> + 2631518540U, // <4,5,3,2>: Cost 3 vext2 <2,3,4,5>, <3,2,3,4> + 2631518620U, // <4,5,3,3>: Cost 3 vext2 <2,3,4,5>, <3,3,3,3> + 2631518716U, // <4,5,3,4>: Cost 3 vext2 <2,3,4,5>, <3,4,5,0> + 2631518784U, // <4,5,3,5>: Cost 3 vext2 <2,3,4,5>, <3,5,3,5> + 2658060980U, // <4,5,3,6>: Cost 3 vext2 <6,7,4,5>, <3,6,7,4> + 2640145131U, // <4,5,3,7>: Cost 3 vext2 <3,7,4,5>, <3,7,4,5> + 2631519006U, // <4,5,3,u>: Cost 3 vext2 <2,3,4,5>, <3,u,1,2> + 2571108454U, // <4,5,4,0>: Cost 3 vext1 <3,4,5,4>, LHS + 3632907342U, // <4,5,4,1>: Cost 4 vext1 <1,4,5,4>, <1,4,5,4> + 2571110094U, // <4,5,4,2>: Cost 3 vext1 <3,4,5,4>, <2,3,4,5> + 2571110912U, // <4,5,4,3>: Cost 3 vext1 <3,4,5,4>, <3,4,5,4> + 2571111734U, // <4,5,4,4>: Cost 3 vext1 <3,4,5,4>, RHS + 1557777718U, // <4,5,4,5>: Cost 2 vext2 <2,3,4,5>, RHS + 2645454195U, // <4,5,4,6>: Cost 3 vext2 <4,6,4,5>, <4,6,4,5> + 2718748614U, // <4,5,4,7>: Cost 3 vext3 <5,6,7,4>, <5,4,7,6> + 1557777961U, // <4,5,4,u>: Cost 2 vext2 <2,3,4,5>, RHS + 1503346790U, // <4,5,5,0>: Cost 2 vext1 <4,4,5,5>, LHS + 2913398480U, // <4,5,5,1>: Cost 3 vzipl RHS, <5,1,7,3> + 2631519998U, // <4,5,5,2>: Cost 3 vext2 <2,3,4,5>, <5,2,3,4> + 2577090710U, // <4,5,5,3>: Cost 3 vext1 <4,4,5,5>, <3,0,1,2> + 1503349978U, // <4,5,5,4>: Cost 2 vext1 <4,4,5,5>, <4,4,5,5> + 2631520260U, // <4,5,5,5>: Cost 3 vext2 <2,3,4,5>, <5,5,5,5> + 2913390690U, // <4,5,5,6>: Cost 3 vzipl RHS, <5,6,7,0> + 2821344566U, // <4,5,5,7>: Cost 3 vuzpr <0,4,1,5>, RHS + 1503352622U, // <4,5,5,u>: Cost 2 vext1 <4,4,5,5>, LHS + 1497383014U, // <4,5,6,0>: Cost 2 vext1 <3,4,5,6>, LHS + 2559181904U, // <4,5,6,1>: Cost 3 vext1 <1,4,5,6>, <1,4,5,6> + 2565154601U, // <4,5,6,2>: Cost 3 vext1 <2,4,5,6>, <2,4,5,6> + 1497385474U, // <4,5,6,3>: Cost 2 vext1 <3,4,5,6>, <3,4,5,6> + 1497386294U, // <4,5,6,4>: Cost 2 vext1 <3,4,5,6>, RHS + 3047608324U, // <4,5,6,5>: Cost 3 vtrnl RHS, <5,5,5,5> + 2571129656U, // <4,5,6,6>: Cost 3 vext1 <3,4,5,6>, <6,6,6,6> + 27705344U, // <4,5,6,7>: Cost 0 copy RHS + 27705344U, // <4,5,6,u>: Cost 0 copy RHS + 2565161062U, // <4,5,7,0>: Cost 3 vext1 <2,4,5,7>, LHS + 2565161882U, // <4,5,7,1>: Cost 3 vext1 <2,4,5,7>, <1,2,3,4> + 2565162794U, // <4,5,7,2>: Cost 3 vext1 <2,4,5,7>, <2,4,5,7> + 2661381387U, // <4,5,7,3>: Cost 3 vext2 <7,3,4,5>, <7,3,4,5> + 2565164342U, // <4,5,7,4>: Cost 3 vext1 <2,4,5,7>, RHS + 2718748840U, // 
<4,5,7,5>: Cost 3 vext3 <5,6,7,4>, <5,7,5,7> + 2718748846U, // <4,5,7,6>: Cost 3 vext3 <5,6,7,4>, <5,7,6,4> + 2719412407U, // <4,5,7,7>: Cost 3 vext3 <5,7,7,4>, <5,7,7,4> + 2565166894U, // <4,5,7,u>: Cost 3 vext1 <2,4,5,7>, LHS + 1497399398U, // <4,5,u,0>: Cost 2 vext1 <3,4,5,u>, LHS + 1557780270U, // <4,5,u,1>: Cost 2 vext2 <2,3,4,5>, LHS + 2631522181U, // <4,5,u,2>: Cost 3 vext2 <2,3,4,5>, + 1497401860U, // <4,5,u,3>: Cost 2 vext1 <3,4,5,u>, <3,4,5,u> + 1497402678U, // <4,5,u,4>: Cost 2 vext1 <3,4,5,u>, RHS + 1557780634U, // <4,5,u,5>: Cost 2 vext2 <2,3,4,5>, RHS + 2631522512U, // <4,5,u,6>: Cost 3 vext2 <2,3,4,5>, + 27705344U, // <4,5,u,7>: Cost 0 copy RHS + 27705344U, // <4,5,u,u>: Cost 0 copy RHS + 2618916864U, // <4,6,0,0>: Cost 3 vext2 <0,2,4,6>, <0,0,0,0> + 1545175142U, // <4,6,0,1>: Cost 2 vext2 <0,2,4,6>, LHS + 1545175244U, // <4,6,0,2>: Cost 2 vext2 <0,2,4,6>, <0,2,4,6> + 3692658940U, // <4,6,0,3>: Cost 4 vext2 <0,2,4,6>, <0,3,1,0> + 2618917202U, // <4,6,0,4>: Cost 3 vext2 <0,2,4,6>, <0,4,1,5> + 3852910806U, // <4,6,0,5>: Cost 4 vuzpl RHS, <0,2,5,7> + 2253525648U, // <4,6,0,6>: Cost 3 vrev <6,4,6,0> + 4040764726U, // <4,6,0,7>: Cost 4 vzipr <2,3,4,0>, RHS + 1545175709U, // <4,6,0,u>: Cost 2 vext2 <0,2,4,6>, LHS + 2618917622U, // <4,6,1,0>: Cost 3 vext2 <0,2,4,6>, <1,0,3,2> + 2618917684U, // <4,6,1,1>: Cost 3 vext2 <0,2,4,6>, <1,1,1,1> + 2618917782U, // <4,6,1,2>: Cost 3 vext2 <0,2,4,6>, <1,2,3,0> + 2618917848U, // <4,6,1,3>: Cost 3 vext2 <0,2,4,6>, <1,3,1,3> + 3692659773U, // <4,6,1,4>: Cost 4 vext2 <0,2,4,6>, <1,4,3,5> + 2618918032U, // <4,6,1,5>: Cost 3 vext2 <0,2,4,6>, <1,5,3,7> + 3692659937U, // <4,6,1,6>: Cost 4 vext2 <0,2,4,6>, <1,6,3,7> + 4032146742U, // <4,6,1,7>: Cost 4 vzipr <0,u,4,1>, RHS + 2618918253U, // <4,6,1,u>: Cost 3 vext2 <0,2,4,6>, <1,u,1,3> + 2618918380U, // <4,6,2,0>: Cost 3 vext2 <0,2,4,6>, <2,0,6,4> + 2618918460U, // <4,6,2,1>: Cost 3 vext2 <0,2,4,6>, <2,1,6,3> + 2618918504U, // <4,6,2,2>: Cost 3 vext2 <0,2,4,6>, <2,2,2,2> + 2618918566U, // <4,6,2,3>: Cost 3 vext2 <0,2,4,6>, <2,3,0,1> + 2618918679U, // <4,6,2,4>: Cost 3 vext2 <0,2,4,6>, <2,4,3,6> + 2618918788U, // <4,6,2,5>: Cost 3 vext2 <0,2,4,6>, <2,5,6,7> + 2618918842U, // <4,6,2,6>: Cost 3 vext2 <0,2,4,6>, <2,6,3,7> + 2718749178U, // <4,6,2,7>: Cost 3 vext3 <5,6,7,4>, <6,2,7,3> + 2618918971U, // <4,6,2,u>: Cost 3 vext2 <0,2,4,6>, <2,u,0,1> + 2618919062U, // <4,6,3,0>: Cost 3 vext2 <0,2,4,6>, <3,0,1,2> + 2636171526U, // <4,6,3,1>: Cost 3 vext2 <3,1,4,6>, <3,1,4,6> + 3692661057U, // <4,6,3,2>: Cost 4 vext2 <0,2,4,6>, <3,2,2,2> + 2618919324U, // <4,6,3,3>: Cost 3 vext2 <0,2,4,6>, <3,3,3,3> + 2618919426U, // <4,6,3,4>: Cost 3 vext2 <0,2,4,6>, <3,4,5,6> + 2638826058U, // <4,6,3,5>: Cost 3 vext2 <3,5,4,6>, <3,5,4,6> + 3913303030U, // <4,6,3,6>: Cost 4 vuzpr <3,4,5,6>, <1,3,4,6> + 2722730572U, // <4,6,3,7>: Cost 3 vext3 <6,3,7,4>, <6,3,7,4> + 2618919710U, // <4,6,3,u>: Cost 3 vext2 <0,2,4,6>, <3,u,1,2> + 2565210214U, // <4,6,4,0>: Cost 3 vext1 <2,4,6,4>, LHS + 2718749286U, // <4,6,4,1>: Cost 3 vext3 <5,6,7,4>, <6,4,1,3> + 2565211952U, // <4,6,4,2>: Cost 3 vext1 <2,4,6,4>, <2,4,6,4> + 2571184649U, // <4,6,4,3>: Cost 3 vext1 <3,4,6,4>, <3,4,6,4> + 2565213494U, // <4,6,4,4>: Cost 3 vext1 <2,4,6,4>, RHS + 1545178422U, // <4,6,4,5>: Cost 2 vext2 <0,2,4,6>, RHS + 1705430326U, // <4,6,4,6>: Cost 2 vuzpl RHS, RHS + 2595075437U, // <4,6,4,7>: Cost 3 vext1 <7,4,6,4>, <7,4,6,4> + 1545178665U, // <4,6,4,u>: Cost 2 vext2 <0,2,4,6>, RHS + 2565218406U, // <4,6,5,0>: Cost 3 vext1 <2,4,6,5>, LHS + 2645462736U, // <4,6,5,1>: 
Cost 3 vext2 <4,6,4,6>, <5,1,7,3> + 2913399290U, // <4,6,5,2>: Cost 3 vzipl RHS, <6,2,7,3> + 3913305394U, // <4,6,5,3>: Cost 4 vuzpr <3,4,5,6>, <4,5,6,3> + 2645462982U, // <4,6,5,4>: Cost 3 vext2 <4,6,4,6>, <5,4,7,6> + 2779172868U, // <4,6,5,5>: Cost 3 vuzpl RHS, <5,5,5,5> + 2913391416U, // <4,6,5,6>: Cost 3 vzipl RHS, <6,6,6,6> + 2821426486U, // <4,6,5,7>: Cost 3 vuzpr <0,4,2,6>, RHS + 2821426487U, // <4,6,5,u>: Cost 3 vuzpr <0,4,2,6>, RHS + 1503428710U, // <4,6,6,0>: Cost 2 vext1 <4,4,6,6>, LHS + 2577171190U, // <4,6,6,1>: Cost 3 vext1 <4,4,6,6>, <1,0,3,2> + 2645463546U, // <4,6,6,2>: Cost 3 vext2 <4,6,4,6>, <6,2,7,3> + 2577172630U, // <4,6,6,3>: Cost 3 vext1 <4,4,6,6>, <3,0,1,2> + 1503431908U, // <4,6,6,4>: Cost 2 vext1 <4,4,6,6>, <4,4,6,6> + 2253501069U, // <4,6,6,5>: Cost 3 vrev <6,4,5,6> + 2618921784U, // <4,6,6,6>: Cost 3 vext2 <0,2,4,6>, <6,6,6,6> + 2954464566U, // <4,6,6,7>: Cost 3 vzipr <0,2,4,6>, RHS + 1503434542U, // <4,6,6,u>: Cost 2 vext1 <4,4,6,6>, LHS + 2645464058U, // <4,6,7,0>: Cost 3 vext2 <4,6,4,6>, <7,0,1,2> + 2779173882U, // <4,6,7,1>: Cost 3 vuzpl RHS, <7,0,1,2> + 3638978355U, // <4,6,7,2>: Cost 4 vext1 <2,4,6,7>, <2,4,6,7> + 2725090156U, // <4,6,7,3>: Cost 3 vext3 <6,7,3,4>, <6,7,3,4> + 2645464422U, // <4,6,7,4>: Cost 3 vext2 <4,6,4,6>, <7,4,5,6> + 2779174246U, // <4,6,7,5>: Cost 3 vuzpl RHS, <7,4,5,6> + 3852915914U, // <4,6,7,6>: Cost 4 vuzpl RHS, <7,2,6,3> + 2779174508U, // <4,6,7,7>: Cost 3 vuzpl RHS, <7,7,7,7> + 2779173945U, // <4,6,7,u>: Cost 3 vuzpl RHS, <7,0,u,2> + 1503445094U, // <4,6,u,0>: Cost 2 vext1 <4,4,6,u>, LHS + 1545180974U, // <4,6,u,1>: Cost 2 vext2 <0,2,4,6>, LHS + 1705432878U, // <4,6,u,2>: Cost 2 vuzpl RHS, LHS + 2618922940U, // <4,6,u,3>: Cost 3 vext2 <0,2,4,6>, + 1503448294U, // <4,6,u,4>: Cost 2 vext1 <4,4,6,u>, <4,4,6,u> + 1545181338U, // <4,6,u,5>: Cost 2 vext2 <0,2,4,6>, RHS + 1705433242U, // <4,6,u,6>: Cost 2 vuzpl RHS, RHS + 2954480950U, // <4,6,u,7>: Cost 3 vzipr <0,2,4,u>, RHS + 1545181541U, // <4,6,u,u>: Cost 2 vext2 <0,2,4,6>, LHS + 3706601472U, // <4,7,0,0>: Cost 4 vext2 <2,5,4,7>, <0,0,0,0> + 2632859750U, // <4,7,0,1>: Cost 3 vext2 <2,5,4,7>, LHS + 2726343685U, // <4,7,0,2>: Cost 3 vext3 <7,0,2,4>, <7,0,2,4> + 3701293312U, // <4,7,0,3>: Cost 4 vext2 <1,6,4,7>, <0,3,1,4> + 3706601810U, // <4,7,0,4>: Cost 4 vext2 <2,5,4,7>, <0,4,1,5> + 2259424608U, // <4,7,0,5>: Cost 3 vrev <7,4,5,0> + 3695321617U, // <4,7,0,6>: Cost 4 vext2 <0,6,4,7>, <0,6,4,7> + 3800454194U, // <4,7,0,7>: Cost 4 vext3 <7,0,7,4>, <7,0,7,4> + 2632860317U, // <4,7,0,u>: Cost 3 vext2 <2,5,4,7>, LHS + 2259064116U, // <4,7,1,0>: Cost 3 vrev <7,4,0,1> + 3700630324U, // <4,7,1,1>: Cost 4 vext2 <1,5,4,7>, <1,1,1,1> + 2632860570U, // <4,7,1,2>: Cost 3 vext2 <2,5,4,7>, <1,2,3,4> + 3769635936U, // <4,7,1,3>: Cost 4 vext3 <1,u,3,4>, <7,1,3,5> + 3656920374U, // <4,7,1,4>: Cost 4 vext1 <5,4,7,1>, RHS + 3700630681U, // <4,7,1,5>: Cost 4 vext2 <1,5,4,7>, <1,5,4,7> + 3701294314U, // <4,7,1,6>: Cost 4 vext2 <1,6,4,7>, <1,6,4,7> + 3793818754U, // <4,7,1,7>: Cost 4 vext3 <5,u,7,4>, <7,1,7,3> + 2259654012U, // <4,7,1,u>: Cost 3 vrev <7,4,u,1> + 3656925286U, // <4,7,2,0>: Cost 4 vext1 <5,4,7,2>, LHS + 3706603050U, // <4,7,2,1>: Cost 4 vext2 <2,5,4,7>, <2,1,4,3> + 3706603112U, // <4,7,2,2>: Cost 4 vext2 <2,5,4,7>, <2,2,2,2> + 2727744688U, // <4,7,2,3>: Cost 3 vext3 <7,2,3,4>, <7,2,3,4> + 3705939745U, // <4,7,2,4>: Cost 4 vext2 <2,4,4,7>, <2,4,4,7> + 2632861554U, // <4,7,2,5>: Cost 3 vext2 <2,5,4,7>, <2,5,4,7> + 3706603450U, // <4,7,2,6>: Cost 4 vext2 <2,5,4,7>, <2,6,3,7> + 3792491731U, // 
<4,7,2,7>: Cost 4 vext3 <5,6,7,4>, <7,2,7,3> + 2634852453U, // <4,7,2,u>: Cost 3 vext2 <2,u,4,7>, <2,u,4,7> + 3706603670U, // <4,7,3,0>: Cost 4 vext2 <2,5,4,7>, <3,0,1,2> + 3662906266U, // <4,7,3,1>: Cost 4 vext1 <6,4,7,3>, <1,2,3,4> + 3725183326U, // <4,7,3,2>: Cost 4 vext2 <5,6,4,7>, <3,2,5,4> + 3706603932U, // <4,7,3,3>: Cost 4 vext2 <2,5,4,7>, <3,3,3,3> + 3701295618U, // <4,7,3,4>: Cost 4 vext2 <1,6,4,7>, <3,4,5,6> + 2638834251U, // <4,7,3,5>: Cost 3 vext2 <3,5,4,7>, <3,5,4,7> + 2639497884U, // <4,7,3,6>: Cost 3 vext2 <3,6,4,7>, <3,6,4,7> + 3802445093U, // <4,7,3,7>: Cost 4 vext3 <7,3,7,4>, <7,3,7,4> + 2640825150U, // <4,7,3,u>: Cost 3 vext2 <3,u,4,7>, <3,u,4,7> + 2718750004U, // <4,7,4,0>: Cost 3 vext3 <5,6,7,4>, <7,4,0,1> + 3706604490U, // <4,7,4,1>: Cost 4 vext2 <2,5,4,7>, <4,1,2,3> + 3656943474U, // <4,7,4,2>: Cost 4 vext1 <5,4,7,4>, <2,5,4,7> + 3779884371U, // <4,7,4,3>: Cost 4 vext3 <3,5,7,4>, <7,4,3,5> + 2259383643U, // <4,7,4,4>: Cost 3 vrev <7,4,4,4> + 2632863030U, // <4,7,4,5>: Cost 3 vext2 <2,5,4,7>, RHS + 2259531117U, // <4,7,4,6>: Cost 3 vrev <7,4,6,4> + 3907340074U, // <4,7,4,7>: Cost 4 vuzpr <2,4,5,7>, <2,4,5,7> + 2632863273U, // <4,7,4,u>: Cost 3 vext2 <2,5,4,7>, RHS + 2913391610U, // <4,7,5,0>: Cost 3 vzipl RHS, <7,0,1,2> + 3645006848U, // <4,7,5,1>: Cost 4 vext1 <3,4,7,5>, <1,3,5,7> + 2589181646U, // <4,7,5,2>: Cost 3 vext1 <6,4,7,5>, <2,3,4,5> + 3645008403U, // <4,7,5,3>: Cost 4 vext1 <3,4,7,5>, <3,4,7,5> + 2913391974U, // <4,7,5,4>: Cost 3 vzipl RHS, <7,4,5,6> + 2583211973U, // <4,7,5,5>: Cost 3 vext1 <5,4,7,5>, <5,4,7,5> + 2589184670U, // <4,7,5,6>: Cost 3 vext1 <6,4,7,5>, <6,4,7,5> + 2913392236U, // <4,7,5,7>: Cost 3 vzipl RHS, <7,7,7,7> + 2913392258U, // <4,7,5,u>: Cost 3 vzipl RHS, <7,u,1,2> + 1509474406U, // <4,7,6,0>: Cost 2 vext1 <5,4,7,6>, LHS + 3047609338U, // <4,7,6,1>: Cost 3 vtrnl RHS, <7,0,1,2> + 2583217768U, // <4,7,6,2>: Cost 3 vext1 <5,4,7,6>, <2,2,2,2> + 2583218326U, // <4,7,6,3>: Cost 3 vext1 <5,4,7,6>, <3,0,1,2> + 1509477686U, // <4,7,6,4>: Cost 2 vext1 <5,4,7,6>, RHS + 1509478342U, // <4,7,6,5>: Cost 2 vext1 <5,4,7,6>, <5,4,7,6> + 2583220730U, // <4,7,6,6>: Cost 3 vext1 <5,4,7,6>, <6,2,7,3> + 3047609964U, // <4,7,6,7>: Cost 3 vtrnl RHS, <7,7,7,7> + 1509480238U, // <4,7,6,u>: Cost 2 vext1 <5,4,7,6>, LHS + 3650994278U, // <4,7,7,0>: Cost 4 vext1 <4,4,7,7>, LHS + 3650995098U, // <4,7,7,1>: Cost 4 vext1 <4,4,7,7>, <1,2,3,4> + 3650996010U, // <4,7,7,2>: Cost 4 vext1 <4,4,7,7>, <2,4,5,7> + 3804804677U, // <4,7,7,3>: Cost 4 vext3 <7,7,3,4>, <7,7,3,4> + 3650997486U, // <4,7,7,4>: Cost 4 vext1 <4,4,7,7>, <4,4,7,7> + 2662725039U, // <4,7,7,5>: Cost 3 vext2 <7,5,4,7>, <7,5,4,7> + 3662942880U, // <4,7,7,6>: Cost 4 vext1 <6,4,7,7>, <6,4,7,7> + 2718750316U, // <4,7,7,7>: Cost 3 vext3 <5,6,7,4>, <7,7,7,7> + 2664715938U, // <4,7,7,u>: Cost 3 vext2 <7,u,4,7>, <7,u,4,7> + 1509490790U, // <4,7,u,0>: Cost 2 vext1 <5,4,7,u>, LHS + 2632865582U, // <4,7,u,1>: Cost 3 vext2 <2,5,4,7>, LHS + 2583234152U, // <4,7,u,2>: Cost 3 vext1 <5,4,7,u>, <2,2,2,2> + 2583234710U, // <4,7,u,3>: Cost 3 vext1 <5,4,7,u>, <3,0,1,2> + 1509494070U, // <4,7,u,4>: Cost 2 vext1 <5,4,7,u>, RHS + 1509494728U, // <4,7,u,5>: Cost 2 vext1 <5,4,7,u>, <5,4,7,u> + 2583237114U, // <4,7,u,6>: Cost 3 vext1 <5,4,7,u>, <6,2,7,3> + 3047757420U, // <4,7,u,7>: Cost 3 vtrnl RHS, <7,7,7,7> + 1509496622U, // <4,7,u,u>: Cost 2 vext1 <5,4,7,u>, LHS + 2618933248U, // <4,u,0,0>: Cost 3 vext2 <0,2,4,u>, <0,0,0,0> + 1545191526U, // <4,u,0,1>: Cost 2 vext2 <0,2,4,u>, LHS + 1545191630U, // <4,u,0,2>: Cost 2 vext2 
<0,2,4,u>, <0,2,4,u> + 2691913445U, // <4,u,0,3>: Cost 3 vext3 <1,2,3,4>, + 2618933586U, // <4,u,0,4>: Cost 3 vext2 <0,2,4,u>, <0,4,1,5> + 2265397305U, // <4,u,0,5>: Cost 3 vrev + 2595189625U, // <4,u,0,6>: Cost 3 vext1 <7,4,u,0>, <6,7,4,u> + 2595190139U, // <4,u,0,7>: Cost 3 vext1 <7,4,u,0>, <7,4,u,0> + 1545192093U, // <4,u,0,u>: Cost 2 vext2 <0,2,4,u>, LHS + 2618934006U, // <4,u,1,0>: Cost 3 vext2 <0,2,4,u>, <1,0,3,2> + 2618934068U, // <4,u,1,1>: Cost 3 vext2 <0,2,4,u>, <1,1,1,1> + 1618171694U, // <4,u,1,2>: Cost 2 vext3 <1,2,3,4>, LHS + 2618934232U, // <4,u,1,3>: Cost 3 vext2 <0,2,4,u>, <1,3,1,3> + 2695894848U, // <4,u,1,4>: Cost 3 vext3 <1,u,3,4>, + 2618934416U, // <4,u,1,5>: Cost 3 vext2 <0,2,4,u>, <1,5,3,7> + 3692676321U, // <4,u,1,6>: Cost 4 vext2 <0,2,4,u>, <1,6,3,7> + 2718750555U, // <4,u,1,7>: Cost 3 vext3 <5,6,7,4>, + 1618171748U, // <4,u,1,u>: Cost 2 vext3 <1,2,3,4>, LHS + 2553397350U, // <4,u,2,0>: Cost 3 vext1 <0,4,u,2>, LHS + 2630215215U, // <4,u,2,1>: Cost 3 vext2 <2,1,4,u>, <2,1,4,u> + 2618934888U, // <4,u,2,2>: Cost 3 vext2 <0,2,4,u>, <2,2,2,2> + 1557800657U, // <4,u,2,3>: Cost 2 vext2 <2,3,4,u>, <2,3,4,u> + 2618935065U, // <4,u,2,4>: Cost 3 vext2 <0,2,4,u>, <2,4,3,u> + 2733864859U, // <4,u,2,5>: Cost 3 vext3 , + 2618935226U, // <4,u,2,6>: Cost 3 vext2 <0,2,4,u>, <2,6,3,7> + 2718750636U, // <4,u,2,7>: Cost 3 vext3 <5,6,7,4>, + 1561118822U, // <4,u,2,u>: Cost 2 vext2 <2,u,4,u>, <2,u,4,u> + 2618935446U, // <4,u,3,0>: Cost 3 vext2 <0,2,4,u>, <3,0,1,2> + 2779318422U, // <4,u,3,1>: Cost 3 vuzpl RHS, <3,0,1,2> + 2636851545U, // <4,u,3,2>: Cost 3 vext2 <3,2,4,u>, <3,2,4,u> + 2618935708U, // <4,u,3,3>: Cost 3 vext2 <0,2,4,u>, <3,3,3,3> + 2618935810U, // <4,u,3,4>: Cost 3 vext2 <0,2,4,u>, <3,4,5,6> + 2691913711U, // <4,u,3,5>: Cost 3 vext3 <1,2,3,4>, + 2588725862U, // <4,u,3,6>: Cost 3 vext1 <6,4,1,3>, <6,4,1,3> + 2640169710U, // <4,u,3,7>: Cost 3 vext2 <3,7,4,u>, <3,7,4,u> + 2618936094U, // <4,u,3,u>: Cost 3 vext2 <0,2,4,u>, <3,u,1,2> + 1503559782U, // <4,u,4,0>: Cost 2 vext1 <4,4,u,4>, LHS + 2692282391U, // <4,u,4,1>: Cost 3 vext3 <1,2,u,4>, + 2565359426U, // <4,u,4,2>: Cost 3 vext1 <2,4,u,4>, <2,4,u,4> + 2571332123U, // <4,u,4,3>: Cost 3 vext1 <3,4,u,4>, <3,4,u,4> + 161926454U, // <4,u,4,4>: Cost 1 vdup0 RHS + 1545194806U, // <4,u,4,5>: Cost 2 vext2 <0,2,4,u>, RHS + 1705577782U, // <4,u,4,6>: Cost 2 vuzpl RHS, RHS + 2718750801U, // <4,u,4,7>: Cost 3 vext3 <5,6,7,4>, + 161926454U, // <4,u,4,u>: Cost 1 vdup0 RHS + 1479164006U, // <4,u,5,0>: Cost 2 vext1 <0,4,1,5>, LHS + 1839650606U, // <4,u,5,1>: Cost 2 vzipl RHS, LHS + 2565367502U, // <4,u,5,2>: Cost 3 vext1 <2,4,u,5>, <2,3,4,5> + 3089777309U, // <4,u,5,3>: Cost 3 vtrnr <0,4,1,5>, LHS + 1479167286U, // <4,u,5,4>: Cost 2 vext1 <0,4,1,5>, RHS + 1839650970U, // <4,u,5,5>: Cost 2 vzipl RHS, RHS + 1618172058U, // <4,u,5,6>: Cost 2 vext3 <1,2,3,4>, RHS + 3089780265U, // <4,u,5,7>: Cost 3 vtrnr <0,4,1,5>, RHS + 1618172076U, // <4,u,5,u>: Cost 2 vext3 <1,2,3,4>, RHS + 1479688294U, // <4,u,6,0>: Cost 2 vext1 <0,4,u,6>, LHS + 2553430774U, // <4,u,6,1>: Cost 3 vext1 <0,4,u,6>, <1,0,3,2> + 1973868334U, // <4,u,6,2>: Cost 2 vtrnl RHS, LHS + 1497606685U, // <4,u,6,3>: Cost 2 vext1 <3,4,u,6>, <3,4,u,6> + 1479691574U, // <4,u,6,4>: Cost 2 vext1 <0,4,u,6>, RHS + 1509552079U, // <4,u,6,5>: Cost 2 vext1 <5,4,u,6>, <5,4,u,6> + 1973868698U, // <4,u,6,6>: Cost 2 vtrnl RHS, RHS + 27705344U, // <4,u,6,7>: Cost 0 copy RHS + 27705344U, // <4,u,6,u>: Cost 0 copy RHS + 2565382246U, // <4,u,7,0>: Cost 3 vext1 <2,4,u,7>, LHS + 2565383066U, // <4,u,7,1>: Cost 
3 vext1 <2,4,u,7>, <1,2,3,4> + 2565384005U, // <4,u,7,2>: Cost 3 vext1 <2,4,u,7>, <2,4,u,7> + 2661405966U, // <4,u,7,3>: Cost 3 vext2 <7,3,4,u>, <7,3,4,u> + 2565385526U, // <4,u,7,4>: Cost 3 vext1 <2,4,u,7>, RHS + 2779321702U, // <4,u,7,5>: Cost 3 vuzpl RHS, <7,4,5,6> + 2589274793U, // <4,u,7,6>: Cost 3 vext1 <6,4,u,7>, <6,4,u,7> + 2779321964U, // <4,u,7,7>: Cost 3 vuzpl RHS, <7,7,7,7> + 2565388078U, // <4,u,7,u>: Cost 3 vext1 <2,4,u,7>, LHS + 1479704678U, // <4,u,u,0>: Cost 2 vext1 <0,4,u,u>, LHS + 1545197358U, // <4,u,u,1>: Cost 2 vext2 <0,2,4,u>, LHS + 1618172261U, // <4,u,u,2>: Cost 2 vext3 <1,2,3,4>, LHS + 1497623071U, // <4,u,u,3>: Cost 2 vext1 <3,4,u,u>, <3,4,u,u> + 161926454U, // <4,u,u,4>: Cost 1 vdup0 RHS + 1545197722U, // <4,u,u,5>: Cost 2 vext2 <0,2,4,u>, RHS + 1618172301U, // <4,u,u,6>: Cost 2 vext3 <1,2,3,4>, RHS + 27705344U, // <4,u,u,7>: Cost 0 copy RHS + 27705344U, // <4,u,u,u>: Cost 0 copy RHS + 2687123456U, // <5,0,0,0>: Cost 3 vext3 <0,4,1,5>, <0,0,0,0> + 2687123466U, // <5,0,0,1>: Cost 3 vext3 <0,4,1,5>, <0,0,1,1> + 2687123476U, // <5,0,0,2>: Cost 3 vext3 <0,4,1,5>, <0,0,2,2> + 3710599434U, // <5,0,0,3>: Cost 4 vext2 <3,2,5,0>, <0,3,2,5> + 2642166098U, // <5,0,0,4>: Cost 3 vext2 <4,1,5,0>, <0,4,1,5> + 3657060306U, // <5,0,0,5>: Cost 4 vext1 <5,5,0,0>, <5,5,0,0> + 3292094923U, // <5,0,0,6>: Cost 4 vrev <0,5,6,0> + 3669005700U, // <5,0,0,7>: Cost 4 vext1 <7,5,0,0>, <7,5,0,0> + 2687123530U, // <5,0,0,u>: Cost 3 vext3 <0,4,1,5>, <0,0,u,2> + 2559434854U, // <5,0,1,0>: Cost 3 vext1 <1,5,0,1>, LHS + 2559435887U, // <5,0,1,1>: Cost 3 vext1 <1,5,0,1>, <1,5,0,1> + 1613381734U, // <5,0,1,2>: Cost 2 vext3 <0,4,1,5>, LHS + 3698656256U, // <5,0,1,3>: Cost 4 vext2 <1,2,5,0>, <1,3,5,7> + 2559438134U, // <5,0,1,4>: Cost 3 vext1 <1,5,0,1>, RHS + 2583326675U, // <5,0,1,5>: Cost 3 vext1 <5,5,0,1>, <5,5,0,1> + 3715908851U, // <5,0,1,6>: Cost 4 vext2 <4,1,5,0>, <1,6,5,7> + 3657069562U, // <5,0,1,7>: Cost 4 vext1 <5,5,0,1>, <7,0,1,2> + 1613381788U, // <5,0,1,u>: Cost 2 vext3 <0,4,1,5>, LHS + 2686017700U, // <5,0,2,0>: Cost 3 vext3 <0,2,4,5>, <0,2,0,2> + 2685796528U, // <5,0,2,1>: Cost 3 vext3 <0,2,1,5>, <0,2,1,5> + 2698625208U, // <5,0,2,2>: Cost 3 vext3 <2,3,4,5>, <0,2,2,4> + 2685944002U, // <5,0,2,3>: Cost 3 vext3 <0,2,3,5>, <0,2,3,5> + 2686017739U, // <5,0,2,4>: Cost 3 vext3 <0,2,4,5>, <0,2,4,5> + 2686091476U, // <5,0,2,5>: Cost 3 vext3 <0,2,5,5>, <0,2,5,5> + 2725167324U, // <5,0,2,6>: Cost 3 vext3 <6,7,4,5>, <0,2,6,4> + 2595280230U, // <5,0,2,7>: Cost 3 vext1 <7,5,0,2>, <7,4,5,6> + 2686312687U, // <5,0,2,u>: Cost 3 vext3 <0,2,u,5>, <0,2,u,5> + 3760128248U, // <5,0,3,0>: Cost 4 vext3 <0,3,0,5>, <0,3,0,5> + 3759685888U, // <5,0,3,1>: Cost 4 vext3 <0,2,3,5>, <0,3,1,4> + 2686533898U, // <5,0,3,2>: Cost 3 vext3 <0,3,2,5>, <0,3,2,5> + 3760349459U, // <5,0,3,3>: Cost 4 vext3 <0,3,3,5>, <0,3,3,5> + 2638187004U, // <5,0,3,4>: Cost 3 vext2 <3,4,5,0>, <3,4,5,0> + 3776348452U, // <5,0,3,5>: Cost 4 vext3 <3,0,4,5>, <0,3,5,4> + 3713256094U, // <5,0,3,6>: Cost 4 vext2 <3,6,5,0>, <3,6,5,0> + 3914064896U, // <5,0,3,7>: Cost 4 vuzpr <3,5,7,0>, <1,3,5,7> + 2686976320U, // <5,0,3,u>: Cost 3 vext3 <0,3,u,5>, <0,3,u,5> + 2559459430U, // <5,0,4,0>: Cost 3 vext1 <1,5,0,4>, LHS + 1613381970U, // <5,0,4,1>: Cost 2 vext3 <0,4,1,5>, <0,4,1,5> + 2687123804U, // <5,0,4,2>: Cost 3 vext3 <0,4,1,5>, <0,4,2,6> + 3761013092U, // <5,0,4,3>: Cost 4 vext3 <0,4,3,5>, <0,4,3,5> + 2559462710U, // <5,0,4,4>: Cost 3 vext1 <1,5,0,4>, RHS + 2638187830U, // <5,0,4,5>: Cost 3 vext2 <3,4,5,0>, RHS + 3761234303U, // <5,0,4,6>: Cost 4 
vext3 <0,4,6,5>, <0,4,6,5> + 2646150600U, // <5,0,4,7>: Cost 3 vext2 <4,7,5,0>, <4,7,5,0> + 1613381970U, // <5,0,4,u>: Cost 2 vext3 <0,4,1,5>, <0,4,1,5> + 3766763926U, // <5,0,5,0>: Cost 4 vext3 <1,4,0,5>, <0,5,0,1> + 2919268454U, // <5,0,5,1>: Cost 3 vzipl <5,5,5,5>, LHS + 3053486182U, // <5,0,5,2>: Cost 3 vtrnl <5,5,5,5>, LHS + 3723210589U, // <5,0,5,3>: Cost 4 vext2 <5,3,5,0>, <5,3,5,0> + 3766763966U, // <5,0,5,4>: Cost 4 vext3 <1,4,0,5>, <0,5,4,5> + 2650796031U, // <5,0,5,5>: Cost 3 vext2 <5,5,5,0>, <5,5,5,0> + 3719893090U, // <5,0,5,6>: Cost 4 vext2 <4,7,5,0>, <5,6,7,0> + 3914067254U, // <5,0,5,7>: Cost 4 vuzpr <3,5,7,0>, RHS + 2919269021U, // <5,0,5,u>: Cost 3 vzipl <5,5,5,5>, LHS + 4047519744U, // <5,0,6,0>: Cost 4 vzipr <3,4,5,6>, <0,0,0,0> + 2920038502U, // <5,0,6,1>: Cost 3 vzipl <5,6,7,0>, LHS + 3759759871U, // <5,0,6,2>: Cost 4 vext3 <0,2,4,5>, <0,6,2,7> + 3645164070U, // <5,0,6,3>: Cost 4 vext1 <3,5,0,6>, <3,5,0,6> + 3762414095U, // <5,0,6,4>: Cost 4 vext3 <0,6,4,5>, <0,6,4,5> + 3993780690U, // <5,0,6,5>: Cost 4 vzipl <5,6,7,0>, <0,5,6,7> + 3719893816U, // <5,0,6,6>: Cost 4 vext2 <4,7,5,0>, <6,6,6,6> + 2662077302U, // <5,0,6,7>: Cost 3 vext2 <7,4,5,0>, <6,7,4,5> + 2920039069U, // <5,0,6,u>: Cost 3 vzipl <5,6,7,0>, LHS + 2565455974U, // <5,0,7,0>: Cost 3 vext1 <2,5,0,7>, LHS + 2565456790U, // <5,0,7,1>: Cost 3 vext1 <2,5,0,7>, <1,2,3,0> + 2565457742U, // <5,0,7,2>: Cost 3 vext1 <2,5,0,7>, <2,5,0,7> + 3639199894U, // <5,0,7,3>: Cost 4 vext1 <2,5,0,7>, <3,0,1,2> + 2565459254U, // <5,0,7,4>: Cost 3 vext1 <2,5,0,7>, RHS + 2589347938U, // <5,0,7,5>: Cost 3 vext1 <6,5,0,7>, <5,6,7,0> + 2589348530U, // <5,0,7,6>: Cost 3 vext1 <6,5,0,7>, <6,5,0,7> + 4188456422U, // <5,0,7,7>: Cost 4 vtrnr RHS, <2,0,5,7> + 2565461806U, // <5,0,7,u>: Cost 3 vext1 <2,5,0,7>, LHS + 2687124106U, // <5,0,u,0>: Cost 3 vext3 <0,4,1,5>, <0,u,0,2> + 1616036502U, // <5,0,u,1>: Cost 2 vext3 <0,u,1,5>, <0,u,1,5> + 1613382301U, // <5,0,u,2>: Cost 2 vext3 <0,4,1,5>, LHS + 2689925800U, // <5,0,u,3>: Cost 3 vext3 <0,u,3,5>, <0,u,3,5> + 2687124146U, // <5,0,u,4>: Cost 3 vext3 <0,4,1,5>, <0,u,4,6> + 2638190746U, // <5,0,u,5>: Cost 3 vext2 <3,4,5,0>, RHS + 2589356723U, // <5,0,u,6>: Cost 3 vext1 <6,5,0,u>, <6,5,0,u> + 2595280230U, // <5,0,u,7>: Cost 3 vext1 <7,5,0,2>, <7,4,5,6> + 1613382355U, // <5,0,u,u>: Cost 2 vext3 <0,4,1,5>, LHS + 2646818816U, // <5,1,0,0>: Cost 3 vext2 <4,u,5,1>, <0,0,0,0> + 1573077094U, // <5,1,0,1>: Cost 2 vext2 <4,u,5,1>, LHS + 2646818980U, // <5,1,0,2>: Cost 3 vext2 <4,u,5,1>, <0,2,0,2> + 2687124214U, // <5,1,0,3>: Cost 3 vext3 <0,4,1,5>, <1,0,3,2> + 2641510738U, // <5,1,0,4>: Cost 3 vext2 <4,0,5,1>, <0,4,1,5> + 2641510814U, // <5,1,0,5>: Cost 3 vext2 <4,0,5,1>, <0,5,1,0> + 3720561142U, // <5,1,0,6>: Cost 4 vext2 <4,u,5,1>, <0,6,1,7> + 3298141357U, // <5,1,0,7>: Cost 4 vrev <1,5,7,0> + 1573077661U, // <5,1,0,u>: Cost 2 vext2 <4,u,5,1>, LHS + 2223891567U, // <5,1,1,0>: Cost 3 vrev <1,5,0,1> + 2687124276U, // <5,1,1,1>: Cost 3 vext3 <0,4,1,5>, <1,1,1,1> + 2646819734U, // <5,1,1,2>: Cost 3 vext2 <4,u,5,1>, <1,2,3,0> + 2687124296U, // <5,1,1,3>: Cost 3 vext3 <0,4,1,5>, <1,1,3,3> + 2691326803U, // <5,1,1,4>: Cost 3 vext3 <1,1,4,5>, <1,1,4,5> + 2691400540U, // <5,1,1,5>: Cost 3 vext3 <1,1,5,5>, <1,1,5,5> + 3765216101U, // <5,1,1,6>: Cost 4 vext3 <1,1,6,5>, <1,1,6,5> + 3765289838U, // <5,1,1,7>: Cost 4 vext3 <1,1,7,5>, <1,1,7,5> + 2687124341U, // <5,1,1,u>: Cost 3 vext3 <0,4,1,5>, <1,1,u,3> + 3297641584U, // <5,1,2,0>: Cost 4 vrev <1,5,0,2> + 3763520391U, // <5,1,2,1>: Cost 4 vext3 <0,u,1,5>, <1,2,1,3> 
+ 2646820456U, // <5,1,2,2>: Cost 3 vext2 <4,u,5,1>, <2,2,2,2> + 2687124374U, // <5,1,2,3>: Cost 3 vext3 <0,4,1,5>, <1,2,3,0> + 2691990436U, // <5,1,2,4>: Cost 3 vext3 <1,2,4,5>, <1,2,4,5> + 2687124395U, // <5,1,2,5>: Cost 3 vext3 <0,4,1,5>, <1,2,5,3> + 2646820794U, // <5,1,2,6>: Cost 3 vext2 <4,u,5,1>, <2,6,3,7> + 3808199610U, // <5,1,2,7>: Cost 4 vext3 , <1,2,7,0> + 2687124419U, // <5,1,2,u>: Cost 3 vext3 <0,4,1,5>, <1,2,u,0> + 2577440870U, // <5,1,3,0>: Cost 3 vext1 <4,5,1,3>, LHS + 2687124440U, // <5,1,3,1>: Cost 3 vext3 <0,4,1,5>, <1,3,1,3> + 3759686627U, // <5,1,3,2>: Cost 4 vext3 <0,2,3,5>, <1,3,2,5> + 2692580332U, // <5,1,3,3>: Cost 3 vext3 <1,3,3,5>, <1,3,3,5> + 2687124469U, // <5,1,3,4>: Cost 3 vext3 <0,4,1,5>, <1,3,4,5> + 2685207552U, // <5,1,3,5>: Cost 3 vext3 <0,1,2,5>, <1,3,5,7> + 3760866313U, // <5,1,3,6>: Cost 4 vext3 <0,4,1,5>, <1,3,6,7> + 2692875280U, // <5,1,3,7>: Cost 3 vext3 <1,3,7,5>, <1,3,7,5> + 2687124503U, // <5,1,3,u>: Cost 3 vext3 <0,4,1,5>, <1,3,u,3> + 1567771538U, // <5,1,4,0>: Cost 2 vext2 <4,0,5,1>, <4,0,5,1> + 2693096491U, // <5,1,4,1>: Cost 3 vext3 <1,4,1,5>, <1,4,1,5> + 2693170228U, // <5,1,4,2>: Cost 3 vext3 <1,4,2,5>, <1,4,2,5> + 2687124541U, // <5,1,4,3>: Cost 3 vext3 <0,4,1,5>, <1,4,3,5> + 2646822096U, // <5,1,4,4>: Cost 3 vext2 <4,u,5,1>, <4,4,4,4> + 1573080374U, // <5,1,4,5>: Cost 2 vext2 <4,u,5,1>, RHS + 2646822260U, // <5,1,4,6>: Cost 3 vext2 <4,u,5,1>, <4,6,4,6> + 3298174129U, // <5,1,4,7>: Cost 4 vrev <1,5,7,4> + 1573080602U, // <5,1,4,u>: Cost 2 vext2 <4,u,5,1>, <4,u,5,1> + 2687124591U, // <5,1,5,0>: Cost 3 vext3 <0,4,1,5>, <1,5,0,1> + 2646822543U, // <5,1,5,1>: Cost 3 vext2 <4,u,5,1>, <5,1,0,1> + 3760866433U, // <5,1,5,2>: Cost 4 vext3 <0,4,1,5>, <1,5,2,1> + 2687124624U, // <5,1,5,3>: Cost 3 vext3 <0,4,1,5>, <1,5,3,7> + 2687124631U, // <5,1,5,4>: Cost 3 vext3 <0,4,1,5>, <1,5,4,5> + 2646822916U, // <5,1,5,5>: Cost 3 vext2 <4,u,5,1>, <5,5,5,5> + 2646823010U, // <5,1,5,6>: Cost 3 vext2 <4,u,5,1>, <5,6,7,0> + 2646823080U, // <5,1,5,7>: Cost 3 vext2 <4,u,5,1>, <5,7,5,7> + 2687124663U, // <5,1,5,u>: Cost 3 vext3 <0,4,1,5>, <1,5,u,1> + 2553577574U, // <5,1,6,0>: Cost 3 vext1 <0,5,1,6>, LHS + 3763520719U, // <5,1,6,1>: Cost 4 vext3 <0,u,1,5>, <1,6,1,7> + 2646823418U, // <5,1,6,2>: Cost 3 vext2 <4,u,5,1>, <6,2,7,3> + 3760866529U, // <5,1,6,3>: Cost 4 vext3 <0,4,1,5>, <1,6,3,7> + 2553580854U, // <5,1,6,4>: Cost 3 vext1 <0,5,1,6>, RHS + 2687124723U, // <5,1,6,5>: Cost 3 vext3 <0,4,1,5>, <1,6,5,7> + 2646823736U, // <5,1,6,6>: Cost 3 vext2 <4,u,5,1>, <6,6,6,6> + 2646823758U, // <5,1,6,7>: Cost 3 vext2 <4,u,5,1>, <6,7,0,1> + 2646823839U, // <5,1,6,u>: Cost 3 vext2 <4,u,5,1>, <6,u,0,1> + 2559557734U, // <5,1,7,0>: Cost 3 vext1 <1,5,1,7>, LHS + 2559558452U, // <5,1,7,1>: Cost 3 vext1 <1,5,1,7>, <1,1,1,1> + 2571503270U, // <5,1,7,2>: Cost 3 vext1 <3,5,1,7>, <2,3,0,1> + 2040971366U, // <5,1,7,3>: Cost 2 vtrnr RHS, LHS + 2559561014U, // <5,1,7,4>: Cost 3 vext1 <1,5,1,7>, RHS + 2595393232U, // <5,1,7,5>: Cost 3 vext1 <7,5,1,7>, <5,1,7,3> + 4188455035U, // <5,1,7,6>: Cost 4 vtrnr RHS, <0,1,4,6> + 2646824556U, // <5,1,7,7>: Cost 3 vext2 <4,u,5,1>, <7,7,7,7> + 2040971371U, // <5,1,7,u>: Cost 2 vtrnr RHS, LHS + 1591662326U, // <5,1,u,0>: Cost 2 vext2 , + 1573082926U, // <5,1,u,1>: Cost 2 vext2 <4,u,5,1>, LHS + 2695824760U, // <5,1,u,2>: Cost 3 vext3 <1,u,2,5>, <1,u,2,5> + 2040979558U, // <5,1,u,3>: Cost 2 vtrnr RHS, LHS + 2687124874U, // <5,1,u,4>: Cost 3 vext3 <0,4,1,5>, <1,u,4,5> + 1573083290U, // <5,1,u,5>: Cost 2 vext2 <4,u,5,1>, RHS + 2646825168U, // <5,1,u,6>: 
Cost 3 vext2 <4,u,5,1>, + 2646825216U, // <5,1,u,7>: Cost 3 vext2 <4,u,5,1>, + 2040979563U, // <5,1,u,u>: Cost 2 vtrnr RHS, LHS + 3702652928U, // <5,2,0,0>: Cost 4 vext2 <1,u,5,2>, <0,0,0,0> + 2628911206U, // <5,2,0,1>: Cost 3 vext2 <1,u,5,2>, LHS + 2641518756U, // <5,2,0,2>: Cost 3 vext2 <4,0,5,2>, <0,2,0,2> + 3759760847U, // <5,2,0,3>: Cost 4 vext3 <0,2,4,5>, <2,0,3,2> + 3760866775U, // <5,2,0,4>: Cost 4 vext3 <0,4,1,5>, <2,0,4,1> + 3759539680U, // <5,2,0,5>: Cost 4 vext3 <0,2,1,5>, <2,0,5,1> + 3760866796U, // <5,2,0,6>: Cost 4 vext3 <0,4,1,5>, <2,0,6,4> + 3304114054U, // <5,2,0,7>: Cost 4 vrev <2,5,7,0> + 2628911773U, // <5,2,0,u>: Cost 3 vext2 <1,u,5,2>, LHS + 2623603464U, // <5,2,1,0>: Cost 3 vext2 <1,0,5,2>, <1,0,5,2> + 3698008921U, // <5,2,1,1>: Cost 4 vext2 <1,1,5,2>, <1,1,5,2> + 3633325603U, // <5,2,1,2>: Cost 4 vext1 <1,5,2,1>, <2,1,3,5> + 2687125027U, // <5,2,1,3>: Cost 3 vext3 <0,4,1,5>, <2,1,3,5> + 3633327414U, // <5,2,1,4>: Cost 4 vext1 <1,5,2,1>, RHS + 3759539760U, // <5,2,1,5>: Cost 4 vext3 <0,2,1,5>, <2,1,5,0> + 3760866876U, // <5,2,1,6>: Cost 4 vext3 <0,4,1,5>, <2,1,6,3> + 3304122247U, // <5,2,1,7>: Cost 4 vrev <2,5,7,1> + 2687125072U, // <5,2,1,u>: Cost 3 vext3 <0,4,1,5>, <2,1,u,5> + 3633332326U, // <5,2,2,0>: Cost 4 vext1 <1,5,2,2>, LHS + 3759760992U, // <5,2,2,1>: Cost 4 vext3 <0,2,4,5>, <2,2,1,3> + 2687125096U, // <5,2,2,2>: Cost 3 vext3 <0,4,1,5>, <2,2,2,2> + 2687125106U, // <5,2,2,3>: Cost 3 vext3 <0,4,1,5>, <2,2,3,3> + 2697963133U, // <5,2,2,4>: Cost 3 vext3 <2,2,4,5>, <2,2,4,5> + 3759466120U, // <5,2,2,5>: Cost 4 vext3 <0,2,0,5>, <2,2,5,7> + 3760866960U, // <5,2,2,6>: Cost 4 vext3 <0,4,1,5>, <2,2,6,6> + 3771926168U, // <5,2,2,7>: Cost 4 vext3 <2,2,7,5>, <2,2,7,5> + 2687125151U, // <5,2,2,u>: Cost 3 vext3 <0,4,1,5>, <2,2,u,3> + 2687125158U, // <5,2,3,0>: Cost 3 vext3 <0,4,1,5>, <2,3,0,1> + 2698405555U, // <5,2,3,1>: Cost 3 vext3 <2,3,1,5>, <2,3,1,5> + 2577516238U, // <5,2,3,2>: Cost 3 vext1 <4,5,2,3>, <2,3,4,5> + 3759687365U, // <5,2,3,3>: Cost 4 vext3 <0,2,3,5>, <2,3,3,5> + 1624884942U, // <5,2,3,4>: Cost 2 vext3 <2,3,4,5>, <2,3,4,5> + 2698700503U, // <5,2,3,5>: Cost 3 vext3 <2,3,5,5>, <2,3,5,5> + 3772368608U, // <5,2,3,6>: Cost 4 vext3 <2,3,4,5>, <2,3,6,5> + 3702655716U, // <5,2,3,7>: Cost 4 vext2 <1,u,5,2>, <3,7,3,7> + 1625179890U, // <5,2,3,u>: Cost 2 vext3 <2,3,u,5>, <2,3,u,5> + 2641521555U, // <5,2,4,0>: Cost 3 vext2 <4,0,5,2>, <4,0,5,2> + 3772368642U, // <5,2,4,1>: Cost 4 vext3 <2,3,4,5>, <2,4,1,3> + 2699142925U, // <5,2,4,2>: Cost 3 vext3 <2,4,2,5>, <2,4,2,5> + 2698626838U, // <5,2,4,3>: Cost 3 vext3 <2,3,4,5>, <2,4,3,5> + 2698626848U, // <5,2,4,4>: Cost 3 vext3 <2,3,4,5>, <2,4,4,6> + 2628914486U, // <5,2,4,5>: Cost 3 vext2 <1,u,5,2>, RHS + 2645503353U, // <5,2,4,6>: Cost 3 vext2 <4,6,5,2>, <4,6,5,2> + 3304146826U, // <5,2,4,7>: Cost 4 vrev <2,5,7,4> + 2628914729U, // <5,2,4,u>: Cost 3 vext2 <1,u,5,2>, RHS + 2553643110U, // <5,2,5,0>: Cost 3 vext1 <0,5,2,5>, LHS + 3758950227U, // <5,2,5,1>: Cost 4 vext3 <0,1,2,5>, <2,5,1,3> + 3759761248U, // <5,2,5,2>: Cost 4 vext3 <0,2,4,5>, <2,5,2,7> + 2982396006U, // <5,2,5,3>: Cost 3 vzipr <4,u,5,5>, LHS + 2553646390U, // <5,2,5,4>: Cost 3 vext1 <0,5,2,5>, RHS + 2553647108U, // <5,2,5,5>: Cost 3 vext1 <0,5,2,5>, <5,5,5,5> + 3760867204U, // <5,2,5,6>: Cost 4 vext3 <0,4,1,5>, <2,5,6,7> + 3702657141U, // <5,2,5,7>: Cost 4 vext2 <1,u,5,2>, <5,7,0,1> + 2982396011U, // <5,2,5,u>: Cost 3 vzipr <4,u,5,5>, LHS + 3627393126U, // <5,2,6,0>: Cost 4 vext1 <0,5,2,6>, LHS + 3760867236U, // <5,2,6,1>: Cost 4 vext3 <0,4,1,5>, <2,6,1,3> 
+ 2645504506U, // <5,2,6,2>: Cost 3 vext2 <4,6,5,2>, <6,2,7,3> + 2687125434U, // <5,2,6,3>: Cost 3 vext3 <0,4,1,5>, <2,6,3,7> + 2700617665U, // <5,2,6,4>: Cost 3 vext3 <2,6,4,5>, <2,6,4,5> + 3760867276U, // <5,2,6,5>: Cost 4 vext3 <0,4,1,5>, <2,6,5,7> + 3763521493U, // <5,2,6,6>: Cost 4 vext3 <0,u,1,5>, <2,6,6,7> + 3719246670U, // <5,2,6,7>: Cost 4 vext2 <4,6,5,2>, <6,7,0,1> + 2687125479U, // <5,2,6,u>: Cost 3 vext3 <0,4,1,5>, <2,6,u,7> + 2565603430U, // <5,2,7,0>: Cost 3 vext1 <2,5,2,7>, LHS + 2553660150U, // <5,2,7,1>: Cost 3 vext1 <0,5,2,7>, <1,0,3,2> + 2565605216U, // <5,2,7,2>: Cost 3 vext1 <2,5,2,7>, <2,5,2,7> + 2961178726U, // <5,2,7,3>: Cost 3 vzipr <1,3,5,7>, LHS + 2565606710U, // <5,2,7,4>: Cost 3 vext1 <2,5,2,7>, RHS + 4034920552U, // <5,2,7,5>: Cost 4 vzipr <1,3,5,7>, <0,1,2,5> + 3114713292U, // <5,2,7,6>: Cost 3 vtrnr RHS, <0,2,4,6> + 3702658668U, // <5,2,7,7>: Cost 4 vext2 <1,u,5,2>, <7,7,7,7> + 2961178731U, // <5,2,7,u>: Cost 3 vzipr <1,3,5,7>, LHS + 2687125563U, // <5,2,u,0>: Cost 3 vext3 <0,4,1,5>, <2,u,0,1> + 2628917038U, // <5,2,u,1>: Cost 3 vext2 <1,u,5,2>, LHS + 2565613409U, // <5,2,u,2>: Cost 3 vext1 <2,5,2,u>, <2,5,2,u> + 2687125592U, // <5,2,u,3>: Cost 3 vext3 <0,4,1,5>, <2,u,3,3> + 1628203107U, // <5,2,u,4>: Cost 2 vext3 <2,u,4,5>, <2,u,4,5> + 2628917402U, // <5,2,u,5>: Cost 3 vext2 <1,u,5,2>, RHS + 2702092405U, // <5,2,u,6>: Cost 3 vext3 <2,u,6,5>, <2,u,6,5> + 3304179598U, // <5,2,u,7>: Cost 4 vrev <2,5,7,u> + 1628498055U, // <5,2,u,u>: Cost 2 vext3 <2,u,u,5>, <2,u,u,5> + 3760867467U, // <5,3,0,0>: Cost 4 vext3 <0,4,1,5>, <3,0,0,0> + 2687125654U, // <5,3,0,1>: Cost 3 vext3 <0,4,1,5>, <3,0,1,2> + 3759761565U, // <5,3,0,2>: Cost 4 vext3 <0,2,4,5>, <3,0,2,0> + 3633391766U, // <5,3,0,3>: Cost 4 vext1 <1,5,3,0>, <3,0,1,2> + 2687125680U, // <5,3,0,4>: Cost 3 vext3 <0,4,1,5>, <3,0,4,1> + 3760277690U, // <5,3,0,5>: Cost 4 vext3 <0,3,2,5>, <3,0,5,2> + 3310013014U, // <5,3,0,6>: Cost 4 vrev <3,5,6,0> + 2236344927U, // <5,3,0,7>: Cost 3 vrev <3,5,7,0> + 2687125717U, // <5,3,0,u>: Cost 3 vext3 <0,4,1,5>, <3,0,u,2> + 3760867551U, // <5,3,1,0>: Cost 4 vext3 <0,4,1,5>, <3,1,0,3> + 3760867558U, // <5,3,1,1>: Cost 4 vext3 <0,4,1,5>, <3,1,1,1> + 2624938923U, // <5,3,1,2>: Cost 3 vext2 <1,2,5,3>, <1,2,5,3> + 2703198460U, // <5,3,1,3>: Cost 3 vext3 <3,1,3,5>, <3,1,3,5> + 3760867587U, // <5,3,1,4>: Cost 4 vext3 <0,4,1,5>, <3,1,4,3> + 2636219536U, // <5,3,1,5>: Cost 3 vext2 <3,1,5,3>, <1,5,3,7> + 3698681075U, // <5,3,1,6>: Cost 4 vext2 <1,2,5,3>, <1,6,5,7> + 2703493408U, // <5,3,1,7>: Cost 3 vext3 <3,1,7,5>, <3,1,7,5> + 2628920721U, // <5,3,1,u>: Cost 3 vext2 <1,u,5,3>, <1,u,5,3> + 3766765870U, // <5,3,2,0>: Cost 4 vext3 <1,4,0,5>, <3,2,0,1> + 3698681379U, // <5,3,2,1>: Cost 4 vext2 <1,2,5,3>, <2,1,3,5> + 3760867649U, // <5,3,2,2>: Cost 4 vext3 <0,4,1,5>, <3,2,2,2> + 2698627404U, // <5,3,2,3>: Cost 3 vext3 <2,3,4,5>, <3,2,3,4> + 2703935830U, // <5,3,2,4>: Cost 3 vext3 <3,2,4,5>, <3,2,4,5> + 2698627422U, // <5,3,2,5>: Cost 3 vext3 <2,3,4,5>, <3,2,5,4> + 3760867686U, // <5,3,2,6>: Cost 4 vext3 <0,4,1,5>, <3,2,6,3> + 3769788783U, // <5,3,2,7>: Cost 4 vext3 <1,u,5,5>, <3,2,7,3> + 2701945209U, // <5,3,2,u>: Cost 3 vext3 <2,u,4,5>, <3,2,u,4> + 3760867711U, // <5,3,3,0>: Cost 4 vext3 <0,4,1,5>, <3,3,0,1> + 2636220684U, // <5,3,3,1>: Cost 3 vext2 <3,1,5,3>, <3,1,5,3> + 3772369298U, // <5,3,3,2>: Cost 4 vext3 <2,3,4,5>, <3,3,2,2> + 2687125916U, // <5,3,3,3>: Cost 3 vext3 <0,4,1,5>, <3,3,3,3> + 2704599463U, // <5,3,3,4>: Cost 3 vext3 <3,3,4,5>, <3,3,4,5> + 2704673200U, // <5,3,3,5>: Cost 3 vext3 
<3,3,5,5>, <3,3,5,5> + 3709962935U, // <5,3,3,6>: Cost 4 vext2 <3,1,5,3>, <3,6,7,7> + 3772369346U, // <5,3,3,7>: Cost 4 vext3 <2,3,4,5>, <3,3,7,5> + 2704894411U, // <5,3,3,u>: Cost 3 vext3 <3,3,u,5>, <3,3,u,5> + 2704968148U, // <5,3,4,0>: Cost 3 vext3 <3,4,0,5>, <3,4,0,5> + 3698682850U, // <5,3,4,1>: Cost 4 vext2 <1,2,5,3>, <4,1,5,0> + 2642857014U, // <5,3,4,2>: Cost 3 vext2 <4,2,5,3>, <4,2,5,3> + 2705189359U, // <5,3,4,3>: Cost 3 vext3 <3,4,3,5>, <3,4,3,5> + 2705263096U, // <5,3,4,4>: Cost 3 vext3 <3,4,4,5>, <3,4,4,5> + 2685946370U, // <5,3,4,5>: Cost 3 vext3 <0,2,3,5>, <3,4,5,6> + 3779152394U, // <5,3,4,6>: Cost 4 vext3 <3,4,6,5>, <3,4,6,5> + 2236377699U, // <5,3,4,7>: Cost 3 vrev <3,5,7,4> + 2687126045U, // <5,3,4,u>: Cost 3 vext3 <0,4,1,5>, <3,4,u,6> + 2571632742U, // <5,3,5,0>: Cost 3 vext1 <3,5,3,5>, LHS + 2559689870U, // <5,3,5,1>: Cost 3 vext1 <1,5,3,5>, <1,5,3,5> + 2571634382U, // <5,3,5,2>: Cost 3 vext1 <3,5,3,5>, <2,3,4,5> + 2571635264U, // <5,3,5,3>: Cost 3 vext1 <3,5,3,5>, <3,5,3,5> + 2571636022U, // <5,3,5,4>: Cost 3 vext1 <3,5,3,5>, RHS + 2559692804U, // <5,3,5,5>: Cost 3 vext1 <1,5,3,5>, <5,5,5,5> + 3720581218U, // <5,3,5,6>: Cost 4 vext2 <4,u,5,3>, <5,6,7,0> + 2236385892U, // <5,3,5,7>: Cost 3 vrev <3,5,7,5> + 2571638574U, // <5,3,5,u>: Cost 3 vext1 <3,5,3,5>, LHS + 2565668966U, // <5,3,6,0>: Cost 3 vext1 <2,5,3,6>, LHS + 3633439887U, // <5,3,6,1>: Cost 4 vext1 <1,5,3,6>, <1,5,3,6> + 2565670760U, // <5,3,6,2>: Cost 3 vext1 <2,5,3,6>, <2,5,3,6> + 2565671426U, // <5,3,6,3>: Cost 3 vext1 <2,5,3,6>, <3,4,5,6> + 2565672246U, // <5,3,6,4>: Cost 3 vext1 <2,5,3,6>, RHS + 3639414630U, // <5,3,6,5>: Cost 4 vext1 <2,5,3,6>, <5,3,6,0> + 4047521640U, // <5,3,6,6>: Cost 4 vzipr <3,4,5,6>, <2,5,3,6> + 2725169844U, // <5,3,6,7>: Cost 3 vext3 <6,7,4,5>, <3,6,7,4> + 2565674798U, // <5,3,6,u>: Cost 3 vext1 <2,5,3,6>, LHS + 1485963366U, // <5,3,7,0>: Cost 2 vext1 <1,5,3,7>, LHS + 1485964432U, // <5,3,7,1>: Cost 2 vext1 <1,5,3,7>, <1,5,3,7> + 2559706728U, // <5,3,7,2>: Cost 3 vext1 <1,5,3,7>, <2,2,2,2> + 2559707286U, // <5,3,7,3>: Cost 3 vext1 <1,5,3,7>, <3,0,1,2> + 1485966646U, // <5,3,7,4>: Cost 2 vext1 <1,5,3,7>, RHS + 2559708880U, // <5,3,7,5>: Cost 3 vext1 <1,5,3,7>, <5,1,7,3> + 2601513466U, // <5,3,7,6>: Cost 3 vext1 , <6,2,7,3> + 3114714112U, // <5,3,7,7>: Cost 3 vtrnr RHS, <1,3,5,7> + 1485969198U, // <5,3,7,u>: Cost 2 vext1 <1,5,3,7>, LHS + 1485971558U, // <5,3,u,0>: Cost 2 vext1 <1,5,3,u>, LHS + 1485972625U, // <5,3,u,1>: Cost 2 vext1 <1,5,3,u>, <1,5,3,u> + 2559714920U, // <5,3,u,2>: Cost 3 vext1 <1,5,3,u>, <2,2,2,2> + 2559715478U, // <5,3,u,3>: Cost 3 vext1 <1,5,3,u>, <3,0,1,2> + 1485974838U, // <5,3,u,4>: Cost 2 vext1 <1,5,3,u>, RHS + 2687126342U, // <5,3,u,5>: Cost 3 vext3 <0,4,1,5>, <3,u,5,6> + 2601521658U, // <5,3,u,6>: Cost 3 vext1 , <6,2,7,3> + 2236410471U, // <5,3,u,7>: Cost 3 vrev <3,5,7,u> + 1485977390U, // <5,3,u,u>: Cost 2 vext1 <1,5,3,u>, LHS + 3627491430U, // <5,4,0,0>: Cost 4 vext1 <0,5,4,0>, LHS + 2636890214U, // <5,4,0,1>: Cost 3 vext2 <3,2,5,4>, LHS + 3703333028U, // <5,4,0,2>: Cost 4 vext2 <2,0,5,4>, <0,2,0,2> + 3782249348U, // <5,4,0,3>: Cost 4 vext3 <4,0,3,5>, <4,0,3,5> + 2642198866U, // <5,4,0,4>: Cost 3 vext2 <4,1,5,4>, <0,4,1,5> + 2687126418U, // <5,4,0,5>: Cost 3 vext3 <0,4,1,5>, <4,0,5,1> + 2242243887U, // <5,4,0,6>: Cost 3 vrev <4,5,6,0> + 3316059448U, // <5,4,0,7>: Cost 4 vrev <4,5,7,0> + 2636890781U, // <5,4,0,u>: Cost 3 vext2 <3,2,5,4>, LHS + 2241809658U, // <5,4,1,0>: Cost 3 vrev <4,5,0,1> + 3698025307U, // <5,4,1,1>: Cost 4 vext2 <1,1,5,4>, <1,1,5,4> + 
3698688940U, // <5,4,1,2>: Cost 4 vext2 <1,2,5,4>, <1,2,5,4> + 3698689024U, // <5,4,1,3>: Cost 4 vext2 <1,2,5,4>, <1,3,5,7> + 3700016206U, // <5,4,1,4>: Cost 4 vext2 <1,4,5,4>, <1,4,5,4> + 2687126498U, // <5,4,1,5>: Cost 3 vext3 <0,4,1,5>, <4,1,5,0> + 3760868336U, // <5,4,1,6>: Cost 4 vext3 <0,4,1,5>, <4,1,6,5> + 3316067641U, // <5,4,1,7>: Cost 4 vrev <4,5,7,1> + 2242399554U, // <5,4,1,u>: Cost 3 vrev <4,5,u,1> + 3703334371U, // <5,4,2,0>: Cost 4 vext2 <2,0,5,4>, <2,0,5,4> + 3703998004U, // <5,4,2,1>: Cost 4 vext2 <2,1,5,4>, <2,1,5,4> + 3704661637U, // <5,4,2,2>: Cost 4 vext2 <2,2,5,4>, <2,2,5,4> + 2636891854U, // <5,4,2,3>: Cost 3 vext2 <3,2,5,4>, <2,3,4,5> + 3705988903U, // <5,4,2,4>: Cost 4 vext2 <2,4,5,4>, <2,4,5,4> + 2698628150U, // <5,4,2,5>: Cost 3 vext3 <2,3,4,5>, <4,2,5,3> + 3760868415U, // <5,4,2,6>: Cost 4 vext3 <0,4,1,5>, <4,2,6,3> + 3783871562U, // <5,4,2,7>: Cost 4 vext3 <4,2,7,5>, <4,2,7,5> + 2666752099U, // <5,4,2,u>: Cost 3 vext2 , <2,u,4,5> + 3639459942U, // <5,4,3,0>: Cost 4 vext1 <2,5,4,3>, LHS + 3709970701U, // <5,4,3,1>: Cost 4 vext2 <3,1,5,4>, <3,1,5,4> + 2636892510U, // <5,4,3,2>: Cost 3 vext2 <3,2,5,4>, <3,2,5,4> + 3710634396U, // <5,4,3,3>: Cost 4 vext2 <3,2,5,4>, <3,3,3,3> + 2638219776U, // <5,4,3,4>: Cost 3 vext2 <3,4,5,4>, <3,4,5,4> + 3766987908U, // <5,4,3,5>: Cost 4 vext3 <1,4,3,5>, <4,3,5,0> + 2710719634U, // <5,4,3,6>: Cost 3 vext3 <4,3,6,5>, <4,3,6,5> + 3914097664U, // <5,4,3,7>: Cost 4 vuzpr <3,5,7,4>, <1,3,5,7> + 2640874308U, // <5,4,3,u>: Cost 3 vext2 <3,u,5,4>, <3,u,5,4> + 2583642214U, // <5,4,4,0>: Cost 3 vext1 <5,5,4,4>, LHS + 2642201574U, // <5,4,4,1>: Cost 3 vext2 <4,1,5,4>, <4,1,5,4> + 3710635062U, // <5,4,4,2>: Cost 4 vext2 <3,2,5,4>, <4,2,5,3> + 3717270664U, // <5,4,4,3>: Cost 4 vext2 <4,3,5,4>, <4,3,5,4> + 2713963728U, // <5,4,4,4>: Cost 3 vext3 <4,u,5,5>, <4,4,4,4> + 1637567706U, // <5,4,4,5>: Cost 2 vext3 <4,4,5,5>, <4,4,5,5> + 2242276659U, // <5,4,4,6>: Cost 3 vrev <4,5,6,4> + 2646183372U, // <5,4,4,7>: Cost 3 vext2 <4,7,5,4>, <4,7,5,4> + 1637788917U, // <5,4,4,u>: Cost 2 vext3 <4,4,u,5>, <4,4,u,5> + 2559762534U, // <5,4,5,0>: Cost 3 vext1 <1,5,4,5>, LHS + 2559763607U, // <5,4,5,1>: Cost 3 vext1 <1,5,4,5>, <1,5,4,5> + 2698628366U, // <5,4,5,2>: Cost 3 vext3 <2,3,4,5>, <4,5,2,3> + 3633506454U, // <5,4,5,3>: Cost 4 vext1 <1,5,4,5>, <3,0,1,2> + 2559765814U, // <5,4,5,4>: Cost 3 vext1 <1,5,4,5>, RHS + 2583654395U, // <5,4,5,5>: Cost 3 vext1 <5,5,4,5>, <5,5,4,5> + 1613385014U, // <5,4,5,6>: Cost 2 vext3 <0,4,1,5>, RHS + 3901639990U, // <5,4,5,7>: Cost 4 vuzpr <1,5,0,4>, RHS + 1613385032U, // <5,4,5,u>: Cost 2 vext3 <0,4,1,5>, RHS + 2559770726U, // <5,4,6,0>: Cost 3 vext1 <1,5,4,6>, LHS + 2559771648U, // <5,4,6,1>: Cost 3 vext1 <1,5,4,6>, <1,3,5,7> + 3633514088U, // <5,4,6,2>: Cost 4 vext1 <1,5,4,6>, <2,2,2,2> + 2571717122U, // <5,4,6,3>: Cost 3 vext1 <3,5,4,6>, <3,4,5,6> + 2559774006U, // <5,4,6,4>: Cost 3 vext1 <1,5,4,6>, RHS + 2712636796U, // <5,4,6,5>: Cost 3 vext3 <4,6,5,5>, <4,6,5,5> + 3760868743U, // <5,4,6,6>: Cost 4 vext3 <0,4,1,5>, <4,6,6,7> + 2712784270U, // <5,4,6,7>: Cost 3 vext3 <4,6,7,5>, <4,6,7,5> + 2559776558U, // <5,4,6,u>: Cost 3 vext1 <1,5,4,6>, LHS + 2565750886U, // <5,4,7,0>: Cost 3 vext1 <2,5,4,7>, LHS + 2565751706U, // <5,4,7,1>: Cost 3 vext1 <2,5,4,7>, <1,2,3,4> + 2565752690U, // <5,4,7,2>: Cost 3 vext1 <2,5,4,7>, <2,5,4,7> + 2571725387U, // <5,4,7,3>: Cost 3 vext1 <3,5,4,7>, <3,5,4,7> + 2565754166U, // <5,4,7,4>: Cost 3 vext1 <2,5,4,7>, RHS + 3114713426U, // <5,4,7,5>: Cost 3 vtrnr RHS, <0,4,1,5> + 94817590U, // 
<5,4,7,6>: Cost 1 vrev RHS + 2595616175U, // <5,4,7,7>: Cost 3 vext1 <7,5,4,7>, <7,5,4,7> + 94965064U, // <5,4,7,u>: Cost 1 vrev RHS + 2559787110U, // <5,4,u,0>: Cost 3 vext1 <1,5,4,u>, LHS + 2559788186U, // <5,4,u,1>: Cost 3 vext1 <1,5,4,u>, <1,5,4,u> + 2242014483U, // <5,4,u,2>: Cost 3 vrev <4,5,2,u> + 2667419628U, // <5,4,u,3>: Cost 3 vext2 , + 2559790390U, // <5,4,u,4>: Cost 3 vext1 <1,5,4,u>, RHS + 1640222238U, // <5,4,u,5>: Cost 2 vext3 <4,u,5,5>, <4,u,5,5> + 94825783U, // <5,4,u,6>: Cost 1 vrev RHS + 2714111536U, // <5,4,u,7>: Cost 3 vext3 <4,u,7,5>, <4,u,7,5> + 94973257U, // <5,4,u,u>: Cost 1 vrev RHS + 2646851584U, // <5,5,0,0>: Cost 3 vext2 <4,u,5,5>, <0,0,0,0> + 1573109862U, // <5,5,0,1>: Cost 2 vext2 <4,u,5,5>, LHS + 2646851748U, // <5,5,0,2>: Cost 3 vext2 <4,u,5,5>, <0,2,0,2> + 3760279130U, // <5,5,0,3>: Cost 4 vext3 <0,3,2,5>, <5,0,3,2> + 2687127138U, // <5,5,0,4>: Cost 3 vext3 <0,4,1,5>, <5,0,4,1> + 2248142847U, // <5,5,0,5>: Cost 3 vrev <5,5,5,0> + 3720593910U, // <5,5,0,6>: Cost 4 vext2 <4,u,5,5>, <0,6,1,7> + 4182502710U, // <5,5,0,7>: Cost 4 vtrnr <3,5,7,0>, RHS + 1573110429U, // <5,5,0,u>: Cost 2 vext2 <4,u,5,5>, LHS + 2646852342U, // <5,5,1,0>: Cost 3 vext2 <4,u,5,5>, <1,0,3,2> + 2624291676U, // <5,5,1,1>: Cost 3 vext2 <1,1,5,5>, <1,1,5,5> + 2646852502U, // <5,5,1,2>: Cost 3 vext2 <4,u,5,5>, <1,2,3,0> + 2646852568U, // <5,5,1,3>: Cost 3 vext2 <4,u,5,5>, <1,3,1,3> + 2715217591U, // <5,5,1,4>: Cost 3 vext3 <5,1,4,5>, <5,1,4,5> + 2628936848U, // <5,5,1,5>: Cost 3 vext2 <1,u,5,5>, <1,5,3,7> + 3698033907U, // <5,5,1,6>: Cost 4 vext2 <1,1,5,5>, <1,6,5,7> + 2713964240U, // <5,5,1,7>: Cost 3 vext3 <4,u,5,5>, <5,1,7,3> + 2628937107U, // <5,5,1,u>: Cost 3 vext2 <1,u,5,5>, <1,u,5,5> + 3645497446U, // <5,5,2,0>: Cost 4 vext1 <3,5,5,2>, LHS + 3760869099U, // <5,5,2,1>: Cost 4 vext3 <0,4,1,5>, <5,2,1,3> + 2646853224U, // <5,5,2,2>: Cost 3 vext2 <4,u,5,5>, <2,2,2,2> + 2698628862U, // <5,5,2,3>: Cost 3 vext3 <2,3,4,5>, <5,2,3,4> + 3772370694U, // <5,5,2,4>: Cost 4 vext3 <2,3,4,5>, <5,2,4,3> + 2713964303U, // <5,5,2,5>: Cost 3 vext3 <4,u,5,5>, <5,2,5,3> + 2646853562U, // <5,5,2,6>: Cost 3 vext2 <4,u,5,5>, <2,6,3,7> + 4038198272U, // <5,5,2,7>: Cost 4 vzipr <1,u,5,2>, <1,3,5,7> + 2701946667U, // <5,5,2,u>: Cost 3 vext3 <2,u,4,5>, <5,2,u,4> + 2646853782U, // <5,5,3,0>: Cost 3 vext2 <4,u,5,5>, <3,0,1,2> + 3698034922U, // <5,5,3,1>: Cost 4 vext2 <1,1,5,5>, <3,1,1,5> + 3702679919U, // <5,5,3,2>: Cost 4 vext2 <1,u,5,5>, <3,2,7,3> + 2637564336U, // <5,5,3,3>: Cost 3 vext2 <3,3,5,5>, <3,3,5,5> + 2646854146U, // <5,5,3,4>: Cost 3 vext2 <4,u,5,5>, <3,4,5,6> + 2638891602U, // <5,5,3,5>: Cost 3 vext2 <3,5,5,5>, <3,5,5,5> + 3702680247U, // <5,5,3,6>: Cost 4 vext2 <1,u,5,5>, <3,6,7,7> + 3702680259U, // <5,5,3,7>: Cost 4 vext2 <1,u,5,5>, <3,7,0,1> + 2646854430U, // <5,5,3,u>: Cost 3 vext2 <4,u,5,5>, <3,u,1,2> + 2646854546U, // <5,5,4,0>: Cost 3 vext2 <4,u,5,5>, <4,0,5,1> + 2642209767U, // <5,5,4,1>: Cost 3 vext2 <4,1,5,5>, <4,1,5,5> + 3711306806U, // <5,5,4,2>: Cost 4 vext2 <3,3,5,5>, <4,2,5,3> + 3645516369U, // <5,5,4,3>: Cost 4 vext1 <3,5,5,4>, <3,5,5,4> + 1570458842U, // <5,5,4,4>: Cost 2 vext2 <4,4,5,5>, <4,4,5,5> + 1573113142U, // <5,5,4,5>: Cost 2 vext2 <4,u,5,5>, RHS + 2645527932U, // <5,5,4,6>: Cost 3 vext2 <4,6,5,5>, <4,6,5,5> + 2713964486U, // <5,5,4,7>: Cost 3 vext3 <4,u,5,5>, <5,4,7,6> + 1573113374U, // <5,5,4,u>: Cost 2 vext2 <4,u,5,5>, <4,u,5,5> + 1509982310U, // <5,5,5,0>: Cost 2 vext1 <5,5,5,5>, LHS + 2646855376U, // <5,5,5,1>: Cost 3 vext2 <4,u,5,5>, <5,1,7,3> + 2583725672U, // 
<5,5,5,2>: Cost 3 vext1 <5,5,5,5>, <2,2,2,2> + 2583726230U, // <5,5,5,3>: Cost 3 vext1 <5,5,5,5>, <3,0,1,2> + 1509985590U, // <5,5,5,4>: Cost 2 vext1 <5,5,5,5>, RHS + 229035318U, // <5,5,5,5>: Cost 1 vdup1 RHS + 2646855778U, // <5,5,5,6>: Cost 3 vext2 <4,u,5,5>, <5,6,7,0> + 2646855848U, // <5,5,5,7>: Cost 3 vext2 <4,u,5,5>, <5,7,5,7> + 229035318U, // <5,5,5,u>: Cost 1 vdup1 RHS + 2577760358U, // <5,5,6,0>: Cost 3 vext1 <4,5,5,6>, LHS + 3633587361U, // <5,5,6,1>: Cost 4 vext1 <1,5,5,6>, <1,5,5,6> + 2646856186U, // <5,5,6,2>: Cost 3 vext2 <4,u,5,5>, <6,2,7,3> + 3633588738U, // <5,5,6,3>: Cost 4 vext1 <1,5,5,6>, <3,4,5,6> + 2718535756U, // <5,5,6,4>: Cost 3 vext3 <5,6,4,5>, <5,6,4,5> + 2644202223U, // <5,5,6,5>: Cost 3 vext2 <4,4,5,5>, <6,5,7,5> + 2973780482U, // <5,5,6,6>: Cost 3 vzipr <3,4,5,6>, <3,4,5,6> + 2646856526U, // <5,5,6,7>: Cost 3 vext2 <4,u,5,5>, <6,7,0,1> + 2646856607U, // <5,5,6,u>: Cost 3 vext2 <4,u,5,5>, <6,u,0,1> + 2571796582U, // <5,5,7,0>: Cost 3 vext1 <3,5,5,7>, LHS + 3633595392U, // <5,5,7,1>: Cost 4 vext1 <1,5,5,7>, <1,3,5,7> + 2571798222U, // <5,5,7,2>: Cost 3 vext1 <3,5,5,7>, <2,3,4,5> + 2571799124U, // <5,5,7,3>: Cost 3 vext1 <3,5,5,7>, <3,5,5,7> + 2571799862U, // <5,5,7,4>: Cost 3 vext1 <3,5,5,7>, RHS + 3114717188U, // <5,5,7,5>: Cost 3 vtrnr RHS, <5,5,5,5> + 4034923010U, // <5,5,7,6>: Cost 4 vzipr <1,3,5,7>, <3,4,5,6> + 2040974646U, // <5,5,7,7>: Cost 2 vtrnr RHS, RHS + 2040974647U, // <5,5,7,u>: Cost 2 vtrnr RHS, RHS + 1509982310U, // <5,5,u,0>: Cost 2 vext1 <5,5,5,5>, LHS + 1573115694U, // <5,5,u,1>: Cost 2 vext2 <4,u,5,5>, LHS + 2571806414U, // <5,5,u,2>: Cost 3 vext1 <3,5,5,u>, <2,3,4,5> + 2571807317U, // <5,5,u,3>: Cost 3 vext1 <3,5,5,u>, <3,5,5,u> + 1509985590U, // <5,5,u,4>: Cost 2 vext1 <5,5,5,5>, RHS + 229035318U, // <5,5,u,5>: Cost 1 vdup1 RHS + 2646857936U, // <5,5,u,6>: Cost 3 vext2 <4,u,5,5>, + 2040982838U, // <5,5,u,7>: Cost 2 vtrnr RHS, RHS + 229035318U, // <5,5,u,u>: Cost 1 vdup1 RHS + 2638233600U, // <5,6,0,0>: Cost 3 vext2 <3,4,5,6>, <0,0,0,0> + 1564491878U, // <5,6,0,1>: Cost 2 vext2 <3,4,5,6>, LHS + 2632261796U, // <5,6,0,2>: Cost 3 vext2 <2,4,5,6>, <0,2,0,2> + 2638233856U, // <5,6,0,3>: Cost 3 vext2 <3,4,5,6>, <0,3,1,4> + 2638233938U, // <5,6,0,4>: Cost 3 vext2 <3,4,5,6>, <0,4,1,5> + 3706003885U, // <5,6,0,5>: Cost 4 vext2 <2,4,5,6>, <0,5,2,6> + 3706003967U, // <5,6,0,6>: Cost 4 vext2 <2,4,5,6>, <0,6,2,7> + 4047473974U, // <5,6,0,7>: Cost 4 vzipr <3,4,5,0>, RHS + 1564492445U, // <5,6,0,u>: Cost 2 vext2 <3,4,5,6>, LHS + 2638234358U, // <5,6,1,0>: Cost 3 vext2 <3,4,5,6>, <1,0,3,2> + 2638234420U, // <5,6,1,1>: Cost 3 vext2 <3,4,5,6>, <1,1,1,1> + 2638234518U, // <5,6,1,2>: Cost 3 vext2 <3,4,5,6>, <1,2,3,0> + 2638234584U, // <5,6,1,3>: Cost 3 vext2 <3,4,5,6>, <1,3,1,3> + 2626290768U, // <5,6,1,4>: Cost 3 vext2 <1,4,5,6>, <1,4,5,6> + 2638234768U, // <5,6,1,5>: Cost 3 vext2 <3,4,5,6>, <1,5,3,7> + 3700032719U, // <5,6,1,6>: Cost 4 vext2 <1,4,5,6>, <1,6,1,7> + 2982366518U, // <5,6,1,7>: Cost 3 vzipr <4,u,5,1>, RHS + 2628945300U, // <5,6,1,u>: Cost 3 vext2 <1,u,5,6>, <1,u,5,6> + 3706004925U, // <5,6,2,0>: Cost 4 vext2 <2,4,5,6>, <2,0,1,2> + 3711976966U, // <5,6,2,1>: Cost 4 vext2 <3,4,5,6>, <2,1,0,3> + 2638235240U, // <5,6,2,2>: Cost 3 vext2 <3,4,5,6>, <2,2,2,2> + 2638235302U, // <5,6,2,3>: Cost 3 vext2 <3,4,5,6>, <2,3,0,1> + 2632263465U, // <5,6,2,4>: Cost 3 vext2 <2,4,5,6>, <2,4,5,6> + 2638235496U, // <5,6,2,5>: Cost 3 vext2 <3,4,5,6>, <2,5,3,6> + 2638235578U, // <5,6,2,6>: Cost 3 vext2 <3,4,5,6>, <2,6,3,7> + 2713965050U, // <5,6,2,7>: Cost 3 vext3 
<4,u,5,5>, <6,2,7,3> + 2634917997U, // <5,6,2,u>: Cost 3 vext2 <2,u,5,6>, <2,u,5,6> + 2638235798U, // <5,6,3,0>: Cost 3 vext2 <3,4,5,6>, <3,0,1,2> + 3711977695U, // <5,6,3,1>: Cost 4 vext2 <3,4,5,6>, <3,1,0,3> + 3710650720U, // <5,6,3,2>: Cost 4 vext2 <3,2,5,6>, <3,2,5,6> + 2638236060U, // <5,6,3,3>: Cost 3 vext2 <3,4,5,6>, <3,3,3,3> + 1564494338U, // <5,6,3,4>: Cost 2 vext2 <3,4,5,6>, <3,4,5,6> + 2638236234U, // <5,6,3,5>: Cost 3 vext2 <3,4,5,6>, <3,5,4,6> + 3711978104U, // <5,6,3,6>: Cost 4 vext2 <3,4,5,6>, <3,6,0,7> + 4034227510U, // <5,6,3,7>: Cost 4 vzipr <1,2,5,3>, RHS + 1567148870U, // <5,6,3,u>: Cost 2 vext2 <3,u,5,6>, <3,u,5,6> + 2577817702U, // <5,6,4,0>: Cost 3 vext1 <4,5,6,4>, LHS + 3700034544U, // <5,6,4,1>: Cost 4 vext2 <1,4,5,6>, <4,1,6,5> + 2723033713U, // <5,6,4,2>: Cost 3 vext3 <6,4,2,5>, <6,4,2,5> + 2638236818U, // <5,6,4,3>: Cost 3 vext2 <3,4,5,6>, <4,3,6,5> + 2644208859U, // <5,6,4,4>: Cost 3 vext2 <4,4,5,6>, <4,4,5,6> + 1564495158U, // <5,6,4,5>: Cost 2 vext2 <3,4,5,6>, RHS + 2645536125U, // <5,6,4,6>: Cost 3 vext2 <4,6,5,6>, <4,6,5,6> + 2723402398U, // <5,6,4,7>: Cost 3 vext3 <6,4,7,5>, <6,4,7,5> + 1564495401U, // <5,6,4,u>: Cost 2 vext2 <3,4,5,6>, RHS + 2577825894U, // <5,6,5,0>: Cost 3 vext1 <4,5,6,5>, LHS + 2662125264U, // <5,6,5,1>: Cost 3 vext2 <7,4,5,6>, <5,1,7,3> + 3775836867U, // <5,6,5,2>: Cost 4 vext3 <2,u,6,5>, <6,5,2,6> + 3711979343U, // <5,6,5,3>: Cost 4 vext2 <3,4,5,6>, <5,3,3,4> + 2650181556U, // <5,6,5,4>: Cost 3 vext2 <5,4,5,6>, <5,4,5,6> + 2662125572U, // <5,6,5,5>: Cost 3 vext2 <7,4,5,6>, <5,5,5,5> + 2638237732U, // <5,6,5,6>: Cost 3 vext2 <3,4,5,6>, <5,6,0,1> + 2982399286U, // <5,6,5,7>: Cost 3 vzipr <4,u,5,5>, RHS + 2982399287U, // <5,6,5,u>: Cost 3 vzipr <4,u,5,5>, RHS + 2583806054U, // <5,6,6,0>: Cost 3 vext1 <5,5,6,6>, LHS + 3711979910U, // <5,6,6,1>: Cost 4 vext2 <3,4,5,6>, <6,1,3,4> + 2662126074U, // <5,6,6,2>: Cost 3 vext2 <7,4,5,6>, <6,2,7,3> + 2583808514U, // <5,6,6,3>: Cost 3 vext1 <5,5,6,6>, <3,4,5,6> + 2583809334U, // <5,6,6,4>: Cost 3 vext1 <5,5,6,6>, RHS + 2583810062U, // <5,6,6,5>: Cost 3 vext1 <5,5,6,6>, <5,5,6,6> + 2638238520U, // <5,6,6,6>: Cost 3 vext2 <3,4,5,6>, <6,6,6,6> + 2973781302U, // <5,6,6,7>: Cost 3 vzipr <3,4,5,6>, RHS + 2973781303U, // <5,6,6,u>: Cost 3 vzipr <3,4,5,6>, RHS + 430358630U, // <5,6,7,0>: Cost 1 vext1 RHS, LHS + 1504101110U, // <5,6,7,1>: Cost 2 vext1 RHS, <1,0,3,2> + 1504101992U, // <5,6,7,2>: Cost 2 vext1 RHS, <2,2,2,2> + 1504102550U, // <5,6,7,3>: Cost 2 vext1 RHS, <3,0,1,2> + 430361910U, // <5,6,7,4>: Cost 1 vext1 RHS, RHS + 1504104390U, // <5,6,7,5>: Cost 2 vext1 RHS, <5,4,7,6> + 1504105272U, // <5,6,7,6>: Cost 2 vext1 RHS, <6,6,6,6> + 1504106092U, // <5,6,7,7>: Cost 2 vext1 RHS, <7,7,7,7> + 430364462U, // <5,6,7,u>: Cost 1 vext1 RHS, LHS + 430366822U, // <5,6,u,0>: Cost 1 vext1 RHS, LHS + 1564497710U, // <5,6,u,1>: Cost 2 vext2 <3,4,5,6>, LHS + 1504110184U, // <5,6,u,2>: Cost 2 vext1 RHS, <2,2,2,2> + 1504110742U, // <5,6,u,3>: Cost 2 vext1 RHS, <3,0,1,2> + 430370103U, // <5,6,u,4>: Cost 1 vext1 RHS, RHS + 1564498074U, // <5,6,u,5>: Cost 2 vext2 <3,4,5,6>, RHS + 1504113146U, // <5,6,u,6>: Cost 2 vext1 RHS, <6,2,7,3> + 1504113658U, // <5,6,u,7>: Cost 2 vext1 RHS, <7,0,1,2> + 430372654U, // <5,6,u,u>: Cost 1 vext1 RHS, LHS + 2625634304U, // <5,7,0,0>: Cost 3 vext2 <1,3,5,7>, <0,0,0,0> + 1551892582U, // <5,7,0,1>: Cost 2 vext2 <1,3,5,7>, LHS + 2625634468U, // <5,7,0,2>: Cost 3 vext2 <1,3,5,7>, <0,2,0,2> + 2571889247U, // <5,7,0,3>: Cost 3 vext1 <3,5,7,0>, <3,5,7,0> + 2625634642U, // <5,7,0,4>: Cost 3 
vext2 <1,3,5,7>, <0,4,1,5> + 2595778728U, // <5,7,0,5>: Cost 3 vext1 <7,5,7,0>, <5,7,5,7> + 3699376639U, // <5,7,0,6>: Cost 4 vext2 <1,3,5,7>, <0,6,2,7> + 2260235715U, // <5,7,0,7>: Cost 3 vrev <7,5,7,0> + 1551893149U, // <5,7,0,u>: Cost 2 vext2 <1,3,5,7>, LHS + 2625635062U, // <5,7,1,0>: Cost 3 vext2 <1,3,5,7>, <1,0,3,2> + 2624308020U, // <5,7,1,1>: Cost 3 vext2 <1,1,5,7>, <1,1,1,1> + 2625635222U, // <5,7,1,2>: Cost 3 vext2 <1,3,5,7>, <1,2,3,0> + 1551893504U, // <5,7,1,3>: Cost 2 vext2 <1,3,5,7>, <1,3,5,7> + 2571898166U, // <5,7,1,4>: Cost 3 vext1 <3,5,7,1>, RHS + 2625635472U, // <5,7,1,5>: Cost 3 vext2 <1,3,5,7>, <1,5,3,7> + 2627626227U, // <5,7,1,6>: Cost 3 vext2 <1,6,5,7>, <1,6,5,7> + 3702031684U, // <5,7,1,7>: Cost 4 vext2 <1,7,5,7>, <1,7,5,7> + 1555211669U, // <5,7,1,u>: Cost 2 vext2 <1,u,5,7>, <1,u,5,7> + 2629617126U, // <5,7,2,0>: Cost 3 vext2 <2,0,5,7>, <2,0,5,7> + 3699377670U, // <5,7,2,1>: Cost 4 vext2 <1,3,5,7>, <2,1,0,3> + 2625635944U, // <5,7,2,2>: Cost 3 vext2 <1,3,5,7>, <2,2,2,2> + 2625636006U, // <5,7,2,3>: Cost 3 vext2 <1,3,5,7>, <2,3,0,1> + 2632271658U, // <5,7,2,4>: Cost 3 vext2 <2,4,5,7>, <2,4,5,7> + 2625636201U, // <5,7,2,5>: Cost 3 vext2 <1,3,5,7>, <2,5,3,7> + 2625636282U, // <5,7,2,6>: Cost 3 vext2 <1,3,5,7>, <2,6,3,7> + 3708004381U, // <5,7,2,7>: Cost 4 vext2 <2,7,5,7>, <2,7,5,7> + 2625636411U, // <5,7,2,u>: Cost 3 vext2 <1,3,5,7>, <2,u,0,1> + 2625636502U, // <5,7,3,0>: Cost 3 vext2 <1,3,5,7>, <3,0,1,2> + 2625636604U, // <5,7,3,1>: Cost 3 vext2 <1,3,5,7>, <3,1,3,5> + 3699378478U, // <5,7,3,2>: Cost 4 vext2 <1,3,5,7>, <3,2,0,1> + 2625636764U, // <5,7,3,3>: Cost 3 vext2 <1,3,5,7>, <3,3,3,3> + 2625636866U, // <5,7,3,4>: Cost 3 vext2 <1,3,5,7>, <3,4,5,6> + 2625636959U, // <5,7,3,5>: Cost 3 vext2 <1,3,5,7>, <3,5,7,0> + 3699378808U, // <5,7,3,6>: Cost 4 vext2 <1,3,5,7>, <3,6,0,7> + 2640235254U, // <5,7,3,7>: Cost 3 vext2 <3,7,5,7>, <3,7,5,7> + 2625637150U, // <5,7,3,u>: Cost 3 vext2 <1,3,5,7>, <3,u,1,2> + 2571919462U, // <5,7,4,0>: Cost 3 vext1 <3,5,7,4>, LHS + 2571920384U, // <5,7,4,1>: Cost 3 vext1 <3,5,7,4>, <1,3,5,7> + 3699379260U, // <5,7,4,2>: Cost 4 vext2 <1,3,5,7>, <4,2,6,0> + 2571922019U, // <5,7,4,3>: Cost 3 vext1 <3,5,7,4>, <3,5,7,4> + 2571922742U, // <5,7,4,4>: Cost 3 vext1 <3,5,7,4>, RHS + 1551895862U, // <5,7,4,5>: Cost 2 vext2 <1,3,5,7>, RHS + 2846277980U, // <5,7,4,6>: Cost 3 vuzpr RHS, <0,4,2,6> + 2646207951U, // <5,7,4,7>: Cost 3 vext2 <4,7,5,7>, <4,7,5,7> + 1551896105U, // <5,7,4,u>: Cost 2 vext2 <1,3,5,7>, RHS + 2583871590U, // <5,7,5,0>: Cost 3 vext1 <5,5,7,5>, LHS + 2652180176U, // <5,7,5,1>: Cost 3 vext2 <5,7,5,7>, <5,1,7,3> + 2625638177U, // <5,7,5,2>: Cost 3 vext2 <1,3,5,7>, <5,2,7,3> + 2625638262U, // <5,7,5,3>: Cost 3 vext2 <1,3,5,7>, <5,3,7,7> + 2583874870U, // <5,7,5,4>: Cost 3 vext1 <5,5,7,5>, RHS + 2846281732U, // <5,7,5,5>: Cost 3 vuzpr RHS, <5,5,5,5> + 2651517015U, // <5,7,5,6>: Cost 3 vext2 <5,6,5,7>, <5,6,5,7> + 1772539190U, // <5,7,5,7>: Cost 2 vuzpr RHS, RHS + 1772539191U, // <5,7,5,u>: Cost 2 vuzpr RHS, RHS + 2846281826U, // <5,7,6,0>: Cost 3 vuzpr RHS, <5,6,7,0> + 3699380615U, // <5,7,6,1>: Cost 4 vext2 <1,3,5,7>, <6,1,3,5> + 2846281108U, // <5,7,6,2>: Cost 3 vuzpr RHS, <4,6,u,2> + 2589854210U, // <5,7,6,3>: Cost 3 vext1 <6,5,7,6>, <3,4,5,6> + 2846281830U, // <5,7,6,4>: Cost 3 vuzpr RHS, <5,6,7,4> + 2725467658U, // <5,7,6,5>: Cost 3 vext3 <6,7,u,5>, <7,6,5,u> + 2846281076U, // <5,7,6,6>: Cost 3 vuzpr RHS, <4,6,4,6> + 2846279610U, // <5,7,6,7>: Cost 3 vuzpr RHS, <2,6,3,7> + 2846279611U, // <5,7,6,u>: Cost 3 vuzpr RHS, <2,6,3,u> + 
1510146150U, // <5,7,7,0>: Cost 2 vext1 <5,5,7,7>, LHS + 2846282574U, // <5,7,7,1>: Cost 3 vuzpr RHS, <6,7,0,1> + 2583889512U, // <5,7,7,2>: Cost 3 vext1 <5,5,7,7>, <2,2,2,2> + 2846281919U, // <5,7,7,3>: Cost 3 vuzpr RHS, <5,7,u,3> + 1510149430U, // <5,7,7,4>: Cost 2 vext1 <5,5,7,7>, RHS + 1510150168U, // <5,7,7,5>: Cost 2 vext1 <5,5,7,7>, <5,5,7,7> + 2583892474U, // <5,7,7,6>: Cost 3 vext1 <5,5,7,7>, <6,2,7,3> + 2625640044U, // <5,7,7,7>: Cost 3 vext2 <1,3,5,7>, <7,7,7,7> + 1510151982U, // <5,7,7,u>: Cost 2 vext1 <5,5,7,7>, LHS + 1510154342U, // <5,7,u,0>: Cost 2 vext1 <5,5,7,u>, LHS + 1551898414U, // <5,7,u,1>: Cost 2 vext2 <1,3,5,7>, LHS + 2625640325U, // <5,7,u,2>: Cost 3 vext2 <1,3,5,7>, + 1772536477U, // <5,7,u,3>: Cost 2 vuzpr RHS, LHS + 1510157622U, // <5,7,u,4>: Cost 2 vext1 <5,5,7,u>, RHS + 1551898778U, // <5,7,u,5>: Cost 2 vext2 <1,3,5,7>, RHS + 2625640656U, // <5,7,u,6>: Cost 3 vext2 <1,3,5,7>, + 1772539433U, // <5,7,u,7>: Cost 2 vuzpr RHS, RHS + 1551898981U, // <5,7,u,u>: Cost 2 vext2 <1,3,5,7>, LHS + 2625642496U, // <5,u,0,0>: Cost 3 vext2 <1,3,5,u>, <0,0,0,0> + 1551900774U, // <5,u,0,1>: Cost 2 vext2 <1,3,5,u>, LHS + 2625642660U, // <5,u,0,2>: Cost 3 vext2 <1,3,5,u>, <0,2,0,2> + 2698630885U, // <5,u,0,3>: Cost 3 vext3 <2,3,4,5>, + 2687129325U, // <5,u,0,4>: Cost 3 vext3 <0,4,1,5>, + 2689783542U, // <5,u,0,5>: Cost 3 vext3 <0,u,1,5>, + 2266134675U, // <5,u,0,6>: Cost 3 vrev + 2595853772U, // <5,u,0,7>: Cost 3 vext1 <7,5,u,0>, <7,5,u,0> + 1551901341U, // <5,u,0,u>: Cost 2 vext2 <1,3,5,u>, LHS + 2625643254U, // <5,u,1,0>: Cost 3 vext2 <1,3,5,u>, <1,0,3,2> + 2625643316U, // <5,u,1,1>: Cost 3 vext2 <1,3,5,u>, <1,1,1,1> + 1613387566U, // <5,u,1,2>: Cost 2 vext3 <0,4,1,5>, LHS + 1551901697U, // <5,u,1,3>: Cost 2 vext2 <1,3,5,u>, <1,3,5,u> + 2626307154U, // <5,u,1,4>: Cost 3 vext2 <1,4,5,u>, <1,4,5,u> + 2689783622U, // <5,u,1,5>: Cost 3 vext3 <0,u,1,5>, + 2627634420U, // <5,u,1,6>: Cost 3 vext2 <1,6,5,u>, <1,6,5,u> + 2982366536U, // <5,u,1,7>: Cost 3 vzipr <4,u,5,1>, RHS + 1613387620U, // <5,u,1,u>: Cost 2 vext3 <0,4,1,5>, LHS + 2846286742U, // <5,u,2,0>: Cost 3 vuzpr RHS, <1,2,3,0> + 2685796528U, // <5,u,2,1>: Cost 3 vext3 <0,2,1,5>, <0,2,1,5> + 2625644136U, // <5,u,2,2>: Cost 3 vext2 <1,3,5,u>, <2,2,2,2> + 2687129480U, // <5,u,2,3>: Cost 3 vext3 <0,4,1,5>, + 2632279851U, // <5,u,2,4>: Cost 3 vext2 <2,4,5,u>, <2,4,5,u> + 2625644394U, // <5,u,2,5>: Cost 3 vext2 <1,3,5,u>, <2,5,3,u> + 2625644474U, // <5,u,2,6>: Cost 3 vext2 <1,3,5,u>, <2,6,3,7> + 2713966508U, // <5,u,2,7>: Cost 3 vext3 <4,u,5,5>, + 2625644603U, // <5,u,2,u>: Cost 3 vext2 <1,3,5,u>, <2,u,0,1> + 2687129532U, // <5,u,3,0>: Cost 3 vext3 <0,4,1,5>, + 2636261649U, // <5,u,3,1>: Cost 3 vext2 <3,1,5,u>, <3,1,5,u> + 2636925282U, // <5,u,3,2>: Cost 3 vext2 <3,2,5,u>, <3,2,5,u> + 2625644956U, // <5,u,3,3>: Cost 3 vext2 <1,3,5,u>, <3,3,3,3> + 1564510724U, // <5,u,3,4>: Cost 2 vext2 <3,4,5,u>, <3,4,5,u> + 2625645160U, // <5,u,3,5>: Cost 3 vext2 <1,3,5,u>, <3,5,u,0> + 2734610422U, // <5,u,3,6>: Cost 3 vext3 , + 2640243447U, // <5,u,3,7>: Cost 3 vext2 <3,7,5,u>, <3,7,5,u> + 1567165256U, // <5,u,3,u>: Cost 2 vext2 <3,u,5,u>, <3,u,5,u> + 1567828889U, // <5,u,4,0>: Cost 2 vext2 <4,0,5,u>, <4,0,5,u> + 1661163546U, // <5,u,4,1>: Cost 2 vext3 , + 2734463012U, // <5,u,4,2>: Cost 3 vext3 , + 2698631212U, // <5,u,4,3>: Cost 3 vext3 <2,3,4,5>, + 1570458842U, // <5,u,4,4>: Cost 2 vext2 <4,4,5,5>, <4,4,5,5> + 1551904054U, // <5,u,4,5>: Cost 2 vext2 <1,3,5,u>, RHS + 2846286172U, // <5,u,4,6>: Cost 3 vuzpr RHS, <0,4,2,6> + 2646216144U, // 
<5,u,4,7>: Cost 3 vext2 <4,7,5,u>, <4,7,5,u> + 1551904297U, // <5,u,4,u>: Cost 2 vext2 <1,3,5,u>, RHS + 1509982310U, // <5,u,5,0>: Cost 2 vext1 <5,5,5,5>, LHS + 2560058555U, // <5,u,5,1>: Cost 3 vext1 <1,5,u,5>, <1,5,u,5> + 2698926194U, // <5,u,5,2>: Cost 3 vext3 <2,3,u,5>, + 2698631295U, // <5,u,5,3>: Cost 3 vext3 <2,3,4,5>, + 1509985590U, // <5,u,5,4>: Cost 2 vext1 <5,5,5,5>, RHS + 229035318U, // <5,u,5,5>: Cost 1 vdup1 RHS + 1613387930U, // <5,u,5,6>: Cost 2 vext3 <0,4,1,5>, RHS + 1772547382U, // <5,u,5,7>: Cost 2 vuzpr RHS, RHS + 229035318U, // <5,u,5,u>: Cost 1 vdup1 RHS + 2566037606U, // <5,u,6,0>: Cost 3 vext1 <2,5,u,6>, LHS + 2920044334U, // <5,u,6,1>: Cost 3 vzipl <5,6,7,0>, LHS + 2566039445U, // <5,u,6,2>: Cost 3 vext1 <2,5,u,6>, <2,5,u,6> + 2687129808U, // <5,u,6,3>: Cost 3 vext3 <0,4,1,5>, + 2566040886U, // <5,u,6,4>: Cost 3 vext1 <2,5,u,6>, RHS + 2920044698U, // <5,u,6,5>: Cost 3 vzipl <5,6,7,0>, RHS + 2846289268U, // <5,u,6,6>: Cost 3 vuzpr RHS, <4,6,4,6> + 2973781320U, // <5,u,6,7>: Cost 3 vzipr <3,4,5,6>, RHS + 2687129853U, // <5,u,6,u>: Cost 3 vext3 <0,4,1,5>, + 430506086U, // <5,u,7,0>: Cost 1 vext1 RHS, LHS + 1486333117U, // <5,u,7,1>: Cost 2 vext1 <1,5,u,7>, <1,5,u,7> + 1504249448U, // <5,u,7,2>: Cost 2 vext1 RHS, <2,2,2,2> + 2040971933U, // <5,u,7,3>: Cost 2 vtrnr RHS, LHS + 430509384U, // <5,u,7,4>: Cost 1 vext1 RHS, RHS + 1504251600U, // <5,u,7,5>: Cost 2 vext1 RHS, <5,1,7,3> + 118708378U, // <5,u,7,6>: Cost 1 vrev RHS + 2040974889U, // <5,u,7,7>: Cost 2 vtrnr RHS, RHS + 430511918U, // <5,u,7,u>: Cost 1 vext1 RHS, LHS + 430514278U, // <5,u,u,0>: Cost 1 vext1 RHS, LHS + 1551906606U, // <5,u,u,1>: Cost 2 vext2 <1,3,5,u>, LHS + 1613388133U, // <5,u,u,2>: Cost 2 vext3 <0,4,1,5>, LHS + 1772544669U, // <5,u,u,3>: Cost 2 vuzpr RHS, LHS + 430517577U, // <5,u,u,4>: Cost 1 vext1 RHS, RHS + 229035318U, // <5,u,u,5>: Cost 1 vdup1 RHS + 118716571U, // <5,u,u,6>: Cost 1 vrev RHS + 1772547625U, // <5,u,u,7>: Cost 2 vuzpr RHS, RHS + 430520110U, // <5,u,u,u>: Cost 1 vext1 RHS, LHS + 2686025728U, // <6,0,0,0>: Cost 3 vext3 <0,2,4,6>, <0,0,0,0> + 2686025738U, // <6,0,0,1>: Cost 3 vext3 <0,2,4,6>, <0,0,1,1> + 2686025748U, // <6,0,0,2>: Cost 3 vext3 <0,2,4,6>, <0,0,2,2> + 3779084320U, // <6,0,0,3>: Cost 4 vext3 <3,4,5,6>, <0,0,3,5> + 2642903388U, // <6,0,0,4>: Cost 3 vext2 <4,2,6,0>, <0,4,2,6> + 3657723939U, // <6,0,0,5>: Cost 4 vext1 <5,6,0,0>, <5,6,0,0> + 3926676514U, // <6,0,0,6>: Cost 4 vuzpr <5,6,7,0>, <7,0,5,6> + 3926675786U, // <6,0,0,7>: Cost 4 vuzpr <5,6,7,0>, <6,0,5,7> + 2686025802U, // <6,0,0,u>: Cost 3 vext3 <0,2,4,6>, <0,0,u,2> + 2566070374U, // <6,0,1,0>: Cost 3 vext1 <2,6,0,1>, LHS + 3759767642U, // <6,0,1,1>: Cost 4 vext3 <0,2,4,6>, <0,1,1,0> + 1612284006U, // <6,0,1,2>: Cost 2 vext3 <0,2,4,6>, LHS + 2583988738U, // <6,0,1,3>: Cost 3 vext1 <5,6,0,1>, <3,4,5,6> + 2566073654U, // <6,0,1,4>: Cost 3 vext1 <2,6,0,1>, RHS + 2583990308U, // <6,0,1,5>: Cost 3 vext1 <5,6,0,1>, <5,6,0,1> + 2589963005U, // <6,0,1,6>: Cost 3 vext1 <6,6,0,1>, <6,6,0,1> + 2595935702U, // <6,0,1,7>: Cost 3 vext1 <7,6,0,1>, <7,6,0,1> + 1612284060U, // <6,0,1,u>: Cost 2 vext3 <0,2,4,6>, LHS + 2686025892U, // <6,0,2,0>: Cost 3 vext3 <0,2,4,6>, <0,2,0,2> + 2685804721U, // <6,0,2,1>: Cost 3 vext3 <0,2,1,6>, <0,2,1,6> + 3759620282U, // <6,0,2,2>: Cost 4 vext3 <0,2,2,6>, <0,2,2,6> + 2705342658U, // <6,0,2,3>: Cost 3 vext3 <3,4,5,6>, <0,2,3,5> + 1612284108U, // <6,0,2,4>: Cost 2 vext3 <0,2,4,6>, <0,2,4,6> + 3706029956U, // <6,0,2,5>: Cost 4 vext2 <2,4,6,0>, <2,5,6,7> + 2686173406U, // <6,0,2,6>: Cost 3 vext3 
<0,2,6,6>, <0,2,6,6> + 3651769338U, // <6,0,2,7>: Cost 4 vext1 <4,6,0,2>, <7,0,1,2> + 1612579056U, // <6,0,2,u>: Cost 2 vext3 <0,2,u,6>, <0,2,u,6> + 3706030230U, // <6,0,3,0>: Cost 4 vext2 <2,4,6,0>, <3,0,1,2> + 2705342720U, // <6,0,3,1>: Cost 3 vext3 <3,4,5,6>, <0,3,1,4> + 2705342730U, // <6,0,3,2>: Cost 3 vext3 <3,4,5,6>, <0,3,2,5> + 3706030492U, // <6,0,3,3>: Cost 4 vext2 <2,4,6,0>, <3,3,3,3> + 2644896258U, // <6,0,3,4>: Cost 3 vext2 <4,5,6,0>, <3,4,5,6> + 3718638154U, // <6,0,3,5>: Cost 4 vext2 <4,5,6,0>, <3,5,4,6> + 3729918619U, // <6,0,3,6>: Cost 4 vext2 <6,4,6,0>, <3,6,4,6> + 3926672384U, // <6,0,3,7>: Cost 4 vuzpr <5,6,7,0>, <1,3,5,7> + 2705342784U, // <6,0,3,u>: Cost 3 vext3 <3,4,5,6>, <0,3,u,5> + 2687058250U, // <6,0,4,0>: Cost 3 vext3 <0,4,0,6>, <0,4,0,6> + 2686026066U, // <6,0,4,1>: Cost 3 vext3 <0,2,4,6>, <0,4,1,5> + 1613463900U, // <6,0,4,2>: Cost 2 vext3 <0,4,2,6>, <0,4,2,6> + 3761021285U, // <6,0,4,3>: Cost 4 vext3 <0,4,3,6>, <0,4,3,6> + 2687353198U, // <6,0,4,4>: Cost 3 vext3 <0,4,4,6>, <0,4,4,6> + 2632289590U, // <6,0,4,5>: Cost 3 vext2 <2,4,6,0>, RHS + 2645560704U, // <6,0,4,6>: Cost 3 vext2 <4,6,6,0>, <4,6,6,0> + 2646224337U, // <6,0,4,7>: Cost 3 vext2 <4,7,6,0>, <4,7,6,0> + 1613906322U, // <6,0,4,u>: Cost 2 vext3 <0,4,u,6>, <0,4,u,6> + 3651788902U, // <6,0,5,0>: Cost 4 vext1 <4,6,0,5>, LHS + 2687795620U, // <6,0,5,1>: Cost 3 vext3 <0,5,1,6>, <0,5,1,6> + 3761611181U, // <6,0,5,2>: Cost 4 vext3 <0,5,2,6>, <0,5,2,6> + 3723284326U, // <6,0,5,3>: Cost 4 vext2 <5,3,6,0>, <5,3,6,0> + 2646224838U, // <6,0,5,4>: Cost 3 vext2 <4,7,6,0>, <5,4,7,6> + 3718639630U, // <6,0,5,5>: Cost 4 vext2 <4,5,6,0>, <5,5,6,6> + 2652196962U, // <6,0,5,6>: Cost 3 vext2 <5,7,6,0>, <5,6,7,0> + 2852932918U, // <6,0,5,7>: Cost 3 vuzpr <5,6,7,0>, RHS + 2852932919U, // <6,0,5,u>: Cost 3 vuzpr <5,6,7,0>, RHS + 2852933730U, // <6,0,6,0>: Cost 3 vuzpr <5,6,7,0>, <5,6,7,0> + 2925985894U, // <6,0,6,1>: Cost 3 vzipl <6,6,6,6>, LHS + 3060203622U, // <6,0,6,2>: Cost 3 vtrnl <6,6,6,6>, LHS + 3718640178U, // <6,0,6,3>: Cost 4 vext2 <4,5,6,0>, <6,3,4,5> + 2656178832U, // <6,0,6,4>: Cost 3 vext2 <6,4,6,0>, <6,4,6,0> + 3725939378U, // <6,0,6,5>: Cost 4 vext2 <5,7,6,0>, <6,5,0,7> + 2657506098U, // <6,0,6,6>: Cost 3 vext2 <6,6,6,0>, <6,6,6,0> + 2619020110U, // <6,0,6,7>: Cost 3 vext2 <0,2,6,0>, <6,7,0,1> + 2925986461U, // <6,0,6,u>: Cost 3 vzipl <6,6,6,6>, LHS + 2572091494U, // <6,0,7,0>: Cost 3 vext1 <3,6,0,7>, LHS + 2572092310U, // <6,0,7,1>: Cost 3 vext1 <3,6,0,7>, <1,2,3,0> + 2980495524U, // <6,0,7,2>: Cost 3 vzipr RHS, <0,2,0,2> + 2572094072U, // <6,0,7,3>: Cost 3 vext1 <3,6,0,7>, <3,6,0,7> + 2572094774U, // <6,0,7,4>: Cost 3 vext1 <3,6,0,7>, RHS + 4054238242U, // <6,0,7,5>: Cost 4 vzipr RHS, <1,4,0,5> + 3645837653U, // <6,0,7,6>: Cost 4 vext1 <3,6,0,7>, <6,0,7,0> + 4054239054U, // <6,0,7,7>: Cost 4 vzipr RHS, <2,5,0,7> + 2572097326U, // <6,0,7,u>: Cost 3 vext1 <3,6,0,7>, LHS + 2686026378U, // <6,0,u,0>: Cost 3 vext3 <0,2,4,6>, <0,u,0,2> + 2686026386U, // <6,0,u,1>: Cost 3 vext3 <0,2,4,6>, <0,u,1,1> + 1612284573U, // <6,0,u,2>: Cost 2 vext3 <0,2,4,6>, LHS + 2705343144U, // <6,0,u,3>: Cost 3 vext3 <3,4,5,6>, <0,u,3,5> + 1616265906U, // <6,0,u,4>: Cost 2 vext3 <0,u,4,6>, <0,u,4,6> + 2632292506U, // <6,0,u,5>: Cost 3 vext2 <2,4,6,0>, RHS + 2590020356U, // <6,0,u,6>: Cost 3 vext1 <6,6,0,u>, <6,6,0,u> + 2852933161U, // <6,0,u,7>: Cost 3 vuzpr <5,6,7,0>, RHS + 1612284627U, // <6,0,u,u>: Cost 2 vext3 <0,2,4,6>, LHS + 2595995750U, // <6,1,0,0>: Cost 3 vext1 <7,6,1,0>, LHS + 2646229094U, // <6,1,0,1>: Cost 3 vext2 
<4,7,6,1>, LHS + 3694092492U, // <6,1,0,2>: Cost 4 vext2 <0,4,6,1>, <0,2,4,6> + 2686026486U, // <6,1,0,3>: Cost 3 vext3 <0,2,4,6>, <1,0,3,2> + 2595999030U, // <6,1,0,4>: Cost 3 vext1 <7,6,1,0>, RHS + 3767730952U, // <6,1,0,5>: Cost 4 vext3 <1,5,4,6>, <1,0,5,2> + 2596000590U, // <6,1,0,6>: Cost 3 vext1 <7,6,1,0>, <6,7,0,1> + 2596001246U, // <6,1,0,7>: Cost 3 vext1 <7,6,1,0>, <7,6,1,0> + 2686026531U, // <6,1,0,u>: Cost 3 vext3 <0,2,4,6>, <1,0,u,2> + 3763602219U, // <6,1,1,0>: Cost 4 vext3 <0,u,2,6>, <1,1,0,1> + 2686026548U, // <6,1,1,1>: Cost 3 vext3 <0,2,4,6>, <1,1,1,1> + 3764929346U, // <6,1,1,2>: Cost 4 vext3 <1,1,2,6>, <1,1,2,6> + 2686026568U, // <6,1,1,3>: Cost 3 vext3 <0,2,4,6>, <1,1,3,3> + 2691334996U, // <6,1,1,4>: Cost 3 vext3 <1,1,4,6>, <1,1,4,6> + 3760874332U, // <6,1,1,5>: Cost 4 vext3 <0,4,1,6>, <1,1,5,5> + 3765224294U, // <6,1,1,6>: Cost 4 vext3 <1,1,6,6>, <1,1,6,6> + 3669751263U, // <6,1,1,7>: Cost 4 vext1 <7,6,1,1>, <7,6,1,1> + 2686026613U, // <6,1,1,u>: Cost 3 vext3 <0,2,4,6>, <1,1,u,3> + 2554208358U, // <6,1,2,0>: Cost 3 vext1 <0,6,1,2>, LHS + 3763602311U, // <6,1,2,1>: Cost 4 vext3 <0,u,2,6>, <1,2,1,3> + 3639895971U, // <6,1,2,2>: Cost 4 vext1 <2,6,1,2>, <2,6,1,2> + 2686026646U, // <6,1,2,3>: Cost 3 vext3 <0,2,4,6>, <1,2,3,0> + 2554211638U, // <6,1,2,4>: Cost 3 vext1 <0,6,1,2>, RHS + 3760874411U, // <6,1,2,5>: Cost 4 vext3 <0,4,1,6>, <1,2,5,3> + 2554212858U, // <6,1,2,6>: Cost 3 vext1 <0,6,1,2>, <6,2,7,3> + 3802973114U, // <6,1,2,7>: Cost 4 vext3 <7,4,5,6>, <1,2,7,0> + 2686026691U, // <6,1,2,u>: Cost 3 vext3 <0,2,4,6>, <1,2,u,0> + 2566160486U, // <6,1,3,0>: Cost 3 vext1 <2,6,1,3>, LHS + 2686026712U, // <6,1,3,1>: Cost 3 vext3 <0,2,4,6>, <1,3,1,3> + 2686026724U, // <6,1,3,2>: Cost 3 vext3 <0,2,4,6>, <1,3,2,6> + 3759768552U, // <6,1,3,3>: Cost 4 vext3 <0,2,4,6>, <1,3,3,1> + 2692662262U, // <6,1,3,4>: Cost 3 vext3 <1,3,4,6>, <1,3,4,6> + 2686026752U, // <6,1,3,5>: Cost 3 vext3 <0,2,4,6>, <1,3,5,7> + 2590053128U, // <6,1,3,6>: Cost 3 vext1 <6,6,1,3>, <6,6,1,3> + 3663795194U, // <6,1,3,7>: Cost 4 vext1 <6,6,1,3>, <7,0,1,2> + 2686026775U, // <6,1,3,u>: Cost 3 vext3 <0,2,4,6>, <1,3,u,3> + 2641587099U, // <6,1,4,0>: Cost 3 vext2 <4,0,6,1>, <4,0,6,1> + 2693104684U, // <6,1,4,1>: Cost 3 vext3 <1,4,1,6>, <1,4,1,6> + 3639912357U, // <6,1,4,2>: Cost 4 vext1 <2,6,1,4>, <2,6,1,4> + 2687206462U, // <6,1,4,3>: Cost 3 vext3 <0,4,2,6>, <1,4,3,6> + 3633941814U, // <6,1,4,4>: Cost 4 vext1 <1,6,1,4>, RHS + 2693399632U, // <6,1,4,5>: Cost 3 vext3 <1,4,5,6>, <1,4,5,6> + 3765077075U, // <6,1,4,6>: Cost 4 vext3 <1,1,4,6>, <1,4,6,0> + 2646232530U, // <6,1,4,7>: Cost 3 vext2 <4,7,6,1>, <4,7,6,1> + 2687206507U, // <6,1,4,u>: Cost 3 vext3 <0,4,2,6>, <1,4,u,6> + 2647559796U, // <6,1,5,0>: Cost 3 vext2 <5,0,6,1>, <5,0,6,1> + 3765077118U, // <6,1,5,1>: Cost 4 vext3 <1,1,4,6>, <1,5,1,7> + 3767583878U, // <6,1,5,2>: Cost 4 vext3 <1,5,2,6>, <1,5,2,6> + 2686026896U, // <6,1,5,3>: Cost 3 vext3 <0,2,4,6>, <1,5,3,7> + 2693989528U, // <6,1,5,4>: Cost 3 vext3 <1,5,4,6>, <1,5,4,6> + 3767805089U, // <6,1,5,5>: Cost 4 vext3 <1,5,5,6>, <1,5,5,6> + 2652868706U, // <6,1,5,6>: Cost 3 vext2 <5,u,6,1>, <5,6,7,0> + 3908250934U, // <6,1,5,7>: Cost 4 vuzpr <2,6,0,1>, RHS + 2686026941U, // <6,1,5,u>: Cost 3 vext3 <0,2,4,6>, <1,5,u,7> + 2554241126U, // <6,1,6,0>: Cost 3 vext1 <0,6,1,6>, LHS + 3763602639U, // <6,1,6,1>: Cost 4 vext3 <0,u,2,6>, <1,6,1,7> + 3759547607U, // <6,1,6,2>: Cost 4 vext3 <0,2,1,6>, <1,6,2,6> + 3115221094U, // <6,1,6,3>: Cost 3 vtrnr <4,6,4,6>, LHS + 2554244406U, // <6,1,6,4>: Cost 3 vext1 <0,6,1,6>, RHS + 
3760874739U, // <6,1,6,5>: Cost 4 vext3 <0,4,1,6>, <1,6,5,7> + 2554245944U, // <6,1,6,6>: Cost 3 vext1 <0,6,1,6>, <6,6,6,6> + 3719975758U, // <6,1,6,7>: Cost 4 vext2 <4,7,6,1>, <6,7,0,1> + 3115221099U, // <6,1,6,u>: Cost 3 vtrnr <4,6,4,6>, LHS + 2560221286U, // <6,1,7,0>: Cost 3 vext1 <1,6,1,7>, LHS + 2560222415U, // <6,1,7,1>: Cost 3 vext1 <1,6,1,7>, <1,6,1,7> + 2980497558U, // <6,1,7,2>: Cost 3 vzipr RHS, <3,0,1,2> + 3103211622U, // <6,1,7,3>: Cost 3 vtrnr <2,6,3,7>, LHS + 2560224566U, // <6,1,7,4>: Cost 3 vext1 <1,6,1,7>, RHS + 2980495698U, // <6,1,7,5>: Cost 3 vzipr RHS, <0,4,1,5> + 3633967526U, // <6,1,7,6>: Cost 4 vext1 <1,6,1,7>, <6,1,7,0> + 4054237686U, // <6,1,7,7>: Cost 4 vzipr RHS, <0,6,1,7> + 2560227118U, // <6,1,7,u>: Cost 3 vext1 <1,6,1,7>, LHS + 2560229478U, // <6,1,u,0>: Cost 3 vext1 <1,6,1,u>, LHS + 2686027117U, // <6,1,u,1>: Cost 3 vext3 <0,2,4,6>, <1,u,1,3> + 2686027129U, // <6,1,u,2>: Cost 3 vext3 <0,2,4,6>, <1,u,2,6> + 2686027132U, // <6,1,u,3>: Cost 3 vext3 <0,2,4,6>, <1,u,3,0> + 2687206795U, // <6,1,u,4>: Cost 3 vext3 <0,4,2,6>, <1,u,4,6> + 2686027157U, // <6,1,u,5>: Cost 3 vext3 <0,2,4,6>, <1,u,5,7> + 2590094093U, // <6,1,u,6>: Cost 3 vext1 <6,6,1,u>, <6,6,1,u> + 2596066790U, // <6,1,u,7>: Cost 3 vext1 <7,6,1,u>, <7,6,1,u> + 2686027177U, // <6,1,u,u>: Cost 3 vext3 <0,2,4,6>, <1,u,u,0> + 2646900736U, // <6,2,0,0>: Cost 3 vext2 <4,u,6,2>, <0,0,0,0> + 1573159014U, // <6,2,0,1>: Cost 2 vext2 <4,u,6,2>, LHS + 2646900900U, // <6,2,0,2>: Cost 3 vext2 <4,u,6,2>, <0,2,0,2> + 3759769037U, // <6,2,0,3>: Cost 4 vext3 <0,2,4,6>, <2,0,3,0> + 2641592668U, // <6,2,0,4>: Cost 3 vext2 <4,0,6,2>, <0,4,2,6> + 3779085794U, // <6,2,0,5>: Cost 4 vext3 <3,4,5,6>, <2,0,5,3> + 2686027244U, // <6,2,0,6>: Cost 3 vext3 <0,2,4,6>, <2,0,6,4> + 3669816807U, // <6,2,0,7>: Cost 4 vext1 <7,6,2,0>, <7,6,2,0> + 1573159581U, // <6,2,0,u>: Cost 2 vext2 <4,u,6,2>, LHS + 2230527897U, // <6,2,1,0>: Cost 3 vrev <2,6,0,1> + 2646901556U, // <6,2,1,1>: Cost 3 vext2 <4,u,6,2>, <1,1,1,1> + 2646901654U, // <6,2,1,2>: Cost 3 vext2 <4,u,6,2>, <1,2,3,0> + 2847047782U, // <6,2,1,3>: Cost 3 vuzpr <4,6,u,2>, LHS + 3771049517U, // <6,2,1,4>: Cost 4 vext3 <2,1,4,6>, <2,1,4,6> + 2646901904U, // <6,2,1,5>: Cost 3 vext2 <4,u,6,2>, <1,5,3,7> + 2686027324U, // <6,2,1,6>: Cost 3 vext3 <0,2,4,6>, <2,1,6,3> + 3669825000U, // <6,2,1,7>: Cost 4 vext1 <7,6,2,1>, <7,6,2,1> + 2231117793U, // <6,2,1,u>: Cost 3 vrev <2,6,u,1> + 3763603029U, // <6,2,2,0>: Cost 4 vext3 <0,u,2,6>, <2,2,0,1> + 3759769184U, // <6,2,2,1>: Cost 4 vext3 <0,2,4,6>, <2,2,1,3> + 2686027368U, // <6,2,2,2>: Cost 3 vext3 <0,2,4,6>, <2,2,2,2> + 2686027378U, // <6,2,2,3>: Cost 3 vext3 <0,2,4,6>, <2,2,3,3> + 2697971326U, // <6,2,2,4>: Cost 3 vext3 <2,2,4,6>, <2,2,4,6> + 3759769224U, // <6,2,2,5>: Cost 4 vext3 <0,2,4,6>, <2,2,5,7> + 2698118800U, // <6,2,2,6>: Cost 3 vext3 <2,2,6,6>, <2,2,6,6> + 3920794092U, // <6,2,2,7>: Cost 4 vuzpr <4,6,u,2>, <6,2,5,7> + 2686027423U, // <6,2,2,u>: Cost 3 vext3 <0,2,4,6>, <2,2,u,3> + 2686027430U, // <6,2,3,0>: Cost 3 vext3 <0,2,4,6>, <2,3,0,1> + 3759769262U, // <6,2,3,1>: Cost 4 vext3 <0,2,4,6>, <2,3,1,0> + 2698487485U, // <6,2,3,2>: Cost 3 vext3 <2,3,2,6>, <2,3,2,6> + 2705344196U, // <6,2,3,3>: Cost 3 vext3 <3,4,5,6>, <2,3,3,4> + 2686027470U, // <6,2,3,4>: Cost 3 vext3 <0,2,4,6>, <2,3,4,5> + 2698708696U, // <6,2,3,5>: Cost 3 vext3 <2,3,5,6>, <2,3,5,6> + 2724660961U, // <6,2,3,6>: Cost 3 vext3 <6,6,6,6>, <2,3,6,6> + 2729232104U, // <6,2,3,7>: Cost 3 vext3 <7,4,5,6>, <2,3,7,4> + 2686027502U, // <6,2,3,u>: Cost 3 vext3 <0,2,4,6>, 
<2,3,u,1> + 1567853468U, // <6,2,4,0>: Cost 2 vext2 <4,0,6,2>, <4,0,6,2> + 3759769351U, // <6,2,4,1>: Cost 4 vext3 <0,2,4,6>, <2,4,1,u> + 2699151118U, // <6,2,4,2>: Cost 3 vext3 <2,4,2,6>, <2,4,2,6> + 2686027543U, // <6,2,4,3>: Cost 3 vext3 <0,2,4,6>, <2,4,3,6> + 2699298592U, // <6,2,4,4>: Cost 3 vext3 <2,4,4,6>, <2,4,4,6> + 1573162294U, // <6,2,4,5>: Cost 2 vext2 <4,u,6,2>, RHS + 2686027564U, // <6,2,4,6>: Cost 3 vext3 <0,2,4,6>, <2,4,6,0> + 3719982547U, // <6,2,4,7>: Cost 4 vext2 <4,7,6,2>, <4,7,6,2> + 1573162532U, // <6,2,4,u>: Cost 2 vext2 <4,u,6,2>, <4,u,6,2> + 3779086154U, // <6,2,5,0>: Cost 4 vext3 <3,4,5,6>, <2,5,0,3> + 2646904528U, // <6,2,5,1>: Cost 3 vext2 <4,u,6,2>, <5,1,7,3> + 3759769440U, // <6,2,5,2>: Cost 4 vext3 <0,2,4,6>, <2,5,2,7> + 2699888488U, // <6,2,5,3>: Cost 3 vext3 <2,5,3,6>, <2,5,3,6> + 2230855617U, // <6,2,5,4>: Cost 3 vrev <2,6,4,5> + 2646904836U, // <6,2,5,5>: Cost 3 vext2 <4,u,6,2>, <5,5,5,5> + 2646904930U, // <6,2,5,6>: Cost 3 vext2 <4,u,6,2>, <5,6,7,0> + 2847051062U, // <6,2,5,7>: Cost 3 vuzpr <4,6,u,2>, RHS + 2700257173U, // <6,2,5,u>: Cost 3 vext3 <2,5,u,6>, <2,5,u,6> + 2687207321U, // <6,2,6,0>: Cost 3 vext3 <0,4,2,6>, <2,6,0,1> + 2686027684U, // <6,2,6,1>: Cost 3 vext3 <0,2,4,6>, <2,6,1,3> + 2566260656U, // <6,2,6,2>: Cost 3 vext1 <2,6,2,6>, <2,6,2,6> + 2685806522U, // <6,2,6,3>: Cost 3 vext3 <0,2,1,6>, <2,6,3,7> + 2687207361U, // <6,2,6,4>: Cost 3 vext3 <0,4,2,6>, <2,6,4,5> + 2686027724U, // <6,2,6,5>: Cost 3 vext3 <0,2,4,6>, <2,6,5,7> + 2646905656U, // <6,2,6,6>: Cost 3 vext2 <4,u,6,2>, <6,6,6,6> + 2646905678U, // <6,2,6,7>: Cost 3 vext2 <4,u,6,2>, <6,7,0,1> + 2686027751U, // <6,2,6,u>: Cost 3 vext3 <0,2,4,6>, <2,6,u,7> + 2554323046U, // <6,2,7,0>: Cost 3 vext1 <0,6,2,7>, LHS + 2572239606U, // <6,2,7,1>: Cost 3 vext1 <3,6,2,7>, <1,0,3,2> + 2566268849U, // <6,2,7,2>: Cost 3 vext1 <2,6,2,7>, <2,6,2,7> + 1906753638U, // <6,2,7,3>: Cost 2 vzipr RHS, LHS + 2554326326U, // <6,2,7,4>: Cost 3 vext1 <0,6,2,7>, RHS + 3304687564U, // <6,2,7,5>: Cost 4 vrev <2,6,5,7> + 2980495708U, // <6,2,7,6>: Cost 3 vzipr RHS, <0,4,2,6> + 2646906476U, // <6,2,7,7>: Cost 3 vext2 <4,u,6,2>, <7,7,7,7> + 1906753643U, // <6,2,7,u>: Cost 2 vzipr RHS, LHS + 1591744256U, // <6,2,u,0>: Cost 2 vext2 , + 1573164846U, // <6,2,u,1>: Cost 2 vext2 <4,u,6,2>, LHS + 2701805650U, // <6,2,u,2>: Cost 3 vext3 <2,u,2,6>, <2,u,2,6> + 1906761830U, // <6,2,u,3>: Cost 2 vzipr RHS, LHS + 2686027875U, // <6,2,u,4>: Cost 3 vext3 <0,2,4,6>, <2,u,4,5> + 1573165210U, // <6,2,u,5>: Cost 2 vext2 <4,u,6,2>, RHS + 2686322800U, // <6,2,u,6>: Cost 3 vext3 <0,2,u,6>, <2,u,6,0> + 2847051305U, // <6,2,u,7>: Cost 3 vuzpr <4,6,u,2>, RHS + 1906761835U, // <6,2,u,u>: Cost 2 vzipr RHS, LHS + 3759769739U, // <6,3,0,0>: Cost 4 vext3 <0,2,4,6>, <3,0,0,0> + 2686027926U, // <6,3,0,1>: Cost 3 vext3 <0,2,4,6>, <3,0,1,2> + 2686027937U, // <6,3,0,2>: Cost 3 vext3 <0,2,4,6>, <3,0,2,4> + 3640027286U, // <6,3,0,3>: Cost 4 vext1 <2,6,3,0>, <3,0,1,2> + 2687207601U, // <6,3,0,4>: Cost 3 vext3 <0,4,2,6>, <3,0,4,2> + 2705344698U, // <6,3,0,5>: Cost 3 vext3 <3,4,5,6>, <3,0,5,2> + 3663917847U, // <6,3,0,6>: Cost 4 vext1 <6,6,3,0>, <6,6,3,0> + 2237008560U, // <6,3,0,7>: Cost 3 vrev <3,6,7,0> + 2686027989U, // <6,3,0,u>: Cost 3 vext3 <0,2,4,6>, <3,0,u,2> + 3759769823U, // <6,3,1,0>: Cost 4 vext3 <0,2,4,6>, <3,1,0,3> + 3759769830U, // <6,3,1,1>: Cost 4 vext3 <0,2,4,6>, <3,1,1,1> + 3759769841U, // <6,3,1,2>: Cost 4 vext3 <0,2,4,6>, <3,1,2,3> + 3759769848U, // <6,3,1,3>: Cost 4 vext3 <0,2,4,6>, <3,1,3,1> + 2703280390U, // <6,3,1,4>: Cost 3 vext3 
<3,1,4,6>, <3,1,4,6> + 3759769868U, // <6,3,1,5>: Cost 4 vext3 <0,2,4,6>, <3,1,5,3> + 3704063194U, // <6,3,1,6>: Cost 4 vext2 <2,1,6,3>, <1,6,3,0> + 3767732510U, // <6,3,1,7>: Cost 4 vext3 <1,5,4,6>, <3,1,7,3> + 2703280390U, // <6,3,1,u>: Cost 3 vext3 <3,1,4,6>, <3,1,4,6> + 3704063468U, // <6,3,2,0>: Cost 4 vext2 <2,1,6,3>, <2,0,6,4> + 2630321724U, // <6,3,2,1>: Cost 3 vext2 <2,1,6,3>, <2,1,6,3> + 3759769921U, // <6,3,2,2>: Cost 4 vext3 <0,2,4,6>, <3,2,2,2> + 3759769928U, // <6,3,2,3>: Cost 4 vext3 <0,2,4,6>, <3,2,3,0> + 3704063767U, // <6,3,2,4>: Cost 4 vext2 <2,1,6,3>, <2,4,3,6> + 3704063876U, // <6,3,2,5>: Cost 4 vext2 <2,1,6,3>, <2,5,6,7> + 2636957626U, // <6,3,2,6>: Cost 3 vext2 <3,2,6,3>, <2,6,3,7> + 3777907058U, // <6,3,2,7>: Cost 4 vext3 <3,2,7,6>, <3,2,7,6> + 2630321724U, // <6,3,2,u>: Cost 3 vext2 <2,1,6,3>, <2,1,6,3> + 3759769983U, // <6,3,3,0>: Cost 4 vext3 <0,2,4,6>, <3,3,0,1> + 3710036245U, // <6,3,3,1>: Cost 4 vext2 <3,1,6,3>, <3,1,6,3> + 2636958054U, // <6,3,3,2>: Cost 3 vext2 <3,2,6,3>, <3,2,6,3> + 2686028188U, // <6,3,3,3>: Cost 3 vext3 <0,2,4,6>, <3,3,3,3> + 2704607656U, // <6,3,3,4>: Cost 3 vext3 <3,3,4,6>, <3,3,4,6> + 3773041072U, // <6,3,3,5>: Cost 4 vext3 <2,4,4,6>, <3,3,5,5> + 3711363731U, // <6,3,3,6>: Cost 4 vext2 <3,3,6,3>, <3,6,3,7> + 3767732676U, // <6,3,3,7>: Cost 4 vext3 <1,5,4,6>, <3,3,7,7> + 2707999179U, // <6,3,3,u>: Cost 3 vext3 <3,u,5,6>, <3,3,u,5> + 2584232038U, // <6,3,4,0>: Cost 3 vext1 <5,6,3,4>, LHS + 2642267118U, // <6,3,4,1>: Cost 3 vext2 <4,1,6,3>, <4,1,6,3> + 2642930751U, // <6,3,4,2>: Cost 3 vext2 <4,2,6,3>, <4,2,6,3> + 2705197552U, // <6,3,4,3>: Cost 3 vext3 <3,4,3,6>, <3,4,3,6> + 2584235318U, // <6,3,4,4>: Cost 3 vext1 <5,6,3,4>, RHS + 1631603202U, // <6,3,4,5>: Cost 2 vext3 <3,4,5,6>, <3,4,5,6> + 2654211444U, // <6,3,4,6>: Cost 3 vext2 <6,1,6,3>, <4,6,4,6> + 2237041332U, // <6,3,4,7>: Cost 3 vrev <3,6,7,4> + 1631824413U, // <6,3,4,u>: Cost 2 vext3 <3,4,u,6>, <3,4,u,6> + 3640066150U, // <6,3,5,0>: Cost 4 vext1 <2,6,3,5>, LHS + 3772746288U, // <6,3,5,1>: Cost 4 vext3 <2,4,0,6>, <3,5,1,7> + 3640067790U, // <6,3,5,2>: Cost 4 vext1 <2,6,3,5>, <2,3,4,5> + 3773041216U, // <6,3,5,3>: Cost 4 vext3 <2,4,4,6>, <3,5,3,5> + 2705934922U, // <6,3,5,4>: Cost 3 vext3 <3,5,4,6>, <3,5,4,6> + 3773041236U, // <6,3,5,5>: Cost 4 vext3 <2,4,4,6>, <3,5,5,7> + 3779086940U, // <6,3,5,6>: Cost 4 vext3 <3,4,5,6>, <3,5,6,6> + 3767732831U, // <6,3,5,7>: Cost 4 vext3 <1,5,4,6>, <3,5,7,0> + 2706229870U, // <6,3,5,u>: Cost 3 vext3 <3,5,u,6>, <3,5,u,6> + 2602164326U, // <6,3,6,0>: Cost 3 vext1 , LHS + 2654212512U, // <6,3,6,1>: Cost 3 vext2 <6,1,6,3>, <6,1,6,3> + 2566334393U, // <6,3,6,2>: Cost 3 vext1 <2,6,3,6>, <2,6,3,6> + 3704066588U, // <6,3,6,3>: Cost 4 vext2 <2,1,6,3>, <6,3,2,1> + 2602167524U, // <6,3,6,4>: Cost 3 vext1 , <4,4,6,6> + 3710702321U, // <6,3,6,5>: Cost 4 vext2 <3,2,6,3>, <6,5,7,7> + 2724661933U, // <6,3,6,6>: Cost 3 vext3 <6,6,6,6>, <3,6,6,6> + 3710702465U, // <6,3,6,7>: Cost 4 vext2 <3,2,6,3>, <6,7,5,7> + 2602170158U, // <6,3,6,u>: Cost 3 vext1 , LHS + 1492598886U, // <6,3,7,0>: Cost 2 vext1 <2,6,3,7>, LHS + 2560369889U, // <6,3,7,1>: Cost 3 vext1 <1,6,3,7>, <1,6,3,7> + 1492600762U, // <6,3,7,2>: Cost 2 vext1 <2,6,3,7>, <2,6,3,7> + 2566342806U, // <6,3,7,3>: Cost 3 vext1 <2,6,3,7>, <3,0,1,2> + 1492602166U, // <6,3,7,4>: Cost 2 vext1 <2,6,3,7>, RHS + 2602176208U, // <6,3,7,5>: Cost 3 vext1 , <5,1,7,3> + 2566345210U, // <6,3,7,6>: Cost 3 vext1 <2,6,3,7>, <6,2,7,3> + 2980496528U, // <6,3,7,7>: Cost 3 vzipr RHS, <1,5,3,7> + 1492604718U, // <6,3,7,u>: Cost 2 
vext1 <2,6,3,7>, LHS + 1492607078U, // <6,3,u,0>: Cost 2 vext1 <2,6,3,u>, LHS + 2686028574U, // <6,3,u,1>: Cost 3 vext3 <0,2,4,6>, <3,u,1,2> + 1492608955U, // <6,3,u,2>: Cost 2 vext1 <2,6,3,u>, <2,6,3,u> + 2566350998U, // <6,3,u,3>: Cost 3 vext1 <2,6,3,u>, <3,0,1,2> + 1492610358U, // <6,3,u,4>: Cost 2 vext1 <2,6,3,u>, RHS + 1634257734U, // <6,3,u,5>: Cost 2 vext3 <3,u,5,6>, <3,u,5,6> + 2566353489U, // <6,3,u,6>: Cost 3 vext1 <2,6,3,u>, <6,3,u,0> + 2980504720U, // <6,3,u,7>: Cost 3 vzipr RHS, <1,5,3,7> + 1492612910U, // <6,3,u,u>: Cost 2 vext1 <2,6,3,u>, LHS + 3703406592U, // <6,4,0,0>: Cost 4 vext2 <2,0,6,4>, <0,0,0,0> + 2629664870U, // <6,4,0,1>: Cost 3 vext2 <2,0,6,4>, LHS + 2629664972U, // <6,4,0,2>: Cost 3 vext2 <2,0,6,4>, <0,2,4,6> + 3779087232U, // <6,4,0,3>: Cost 4 vext3 <3,4,5,6>, <4,0,3,1> + 2642936156U, // <6,4,0,4>: Cost 3 vext2 <4,2,6,4>, <0,4,2,6> + 2712570770U, // <6,4,0,5>: Cost 3 vext3 <4,6,4,6>, <4,0,5,1> + 2687208348U, // <6,4,0,6>: Cost 3 vext3 <0,4,2,6>, <4,0,6,2> + 3316723081U, // <6,4,0,7>: Cost 4 vrev <4,6,7,0> + 2629665437U, // <6,4,0,u>: Cost 3 vext2 <2,0,6,4>, LHS + 2242473291U, // <6,4,1,0>: Cost 3 vrev <4,6,0,1> + 3700089652U, // <6,4,1,1>: Cost 4 vext2 <1,4,6,4>, <1,1,1,1> + 3703407510U, // <6,4,1,2>: Cost 4 vext2 <2,0,6,4>, <1,2,3,0> + 2852962406U, // <6,4,1,3>: Cost 3 vuzpr <5,6,7,4>, LHS + 3628166454U, // <6,4,1,4>: Cost 4 vext1 <0,6,4,1>, RHS + 3760876514U, // <6,4,1,5>: Cost 4 vext3 <0,4,1,6>, <4,1,5,0> + 2687208430U, // <6,4,1,6>: Cost 3 vext3 <0,4,2,6>, <4,1,6,3> + 3316731274U, // <6,4,1,7>: Cost 4 vrev <4,6,7,1> + 2243063187U, // <6,4,1,u>: Cost 3 vrev <4,6,u,1> + 2629666284U, // <6,4,2,0>: Cost 3 vext2 <2,0,6,4>, <2,0,6,4> + 3703408188U, // <6,4,2,1>: Cost 4 vext2 <2,0,6,4>, <2,1,6,3> + 3703408232U, // <6,4,2,2>: Cost 4 vext2 <2,0,6,4>, <2,2,2,2> + 3703408294U, // <6,4,2,3>: Cost 4 vext2 <2,0,6,4>, <2,3,0,1> + 2632320816U, // <6,4,2,4>: Cost 3 vext2 <2,4,6,4>, <2,4,6,4> + 2923384118U, // <6,4,2,5>: Cost 3 vzipl <6,2,7,3>, RHS + 2687208508U, // <6,4,2,6>: Cost 3 vext3 <0,4,2,6>, <4,2,6,0> + 3760950341U, // <6,4,2,7>: Cost 4 vext3 <0,4,2,6>, <4,2,7,0> + 2634975348U, // <6,4,2,u>: Cost 3 vext2 <2,u,6,4>, <2,u,6,4> + 3703408790U, // <6,4,3,0>: Cost 4 vext2 <2,0,6,4>, <3,0,1,2> + 3316305238U, // <6,4,3,1>: Cost 4 vrev <4,6,1,3> + 3703408947U, // <6,4,3,2>: Cost 4 vext2 <2,0,6,4>, <3,2,0,6> + 3703409052U, // <6,4,3,3>: Cost 4 vext2 <2,0,6,4>, <3,3,3,3> + 2644929026U, // <6,4,3,4>: Cost 3 vext2 <4,5,6,4>, <3,4,5,6> + 3718670922U, // <6,4,3,5>: Cost 4 vext2 <4,5,6,4>, <3,5,4,6> + 2705345682U, // <6,4,3,6>: Cost 3 vext3 <3,4,5,6>, <4,3,6,5> + 3926705152U, // <6,4,3,7>: Cost 4 vuzpr <5,6,7,4>, <1,3,5,7> + 2668817222U, // <6,4,3,u>: Cost 3 vext2 , <3,u,5,6> + 2590277734U, // <6,4,4,0>: Cost 3 vext1 <6,6,4,4>, LHS + 3716017135U, // <6,4,4,1>: Cost 4 vext2 <4,1,6,4>, <4,1,6,4> + 2642938944U, // <6,4,4,2>: Cost 3 vext2 <4,2,6,4>, <4,2,6,4> + 3717344401U, // <6,4,4,3>: Cost 4 vext2 <4,3,6,4>, <4,3,6,4> + 2712571088U, // <6,4,4,4>: Cost 3 vext3 <4,6,4,6>, <4,4,4,4> + 2629668150U, // <6,4,4,5>: Cost 3 vext2 <2,0,6,4>, RHS + 1637649636U, // <6,4,4,6>: Cost 2 vext3 <4,4,6,6>, <4,4,6,6> + 2646257109U, // <6,4,4,7>: Cost 3 vext2 <4,7,6,4>, <4,7,6,4> + 1637649636U, // <6,4,4,u>: Cost 2 vext3 <4,4,6,6>, <4,4,6,6> + 2566398054U, // <6,4,5,0>: Cost 3 vext1 <2,6,4,5>, LHS + 3760876805U, // <6,4,5,1>: Cost 4 vext3 <0,4,1,6>, <4,5,1,3> + 2566399937U, // <6,4,5,2>: Cost 3 vext1 <2,6,4,5>, <2,6,4,5> + 2584316418U, // <6,4,5,3>: Cost 3 vext1 <5,6,4,5>, <3,4,5,6> + 2566401334U, // 
<6,4,5,4>: Cost 3 vext1 <2,6,4,5>, RHS + 2584318028U, // <6,4,5,5>: Cost 3 vext1 <5,6,4,5>, <5,6,4,5> + 1612287286U, // <6,4,5,6>: Cost 2 vext3 <0,2,4,6>, RHS + 2852965686U, // <6,4,5,7>: Cost 3 vuzpr <5,6,7,4>, RHS + 1612287304U, // <6,4,5,u>: Cost 2 vext3 <0,2,4,6>, RHS + 1504608358U, // <6,4,6,0>: Cost 2 vext1 <4,6,4,6>, LHS + 2578350838U, // <6,4,6,1>: Cost 3 vext1 <4,6,4,6>, <1,0,3,2> + 2578351720U, // <6,4,6,2>: Cost 3 vext1 <4,6,4,6>, <2,2,2,2> + 2578352278U, // <6,4,6,3>: Cost 3 vext1 <4,6,4,6>, <3,0,1,2> + 1504611638U, // <6,4,6,4>: Cost 2 vext1 <4,6,4,6>, RHS + 2578353872U, // <6,4,6,5>: Cost 3 vext1 <4,6,4,6>, <5,1,7,3> + 2578354682U, // <6,4,6,6>: Cost 3 vext1 <4,6,4,6>, <6,2,7,3> + 2578355194U, // <6,4,6,7>: Cost 3 vext1 <4,6,4,6>, <7,0,1,2> + 1504614190U, // <6,4,6,u>: Cost 2 vext1 <4,6,4,6>, LHS + 2572386406U, // <6,4,7,0>: Cost 3 vext1 <3,6,4,7>, LHS + 2572387226U, // <6,4,7,1>: Cost 3 vext1 <3,6,4,7>, <1,2,3,4> + 3640157902U, // <6,4,7,2>: Cost 4 vext1 <2,6,4,7>, <2,3,4,5> + 2572389020U, // <6,4,7,3>: Cost 3 vext1 <3,6,4,7>, <3,6,4,7> + 2572389686U, // <6,4,7,4>: Cost 3 vext1 <3,6,4,7>, RHS + 2980497102U, // <6,4,7,5>: Cost 3 vzipr RHS, <2,3,4,5> + 2980495564U, // <6,4,7,6>: Cost 3 vzipr RHS, <0,2,4,6> + 4054239090U, // <6,4,7,7>: Cost 4 vzipr RHS, <2,5,4,7> + 2572392238U, // <6,4,7,u>: Cost 3 vext1 <3,6,4,7>, LHS + 1504608358U, // <6,4,u,0>: Cost 2 vext1 <4,6,4,6>, LHS + 2629670702U, // <6,4,u,1>: Cost 3 vext2 <2,0,6,4>, LHS + 2566424516U, // <6,4,u,2>: Cost 3 vext1 <2,6,4,u>, <2,6,4,u> + 2584340994U, // <6,4,u,3>: Cost 3 vext1 <5,6,4,u>, <3,4,5,6> + 1640156694U, // <6,4,u,4>: Cost 2 vext3 <4,u,4,6>, <4,u,4,6> + 2629671066U, // <6,4,u,5>: Cost 3 vext2 <2,0,6,4>, RHS + 1612287529U, // <6,4,u,6>: Cost 2 vext3 <0,2,4,6>, RHS + 2852965929U, // <6,4,u,7>: Cost 3 vuzpr <5,6,7,4>, RHS + 1612287547U, // <6,4,u,u>: Cost 2 vext3 <0,2,4,6>, RHS + 3708723200U, // <6,5,0,0>: Cost 4 vext2 <2,u,6,5>, <0,0,0,0> + 2634981478U, // <6,5,0,1>: Cost 3 vext2 <2,u,6,5>, LHS + 3694125260U, // <6,5,0,2>: Cost 4 vext2 <0,4,6,5>, <0,2,4,6> + 3779087962U, // <6,5,0,3>: Cost 4 vext3 <3,4,5,6>, <5,0,3,2> + 3760877154U, // <6,5,0,4>: Cost 4 vext3 <0,4,1,6>, <5,0,4,1> + 4195110916U, // <6,5,0,5>: Cost 4 vtrnr <5,6,7,0>, <5,5,5,5> + 3696779775U, // <6,5,0,6>: Cost 4 vext2 <0,u,6,5>, <0,6,2,7> + 1175212130U, // <6,5,0,7>: Cost 2 vrev <5,6,7,0> + 1175285867U, // <6,5,0,u>: Cost 2 vrev <5,6,u,0> + 2248445988U, // <6,5,1,0>: Cost 3 vrev <5,6,0,1> + 3698107237U, // <6,5,1,1>: Cost 4 vext2 <1,1,6,5>, <1,1,6,5> + 3708724118U, // <6,5,1,2>: Cost 4 vext2 <2,u,6,5>, <1,2,3,0> + 3908575334U, // <6,5,1,3>: Cost 4 vuzpr <2,6,4,5>, LHS + 3716023376U, // <6,5,1,4>: Cost 4 vext2 <4,1,6,5>, <1,4,5,6> + 3708724368U, // <6,5,1,5>: Cost 4 vext2 <2,u,6,5>, <1,5,3,7> + 3767733960U, // <6,5,1,6>: Cost 4 vext3 <1,5,4,6>, <5,1,6,4> + 2712571600U, // <6,5,1,7>: Cost 3 vext3 <4,6,4,6>, <5,1,7,3> + 2712571609U, // <6,5,1,u>: Cost 3 vext3 <4,6,4,6>, <5,1,u,3> + 2578391142U, // <6,5,2,0>: Cost 3 vext1 <4,6,5,2>, LHS + 3704079934U, // <6,5,2,1>: Cost 4 vext2 <2,1,6,5>, <2,1,6,5> + 3708724840U, // <6,5,2,2>: Cost 4 vext2 <2,u,6,5>, <2,2,2,2> + 3705407182U, // <6,5,2,3>: Cost 4 vext2 <2,3,6,5>, <2,3,4,5> + 2578394422U, // <6,5,2,4>: Cost 3 vext1 <4,6,5,2>, RHS + 3717351272U, // <6,5,2,5>: Cost 4 vext2 <4,3,6,5>, <2,5,3,6> + 2634983354U, // <6,5,2,6>: Cost 3 vext2 <2,u,6,5>, <2,6,3,7> + 3115486518U, // <6,5,2,7>: Cost 3 vtrnr <4,6,u,2>, RHS + 2634983541U, // <6,5,2,u>: Cost 3 vext2 <2,u,6,5>, <2,u,6,5> + 3708725398U, // <6,5,3,0>: Cost 4 
vext2 <2,u,6,5>, <3,0,1,2> + 3710052631U, // <6,5,3,1>: Cost 4 vext2 <3,1,6,5>, <3,1,6,5> + 3708725606U, // <6,5,3,2>: Cost 4 vext2 <2,u,6,5>, <3,2,6,3> + 3708725660U, // <6,5,3,3>: Cost 4 vext2 <2,u,6,5>, <3,3,3,3> + 2643610114U, // <6,5,3,4>: Cost 3 vext2 <4,3,6,5>, <3,4,5,6> + 3717352010U, // <6,5,3,5>: Cost 4 vext2 <4,3,6,5>, <3,5,4,6> + 3773632358U, // <6,5,3,6>: Cost 4 vext3 <2,5,3,6>, <5,3,6,0> + 2248978533U, // <6,5,3,7>: Cost 3 vrev <5,6,7,3> + 2249052270U, // <6,5,3,u>: Cost 3 vrev <5,6,u,3> + 2596323430U, // <6,5,4,0>: Cost 3 vext1 <7,6,5,4>, LHS + 3716025328U, // <6,5,4,1>: Cost 4 vext2 <4,1,6,5>, <4,1,6,5> + 3716688961U, // <6,5,4,2>: Cost 4 vext2 <4,2,6,5>, <4,2,6,5> + 2643610770U, // <6,5,4,3>: Cost 3 vext2 <4,3,6,5>, <4,3,6,5> + 2596326710U, // <6,5,4,4>: Cost 3 vext1 <7,6,5,4>, RHS + 2634984758U, // <6,5,4,5>: Cost 3 vext2 <2,u,6,5>, RHS + 3767734199U, // <6,5,4,6>: Cost 4 vext3 <1,5,4,6>, <5,4,6,0> + 1643696070U, // <6,5,4,7>: Cost 2 vext3 <5,4,7,6>, <5,4,7,6> + 1643769807U, // <6,5,4,u>: Cost 2 vext3 <5,4,u,6>, <5,4,u,6> + 2578415718U, // <6,5,5,0>: Cost 3 vext1 <4,6,5,5>, LHS + 3652158198U, // <6,5,5,1>: Cost 4 vext1 <4,6,5,5>, <1,0,3,2> + 3652159080U, // <6,5,5,2>: Cost 4 vext1 <4,6,5,5>, <2,2,2,2> + 3652159638U, // <6,5,5,3>: Cost 4 vext1 <4,6,5,5>, <3,0,1,2> + 2578418998U, // <6,5,5,4>: Cost 3 vext1 <4,6,5,5>, RHS + 2712571908U, // <6,5,5,5>: Cost 3 vext3 <4,6,4,6>, <5,5,5,5> + 2718027790U, // <6,5,5,6>: Cost 3 vext3 <5,5,6,6>, <5,5,6,6> + 2712571928U, // <6,5,5,7>: Cost 3 vext3 <4,6,4,6>, <5,5,7,7> + 2712571937U, // <6,5,5,u>: Cost 3 vext3 <4,6,4,6>, <5,5,u,7> + 2705346596U, // <6,5,6,0>: Cost 3 vext3 <3,4,5,6>, <5,6,0,1> + 3767144496U, // <6,5,6,1>: Cost 4 vext3 <1,4,5,6>, <5,6,1,4> + 3773116473U, // <6,5,6,2>: Cost 4 vext3 <2,4,5,6>, <5,6,2,4> + 2705346626U, // <6,5,6,3>: Cost 3 vext3 <3,4,5,6>, <5,6,3,4> + 2705346636U, // <6,5,6,4>: Cost 3 vext3 <3,4,5,6>, <5,6,4,5> + 3908577217U, // <6,5,6,5>: Cost 4 vuzpr <2,6,4,5>, <2,6,4,5> + 2578428728U, // <6,5,6,6>: Cost 3 vext1 <4,6,5,6>, <6,6,6,6> + 2712572002U, // <6,5,6,7>: Cost 3 vext3 <4,6,4,6>, <5,6,7,0> + 2705346668U, // <6,5,6,u>: Cost 3 vext3 <3,4,5,6>, <5,6,u,1> + 2560516198U, // <6,5,7,0>: Cost 3 vext1 <1,6,5,7>, LHS + 2560517363U, // <6,5,7,1>: Cost 3 vext1 <1,6,5,7>, <1,6,5,7> + 2566490060U, // <6,5,7,2>: Cost 3 vext1 <2,6,5,7>, <2,6,5,7> + 3634260118U, // <6,5,7,3>: Cost 4 vext1 <1,6,5,7>, <3,0,1,2> + 2560519478U, // <6,5,7,4>: Cost 3 vext1 <1,6,5,7>, RHS + 2980498650U, // <6,5,7,5>: Cost 3 vzipr RHS, <4,4,5,5> + 2980497922U, // <6,5,7,6>: Cost 3 vzipr RHS, <3,4,5,6> + 3103214902U, // <6,5,7,7>: Cost 3 vtrnr <2,6,3,7>, RHS + 2560522030U, // <6,5,7,u>: Cost 3 vext1 <1,6,5,7>, LHS + 2560524390U, // <6,5,u,0>: Cost 3 vext1 <1,6,5,u>, LHS + 2560525556U, // <6,5,u,1>: Cost 3 vext1 <1,6,5,u>, <1,6,5,u> + 2566498253U, // <6,5,u,2>: Cost 3 vext1 <2,6,5,u>, <2,6,5,u> + 2646931439U, // <6,5,u,3>: Cost 3 vext2 <4,u,6,5>, + 2560527670U, // <6,5,u,4>: Cost 3 vext1 <1,6,5,u>, RHS + 2634987674U, // <6,5,u,5>: Cost 3 vext2 <2,u,6,5>, RHS + 2980506114U, // <6,5,u,6>: Cost 3 vzipr RHS, <3,4,5,6> + 1175277674U, // <6,5,u,7>: Cost 2 vrev <5,6,7,u> + 1175351411U, // <6,5,u,u>: Cost 2 vrev <5,6,u,u> + 2578448486U, // <6,6,0,0>: Cost 3 vext1 <4,6,6,0>, LHS + 1573191782U, // <6,6,0,1>: Cost 2 vext2 <4,u,6,6>, LHS + 2686030124U, // <6,6,0,2>: Cost 3 vext3 <0,2,4,6>, <6,0,2,4> + 3779088690U, // <6,6,0,3>: Cost 4 vext3 <3,4,5,6>, <6,0,3,1> + 2687209788U, // <6,6,0,4>: Cost 3 vext3 <0,4,2,6>, <6,0,4,2> + 3652194000U, // <6,6,0,5>: Cost 
4 vext1 <4,6,6,0>, <5,1,7,3> + 2254852914U, // <6,6,0,6>: Cost 3 vrev <6,6,6,0> + 4041575734U, // <6,6,0,7>: Cost 4 vzipr <2,4,6,0>, RHS + 1573192349U, // <6,6,0,u>: Cost 2 vext2 <4,u,6,6>, LHS + 2646934262U, // <6,6,1,0>: Cost 3 vext2 <4,u,6,6>, <1,0,3,2> + 2646934324U, // <6,6,1,1>: Cost 3 vext2 <4,u,6,6>, <1,1,1,1> + 2646934422U, // <6,6,1,2>: Cost 3 vext2 <4,u,6,6>, <1,2,3,0> + 2846785638U, // <6,6,1,3>: Cost 3 vuzpr <4,6,4,6>, LHS + 3760951694U, // <6,6,1,4>: Cost 4 vext3 <0,4,2,6>, <6,1,4,3> + 2646934672U, // <6,6,1,5>: Cost 3 vext2 <4,u,6,6>, <1,5,3,7> + 2712572320U, // <6,6,1,6>: Cost 3 vext3 <4,6,4,6>, <6,1,6,3> + 3775549865U, // <6,6,1,7>: Cost 4 vext3 <2,u,2,6>, <6,1,7,3> + 2846785643U, // <6,6,1,u>: Cost 3 vuzpr <4,6,4,6>, LHS + 3759772094U, // <6,6,2,0>: Cost 4 vext3 <0,2,4,6>, <6,2,0,6> + 3704751676U, // <6,6,2,1>: Cost 4 vext2 <2,2,6,6>, <2,1,6,3> + 2631009936U, // <6,6,2,2>: Cost 3 vext2 <2,2,6,6>, <2,2,6,6> + 2646935206U, // <6,6,2,3>: Cost 3 vext2 <4,u,6,6>, <2,3,0,1> + 3759772127U, // <6,6,2,4>: Cost 4 vext3 <0,2,4,6>, <6,2,4,3> + 3704752004U, // <6,6,2,5>: Cost 4 vext2 <2,2,6,6>, <2,5,6,7> + 2646935482U, // <6,6,2,6>: Cost 3 vext2 <4,u,6,6>, <2,6,3,7> + 2712572410U, // <6,6,2,7>: Cost 3 vext3 <4,6,4,6>, <6,2,7,3> + 2712572419U, // <6,6,2,u>: Cost 3 vext3 <4,6,4,6>, <6,2,u,3> + 2646935702U, // <6,6,3,0>: Cost 3 vext2 <4,u,6,6>, <3,0,1,2> + 3777024534U, // <6,6,3,1>: Cost 4 vext3 <3,1,4,6>, <6,3,1,4> + 3704752453U, // <6,6,3,2>: Cost 4 vext2 <2,2,6,6>, <3,2,2,6> + 2646935964U, // <6,6,3,3>: Cost 3 vext2 <4,u,6,6>, <3,3,3,3> + 2705347122U, // <6,6,3,4>: Cost 3 vext3 <3,4,5,6>, <6,3,4,5> + 3779678778U, // <6,6,3,5>: Cost 4 vext3 <3,5,4,6>, <6,3,5,4> + 2657553069U, // <6,6,3,6>: Cost 3 vext2 <6,6,6,6>, <3,6,6,6> + 4039609654U, // <6,6,3,7>: Cost 4 vzipr <2,1,6,3>, RHS + 2708001366U, // <6,6,3,u>: Cost 3 vext3 <3,u,5,6>, <6,3,u,5> + 2578481254U, // <6,6,4,0>: Cost 3 vext1 <4,6,6,4>, LHS + 3652223734U, // <6,6,4,1>: Cost 4 vext1 <4,6,6,4>, <1,0,3,2> + 3760951922U, // <6,6,4,2>: Cost 4 vext3 <0,4,2,6>, <6,4,2,6> + 3779089019U, // <6,6,4,3>: Cost 4 vext3 <3,4,5,6>, <6,4,3,6> + 1570540772U, // <6,6,4,4>: Cost 2 vext2 <4,4,6,6>, <4,4,6,6> + 1573195062U, // <6,6,4,5>: Cost 2 vext2 <4,u,6,6>, RHS + 2712572560U, // <6,6,4,6>: Cost 3 vext3 <4,6,4,6>, <6,4,6,0> + 2723410591U, // <6,6,4,7>: Cost 3 vext3 <6,4,7,6>, <6,4,7,6> + 1573195304U, // <6,6,4,u>: Cost 2 vext2 <4,u,6,6>, <4,u,6,6> + 3640287334U, // <6,6,5,0>: Cost 4 vext1 <2,6,6,5>, LHS + 2646937296U, // <6,6,5,1>: Cost 3 vext2 <4,u,6,6>, <5,1,7,3> + 3640289235U, // <6,6,5,2>: Cost 4 vext1 <2,6,6,5>, <2,6,6,5> + 3720679279U, // <6,6,5,3>: Cost 4 vext2 <4,u,6,6>, <5,3,7,0> + 2646937542U, // <6,6,5,4>: Cost 3 vext2 <4,u,6,6>, <5,4,7,6> + 2646937604U, // <6,6,5,5>: Cost 3 vext2 <4,u,6,6>, <5,5,5,5> + 2646937698U, // <6,6,5,6>: Cost 3 vext2 <4,u,6,6>, <5,6,7,0> + 2846788918U, // <6,6,5,7>: Cost 3 vuzpr <4,6,4,6>, RHS + 2846788919U, // <6,6,5,u>: Cost 3 vuzpr <4,6,4,6>, RHS + 1516699750U, // <6,6,6,0>: Cost 2 vext1 <6,6,6,6>, LHS + 2590442230U, // <6,6,6,1>: Cost 3 vext1 <6,6,6,6>, <1,0,3,2> + 2646938106U, // <6,6,6,2>: Cost 3 vext2 <4,u,6,6>, <6,2,7,3> + 2590443670U, // <6,6,6,3>: Cost 3 vext1 <6,6,6,6>, <3,0,1,2> + 1516703030U, // <6,6,6,4>: Cost 2 vext1 <6,6,6,6>, RHS + 2590445264U, // <6,6,6,5>: Cost 3 vext1 <6,6,6,6>, <5,1,7,3> + 296144182U, // <6,6,6,6>: Cost 1 vdup2 RHS + 2712572738U, // <6,6,6,7>: Cost 3 vext3 <4,6,4,6>, <6,6,7,7> + 296144182U, // <6,6,6,u>: Cost 1 vdup2 RHS + 2566561894U, // <6,6,7,0>: Cost 3 vext1 <2,6,6,7>, 
LHS + 3634332924U, // <6,6,7,1>: Cost 4 vext1 <1,6,6,7>, <1,6,6,7> + 2566563797U, // <6,6,7,2>: Cost 3 vext1 <2,6,6,7>, <2,6,6,7> + 2584480258U, // <6,6,7,3>: Cost 3 vext1 <5,6,6,7>, <3,4,5,6> + 2566565174U, // <6,6,7,4>: Cost 3 vext1 <2,6,6,7>, RHS + 2717438846U, // <6,6,7,5>: Cost 3 vext3 <5,4,7,6>, <6,7,5,4> + 2980500280U, // <6,6,7,6>: Cost 3 vzipr RHS, <6,6,6,6> + 1906756918U, // <6,6,7,7>: Cost 2 vzipr RHS, RHS + 1906756919U, // <6,6,7,u>: Cost 2 vzipr RHS, RHS + 1516699750U, // <6,6,u,0>: Cost 2 vext1 <6,6,6,6>, LHS + 1573197614U, // <6,6,u,1>: Cost 2 vext2 <4,u,6,6>, LHS + 2566571990U, // <6,6,u,2>: Cost 3 vext1 <2,6,6,u>, <2,6,6,u> + 2846786205U, // <6,6,u,3>: Cost 3 vuzpr <4,6,4,6>, LHS + 1516703030U, // <6,6,u,4>: Cost 2 vext1 <6,6,6,6>, RHS + 1573197978U, // <6,6,u,5>: Cost 2 vext2 <4,u,6,6>, RHS + 296144182U, // <6,6,u,6>: Cost 1 vdup2 RHS + 1906765110U, // <6,6,u,7>: Cost 2 vzipr RHS, RHS + 296144182U, // <6,6,u,u>: Cost 1 vdup2 RHS + 1571209216U, // <6,7,0,0>: Cost 2 vext2 RHS, <0,0,0,0> + 497467494U, // <6,7,0,1>: Cost 1 vext2 RHS, LHS + 1571209380U, // <6,7,0,2>: Cost 2 vext2 RHS, <0,2,0,2> + 2644951292U, // <6,7,0,3>: Cost 3 vext2 RHS, <0,3,1,0> + 1571209554U, // <6,7,0,4>: Cost 2 vext2 RHS, <0,4,1,5> + 1510756450U, // <6,7,0,5>: Cost 2 vext1 <5,6,7,0>, <5,6,7,0> + 2644951542U, // <6,7,0,6>: Cost 3 vext2 RHS, <0,6,1,7> + 2584499194U, // <6,7,0,7>: Cost 3 vext1 <5,6,7,0>, <7,0,1,2> + 497468061U, // <6,7,0,u>: Cost 1 vext2 RHS, LHS + 1571209974U, // <6,7,1,0>: Cost 2 vext2 RHS, <1,0,3,2> + 1571210036U, // <6,7,1,1>: Cost 2 vext2 RHS, <1,1,1,1> + 1571210134U, // <6,7,1,2>: Cost 2 vext2 RHS, <1,2,3,0> + 1571210200U, // <6,7,1,3>: Cost 2 vext2 RHS, <1,3,1,3> + 2644952098U, // <6,7,1,4>: Cost 3 vext2 RHS, <1,4,0,5> + 1571210384U, // <6,7,1,5>: Cost 2 vext2 RHS, <1,5,3,7> + 2644952271U, // <6,7,1,6>: Cost 3 vext2 RHS, <1,6,1,7> + 2578535418U, // <6,7,1,7>: Cost 3 vext1 <4,6,7,1>, <7,0,1,2> + 1571210605U, // <6,7,1,u>: Cost 2 vext2 RHS, <1,u,1,3> + 2644952509U, // <6,7,2,0>: Cost 3 vext2 RHS, <2,0,1,2> + 2644952582U, // <6,7,2,1>: Cost 3 vext2 RHS, <2,1,0,3> + 1571210856U, // <6,7,2,2>: Cost 2 vext2 RHS, <2,2,2,2> + 1571210918U, // <6,7,2,3>: Cost 2 vext2 RHS, <2,3,0,1> + 2644952828U, // <6,7,2,4>: Cost 3 vext2 RHS, <2,4,0,6> + 2633009028U, // <6,7,2,5>: Cost 3 vext2 <2,5,6,7>, <2,5,6,7> + 1571211194U, // <6,7,2,6>: Cost 2 vext2 RHS, <2,6,3,7> + 2668840938U, // <6,7,2,7>: Cost 3 vext2 RHS, <2,7,0,1> + 1571211323U, // <6,7,2,u>: Cost 2 vext2 RHS, <2,u,0,1> + 1571211414U, // <6,7,3,0>: Cost 2 vext2 RHS, <3,0,1,2> + 2644953311U, // <6,7,3,1>: Cost 3 vext2 RHS, <3,1,0,3> + 2644953390U, // <6,7,3,2>: Cost 3 vext2 RHS, <3,2,0,1> + 1571211676U, // <6,7,3,3>: Cost 2 vext2 RHS, <3,3,3,3> + 1571211778U, // <6,7,3,4>: Cost 2 vext2 RHS, <3,4,5,6> + 2644953648U, // <6,7,3,5>: Cost 3 vext2 RHS, <3,5,1,7> + 2644953720U, // <6,7,3,6>: Cost 3 vext2 RHS, <3,6,0,7> + 2644953795U, // <6,7,3,7>: Cost 3 vext2 RHS, <3,7,0,1> + 1571212062U, // <6,7,3,u>: Cost 2 vext2 RHS, <3,u,1,2> + 1573202834U, // <6,7,4,0>: Cost 2 vext2 RHS, <4,0,5,1> + 2644954058U, // <6,7,4,1>: Cost 3 vext2 RHS, <4,1,2,3> + 2644954166U, // <6,7,4,2>: Cost 3 vext2 RHS, <4,2,5,3> + 2644954258U, // <6,7,4,3>: Cost 3 vext2 RHS, <4,3,6,5> + 1571212496U, // <6,7,4,4>: Cost 2 vext2 RHS, <4,4,4,4> + 497470774U, // <6,7,4,5>: Cost 1 vext2 RHS, RHS + 1573203316U, // <6,7,4,6>: Cost 2 vext2 RHS, <4,6,4,6> + 2646281688U, // <6,7,4,7>: Cost 3 vext2 <4,7,6,7>, <4,7,6,7> + 497471017U, // <6,7,4,u>: Cost 1 vext2 RHS, RHS + 2644954696U, // 
<6,7,5,0>: Cost 3 vext2 RHS, <5,0,1,2> + 1573203664U, // <6,7,5,1>: Cost 2 vext2 RHS, <5,1,7,3> + 2644954878U, // <6,7,5,2>: Cost 3 vext2 RHS, <5,2,3,4> + 2644954991U, // <6,7,5,3>: Cost 3 vext2 RHS, <5,3,7,0> + 1571213254U, // <6,7,5,4>: Cost 2 vext2 RHS, <5,4,7,6> + 1571213316U, // <6,7,5,5>: Cost 2 vext2 RHS, <5,5,5,5> + 1571213410U, // <6,7,5,6>: Cost 2 vext2 RHS, <5,6,7,0> + 1573204136U, // <6,7,5,7>: Cost 2 vext2 RHS, <5,7,5,7> + 1573204217U, // <6,7,5,u>: Cost 2 vext2 RHS, <5,u,5,7> + 2644955425U, // <6,7,6,0>: Cost 3 vext2 RHS, <6,0,1,2> + 2644955561U, // <6,7,6,1>: Cost 3 vext2 RHS, <6,1,7,3> + 1573204474U, // <6,7,6,2>: Cost 2 vext2 RHS, <6,2,7,3> + 2644955698U, // <6,7,6,3>: Cost 3 vext2 RHS, <6,3,4,5> + 2644955789U, // <6,7,6,4>: Cost 3 vext2 RHS, <6,4,5,6> + 2644955889U, // <6,7,6,5>: Cost 3 vext2 RHS, <6,5,7,7> + 1571214136U, // <6,7,6,6>: Cost 2 vext2 RHS, <6,6,6,6> + 1571214158U, // <6,7,6,7>: Cost 2 vext2 RHS, <6,7,0,1> + 1573204895U, // <6,7,6,u>: Cost 2 vext2 RHS, <6,u,0,1> + 1573204986U, // <6,7,7,0>: Cost 2 vext2 RHS, <7,0,1,2> + 2572608656U, // <6,7,7,1>: Cost 3 vext1 <3,6,7,7>, <1,5,3,7> + 2644956362U, // <6,7,7,2>: Cost 3 vext2 RHS, <7,2,6,3> + 2572610231U, // <6,7,7,3>: Cost 3 vext1 <3,6,7,7>, <3,6,7,7> + 1573205350U, // <6,7,7,4>: Cost 2 vext2 RHS, <7,4,5,6> + 2646947220U, // <6,7,7,5>: Cost 3 vext2 RHS, <7,5,1,7> + 1516786498U, // <6,7,7,6>: Cost 2 vext1 <6,6,7,7>, <6,6,7,7> + 1571214956U, // <6,7,7,7>: Cost 2 vext2 RHS, <7,7,7,7> + 1573205634U, // <6,7,7,u>: Cost 2 vext2 RHS, <7,u,1,2> + 1571215059U, // <6,7,u,0>: Cost 2 vext2 RHS, + 497473326U, // <6,7,u,1>: Cost 1 vext2 RHS, LHS + 1571215237U, // <6,7,u,2>: Cost 2 vext2 RHS, + 1571215292U, // <6,7,u,3>: Cost 2 vext2 RHS, + 1571215423U, // <6,7,u,4>: Cost 2 vext2 RHS, + 497473690U, // <6,7,u,5>: Cost 1 vext2 RHS, RHS + 1571215568U, // <6,7,u,6>: Cost 2 vext2 RHS, + 1573206272U, // <6,7,u,7>: Cost 2 vext2 RHS, + 497473893U, // <6,7,u,u>: Cost 1 vext2 RHS, LHS + 1571217408U, // <6,u,0,0>: Cost 2 vext2 RHS, <0,0,0,0> + 497475686U, // <6,u,0,1>: Cost 1 vext2 RHS, LHS + 1571217572U, // <6,u,0,2>: Cost 2 vext2 RHS, <0,2,0,2> + 2689865445U, // <6,u,0,3>: Cost 3 vext3 <0,u,2,6>, + 1571217746U, // <6,u,0,4>: Cost 2 vext2 RHS, <0,4,1,5> + 1510830187U, // <6,u,0,5>: Cost 2 vext1 <5,6,u,0>, <5,6,u,0> + 2644959734U, // <6,u,0,6>: Cost 3 vext2 RHS, <0,6,1,7> + 1193130221U, // <6,u,0,7>: Cost 2 vrev + 497476253U, // <6,u,0,u>: Cost 1 vext2 RHS, LHS + 1571218166U, // <6,u,1,0>: Cost 2 vext2 RHS, <1,0,3,2> + 1571218228U, // <6,u,1,1>: Cost 2 vext2 RHS, <1,1,1,1> + 1612289838U, // <6,u,1,2>: Cost 2 vext3 <0,2,4,6>, LHS + 1571218392U, // <6,u,1,3>: Cost 2 vext2 RHS, <1,3,1,3> + 2566663478U, // <6,u,1,4>: Cost 3 vext1 <2,6,u,1>, RHS + 1571218576U, // <6,u,1,5>: Cost 2 vext2 RHS, <1,5,3,7> + 2644960463U, // <6,u,1,6>: Cost 3 vext2 RHS, <1,6,1,7> + 2717439835U, // <6,u,1,7>: Cost 3 vext3 <5,4,7,6>, + 1612289892U, // <6,u,1,u>: Cost 2 vext3 <0,2,4,6>, LHS + 1504870502U, // <6,u,2,0>: Cost 2 vext1 <4,6,u,2>, LHS + 2644960774U, // <6,u,2,1>: Cost 3 vext2 RHS, <2,1,0,3> + 1571219048U, // <6,u,2,2>: Cost 2 vext2 RHS, <2,2,2,2> + 1571219110U, // <6,u,2,3>: Cost 2 vext2 RHS, <2,3,0,1> + 1504873782U, // <6,u,2,4>: Cost 2 vext1 <4,6,u,2>, RHS + 2633017221U, // <6,u,2,5>: Cost 3 vext2 <2,5,6,u>, <2,5,6,u> + 1571219386U, // <6,u,2,6>: Cost 2 vext2 RHS, <2,6,3,7> + 2712573868U, // <6,u,2,7>: Cost 3 vext3 <4,6,4,6>, + 1571219515U, // <6,u,2,u>: Cost 2 vext2 RHS, <2,u,0,1> + 1571219606U, // <6,u,3,0>: Cost 2 vext2 RHS, <3,0,1,2> + 2644961503U, // 
<6,u,3,1>: Cost 3 vext2 RHS, <3,1,0,3> + 2566678499U, // <6,u,3,2>: Cost 3 vext1 <2,6,u,3>, <2,6,u,3> + 1571219868U, // <6,u,3,3>: Cost 2 vext2 RHS, <3,3,3,3> + 1571219970U, // <6,u,3,4>: Cost 2 vext2 RHS, <3,4,5,6> + 2689865711U, // <6,u,3,5>: Cost 3 vext3 <0,u,2,6>, + 2708002806U, // <6,u,3,6>: Cost 3 vext3 <3,u,5,6>, + 2644961987U, // <6,u,3,7>: Cost 3 vext2 RHS, <3,7,0,1> + 1571220254U, // <6,u,3,u>: Cost 2 vext2 RHS, <3,u,1,2> + 1571220370U, // <6,u,4,0>: Cost 2 vext2 RHS, <4,0,5,1> + 2644962250U, // <6,u,4,1>: Cost 3 vext2 RHS, <4,1,2,3> + 1661245476U, // <6,u,4,2>: Cost 2 vext3 , + 2686031917U, // <6,u,4,3>: Cost 3 vext3 <0,2,4,6>, + 1571220688U, // <6,u,4,4>: Cost 2 vext2 RHS, <4,4,4,4> + 497478967U, // <6,u,4,5>: Cost 1 vext2 RHS, RHS + 1571220852U, // <6,u,4,6>: Cost 2 vext2 RHS, <4,6,4,6> + 1661614161U, // <6,u,4,7>: Cost 2 vext3 , + 497479209U, // <6,u,4,u>: Cost 1 vext2 RHS, RHS + 2566692966U, // <6,u,5,0>: Cost 3 vext1 <2,6,u,5>, LHS + 1571221200U, // <6,u,5,1>: Cost 2 vext2 RHS, <5,1,7,3> + 2566694885U, // <6,u,5,2>: Cost 3 vext1 <2,6,u,5>, <2,6,u,5> + 2689865855U, // <6,u,5,3>: Cost 3 vext3 <0,u,2,6>, + 1571221446U, // <6,u,5,4>: Cost 2 vext2 RHS, <5,4,7,6> + 1571221508U, // <6,u,5,5>: Cost 2 vext2 RHS, <5,5,5,5> + 1612290202U, // <6,u,5,6>: Cost 2 vext3 <0,2,4,6>, RHS + 1571221672U, // <6,u,5,7>: Cost 2 vext2 RHS, <5,7,5,7> + 1612290220U, // <6,u,5,u>: Cost 2 vext3 <0,2,4,6>, RHS + 1504903270U, // <6,u,6,0>: Cost 2 vext1 <4,6,u,6>, LHS + 2644963752U, // <6,u,6,1>: Cost 3 vext2 RHS, <6,1,7,2> + 1571222010U, // <6,u,6,2>: Cost 2 vext2 RHS, <6,2,7,3> + 2686032080U, // <6,u,6,3>: Cost 3 vext3 <0,2,4,6>, + 1504906550U, // <6,u,6,4>: Cost 2 vext1 <4,6,u,6>, RHS + 2644964079U, // <6,u,6,5>: Cost 3 vext2 RHS, <6,5,7,5> + 296144182U, // <6,u,6,6>: Cost 1 vdup2 RHS + 1571222350U, // <6,u,6,7>: Cost 2 vext2 RHS, <6,7,0,1> + 296144182U, // <6,u,6,u>: Cost 1 vdup2 RHS + 1492967526U, // <6,u,7,0>: Cost 2 vext1 <2,6,u,7>, LHS + 2560738574U, // <6,u,7,1>: Cost 3 vext1 <1,6,u,7>, <1,6,u,7> + 1492969447U, // <6,u,7,2>: Cost 2 vext1 <2,6,u,7>, <2,6,u,7> + 1906753692U, // <6,u,7,3>: Cost 2 vzipr RHS, LHS + 1492970806U, // <6,u,7,4>: Cost 2 vext1 <2,6,u,7>, RHS + 2980495761U, // <6,u,7,5>: Cost 3 vzipr RHS, <0,4,u,5> + 1516860235U, // <6,u,7,6>: Cost 2 vext1 <6,6,u,7>, <6,6,u,7> + 1906756936U, // <6,u,7,7>: Cost 2 vzipr RHS, RHS + 1492973358U, // <6,u,7,u>: Cost 2 vext1 <2,6,u,7>, LHS + 1492975718U, // <6,u,u,0>: Cost 2 vext1 <2,6,u,u>, LHS + 497481518U, // <6,u,u,1>: Cost 1 vext2 RHS, LHS + 1612290405U, // <6,u,u,2>: Cost 2 vext3 <0,2,4,6>, LHS + 1571223484U, // <6,u,u,3>: Cost 2 vext2 RHS, + 1492978998U, // <6,u,u,4>: Cost 2 vext1 <2,6,u,u>, RHS + 497481882U, // <6,u,u,5>: Cost 1 vext2 RHS, RHS + 296144182U, // <6,u,u,6>: Cost 1 vdup2 RHS + 1906765128U, // <6,u,u,7>: Cost 2 vzipr RHS, RHS + 497482085U, // <6,u,u,u>: Cost 1 vext2 RHS, LHS + 1638318080U, // <7,0,0,0>: Cost 2 vext3 RHS, <0,0,0,0> + 1638318090U, // <7,0,0,1>: Cost 2 vext3 RHS, <0,0,1,1> + 1638318100U, // <7,0,0,2>: Cost 2 vext3 RHS, <0,0,2,2> + 3646442178U, // <7,0,0,3>: Cost 4 vext1 <3,7,0,0>, <3,7,0,0> + 2712059941U, // <7,0,0,4>: Cost 3 vext3 RHS, <0,0,4,1> + 2651603364U, // <7,0,0,5>: Cost 3 vext2 <5,6,7,0>, <0,5,1,6> + 2590618445U, // <7,0,0,6>: Cost 3 vext1 <6,7,0,0>, <6,7,0,0> + 3785801798U, // <7,0,0,7>: Cost 4 vext3 RHS, <0,0,7,7> + 1638318153U, // <7,0,0,u>: Cost 2 vext3 RHS, <0,0,u,1> + 1516879974U, // <7,0,1,0>: Cost 2 vext1 <6,7,0,1>, LHS + 2693922911U, // <7,0,1,1>: Cost 3 vext3 <1,5,3,7>, <0,1,1,5> + 564576358U, 
// <7,0,1,2>: Cost 1 vext3 RHS, LHS + 2638996480U, // <7,0,1,3>: Cost 3 vext2 <3,5,7,0>, <1,3,5,7> + 1516883254U, // <7,0,1,4>: Cost 2 vext1 <6,7,0,1>, RHS + 2649613456U, // <7,0,1,5>: Cost 3 vext2 <5,3,7,0>, <1,5,3,7> + 1516884814U, // <7,0,1,6>: Cost 2 vext1 <6,7,0,1>, <6,7,0,1> + 2590626808U, // <7,0,1,7>: Cost 3 vext1 <6,7,0,1>, <7,0,1,0> + 564576412U, // <7,0,1,u>: Cost 1 vext3 RHS, LHS + 1638318244U, // <7,0,2,0>: Cost 2 vext3 RHS, <0,2,0,2> + 2692743344U, // <7,0,2,1>: Cost 3 vext3 <1,3,5,7>, <0,2,1,5> + 2712060084U, // <7,0,2,2>: Cost 3 vext3 RHS, <0,2,2,0> + 2712060094U, // <7,0,2,3>: Cost 3 vext3 RHS, <0,2,3,1> + 1638318284U, // <7,0,2,4>: Cost 2 vext3 RHS, <0,2,4,6> + 2712060118U, // <7,0,2,5>: Cost 3 vext3 RHS, <0,2,5,7> + 2651604922U, // <7,0,2,6>: Cost 3 vext2 <5,6,7,0>, <2,6,3,7> + 2686255336U, // <7,0,2,7>: Cost 3 vext3 <0,2,7,7>, <0,2,7,7> + 1638318316U, // <7,0,2,u>: Cost 2 vext3 RHS, <0,2,u,2> + 2651605142U, // <7,0,3,0>: Cost 3 vext2 <5,6,7,0>, <3,0,1,2> + 2712060156U, // <7,0,3,1>: Cost 3 vext3 RHS, <0,3,1,0> + 2712060165U, // <7,0,3,2>: Cost 3 vext3 RHS, <0,3,2,0> + 2651605404U, // <7,0,3,3>: Cost 3 vext2 <5,6,7,0>, <3,3,3,3> + 2651605506U, // <7,0,3,4>: Cost 3 vext2 <5,6,7,0>, <3,4,5,6> + 2638998111U, // <7,0,3,5>: Cost 3 vext2 <3,5,7,0>, <3,5,7,0> + 2639661744U, // <7,0,3,6>: Cost 3 vext2 <3,6,7,0>, <3,6,7,0> + 3712740068U, // <7,0,3,7>: Cost 4 vext2 <3,5,7,0>, <3,7,3,7> + 2640989010U, // <7,0,3,u>: Cost 3 vext2 <3,u,7,0>, <3,u,7,0> + 2712060232U, // <7,0,4,0>: Cost 3 vext3 RHS, <0,4,0,4> + 1638318418U, // <7,0,4,1>: Cost 2 vext3 RHS, <0,4,1,5> + 1638318428U, // <7,0,4,2>: Cost 2 vext3 RHS, <0,4,2,6> + 3646474950U, // <7,0,4,3>: Cost 4 vext1 <3,7,0,4>, <3,7,0,4> + 2712060270U, // <7,0,4,4>: Cost 3 vext3 RHS, <0,4,4,6> + 1577864502U, // <7,0,4,5>: Cost 2 vext2 <5,6,7,0>, RHS + 2651606388U, // <7,0,4,6>: Cost 3 vext2 <5,6,7,0>, <4,6,4,6> + 3787792776U, // <7,0,4,7>: Cost 4 vext3 RHS, <0,4,7,5> + 1638318481U, // <7,0,4,u>: Cost 2 vext3 RHS, <0,4,u,5> + 2590654566U, // <7,0,5,0>: Cost 3 vext1 <6,7,0,5>, LHS + 2651606736U, // <7,0,5,1>: Cost 3 vext2 <5,6,7,0>, <5,1,7,3> + 2712060334U, // <7,0,5,2>: Cost 3 vext3 RHS, <0,5,2,7> + 2649616239U, // <7,0,5,3>: Cost 3 vext2 <5,3,7,0>, <5,3,7,0> + 2651606982U, // <7,0,5,4>: Cost 3 vext2 <5,6,7,0>, <5,4,7,6> + 2651607044U, // <7,0,5,5>: Cost 3 vext2 <5,6,7,0>, <5,5,5,5> + 1577865314U, // <7,0,5,6>: Cost 2 vext2 <5,6,7,0>, <5,6,7,0> + 2651607208U, // <7,0,5,7>: Cost 3 vext2 <5,6,7,0>, <5,7,5,7> + 1579192580U, // <7,0,5,u>: Cost 2 vext2 <5,u,7,0>, <5,u,7,0> + 2688393709U, // <7,0,6,0>: Cost 3 vext3 <0,6,0,7>, <0,6,0,7> + 2712060406U, // <7,0,6,1>: Cost 3 vext3 RHS, <0,6,1,7> + 2688541183U, // <7,0,6,2>: Cost 3 vext3 <0,6,2,7>, <0,6,2,7> + 2655588936U, // <7,0,6,3>: Cost 3 vext2 <6,3,7,0>, <6,3,7,0> + 3762430481U, // <7,0,6,4>: Cost 4 vext3 <0,6,4,7>, <0,6,4,7> + 2651607730U, // <7,0,6,5>: Cost 3 vext2 <5,6,7,0>, <6,5,0,7> + 2651607864U, // <7,0,6,6>: Cost 3 vext2 <5,6,7,0>, <6,6,6,6> + 2651607886U, // <7,0,6,7>: Cost 3 vext2 <5,6,7,0>, <6,7,0,1> + 2688983605U, // <7,0,6,u>: Cost 3 vext3 <0,6,u,7>, <0,6,u,7> + 2651608058U, // <7,0,7,0>: Cost 3 vext2 <5,6,7,0>, <7,0,1,2> + 2932703334U, // <7,0,7,1>: Cost 3 vzipl <7,7,7,7>, LHS + 3066921062U, // <7,0,7,2>: Cost 3 vtrnl <7,7,7,7>, LHS + 3712742678U, // <7,0,7,3>: Cost 4 vext2 <3,5,7,0>, <7,3,5,7> + 2651608422U, // <7,0,7,4>: Cost 3 vext2 <5,6,7,0>, <7,4,5,6> + 2651608513U, // <7,0,7,5>: Cost 3 vext2 <5,6,7,0>, <7,5,6,7> + 2663552532U, // <7,0,7,6>: Cost 3 vext2 <7,6,7,0>, <7,6,7,0> + 
2651608684U, // <7,0,7,7>: Cost 3 vext2 <5,6,7,0>, <7,7,7,7> + 2651608706U, // <7,0,7,u>: Cost 3 vext2 <5,6,7,0>, <7,u,1,2> + 1638318730U, // <7,0,u,0>: Cost 2 vext3 RHS, <0,u,0,2> + 1638318738U, // <7,0,u,1>: Cost 2 vext3 RHS, <0,u,1,1> + 564576925U, // <7,0,u,2>: Cost 1 vext3 RHS, LHS + 2572765898U, // <7,0,u,3>: Cost 3 vext1 <3,7,0,u>, <3,7,0,u> + 1638318770U, // <7,0,u,4>: Cost 2 vext3 RHS, <0,u,4,6> + 1577867418U, // <7,0,u,5>: Cost 2 vext2 <5,6,7,0>, RHS + 1516942165U, // <7,0,u,6>: Cost 2 vext1 <6,7,0,u>, <6,7,0,u> + 2651609344U, // <7,0,u,7>: Cost 3 vext2 <5,6,7,0>, + 564576979U, // <7,0,u,u>: Cost 1 vext3 RHS, LHS + 2590687334U, // <7,1,0,0>: Cost 3 vext1 <6,7,1,0>, LHS + 2639003750U, // <7,1,0,1>: Cost 3 vext2 <3,5,7,1>, LHS + 2793357414U, // <7,1,0,2>: Cost 3 vuzpl <7,0,1,2>, LHS + 1638318838U, // <7,1,0,3>: Cost 2 vext3 RHS, <1,0,3,2> + 2590690614U, // <7,1,0,4>: Cost 3 vext1 <6,7,1,0>, RHS + 2712060679U, // <7,1,0,5>: Cost 3 vext3 RHS, <1,0,5,1> + 2590692182U, // <7,1,0,6>: Cost 3 vext1 <6,7,1,0>, <6,7,1,0> + 3785802521U, // <7,1,0,7>: Cost 4 vext3 RHS, <1,0,7,1> + 1638318883U, // <7,1,0,u>: Cost 2 vext3 RHS, <1,0,u,2> + 2712060715U, // <7,1,1,0>: Cost 3 vext3 RHS, <1,1,0,1> + 1638318900U, // <7,1,1,1>: Cost 2 vext3 RHS, <1,1,1,1> + 3774300994U, // <7,1,1,2>: Cost 4 vext3 <2,6,3,7>, <1,1,2,6> + 1638318920U, // <7,1,1,3>: Cost 2 vext3 RHS, <1,1,3,3> + 2712060755U, // <7,1,1,4>: Cost 3 vext3 RHS, <1,1,4,5> + 2691416926U, // <7,1,1,5>: Cost 3 vext3 <1,1,5,7>, <1,1,5,7> + 2590700375U, // <7,1,1,6>: Cost 3 vext1 <6,7,1,1>, <6,7,1,1> + 3765158766U, // <7,1,1,7>: Cost 4 vext3 <1,1,5,7>, <1,1,7,5> + 1638318965U, // <7,1,1,u>: Cost 2 vext3 RHS, <1,1,u,3> + 2712060796U, // <7,1,2,0>: Cost 3 vext3 RHS, <1,2,0,1> + 2712060807U, // <7,1,2,1>: Cost 3 vext3 RHS, <1,2,1,3> + 3712747112U, // <7,1,2,2>: Cost 4 vext2 <3,5,7,1>, <2,2,2,2> + 1638318998U, // <7,1,2,3>: Cost 2 vext3 RHS, <1,2,3,0> + 2712060836U, // <7,1,2,4>: Cost 3 vext3 RHS, <1,2,4,5> + 2712060843U, // <7,1,2,5>: Cost 3 vext3 RHS, <1,2,5,3> + 2590708568U, // <7,1,2,6>: Cost 3 vext1 <6,7,1,2>, <6,7,1,2> + 2735948730U, // <7,1,2,7>: Cost 3 vext3 RHS, <1,2,7,0> + 1638319043U, // <7,1,2,u>: Cost 2 vext3 RHS, <1,2,u,0> + 2712060876U, // <7,1,3,0>: Cost 3 vext3 RHS, <1,3,0,0> + 1638319064U, // <7,1,3,1>: Cost 2 vext3 RHS, <1,3,1,3> + 2712060894U, // <7,1,3,2>: Cost 3 vext3 RHS, <1,3,2,0> + 2692596718U, // <7,1,3,3>: Cost 3 vext3 <1,3,3,7>, <1,3,3,7> + 2712060917U, // <7,1,3,4>: Cost 3 vext3 RHS, <1,3,4,5> + 1619002368U, // <7,1,3,5>: Cost 2 vext3 <1,3,5,7>, <1,3,5,7> + 2692817929U, // <7,1,3,6>: Cost 3 vext3 <1,3,6,7>, <1,3,6,7> + 2735948814U, // <7,1,3,7>: Cost 3 vext3 RHS, <1,3,7,3> + 1619223579U, // <7,1,3,u>: Cost 2 vext3 <1,3,u,7>, <1,3,u,7> + 2712060962U, // <7,1,4,0>: Cost 3 vext3 RHS, <1,4,0,5> + 2712060971U, // <7,1,4,1>: Cost 3 vext3 RHS, <1,4,1,5> + 2712060980U, // <7,1,4,2>: Cost 3 vext3 RHS, <1,4,2,5> + 2712060989U, // <7,1,4,3>: Cost 3 vext3 RHS, <1,4,3,5> + 3785802822U, // <7,1,4,4>: Cost 4 vext3 RHS, <1,4,4,5> + 2639007030U, // <7,1,4,5>: Cost 3 vext2 <3,5,7,1>, RHS + 2645642634U, // <7,1,4,6>: Cost 3 vext2 <4,6,7,1>, <4,6,7,1> + 3719384520U, // <7,1,4,7>: Cost 4 vext2 <4,6,7,1>, <4,7,5,0> + 2639007273U, // <7,1,4,u>: Cost 3 vext2 <3,5,7,1>, RHS + 2572812390U, // <7,1,5,0>: Cost 3 vext1 <3,7,1,5>, LHS + 2693776510U, // <7,1,5,1>: Cost 3 vext3 <1,5,1,7>, <1,5,1,7> + 3774301318U, // <7,1,5,2>: Cost 4 vext3 <2,6,3,7>, <1,5,2,6> + 1620182160U, // <7,1,5,3>: Cost 2 vext3 <1,5,3,7>, <1,5,3,7> + 2572815670U, // <7,1,5,4>: Cost 
3 vext1 <3,7,1,5>, RHS + 3766486178U, // <7,1,5,5>: Cost 4 vext3 <1,3,5,7>, <1,5,5,7> + 2651615331U, // <7,1,5,6>: Cost 3 vext2 <5,6,7,1>, <5,6,7,1> + 2652278964U, // <7,1,5,7>: Cost 3 vext2 <5,7,7,1>, <5,7,7,1> + 1620550845U, // <7,1,5,u>: Cost 2 vext3 <1,5,u,7>, <1,5,u,7> + 3768108230U, // <7,1,6,0>: Cost 4 vext3 <1,6,0,7>, <1,6,0,7> + 2694440143U, // <7,1,6,1>: Cost 3 vext3 <1,6,1,7>, <1,6,1,7> + 2712061144U, // <7,1,6,2>: Cost 3 vext3 RHS, <1,6,2,7> + 2694587617U, // <7,1,6,3>: Cost 3 vext3 <1,6,3,7>, <1,6,3,7> + 3768403178U, // <7,1,6,4>: Cost 4 vext3 <1,6,4,7>, <1,6,4,7> + 2694735091U, // <7,1,6,5>: Cost 3 vext3 <1,6,5,7>, <1,6,5,7> + 3768550652U, // <7,1,6,6>: Cost 4 vext3 <1,6,6,7>, <1,6,6,7> + 2652279630U, // <7,1,6,7>: Cost 3 vext2 <5,7,7,1>, <6,7,0,1> + 2694956302U, // <7,1,6,u>: Cost 3 vext3 <1,6,u,7>, <1,6,u,7> + 2645644282U, // <7,1,7,0>: Cost 3 vext2 <4,6,7,1>, <7,0,1,2> + 2859062094U, // <7,1,7,1>: Cost 3 vuzpr <6,7,0,1>, <6,7,0,1> + 3779462437U, // <7,1,7,2>: Cost 4 vext3 <3,5,1,7>, <1,7,2,3> + 3121938534U, // <7,1,7,3>: Cost 3 vtrnr <5,7,5,7>, LHS + 2554916150U, // <7,1,7,4>: Cost 3 vext1 <0,7,1,7>, RHS + 3769140548U, // <7,1,7,5>: Cost 4 vext3 <1,7,5,7>, <1,7,5,7> + 3726022164U, // <7,1,7,6>: Cost 4 vext2 <5,7,7,1>, <7,6,7,0> + 2554918508U, // <7,1,7,7>: Cost 3 vext1 <0,7,1,7>, <7,7,7,7> + 3121938539U, // <7,1,7,u>: Cost 3 vtrnr <5,7,5,7>, LHS + 2572836966U, // <7,1,u,0>: Cost 3 vext1 <3,7,1,u>, LHS + 1638319469U, // <7,1,u,1>: Cost 2 vext3 RHS, <1,u,1,3> + 2712061299U, // <7,1,u,2>: Cost 3 vext3 RHS, <1,u,2,0> + 1622173059U, // <7,1,u,3>: Cost 2 vext3 <1,u,3,7>, <1,u,3,7> + 2572840246U, // <7,1,u,4>: Cost 3 vext1 <3,7,1,u>, RHS + 1622320533U, // <7,1,u,5>: Cost 2 vext3 <1,u,5,7>, <1,u,5,7> + 2696136094U, // <7,1,u,6>: Cost 3 vext3 <1,u,6,7>, <1,u,6,7> + 2859060777U, // <7,1,u,7>: Cost 3 vuzpr <6,7,0,1>, RHS + 1622541744U, // <7,1,u,u>: Cost 2 vext3 <1,u,u,7>, <1,u,u,7> + 2712061364U, // <7,2,0,0>: Cost 3 vext3 RHS, <2,0,0,2> + 2712061373U, // <7,2,0,1>: Cost 3 vext3 RHS, <2,0,1,2> + 2712061380U, // <7,2,0,2>: Cost 3 vext3 RHS, <2,0,2,0> + 2712061389U, // <7,2,0,3>: Cost 3 vext3 RHS, <2,0,3,0> + 2712061404U, // <7,2,0,4>: Cost 3 vext3 RHS, <2,0,4,6> + 2696725990U, // <7,2,0,5>: Cost 3 vext3 <2,0,5,7>, <2,0,5,7> + 2712061417U, // <7,2,0,6>: Cost 3 vext3 RHS, <2,0,6,1> + 3785803251U, // <7,2,0,7>: Cost 4 vext3 RHS, <2,0,7,2> + 2696947201U, // <7,2,0,u>: Cost 3 vext3 <2,0,u,7>, <2,0,u,7> + 2712061446U, // <7,2,1,0>: Cost 3 vext3 RHS, <2,1,0,3> + 3785803276U, // <7,2,1,1>: Cost 4 vext3 RHS, <2,1,1,0> + 3785803285U, // <7,2,1,2>: Cost 4 vext3 RHS, <2,1,2,0> + 2712061471U, // <7,2,1,3>: Cost 3 vext3 RHS, <2,1,3,1> + 2712061482U, // <7,2,1,4>: Cost 3 vext3 RHS, <2,1,4,3> + 3766486576U, // <7,2,1,5>: Cost 4 vext3 <1,3,5,7>, <2,1,5,0> + 2712061500U, // <7,2,1,6>: Cost 3 vext3 RHS, <2,1,6,3> + 2602718850U, // <7,2,1,7>: Cost 3 vext1 , <7,u,1,2> + 2712061516U, // <7,2,1,u>: Cost 3 vext3 RHS, <2,1,u,1> + 2712061525U, // <7,2,2,0>: Cost 3 vext3 RHS, <2,2,0,1> + 2712061536U, // <7,2,2,1>: Cost 3 vext3 RHS, <2,2,1,3> + 1638319720U, // <7,2,2,2>: Cost 2 vext3 RHS, <2,2,2,2> + 1638319730U, // <7,2,2,3>: Cost 2 vext3 RHS, <2,2,3,3> + 2712061565U, // <7,2,2,4>: Cost 3 vext3 RHS, <2,2,4,5> + 2698053256U, // <7,2,2,5>: Cost 3 vext3 <2,2,5,7>, <2,2,5,7> + 2712061584U, // <7,2,2,6>: Cost 3 vext3 RHS, <2,2,6,6> + 3771795096U, // <7,2,2,7>: Cost 4 vext3 <2,2,5,7>, <2,2,7,5> + 1638319775U, // <7,2,2,u>: Cost 2 vext3 RHS, <2,2,u,3> + 1638319782U, // <7,2,3,0>: Cost 2 vext3 RHS, <2,3,0,1> + 
2693924531U, // <7,2,3,1>: Cost 3 vext3 <1,5,3,7>, <2,3,1,5> + 2700560061U, // <7,2,3,2>: Cost 3 vext3 <2,6,3,7>, <2,3,2,6> + 2693924551U, // <7,2,3,3>: Cost 3 vext3 <1,5,3,7>, <2,3,3,7> + 1638319822U, // <7,2,3,4>: Cost 2 vext3 RHS, <2,3,4,5> + 2698716889U, // <7,2,3,5>: Cost 3 vext3 <2,3,5,7>, <2,3,5,7> + 2712061665U, // <7,2,3,6>: Cost 3 vext3 RHS, <2,3,6,6> + 2735949540U, // <7,2,3,7>: Cost 3 vext3 RHS, <2,3,7,0> + 1638319854U, // <7,2,3,u>: Cost 2 vext3 RHS, <2,3,u,1> + 2712061692U, // <7,2,4,0>: Cost 3 vext3 RHS, <2,4,0,6> + 2712061698U, // <7,2,4,1>: Cost 3 vext3 RHS, <2,4,1,3> + 2712061708U, // <7,2,4,2>: Cost 3 vext3 RHS, <2,4,2,4> + 2712061718U, // <7,2,4,3>: Cost 3 vext3 RHS, <2,4,3,5> + 2712061728U, // <7,2,4,4>: Cost 3 vext3 RHS, <2,4,4,6> + 2699380522U, // <7,2,4,5>: Cost 3 vext3 <2,4,5,7>, <2,4,5,7> + 2712061740U, // <7,2,4,6>: Cost 3 vext3 RHS, <2,4,6,0> + 3809691445U, // <7,2,4,7>: Cost 4 vext3 RHS, <2,4,7,0> + 2699601733U, // <7,2,4,u>: Cost 3 vext3 <2,4,u,7>, <2,4,u,7> + 2699675470U, // <7,2,5,0>: Cost 3 vext3 <2,5,0,7>, <2,5,0,7> + 3766486867U, // <7,2,5,1>: Cost 4 vext3 <1,3,5,7>, <2,5,1,3> + 2699822944U, // <7,2,5,2>: Cost 3 vext3 <2,5,2,7>, <2,5,2,7> + 2692745065U, // <7,2,5,3>: Cost 3 vext3 <1,3,5,7>, <2,5,3,7> + 2699970418U, // <7,2,5,4>: Cost 3 vext3 <2,5,4,7>, <2,5,4,7> + 3766486907U, // <7,2,5,5>: Cost 4 vext3 <1,3,5,7>, <2,5,5,7> + 2700117892U, // <7,2,5,6>: Cost 3 vext3 <2,5,6,7>, <2,5,6,7> + 3771795334U, // <7,2,5,7>: Cost 4 vext3 <2,2,5,7>, <2,5,7,0> + 2692745110U, // <7,2,5,u>: Cost 3 vext3 <1,3,5,7>, <2,5,u,7> + 2572894310U, // <7,2,6,0>: Cost 3 vext1 <3,7,2,6>, LHS + 2712061860U, // <7,2,6,1>: Cost 3 vext3 RHS, <2,6,1,3> + 2700486577U, // <7,2,6,2>: Cost 3 vext3 <2,6,2,7>, <2,6,2,7> + 1626818490U, // <7,2,6,3>: Cost 2 vext3 <2,6,3,7>, <2,6,3,7> + 2572897590U, // <7,2,6,4>: Cost 3 vext1 <3,7,2,6>, RHS + 2700707788U, // <7,2,6,5>: Cost 3 vext3 <2,6,5,7>, <2,6,5,7> + 2700781525U, // <7,2,6,6>: Cost 3 vext3 <2,6,6,7>, <2,6,6,7> + 3774597086U, // <7,2,6,7>: Cost 4 vext3 <2,6,7,7>, <2,6,7,7> + 1627187175U, // <7,2,6,u>: Cost 2 vext3 <2,6,u,7>, <2,6,u,7> + 2735949802U, // <7,2,7,0>: Cost 3 vext3 RHS, <2,7,0,1> + 3780200434U, // <7,2,7,1>: Cost 4 vext3 <3,6,2,7>, <2,7,1,0> + 3773564928U, // <7,2,7,2>: Cost 4 vext3 <2,5,2,7>, <2,7,2,5> + 2986541158U, // <7,2,7,3>: Cost 3 vzipr <5,5,7,7>, LHS + 2554989878U, // <7,2,7,4>: Cost 3 vext1 <0,7,2,7>, RHS + 3775113245U, // <7,2,7,5>: Cost 4 vext3 <2,7,5,7>, <2,7,5,7> + 4060283228U, // <7,2,7,6>: Cost 4 vzipr <5,5,7,7>, <0,4,2,6> + 2554992236U, // <7,2,7,7>: Cost 3 vext1 <0,7,2,7>, <7,7,7,7> + 2986541163U, // <7,2,7,u>: Cost 3 vzipr <5,5,7,7>, LHS + 1638320187U, // <7,2,u,0>: Cost 2 vext3 RHS, <2,u,0,1> + 2693924936U, // <7,2,u,1>: Cost 3 vext3 <1,5,3,7>, <2,u,1,5> + 1638319720U, // <7,2,u,2>: Cost 2 vext3 RHS, <2,2,2,2> + 1628145756U, // <7,2,u,3>: Cost 2 vext3 <2,u,3,7>, <2,u,3,7> + 1638320227U, // <7,2,u,4>: Cost 2 vext3 RHS, <2,u,4,5> + 2702035054U, // <7,2,u,5>: Cost 3 vext3 <2,u,5,7>, <2,u,5,7> + 2702108791U, // <7,2,u,6>: Cost 3 vext3 <2,u,6,7>, <2,u,6,7> + 2735949945U, // <7,2,u,7>: Cost 3 vext3 RHS, <2,u,7,0> + 1628514441U, // <7,2,u,u>: Cost 2 vext3 <2,u,u,7>, <2,u,u,7> + 2712062091U, // <7,3,0,0>: Cost 3 vext3 RHS, <3,0,0,0> + 1638320278U, // <7,3,0,1>: Cost 2 vext3 RHS, <3,0,1,2> + 2712062109U, // <7,3,0,2>: Cost 3 vext3 RHS, <3,0,2,0> + 2590836886U, // <7,3,0,3>: Cost 3 vext1 <6,7,3,0>, <3,0,1,2> + 2712062128U, // <7,3,0,4>: Cost 3 vext3 RHS, <3,0,4,1> + 2712062138U, // <7,3,0,5>: Cost 3 vext3 RHS, <3,0,5,2> + 
2590839656U, // <7,3,0,6>: Cost 3 vext1 <6,7,3,0>, <6,7,3,0> + 3311414017U, // <7,3,0,7>: Cost 4 vrev <3,7,7,0> + 1638320341U, // <7,3,0,u>: Cost 2 vext3 RHS, <3,0,u,2> + 2237164227U, // <7,3,1,0>: Cost 3 vrev <3,7,0,1> + 2712062182U, // <7,3,1,1>: Cost 3 vext3 RHS, <3,1,1,1> + 2712062193U, // <7,3,1,2>: Cost 3 vext3 RHS, <3,1,2,3> + 2692745468U, // <7,3,1,3>: Cost 3 vext3 <1,3,5,7>, <3,1,3,5> + 2712062214U, // <7,3,1,4>: Cost 3 vext3 RHS, <3,1,4,6> + 2693925132U, // <7,3,1,5>: Cost 3 vext3 <1,5,3,7>, <3,1,5,3> + 3768183059U, // <7,3,1,6>: Cost 4 vext3 <1,6,1,7>, <3,1,6,1> + 2692745504U, // <7,3,1,7>: Cost 3 vext3 <1,3,5,7>, <3,1,7,5> + 2696063273U, // <7,3,1,u>: Cost 3 vext3 <1,u,5,7>, <3,1,u,5> + 2712062254U, // <7,3,2,0>: Cost 3 vext3 RHS, <3,2,0,1> + 2712062262U, // <7,3,2,1>: Cost 3 vext3 RHS, <3,2,1,0> + 2712062273U, // <7,3,2,2>: Cost 3 vext3 RHS, <3,2,2,2> + 2712062280U, // <7,3,2,3>: Cost 3 vext3 RHS, <3,2,3,0> + 2712062294U, // <7,3,2,4>: Cost 3 vext3 RHS, <3,2,4,5> + 2712062302U, // <7,3,2,5>: Cost 3 vext3 RHS, <3,2,5,4> + 2700560742U, // <7,3,2,6>: Cost 3 vext3 <2,6,3,7>, <3,2,6,3> + 2712062319U, // <7,3,2,7>: Cost 3 vext3 RHS, <3,2,7,3> + 2712062325U, // <7,3,2,u>: Cost 3 vext3 RHS, <3,2,u,0> + 2712062335U, // <7,3,3,0>: Cost 3 vext3 RHS, <3,3,0,1> + 2636368158U, // <7,3,3,1>: Cost 3 vext2 <3,1,7,3>, <3,1,7,3> + 2637031791U, // <7,3,3,2>: Cost 3 vext2 <3,2,7,3>, <3,2,7,3> + 1638320540U, // <7,3,3,3>: Cost 2 vext3 RHS, <3,3,3,3> + 2712062374U, // <7,3,3,4>: Cost 3 vext3 RHS, <3,3,4,4> + 2704689586U, // <7,3,3,5>: Cost 3 vext3 <3,3,5,7>, <3,3,5,7> + 2590864235U, // <7,3,3,6>: Cost 3 vext1 <6,7,3,3>, <6,7,3,3> + 2704837060U, // <7,3,3,7>: Cost 3 vext3 <3,3,7,7>, <3,3,7,7> + 1638320540U, // <7,3,3,u>: Cost 2 vext3 RHS, <3,3,3,3> + 2712062416U, // <7,3,4,0>: Cost 3 vext3 RHS, <3,4,0,1> + 2712062426U, // <7,3,4,1>: Cost 3 vext3 RHS, <3,4,1,2> + 2566981640U, // <7,3,4,2>: Cost 3 vext1 <2,7,3,4>, <2,7,3,4> + 2712062447U, // <7,3,4,3>: Cost 3 vext3 RHS, <3,4,3,5> + 2712062456U, // <7,3,4,4>: Cost 3 vext3 RHS, <3,4,4,5> + 1638320642U, // <7,3,4,5>: Cost 2 vext3 RHS, <3,4,5,6> + 2648313204U, // <7,3,4,6>: Cost 3 vext2 <5,1,7,3>, <4,6,4,6> + 3311446789U, // <7,3,4,7>: Cost 4 vrev <3,7,7,4> + 1638320669U, // <7,3,4,u>: Cost 2 vext3 RHS, <3,4,u,6> + 2602819686U, // <7,3,5,0>: Cost 3 vext1 , LHS + 1574571728U, // <7,3,5,1>: Cost 2 vext2 <5,1,7,3>, <5,1,7,3> + 2648977185U, // <7,3,5,2>: Cost 3 vext2 <5,2,7,3>, <5,2,7,3> + 2705869378U, // <7,3,5,3>: Cost 3 vext3 <3,5,3,7>, <3,5,3,7> + 2237491947U, // <7,3,5,4>: Cost 3 vrev <3,7,4,5> + 2706016852U, // <7,3,5,5>: Cost 3 vext3 <3,5,5,7>, <3,5,5,7> + 2648313954U, // <7,3,5,6>: Cost 3 vext2 <5,1,7,3>, <5,6,7,0> + 2692745823U, // <7,3,5,7>: Cost 3 vext3 <1,3,5,7>, <3,5,7,0> + 1579217159U, // <7,3,5,u>: Cost 2 vext2 <5,u,7,3>, <5,u,7,3> + 2706311800U, // <7,3,6,0>: Cost 3 vext3 <3,6,0,7>, <3,6,0,7> + 2654286249U, // <7,3,6,1>: Cost 3 vext2 <6,1,7,3>, <6,1,7,3> + 1581208058U, // <7,3,6,2>: Cost 2 vext2 <6,2,7,3>, <6,2,7,3> + 2706533011U, // <7,3,6,3>: Cost 3 vext3 <3,6,3,7>, <3,6,3,7> + 2706606748U, // <7,3,6,4>: Cost 3 vext3 <3,6,4,7>, <3,6,4,7> + 3780422309U, // <7,3,6,5>: Cost 4 vext3 <3,6,5,7>, <3,6,5,7> + 2712062637U, // <7,3,6,6>: Cost 3 vext3 RHS, <3,6,6,6> + 2706827959U, // <7,3,6,7>: Cost 3 vext3 <3,6,7,7>, <3,6,7,7> + 1585189856U, // <7,3,6,u>: Cost 2 vext2 <6,u,7,3>, <6,u,7,3> + 2693925571U, // <7,3,7,0>: Cost 3 vext3 <1,5,3,7>, <3,7,0,1> + 2693925584U, // <7,3,7,1>: Cost 3 vext3 <1,5,3,7>, <3,7,1,5> + 2700561114U, // <7,3,7,2>: Cost 3 
vext3 <2,6,3,7>, <3,7,2,6> + 2572978916U, // <7,3,7,3>: Cost 3 vext1 <3,7,3,7>, <3,7,3,7> + 2693925611U, // <7,3,7,4>: Cost 3 vext3 <1,5,3,7>, <3,7,4,5> + 2707344118U, // <7,3,7,5>: Cost 3 vext3 <3,7,5,7>, <3,7,5,7> + 2654950894U, // <7,3,7,6>: Cost 3 vext2 <6,2,7,3>, <7,6,2,7> + 2648315500U, // <7,3,7,7>: Cost 3 vext2 <5,1,7,3>, <7,7,7,7> + 2693925643U, // <7,3,7,u>: Cost 3 vext3 <1,5,3,7>, <3,7,u,1> + 2237221578U, // <7,3,u,0>: Cost 3 vrev <3,7,0,u> + 1638320926U, // <7,3,u,1>: Cost 2 vext3 RHS, <3,u,1,2> + 1593153452U, // <7,3,u,2>: Cost 2 vext2 , + 1638320540U, // <7,3,u,3>: Cost 2 vext3 RHS, <3,3,3,3> + 2237516526U, // <7,3,u,4>: Cost 3 vrev <3,7,4,u> + 1638320966U, // <7,3,u,5>: Cost 2 vext3 RHS, <3,u,5,6> + 2712062796U, // <7,3,u,6>: Cost 3 vext3 RHS, <3,u,6,3> + 2692967250U, // <7,3,u,7>: Cost 3 vext3 <1,3,u,7>, <3,u,7,0> + 1638320989U, // <7,3,u,u>: Cost 2 vext3 RHS, <3,u,u,2> + 2651635712U, // <7,4,0,0>: Cost 3 vext2 <5,6,7,4>, <0,0,0,0> + 1577893990U, // <7,4,0,1>: Cost 2 vext2 <5,6,7,4>, LHS + 2651635876U, // <7,4,0,2>: Cost 3 vext2 <5,6,7,4>, <0,2,0,2> + 3785804672U, // <7,4,0,3>: Cost 4 vext3 RHS, <4,0,3,1> + 2651636050U, // <7,4,0,4>: Cost 3 vext2 <5,6,7,4>, <0,4,1,5> + 1638468498U, // <7,4,0,5>: Cost 2 vext3 RHS, <4,0,5,1> + 1638468508U, // <7,4,0,6>: Cost 2 vext3 RHS, <4,0,6,2> + 3787795364U, // <7,4,0,7>: Cost 4 vext3 RHS, <4,0,7,1> + 1640459181U, // <7,4,0,u>: Cost 2 vext3 RHS, <4,0,u,1> + 2651636470U, // <7,4,1,0>: Cost 3 vext2 <5,6,7,4>, <1,0,3,2> + 2651636532U, // <7,4,1,1>: Cost 3 vext2 <5,6,7,4>, <1,1,1,1> + 2712062922U, // <7,4,1,2>: Cost 3 vext3 RHS, <4,1,2,3> + 2639029248U, // <7,4,1,3>: Cost 3 vext2 <3,5,7,4>, <1,3,5,7> + 2712062940U, // <7,4,1,4>: Cost 3 vext3 RHS, <4,1,4,3> + 2712062946U, // <7,4,1,5>: Cost 3 vext3 RHS, <4,1,5,0> + 2712062958U, // <7,4,1,6>: Cost 3 vext3 RHS, <4,1,6,3> + 3785804791U, // <7,4,1,7>: Cost 4 vext3 RHS, <4,1,7,3> + 2712062973U, // <7,4,1,u>: Cost 3 vext3 RHS, <4,1,u,0> + 3785804807U, // <7,4,2,0>: Cost 4 vext3 RHS, <4,2,0,1> + 3785804818U, // <7,4,2,1>: Cost 4 vext3 RHS, <4,2,1,3> + 2651637352U, // <7,4,2,2>: Cost 3 vext2 <5,6,7,4>, <2,2,2,2> + 2651637414U, // <7,4,2,3>: Cost 3 vext2 <5,6,7,4>, <2,3,0,1> + 3716753194U, // <7,4,2,4>: Cost 4 vext2 <4,2,7,4>, <2,4,5,7> + 2712063030U, // <7,4,2,5>: Cost 3 vext3 RHS, <4,2,5,3> + 2712063036U, // <7,4,2,6>: Cost 3 vext3 RHS, <4,2,6,0> + 3773123658U, // <7,4,2,7>: Cost 4 vext3 <2,4,5,7>, <4,2,7,5> + 2712063054U, // <7,4,2,u>: Cost 3 vext3 RHS, <4,2,u,0> + 2651637910U, // <7,4,3,0>: Cost 3 vext2 <5,6,7,4>, <3,0,1,2> + 3712772348U, // <7,4,3,1>: Cost 4 vext2 <3,5,7,4>, <3,1,3,5> + 3785804906U, // <7,4,3,2>: Cost 4 vext3 RHS, <4,3,2,1> + 2651638172U, // <7,4,3,3>: Cost 3 vext2 <5,6,7,4>, <3,3,3,3> + 2651638274U, // <7,4,3,4>: Cost 3 vext2 <5,6,7,4>, <3,4,5,6> + 2639030883U, // <7,4,3,5>: Cost 3 vext2 <3,5,7,4>, <3,5,7,4> + 2712063122U, // <7,4,3,6>: Cost 3 vext3 RHS, <4,3,6,5> + 3712772836U, // <7,4,3,7>: Cost 4 vext2 <3,5,7,4>, <3,7,3,7> + 2641021782U, // <7,4,3,u>: Cost 3 vext2 <3,u,7,4>, <3,u,7,4> + 2714053802U, // <7,4,4,0>: Cost 3 vext3 RHS, <4,4,0,2> + 3785804978U, // <7,4,4,1>: Cost 4 vext3 RHS, <4,4,1,1> + 3716754505U, // <7,4,4,2>: Cost 4 vext2 <4,2,7,4>, <4,2,7,4> + 3785804998U, // <7,4,4,3>: Cost 4 vext3 RHS, <4,4,3,3> + 1638321360U, // <7,4,4,4>: Cost 2 vext3 RHS, <4,4,4,4> + 1638468826U, // <7,4,4,5>: Cost 2 vext3 RHS, <4,4,5,5> + 1638468836U, // <7,4,4,6>: Cost 2 vext3 RHS, <4,4,6,6> + 3785215214U, // <7,4,4,7>: Cost 4 vext3 <4,4,7,7>, <4,4,7,7> + 1640459509U, // <7,4,4,u>: Cost 
2 vext3 RHS, <4,4,u,5> + 1517207654U, // <7,4,5,0>: Cost 2 vext1 <6,7,4,5>, LHS + 2573034640U, // <7,4,5,1>: Cost 3 vext1 <3,7,4,5>, <1,5,3,7> + 2712063246U, // <7,4,5,2>: Cost 3 vext3 RHS, <4,5,2,3> + 2573036267U, // <7,4,5,3>: Cost 3 vext1 <3,7,4,5>, <3,7,4,5> + 1517210934U, // <7,4,5,4>: Cost 2 vext1 <6,7,4,5>, RHS + 2711989549U, // <7,4,5,5>: Cost 3 vext3 <4,5,5,7>, <4,5,5,7> + 564579638U, // <7,4,5,6>: Cost 1 vext3 RHS, RHS + 2651639976U, // <7,4,5,7>: Cost 3 vext2 <5,6,7,4>, <5,7,5,7> + 564579656U, // <7,4,5,u>: Cost 1 vext3 RHS, RHS + 2712063307U, // <7,4,6,0>: Cost 3 vext3 RHS, <4,6,0,1> + 3767668056U, // <7,4,6,1>: Cost 4 vext3 <1,5,3,7>, <4,6,1,5> + 2651640314U, // <7,4,6,2>: Cost 3 vext2 <5,6,7,4>, <6,2,7,3> + 2655621708U, // <7,4,6,3>: Cost 3 vext2 <6,3,7,4>, <6,3,7,4> + 1638468980U, // <7,4,6,4>: Cost 2 vext3 RHS, <4,6,4,6> + 2712063358U, // <7,4,6,5>: Cost 3 vext3 RHS, <4,6,5,7> + 2712063367U, // <7,4,6,6>: Cost 3 vext3 RHS, <4,6,6,7> + 2712210826U, // <7,4,6,7>: Cost 3 vext3 RHS, <4,6,7,1> + 1638469012U, // <7,4,6,u>: Cost 2 vext3 RHS, <4,6,u,2> + 2651640826U, // <7,4,7,0>: Cost 3 vext2 <5,6,7,4>, <7,0,1,2> + 3773713830U, // <7,4,7,1>: Cost 4 vext3 <2,5,4,7>, <4,7,1,2> + 3773713842U, // <7,4,7,2>: Cost 4 vext3 <2,5,4,7>, <4,7,2,5> + 3780349372U, // <7,4,7,3>: Cost 4 vext3 <3,6,4,7>, <4,7,3,6> + 2651641140U, // <7,4,7,4>: Cost 3 vext2 <5,6,7,4>, <7,4,0,1> + 2712210888U, // <7,4,7,5>: Cost 3 vext3 RHS, <4,7,5,0> + 2712210898U, // <7,4,7,6>: Cost 3 vext3 RHS, <4,7,6,1> + 2651641452U, // <7,4,7,7>: Cost 3 vext2 <5,6,7,4>, <7,7,7,7> + 2713538026U, // <7,4,7,u>: Cost 3 vext3 <4,7,u,7>, <4,7,u,7> + 1517232230U, // <7,4,u,0>: Cost 2 vext1 <6,7,4,u>, LHS + 1577899822U, // <7,4,u,1>: Cost 2 vext2 <5,6,7,4>, LHS + 2712063489U, // <7,4,u,2>: Cost 3 vext3 RHS, <4,u,2,3> + 2573060846U, // <7,4,u,3>: Cost 3 vext1 <3,7,4,u>, <3,7,4,u> + 1640312342U, // <7,4,u,4>: Cost 2 vext3 RHS, <4,u,4,6> + 1638469146U, // <7,4,u,5>: Cost 2 vext3 RHS, <4,u,5,1> + 564579881U, // <7,4,u,6>: Cost 1 vext3 RHS, RHS + 2714054192U, // <7,4,u,7>: Cost 3 vext3 RHS, <4,u,7,5> + 564579899U, // <7,4,u,u>: Cost 1 vext3 RHS, RHS + 2579038310U, // <7,5,0,0>: Cost 3 vext1 <4,7,5,0>, LHS + 2636382310U, // <7,5,0,1>: Cost 3 vext2 <3,1,7,5>, LHS + 2796339302U, // <7,5,0,2>: Cost 3 vuzpl <7,4,5,6>, LHS + 3646810719U, // <7,5,0,3>: Cost 4 vext1 <3,7,5,0>, <3,5,7,0> + 2712063586U, // <7,5,0,4>: Cost 3 vext3 RHS, <5,0,4,1> + 2735951467U, // <7,5,0,5>: Cost 3 vext3 RHS, <5,0,5,1> + 2735951476U, // <7,5,0,6>: Cost 3 vext3 RHS, <5,0,6,1> + 2579043322U, // <7,5,0,7>: Cost 3 vext1 <4,7,5,0>, <7,0,1,2> + 2636382877U, // <7,5,0,u>: Cost 3 vext2 <3,1,7,5>, LHS + 2712211087U, // <7,5,1,0>: Cost 3 vext3 RHS, <5,1,0,1> + 3698180916U, // <7,5,1,1>: Cost 4 vext2 <1,1,7,5>, <1,1,1,1> + 3710124950U, // <7,5,1,2>: Cost 4 vext2 <3,1,7,5>, <1,2,3,0> + 2636383232U, // <7,5,1,3>: Cost 3 vext2 <3,1,7,5>, <1,3,5,7> + 2712211127U, // <7,5,1,4>: Cost 3 vext3 RHS, <5,1,4,5> + 2590994128U, // <7,5,1,5>: Cost 3 vext1 <6,7,5,1>, <5,1,7,3> + 2590995323U, // <7,5,1,6>: Cost 3 vext1 <6,7,5,1>, <6,7,5,1> + 1638469328U, // <7,5,1,7>: Cost 2 vext3 RHS, <5,1,7,3> + 1638469337U, // <7,5,1,u>: Cost 2 vext3 RHS, <5,1,u,3> + 3785805536U, // <7,5,2,0>: Cost 4 vext3 RHS, <5,2,0,1> + 3785805544U, // <7,5,2,1>: Cost 4 vext3 RHS, <5,2,1,0> + 3704817288U, // <7,5,2,2>: Cost 4 vext2 <2,2,7,5>, <2,2,5,7> + 2712063742U, // <7,5,2,3>: Cost 3 vext3 RHS, <5,2,3,4> + 3716761386U, // <7,5,2,4>: Cost 4 vext2 <4,2,7,5>, <2,4,5,7> + 2714054415U, // <7,5,2,5>: Cost 3 vext3 RHS, 
<5,2,5,3> + 3774304024U, // <7,5,2,6>: Cost 4 vext3 <2,6,3,7>, <5,2,6,3> + 2712063777U, // <7,5,2,7>: Cost 3 vext3 RHS, <5,2,7,3> + 2712063787U, // <7,5,2,u>: Cost 3 vext3 RHS, <5,2,u,4> + 3634888806U, // <7,5,3,0>: Cost 4 vext1 <1,7,5,3>, LHS + 2636384544U, // <7,5,3,1>: Cost 3 vext2 <3,1,7,5>, <3,1,7,5> + 3710790001U, // <7,5,3,2>: Cost 4 vext2 <3,2,7,5>, <3,2,7,5> + 3710126492U, // <7,5,3,3>: Cost 4 vext2 <3,1,7,5>, <3,3,3,3> + 3634892086U, // <7,5,3,4>: Cost 4 vext1 <1,7,5,3>, RHS + 2639039076U, // <7,5,3,5>: Cost 3 vext2 <3,5,7,5>, <3,5,7,5> + 3713444533U, // <7,5,3,6>: Cost 4 vext2 <3,6,7,5>, <3,6,7,5> + 2693926767U, // <7,5,3,7>: Cost 3 vext3 <1,5,3,7>, <5,3,7,0> + 2712063864U, // <7,5,3,u>: Cost 3 vext3 RHS, <5,3,u,0> + 2579071078U, // <7,5,4,0>: Cost 3 vext1 <4,7,5,4>, LHS + 3646841856U, // <7,5,4,1>: Cost 4 vext1 <3,7,5,4>, <1,3,5,7> + 3716762698U, // <7,5,4,2>: Cost 4 vext2 <4,2,7,5>, <4,2,7,5> + 3646843491U, // <7,5,4,3>: Cost 4 vext1 <3,7,5,4>, <3,5,7,4> + 2579074358U, // <7,5,4,4>: Cost 3 vext1 <4,7,5,4>, RHS + 2636385590U, // <7,5,4,5>: Cost 3 vext2 <3,1,7,5>, RHS + 2645675406U, // <7,5,4,6>: Cost 3 vext2 <4,6,7,5>, <4,6,7,5> + 1638322118U, // <7,5,4,7>: Cost 2 vext3 RHS, <5,4,7,6> + 1638469583U, // <7,5,4,u>: Cost 2 vext3 RHS, <5,4,u,6> + 2714054611U, // <7,5,5,0>: Cost 3 vext3 RHS, <5,5,0,1> + 2652974800U, // <7,5,5,1>: Cost 3 vext2 <5,u,7,5>, <5,1,7,3> + 3710127905U, // <7,5,5,2>: Cost 4 vext2 <3,1,7,5>, <5,2,7,3> + 3785805808U, // <7,5,5,3>: Cost 4 vext3 RHS, <5,5,3,3> + 2712211450U, // <7,5,5,4>: Cost 3 vext3 RHS, <5,5,4,4> + 1638322180U, // <7,5,5,5>: Cost 2 vext3 RHS, <5,5,5,5> + 2712064014U, // <7,5,5,6>: Cost 3 vext3 RHS, <5,5,6,6> + 1638469656U, // <7,5,5,7>: Cost 2 vext3 RHS, <5,5,7,7> + 1638469665U, // <7,5,5,u>: Cost 2 vext3 RHS, <5,5,u,7> + 2712064036U, // <7,5,6,0>: Cost 3 vext3 RHS, <5,6,0,1> + 2714054707U, // <7,5,6,1>: Cost 3 vext3 RHS, <5,6,1,7> + 3785805879U, // <7,5,6,2>: Cost 4 vext3 RHS, <5,6,2,2> + 2712064066U, // <7,5,6,3>: Cost 3 vext3 RHS, <5,6,3,4> + 2712064076U, // <7,5,6,4>: Cost 3 vext3 RHS, <5,6,4,5> + 2714054743U, // <7,5,6,5>: Cost 3 vext3 RHS, <5,6,5,7> + 2712064096U, // <7,5,6,6>: Cost 3 vext3 RHS, <5,6,6,7> + 1638322274U, // <7,5,6,7>: Cost 2 vext3 RHS, <5,6,7,0> + 1638469739U, // <7,5,6,u>: Cost 2 vext3 RHS, <5,6,u,0> + 1511325798U, // <7,5,7,0>: Cost 2 vext1 <5,7,5,7>, LHS + 2692747392U, // <7,5,7,1>: Cost 3 vext3 <1,3,5,7>, <5,7,1,3> + 2585069160U, // <7,5,7,2>: Cost 3 vext1 <5,7,5,7>, <2,2,2,2> + 2573126390U, // <7,5,7,3>: Cost 3 vext1 <3,7,5,7>, <3,7,5,7> + 1511329078U, // <7,5,7,4>: Cost 2 vext1 <5,7,5,7>, RHS + 1638469800U, // <7,5,7,5>: Cost 2 vext3 RHS, <5,7,5,7> + 2712211626U, // <7,5,7,6>: Cost 3 vext3 RHS, <5,7,6,0> + 2712211636U, // <7,5,7,7>: Cost 3 vext3 RHS, <5,7,7,1> + 1638469823U, // <7,5,7,u>: Cost 2 vext3 RHS, <5,7,u,3> + 1511333990U, // <7,5,u,0>: Cost 2 vext1 <5,7,5,u>, LHS + 2636388142U, // <7,5,u,1>: Cost 3 vext2 <3,1,7,5>, LHS + 2712211671U, // <7,5,u,2>: Cost 3 vext3 RHS, <5,u,2,0> + 2573134583U, // <7,5,u,3>: Cost 3 vext1 <3,7,5,u>, <3,7,5,u> + 1511337270U, // <7,5,u,4>: Cost 2 vext1 <5,7,5,u>, RHS + 1638469881U, // <7,5,u,5>: Cost 2 vext3 RHS, <5,u,5,7> + 2712064258U, // <7,5,u,6>: Cost 3 vext3 RHS, <5,u,6,7> + 1638469892U, // <7,5,u,7>: Cost 2 vext3 RHS, <5,u,7,0> + 1638469904U, // <7,5,u,u>: Cost 2 vext3 RHS, <5,u,u,3> + 2650324992U, // <7,6,0,0>: Cost 3 vext2 <5,4,7,6>, <0,0,0,0> + 1576583270U, // <7,6,0,1>: Cost 2 vext2 <5,4,7,6>, LHS + 2712064300U, // <7,6,0,2>: Cost 3 vext3 RHS, <6,0,2,4> + 2255295336U, 
// <7,6,0,3>: Cost 3 vrev <6,7,3,0> + 2712064316U, // <7,6,0,4>: Cost 3 vext3 RHS, <6,0,4,2> + 2585088098U, // <7,6,0,5>: Cost 3 vext1 <5,7,6,0>, <5,6,7,0> + 2735952204U, // <7,6,0,6>: Cost 3 vext3 RHS, <6,0,6,0> + 2712211799U, // <7,6,0,7>: Cost 3 vext3 RHS, <6,0,7,2> + 1576583837U, // <7,6,0,u>: Cost 2 vext2 <5,4,7,6>, LHS + 1181340494U, // <7,6,1,0>: Cost 2 vrev <6,7,0,1> + 2650325812U, // <7,6,1,1>: Cost 3 vext2 <5,4,7,6>, <1,1,1,1> + 2650325910U, // <7,6,1,2>: Cost 3 vext2 <5,4,7,6>, <1,2,3,0> + 2650325976U, // <7,6,1,3>: Cost 3 vext2 <5,4,7,6>, <1,3,1,3> + 2579123510U, // <7,6,1,4>: Cost 3 vext1 <4,7,6,1>, RHS + 2650326160U, // <7,6,1,5>: Cost 3 vext2 <5,4,7,6>, <1,5,3,7> + 2714055072U, // <7,6,1,6>: Cost 3 vext3 RHS, <6,1,6,3> + 2712064425U, // <7,6,1,7>: Cost 3 vext3 RHS, <6,1,7,3> + 1181930390U, // <7,6,1,u>: Cost 2 vrev <6,7,u,1> + 2712211897U, // <7,6,2,0>: Cost 3 vext3 RHS, <6,2,0,1> + 2714055108U, // <7,6,2,1>: Cost 3 vext3 RHS, <6,2,1,3> + 2650326632U, // <7,6,2,2>: Cost 3 vext2 <5,4,7,6>, <2,2,2,2> + 2650326694U, // <7,6,2,3>: Cost 3 vext2 <5,4,7,6>, <2,3,0,1> + 2714055137U, // <7,6,2,4>: Cost 3 vext3 RHS, <6,2,4,5> + 2714055148U, // <7,6,2,5>: Cost 3 vext3 RHS, <6,2,5,7> + 2650326970U, // <7,6,2,6>: Cost 3 vext2 <5,4,7,6>, <2,6,3,7> + 1638470138U, // <7,6,2,7>: Cost 2 vext3 RHS, <6,2,7,3> + 1638470147U, // <7,6,2,u>: Cost 2 vext3 RHS, <6,2,u,3> + 2650327190U, // <7,6,3,0>: Cost 3 vext2 <5,4,7,6>, <3,0,1,2> + 2255172441U, // <7,6,3,1>: Cost 3 vrev <6,7,1,3> + 2255246178U, // <7,6,3,2>: Cost 3 vrev <6,7,2,3> + 2650327452U, // <7,6,3,3>: Cost 3 vext2 <5,4,7,6>, <3,3,3,3> + 2712064562U, // <7,6,3,4>: Cost 3 vext3 RHS, <6,3,4,5> + 2650327627U, // <7,6,3,5>: Cost 3 vext2 <5,4,7,6>, <3,5,4,7> + 3713452726U, // <7,6,3,6>: Cost 4 vext2 <3,6,7,6>, <3,6,7,6> + 2700563016U, // <7,6,3,7>: Cost 3 vext3 <2,6,3,7>, <6,3,7,0> + 2712064593U, // <7,6,3,u>: Cost 3 vext3 RHS, <6,3,u,0> + 2650327954U, // <7,6,4,0>: Cost 3 vext2 <5,4,7,6>, <4,0,5,1> + 2735952486U, // <7,6,4,1>: Cost 3 vext3 RHS, <6,4,1,3> + 2735952497U, // <7,6,4,2>: Cost 3 vext3 RHS, <6,4,2,5> + 2255328108U, // <7,6,4,3>: Cost 3 vrev <6,7,3,4> + 2712212100U, // <7,6,4,4>: Cost 3 vext3 RHS, <6,4,4,6> + 1576586550U, // <7,6,4,5>: Cost 2 vext2 <5,4,7,6>, RHS + 2714055312U, // <7,6,4,6>: Cost 3 vext3 RHS, <6,4,6,0> + 2712212126U, // <7,6,4,7>: Cost 3 vext3 RHS, <6,4,7,5> + 1576586793U, // <7,6,4,u>: Cost 2 vext2 <5,4,7,6>, RHS + 2579152998U, // <7,6,5,0>: Cost 3 vext1 <4,7,6,5>, LHS + 2650328784U, // <7,6,5,1>: Cost 3 vext2 <5,4,7,6>, <5,1,7,3> + 2714055364U, // <7,6,5,2>: Cost 3 vext3 RHS, <6,5,2,7> + 3785806538U, // <7,6,5,3>: Cost 4 vext3 RHS, <6,5,3,4> + 1576587206U, // <7,6,5,4>: Cost 2 vext2 <5,4,7,6>, <5,4,7,6> + 2650329092U, // <7,6,5,5>: Cost 3 vext2 <5,4,7,6>, <5,5,5,5> + 2650329186U, // <7,6,5,6>: Cost 3 vext2 <5,4,7,6>, <5,6,7,0> + 2712064753U, // <7,6,5,7>: Cost 3 vext3 RHS, <6,5,7,7> + 1181963162U, // <7,6,5,u>: Cost 2 vrev <6,7,u,5> + 2714055421U, // <7,6,6,0>: Cost 3 vext3 RHS, <6,6,0,1> + 2714055432U, // <7,6,6,1>: Cost 3 vext3 RHS, <6,6,1,3> + 2650329594U, // <7,6,6,2>: Cost 3 vext2 <5,4,7,6>, <6,2,7,3> + 3785806619U, // <7,6,6,3>: Cost 4 vext3 RHS, <6,6,3,4> + 2712212260U, // <7,6,6,4>: Cost 3 vext3 RHS, <6,6,4,4> + 2714055472U, // <7,6,6,5>: Cost 3 vext3 RHS, <6,6,5,7> + 1638323000U, // <7,6,6,6>: Cost 2 vext3 RHS, <6,6,6,6> + 1638470466U, // <7,6,6,7>: Cost 2 vext3 RHS, <6,6,7,7> + 1638470475U, // <7,6,6,u>: Cost 2 vext3 RHS, <6,6,u,7> + 1638323022U, // <7,6,7,0>: Cost 2 vext3 RHS, <6,7,0,1> + 2712064854U, // 
<7,6,7,1>: Cost 3 vext3 RHS, <6,7,1,0> + 2712064865U, // <7,6,7,2>: Cost 3 vext3 RHS, <6,7,2,2> + 2712064872U, // <7,6,7,3>: Cost 3 vext3 RHS, <6,7,3,0> + 1638323062U, // <7,6,7,4>: Cost 2 vext3 RHS, <6,7,4,5> + 2712064894U, // <7,6,7,5>: Cost 3 vext3 RHS, <6,7,5,4> + 2712064905U, // <7,6,7,6>: Cost 3 vext3 RHS, <6,7,6,6> + 2712064915U, // <7,6,7,7>: Cost 3 vext3 RHS, <6,7,7,7> + 1638323094U, // <7,6,7,u>: Cost 2 vext3 RHS, <6,7,u,1> + 1638470559U, // <7,6,u,0>: Cost 2 vext3 RHS, <6,u,0,1> + 1576589102U, // <7,6,u,1>: Cost 2 vext2 <5,4,7,6>, LHS + 2712212402U, // <7,6,u,2>: Cost 3 vext3 RHS, <6,u,2,2> + 2712212409U, // <7,6,u,3>: Cost 3 vext3 RHS, <6,u,3,0> + 1638470599U, // <7,6,u,4>: Cost 2 vext3 RHS, <6,u,4,5> + 1576589466U, // <7,6,u,5>: Cost 2 vext2 <5,4,7,6>, RHS + 1638323000U, // <7,6,u,6>: Cost 2 vext3 RHS, <6,6,6,6> + 1638470624U, // <7,6,u,7>: Cost 2 vext3 RHS, <6,u,7,3> + 1638470631U, // <7,6,u,u>: Cost 2 vext3 RHS, <6,u,u,1> + 2712065007U, // <7,7,0,0>: Cost 3 vext3 RHS, <7,0,0,0> + 1638323194U, // <7,7,0,1>: Cost 2 vext3 RHS, <7,0,1,2> + 2712065025U, // <7,7,0,2>: Cost 3 vext3 RHS, <7,0,2,0> + 3646958337U, // <7,7,0,3>: Cost 4 vext1 <3,7,7,0>, <3,7,7,0> + 2712065044U, // <7,7,0,4>: Cost 3 vext3 RHS, <7,0,4,1> + 2585161907U, // <7,7,0,5>: Cost 3 vext1 <5,7,7,0>, <5,7,7,0> + 2591134604U, // <7,7,0,6>: Cost 3 vext1 <6,7,7,0>, <6,7,7,0> + 2591134714U, // <7,7,0,7>: Cost 3 vext1 <6,7,7,0>, <7,0,1,2> + 1638323257U, // <7,7,0,u>: Cost 2 vext3 RHS, <7,0,u,2> + 2712065091U, // <7,7,1,0>: Cost 3 vext3 RHS, <7,1,0,3> + 2712065098U, // <7,7,1,1>: Cost 3 vext3 RHS, <7,1,1,1> + 2712065109U, // <7,7,1,2>: Cost 3 vext3 RHS, <7,1,2,3> + 2692748384U, // <7,7,1,3>: Cost 3 vext3 <1,3,5,7>, <7,1,3,5> + 2585169206U, // <7,7,1,4>: Cost 3 vext1 <5,7,7,1>, RHS + 2693928048U, // <7,7,1,5>: Cost 3 vext3 <1,5,3,7>, <7,1,5,3> + 2585170766U, // <7,7,1,6>: Cost 3 vext1 <5,7,7,1>, <6,7,0,1> + 2735953024U, // <7,7,1,7>: Cost 3 vext3 RHS, <7,1,7,1> + 2695918731U, // <7,7,1,u>: Cost 3 vext3 <1,u,3,7>, <7,1,u,3> + 3770471574U, // <7,7,2,0>: Cost 4 vext3 <2,0,5,7>, <7,2,0,5> + 3785807002U, // <7,7,2,1>: Cost 4 vext3 RHS, <7,2,1,0> + 2712065189U, // <7,7,2,2>: Cost 3 vext3 RHS, <7,2,2,2> + 2712065196U, // <7,7,2,3>: Cost 3 vext3 RHS, <7,2,3,0> + 3773125818U, // <7,7,2,4>: Cost 4 vext3 <2,4,5,7>, <7,2,4,5> + 3766490305U, // <7,7,2,5>: Cost 4 vext3 <1,3,5,7>, <7,2,5,3> + 2700563658U, // <7,7,2,6>: Cost 3 vext3 <2,6,3,7>, <7,2,6,3> + 2735953107U, // <7,7,2,7>: Cost 3 vext3 RHS, <7,2,7,3> + 2701890780U, // <7,7,2,u>: Cost 3 vext3 <2,u,3,7>, <7,2,u,3> + 2712065251U, // <7,7,3,0>: Cost 3 vext3 RHS, <7,3,0,1> + 3766490350U, // <7,7,3,1>: Cost 4 vext3 <1,3,5,7>, <7,3,1,3> + 3774305530U, // <7,7,3,2>: Cost 4 vext3 <2,6,3,7>, <7,3,2,6> + 2637728196U, // <7,7,3,3>: Cost 3 vext2 <3,3,7,7>, <3,3,7,7> + 2712065291U, // <7,7,3,4>: Cost 3 vext3 RHS, <7,3,4,5> + 2585186486U, // <7,7,3,5>: Cost 3 vext1 <5,7,7,3>, <5,7,7,3> + 2639719095U, // <7,7,3,6>: Cost 3 vext2 <3,6,7,7>, <3,6,7,7> + 2640382728U, // <7,7,3,7>: Cost 3 vext2 <3,7,7,7>, <3,7,7,7> + 2641046361U, // <7,7,3,u>: Cost 3 vext2 <3,u,7,7>, <3,u,7,7> + 2712212792U, // <7,7,4,0>: Cost 3 vext3 RHS, <7,4,0,5> + 3646989312U, // <7,7,4,1>: Cost 4 vext1 <3,7,7,4>, <1,3,5,7> + 3785807176U, // <7,7,4,2>: Cost 4 vext3 RHS, <7,4,2,3> + 3646991109U, // <7,7,4,3>: Cost 4 vext1 <3,7,7,4>, <3,7,7,4> + 2712065371U, // <7,7,4,4>: Cost 3 vext3 RHS, <7,4,4,4> + 1638323558U, // <7,7,4,5>: Cost 2 vext3 RHS, <7,4,5,6> + 2712212845U, // <7,7,4,6>: Cost 3 vext3 RHS, <7,4,6,4> + 2591167846U, // 
<7,7,4,7>: Cost 3 vext1 <6,7,7,4>, <7,4,5,6> + 1638323585U, // <7,7,4,u>: Cost 2 vext3 RHS, <7,4,u,6> + 2585198694U, // <7,7,5,0>: Cost 3 vext1 <5,7,7,5>, LHS + 2712212884U, // <7,7,5,1>: Cost 3 vext3 RHS, <7,5,1,7> + 3711471393U, // <7,7,5,2>: Cost 4 vext2 <3,3,7,7>, <5,2,7,3> + 2649673590U, // <7,7,5,3>: Cost 3 vext2 <5,3,7,7>, <5,3,7,7> + 2712065455U, // <7,7,5,4>: Cost 3 vext3 RHS, <7,5,4,7> + 1577259032U, // <7,7,5,5>: Cost 2 vext2 <5,5,7,7>, <5,5,7,7> + 2712065473U, // <7,7,5,6>: Cost 3 vext3 RHS, <7,5,6,7> + 2712212936U, // <7,7,5,7>: Cost 3 vext3 RHS, <7,5,7,5> + 1579249931U, // <7,7,5,u>: Cost 2 vext2 <5,u,7,7>, <5,u,7,7> + 2591178854U, // <7,7,6,0>: Cost 3 vext1 <6,7,7,6>, LHS + 2735953374U, // <7,7,6,1>: Cost 3 vext3 RHS, <7,6,1,0> + 2712212974U, // <7,7,6,2>: Cost 3 vext3 RHS, <7,6,2,7> + 2655646287U, // <7,7,6,3>: Cost 3 vext2 <6,3,7,7>, <6,3,7,7> + 2591182134U, // <7,7,6,4>: Cost 3 vext1 <6,7,7,6>, RHS + 2656973553U, // <7,7,6,5>: Cost 3 vext2 <6,5,7,7>, <6,5,7,7> + 1583895362U, // <7,7,6,6>: Cost 2 vext2 <6,6,7,7>, <6,6,7,7> + 2712065556U, // <7,7,6,7>: Cost 3 vext3 RHS, <7,6,7,0> + 1585222628U, // <7,7,6,u>: Cost 2 vext2 <6,u,7,7>, <6,u,7,7> + 1523417190U, // <7,7,7,0>: Cost 2 vext1 <7,7,7,7>, LHS + 2597159670U, // <7,7,7,1>: Cost 3 vext1 <7,7,7,7>, <1,0,3,2> + 2597160552U, // <7,7,7,2>: Cost 3 vext1 <7,7,7,7>, <2,2,2,2> + 2597161110U, // <7,7,7,3>: Cost 3 vext1 <7,7,7,7>, <3,0,1,2> + 1523420470U, // <7,7,7,4>: Cost 2 vext1 <7,7,7,7>, RHS + 2651002296U, // <7,7,7,5>: Cost 3 vext2 <5,5,7,7>, <7,5,5,7> + 2657637906U, // <7,7,7,6>: Cost 3 vext2 <6,6,7,7>, <7,6,6,7> + 363253046U, // <7,7,7,7>: Cost 1 vdup3 RHS + 363253046U, // <7,7,7,u>: Cost 1 vdup3 RHS + 1523417190U, // <7,7,u,0>: Cost 2 vext1 <7,7,7,7>, LHS + 1638471298U, // <7,7,u,1>: Cost 2 vext3 RHS, <7,u,1,2> + 2712213132U, // <7,7,u,2>: Cost 3 vext3 RHS, <7,u,2,3> + 2712213138U, // <7,7,u,3>: Cost 3 vext3 RHS, <7,u,3,0> + 1523420470U, // <7,7,u,4>: Cost 2 vext1 <7,7,7,7>, RHS + 1638471338U, // <7,7,u,5>: Cost 2 vext3 RHS, <7,u,5,6> + 1595840756U, // <7,7,u,6>: Cost 2 vext2 , + 363253046U, // <7,7,u,7>: Cost 1 vdup3 RHS + 363253046U, // <7,7,u,u>: Cost 1 vdup3 RHS + 1638318080U, // <7,u,0,0>: Cost 2 vext3 RHS, <0,0,0,0> + 1638323923U, // <7,u,0,1>: Cost 2 vext3 RHS, + 1662211804U, // <7,u,0,2>: Cost 2 vext3 RHS, + 1638323941U, // <7,u,0,3>: Cost 2 vext3 RHS, + 2712065773U, // <7,u,0,4>: Cost 3 vext3 RHS, + 1662359286U, // <7,u,0,5>: Cost 2 vext3 RHS, + 1662359296U, // <7,u,0,6>: Cost 2 vext3 RHS, + 2987150664U, // <7,u,0,7>: Cost 3 vzipr <5,6,7,0>, RHS + 1638323986U, // <7,u,0,u>: Cost 2 vext3 RHS, + 1517469798U, // <7,u,1,0>: Cost 2 vext1 <6,7,u,1>, LHS + 1638318900U, // <7,u,1,1>: Cost 2 vext3 RHS, <1,1,1,1> + 564582190U, // <7,u,1,2>: Cost 1 vext3 RHS, LHS + 1638324023U, // <7,u,1,3>: Cost 2 vext3 RHS, + 1517473078U, // <7,u,1,4>: Cost 2 vext1 <6,7,u,1>, RHS + 2693928777U, // <7,u,1,5>: Cost 3 vext3 <1,5,3,7>, + 1517474710U, // <7,u,1,6>: Cost 2 vext1 <6,7,u,1>, <6,7,u,1> + 1640462171U, // <7,u,1,7>: Cost 2 vext3 RHS, + 564582244U, // <7,u,1,u>: Cost 1 vext3 RHS, LHS + 1638318244U, // <7,u,2,0>: Cost 2 vext3 RHS, <0,2,0,2> + 2712065907U, // <7,u,2,1>: Cost 3 vext3 RHS, + 1638319720U, // <7,u,2,2>: Cost 2 vext3 RHS, <2,2,2,2> + 1638324101U, // <7,u,2,3>: Cost 2 vext3 RHS, + 1638318284U, // <7,u,2,4>: Cost 2 vext3 RHS, <0,2,4,6> + 2712065947U, // <7,u,2,5>: Cost 3 vext3 RHS, + 2700564387U, // <7,u,2,6>: Cost 3 vext3 <2,6,3,7>, + 1640314796U, // <7,u,2,7>: Cost 2 vext3 RHS, + 1638324146U, // <7,u,2,u>: Cost 2 vext3 RHS, 
+ 1638324156U, // <7,u,3,0>: Cost 2 vext3 RHS, + 1638319064U, // <7,u,3,1>: Cost 2 vext3 RHS, <1,3,1,3> + 2700564435U, // <7,u,3,2>: Cost 3 vext3 <2,6,3,7>, + 1638320540U, // <7,u,3,3>: Cost 2 vext3 RHS, <3,3,3,3> + 1638324196U, // <7,u,3,4>: Cost 2 vext3 RHS, + 1638324207U, // <7,u,3,5>: Cost 2 vext3 RHS, + 2700564472U, // <7,u,3,6>: Cost 3 vext3 <2,6,3,7>, + 2695919610U, // <7,u,3,7>: Cost 3 vext3 <1,u,3,7>, + 1638324228U, // <7,u,3,u>: Cost 2 vext3 RHS, + 2712066061U, // <7,u,4,0>: Cost 3 vext3 RHS, + 1662212122U, // <7,u,4,1>: Cost 2 vext3 RHS, + 1662212132U, // <7,u,4,2>: Cost 2 vext3 RHS, + 2712066092U, // <7,u,4,3>: Cost 3 vext3 RHS, + 1638321360U, // <7,u,4,4>: Cost 2 vext3 RHS, <4,4,4,4> + 1638324287U, // <7,u,4,5>: Cost 2 vext3 RHS, + 1662359624U, // <7,u,4,6>: Cost 2 vext3 RHS, + 1640314961U, // <7,u,4,7>: Cost 2 vext3 RHS, + 1638324314U, // <7,u,4,u>: Cost 2 vext3 RHS, + 1517502566U, // <7,u,5,0>: Cost 2 vext1 <6,7,u,5>, LHS + 1574612693U, // <7,u,5,1>: Cost 2 vext2 <5,1,7,u>, <5,1,7,u> + 2712066162U, // <7,u,5,2>: Cost 3 vext3 RHS, + 1638324351U, // <7,u,5,3>: Cost 2 vext3 RHS, + 1576603592U, // <7,u,5,4>: Cost 2 vext2 <5,4,7,u>, <5,4,7,u> + 1577267225U, // <7,u,5,5>: Cost 2 vext2 <5,5,7,u>, <5,5,7,u> + 564582554U, // <7,u,5,6>: Cost 1 vext3 RHS, RHS + 1640462499U, // <7,u,5,7>: Cost 2 vext3 RHS, + 564582572U, // <7,u,5,u>: Cost 1 vext3 RHS, RHS + 2712066223U, // <7,u,6,0>: Cost 3 vext3 RHS, + 2712066238U, // <7,u,6,1>: Cost 3 vext3 RHS, + 1581249023U, // <7,u,6,2>: Cost 2 vext2 <6,2,7,u>, <6,2,7,u> + 1638324432U, // <7,u,6,3>: Cost 2 vext3 RHS, + 1638468980U, // <7,u,6,4>: Cost 2 vext3 RHS, <4,6,4,6> + 2712066274U, // <7,u,6,5>: Cost 3 vext3 RHS, + 1583903555U, // <7,u,6,6>: Cost 2 vext2 <6,6,7,u>, <6,6,7,u> + 1640315117U, // <7,u,6,7>: Cost 2 vext3 RHS, + 1638324477U, // <7,u,6,u>: Cost 2 vext3 RHS, + 1638471936U, // <7,u,7,0>: Cost 2 vext3 RHS, + 2692970763U, // <7,u,7,1>: Cost 3 vext3 <1,3,u,7>, + 2700933399U, // <7,u,7,2>: Cost 3 vext3 <2,6,u,7>, + 2573347601U, // <7,u,7,3>: Cost 3 vext1 <3,7,u,7>, <3,7,u,7> + 1638471976U, // <7,u,7,4>: Cost 2 vext3 RHS, + 1511551171U, // <7,u,7,5>: Cost 2 vext1 <5,7,u,7>, <5,7,u,7> + 2712213815U, // <7,u,7,6>: Cost 3 vext3 RHS, + 363253046U, // <7,u,7,7>: Cost 1 vdup3 RHS + 363253046U, // <7,u,7,u>: Cost 1 vdup3 RHS + 1638324561U, // <7,u,u,0>: Cost 2 vext3 RHS, + 1638324571U, // <7,u,u,1>: Cost 2 vext3 RHS, + 564582757U, // <7,u,u,2>: Cost 1 vext3 RHS, LHS + 1638324587U, // <7,u,u,3>: Cost 2 vext3 RHS, + 1638324601U, // <7,u,u,4>: Cost 2 vext3 RHS, + 1638324611U, // <7,u,u,5>: Cost 2 vext3 RHS, + 564582797U, // <7,u,u,6>: Cost 1 vext3 RHS, RHS + 363253046U, // <7,u,u,7>: Cost 1 vdup3 RHS + 564582811U, // <7,u,u,u>: Cost 1 vext3 RHS, LHS + 135053414U, // : Cost 1 vdup0 LHS + 1611489290U, // : Cost 2 vext3 LHS, <0,0,1,1> + 1611489300U, // : Cost 2 vext3 LHS, <0,0,2,2> + 2568054923U, // : Cost 3 vext1 <3,0,0,0>, <3,0,0,0> + 1481706806U, // : Cost 2 vext1 <0,u,0,0>, RHS + 2555449040U, // : Cost 3 vext1 <0,u,0,0>, <5,1,7,3> + 2591282078U, // : Cost 3 vext1 <6,u,0,0>, <6,u,0,0> + 2591945711U, // : Cost 3 vext1 <7,0,0,0>, <7,0,0,0> + 135053414U, // : Cost 1 vdup0 LHS + 1493655654U, // : Cost 2 vext1 <2,u,0,1>, LHS + 1860550758U, // : Cost 2 vzipl LHS, LHS + 537747563U, // : Cost 1 vext3 LHS, LHS + 2625135576U, // : Cost 3 vext2 <1,2,u,0>, <1,3,1,3> + 1493658934U, // : Cost 2 vext1 <2,u,0,1>, RHS + 2625135760U, // : Cost 3 vext2 <1,2,u,0>, <1,5,3,7> + 1517548447U, // : Cost 2 vext1 <6,u,0,1>, <6,u,0,1> + 2591290362U, // : Cost 3 vext1 
<6,u,0,1>, <7,0,1,2> + 537747612U, // : Cost 1 vext3 LHS, LHS + 1611489444U, // : Cost 2 vext3 LHS, <0,2,0,2> + 2685231276U, // : Cost 3 vext3 LHS, <0,2,1,1> + 1994768486U, // : Cost 2 vtrnl LHS, LHS + 2685231294U, // : Cost 3 vext3 LHS, <0,2,3,1> + 1611489484U, // : Cost 2 vext3 LHS, <0,2,4,6> + 2712068310U, // : Cost 3 vext3 RHS, <0,2,5,7> + 2625136570U, // : Cost 3 vext2 <1,2,u,0>, <2,6,3,7> + 2591962097U, // : Cost 3 vext1 <7,0,0,2>, <7,0,0,2> + 1611489516U, // : Cost 2 vext3 LHS, <0,2,u,2> + 2954067968U, // : Cost 3 vzipr LHS, <0,0,0,0> + 2685231356U, // : Cost 3 vext3 LHS, <0,3,1,0> + 72589981U, // : Cost 1 vrev LHS + 2625137052U, // : Cost 3 vext2 <1,2,u,0>, <3,3,3,3> + 2625137154U, // : Cost 3 vext2 <1,2,u,0>, <3,4,5,6> + 2639071848U, // : Cost 3 vext2 <3,5,u,0>, <3,5,u,0> + 2639735481U, // : Cost 3 vext2 <3,6,u,0>, <3,6,u,0> + 2597279354U, // : Cost 3 vext1 <7,u,0,3>, <7,u,0,3> + 73032403U, // : Cost 1 vrev LHS + 2687074636U, // : Cost 3 vext3 <0,4,0,u>, <0,4,0,u> + 1611489618U, // : Cost 2 vext3 LHS, <0,4,1,5> + 1611489628U, // : Cost 2 vext3 LHS, <0,4,2,6> + 3629222038U, // : Cost 4 vext1 <0,u,0,4>, <3,0,1,2> + 2555481398U, // : Cost 3 vext1 <0,u,0,4>, RHS + 1551396150U, // : Cost 2 vext2 <1,2,u,0>, RHS + 2651680116U, // : Cost 3 vext2 <5,6,u,0>, <4,6,4,6> + 2646150600U, // : Cost 3 vext2 <4,7,5,0>, <4,7,5,0> + 1611932050U, // : Cost 2 vext3 LHS, <0,4,u,6> + 2561458278U, // : Cost 3 vext1 <1,u,0,5>, LHS + 1863532646U, // : Cost 2 vzipl RHS, LHS + 2712068526U, // : Cost 3 vext3 RHS, <0,5,2,7> + 2649689976U, // : Cost 3 vext2 <5,3,u,0>, <5,3,u,0> + 2220237489U, // : Cost 3 vrev <0,u,4,5> + 2651680772U, // : Cost 3 vext2 <5,6,u,0>, <5,5,5,5> + 1577939051U, // : Cost 2 vext2 <5,6,u,0>, <5,6,u,0> + 2830077238U, // : Cost 3 vuzpr <1,u,3,0>, RHS + 1579266317U, // : Cost 2 vext2 <5,u,u,0>, <5,u,u,0> + 2555494502U, // : Cost 3 vext1 <0,u,0,6>, LHS + 2712068598U, // : Cost 3 vext3 RHS, <0,6,1,7> + 1997750374U, // : Cost 2 vtrnl RHS, LHS + 2655662673U, // : Cost 3 vext2 <6,3,u,0>, <6,3,u,0> + 2555497782U, // : Cost 3 vext1 <0,u,0,6>, RHS + 2651681459U, // : Cost 3 vext2 <5,6,u,0>, <6,5,0,u> + 2651681592U, // : Cost 3 vext2 <5,6,u,0>, <6,6,6,6> + 2651681614U, // : Cost 3 vext2 <5,6,u,0>, <6,7,0,1> + 1997750428U, // : Cost 2 vtrnl RHS, LHS + 2567446630U, // : Cost 3 vext1 <2,u,0,7>, LHS + 2567447446U, // : Cost 3 vext1 <2,u,0,7>, <1,2,3,0> + 2567448641U, // : Cost 3 vext1 <2,u,0,7>, <2,u,0,7> + 2573421338U, // : Cost 3 vext1 <3,u,0,7>, <3,u,0,7> + 2567449910U, // : Cost 3 vext1 <2,u,0,7>, RHS + 2651682242U, // : Cost 3 vext2 <5,6,u,0>, <7,5,6,u> + 2591339429U, // : Cost 3 vext1 <6,u,0,7>, <6,u,0,7> + 2651682412U, // : Cost 3 vext2 <5,6,u,0>, <7,7,7,7> + 2567452462U, // : Cost 3 vext1 <2,u,0,7>, LHS + 135053414U, // : Cost 1 vdup0 LHS + 1611489938U, // : Cost 2 vext3 LHS, <0,u,1,1> + 537748125U, // : Cost 1 vext3 LHS, LHS + 2685674148U, // : Cost 3 vext3 LHS, <0,u,3,1> + 1611932338U, // : Cost 2 vext3 LHS, <0,u,4,6> + 1551399066U, // : Cost 2 vext2 <1,2,u,0>, RHS + 1517605798U, // : Cost 2 vext1 <6,u,0,u>, <6,u,0,u> + 2830077481U, // : Cost 3 vuzpr <1,u,3,0>, RHS + 537748179U, // : Cost 1 vext3 LHS, LHS + 1544101961U, // : Cost 2 vext2 <0,0,u,1>, <0,0,u,1> + 1558036582U, // : Cost 2 vext2 <2,3,u,1>, LHS + 2619171051U, // : Cost 3 vext2 <0,2,u,1>, <0,2,u,1> + 1611490038U, // : Cost 2 vext3 LHS, <1,0,3,2> + 2555522358U, // : Cost 3 vext1 <0,u,1,0>, RHS + 2712068871U, // : Cost 3 vext3 RHS, <1,0,5,1> + 2591355815U, // : Cost 3 vext1 <6,u,1,0>, <6,u,1,0> + 2597328512U, // : Cost 3 vext1 
<7,u,1,0>, <7,u,1,0> + 1611490083U, // : Cost 2 vext3 LHS, <1,0,u,2> + 1481785446U, // : Cost 2 vext1 <0,u,1,1>, LHS + 202162278U, // : Cost 1 vdup1 LHS + 2555528808U, // : Cost 3 vext1 <0,u,1,1>, <2,2,2,2> + 1611490120U, // : Cost 2 vext3 LHS, <1,1,3,3> + 1481788726U, // : Cost 2 vext1 <0,u,1,1>, RHS + 2689876828U, // : Cost 3 vext3 LHS, <1,1,5,5> + 2591364008U, // : Cost 3 vext1 <6,u,1,1>, <6,u,1,1> + 2592691274U, // : Cost 3 vext1 <7,1,1,1>, <7,1,1,1> + 202162278U, // : Cost 1 vdup1 LHS + 1499709542U, // : Cost 2 vext1 <3,u,1,2>, LHS + 2689876871U, // : Cost 3 vext3 LHS, <1,2,1,3> + 2631116445U, // : Cost 3 vext2 <2,2,u,1>, <2,2,u,1> + 835584U, // : Cost 0 copy LHS + 1499712822U, // : Cost 2 vext1 <3,u,1,2>, RHS + 2689876907U, // : Cost 3 vext3 LHS, <1,2,5,3> + 2631780282U, // : Cost 3 vext2 <2,3,u,1>, <2,6,3,7> + 1523603074U, // : Cost 2 vext1 <7,u,1,2>, <7,u,1,2> + 835584U, // : Cost 0 copy LHS + 1487773798U, // : Cost 2 vext1 <1,u,1,3>, LHS + 1611490264U, // : Cost 2 vext3 LHS, <1,3,1,3> + 2685232094U, // : Cost 3 vext3 LHS, <1,3,2,0> + 2018746470U, // : Cost 2 vtrnr LHS, LHS + 1487777078U, // : Cost 2 vext1 <1,u,1,3>, RHS + 1611490304U, // : Cost 2 vext3 LHS, <1,3,5,7> + 2685674505U, // : Cost 3 vext3 LHS, <1,3,6,7> + 2640407307U, // : Cost 3 vext2 <3,7,u,1>, <3,7,u,1> + 1611490327U, // : Cost 2 vext3 LHS, <1,3,u,3> + 1567992749U, // : Cost 2 vext2 <4,0,u,1>, <4,0,u,1> + 2693121070U, // : Cost 3 vext3 <1,4,1,u>, <1,4,1,u> + 2693194807U, // : Cost 3 vext3 <1,4,2,u>, <1,4,2,u> + 1152386432U, // : Cost 2 vrev <1,u,3,4> + 2555555126U, // : Cost 3 vext1 <0,u,1,4>, RHS + 1558039862U, // : Cost 2 vext2 <2,3,u,1>, RHS + 2645716371U, // : Cost 3 vext2 <4,6,u,1>, <4,6,u,1> + 2597361284U, // : Cost 3 vext1 <7,u,1,4>, <7,u,1,4> + 1152755117U, // : Cost 2 vrev <1,u,u,4> + 1481818214U, // : Cost 2 vext1 <0,u,1,5>, LHS + 2555560694U, // : Cost 3 vext1 <0,u,1,5>, <1,0,3,2> + 2555561576U, // : Cost 3 vext1 <0,u,1,5>, <2,2,2,2> + 1611490448U, // : Cost 2 vext3 LHS, <1,5,3,7> + 1481821494U, // : Cost 2 vext1 <0,u,1,5>, RHS + 2651025435U, // : Cost 3 vext2 <5,5,u,1>, <5,5,u,1> + 2651689068U, // : Cost 3 vext2 <5,6,u,1>, <5,6,u,1> + 2823966006U, // : Cost 3 vuzpr <0,u,1,1>, RHS + 1611932861U, // : Cost 2 vext3 LHS, <1,5,u,7> + 2555568230U, // : Cost 3 vext1 <0,u,1,6>, LHS + 2689877199U, // : Cost 3 vext3 LHS, <1,6,1,7> + 2712069336U, // : Cost 3 vext3 RHS, <1,6,2,7> + 2685232353U, // : Cost 3 vext3 LHS, <1,6,3,7> + 2555571510U, // : Cost 3 vext1 <0,u,1,6>, RHS + 2689877235U, // : Cost 3 vext3 LHS, <1,6,5,7> + 2657661765U, // : Cost 3 vext2 <6,6,u,1>, <6,6,u,1> + 1584583574U, // : Cost 2 vext2 <6,7,u,1>, <6,7,u,1> + 1585247207U, // : Cost 2 vext2 <6,u,u,1>, <6,u,u,1> + 2561548390U, // : Cost 3 vext1 <1,u,1,7>, LHS + 2561549681U, // : Cost 3 vext1 <1,u,1,7>, <1,u,1,7> + 2573493926U, // : Cost 3 vext1 <3,u,1,7>, <2,3,0,1> + 2042962022U, // : Cost 2 vtrnr RHS, LHS + 2561551670U, // : Cost 3 vext1 <1,u,1,7>, RHS + 2226300309U, // : Cost 3 vrev <1,u,5,7> + 2658325990U, // : Cost 3 vext2 <6,7,u,1>, <7,6,1,u> + 2658326124U, // : Cost 3 vext2 <6,7,u,1>, <7,7,7,7> + 2042962027U, // : Cost 2 vtrnr RHS, LHS + 1481842790U, // : Cost 2 vext1 <0,u,1,u>, LHS + 202162278U, // : Cost 1 vdup1 LHS + 2685674867U, // : Cost 3 vext3 LHS, <1,u,2,0> + 835584U, // : Cost 0 copy LHS + 1481846070U, // : Cost 2 vext1 <0,u,1,u>, RHS + 1611933077U, // : Cost 2 vext3 LHS, <1,u,5,7> + 2685674910U, // : Cost 3 vext3 LHS, <1,u,6,7> + 1523652232U, // : Cost 2 vext1 <7,u,1,u>, <7,u,1,u> + 835584U, // : Cost 0 copy LHS + 1544110154U, // : 
Cost 2 vext2 <0,0,u,2>, <0,0,u,2> + 1545437286U, // : Cost 2 vext2 <0,2,u,2>, LHS + 1545437420U, // : Cost 2 vext2 <0,2,u,2>, <0,2,u,2> + 2685232589U, // : Cost 3 vext3 LHS, <2,0,3,0> + 2619179346U, // : Cost 3 vext2 <0,2,u,2>, <0,4,1,5> + 2712069606U, // : Cost 3 vext3 RHS, <2,0,5,7> + 2689877484U, // : Cost 3 vext3 LHS, <2,0,6,4> + 2659656273U, // : Cost 3 vext2 <7,0,u,2>, <0,7,2,u> + 1545437853U, // : Cost 2 vext2 <0,2,u,2>, LHS + 1550082851U, // : Cost 2 vext2 <1,0,u,2>, <1,0,u,2> + 2619179828U, // : Cost 3 vext2 <0,2,u,2>, <1,1,1,1> + 2619179926U, // : Cost 3 vext2 <0,2,u,2>, <1,2,3,0> + 2685232671U, // : Cost 3 vext3 LHS, <2,1,3,1> + 2555604278U, // : Cost 3 vext1 <0,u,2,1>, RHS + 2619180176U, // : Cost 3 vext2 <0,2,u,2>, <1,5,3,7> + 2689877564U, // : Cost 3 vext3 LHS, <2,1,6,3> + 2602718850U, // : Cost 3 vext1 , <7,u,1,2> + 1158703235U, // : Cost 2 vrev <2,u,u,1> + 1481867366U, // : Cost 2 vext1 <0,u,2,2>, LHS + 2555609846U, // : Cost 3 vext1 <0,u,2,2>, <1,0,3,2> + 269271142U, // : Cost 1 vdup2 LHS + 1611490930U, // : Cost 2 vext3 LHS, <2,2,3,3> + 1481870646U, // : Cost 2 vext1 <0,u,2,2>, RHS + 2689877640U, // : Cost 3 vext3 LHS, <2,2,5,7> + 2619180986U, // : Cost 3 vext2 <0,2,u,2>, <2,6,3,7> + 2593436837U, // : Cost 3 vext1 <7,2,2,2>, <7,2,2,2> + 269271142U, // : Cost 1 vdup2 LHS + 408134301U, // : Cost 1 vext1 LHS, LHS + 1481876214U, // : Cost 2 vext1 LHS, <1,0,3,2> + 1481877096U, // : Cost 2 vext1 LHS, <2,2,2,2> + 1880326246U, // : Cost 2 vzipr LHS, LHS + 408137014U, // : Cost 1 vext1 LHS, RHS + 1529654992U, // : Cost 2 vext1 LHS, <5,1,7,3> + 1529655802U, // : Cost 2 vext1 LHS, <6,2,7,3> + 1529656314U, // : Cost 2 vext1 LHS, <7,0,1,2> + 408139566U, // : Cost 1 vext1 LHS, LHS + 1567853468U, // : Cost 2 vext2 <4,0,6,2>, <4,0,6,2> + 2561598362U, // : Cost 3 vext1 <1,u,2,4>, <1,2,3,4> + 2555627214U, // : Cost 3 vext1 <0,u,2,4>, <2,3,4,5> + 2685232918U, // : Cost 3 vext3 LHS, <2,4,3,5> + 2555628854U, // : Cost 3 vext1 <0,u,2,4>, RHS + 1545440566U, // : Cost 2 vext2 <0,2,u,2>, RHS + 1571982740U, // : Cost 2 vext2 <4,6,u,2>, <4,6,u,2> + 2592125957U, // : Cost 3 vext1 <7,0,2,4>, <7,0,2,4> + 1545440809U, // : Cost 2 vext2 <0,2,u,2>, RHS + 2555633766U, // : Cost 3 vext1 <0,u,2,5>, LHS + 2561606550U, // : Cost 3 vext1 <1,u,2,5>, <1,2,3,0> + 2689877856U, // : Cost 3 vext3 LHS, <2,5,2,7> + 2685233000U, // : Cost 3 vext3 LHS, <2,5,3,6> + 1158441059U, // : Cost 2 vrev <2,u,4,5> + 2645725188U, // : Cost 3 vext2 <4,6,u,2>, <5,5,5,5> + 2689877892U, // : Cost 3 vext3 LHS, <2,5,6,7> + 2823900470U, // : Cost 3 vuzpr <0,u,0,2>, RHS + 1158736007U, // : Cost 2 vrev <2,u,u,5> + 1481900134U, // : Cost 2 vext1 <0,u,2,6>, LHS + 2555642614U, // : Cost 3 vext1 <0,u,2,6>, <1,0,3,2> + 2555643496U, // : Cost 3 vext1 <0,u,2,6>, <2,2,2,2> + 1611491258U, // : Cost 2 vext3 LHS, <2,6,3,7> + 1481903414U, // : Cost 2 vext1 <0,u,2,6>, RHS + 2689877964U, // : Cost 3 vext3 LHS, <2,6,5,7> + 2689877973U, // : Cost 3 vext3 LHS, <2,6,6,7> + 2645726030U, // : Cost 3 vext2 <4,6,u,2>, <6,7,0,1> + 1611933671U, // : Cost 2 vext3 LHS, <2,6,u,7> + 1585919033U, // : Cost 2 vext2 <7,0,u,2>, <7,0,u,2> + 2573566710U, // : Cost 3 vext1 <3,u,2,7>, <1,0,3,2> + 2567596115U, // : Cost 3 vext1 <2,u,2,7>, <2,u,2,7> + 1906901094U, // : Cost 2 vzipr RHS, LHS + 2555653430U, // : Cost 3 vext1 <0,u,2,7>, RHS + 2800080230U, // : Cost 3 vuzpl LHS, <7,4,5,6> + 2980643164U, // : Cost 3 vzipr RHS, <0,4,2,6> + 2645726828U, // : Cost 3 vext2 <4,6,u,2>, <7,7,7,7> + 1906901099U, // : Cost 2 vzipr RHS, LHS + 408175266U, // : Cost 1 vext1 LHS, LHS + 
1545443118U, // : Cost 2 vext2 <0,2,u,2>, LHS + 269271142U, // : Cost 1 vdup2 LHS + 1611491416U, // : Cost 2 vext3 LHS, <2,u,3,3> + 408177974U, // : Cost 1 vext1 LHS, RHS + 1545443482U, // : Cost 2 vext2 <0,2,u,2>, RHS + 1726339226U, // : Cost 2 vuzpl LHS, RHS + 1529697274U, // : Cost 2 vext1 LHS, <7,0,1,2> + 408180526U, // : Cost 1 vext1 LHS, LHS + 1544781824U, // : Cost 2 vext2 LHS, <0,0,0,0> + 471040156U, // : Cost 1 vext2 LHS, LHS + 1544781988U, // : Cost 2 vext2 LHS, <0,2,0,2> + 2618523900U, // : Cost 3 vext2 LHS, <0,3,1,0> + 1544782162U, // : Cost 2 vext2 LHS, <0,4,1,5> + 2238188352U, // : Cost 3 vrev <3,u,5,0> + 2623169023U, // : Cost 3 vext2 LHS, <0,6,2,7> + 2238335826U, // : Cost 3 vrev <3,u,7,0> + 471040669U, // : Cost 1 vext2 LHS, LHS + 1544782582U, // : Cost 2 vext2 LHS, <1,0,3,2> + 1544782644U, // : Cost 2 vext2 LHS, <1,1,1,1> + 1544782742U, // : Cost 2 vext2 LHS, <1,2,3,0> + 1544782808U, // : Cost 2 vext2 LHS, <1,3,1,3> + 2618524733U, // : Cost 3 vext2 LHS, <1,4,3,5> + 1544782992U, // : Cost 2 vext2 LHS, <1,5,3,7> + 2618524897U, // : Cost 3 vext2 LHS, <1,6,3,7> + 2703517987U, // : Cost 3 vext3 <3,1,7,u>, <3,1,7,u> + 1544783213U, // : Cost 2 vext2 LHS, <1,u,1,3> + 1529716838U, // : Cost 2 vext1 , LHS + 1164167966U, // : Cost 2 vrev <3,u,1,2> + 1544783464U, // : Cost 2 vext2 LHS, <2,2,2,2> + 1544783526U, // : Cost 2 vext2 LHS, <2,3,0,1> + 1529720118U, // : Cost 2 vext1 , RHS + 2618525544U, // : Cost 3 vext2 LHS, <2,5,3,6> + 1544783802U, // : Cost 2 vext2 LHS, <2,6,3,7> + 2704181620U, // : Cost 3 vext3 <3,2,7,u>, <3,2,7,u> + 1544783931U, // : Cost 2 vext2 LHS, <2,u,0,1> + 1544784022U, // : Cost 2 vext2 LHS, <3,0,1,2> + 1487922559U, // : Cost 2 vext1 <1,u,3,3>, <1,u,3,3> + 1493895256U, // : Cost 2 vext1 <2,u,3,3>, <2,u,3,3> + 336380006U, // : Cost 1 vdup3 LHS + 1544784386U, // : Cost 2 vext2 LHS, <3,4,5,6> + 2824054478U, // : Cost 3 vuzpr LHS, <2,3,4,5> + 2238286668U, // : Cost 3 vrev <3,u,6,3> + 2954069136U, // : Cost 3 vzipr LHS, <1,5,3,7> + 336380006U, // : Cost 1 vdup3 LHS + 1487929446U, // : Cost 2 vext1 <1,u,3,4>, LHS + 1487930752U, // : Cost 2 vext1 <1,u,3,4>, <1,u,3,4> + 2623171644U, // : Cost 3 vext2 LHS, <4,2,6,0> + 2561673366U, // : Cost 3 vext1 <1,u,3,4>, <3,0,1,2> + 1487932726U, // : Cost 2 vext1 <1,u,3,4>, RHS + 471043382U, // : Cost 1 vext2 LHS, RHS + 1592561012U, // : Cost 2 vext2 LHS, <4,6,4,6> + 2238368598U, // : Cost 3 vrev <3,u,7,4> + 471043625U, // : Cost 1 vext2 LHS, RHS + 2555707494U, // : Cost 3 vext1 <0,u,3,5>, LHS + 1574645465U, // : Cost 2 vext2 <5,1,u,3>, <5,1,u,3> + 2567653106U, // : Cost 3 vext1 <2,u,3,5>, <2,3,u,5> + 2555709954U, // : Cost 3 vext1 <0,u,3,5>, <3,4,5,6> + 1592561606U, // : Cost 2 vext2 LHS, <5,4,7,6> + 1592561668U, // : Cost 2 vext2 LHS, <5,5,5,5> + 1592561762U, // : Cost 2 vext2 LHS, <5,6,7,0> + 1750314294U, // : Cost 2 vuzpr LHS, RHS + 1750314295U, // : Cost 2 vuzpr LHS, RHS + 2623172897U, // : Cost 3 vext2 LHS, <6,0,1,2> + 2561688962U, // : Cost 3 vext1 <1,u,3,6>, <1,u,3,6> + 1581281795U, // : Cost 2 vext2 <6,2,u,3>, <6,2,u,3> + 2706541204U, // : Cost 3 vext3 <3,6,3,u>, <3,6,3,u> + 2623173261U, // : Cost 3 vext2 LHS, <6,4,5,6> + 1164495686U, // : Cost 2 vrev <3,u,5,6> + 1592562488U, // : Cost 2 vext2 LHS, <6,6,6,6> + 1592562510U, // : Cost 2 vext2 LHS, <6,7,0,1> + 1164716897U, // : Cost 2 vrev <3,u,u,6> + 1487954022U, // : Cost 2 vext1 <1,u,3,7>, LHS + 1487955331U, // : Cost 2 vext1 <1,u,3,7>, <1,u,3,7> + 1493928028U, // : Cost 2 vext1 <2,u,3,7>, <2,u,3,7> + 2561697942U, // : Cost 3 vext1 <1,u,3,7>, <3,0,1,2> + 1487957302U, // : 
Cost 2 vext1 <1,u,3,7>, RHS + 2707352311U, // : Cost 3 vext3 <3,7,5,u>, <3,7,5,u> + 2655024623U, // : Cost 3 vext2 <6,2,u,3>, <7,6,2,u> + 1592563308U, // : Cost 2 vext2 LHS, <7,7,7,7> + 1487959854U, // : Cost 2 vext1 <1,u,3,7>, LHS + 1544787667U, // : Cost 2 vext2 LHS, + 471045934U, // : Cost 1 vext2 LHS, LHS + 1549432709U, // : Cost 2 vext2 LHS, + 336380006U, // : Cost 1 vdup3 LHS + 1544788031U, // : Cost 2 vext2 LHS, + 471046298U, // : Cost 1 vext2 LHS, RHS + 1549433040U, // : Cost 2 vext2 LHS, + 1750314537U, // : Cost 2 vuzpr LHS, RHS + 471046501U, // : Cost 1 vext2 LHS, LHS + 2625167360U, // : Cost 3 vext2 <1,2,u,4>, <0,0,0,0> + 1551425638U, // : Cost 2 vext2 <1,2,u,4>, LHS + 2619195630U, // : Cost 3 vext2 <0,2,u,4>, <0,2,u,4> + 2619343104U, // : Cost 3 vext2 <0,3,1,4>, <0,3,1,4> + 2625167698U, // : Cost 3 vext2 <1,2,u,4>, <0,4,1,5> + 1638329234U, // : Cost 2 vext3 RHS, <4,0,5,1> + 1638329244U, // : Cost 2 vext3 RHS, <4,0,6,2> + 3787803556U, // : Cost 4 vext3 RHS, <4,0,7,1> + 1551426205U, // : Cost 2 vext2 <1,2,u,4>, LHS + 2555748454U, // : Cost 3 vext1 <0,u,4,1>, LHS + 2625168180U, // : Cost 3 vext2 <1,2,u,4>, <1,1,1,1> + 1551426503U, // : Cost 2 vext2 <1,2,u,4>, <1,2,u,4> + 2625168344U, // : Cost 3 vext2 <1,2,u,4>, <1,3,1,3> + 2555751734U, // : Cost 3 vext1 <0,u,4,1>, RHS + 1860554038U, // : Cost 2 vzipl LHS, RHS + 2689879022U, // : Cost 3 vext3 LHS, <4,1,6,3> + 2592248852U, // : Cost 3 vext1 <7,0,4,1>, <7,0,4,1> + 1555408301U, // : Cost 2 vext2 <1,u,u,4>, <1,u,u,4> + 2555756646U, // : Cost 3 vext1 <0,u,4,2>, LHS + 2625168943U, // : Cost 3 vext2 <1,2,u,4>, <2,1,4,u> + 2625169000U, // : Cost 3 vext2 <1,2,u,4>, <2,2,2,2> + 2619197134U, // : Cost 3 vext2 <0,2,u,4>, <2,3,4,5> + 2555759926U, // : Cost 3 vext1 <0,u,4,2>, RHS + 2712071222U, // : Cost 3 vext3 RHS, <4,2,5,3> + 1994771766U, // : Cost 2 vtrnl LHS, RHS + 2592257045U, // : Cost 3 vext1 <7,0,4,2>, <7,0,4,2> + 1994771784U, // : Cost 2 vtrnl LHS, RHS + 2625169558U, // : Cost 3 vext2 <1,2,u,4>, <3,0,1,2> + 2567709594U, // : Cost 3 vext1 <2,u,4,3>, <1,2,3,4> + 2567710817U, // : Cost 3 vext1 <2,u,4,3>, <2,u,4,3> + 2625169820U, // : Cost 3 vext2 <1,2,u,4>, <3,3,3,3> + 2625169922U, // : Cost 3 vext2 <1,2,u,4>, <3,4,5,6> + 2954069710U, // : Cost 3 vzipr LHS, <2,3,4,5> + 2954068172U, // : Cost 3 vzipr LHS, <0,2,4,6> + 3903849472U, // : Cost 4 vuzpr <1,u,3,4>, <1,3,5,7> + 2954068174U, // : Cost 3 vzipr LHS, <0,2,4,u> + 1505919078U, // : Cost 2 vext1 <4,u,4,4>, LHS + 2567717831U, // : Cost 3 vext1 <2,u,4,4>, <1,2,u,4> + 2567719010U, // : Cost 3 vext1 <2,u,4,4>, <2,u,4,4> + 2570373542U, // : Cost 3 vext1 <3,3,4,4>, <3,3,4,4> + 161926454U, // : Cost 1 vdup0 RHS + 1551428918U, // : Cost 2 vext2 <1,2,u,4>, RHS + 1638329572U, // : Cost 2 vext3 RHS, <4,4,6,6> + 2594927963U, // : Cost 3 vext1 <7,4,4,4>, <7,4,4,4> + 161926454U, // : Cost 1 vdup0 RHS + 1493983334U, // : Cost 2 vext1 <2,u,4,5>, LHS + 2689879301U, // : Cost 3 vext3 LHS, <4,5,1,3> + 1493985379U, // : Cost 2 vext1 <2,u,4,5>, <2,u,4,5> + 2567727254U, // : Cost 3 vext1 <2,u,4,5>, <3,0,1,2> + 1493986614U, // : Cost 2 vext1 <2,u,4,5>, RHS + 1863535926U, // : Cost 2 vzipl RHS, RHS + 537750838U, // : Cost 1 vext3 LHS, RHS + 2830110006U, // : Cost 3 vuzpr <1,u,3,4>, RHS + 537750856U, // : Cost 1 vext3 LHS, RHS + 1482047590U, // : Cost 2 vext1 <0,u,4,6>, LHS + 2555790070U, // : Cost 3 vext1 <0,u,4,6>, <1,0,3,2> + 2555790952U, // : Cost 3 vext1 <0,u,4,6>, <2,2,2,2> + 2555791510U, // : Cost 3 vext1 <0,u,4,6>, <3,0,1,2> + 1482050870U, // : Cost 2 vext1 <0,u,4,6>, RHS + 2689879422U, // : Cost 3 
vext3 LHS, <4,6,5,7> + 1997753654U, // : Cost 2 vtrnl RHS, RHS + 2712071562U, // : Cost 3 vext3 RHS, <4,6,7,1> + 1482053422U, // : Cost 2 vext1 <0,u,4,6>, LHS + 2567741542U, // : Cost 3 vext1 <2,u,4,7>, LHS + 2567742362U, // : Cost 3 vext1 <2,u,4,7>, <1,2,3,4> + 2567743589U, // : Cost 3 vext1 <2,u,4,7>, <2,u,4,7> + 2573716286U, // : Cost 3 vext1 <3,u,4,7>, <3,u,4,7> + 2567744822U, // : Cost 3 vext1 <2,u,4,7>, RHS + 2712071624U, // : Cost 3 vext3 RHS, <4,7,5,0> + 96808489U, // : Cost 1 vrev RHS + 2651715180U, // : Cost 3 vext2 <5,6,u,4>, <7,7,7,7> + 96955963U, // : Cost 1 vrev RHS + 1482063974U, // : Cost 2 vext1 <0,u,4,u>, LHS + 1551431470U, // : Cost 2 vext2 <1,2,u,4>, LHS + 1494009958U, // : Cost 2 vext1 <2,u,4,u>, <2,u,4,u> + 2555807894U, // : Cost 3 vext1 <0,u,4,u>, <3,0,1,2> + 161926454U, // : Cost 1 vdup0 RHS + 1551431834U, // : Cost 2 vext2 <1,2,u,4>, RHS + 537751081U, // : Cost 1 vext3 LHS, RHS + 2830110249U, // : Cost 3 vuzpr <1,u,3,4>, RHS + 537751099U, // : Cost 1 vext3 LHS, RHS + 2631811072U, // : Cost 3 vext2 <2,3,u,5>, <0,0,0,0> + 1558069350U, // : Cost 2 vext2 <2,3,u,5>, LHS + 2619203823U, // : Cost 3 vext2 <0,2,u,5>, <0,2,u,5> + 2619867456U, // : Cost 3 vext2 <0,3,u,5>, <0,3,u,5> + 1546273106U, // : Cost 2 vext2 <0,4,1,5>, <0,4,1,5> + 2733010539U, // : Cost 3 vext3 LHS, <5,0,5,1> + 2597622682U, // : Cost 3 vext1 <7,u,5,0>, <6,7,u,5> + 1176539396U, // : Cost 2 vrev <5,u,7,0> + 1558069917U, // : Cost 2 vext2 <2,3,u,5>, LHS + 1505968230U, // : Cost 2 vext1 <4,u,5,1>, LHS + 2624512887U, // : Cost 3 vext2 <1,1,u,5>, <1,1,u,5> + 2631811990U, // : Cost 3 vext2 <2,3,u,5>, <1,2,3,0> + 2618541056U, // : Cost 3 vext2 <0,1,u,5>, <1,3,5,7> + 1505971510U, // : Cost 2 vext1 <4,u,5,1>, RHS + 2627167419U, // : Cost 3 vext2 <1,5,u,5>, <1,5,u,5> + 2579714554U, // : Cost 3 vext1 <4,u,5,1>, <6,2,7,3> + 1638330064U, // : Cost 2 vext3 RHS, <5,1,7,3> + 1638477529U, // : Cost 2 vext3 RHS, <5,1,u,3> + 2561802342U, // : Cost 3 vext1 <1,u,5,2>, LHS + 2561803264U, // : Cost 3 vext1 <1,u,5,2>, <1,3,5,7> + 2631149217U, // : Cost 3 vext2 <2,2,u,5>, <2,2,u,5> + 1558071026U, // : Cost 2 vext2 <2,3,u,5>, <2,3,u,5> + 2561805622U, // : Cost 3 vext1 <1,u,5,2>, RHS + 2714062607U, // : Cost 3 vext3 RHS, <5,2,5,3> + 2631813050U, // : Cost 3 vext2 <2,3,u,5>, <2,6,3,7> + 3092335926U, // : Cost 3 vtrnr <0,u,0,2>, RHS + 1561389191U, // : Cost 2 vext2 <2,u,u,5>, <2,u,u,5> + 2561810534U, // : Cost 3 vext1 <1,u,5,3>, LHS + 2561811857U, // : Cost 3 vext1 <1,u,5,3>, <1,u,5,3> + 2631813474U, // : Cost 3 vext2 <2,3,u,5>, <3,2,5,u> + 2631813532U, // : Cost 3 vext2 <2,3,u,5>, <3,3,3,3> + 2619869698U, // : Cost 3 vext2 <0,3,u,5>, <3,4,5,6> + 3001847002U, // : Cost 3 vzipr LHS, <4,4,5,5> + 2954070530U, // : Cost 3 vzipr LHS, <3,4,5,6> + 2018749750U, // : Cost 2 vtrnr LHS, RHS + 2018749751U, // : Cost 2 vtrnr LHS, RHS + 2573762662U, // : Cost 3 vext1 <3,u,5,4>, LHS + 2620017634U, // : Cost 3 vext2 <0,4,1,5>, <4,1,5,0> + 2573764338U, // : Cost 3 vext1 <3,u,5,4>, <2,3,u,5> + 2573765444U, // : Cost 3 vext1 <3,u,5,4>, <3,u,5,4> + 1570680053U, // : Cost 2 vext2 <4,4,u,5>, <4,4,u,5> + 1558072630U, // : Cost 2 vext2 <2,3,u,5>, RHS + 2645749143U, // : Cost 3 vext2 <4,6,u,5>, <4,6,u,5> + 1638330310U, // : Cost 2 vext3 RHS, <5,4,7,6> + 1558072873U, // : Cost 2 vext2 <2,3,u,5>, RHS + 1506000998U, // : Cost 2 vext1 <4,u,5,5>, LHS + 2561827984U, // : Cost 3 vext1 <1,u,5,5>, <1,5,3,7> + 2579744360U, // : Cost 3 vext1 <4,u,5,5>, <2,2,2,2> + 2579744918U, // : Cost 3 vext1 <4,u,5,5>, <3,0,1,2> + 1506004278U, // : Cost 2 vext1 <4,u,5,5>, RHS + 
229035318U, // : Cost 1 vdup1 RHS + 2712072206U, // : Cost 3 vext3 RHS, <5,5,6,6> + 1638330392U, // : Cost 2 vext3 RHS, <5,5,7,7> + 229035318U, // : Cost 1 vdup1 RHS + 1500037222U, // : Cost 2 vext1 <3,u,5,6>, LHS + 2561836436U, // : Cost 3 vext1 <1,u,5,6>, <1,u,5,6> + 2567809133U, // : Cost 3 vext1 <2,u,5,6>, <2,u,5,6> + 1500040006U, // : Cost 2 vext1 <3,u,5,6>, <3,u,5,6> + 1500040502U, // : Cost 2 vext1 <3,u,5,6>, RHS + 2714062935U, // : Cost 3 vext3 RHS, <5,6,5,7> + 2712072288U, // : Cost 3 vext3 RHS, <5,6,6,7> + 27705344U, // : Cost 0 copy RHS + 27705344U, // : Cost 0 copy RHS + 1488101478U, // : Cost 2 vext1 <1,u,5,7>, LHS + 1488102805U, // : Cost 2 vext1 <1,u,5,7>, <1,u,5,7> + 2561844840U, // : Cost 3 vext1 <1,u,5,7>, <2,2,2,2> + 2561845398U, // : Cost 3 vext1 <1,u,5,7>, <3,0,1,2> + 1488104758U, // : Cost 2 vext1 <1,u,5,7>, RHS + 1638330536U, // : Cost 2 vext3 RHS, <5,7,5,7> + 2712072362U, // : Cost 3 vext3 RHS, <5,7,6,0> + 2042965302U, // : Cost 2 vtrnr RHS, RHS + 1488107310U, // : Cost 2 vext1 <1,u,5,7>, LHS + 1488109670U, // : Cost 2 vext1 <1,u,5,u>, LHS + 1488110998U, // : Cost 2 vext1 <1,u,5,u>, <1,u,5,u> + 2561853032U, // : Cost 3 vext1 <1,u,5,u>, <2,2,2,2> + 1500056392U, // : Cost 2 vext1 <3,u,5,u>, <3,u,5,u> + 1488112950U, // : Cost 2 vext1 <1,u,5,u>, RHS + 229035318U, // : Cost 1 vdup1 RHS + 2954111490U, // : Cost 3 vzipr LHS, <3,4,5,6> + 27705344U, // : Cost 0 copy RHS + 27705344U, // : Cost 0 copy RHS + 2619211776U, // : Cost 3 vext2 <0,2,u,6>, <0,0,0,0> + 1545470054U, // : Cost 2 vext2 <0,2,u,6>, LHS + 1545470192U, // : Cost 2 vext2 <0,2,u,6>, <0,2,u,6> + 2255958969U, // : Cost 3 vrev <6,u,3,0> + 1546797458U, // : Cost 2 vext2 <0,4,u,6>, <0,4,u,6> + 2720624971U, // : Cost 3 vext3 <6,0,5,u>, <6,0,5,u> + 2256180180U, // : Cost 3 vrev <6,u,6,0> + 2960682294U, // : Cost 3 vzipr <1,2,u,0>, RHS + 1545470621U, // : Cost 2 vext2 <0,2,u,6>, LHS + 1182004127U, // : Cost 2 vrev <6,u,0,1> + 2619212596U, // : Cost 3 vext2 <0,2,u,6>, <1,1,1,1> + 2619212694U, // : Cost 3 vext2 <0,2,u,6>, <1,2,3,0> + 2619212760U, // : Cost 3 vext2 <0,2,u,6>, <1,3,1,3> + 2626511979U, // : Cost 3 vext2 <1,4,u,6>, <1,4,u,6> + 2619212944U, // : Cost 3 vext2 <0,2,u,6>, <1,5,3,7> + 2714063264U, // : Cost 3 vext3 RHS, <6,1,6,3> + 2967326006U, // : Cost 3 vzipr <2,3,u,1>, RHS + 1182594023U, // : Cost 2 vrev <6,u,u,1> + 1506050150U, // : Cost 2 vext1 <4,u,6,2>, LHS + 2579792630U, // : Cost 3 vext1 <4,u,6,2>, <1,0,3,2> + 2619213416U, // : Cost 3 vext2 <0,2,u,6>, <2,2,2,2> + 2619213478U, // : Cost 3 vext2 <0,2,u,6>, <2,3,0,1> + 1506053430U, // : Cost 2 vext1 <4,u,6,2>, RHS + 2633148309U, // : Cost 3 vext2 <2,5,u,6>, <2,5,u,6> + 2619213754U, // : Cost 3 vext2 <0,2,u,6>, <2,6,3,7> + 1638330874U, // : Cost 2 vext3 RHS, <6,2,7,3> + 1638478339U, // : Cost 2 vext3 RHS, <6,2,u,3> + 2619213974U, // : Cost 3 vext2 <0,2,u,6>, <3,0,1,2> + 2255836074U, // : Cost 3 vrev <6,u,1,3> + 2255909811U, // : Cost 3 vrev <6,u,2,3> + 2619214236U, // : Cost 3 vext2 <0,2,u,6>, <3,3,3,3> + 1564715549U, // : Cost 2 vext2 <3,4,u,6>, <3,4,u,6> + 2639121006U, // : Cost 3 vext2 <3,5,u,6>, <3,5,u,6> + 3001847012U, // : Cost 3 vzipr LHS, <4,4,6,6> + 1880329526U, // : Cost 2 vzipr LHS, RHS + 1880329527U, // : Cost 2 vzipr LHS, RHS + 2567864422U, // : Cost 3 vext1 <2,u,6,4>, LHS + 2733011558U, // : Cost 3 vext3 LHS, <6,4,1,3> + 2567866484U, // : Cost 3 vext1 <2,u,6,4>, <2,u,6,4> + 2638458005U, // : Cost 3 vext2 <3,4,u,6>, <4,3,6,u> + 1570540772U, // : Cost 2 vext2 <4,4,6,6>, <4,4,6,6> + 1545473334U, // : Cost 2 vext2 <0,2,u,6>, RHS + 1572015512U, // 
: Cost 2 vext2 <4,6,u,6>, <4,6,u,6> + 2960715062U, // : Cost 3 vzipr <1,2,u,4>, RHS + 1545473577U, // : Cost 2 vext2 <0,2,u,6>, RHS + 2567872614U, // : Cost 3 vext1 <2,u,6,5>, LHS + 2645757648U, // : Cost 3 vext2 <4,6,u,6>, <5,1,7,3> + 2567874490U, // : Cost 3 vext1 <2,u,6,5>, <2,6,3,7> + 2576501250U, // : Cost 3 vext1 <4,3,6,5>, <3,4,5,6> + 1576660943U, // : Cost 2 vext2 <5,4,u,6>, <5,4,u,6> + 2645757956U, // : Cost 3 vext2 <4,6,u,6>, <5,5,5,5> + 2645758050U, // : Cost 3 vext2 <4,6,u,6>, <5,6,7,0> + 2824080694U, // : Cost 3 vuzpr <0,u,2,6>, RHS + 1182626795U, // : Cost 2 vrev <6,u,u,5> + 1506082918U, // : Cost 2 vext1 <4,u,6,6>, LHS + 2579825398U, // : Cost 3 vext1 <4,u,6,6>, <1,0,3,2> + 2645758458U, // : Cost 3 vext2 <4,6,u,6>, <6,2,7,3> + 2579826838U, // : Cost 3 vext1 <4,u,6,6>, <3,0,1,2> + 1506086198U, // : Cost 2 vext1 <4,u,6,6>, RHS + 2579828432U, // : Cost 3 vext1 <4,u,6,6>, <5,1,7,3> + 296144182U, // : Cost 1 vdup2 RHS + 1638331202U, // : Cost 2 vext3 RHS, <6,6,7,7> + 296144182U, // : Cost 1 vdup2 RHS + 432349286U, // : Cost 1 vext1 RHS, LHS + 1506091766U, // : Cost 2 vext1 RHS, <1,0,3,2> + 1506092648U, // : Cost 2 vext1 RHS, <2,2,2,2> + 1506093206U, // : Cost 2 vext1 RHS, <3,0,1,2> + 432352809U, // : Cost 1 vext1 RHS, RHS + 1506094800U, // : Cost 2 vext1 RHS, <5,1,7,3> + 1506095610U, // : Cost 2 vext1 RHS, <6,2,7,3> + 1906904374U, // : Cost 2 vzipr RHS, RHS + 432355118U, // : Cost 1 vext1 RHS, LHS + 432357478U, // : Cost 1 vext1 RHS, LHS + 1545475886U, // : Cost 2 vext2 <0,2,u,6>, LHS + 1506100840U, // : Cost 2 vext1 RHS, <2,2,2,2> + 1506101398U, // : Cost 2 vext1 RHS, <3,0,1,2> + 432361002U, // : Cost 1 vext1 RHS, RHS + 1545476250U, // : Cost 2 vext2 <0,2,u,6>, RHS + 296144182U, // : Cost 1 vdup2 RHS + 1880370486U, // : Cost 2 vzipr LHS, RHS + 432363310U, // : Cost 1 vext1 RHS, LHS + 1571356672U, // : Cost 2 vext2 RHS, <0,0,0,0> + 497614950U, // : Cost 1 vext2 RHS, LHS + 1571356836U, // : Cost 2 vext2 RHS, <0,2,0,2> + 2573880146U, // : Cost 3 vext1 <3,u,7,0>, <3,u,7,0> + 1571357010U, // : Cost 2 vext2 RHS, <0,4,1,5> + 1512083716U, // : Cost 2 vext1 <5,u,7,0>, <5,u,7,0> + 2621874741U, // : Cost 3 vext2 <0,6,u,7>, <0,6,u,7> + 2585826298U, // : Cost 3 vext1 <5,u,7,0>, <7,0,1,2> + 497615517U, // : Cost 1 vext2 RHS, LHS + 1571357430U, // : Cost 2 vext2 RHS, <1,0,3,2> + 1571357492U, // : Cost 2 vext2 RHS, <1,1,1,1> + 1571357590U, // : Cost 2 vext2 RHS, <1,2,3,0> + 1552114715U, // : Cost 2 vext2 <1,3,u,7>, <1,3,u,7> + 2573888822U, // : Cost 3 vext1 <3,u,7,1>, RHS + 1553441981U, // : Cost 2 vext2 <1,5,u,7>, <1,5,u,7> + 2627847438U, // : Cost 3 vext2 <1,6,u,7>, <1,6,u,7> + 2727408775U, // : Cost 3 vext3 <7,1,7,u>, <7,1,7,u> + 1555432880U, // : Cost 2 vext2 <1,u,u,7>, <1,u,u,7> + 2629838337U, // : Cost 3 vext2 <2,0,u,7>, <2,0,u,7> + 1188058754U, // : Cost 2 vrev <7,u,1,2> + 1571358312U, // : Cost 2 vext2 RHS, <2,2,2,2> + 1571358374U, // : Cost 2 vext2 RHS, <2,3,0,1> + 2632492869U, // : Cost 3 vext2 <2,4,u,7>, <2,4,u,7> + 2633156502U, // : Cost 3 vext2 <2,5,u,7>, <2,5,u,7> + 1560078311U, // : Cost 2 vext2 <2,6,u,7>, <2,6,u,7> + 2728072408U, // : Cost 3 vext3 <7,2,7,u>, <7,2,7,u> + 1561405577U, // : Cost 2 vext2 <2,u,u,7>, <2,u,u,7> + 1571358870U, // : Cost 2 vext2 RHS, <3,0,1,2> + 2627184913U, // : Cost 3 vext2 <1,5,u,7>, <3,1,5,u> + 2633820523U, // : Cost 3 vext2 <2,6,u,7>, <3,2,6,u> + 1571359132U, // : Cost 2 vext2 RHS, <3,3,3,3> + 1571359234U, // : Cost 2 vext2 RHS, <3,4,5,6> + 1512108295U, // : Cost 2 vext1 <5,u,7,3>, <5,u,7,3> + 1518080992U, // : Cost 2 vext1 <6,u,7,3>, <6,u,7,3> + 
2640456465U, // : Cost 3 vext2 <3,7,u,7>, <3,7,u,7> + 1571359518U, // : Cost 2 vext2 RHS, <3,u,1,2> + 1571359634U, // : Cost 2 vext2 RHS, <4,0,5,1> + 2573911067U, // : Cost 3 vext1 <3,u,7,4>, <1,3,u,7> + 2645101622U, // : Cost 3 vext2 RHS, <4,2,5,3> + 2573912918U, // : Cost 3 vext1 <3,u,7,4>, <3,u,7,4> + 1571359952U, // : Cost 2 vext2 RHS, <4,4,4,4> + 497618248U, // : Cost 1 vext2 RHS, RHS + 1571360116U, // : Cost 2 vext2 RHS, <4,6,4,6> + 2645102024U, // : Cost 3 vext2 RHS, <4,7,5,0> + 497618473U, // : Cost 1 vext2 RHS, RHS + 2645102152U, // : Cost 3 vext2 RHS, <5,0,1,2> + 1571360464U, // : Cost 2 vext2 RHS, <5,1,7,3> + 2645102334U, // : Cost 3 vext2 RHS, <5,2,3,4> + 2645102447U, // : Cost 3 vext2 RHS, <5,3,7,0> + 1571360710U, // : Cost 2 vext2 RHS, <5,4,7,6> + 1571360772U, // : Cost 2 vext2 RHS, <5,5,5,5> + 1571360866U, // : Cost 2 vext2 RHS, <5,6,7,0> + 1571360936U, // : Cost 2 vext2 RHS, <5,7,5,7> + 1571361017U, // : Cost 2 vext2 RHS, <5,u,5,7> + 1530044518U, // : Cost 2 vext1 , LHS + 2645103016U, // : Cost 3 vext2 RHS, <6,1,7,2> + 1571361274U, // : Cost 2 vext2 RHS, <6,2,7,3> + 2645103154U, // : Cost 3 vext2 RHS, <6,3,4,5> + 1530047798U, // : Cost 2 vext1 , RHS + 1188386474U, // : Cost 2 vrev <7,u,5,6> + 1571361592U, // : Cost 2 vext2 RHS, <6,6,6,6> + 1571361614U, // : Cost 2 vext2 RHS, <6,7,0,1> + 1571361695U, // : Cost 2 vext2 RHS, <6,u,0,1> + 1571361786U, // : Cost 2 vext2 RHS, <7,0,1,2> + 2573935616U, // : Cost 3 vext1 <3,u,7,7>, <1,3,5,7> + 2645103781U, // : Cost 3 vext2 RHS, <7,2,2,2> + 2573937497U, // : Cost 3 vext1 <3,u,7,7>, <3,u,7,7> + 1571362150U, // : Cost 2 vext2 RHS, <7,4,5,6> + 1512141067U, // : Cost 2 vext1 <5,u,7,7>, <5,u,7,7> + 1518113764U, // : Cost 2 vext1 <6,u,7,7>, <6,u,7,7> + 363253046U, // : Cost 1 vdup3 RHS + 363253046U, // : Cost 1 vdup3 RHS + 1571362515U, // : Cost 2 vext2 RHS, + 497620782U, // : Cost 1 vext2 RHS, LHS + 1571362693U, // : Cost 2 vext2 RHS, + 1571362748U, // : Cost 2 vext2 RHS, + 1571362879U, // : Cost 2 vext2 RHS, + 497621146U, // : Cost 1 vext2 RHS, RHS + 1571363024U, // : Cost 2 vext2 RHS, + 363253046U, // : Cost 1 vdup3 RHS + 497621349U, // : Cost 1 vext2 RHS, LHS + 135053414U, // : Cost 1 vdup0 LHS + 471081121U, // : Cost 1 vext2 LHS, LHS + 1544822948U, // : Cost 2 vext2 LHS, <0,2,0,2> + 1616140005U, // : Cost 2 vext3 LHS, + 1544823122U, // : Cost 2 vext2 LHS, <0,4,1,5> + 1512157453U, // : Cost 2 vext1 <5,u,u,0>, <5,u,u,0> + 1662220032U, // : Cost 2 vext3 RHS, + 1194457487U, // : Cost 2 vrev + 471081629U, // : Cost 1 vext2 LHS, LHS + 1544823542U, // : Cost 2 vext2 LHS, <1,0,3,2> + 202162278U, // : Cost 1 vdup1 LHS + 537753390U, // : Cost 1 vext3 LHS, LHS + 1544823768U, // : Cost 2 vext2 LHS, <1,3,1,3> + 1494248758U, // : Cost 2 vext1 <2,u,u,1>, RHS + 1544823952U, // : Cost 2 vext2 LHS, <1,5,3,7> + 1518138343U, // : Cost 2 vext1 <6,u,u,1>, <6,u,u,1> + 1640322907U, // : Cost 2 vext3 RHS, + 537753444U, // : Cost 1 vext3 LHS, LHS + 1482309734U, // : Cost 2 vext1 <0,u,u,2>, LHS + 1194031451U, // : Cost 2 vrev + 269271142U, // : Cost 1 vdup2 LHS + 835584U, // : Cost 0 copy LHS + 1482313014U, // : Cost 2 vext1 <0,u,u,2>, RHS + 2618566504U, // : Cost 3 vext2 LHS, <2,5,3,6> + 1544824762U, // : Cost 2 vext2 LHS, <2,6,3,7> + 1638479788U, // : Cost 2 vext3 RHS, + 835584U, // : Cost 0 copy LHS + 408576723U, // : Cost 1 vext1 LHS, LHS + 1482318582U, // : Cost 2 vext1 LHS, <1,0,3,2> + 120371557U, // : Cost 1 vrev LHS + 336380006U, // : Cost 1 vdup3 LHS + 408579382U, // : Cost 1 vext1 LHS, RHS + 1616140271U, // : Cost 2 vext3 LHS, + 1530098170U, // : Cost 
2 vext1 LHS, <6,2,7,3> + 1880329544U, // : Cost 2 vzipr LHS, RHS + 408581934U, // : Cost 1 vext1 LHS, LHS + 1488298086U, // : Cost 2 vext1 <1,u,u,4>, LHS + 1488299437U, // : Cost 2 vext1 <1,u,u,4>, <1,u,u,4> + 1659271204U, // : Cost 2 vext3 LHS, + 1194195311U, // : Cost 2 vrev + 161926454U, // : Cost 1 vdup0 RHS + 471084342U, // : Cost 1 vext2 LHS, RHS + 1571368308U, // : Cost 2 vext2 RHS, <4,6,4,6> + 1640323153U, // : Cost 2 vext3 RHS, + 471084585U, // : Cost 1 vext2 LHS, RHS + 1494278246U, // : Cost 2 vext1 <2,u,u,5>, LHS + 1571368656U, // : Cost 2 vext2 RHS, <5,1,7,3> + 1494280327U, // : Cost 2 vext1 <2,u,u,5>, <2,u,u,5> + 1616140415U, // : Cost 2 vext3 LHS, + 1494281526U, // : Cost 2 vext1 <2,u,u,5>, RHS + 229035318U, // : Cost 1 vdup1 RHS + 537753754U, // : Cost 1 vext3 LHS, RHS + 1750355254U, // : Cost 2 vuzpr LHS, RHS + 537753772U, // : Cost 1 vext3 LHS, RHS + 1482342502U, // : Cost 2 vext1 <0,u,u,6>, LHS + 2556084982U, // : Cost 3 vext1 <0,u,u,6>, <1,0,3,2> + 1571369466U, // : Cost 2 vext2 RHS, <6,2,7,3> + 1611938000U, // : Cost 2 vext3 LHS, + 1482345782U, // : Cost 2 vext1 <0,u,u,6>, RHS + 1194359171U, // : Cost 2 vrev + 296144182U, // : Cost 1 vdup2 RHS + 27705344U, // : Cost 0 copy RHS + 27705344U, // : Cost 0 copy RHS + 432496742U, // : Cost 1 vext1 RHS, LHS + 1488324016U, // : Cost 2 vext1 <1,u,u,7>, <1,u,u,7> + 1494296713U, // : Cost 2 vext1 <2,u,u,7>, <2,u,u,7> + 1906901148U, // : Cost 2 vzipr RHS, LHS + 432500283U, // : Cost 1 vext1 RHS, RHS + 1506242256U, // : Cost 2 vext1 RHS, <5,1,7,3> + 120699277U, // : Cost 1 vrev RHS + 363253046U, // : Cost 1 vdup3 RHS + 432502574U, // : Cost 1 vext1 RHS, LHS + 408617688U, // : Cost 1 vext1 LHS, LHS + 471086894U, // : Cost 1 vext2 LHS, LHS + 537753957U, // : Cost 1 vext3 LHS, LHS + 835584U, // : Cost 0 copy LHS + 408620342U, // : Cost 1 vext1 LHS, RHS + 471087258U, // : Cost 1 vext2 LHS, RHS + 537753997U, // : Cost 1 vext3 LHS, RHS + 27705344U, // : Cost 0 copy RHS + 835584U, // : Cost 0 copy LHS + 0 +}; diff --git a/lib/Target/AArch64/AArch64PromoteConstant.cpp b/lib/Target/AArch64/AArch64PromoteConstant.cpp new file mode 100644 index 00000000000..4723cc4978e --- /dev/null +++ b/lib/Target/AArch64/AArch64PromoteConstant.cpp @@ -0,0 +1,578 @@ +//=- AArch64PromoteConstant.cpp --- Promote constant to global for AArch64 -==// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file implements the AArch64PromoteConstant pass which promotes constants +// to global variables when this is likely to be more efficient. Currently only +// types related to constant vector (i.e., constant vector, array of constant +// vectors, constant structure with a constant vector field, etc.) are promoted +// to global variables. Constant vectors are likely to be lowered in target +// constant pool during instruction selection already; therefore, the access +// will remain the same (memory load), but the structure types are not split +// into different constant pool accesses for each field. A bonus side effect is +// that created globals may be merged by the global merge pass. +// +// FIXME: This pass may be useful for other targets too. 
+//===----------------------------------------------------------------------===//
+
+#include "AArch64.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/SmallSet.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/GlobalVariable.h"
+#include "llvm/IR/InlineAsm.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/Module.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "aarch64-promote-const"
+
+// Stress testing mode - disable heuristics.
+static cl::opt<bool> Stress("aarch64-stress-promote-const", cl::Hidden,
+                            cl::desc("Promote all vector constants"));
+
+STATISTIC(NumPromoted, "Number of promoted constants");
+STATISTIC(NumPromotedUses, "Number of promoted constants uses");
+
+//===----------------------------------------------------------------------===//
+//                       AArch64PromoteConstant
+//===----------------------------------------------------------------------===//
+
+namespace {
+/// Promotes interesting constants into global variables.
+/// The motivating example is:
+///   static const uint16_t TableA[32] = {
+///     41944, 40330, 38837, 37450, 36158, 34953, 33826, 32768,
+///     31776, 30841, 29960, 29128, 28340, 27595, 26887, 26215,
+///     25576, 24967, 24386, 23832, 23302, 22796, 22311, 21846,
+///     21400, 20972, 20561, 20165, 19785, 19419, 19066, 18725,
+///   };
+///
+///   uint8x16x4_t LoadStatic(void) {
+///     uint8x16x4_t ret;
+///     ret.val[0] = vld1q_u16(TableA +  0);
+///     ret.val[1] = vld1q_u16(TableA +  8);
+///     ret.val[2] = vld1q_u16(TableA + 16);
+///     ret.val[3] = vld1q_u16(TableA + 24);
+///     return ret;
+///   }
+///
+/// The constants in this example are folded into the uses. Thus, 4 different
+/// constants are created.
+///
+/// As their type is vector, the cheapest way to create them is to load them
+/// from memory.
+///
+/// Therefore the final assembly has 4 different loads. With this pass
+/// enabled, only one load is issued for the constants.
+class AArch64PromoteConstant : public ModulePass {
+
+public:
+  static char ID;
+  AArch64PromoteConstant() : ModulePass(ID) {}
+
+  const char *getPassName() const override { return "AArch64 Promote Constant"; }
+
+  /// Iterate over the functions and promote the interesting constants into
+  /// global variables with module scope.
+  bool runOnModule(Module &M) override {
+    DEBUG(dbgs() << getPassName() << '\n');
+    bool Changed = false;
+    for (auto &MF : M) {
+      Changed |= runOnFunction(MF);
+    }
+    return Changed;
+  }
+
+private:
+  /// Look for interesting constants used within the given function.
+  /// Promote them into global variables and load these global variables within
+  /// the related function, so that the number of inserted loads is minimal.
+  bool runOnFunction(Function &F);
+
+  // This transformation requires dominator info.
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
+    AU.setPreservesCFG();
+    AU.addRequired<DominatorTreeWrapperPass>();
+    AU.addPreserved<DominatorTreeWrapperPass>();
+  }
+
+  /// Type to store a list of Users.
+  typedef SmallVector<Value::user_iterator, 4> Users;
+  /// Map an insertion point to all the uses it dominates.
+  typedef DenseMap<Instruction *, Users> InsertionPoints;
+  /// Map a function to the required insertion point of load for a
+  /// global variable.
+  typedef DenseMap<Function *, InsertionPoints> InsertionPointsPerFunc;
+
+  /// Find the closest point that dominates the given Use.
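+  /// For illustration (an added note, not from the original patch): if the
+  /// use is an operand of a phi such as
+  ///   %p = phi <4 x i16> [ %cst, %bb1 ], [ %other, %bb2 ]
+  /// the returned point is the terminator of the matching incoming block
+  /// (%bb1 here) rather than the phi itself.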
+  Instruction *findInsertionPoint(Value::user_iterator &Use);
+
+  /// Check if the given insertion point is dominated by an existing
+  /// insertion point.
+  /// If true, the given use is added to the list of dominated uses for
+  /// the related existing point.
+  /// \param NewPt the insertion point to be checked
+  /// \param UseIt the use to be added into the list of dominated uses
+  /// \param InsertPts existing insertion points
+  /// \pre NewPt and all instructions in InsertPts belong to the same function
+  /// \return true if one of the insertion points in InsertPts dominates NewPt,
+  ///         false otherwise
+  bool isDominated(Instruction *NewPt, Value::user_iterator &UseIt,
+                   InsertionPoints &InsertPts);
+
+  /// Check if the given insertion point can be merged with an existing
+  /// insertion point in a common dominator.
+  /// If true, the given use is added to the list of the created insertion
+  /// point.
+  /// \param NewPt the insertion point to be checked
+  /// \param UseIt the use to be added into the list of dominated uses
+  /// \param InsertPts existing insertion points
+  /// \pre NewPt and all instructions in InsertPts belong to the same function
+  /// \pre isDominated returns false for the exact same parameters.
+  /// \return true if there exists an insertion point in InsertPts that could
+  ///         have been merged with NewPt in a common dominator,
+  ///         false otherwise
+  bool tryAndMerge(Instruction *NewPt, Value::user_iterator &UseIt,
+                   InsertionPoints &InsertPts);
+
+  /// Compute the minimal insertion points to dominate all the interesting
+  /// uses of Val.
+  /// Insertion points are grouped per function, and each insertion point
+  /// contains a list of all the uses it dominates within the related function.
+  /// \param Val constant to be examined
+  /// \param[out] InsPtsPerFunc output storage of the analysis
+  void computeInsertionPoints(Constant *Val,
+                              InsertionPointsPerFunc &InsPtsPerFunc);
+
+  /// Insert a definition of a new global variable at each point contained in
+  /// InsPtsPerFunc and update the related uses (also contained in
+  /// InsPtsPerFunc).
+  bool insertDefinitions(Constant *Cst, InsertionPointsPerFunc &InsPtsPerFunc);
+
+  /// Compute the minimal insertion points to dominate all the interesting
+  /// uses of Val and insert a definition of a new global variable
+  /// at these points.
+  /// Also update the uses of Val accordingly.
+  /// Currently a use of Val is considered interesting if:
+  /// - Val is not UndefValue
+  /// - Val is not zeroinitialized
+  /// - Replacing Val by a load of a global variable is valid.
+  /// \see shouldConvert for more details
+  bool computeAndInsertDefinitions(Constant *Val);
+
+  /// Promote the given constant into a global variable if it is expected to
+  /// be profitable.
+  /// \return true if Cst has been promoted
+  bool promoteConstant(Constant *Cst);
+
+  /// Transfer the list of dominated uses of IPI to NewPt in InsertPts.
+  /// Append UseIt to this list and delete the entry of IPI in InsertPts.
+  static void appendAndTransferDominatedUses(Instruction *NewPt,
+                                             Value::user_iterator &UseIt,
+                                             InsertionPoints::iterator &IPI,
+                                             InsertionPoints &InsertPts) {
+    // Record the dominated use.
+    IPI->second.push_back(UseIt);
+    // Transfer the dominated uses of IPI to NewPt.
+    // Inserting into the DenseMap may invalidate existing iterator.
+    // Keep a copy of the key to find the iterator to erase.
+    Instruction *OldInstr = IPI->first;
+    InsertPts.insert(InsertionPoints::value_type(NewPt, IPI->second));
+    // Erase IPI.
+    IPI = InsertPts.find(OldInstr);
+    InsertPts.erase(IPI);
+  }
+};
+} // end anonymous namespace
+
+char AArch64PromoteConstant::ID = 0;
+
+namespace llvm {
+void initializeAArch64PromoteConstantPass(PassRegistry &);
+}
+
+INITIALIZE_PASS_BEGIN(AArch64PromoteConstant, "aarch64-promote-const",
+                      "AArch64 Promote Constant Pass", false, false)
+INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
+INITIALIZE_PASS_END(AArch64PromoteConstant, "aarch64-promote-const",
+                    "AArch64 Promote Constant Pass", false, false)
+
+ModulePass *llvm::createAArch64PromoteConstantPass() {
+  return new AArch64PromoteConstant();
+}
+
+/// Check if the given type uses a vector type.
+static bool isConstantUsingVectorTy(const Type *CstTy) {
+  if (CstTy->isVectorTy())
+    return true;
+  if (CstTy->isStructTy()) {
+    for (unsigned EltIdx = 0, EndEltIdx = CstTy->getStructNumElements();
+         EltIdx < EndEltIdx; ++EltIdx)
+      if (isConstantUsingVectorTy(CstTy->getStructElementType(EltIdx)))
+        return true;
+  } else if (CstTy->isArrayTy())
+    return isConstantUsingVectorTy(CstTy->getArrayElementType());
+  return false;
+}
+
+/// Check if the given use (Instruction + OpIdx) of Cst should be converted
+/// into a load of a global variable initialized with Cst.
+/// A use should be converted if it is legal to do so.
+/// For instance, it is not legal to turn the mask operand of a shuffle vector
+/// into a load of a global variable.
+static bool shouldConvertUse(const Constant *Cst, const Instruction *Instr,
+                             unsigned OpIdx) {
+  // shufflevector instruction expects a const for the mask argument, i.e., the
+  // third argument. Do not promote this use in that case.
+  if (isa<ShuffleVectorInst>(Instr) && OpIdx == 2)
+    return false;
+
+  // extractvalue instruction expects a const idx.
+  if (isa<ExtractValueInst>(Instr) && OpIdx > 0)
+    return false;
+
+  // insertvalue instruction expects a const idx.
+  if (isa<InsertValueInst>(Instr) && OpIdx > 1)
+    return false;
+
+  if (isa<AllocaInst>(Instr) && OpIdx > 0)
+    return false;
+
+  // Alignment argument must be constant.
+  if (isa<LoadInst>(Instr) && OpIdx > 0)
+    return false;
+
+  // Alignment argument must be constant.
+  if (isa<StoreInst>(Instr) && OpIdx > 1)
+    return false;
+
+  // Index must be constant.
+  if (isa<GetElementPtrInst>(Instr) && OpIdx > 0)
+    return false;
+
+  // Personality function and filters must be constant.
+  // Give up on that instruction.
+  if (isa<LandingPadInst>(Instr))
+    return false;
+
+  // Switch instruction expects constants to compare to.
+  if (isa<SwitchInst>(Instr))
+    return false;
+
+  // Expected address must be a constant.
+  if (isa<IndirectBrInst>(Instr))
+    return false;
+
+  // Do not mess with intrinsics.
+  if (isa<IntrinsicInst>(Instr))
+    return false;
+
+  // Do not mess with inline asm.
+  const CallInst *CI = dyn_cast<CallInst>(Instr);
+  if (CI && isa<InlineAsm>(CI->getCalledValue()))
+    return false;
+
+  return true;
+}
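+
+// Example of the filtering above (an added sketch, not from the original
+// patch): in
+//   %s = shufflevector <4 x i16> %a, <4 x i16> %b,
+//                      <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+// the mask operand (OpIdx == 2) must stay a literal constant, so that use is
+// rejected, whereas the same vector constant used as a stored value could be
+// promoted.
+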
+/// Check if the given Cst should be converted into
+/// a load of a global variable initialized with Cst.
+/// A constant should be converted if it is likely that the materialization of
+/// the constant will be tricky. Thus, we give up on zero or undef values.
+///
+/// \todo Currently, accept only vector related types.
+/// Also we give up on all simple vector types to keep the existing
+/// behavior. Otherwise, we should push all the checks of the BUILD_VECTOR
+/// lowering here. By giving up, we lose the potential benefit of merging
+/// constants via global merge and the fact that the same constant is stored
+/// only once with this method (versus once per function that uses the
+/// constant with the regular approach, even for float).
+/// Again, the simplest solution would be to promote every
+/// constant and rematerialize them when they are actually cheap to create.
+static bool shouldConvert(const Constant *Cst) {
+  if (isa<UndefValue>(Cst))
+    return false;
+
+  // FIXME: In some cases, it may be interesting to promote in memory
+  // a zero initialized constant.
+  // E.g., when the type of Cst requires more instructions than the
+  // adrp/add/load sequence or when this sequence can be shared by several
+  // instances of Cst.
+  // Ideally, we could promote this into a global and rematerialize the
+  // constant when doing so turns out to be a bad idea.
+  if (Cst->isZeroValue())
+    return false;
+
+  if (Stress)
+    return true;
+
+  // FIXME: see function \todo
+  if (Cst->getType()->isVectorTy())
+    return false;
+  return isConstantUsingVectorTy(Cst->getType());
+}
+
+Instruction *
+AArch64PromoteConstant::findInsertionPoint(Value::user_iterator &Use) {
+  // If this user is a phi, the insertion point is in the related
+  // incoming basic block.
+  PHINode *PhiInst = dyn_cast<PHINode>(*Use);
+  Instruction *InsertionPoint;
+  if (PhiInst)
+    InsertionPoint =
+        PhiInst->getIncomingBlock(Use.getOperandNo())->getTerminator();
+  else
+    InsertionPoint = dyn_cast<Instruction>(*Use);
+  assert(InsertionPoint && "User is not an instruction!");
+  return InsertionPoint;
+}
+
+bool AArch64PromoteConstant::isDominated(Instruction *NewPt,
+                                         Value::user_iterator &UseIt,
+                                         InsertionPoints &InsertPts) {
+
+  DominatorTree &DT = getAnalysis<DominatorTreeWrapperPass>(
+      *NewPt->getParent()->getParent()).getDomTree();
+
+  // Traverse all the existing insertion points and check if one is dominating
+  // NewPt. If it is, remember that.
+  for (auto &IPI : InsertPts) {
+    if (NewPt == IPI.first || DT.dominates(IPI.first, NewPt) ||
+        // When IPI.first is a terminator instruction, DT may think that
+        // the result is defined on the edge.
+        // Here we are testing the insertion point, not the definition.
+        (IPI.first->getParent() != NewPt->getParent() &&
+         DT.dominates(IPI.first->getParent(), NewPt->getParent()))) {
+      // No need to insert this point. Just record the dominated use.
+      DEBUG(dbgs() << "Insertion point dominated by:\n");
+      DEBUG(IPI.first->print(dbgs()));
+      DEBUG(dbgs() << '\n');
+      IPI.second.push_back(UseIt);
+      return true;
+    }
+  }
+  return false;
+}
+
+bool AArch64PromoteConstant::tryAndMerge(Instruction *NewPt,
+                                         Value::user_iterator &UseIt,
+                                         InsertionPoints &InsertPts) {
+  DominatorTree &DT = getAnalysis<DominatorTreeWrapperPass>(
+      *NewPt->getParent()->getParent()).getDomTree();
+  BasicBlock *NewBB = NewPt->getParent();
+
+  // Traverse all the existing insertion points and check if one is dominated
+  // by NewPt and thus useless or can be combined with NewPt into a common
+  // dominator.
+  for (InsertionPoints::iterator IPI = InsertPts.begin(),
+                                 EndIPI = InsertPts.end();
+       IPI != EndIPI; ++IPI) {
+    BasicBlock *CurBB = IPI->first->getParent();
+    if (NewBB == CurBB) {
+      // Instructions are in the same block.
+      // By construction, NewPt is dominating the other.
+      // Indeed, isDominated returned false with the exact same arguments.
+      DEBUG(dbgs() << "Merge insertion point with:\n");
+      DEBUG(IPI->first->print(dbgs()));
+      DEBUG(dbgs() << "\nat considered insertion point.\n");
+      appendAndTransferDominatedUses(NewPt, UseIt, IPI, InsertPts);
+      return true;
+    }
+
+    // Look for a common dominator.
+    BasicBlock *CommonDominator = DT.findNearestCommonDominator(NewBB, CurBB);
+    // If none exists, we cannot merge these two points.
+    if (!CommonDominator)
+      continue;
+
+    if (CommonDominator != NewBB) {
+      // By construction, the CommonDominator cannot be CurBB.
+      assert(CommonDominator != CurBB &&
+             "Instruction has not been rejected during isDominated check!");
+      // Take the last instruction of the CommonDominator as insertion point.
+      NewPt = CommonDominator->getTerminator();
+    }
+    // else, CommonDominator is the block of NewBB, hence NewBB is the last
+    // possible insertion point in that block.
+    DEBUG(dbgs() << "Merge insertion point with:\n");
+    DEBUG(IPI->first->print(dbgs()));
+    DEBUG(dbgs() << '\n');
+    DEBUG(NewPt->print(dbgs()));
+    DEBUG(dbgs() << '\n');
+    appendAndTransferDominatedUses(NewPt, UseIt, IPI, InsertPts);
+    return true;
+  }
+  return false;
+}
+
+void AArch64PromoteConstant::computeInsertionPoints(
+    Constant *Val, InsertionPointsPerFunc &InsPtsPerFunc) {
+  DEBUG(dbgs() << "** Compute insertion points **\n");
+  for (Value::user_iterator UseIt = Val->user_begin(),
+                            EndUseIt = Val->user_end();
+       UseIt != EndUseIt; ++UseIt) {
+    // If the user is not an Instruction, we cannot modify it.
+    if (!isa<Instruction>(*UseIt))
+      continue;
+
+    // Filter out uses that should not be converted.
+    if (!shouldConvertUse(Val, cast<Instruction>(*UseIt), UseIt.getOperandNo()))
+      continue;
+
+    DEBUG(dbgs() << "Considered use, opidx " << UseIt.getOperandNo() << ":\n");
+    DEBUG((*UseIt)->print(dbgs()));
+    DEBUG(dbgs() << '\n');
+
+    Instruction *InsertionPoint = findInsertionPoint(UseIt);
+
+    DEBUG(dbgs() << "Considered insertion point:\n");
+    DEBUG(InsertionPoint->print(dbgs()));
+    DEBUG(dbgs() << '\n');
+
+    // Check if the current insertion point is useless, i.e., it is dominated
+    // by another one.
+    InsertionPoints &InsertPts =
+        InsPtsPerFunc[InsertionPoint->getParent()->getParent()];
+    if (isDominated(InsertionPoint, UseIt, InsertPts))
+      continue;
+    // This insertion point is useful, check if we can merge some insertion
+    // point in a common dominator or if NewPt dominates an existing one.
+    if (tryAndMerge(InsertionPoint, UseIt, InsertPts))
+      continue;
+
+    DEBUG(dbgs() << "Keep considered insertion point\n");
+
+    // It is definitely useful on its own.
+    InsertPts[InsertionPoint].push_back(UseIt);
+  }
+}
+
+bool AArch64PromoteConstant::insertDefinitions(
+    Constant *Cst, InsertionPointsPerFunc &InsPtsPerFunc) {
+  // We will create one global variable per Module.
+  DenseMap<Module *, GlobalVariable *> ModuleToMergedGV;
+  bool HasChanged = false;
+
+  // Traverse all insertion points in all the functions.
+  for (InsertionPointsPerFunc::iterator FctToInstPtsIt = InsPtsPerFunc.begin(),
+                                        EndIt = InsPtsPerFunc.end();
+       FctToInstPtsIt != EndIt; ++FctToInstPtsIt) {
+    InsertionPoints &InsertPts = FctToInstPtsIt->second;
+// Do more checking for debug purposes.
+#ifndef NDEBUG
+    DominatorTree &DT = getAnalysis<DominatorTreeWrapperPass>(
+        *FctToInstPtsIt->first).getDomTree();
+#endif
+    GlobalVariable *PromotedGV;
+    assert(!InsertPts.empty() && "Empty uses does not need a definition");
+
+    Module *M = FctToInstPtsIt->first->getParent();
+    DenseMap<Module *, GlobalVariable *>::iterator MapIt =
+        ModuleToMergedGV.find(M);
+    if (MapIt == ModuleToMergedGV.end()) {
+      PromotedGV = new GlobalVariable(
+          *M, Cst->getType(), true, GlobalValue::InternalLinkage, nullptr,
+          "_PromotedConst", nullptr, GlobalVariable::NotThreadLocal);
+      PromotedGV->setInitializer(Cst);
+      ModuleToMergedGV[M] = PromotedGV;
+      DEBUG(dbgs() << "Global replacement: ");
+      DEBUG(PromotedGV->print(dbgs()));
+      DEBUG(dbgs() << '\n');
+      ++NumPromoted;
+      HasChanged = true;
+    } else {
+      PromotedGV = MapIt->second;
+    }
+
+    for (InsertionPoints::iterator IPI = InsertPts.begin(),
+                                   EndIPI = InsertPts.end();
+         IPI != EndIPI; ++IPI) {
+      // Create the load of the global variable.
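+      // Illustrative note (not part of the original patch): for a promoted
+      // <8 x i16> constant the resulting IR is roughly
+      //   @_PromotedConst = internal constant <8 x i16> <...>
+      //   %promoted = load <8 x i16>* @_PromotedConst
+      // and every use dominated by this point is rewritten to %promoted.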
+      IRBuilder<> Builder(IPI->first->getParent(), IPI->first);
+      LoadInst *LoadedCst = Builder.CreateLoad(PromotedGV);
+      DEBUG(dbgs() << "**********\n");
+      DEBUG(dbgs() << "New def: ");
+      DEBUG(LoadedCst->print(dbgs()));
+      DEBUG(dbgs() << '\n');
+
+      // Update the dominated uses.
+      Users &DominatedUsers = IPI->second;
+      for (Value::user_iterator Use : DominatedUsers) {
+#ifndef NDEBUG
+        assert((DT.dominates(LoadedCst, cast<Instruction>(*Use)) ||
+                (isa<PHINode>(*Use) &&
+                 DT.dominates(LoadedCst, findInsertionPoint(Use)))) &&
+               "Inserted definition does not dominate all its uses!");
+#endif
+        DEBUG(dbgs() << "Use to update " << Use.getOperandNo() << ":");
+        DEBUG(Use->print(dbgs()));
+        DEBUG(dbgs() << '\n');
+        Use->setOperand(Use.getOperandNo(), LoadedCst);
+        ++NumPromotedUses;
+      }
+    }
+  }
+  return HasChanged;
+}
+
+bool AArch64PromoteConstant::computeAndInsertDefinitions(Constant *Val) {
+  InsertionPointsPerFunc InsertPtsPerFunc;
+  computeInsertionPoints(Val, InsertPtsPerFunc);
+  return insertDefinitions(Val, InsertPtsPerFunc);
+}
+
+bool AArch64PromoteConstant::promoteConstant(Constant *Cst) {
+  assert(Cst && "Given variable is not a valid constant.");
+
+  if (!shouldConvert(Cst))
+    return false;
+
+  DEBUG(dbgs() << "******************************\n");
+  DEBUG(dbgs() << "Candidate constant: ");
+  DEBUG(Cst->print(dbgs()));
+  DEBUG(dbgs() << '\n');
+
+  return computeAndInsertDefinitions(Cst);
+}
+
+bool AArch64PromoteConstant::runOnFunction(Function &F) {
+  // Look for instructions using constant vectors. Promote those constants to a
+  // global variable. Create as few loads of this variable as possible and
+  // update the uses accordingly.
+  bool LocalChange = false;
+  SmallSet<Constant *, 8> AlreadyChecked;
+
+  for (auto &MBB : F) {
+    for (auto &MI : MBB) {
+      // Traverse the operands, looking for constant vectors. Replace them by a
+      // load of a global variable of constant vector type.
+      for (unsigned OpIdx = 0, EndOpIdx = MI.getNumOperands();
+           OpIdx != EndOpIdx; ++OpIdx) {
+        Constant *Cst = dyn_cast<Constant>(MI.getOperand(OpIdx));
+        // There is no point in promoting global values as they are already
+        // global. Do not promote constant expressions either, as they may
+        // require some code expansion.
+        if (Cst && !isa<GlobalValue>(Cst) && !isa<ConstantExpr>(Cst) &&
+            AlreadyChecked.insert(Cst))
+          LocalChange |= promoteConstant(Cst);
+      }
+    }
+  }
+  return LocalChange;
+}
diff --git a/lib/Target/AArch64/AArch64RegisterInfo.cpp b/lib/Target/AArch64/AArch64RegisterInfo.cpp
new file mode 100644
index 00000000000..48a361d50e5
--- /dev/null
+++ b/lib/Target/AArch64/AArch64RegisterInfo.cpp
@@ -0,0 +1,404 @@
+//===- AArch64RegisterInfo.cpp - AArch64 Register Information ------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the AArch64 implementation of the TargetRegisterInfo
+// class.
+// +//===----------------------------------------------------------------------===// + +#include "AArch64RegisterInfo.h" +#include "AArch64FrameLowering.h" +#include "AArch64InstrInfo.h" +#include "AArch64Subtarget.h" +#include "MCTargetDesc/AArch64AddressingModes.h" +#include "llvm/ADT/BitVector.h" +#include "llvm/CodeGen/MachineFrameInfo.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/CodeGen/RegisterScavenging.h" +#include "llvm/IR/Function.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Support/raw_ostream.h" +#include "llvm/Target/TargetFrameLowering.h" +#include "llvm/Target/TargetOptions.h" + +using namespace llvm; + +#define GET_REGINFO_TARGET_DESC +#include "AArch64GenRegisterInfo.inc" + +AArch64RegisterInfo::AArch64RegisterInfo(const AArch64InstrInfo *tii, + const AArch64Subtarget *sti) + : AArch64GenRegisterInfo(AArch64::LR), TII(tii), STI(sti) {} + +const MCPhysReg * +AArch64RegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) const { + assert(MF && "Invalid MachineFunction pointer."); + if (MF->getFunction()->getCallingConv() == CallingConv::AnyReg) + return CSR_AArch64_AllRegs_SaveList; + else + return CSR_AArch64_AAPCS_SaveList; +} + +const uint32_t * +AArch64RegisterInfo::getCallPreservedMask(CallingConv::ID CC) const { + if (CC == CallingConv::AnyReg) + return CSR_AArch64_AllRegs_RegMask; + else + return CSR_AArch64_AAPCS_RegMask; +} + +const uint32_t *AArch64RegisterInfo::getTLSCallPreservedMask() const { + if (STI->isTargetDarwin()) + return CSR_AArch64_TLS_Darwin_RegMask; + + assert(STI->isTargetELF() && "only expect Darwin or ELF TLS"); + return CSR_AArch64_TLS_ELF_RegMask; +} + +const uint32_t * +AArch64RegisterInfo::getThisReturnPreservedMask(CallingConv::ID) const { + // This should return a register mask that is the same as that returned by + // getCallPreservedMask but that additionally preserves the register used for + // the first i64 argument (which must also be the register used to return a + // single i64 return value) + // + // In case that the calling convention does not use the same register for + // both, the function should return NULL (does not currently apply) + return CSR_AArch64_AAPCS_ThisReturn_RegMask; +} + +BitVector +AArch64RegisterInfo::getReservedRegs(const MachineFunction &MF) const { + const TargetFrameLowering *TFI = MF.getTarget().getFrameLowering(); + + // FIXME: avoid re-calculating this every time. 
+ BitVector Reserved(getNumRegs()); + Reserved.set(AArch64::SP); + Reserved.set(AArch64::XZR); + Reserved.set(AArch64::WSP); + Reserved.set(AArch64::WZR); + + if (TFI->hasFP(MF) || STI->isTargetDarwin()) { + Reserved.set(AArch64::FP); + Reserved.set(AArch64::W29); + } + + if (STI->isTargetDarwin()) { + Reserved.set(AArch64::X18); // Platform register + Reserved.set(AArch64::W18); + } + + if (hasBasePointer(MF)) { + Reserved.set(AArch64::X19); + Reserved.set(AArch64::W19); + } + + return Reserved; +} + +bool AArch64RegisterInfo::isReservedReg(const MachineFunction &MF, + unsigned Reg) const { + const TargetFrameLowering *TFI = MF.getTarget().getFrameLowering(); + + switch (Reg) { + default: + break; + case AArch64::SP: + case AArch64::XZR: + case AArch64::WSP: + case AArch64::WZR: + return true; + case AArch64::X18: + case AArch64::W18: + return STI->isTargetDarwin(); + case AArch64::FP: + case AArch64::W29: + return TFI->hasFP(MF) || STI->isTargetDarwin(); + case AArch64::W19: + case AArch64::X19: + return hasBasePointer(MF); + } + + return false; +} + +const TargetRegisterClass * +AArch64RegisterInfo::getPointerRegClass(const MachineFunction &MF, + unsigned Kind) const { + return &AArch64::GPR64RegClass; +} + +const TargetRegisterClass * +AArch64RegisterInfo::getCrossCopyRegClass(const TargetRegisterClass *RC) const { + if (RC == &AArch64::CCRRegClass) + return nullptr; // Can't copy NZCV. + return RC; +} + +unsigned AArch64RegisterInfo::getBaseRegister() const { return AArch64::X19; } + +bool AArch64RegisterInfo::hasBasePointer(const MachineFunction &MF) const { + const MachineFrameInfo *MFI = MF.getFrameInfo(); + + // In the presence of variable sized objects, if the fixed stack size is + // large enough that referencing from the FP won't result in things being + // in range relatively often, we can use a base pointer to allow access + // from the other direction like the SP normally works. + if (MFI->hasVarSizedObjects()) { + // Conservatively estimate whether the negative offset from the frame + // pointer will be sufficient to reach. If a function has a smallish + // frame, it's less likely to have lots of spills and callee saved + // space, so it's all more likely to be within range of the frame pointer. + // If it's wrong, we'll materialize the constant and still get to the + // object; it's just suboptimal. Negative offsets use the unscaled + // load/store instructions, which have a 9-bit signed immediate. + if (MFI->getLocalFrameSize() < 256) + return false; + return true; + } + + return false; +} + +unsigned +AArch64RegisterInfo::getFrameRegister(const MachineFunction &MF) const { + const TargetFrameLowering *TFI = MF.getTarget().getFrameLowering(); + + return TFI->hasFP(MF) ? AArch64::FP : AArch64::SP; +} + +bool AArch64RegisterInfo::requiresRegisterScavenging( + const MachineFunction &MF) const { + return true; +} + +bool AArch64RegisterInfo::requiresVirtualBaseRegisters( + const MachineFunction &MF) const { + return true; +} + +bool +AArch64RegisterInfo::useFPForScavengingIndex(const MachineFunction &MF) const { + const MachineFrameInfo *MFI = MF.getFrameInfo(); + // AArch64FrameLowering::resolveFrameIndexReference() can always fall back + // to the stack pointer, so only put the emergency spill slot next to the + // FP when there's no better way to access it (SP or base pointer). 
+ return MFI->hasVarSizedObjects() && !hasBasePointer(MF); +} + +bool AArch64RegisterInfo::requiresFrameIndexScavenging( + const MachineFunction &MF) const { + return true; +} + +bool +AArch64RegisterInfo::cannotEliminateFrame(const MachineFunction &MF) const { + const MachineFrameInfo *MFI = MF.getFrameInfo(); + // Only consider eliminating leaf frames. + if (MFI->hasCalls() || (MF.getTarget().Options.DisableFramePointerElim(MF) && + MFI->adjustsStack())) + return true; + return MFI->hasVarSizedObjects() || MFI->isFrameAddressTaken(); +} + +/// needsFrameBaseReg - Returns true if the instruction's frame index +/// reference would be better served by a base register other than FP +/// or SP. Used by LocalStackFrameAllocation to determine which frame index +/// references it should create new base registers for. +bool AArch64RegisterInfo::needsFrameBaseReg(MachineInstr *MI, + int64_t Offset) const { + for (unsigned i = 0; !MI->getOperand(i).isFI(); ++i) + assert(i < MI->getNumOperands() && + "Instr doesn't have FrameIndex operand!"); + + // It's the load/store FI references that cause issues, as it can be difficult + // to materialize the offset if it won't fit in the literal field. Estimate + // based on the size of the local frame and some conservative assumptions + // about the rest of the stack frame (note, this is pre-regalloc, so + // we don't know everything for certain yet) whether this offset is likely + // to be out of range of the immediate. Return true if so. + + // We only generate virtual base registers for loads and stores, so + // return false for everything else. + if (!MI->mayLoad() && !MI->mayStore()) + return false; + + // Without a virtual base register, if the function has variable sized + // objects, all fixed-size local references will be via the frame pointer, + // Approximate the offset and see if it's legal for the instruction. + // Note that the incoming offset is based on the SP value at function entry, + // so it'll be negative. + MachineFunction &MF = *MI->getParent()->getParent(); + const TargetFrameLowering *TFI = MF.getTarget().getFrameLowering(); + MachineFrameInfo *MFI = MF.getFrameInfo(); + + // Estimate an offset from the frame pointer. + // Conservatively assume all GPR callee-saved registers get pushed. + // FP, LR, X19-X28, D8-D15. 64-bits each. + int64_t FPOffset = Offset - 16 * 20; + // Estimate an offset from the stack pointer. + // The incoming offset is relating to the SP at the start of the function, + // but when we access the local it'll be relative to the SP after local + // allocation, so adjust our SP-relative offset by that allocation size. + Offset += MFI->getLocalFrameSize(); + // Assume that we'll have at least some spill slots allocated. + // FIXME: This is a total SWAG number. We should run some statistics + // and pick a real one. + Offset += 128; // 128 bytes of spill slots + + // If there is a frame pointer, try using it. + // The FP is only available if there is no dynamic realignment. We + // don't know for sure yet whether we'll need that, so we guess based + // on whether there are any local variables that would trigger it. + if (TFI->hasFP(MF) && isFrameOffsetLegal(MI, FPOffset)) + return false; + + // If we can reference via the stack pointer or base pointer, try that. + // FIXME: This (and the code that resolves the references) can be improved + // to only disallow SP relative references in the live range of + // the VLA(s). In practice, it's unclear how much difference that + // would make, but it may be worth doing. 
+ if (isFrameOffsetLegal(MI, Offset)) + return false; + + // The offset likely isn't legal; we want to allocate a virtual base register. + return true; +} + +bool AArch64RegisterInfo::isFrameOffsetLegal(const MachineInstr *MI, + int64_t Offset) const { + assert(Offset <= INT_MAX && "Offset too big to fit in int."); + assert(MI && "Unable to get the legal offset for nil instruction."); + int SaveOffset = Offset; + return isAArch64FrameOffsetLegal(*MI, SaveOffset) & AArch64FrameOffsetIsLegal; +} + +/// Insert defining instruction(s) for BaseReg to be a pointer to FrameIdx +/// at the beginning of the basic block. +void AArch64RegisterInfo::materializeFrameBaseRegister(MachineBasicBlock *MBB, + unsigned BaseReg, + int FrameIdx, + int64_t Offset) const { + MachineBasicBlock::iterator Ins = MBB->begin(); + DebugLoc DL; // Defaults to "unknown" + if (Ins != MBB->end()) + DL = Ins->getDebugLoc(); + + const MCInstrDesc &MCID = TII->get(AArch64::ADDXri); + MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo(); + const MachineFunction &MF = *MBB->getParent(); + MRI.constrainRegClass(BaseReg, TII->getRegClass(MCID, 0, this, MF)); + unsigned Shifter = AArch64_AM::getShifterImm(AArch64_AM::LSL, 0); + + BuildMI(*MBB, Ins, DL, MCID, BaseReg) + .addFrameIndex(FrameIdx) + .addImm(Offset) + .addImm(Shifter); +} + +void AArch64RegisterInfo::resolveFrameIndex(MachineInstr &MI, unsigned BaseReg, + int64_t Offset) const { + int Off = Offset; // ARM doesn't need the general 64-bit offsets + unsigned i = 0; + + while (!MI.getOperand(i).isFI()) { + ++i; + assert(i < MI.getNumOperands() && "Instr doesn't have FrameIndex operand!"); + } + bool Done = rewriteAArch64FrameIndex(MI, i, BaseReg, Off, TII); + assert(Done && "Unable to resolve frame index!"); + (void)Done; +} + +void AArch64RegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II, + int SPAdj, unsigned FIOperandNum, + RegScavenger *RS) const { + assert(SPAdj == 0 && "Unexpected"); + + MachineInstr &MI = *II; + MachineBasicBlock &MBB = *MI.getParent(); + MachineFunction &MF = *MBB.getParent(); + const AArch64FrameLowering *TFI = static_cast( + MF.getTarget().getFrameLowering()); + + int FrameIndex = MI.getOperand(FIOperandNum).getIndex(); + unsigned FrameReg; + int Offset; + + // Special handling of dbg_value, stackmap and patchpoint instructions. + if (MI.isDebugValue() || MI.getOpcode() == TargetOpcode::STACKMAP || + MI.getOpcode() == TargetOpcode::PATCHPOINT) { + Offset = TFI->resolveFrameIndexReference(MF, FrameIndex, FrameReg, + /*PreferFP=*/true); + Offset += MI.getOperand(FIOperandNum + 1).getImm(); + MI.getOperand(FIOperandNum).ChangeToRegister(FrameReg, false /*isDef*/); + MI.getOperand(FIOperandNum + 1).ChangeToImmediate(Offset); + return; + } + + // Modify MI as necessary to handle as much of 'Offset' as possible + Offset = TFI->resolveFrameIndexReference(MF, FrameIndex, FrameReg); + if (rewriteAArch64FrameIndex(MI, FIOperandNum, FrameReg, Offset, TII)) + return; + + assert((!RS || !RS->isScavengingFrameIndex(FrameIndex)) && + "Emergency spill slot is out of reach"); + + // If we get here, the immediate doesn't fit into the instruction. We folded + // as much as possible above. Handle the rest, providing a register that is + // SP+LargeImm. 
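Note: the estimation in needsFrameBaseReg is conservative arithmetic on the entry-SP-relative offset: subtract a guessed callee-save area for the FP-relative view (the 16 * 20 term covering FP, LR, X19-X28 and D8-D15), and add the local frame size plus a guessed 128 bytes of spill slots for the SP-relative view; a virtual base register is requested only if neither estimate is encodable. A small sketch of that arithmetic, with isFrameOffsetLegal abstracted to booleans (everything below is an illustration, not the in-tree code):

#include <cstdint>

// Sketch of the offset estimates computed in needsFrameBaseReg() above.
// entrySPOffset is the incoming frame-index offset (SP-relative at function
// entry, so negative for locals); the constants mirror the patch's guesses.
struct OffsetEstimates {
  int64_t fromFP;   // offset if the access ends up FP-relative
  int64_t fromSP;   // offset if the access ends up SP-relative
};

OffsetEstimates estimate(int64_t entrySPOffset, int64_t localFrameSize) {
  OffsetEstimates E;
  E.fromFP = entrySPOffset - 16 * 20;               // assumed callee-save area
  E.fromSP = entrySPOffset + localFrameSize + 128;  // locals + guessed spills
  return E;
}

// A virtual base register is requested only when neither estimate would be
// encodable in the instruction (the two isFrameOffsetLegal checks above).
bool wantsVirtualBaseReg(bool hasFP, bool fpOffsetLegal, bool spOffsetLegal) {
  if (hasFP && fpOffsetLegal)
    return false;
  return !spOffsetLegal;
}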
+ unsigned ScratchReg = + MF.getRegInfo().createVirtualRegister(&AArch64::GPR64RegClass); + emitFrameOffset(MBB, II, MI.getDebugLoc(), ScratchReg, FrameReg, Offset, TII); + MI.getOperand(FIOperandNum).ChangeToRegister(ScratchReg, false, false, true); +} + +namespace llvm { + +unsigned AArch64RegisterInfo::getRegPressureLimit(const TargetRegisterClass *RC, + MachineFunction &MF) const { + const TargetFrameLowering *TFI = MF.getTarget().getFrameLowering(); + + switch (RC->getID()) { + default: + return 0; + case AArch64::GPR32RegClassID: + case AArch64::GPR32spRegClassID: + case AArch64::GPR32allRegClassID: + case AArch64::GPR64spRegClassID: + case AArch64::GPR64allRegClassID: + case AArch64::GPR64RegClassID: + case AArch64::GPR32commonRegClassID: + case AArch64::GPR64commonRegClassID: + return 32 - 1 // XZR/SP + - (TFI->hasFP(MF) || STI->isTargetDarwin()) // FP + - STI->isTargetDarwin() // X18 reserved as platform register + - hasBasePointer(MF); // X19 + case AArch64::FPR8RegClassID: + case AArch64::FPR16RegClassID: + case AArch64::FPR32RegClassID: + case AArch64::FPR64RegClassID: + case AArch64::FPR128RegClassID: + return 32; + + case AArch64::DDRegClassID: + case AArch64::DDDRegClassID: + case AArch64::DDDDRegClassID: + case AArch64::QQRegClassID: + case AArch64::QQQRegClassID: + case AArch64::QQQQRegClassID: + return 32; + + case AArch64::FPR128_loRegClassID: + return 16; + } +} + +} // namespace llvm diff --git a/lib/Target/AArch64/AArch64RegisterInfo.h b/lib/Target/AArch64/AArch64RegisterInfo.h new file mode 100644 index 00000000000..76af1edce72 --- /dev/null +++ b/lib/Target/AArch64/AArch64RegisterInfo.h @@ -0,0 +1,101 @@ +//==- AArch64RegisterInfo.h - AArch64 Register Information Impl --*- C++ -*-==// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file contains the AArch64 implementation of the MRegisterInfo class. +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_TARGET_AArch64REGISTERINFO_H +#define LLVM_TARGET_AArch64REGISTERINFO_H + +#define GET_REGINFO_HEADER +#include "AArch64GenRegisterInfo.inc" + +namespace llvm { + +class AArch64InstrInfo; +class AArch64Subtarget; +class MachineFunction; +class RegScavenger; +class TargetRegisterClass; + +struct AArch64RegisterInfo : public AArch64GenRegisterInfo { +private: + const AArch64InstrInfo *TII; + const AArch64Subtarget *STI; + +public: + AArch64RegisterInfo(const AArch64InstrInfo *tii, const AArch64Subtarget *sti); + + bool isReservedReg(const MachineFunction &MF, unsigned Reg) const; + + /// Code Generation virtual methods... + const MCPhysReg * + getCalleeSavedRegs(const MachineFunction *MF = nullptr) const override; + const uint32_t *getCallPreservedMask(CallingConv::ID) const override; + + unsigned getCSRFirstUseCost() const override { + // The cost will be compared against BlockFrequency where entry has the + // value of 1 << 14. A value of 5 will choose to spill or split really + // cold path instead of using a callee-saved register. + return 5; + } + + // Calls involved in thread-local variable lookup save more registers than + // normal calls, so they need a different mask to represent this. 
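Note: for the GPR classes, getRegPressureLimit above starts from the 32 architectural registers, drops the always-reserved encoding 31 (XZR/SP), and then drops FP, X18 and X19 whenever getReservedRegs would reserve them. A compact restatement of that count (the booleans stand in for the TFI/STI/hasBasePointer queries; illustrative only):

// Restatement of the GPR branch of getRegPressureLimit() in the patch.
unsigned gprPressureLimit(bool hasFP, bool isDarwin, bool hasBaseReg) {
  unsigned Limit = 32 - 1;                 // encoding 31 is XZR/SP
  Limit -= (hasFP || isDarwin) ? 1 : 0;    // FP (X29) reserved
  Limit -= isDarwin ? 1 : 0;               // X18 is the Darwin platform register
  Limit -= hasBaseReg ? 1 : 0;             // X19 reserved as the base pointer
  return Limit;                            // FPR classes simply return 32
}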
+ const uint32_t *getTLSCallPreservedMask() const; + + /// getThisReturnPreservedMask - Returns a call preserved mask specific to the + /// case that 'returned' is on an i64 first argument if the calling convention + /// is one that can (partially) model this attribute with a preserved mask + /// (i.e. it is a calling convention that uses the same register for the first + /// i64 argument and an i64 return value) + /// + /// Should return NULL in the case that the calling convention does not have + /// this property + const uint32_t *getThisReturnPreservedMask(CallingConv::ID) const; + + BitVector getReservedRegs(const MachineFunction &MF) const override; + const TargetRegisterClass * + getPointerRegClass(const MachineFunction &MF, + unsigned Kind = 0) const override; + const TargetRegisterClass * + getCrossCopyRegClass(const TargetRegisterClass *RC) const override; + + bool requiresRegisterScavenging(const MachineFunction &MF) const override; + bool useFPForScavengingIndex(const MachineFunction &MF) const override; + bool requiresFrameIndexScavenging(const MachineFunction &MF) const override; + + bool needsFrameBaseReg(MachineInstr *MI, int64_t Offset) const override; + bool isFrameOffsetLegal(const MachineInstr *MI, + int64_t Offset) const override; + void materializeFrameBaseRegister(MachineBasicBlock *MBB, unsigned BaseReg, + int FrameIdx, + int64_t Offset) const override; + void resolveFrameIndex(MachineInstr &MI, unsigned BaseReg, + int64_t Offset) const override; + void eliminateFrameIndex(MachineBasicBlock::iterator II, int SPAdj, + unsigned FIOperandNum, + RegScavenger *RS = nullptr) const override; + bool cannotEliminateFrame(const MachineFunction &MF) const; + + bool requiresVirtualBaseRegisters(const MachineFunction &MF) const override; + bool hasBasePointer(const MachineFunction &MF) const; + unsigned getBaseRegister() const; + + // Debug information queries. + unsigned getFrameRegister(const MachineFunction &MF) const override; + + unsigned getRegPressureLimit(const TargetRegisterClass *RC, + MachineFunction &MF) const override; +}; + +} // end namespace llvm + +#endif // LLVM_TARGET_AArch64REGISTERINFO_H diff --git a/lib/Target/AArch64/AArch64RegisterInfo.td b/lib/Target/AArch64/AArch64RegisterInfo.td new file mode 100644 index 00000000000..21c927f2385 --- /dev/null +++ b/lib/Target/AArch64/AArch64RegisterInfo.td @@ -0,0 +1,593 @@ +//=- AArch64RegisterInfo.td - Describe the AArch64 Regisers --*- tablegen -*-=// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+// +//===----------------------------------------------------------------------===// +// +// +//===----------------------------------------------------------------------===// + + +class AArch64Reg enc, string n, list subregs = [], + list altNames = []> + : Register { + let HWEncoding = enc; + let Namespace = "AArch64"; + let SubRegs = subregs; +} + +let Namespace = "AArch64" in { + def sub_32 : SubRegIndex<32>; + + def bsub : SubRegIndex<8>; + def hsub : SubRegIndex<16>; + def ssub : SubRegIndex<32>; + def dsub : SubRegIndex<32>; + def qhisub : SubRegIndex<64>; + def qsub : SubRegIndex<64>; + // Note: Code depends on these having consecutive numbers + def dsub0 : SubRegIndex<64>; + def dsub1 : SubRegIndex<64>; + def dsub2 : SubRegIndex<64>; + def dsub3 : SubRegIndex<64>; + // Note: Code depends on these having consecutive numbers + def qsub0 : SubRegIndex<128>; + def qsub1 : SubRegIndex<128>; + def qsub2 : SubRegIndex<128>; + def qsub3 : SubRegIndex<128>; +} + +let Namespace = "AArch64" in { + def vreg : RegAltNameIndex; + def vlist1 : RegAltNameIndex; +} + +//===----------------------------------------------------------------------===// +// Registers +//===----------------------------------------------------------------------===// +def W0 : AArch64Reg<0, "w0" >, DwarfRegNum<[0]>; +def W1 : AArch64Reg<1, "w1" >, DwarfRegNum<[1]>; +def W2 : AArch64Reg<2, "w2" >, DwarfRegNum<[2]>; +def W3 : AArch64Reg<3, "w3" >, DwarfRegNum<[3]>; +def W4 : AArch64Reg<4, "w4" >, DwarfRegNum<[4]>; +def W5 : AArch64Reg<5, "w5" >, DwarfRegNum<[5]>; +def W6 : AArch64Reg<6, "w6" >, DwarfRegNum<[6]>; +def W7 : AArch64Reg<7, "w7" >, DwarfRegNum<[7]>; +def W8 : AArch64Reg<8, "w8" >, DwarfRegNum<[8]>; +def W9 : AArch64Reg<9, "w9" >, DwarfRegNum<[9]>; +def W10 : AArch64Reg<10, "w10">, DwarfRegNum<[10]>; +def W11 : AArch64Reg<11, "w11">, DwarfRegNum<[11]>; +def W12 : AArch64Reg<12, "w12">, DwarfRegNum<[12]>; +def W13 : AArch64Reg<13, "w13">, DwarfRegNum<[13]>; +def W14 : AArch64Reg<14, "w14">, DwarfRegNum<[14]>; +def W15 : AArch64Reg<15, "w15">, DwarfRegNum<[15]>; +def W16 : AArch64Reg<16, "w16">, DwarfRegNum<[16]>; +def W17 : AArch64Reg<17, "w17">, DwarfRegNum<[17]>; +def W18 : AArch64Reg<18, "w18">, DwarfRegNum<[18]>; +def W19 : AArch64Reg<19, "w19">, DwarfRegNum<[19]>; +def W20 : AArch64Reg<20, "w20">, DwarfRegNum<[20]>; +def W21 : AArch64Reg<21, "w21">, DwarfRegNum<[21]>; +def W22 : AArch64Reg<22, "w22">, DwarfRegNum<[22]>; +def W23 : AArch64Reg<23, "w23">, DwarfRegNum<[23]>; +def W24 : AArch64Reg<24, "w24">, DwarfRegNum<[24]>; +def W25 : AArch64Reg<25, "w25">, DwarfRegNum<[25]>; +def W26 : AArch64Reg<26, "w26">, DwarfRegNum<[26]>; +def W27 : AArch64Reg<27, "w27">, DwarfRegNum<[27]>; +def W28 : AArch64Reg<28, "w28">, DwarfRegNum<[28]>; +def W29 : AArch64Reg<29, "w29">, DwarfRegNum<[29]>; +def W30 : AArch64Reg<30, "w30">, DwarfRegNum<[30]>; +def WSP : AArch64Reg<31, "wsp">, DwarfRegNum<[31]>; +def WZR : AArch64Reg<31, "wzr">, DwarfRegAlias; + +let SubRegIndices = [sub_32] in { +def X0 : AArch64Reg<0, "x0", [W0]>, DwarfRegAlias; +def X1 : AArch64Reg<1, "x1", [W1]>, DwarfRegAlias; +def X2 : AArch64Reg<2, "x2", [W2]>, DwarfRegAlias; +def X3 : AArch64Reg<3, "x3", [W3]>, DwarfRegAlias; +def X4 : AArch64Reg<4, "x4", [W4]>, DwarfRegAlias; +def X5 : AArch64Reg<5, "x5", [W5]>, DwarfRegAlias; +def X6 : AArch64Reg<6, "x6", [W6]>, DwarfRegAlias; +def X7 : AArch64Reg<7, "x7", [W7]>, DwarfRegAlias; +def X8 : AArch64Reg<8, "x8", [W8]>, DwarfRegAlias; +def X9 : AArch64Reg<9, "x9", [W9]>, DwarfRegAlias; +def X10 : AArch64Reg<10, 
"x10", [W10]>, DwarfRegAlias; +def X11 : AArch64Reg<11, "x11", [W11]>, DwarfRegAlias; +def X12 : AArch64Reg<12, "x12", [W12]>, DwarfRegAlias; +def X13 : AArch64Reg<13, "x13", [W13]>, DwarfRegAlias; +def X14 : AArch64Reg<14, "x14", [W14]>, DwarfRegAlias; +def X15 : AArch64Reg<15, "x15", [W15]>, DwarfRegAlias; +def X16 : AArch64Reg<16, "x16", [W16]>, DwarfRegAlias; +def X17 : AArch64Reg<17, "x17", [W17]>, DwarfRegAlias; +def X18 : AArch64Reg<18, "x18", [W18]>, DwarfRegAlias; +def X19 : AArch64Reg<19, "x19", [W19]>, DwarfRegAlias; +def X20 : AArch64Reg<20, "x20", [W20]>, DwarfRegAlias; +def X21 : AArch64Reg<21, "x21", [W21]>, DwarfRegAlias; +def X22 : AArch64Reg<22, "x22", [W22]>, DwarfRegAlias; +def X23 : AArch64Reg<23, "x23", [W23]>, DwarfRegAlias; +def X24 : AArch64Reg<24, "x24", [W24]>, DwarfRegAlias; +def X25 : AArch64Reg<25, "x25", [W25]>, DwarfRegAlias; +def X26 : AArch64Reg<26, "x26", [W26]>, DwarfRegAlias; +def X27 : AArch64Reg<27, "x27", [W27]>, DwarfRegAlias; +def X28 : AArch64Reg<28, "x28", [W28]>, DwarfRegAlias; +def FP : AArch64Reg<29, "x29", [W29]>, DwarfRegAlias; +def LR : AArch64Reg<30, "x30", [W30]>, DwarfRegAlias; +def SP : AArch64Reg<31, "sp", [WSP]>, DwarfRegAlias; +def XZR : AArch64Reg<31, "xzr", [WZR]>, DwarfRegAlias; +} + +// Condition code register. +def NZCV : AArch64Reg<0, "nzcv">; + +// GPR register classes with the intersections of GPR32/GPR32sp and +// GPR64/GPR64sp for use by the coalescer. +def GPR32common : RegisterClass<"AArch64", [i32], 32, (sequence "W%u", 0, 30)> { + let AltOrders = [(rotl GPR32common, 8)]; + let AltOrderSelect = [{ return 1; }]; +} +def GPR64common : RegisterClass<"AArch64", [i64], 64, + (add (sequence "X%u", 0, 28), FP, LR)> { + let AltOrders = [(rotl GPR64common, 8)]; + let AltOrderSelect = [{ return 1; }]; +} +// GPR register classes which exclude SP/WSP. +def GPR32 : RegisterClass<"AArch64", [i32], 32, (add GPR32common, WZR)> { + let AltOrders = [(rotl GPR32, 8)]; + let AltOrderSelect = [{ return 1; }]; +} +def GPR64 : RegisterClass<"AArch64", [i64], 64, (add GPR64common, XZR)> { + let AltOrders = [(rotl GPR64, 8)]; + let AltOrderSelect = [{ return 1; }]; +} + +// GPR register classes which include SP/WSP. +def GPR32sp : RegisterClass<"AArch64", [i32], 32, (add GPR32common, WSP)> { + let AltOrders = [(rotl GPR32sp, 8)]; + let AltOrderSelect = [{ return 1; }]; +} +def GPR64sp : RegisterClass<"AArch64", [i64], 64, (add GPR64common, SP)> { + let AltOrders = [(rotl GPR64sp, 8)]; + let AltOrderSelect = [{ return 1; }]; +} + +def GPR32sponly : RegisterClass<"AArch64", [i32], 32, (add WSP)>; +def GPR64sponly : RegisterClass<"AArch64", [i64], 64, (add SP)>; + +def GPR64spPlus0Operand : AsmOperandClass { + let Name = "GPR64sp0"; + let RenderMethod = "addRegOperands"; + let ParserMethod = "tryParseGPR64sp0Operand"; +} + +def GPR64sp0 : RegisterOperand { + let ParserMatchClass = GPR64spPlus0Operand; +} + +// GPR register classes which include WZR/XZR AND SP/WSP. This is not a +// constraint used by any instructions, it is used as a common super-class. +def GPR32all : RegisterClass<"AArch64", [i32], 32, (add GPR32common, WZR, WSP)>; +def GPR64all : RegisterClass<"AArch64", [i64], 64, (add GPR64common, XZR, SP)>; + +// For tail calls, we can't use callee-saved registers, as they are restored +// to the saved value before the tail call, which would clobber a call address. +// This is for indirect tail calls to store the address of the destination. 
+def tcGPR64 : RegisterClass<"AArch64", [i64], 64, (sub GPR64common, X19, X20, X21, + X22, X23, X24, X25, X26, + X27, X28)>; + +// GPR register classes for post increment amount of vector load/store that +// has alternate printing when Rm=31 and prints a constant immediate value +// equal to the total number of bytes transferred. + +// FIXME: TableGen *should* be able to do these itself now. There appears to be +// a bug in counting how many operands a Post-indexed MCInst should have which +// means the aliases don't trigger. +def GPR64pi1 : RegisterOperand">; +def GPR64pi2 : RegisterOperand">; +def GPR64pi3 : RegisterOperand">; +def GPR64pi4 : RegisterOperand">; +def GPR64pi6 : RegisterOperand">; +def GPR64pi8 : RegisterOperand">; +def GPR64pi12 : RegisterOperand">; +def GPR64pi16 : RegisterOperand">; +def GPR64pi24 : RegisterOperand">; +def GPR64pi32 : RegisterOperand">; +def GPR64pi48 : RegisterOperand">; +def GPR64pi64 : RegisterOperand">; + +// Condition code regclass. +def CCR : RegisterClass<"AArch64", [i32], 32, (add NZCV)> { + let CopyCost = -1; // Don't allow copying of status registers. + + // CCR is not allocatable. + let isAllocatable = 0; +} + +//===----------------------------------------------------------------------===// +// Floating Point Scalar Registers +//===----------------------------------------------------------------------===// + +def B0 : AArch64Reg<0, "b0">, DwarfRegNum<[64]>; +def B1 : AArch64Reg<1, "b1">, DwarfRegNum<[65]>; +def B2 : AArch64Reg<2, "b2">, DwarfRegNum<[66]>; +def B3 : AArch64Reg<3, "b3">, DwarfRegNum<[67]>; +def B4 : AArch64Reg<4, "b4">, DwarfRegNum<[68]>; +def B5 : AArch64Reg<5, "b5">, DwarfRegNum<[69]>; +def B6 : AArch64Reg<6, "b6">, DwarfRegNum<[70]>; +def B7 : AArch64Reg<7, "b7">, DwarfRegNum<[71]>; +def B8 : AArch64Reg<8, "b8">, DwarfRegNum<[72]>; +def B9 : AArch64Reg<9, "b9">, DwarfRegNum<[73]>; +def B10 : AArch64Reg<10, "b10">, DwarfRegNum<[74]>; +def B11 : AArch64Reg<11, "b11">, DwarfRegNum<[75]>; +def B12 : AArch64Reg<12, "b12">, DwarfRegNum<[76]>; +def B13 : AArch64Reg<13, "b13">, DwarfRegNum<[77]>; +def B14 : AArch64Reg<14, "b14">, DwarfRegNum<[78]>; +def B15 : AArch64Reg<15, "b15">, DwarfRegNum<[79]>; +def B16 : AArch64Reg<16, "b16">, DwarfRegNum<[80]>; +def B17 : AArch64Reg<17, "b17">, DwarfRegNum<[81]>; +def B18 : AArch64Reg<18, "b18">, DwarfRegNum<[82]>; +def B19 : AArch64Reg<19, "b19">, DwarfRegNum<[83]>; +def B20 : AArch64Reg<20, "b20">, DwarfRegNum<[84]>; +def B21 : AArch64Reg<21, "b21">, DwarfRegNum<[85]>; +def B22 : AArch64Reg<22, "b22">, DwarfRegNum<[86]>; +def B23 : AArch64Reg<23, "b23">, DwarfRegNum<[87]>; +def B24 : AArch64Reg<24, "b24">, DwarfRegNum<[88]>; +def B25 : AArch64Reg<25, "b25">, DwarfRegNum<[89]>; +def B26 : AArch64Reg<26, "b26">, DwarfRegNum<[90]>; +def B27 : AArch64Reg<27, "b27">, DwarfRegNum<[91]>; +def B28 : AArch64Reg<28, "b28">, DwarfRegNum<[92]>; +def B29 : AArch64Reg<29, "b29">, DwarfRegNum<[93]>; +def B30 : AArch64Reg<30, "b30">, DwarfRegNum<[94]>; +def B31 : AArch64Reg<31, "b31">, DwarfRegNum<[95]>; + +let SubRegIndices = [bsub] in { +def H0 : AArch64Reg<0, "h0", [B0]>, DwarfRegAlias; +def H1 : AArch64Reg<1, "h1", [B1]>, DwarfRegAlias; +def H2 : AArch64Reg<2, "h2", [B2]>, DwarfRegAlias; +def H3 : AArch64Reg<3, "h3", [B3]>, DwarfRegAlias; +def H4 : AArch64Reg<4, "h4", [B4]>, DwarfRegAlias; +def H5 : AArch64Reg<5, "h5", [B5]>, DwarfRegAlias; +def H6 : AArch64Reg<6, "h6", [B6]>, DwarfRegAlias; +def H7 : AArch64Reg<7, "h7", [B7]>, DwarfRegAlias; +def H8 : AArch64Reg<8, "h8", [B8]>, DwarfRegAlias; +def H9 
: AArch64Reg<9, "h9", [B9]>, DwarfRegAlias; +def H10 : AArch64Reg<10, "h10", [B10]>, DwarfRegAlias; +def H11 : AArch64Reg<11, "h11", [B11]>, DwarfRegAlias; +def H12 : AArch64Reg<12, "h12", [B12]>, DwarfRegAlias; +def H13 : AArch64Reg<13, "h13", [B13]>, DwarfRegAlias; +def H14 : AArch64Reg<14, "h14", [B14]>, DwarfRegAlias; +def H15 : AArch64Reg<15, "h15", [B15]>, DwarfRegAlias; +def H16 : AArch64Reg<16, "h16", [B16]>, DwarfRegAlias; +def H17 : AArch64Reg<17, "h17", [B17]>, DwarfRegAlias; +def H18 : AArch64Reg<18, "h18", [B18]>, DwarfRegAlias; +def H19 : AArch64Reg<19, "h19", [B19]>, DwarfRegAlias; +def H20 : AArch64Reg<20, "h20", [B20]>, DwarfRegAlias; +def H21 : AArch64Reg<21, "h21", [B21]>, DwarfRegAlias; +def H22 : AArch64Reg<22, "h22", [B22]>, DwarfRegAlias; +def H23 : AArch64Reg<23, "h23", [B23]>, DwarfRegAlias; +def H24 : AArch64Reg<24, "h24", [B24]>, DwarfRegAlias; +def H25 : AArch64Reg<25, "h25", [B25]>, DwarfRegAlias; +def H26 : AArch64Reg<26, "h26", [B26]>, DwarfRegAlias; +def H27 : AArch64Reg<27, "h27", [B27]>, DwarfRegAlias; +def H28 : AArch64Reg<28, "h28", [B28]>, DwarfRegAlias; +def H29 : AArch64Reg<29, "h29", [B29]>, DwarfRegAlias; +def H30 : AArch64Reg<30, "h30", [B30]>, DwarfRegAlias; +def H31 : AArch64Reg<31, "h31", [B31]>, DwarfRegAlias; +} + +let SubRegIndices = [hsub] in { +def S0 : AArch64Reg<0, "s0", [H0]>, DwarfRegAlias; +def S1 : AArch64Reg<1, "s1", [H1]>, DwarfRegAlias; +def S2 : AArch64Reg<2, "s2", [H2]>, DwarfRegAlias; +def S3 : AArch64Reg<3, "s3", [H3]>, DwarfRegAlias; +def S4 : AArch64Reg<4, "s4", [H4]>, DwarfRegAlias; +def S5 : AArch64Reg<5, "s5", [H5]>, DwarfRegAlias; +def S6 : AArch64Reg<6, "s6", [H6]>, DwarfRegAlias; +def S7 : AArch64Reg<7, "s7", [H7]>, DwarfRegAlias; +def S8 : AArch64Reg<8, "s8", [H8]>, DwarfRegAlias; +def S9 : AArch64Reg<9, "s9", [H9]>, DwarfRegAlias; +def S10 : AArch64Reg<10, "s10", [H10]>, DwarfRegAlias; +def S11 : AArch64Reg<11, "s11", [H11]>, DwarfRegAlias; +def S12 : AArch64Reg<12, "s12", [H12]>, DwarfRegAlias; +def S13 : AArch64Reg<13, "s13", [H13]>, DwarfRegAlias; +def S14 : AArch64Reg<14, "s14", [H14]>, DwarfRegAlias; +def S15 : AArch64Reg<15, "s15", [H15]>, DwarfRegAlias; +def S16 : AArch64Reg<16, "s16", [H16]>, DwarfRegAlias; +def S17 : AArch64Reg<17, "s17", [H17]>, DwarfRegAlias; +def S18 : AArch64Reg<18, "s18", [H18]>, DwarfRegAlias; +def S19 : AArch64Reg<19, "s19", [H19]>, DwarfRegAlias; +def S20 : AArch64Reg<20, "s20", [H20]>, DwarfRegAlias; +def S21 : AArch64Reg<21, "s21", [H21]>, DwarfRegAlias; +def S22 : AArch64Reg<22, "s22", [H22]>, DwarfRegAlias; +def S23 : AArch64Reg<23, "s23", [H23]>, DwarfRegAlias; +def S24 : AArch64Reg<24, "s24", [H24]>, DwarfRegAlias; +def S25 : AArch64Reg<25, "s25", [H25]>, DwarfRegAlias; +def S26 : AArch64Reg<26, "s26", [H26]>, DwarfRegAlias; +def S27 : AArch64Reg<27, "s27", [H27]>, DwarfRegAlias; +def S28 : AArch64Reg<28, "s28", [H28]>, DwarfRegAlias; +def S29 : AArch64Reg<29, "s29", [H29]>, DwarfRegAlias; +def S30 : AArch64Reg<30, "s30", [H30]>, DwarfRegAlias; +def S31 : AArch64Reg<31, "s31", [H31]>, DwarfRegAlias; +} + +let SubRegIndices = [ssub], RegAltNameIndices = [vreg, vlist1] in { +def D0 : AArch64Reg<0, "d0", [S0], ["v0", ""]>, DwarfRegAlias; +def D1 : AArch64Reg<1, "d1", [S1], ["v1", ""]>, DwarfRegAlias; +def D2 : AArch64Reg<2, "d2", [S2], ["v2", ""]>, DwarfRegAlias; +def D3 : AArch64Reg<3, "d3", [S3], ["v3", ""]>, DwarfRegAlias; +def D4 : AArch64Reg<4, "d4", [S4], ["v4", ""]>, DwarfRegAlias; +def D5 : AArch64Reg<5, "d5", [S5], ["v5", ""]>, DwarfRegAlias; +def D6 : AArch64Reg<6, "d6", 
[S6], ["v6", ""]>, DwarfRegAlias; +def D7 : AArch64Reg<7, "d7", [S7], ["v7", ""]>, DwarfRegAlias; +def D8 : AArch64Reg<8, "d8", [S8], ["v8", ""]>, DwarfRegAlias; +def D9 : AArch64Reg<9, "d9", [S9], ["v9", ""]>, DwarfRegAlias; +def D10 : AArch64Reg<10, "d10", [S10], ["v10", ""]>, DwarfRegAlias; +def D11 : AArch64Reg<11, "d11", [S11], ["v11", ""]>, DwarfRegAlias; +def D12 : AArch64Reg<12, "d12", [S12], ["v12", ""]>, DwarfRegAlias; +def D13 : AArch64Reg<13, "d13", [S13], ["v13", ""]>, DwarfRegAlias; +def D14 : AArch64Reg<14, "d14", [S14], ["v14", ""]>, DwarfRegAlias; +def D15 : AArch64Reg<15, "d15", [S15], ["v15", ""]>, DwarfRegAlias; +def D16 : AArch64Reg<16, "d16", [S16], ["v16", ""]>, DwarfRegAlias; +def D17 : AArch64Reg<17, "d17", [S17], ["v17", ""]>, DwarfRegAlias; +def D18 : AArch64Reg<18, "d18", [S18], ["v18", ""]>, DwarfRegAlias; +def D19 : AArch64Reg<19, "d19", [S19], ["v19", ""]>, DwarfRegAlias; +def D20 : AArch64Reg<20, "d20", [S20], ["v20", ""]>, DwarfRegAlias; +def D21 : AArch64Reg<21, "d21", [S21], ["v21", ""]>, DwarfRegAlias; +def D22 : AArch64Reg<22, "d22", [S22], ["v22", ""]>, DwarfRegAlias; +def D23 : AArch64Reg<23, "d23", [S23], ["v23", ""]>, DwarfRegAlias; +def D24 : AArch64Reg<24, "d24", [S24], ["v24", ""]>, DwarfRegAlias; +def D25 : AArch64Reg<25, "d25", [S25], ["v25", ""]>, DwarfRegAlias; +def D26 : AArch64Reg<26, "d26", [S26], ["v26", ""]>, DwarfRegAlias; +def D27 : AArch64Reg<27, "d27", [S27], ["v27", ""]>, DwarfRegAlias; +def D28 : AArch64Reg<28, "d28", [S28], ["v28", ""]>, DwarfRegAlias; +def D29 : AArch64Reg<29, "d29", [S29], ["v29", ""]>, DwarfRegAlias; +def D30 : AArch64Reg<30, "d30", [S30], ["v30", ""]>, DwarfRegAlias; +def D31 : AArch64Reg<31, "d31", [S31], ["v31", ""]>, DwarfRegAlias; +} + +let SubRegIndices = [dsub], RegAltNameIndices = [vreg, vlist1] in { +def Q0 : AArch64Reg<0, "q0", [D0], ["v0", ""]>, DwarfRegAlias; +def Q1 : AArch64Reg<1, "q1", [D1], ["v1", ""]>, DwarfRegAlias; +def Q2 : AArch64Reg<2, "q2", [D2], ["v2", ""]>, DwarfRegAlias; +def Q3 : AArch64Reg<3, "q3", [D3], ["v3", ""]>, DwarfRegAlias; +def Q4 : AArch64Reg<4, "q4", [D4], ["v4", ""]>, DwarfRegAlias; +def Q5 : AArch64Reg<5, "q5", [D5], ["v5", ""]>, DwarfRegAlias; +def Q6 : AArch64Reg<6, "q6", [D6], ["v6", ""]>, DwarfRegAlias; +def Q7 : AArch64Reg<7, "q7", [D7], ["v7", ""]>, DwarfRegAlias; +def Q8 : AArch64Reg<8, "q8", [D8], ["v8", ""]>, DwarfRegAlias; +def Q9 : AArch64Reg<9, "q9", [D9], ["v9", ""]>, DwarfRegAlias; +def Q10 : AArch64Reg<10, "q10", [D10], ["v10", ""]>, DwarfRegAlias; +def Q11 : AArch64Reg<11, "q11", [D11], ["v11", ""]>, DwarfRegAlias; +def Q12 : AArch64Reg<12, "q12", [D12], ["v12", ""]>, DwarfRegAlias; +def Q13 : AArch64Reg<13, "q13", [D13], ["v13", ""]>, DwarfRegAlias; +def Q14 : AArch64Reg<14, "q14", [D14], ["v14", ""]>, DwarfRegAlias; +def Q15 : AArch64Reg<15, "q15", [D15], ["v15", ""]>, DwarfRegAlias; +def Q16 : AArch64Reg<16, "q16", [D16], ["v16", ""]>, DwarfRegAlias; +def Q17 : AArch64Reg<17, "q17", [D17], ["v17", ""]>, DwarfRegAlias; +def Q18 : AArch64Reg<18, "q18", [D18], ["v18", ""]>, DwarfRegAlias; +def Q19 : AArch64Reg<19, "q19", [D19], ["v19", ""]>, DwarfRegAlias; +def Q20 : AArch64Reg<20, "q20", [D20], ["v20", ""]>, DwarfRegAlias; +def Q21 : AArch64Reg<21, "q21", [D21], ["v21", ""]>, DwarfRegAlias; +def Q22 : AArch64Reg<22, "q22", [D22], ["v22", ""]>, DwarfRegAlias; +def Q23 : AArch64Reg<23, "q23", [D23], ["v23", ""]>, DwarfRegAlias; +def Q24 : AArch64Reg<24, "q24", [D24], ["v24", ""]>, DwarfRegAlias; +def Q25 : AArch64Reg<25, "q25", [D25], ["v25", ""]>, 
DwarfRegAlias; +def Q26 : AArch64Reg<26, "q26", [D26], ["v26", ""]>, DwarfRegAlias; +def Q27 : AArch64Reg<27, "q27", [D27], ["v27", ""]>, DwarfRegAlias; +def Q28 : AArch64Reg<28, "q28", [D28], ["v28", ""]>, DwarfRegAlias; +def Q29 : AArch64Reg<29, "q29", [D29], ["v29", ""]>, DwarfRegAlias; +def Q30 : AArch64Reg<30, "q30", [D30], ["v30", ""]>, DwarfRegAlias; +def Q31 : AArch64Reg<31, "q31", [D31], ["v31", ""]>, DwarfRegAlias; +} + +def FPR8 : RegisterClass<"AArch64", [untyped], 8, (sequence "B%u", 0, 31)> { + let Size = 8; +} +def FPR16 : RegisterClass<"AArch64", [f16], 16, (sequence "H%u", 0, 31)> { + let Size = 16; +} +def FPR32 : RegisterClass<"AArch64", [f32, i32], 32,(sequence "S%u", 0, 31)>; +def FPR64 : RegisterClass<"AArch64", [f64, i64, v2f32, v1f64, v8i8, v4i16, v2i32, + v1i64], + 64, (sequence "D%u", 0, 31)>; +// We don't (yet) have an f128 legal type, so don't use that here. We +// normalize 128-bit vectors to v2f64 for arg passing and such, so use +// that here. +def FPR128 : RegisterClass<"AArch64", + [v16i8, v8i16, v4i32, v2i64, v4f32, v2f64, f128], + 128, (sequence "Q%u", 0, 31)>; + +// The lower 16 vector registers. Some instructions can only take registers +// in this range. +def FPR128_lo : RegisterClass<"AArch64", + [v16i8, v8i16, v4i32, v2i64, v4f32, v2f64], + 128, (trunc FPR128, 16)>; + +// Pairs, triples, and quads of 64-bit vector registers. +def DSeqPairs : RegisterTuples<[dsub0, dsub1], [(rotl FPR64, 0), (rotl FPR64, 1)]>; +def DSeqTriples : RegisterTuples<[dsub0, dsub1, dsub2], + [(rotl FPR64, 0), (rotl FPR64, 1), + (rotl FPR64, 2)]>; +def DSeqQuads : RegisterTuples<[dsub0, dsub1, dsub2, dsub3], + [(rotl FPR64, 0), (rotl FPR64, 1), + (rotl FPR64, 2), (rotl FPR64, 3)]>; +def DD : RegisterClass<"AArch64", [untyped], 64, (add DSeqPairs)> { + let Size = 128; +} +def DDD : RegisterClass<"AArch64", [untyped], 64, (add DSeqTriples)> { + let Size = 196; +} +def DDDD : RegisterClass<"AArch64", [untyped], 64, (add DSeqQuads)> { + let Size = 256; +} + +// Pairs, triples, and quads of 128-bit vector registers. +def QSeqPairs : RegisterTuples<[qsub0, qsub1], [(rotl FPR128, 0), (rotl FPR128, 1)]>; +def QSeqTriples : RegisterTuples<[qsub0, qsub1, qsub2], + [(rotl FPR128, 0), (rotl FPR128, 1), + (rotl FPR128, 2)]>; +def QSeqQuads : RegisterTuples<[qsub0, qsub1, qsub2, qsub3], + [(rotl FPR128, 0), (rotl FPR128, 1), + (rotl FPR128, 2), (rotl FPR128, 3)]>; +def QQ : RegisterClass<"AArch64", [untyped], 128, (add QSeqPairs)> { + let Size = 256; +} +def QQQ : RegisterClass<"AArch64", [untyped], 128, (add QSeqTriples)> { + let Size = 384; +} +def QQQQ : RegisterClass<"AArch64", [untyped], 128, (add QSeqQuads)> { + let Size = 512; +} + + +// Vector operand versions of the FP registers. Alternate name printing and +// assmebler matching. 
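Note: the bsub/hsub/ssub/dsub chain used for the scalar FP registers above nests each narrower view inside the next: Bn is the low byte of Hn, which sits in the low part of Sn, of Dn, of Qn, and the DD/QQ tuple classes are essentially bundles of consecutive D or Q registers built from the same pool. A tiny value-level model of the nesting (purely illustrative):

#include <cstdint>

// One 128-bit Qn register and its nested scalar views, mirroring the
// SubRegIndices declared above (bsub inside hsub inside ssub inside dsub).
struct VReg {
  uint64_t lo = 0, hi = 0;                                   // Qn
  uint64_t d() const { return lo; }                          // Dn
  uint32_t s() const { return static_cast<uint32_t>(lo); }   // Sn
  uint16_t h() const { return static_cast<uint16_t>(lo); }   // Hn
  uint8_t  b() const { return static_cast<uint8_t>(lo); }    // Bn
};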
+def VectorReg64AsmOperand : AsmOperandClass { + let Name = "VectorReg64"; + let PredicateMethod = "isVectorReg"; +} +def VectorReg128AsmOperand : AsmOperandClass { + let Name = "VectorReg128"; + let PredicateMethod = "isVectorReg"; +} + +def V64 : RegisterOperand { + let ParserMatchClass = VectorReg64AsmOperand; +} + +def V128 : RegisterOperand { + let ParserMatchClass = VectorReg128AsmOperand; +} + +def VectorRegLoAsmOperand : AsmOperandClass { let Name = "VectorRegLo"; } +def V128_lo : RegisterOperand { + let ParserMatchClass = VectorRegLoAsmOperand; +} + +class TypedVecListAsmOperand + : AsmOperandClass { + let Name = "TypedVectorList" # count # "_" # lanes # kind; + + let PredicateMethod + = "isTypedVectorList<" # count # ", " # lanes # ", '" # kind # "'>"; + let RenderMethod = "addVectorList" # regsize # "Operands<" # count # ">"; +} + +class TypedVecListRegOperand + : RegisterOperand">; + +multiclass VectorList { + // With implicit types (probably on instruction instead). E.g. { v0, v1 } + def _64AsmOperand : AsmOperandClass { + let Name = NAME # "64"; + let PredicateMethod = "isImplicitlyTypedVectorList<" # count # ">"; + let RenderMethod = "addVectorList64Operands<" # count # ">"; + } + + def "64" : RegisterOperand { + let ParserMatchClass = !cast(NAME # "_64AsmOperand"); + } + + def _128AsmOperand : AsmOperandClass { + let Name = NAME # "128"; + let PredicateMethod = "isImplicitlyTypedVectorList<" # count # ">"; + let RenderMethod = "addVectorList128Operands<" # count # ">"; + } + + def "128" : RegisterOperand { + let ParserMatchClass = !cast(NAME # "_128AsmOperand"); + } + + // 64-bit register lists with explicit type. + + // { v0.8b, v1.8b } + def _8bAsmOperand : TypedVecListAsmOperand; + def "8b" : TypedVecListRegOperand { + let ParserMatchClass = !cast(NAME # "_8bAsmOperand"); + } + + // { v0.4h, v1.4h } + def _4hAsmOperand : TypedVecListAsmOperand; + def "4h" : TypedVecListRegOperand { + let ParserMatchClass = !cast(NAME # "_4hAsmOperand"); + } + + // { v0.2s, v1.2s } + def _2sAsmOperand : TypedVecListAsmOperand; + def "2s" : TypedVecListRegOperand { + let ParserMatchClass = !cast(NAME # "_2sAsmOperand"); + } + + // { v0.1d, v1.1d } + def _1dAsmOperand : TypedVecListAsmOperand; + def "1d" : TypedVecListRegOperand { + let ParserMatchClass = !cast(NAME # "_1dAsmOperand"); + } + + // 128-bit register lists with explicit type + + // { v0.16b, v1.16b } + def _16bAsmOperand : TypedVecListAsmOperand; + def "16b" : TypedVecListRegOperand { + let ParserMatchClass = !cast(NAME # "_16bAsmOperand"); + } + + // { v0.8h, v1.8h } + def _8hAsmOperand : TypedVecListAsmOperand; + def "8h" : TypedVecListRegOperand { + let ParserMatchClass = !cast(NAME # "_8hAsmOperand"); + } + + // { v0.4s, v1.4s } + def _4sAsmOperand : TypedVecListAsmOperand; + def "4s" : TypedVecListRegOperand { + let ParserMatchClass = !cast(NAME # "_4sAsmOperand"); + } + + // { v0.2d, v1.2d } + def _2dAsmOperand : TypedVecListAsmOperand; + def "2d" : TypedVecListRegOperand { + let ParserMatchClass = !cast(NAME # "_2dAsmOperand"); + } + + // { v0.b, v1.b } + def _bAsmOperand : TypedVecListAsmOperand; + def "b" : TypedVecListRegOperand { + let ParserMatchClass = !cast(NAME # "_bAsmOperand"); + } + + // { v0.h, v1.h } + def _hAsmOperand : TypedVecListAsmOperand; + def "h" : TypedVecListRegOperand { + let ParserMatchClass = !cast(NAME # "_hAsmOperand"); + } + + // { v0.s, v1.s } + def _sAsmOperand : TypedVecListAsmOperand; + def "s" : TypedVecListRegOperand { + let ParserMatchClass = !cast(NAME # "_sAsmOperand"); + } + + // { 
v0.d, v1.d } + def _dAsmOperand : TypedVecListAsmOperand; + def "d" : TypedVecListRegOperand { + let ParserMatchClass = !cast(NAME # "_dAsmOperand"); + } + + +} + +defm VecListOne : VectorList<1, FPR64, FPR128>; +defm VecListTwo : VectorList<2, DD, QQ>; +defm VecListThree : VectorList<3, DDD, QQQ>; +defm VecListFour : VectorList<4, DDDD, QQQQ>; + + +// Register operand versions of the scalar FP registers. +def FPR16Op : RegisterOperand; +def FPR32Op : RegisterOperand; +def FPR64Op : RegisterOperand; +def FPR128Op : RegisterOperand; diff --git a/lib/Target/AArch64/AArch64SchedA53.td b/lib/Target/AArch64/AArch64SchedA53.td new file mode 100644 index 00000000000..0c3949ecfc1 --- /dev/null +++ b/lib/Target/AArch64/AArch64SchedA53.td @@ -0,0 +1,291 @@ +//==- AArch64SchedA53.td - Cortex-A53 Scheduling Definitions -*- tablegen -*-=// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file defines the itinerary class data for the ARM Cortex A53 processors. +// +//===----------------------------------------------------------------------===// + +// ===---------------------------------------------------------------------===// +// The following definitions describe the simpler per-operand machine model. +// This works with MachineScheduler. See MCSchedModel.h for details. + +// Cortex-A53 machine model for scheduling and other instruction cost heuristics. +def CortexA53Model : SchedMachineModel { + let MicroOpBufferSize = 0; // Explicitly set to zero since A53 is in-order. + let IssueWidth = 2; // 2 micro-ops are dispatched per cycle. + let MinLatency = 1 ; // OperandCycles are interpreted as MinLatency. + let LoadLatency = 3; // Optimistic load latency assuming bypass. + // This is overriden by OperandCycles if the + // Itineraries are queried instead. + let MispredictPenalty = 9; // Based on "Cortex-A53 Software Optimisation + // Specification - Instruction Timings" + // v 1.0 Spreadsheet +} + + +//===----------------------------------------------------------------------===// +// Define each kind of processor resource and number available. + +// Modeling each pipeline as a ProcResource using the BufferSize = 0 since +// Cortex-A53 is in-order. + +def A53UnitALU : ProcResource<2> { let BufferSize = 0; } // Int ALU +def A53UnitMAC : ProcResource<1> { let BufferSize = 0; } // Int MAC +def A53UnitDiv : ProcResource<1> { let BufferSize = 0; } // Int Division +def A53UnitLdSt : ProcResource<1> { let BufferSize = 0; } // Load/Store +def A53UnitB : ProcResource<1> { let BufferSize = 0; } // Branch +def A53UnitFPALU : ProcResource<1> { let BufferSize = 0; } // FP ALU +def A53UnitFPMDS : ProcResource<1> { let BufferSize = 0; } // FP Mult/Div/Sqrt + + +//===----------------------------------------------------------------------===// +// Subtarget-specific SchedWrite types which both map the ProcResources and +// set the latency. + +let SchedModel = CortexA53Model in { + +// ALU - Despite having a full latency of 4, most of the ALU instructions can +// forward a cycle earlier and then two cycles earlier in the case of a +// shift-only instruction. These latencies will be incorrect when the +// result cannot be forwarded, but modeling isn't rocket surgery. 
+def : WriteRes { let Latency = 3; } +def : WriteRes { let Latency = 3; } +def : WriteRes { let Latency = 3; } +def : WriteRes { let Latency = 3; } +def : WriteRes { let Latency = 2; } +def : WriteRes { let Latency = 3; } + +// MAC +def : WriteRes { let Latency = 4; } +def : WriteRes { let Latency = 4; } + +// Div +def : WriteRes { let Latency = 4; } +def : WriteRes { let Latency = 4; } + +// Load +def : WriteRes { let Latency = 4; } +def : WriteRes { let Latency = 4; } +def : WriteRes { let Latency = 4; } + +// Vector Load - Vector loads take 1-5 cycles to issue. For the WriteVecLd +// below, choosing the median of 3 which makes the latency 6. +// May model this more carefully in the future. The remaining +// A53WriteVLD# types represent the 1-5 cycle issues explicitly. +def : WriteRes { let Latency = 6; + let ResourceCycles = [3]; } +def A53WriteVLD1 : SchedWriteRes<[A53UnitLdSt]> { let Latency = 4; } +def A53WriteVLD2 : SchedWriteRes<[A53UnitLdSt]> { let Latency = 5; + let ResourceCycles = [2]; } +def A53WriteVLD3 : SchedWriteRes<[A53UnitLdSt]> { let Latency = 6; + let ResourceCycles = [3]; } +def A53WriteVLD4 : SchedWriteRes<[A53UnitLdSt]> { let Latency = 7; + let ResourceCycles = [4]; } +def A53WriteVLD5 : SchedWriteRes<[A53UnitLdSt]> { let Latency = 8; + let ResourceCycles = [5]; } + +// Pre/Post Indexing - Performed as part of address generation which is already +// accounted for in the WriteST* latencies below +def : WriteRes { let Latency = 0; } + +// Store +def : WriteRes { let Latency = 4; } +def : WriteRes { let Latency = 4; } +def : WriteRes { let Latency = 4; } +def : WriteRes { let Latency = 4; } + +// Vector Store - Similar to vector loads, can take 1-3 cycles to issue. +def : WriteRes { let Latency = 5; + let ResourceCycles = [2];} +def A53WriteVST1 : SchedWriteRes<[A53UnitLdSt]> { let Latency = 4; } +def A53WriteVST2 : SchedWriteRes<[A53UnitLdSt]> { let Latency = 5; + let ResourceCycles = [2]; } +def A53WriteVST3 : SchedWriteRes<[A53UnitLdSt]> { let Latency = 6; + let ResourceCycles = [3]; } + +// Branch +def : WriteRes; +def : WriteRes; +def : WriteRes; +def : WriteRes; +def : WriteRes; + +// FP ALU +def : WriteRes { let Latency = 6; } +def : WriteRes { let Latency = 6; } +def : WriteRes { let Latency = 6; } +def : WriteRes { let Latency = 6; } +def : WriteRes { let Latency = 6; } +def : WriteRes { let Latency = 6; } + +// FP Mul, Div, Sqrt +def : WriteRes { let Latency = 6; } +def : WriteRes { let Latency = 33; + let ResourceCycles = [29]; } +def A53WriteFMAC : SchedWriteRes<[A53UnitFPMDS]> { let Latency = 10; } +def A53WriteFDivSP : SchedWriteRes<[A53UnitFPMDS]> { let Latency = 18; + let ResourceCycles = [14]; } +def A53WriteFDivDP : SchedWriteRes<[A53UnitFPMDS]> { let Latency = 33; + let ResourceCycles = [29]; } +def A53WriteFSqrtSP : SchedWriteRes<[A53UnitFPMDS]> { let Latency = 17; + let ResourceCycles = [13]; } +def A53WriteFSqrtDP : SchedWriteRes<[A53UnitFPMDS]> { let Latency = 32; + let ResourceCycles = [28]; } + +//===----------------------------------------------------------------------===// +// Subtarget-specific SchedRead types. + +// No forwarding for these reads. +def : ReadAdvance; +def : ReadAdvance; +def : ReadAdvance; + +// ALU - Most operands in the ALU pipes are not needed for two cycles. Shiftable +// operands are needed one cycle later if and only if they are to be +// shifted. Otherwise, they too are needed two cycle later. This same +// ReadAdvance applies to Extended registers as well, even though there is +// a seperate SchedPredicate for them. 
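Note: read together, the A53 entries above describe an in-order, dual-issue machine. The WriteRes latencies say how long a dependent consumer waits, the BufferSize = 0 pipes make stalls visible, and the ReadAdvance entries defined just below let some operands be read late, so the effective dependence can be shorter than the raw latency. A toy estimate along those lines, using the 3-cycle ALU and 4-cycle MAC latencies and the 2-cycle "not shifted" read advance (a rough illustration only, not the MachineScheduler algorithm):

#include <cstdio>

// Effective dependence latency after a SchedReadAdvance: the consumer reads
// the operand N cycles after issue, so the producer's latency shrinks by N.
static unsigned effectiveLatency(unsigned writeLatency, unsigned readAdvance) {
  return writeLatency > readAdvance ? writeLatency - readAdvance : 0;
}

int main() {
  const unsigned IssueWidth = 2;   // per the CortexA53Model definition above
  const unsigned ALULat = 3, MACLat = 4, NotShiftedAdv = 2;

  // MUL feeding an ADD through a non-shifted operand: the ADD sees the MUL
  // result 2 cycles early, so the chain lower bound drops from 4+3 to 2+3.
  std::printf("mul->add chain: %u cycles (vs %u without the ReadAdvance)\n",
              effectiveLatency(MACLat, NotShiftedAdv) + ALULat, MACLat + ALULat);

  // Six independent ALU ops are issue-limited on a dual-issue in-order core.
  std::printf("6 independent ALU ops: >= %u cycles\n",
              (6 + IssueWidth - 1) / IssueWidth);
}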
+def : ReadAdvance; +def A53ReadShifted : SchedReadAdvance<1, [WriteImm,WriteI, + WriteISReg, WriteIEReg,WriteIS, + WriteID32,WriteID64, + WriteIM32,WriteIM64]>; +def A53ReadNotShifted : SchedReadAdvance<2, [WriteImm,WriteI, + WriteISReg, WriteIEReg,WriteIS, + WriteID32,WriteID64, + WriteIM32,WriteIM64]>; +def A53ReadISReg : SchedReadVariant<[ + SchedVar, + SchedVar]>; +def : SchedAlias; + +def A53ReadIEReg : SchedReadVariant<[ + SchedVar, + SchedVar]>; +def : SchedAlias; + +// MAC - Operands are generally needed one cycle later in the MAC pipe. +// Accumulator operands are needed two cycles later. +def : ReadAdvance; +def : ReadAdvance; + +// Div +def : ReadAdvance; + +//===----------------------------------------------------------------------===// +// Subtarget-specific InstRWs. + +//--- +// Miscellaneous +//--- +def : InstRW<[WriteI], (instrs COPY)>; + +//--- +// Vector Loads +//--- +def : InstRW<[A53WriteVLD1], (instregex "LD1i(8|16|32|64)$")>; +def : InstRW<[A53WriteVLD1], (instregex "LD1Rv(8b|4h|2s|1d|16b|8h|4s|2d)$")>; +def : InstRW<[A53WriteVLD1], (instregex "LD1Onev(8b|4h|2s|1d|16b|8h|4s|2d)$")>; +def : InstRW<[A53WriteVLD2], (instregex "LD1Twov(8b|4h|2s|1d|16b|8h|4s|2d)$")>; +def : InstRW<[A53WriteVLD3], (instregex "LD1Threev(8b|4h|2s|1d|16b|8h|4s|2d)$")>; +def : InstRW<[A53WriteVLD4], (instregex "LD1Fourv(8b|4h|2s|1d|16b|8h|4s|2d)$")>; +def : InstRW<[A53WriteVLD1, WriteAdr], (instregex "LD1i(8|16|32|64)_POST$")>; +def : InstRW<[A53WriteVLD1, WriteAdr], (instregex "LD1Rv(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>; +def : InstRW<[A53WriteVLD1, WriteAdr], (instregex "LD1Onev(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>; +def : InstRW<[A53WriteVLD2, WriteAdr], (instregex "LD1Twov(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>; +def : InstRW<[A53WriteVLD3, WriteAdr], (instregex "LD1Threev(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>; +def : InstRW<[A53WriteVLD4, WriteAdr], (instregex "LD1Fourv(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>; + +def : InstRW<[A53WriteVLD1], (instregex "LD2i(8|16|32|64)$")>; +def : InstRW<[A53WriteVLD1], (instregex "LD2Rv(8b|4h|2s|1d|16b|8h|4s|2d)$")>; +def : InstRW<[A53WriteVLD2], (instregex "LD2Twov(8b|4h|2s)$")>; +def : InstRW<[A53WriteVLD4], (instregex "LD2Twov(16b|8h|4s|2d)$")>; +def : InstRW<[A53WriteVLD1, WriteAdr], (instregex "LD2i(8|16|32|64)(_POST)?$")>; +def : InstRW<[A53WriteVLD1, WriteAdr], (instregex "LD2Rv(8b|4h|2s|1d|16b|8h|4s|2d)(_POST)?$")>; +def : InstRW<[A53WriteVLD2, WriteAdr], (instregex "LD2Twov(8b|4h|2s)(_POST)?$")>; +def : InstRW<[A53WriteVLD4, WriteAdr], (instregex "LD2Twov(16b|8h|4s|2d)(_POST)?$")>; + +def : InstRW<[A53WriteVLD2], (instregex "LD3i(8|16|32|64)$")>; +def : InstRW<[A53WriteVLD2], (instregex "LD3Rv(8b|4h|2s|1d|16b|8h|4s|2d)$")>; +def : InstRW<[A53WriteVLD4], (instregex "LD3Threev(8b|4h|2s|1d|16b|8h|4s)$")>; +def : InstRW<[A53WriteVLD3], (instregex "LD3Threev(2d)$")>; +def : InstRW<[A53WriteVLD2, WriteAdr], (instregex "LD3i(8|16|32|64)_POST$")>; +def : InstRW<[A53WriteVLD2, WriteAdr], (instregex "LD3Rv(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>; +def : InstRW<[A53WriteVLD4, WriteAdr], (instregex "LD3Threev(8b|4h|2s|1d|16b|8h|4s)_POST$")>; +def : InstRW<[A53WriteVLD3, WriteAdr], (instregex "LD3Threev(2d)_POST$")>; + +def : InstRW<[A53WriteVLD2], (instregex "LD4i(8|16|32|64)$")>; +def : InstRW<[A53WriteVLD2], (instregex "LD4Rv(8b|4h|2s|1d|16b|8h|4s|2d)$")>; +def : InstRW<[A53WriteVLD5], (instregex "LD4Fourv(8b|4h|2s|1d|16b|8h|4s)$")>; +def : InstRW<[A53WriteVLD4], (instregex "LD4Fourv(2d)$")>; +def : InstRW<[A53WriteVLD2, WriteAdr], (instregex "LD4i(8|16|32|64)_POST$")>; +def 
: InstRW<[A53WriteVLD2, WriteAdr], (instregex "LD4Rv(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>; +def : InstRW<[A53WriteVLD5, WriteAdr], (instregex "LD4Fourv(8b|4h|2s|1d|16b|8h|4s)_POST$")>; +def : InstRW<[A53WriteVLD4, WriteAdr], (instregex "LD4Fourv(2d)_POST$")>; + +//--- +// Vector Stores +//--- +def : InstRW<[A53WriteVST1], (instregex "ST1i(8|16|32|64)$")>; +def : InstRW<[A53WriteVST1], (instregex "ST1Onev(8b|4h|2s|1d|16b|8h|4s|2d)$")>; +def : InstRW<[A53WriteVST1], (instregex "ST1Twov(8b|4h|2s|1d|16b|8h|4s|2d)$")>; +def : InstRW<[A53WriteVST2], (instregex "ST1Threev(8b|4h|2s|1d|16b|8h|4s|2d)$")>; +def : InstRW<[A53WriteVST2], (instregex "ST1Fourv(8b|4h|2s|1d|16b|8h|4s|2d)$")>; +def : InstRW<[A53WriteVST1, WriteAdr], (instregex "ST1i(8|16|32|64)_POST$")>; +def : InstRW<[A53WriteVST1, WriteAdr], (instregex "ST1Onev(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>; +def : InstRW<[A53WriteVST1, WriteAdr], (instregex "ST1Twov(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>; +def : InstRW<[A53WriteVST2, WriteAdr], (instregex "ST1Threev(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>; +def : InstRW<[A53WriteVST2, WriteAdr], (instregex "ST1Fourv(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>; + +def : InstRW<[A53WriteVST1], (instregex "ST2i(8|16|32|64)$")>; +def : InstRW<[A53WriteVST1], (instregex "ST2Twov(8b|4h|2s)$")>; +def : InstRW<[A53WriteVST2], (instregex "ST2Twov(16b|8h|4s|2d)$")>; +def : InstRW<[A53WriteVST1, WriteAdr], (instregex "ST2i(8|16|32|64)_POST$")>; +def : InstRW<[A53WriteVST1, WriteAdr], (instregex "ST2Twov(8b|4h|2s)_POST$")>; +def : InstRW<[A53WriteVST2, WriteAdr], (instregex "ST2Twov(16b|8h|4s|2d)_POST$")>; + +def : InstRW<[A53WriteVST2], (instregex "ST3i(8|16|32|64)$")>; +def : InstRW<[A53WriteVST3], (instregex "ST3Threev(8b|4h|2s|1d|16b|8h|4s)$")>; +def : InstRW<[A53WriteVST2], (instregex "ST3Threev(2d)$")>; +def : InstRW<[A53WriteVST2, WriteAdr], (instregex "ST3i(8|16|32|64)_POST$")>; +def : InstRW<[A53WriteVST3, WriteAdr], (instregex "ST3Threev(8b|4h|2s|1d|16b|8h|4s)_POST$")>; +def : InstRW<[A53WriteVST2, WriteAdr], (instregex "ST3Threev(2d)_POST$")>; + +def : InstRW<[A53WriteVST2], (instregex "ST4i(8|16|32|64)$")>; +def : InstRW<[A53WriteVST3], (instregex "ST4Fourv(8b|4h|2s|1d|16b|8h|4s)$")>; +def : InstRW<[A53WriteVST2], (instregex "ST4Fourv(2d)$")>; +def : InstRW<[A53WriteVST2, WriteAdr], (instregex "ST4i(8|16|32|64)_POST$")>; +def : InstRW<[A53WriteVST3, WriteAdr], (instregex "ST4Fourv(8b|4h|2s|1d|16b|8h|4s)_POST$")>; +def : InstRW<[A53WriteVST2, WriteAdr], (instregex "ST4Fourv(2d)_POST$")>; + +//--- +// Floating Point MAC, DIV, SQRT +//--- +def : InstRW<[A53WriteFMAC], (instregex "^FN?M(ADD|SUB).*")>; +def : InstRW<[A53WriteFMAC], (instregex "^FML(A|S).*")>; +def : InstRW<[A53WriteFDivSP], (instrs FDIVSrr)>; +def : InstRW<[A53WriteFDivDP], (instrs FDIVDrr)>; +def : InstRW<[A53WriteFDivSP], (instregex "^FDIVv.*32$")>; +def : InstRW<[A53WriteFDivDP], (instregex "^FDIVv.*64$")>; +def : InstRW<[A53WriteFSqrtSP], (instregex "^.*SQRT.*32$")>; +def : InstRW<[A53WriteFSqrtDP], (instregex "^.*SQRT.*64$")>; + +} diff --git a/lib/Target/AArch64/AArch64SchedCyclone.td b/lib/Target/AArch64/AArch64SchedCyclone.td new file mode 100644 index 00000000000..a2a18023778 --- /dev/null +++ b/lib/Target/AArch64/AArch64SchedCyclone.td @@ -0,0 +1,865 @@ +//=- ARMSchedCyclone.td - AArch64 Cyclone Scheduling Defs ----*- tablegen -*-=// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+// +//===----------------------------------------------------------------------===// +// +// This file defines the machine model for AArch64 Cyclone to support +// instruction scheduling and other instruction cost heuristics. +// +//===----------------------------------------------------------------------===// + +def CycloneModel : SchedMachineModel { + let IssueWidth = 6; // 6 micro-ops are dispatched per cycle. + let MicroOpBufferSize = 192; // Based on the reorder buffer. + let LoadLatency = 4; // Optimistic load latency. + let MispredictPenalty = 16; // 14-19 cycles are typical. +} + +//===----------------------------------------------------------------------===// +// Define each kind of processor resource and number available on Cyclone. + +// 4 integer pipes +def CyUnitI : ProcResource<4> { + let BufferSize = 48; +} + +// 2 branch units: I[0..1] +def CyUnitB : ProcResource<2> { + let Super = CyUnitI; + let BufferSize = 24; +} + +// 1 indirect-branch unit: I[0] +def CyUnitBR : ProcResource<1> { + let Super = CyUnitB; +} + +// 2 shifter pipes: I[2..3] +// When an instruction consumes a CyUnitIS, it also consumes a CyUnitI +def CyUnitIS : ProcResource<2> { + let Super = CyUnitI; + let BufferSize = 24; +} + +// 1 mul pipe: I[0] +def CyUnitIM : ProcResource<1> { + let Super = CyUnitBR; + let BufferSize = 32; +} + +// 1 div pipe: I[1] +def CyUnitID : ProcResource<1> { + let Super = CyUnitB; + let BufferSize = 16; +} + +// 1 integer division unit. This is driven by the ID pipe, but only +// consumes the pipe for one cycle at issue and another cycle at writeback. +def CyUnitIntDiv : ProcResource<1>; + +// 2 ld/st pipes. +def CyUnitLS : ProcResource<2> { + let BufferSize = 28; +} + +// 3 fp/vector pipes. +def CyUnitV : ProcResource<3> { + let BufferSize = 48; +} +// 2 fp/vector arithmetic and multiply pipes: V[0-1] +def CyUnitVM : ProcResource<2> { + let Super = CyUnitV; + let BufferSize = 32; +} +// 1 fp/vector division/sqrt pipe: V[2] +def CyUnitVD : ProcResource<1> { + let Super = CyUnitV; + let BufferSize = 16; +} +// 1 fp compare pipe: V[0] +def CyUnitVC : ProcResource<1> { + let Super = CyUnitVM; + let BufferSize = 16; +} + +// 2 fp division/square-root units. These are driven by the VD pipe, +// but only consume the pipe for one cycle at issue and a cycle at writeback. +def CyUnitFloatDiv : ProcResource<2>; + +//===----------------------------------------------------------------------===// +// Define scheduler read/write resources and latency on Cyclone. +// This mirrors sections 7.7-7.9 of the Tuning Guide v1.0.1. + +let SchedModel = CycloneModel in { + +//--- +// 7.8.1. Moves +//--- + +// A single nop micro-op (uX). +def WriteX : SchedWriteRes<[]> { let Latency = 0; } + +// Move zero is a register rename (to machine register zero). +// The move is replaced by a single nop micro-op. +// MOVZ Rd, #0 +// AND Rd, Rzr, #imm +def WriteZPred : SchedPredicate<[{TII->isGPRZero(MI)}]>; +def WriteImmZ : SchedWriteVariant<[ + SchedVar, + SchedVar]>; +def : InstRW<[WriteImmZ], (instrs MOVZWi,MOVZXi,ANDWri,ANDXri)>; + +// Move GPR is a register rename and single nop micro-op. +// ORR Xd, XZR, Xm +// ADD Xd, Xn, #0 +def WriteIMovPred : SchedPredicate<[{TII->isGPRCopy(MI)}]>; +def WriteVMovPred : SchedPredicate<[{TII->isFPRCopy(MI)}]>; +def WriteMov : SchedWriteVariant<[ + SchedVar, + SchedVar, + SchedVar]>; +def : InstRW<[WriteMov], (instrs COPY,ORRXrr,ADDXrr)>; + +// Move non-zero immediate is an integer ALU op. +// MOVN,MOVZ,MOVK +def : WriteRes; + +//--- +// 7.8.2-7.8.5. 
Arithmetic and Logical, Comparison, Conditional, +// Shifts and Bitfield Operations +//--- + +// ADR,ADRP +// ADD(S)ri,SUB(S)ri,AND(S)ri,EORri,ORRri +// ADD(S)rr,SUB(S)rr,AND(S)rr,BIC(S)rr,EONrr,EORrr,ORNrr,ORRrr +// ADC(S),SBC(S) +// Aliases: CMN, CMP, TST +// +// Conditional operations. +// CCMNi,CCMPi,CCMNr,CCMPr, +// CSEL,CSINC,CSINV,CSNEG +// +// Bit counting and reversal operations. +// CLS,CLZ,RBIT,REV,REV16,REV32 +def : WriteRes; + +// ADD with shifted register operand is a single micro-op that +// consumes a shift pipeline for two cycles. +// ADD(S)rs,SUB(S)rs,AND(S)rs,BIC(S)rs,EONrs,EORrs,ORNrs,ORRrs +// EXAMPLE: ADDrs Xn, Xm LSL #imm +def : WriteRes { + let Latency = 2; + let ResourceCycles = [2]; +} + +// ADD with extended register operand is the same as shifted reg operand. +// ADD(S)re,SUB(S)re +// EXAMPLE: ADDXre Xn, Xm, UXTB #1 +def : WriteRes { + let Latency = 2; + let ResourceCycles = [2]; +} + +// Variable shift and bitfield operations. +// ASRV,LSLV,LSRV,RORV,BFM,SBFM,UBFM +def : WriteRes; + +// EXTR Shifts a pair of registers and requires two micro-ops. +// The second micro-op is delayed, as modeled by ReadExtrHi. +// EXTR Xn, Xm, #imm +def : WriteRes { + let Latency = 2; + let NumMicroOps = 2; +} + +// EXTR's first register read is delayed by one cycle, effectively +// shortening its writer's latency. +// EXTR Xn, Xm, #imm +def : ReadAdvance; + +//--- +// 7.8.6. Multiplies +//--- + +// MUL/MNEG are aliases for MADD/MSUB. +// MADDW,MSUBW,SMADDL,SMSUBL,UMADDL,UMSUBL +def : WriteRes { + let Latency = 4; +} +// MADDX,MSUBX,SMULH,UMULH +def : WriteRes { + let Latency = 5; +} + +//--- +// 7.8.7. Divide +//--- + +// 32-bit divide takes 7-13 cycles. 10 cycles covers a 20-bit quotient. +// The ID pipe is consumed for 2 cycles: issue and writeback. +// SDIVW,UDIVW +def : WriteRes { + let Latency = 10; + let ResourceCycles = [2, 10]; +} +// 64-bit divide takes 7-21 cycles. 13 cycles covers a 32-bit quotient. +// The ID pipe is consumed for 2 cycles: issue and writeback. +// SDIVX,UDIVX +def : WriteRes { + let Latency = 13; + let ResourceCycles = [2, 13]; +} + +//--- +// 7.8.8,7.8.10. Load/Store, single element +//--- + +// Integer loads take 4 cycles and use one LS unit for one cycle. +def : WriteRes { + let Latency = 4; +} + +// Store-load forwarding is 4 cycles. +// +// Note: The store-exclusive sequence incorporates this +// latency. However, general heuristics should not model the +// dependence between a store and subsequent may-alias load because +// hardware speculation works. +def : WriteRes { + let Latency = 4; +} + +// Load from base address plus an optionally scaled register offset. +// Rt latency is latency WriteIS + WriteLD. +// EXAMPLE: LDR Xn, Xm [, lsl 3] +def CyWriteLDIdx : SchedWriteVariant<[ + SchedVar, // Load from scaled register. + SchedVar]>; // Load from register offset. +def : SchedAlias; // Map AArch64->Cyclone type. + +// EXAMPLE: STR Xn, Xm [, lsl 3] +def CyWriteSTIdx : SchedWriteVariant<[ + SchedVar, // Store to scaled register. + SchedVar]>; // Store to register offset. +def : SchedAlias; // Map AArch64->Cyclone type. + +// Read the (unshifted) base register Xn in the second micro-op one cycle later. +// EXAMPLE: LDR Xn, Xm [, lsl 3] +def ReadBaseRS : SchedReadAdvance<1>; +def CyReadAdrBase : SchedReadVariant<[ + SchedVar, // Read base reg after shifting offset. + SchedVar]>; // Read base reg with no shift. +def : SchedAlias; // Map AArch64->Cyclone type. + +//--- +// 7.8.9,7.8.11. 
Load/Store, paired +//--- + +// Address pre/post increment is a simple ALU op with one cycle latency. +def : WriteRes; + +// LDP high register write is fused with the load, but a nop micro-op remains. +def : WriteRes { + let Latency = 4; +} + +// STP is a vector op and store, except for QQ, which is just two stores. +def : SchedAlias; +def : InstRW<[WriteST, WriteST], (instrs STPQi)>; + +//--- +// 7.8.13. Branches +//--- + +// Branches take a single micro-op. +// The misprediction penalty is defined as a SchedMachineModel property. +def : WriteRes {let Latency = 0;} +def : WriteRes {let Latency = 0;} + +//--- +// 7.8.14. Never-issued Instructions, Barrier and Hint Operations +//--- + +// NOP,SEV,SEVL,WFE,WFI,YIELD +def : WriteRes {let Latency = 0;} +// ISB +def : InstRW<[WriteI], (instrs ISB)>; +// SLREX,DMB,DSB +def : WriteRes; + +// System instructions get an invalid latency because the latency of +// other operations across them is meaningless. +def : WriteRes {let Latency = -1;} + +//===----------------------------------------------------------------------===// +// 7.9 Vector Unit Instructions + +// Simple vector operations take 2 cycles. +def : WriteRes {let Latency = 2;} + +// Define some longer latency vector op types for Cyclone. +def CyWriteV3 : SchedWriteRes<[CyUnitV]> {let Latency = 3;} +def CyWriteV4 : SchedWriteRes<[CyUnitV]> {let Latency = 4;} +def CyWriteV5 : SchedWriteRes<[CyUnitV]> {let Latency = 5;} +def CyWriteV6 : SchedWriteRes<[CyUnitV]> {let Latency = 6;} + +// Simple floating-point operations take 2 cycles. +def : WriteRes {let Latency = 2;} + +//--- +// 7.9.1 Vector Moves +//--- + +// TODO: Add Cyclone-specific zero-cycle zeros. LLVM currently +// generates expensive int-float conversion instead: +// FMOVDi Dd, #0.0 +// FMOVv2f64ns Vd.2d, #0.0 + +// FMOVSi,FMOVDi +def : WriteRes {let Latency = 2;} + +// MOVI,MVNI are WriteV +// FMOVv2f32ns,FMOVv2f64ns,FMOVv4f32ns are WriteV + +// Move FPR is a register rename and single nop micro-op. +// ORR.16b Vd,Vn,Vn +// COPY is handled above in the WriteMov Variant. +def WriteVMov : SchedWriteVariant<[ + SchedVar, + SchedVar]>; +def : InstRW<[WriteVMov], (instrs ORRv16i8)>; + +// FMOVSr,FMOVDr are WriteF. + +// MOV V,V is a WriteV. + +// CPY D,V[x] is a WriteV + +// INS V[x],V[y] is a WriteV. + +// FMOVWSr,FMOVXDr,FMOVXDHighr +def : WriteRes { + let Latency = 5; +} + +// FMOVSWr,FMOVDXr +def : InstRW<[WriteLD], (instrs FMOVSWr,FMOVDXr,FMOVDXHighr)>; + +// INS V[x],R +def CyWriteCopyToFPR : WriteSequence<[WriteVLD, WriteV]>; +def : InstRW<[CyWriteCopyToFPR], (instregex "INSv")>; + +// SMOV,UMOV R,V[x] +def CyWriteCopyToGPR : WriteSequence<[WriteLD, WriteI]>; +def : InstRW<[CyWriteCopyToGPR], (instregex "SMOVv","UMOVv")>; + +// DUP V,R +def : InstRW<[CyWriteCopyToFPR], (instregex "DUPv")>; + +// DUP V,V[x] is a WriteV. + +//--- +// 7.9.2 Integer Arithmetic, Logical, and Comparisons +//--- + +// BIC,ORR V,#imm are WriteV + +def : InstRW<[CyWriteV3], (instregex "ABSv")>; + +// MVN,NEG,NOT are WriteV + +def : InstRW<[CyWriteV3], (instregex "SQABSv","SQNEGv")>; + +// ADDP is a WriteV. +def CyWriteVADDLP : SchedWriteRes<[CyUnitV]> {let Latency = 2;} +def : InstRW<[CyWriteVADDLP], (instregex "SADDLPv","UADDLPv")>; + +def : InstRW<[CyWriteV3], + (instregex "ADDVv","SMAXVv","UMAXVv","SMINVv","UMINVv")>; + +def : InstRW<[CyWriteV3], (instregex "SADDLV","UADDLV")>; + +// ADD,SUB are WriteV + +// Forward declare. +def CyWriteVABD : SchedWriteRes<[CyUnitV]> {let Latency = 3;} + +// Add/Diff and accumulate uses the vector multiply unit. 
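The accumulate patterns defined just below rely on SchedReadAdvance to model a late read of the accumulator operand. As a rough illustration of the arithmetic this implies (a minimal sketch with a hypothetical helper name, not LLVM API):

    #include <cstdio>

    // effectiveLatency: hypothetical helper showing how a ReadAdvance shortens
    // the producer's visible latency for the operand that carries it -- here,
    // the accumulator input of SADALP/SABA-style instructions.
    unsigned effectiveLatency(unsigned writeLatency, unsigned readAdvance) {
      return writeLatency > readAdvance ? writeLatency - readAdvance : 0;
    }

    int main() {
      // CyWriteVAccum has latency 3 and CyReadVAccum advances the read by 1,
      // so a chain of accumulates sees 3 - 1 = 2 cycles per link under this model.
      std::printf("%u\n", effectiveLatency(3, 1));
      return 0;
    }

The clamping at zero is the generic scheduler's behaviour; this file only supplies the latency and advance values.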
+def CyWriteVAccum : SchedWriteRes<[CyUnitVM]> {let Latency = 3;} +def CyReadVAccum : SchedReadAdvance<1, + [CyWriteVAccum, CyWriteVADDLP, CyWriteVABD]>; + +def : InstRW<[CyWriteVAccum, CyReadVAccum], + (instregex "SADALP","UADALP")>; + +def : InstRW<[CyWriteVAccum, CyReadVAccum], + (instregex "SABAv","UABAv","SABALv","UABALv")>; + +def : InstRW<[CyWriteV3], (instregex "SQADDv","SQSUBv","UQADDv","UQSUBv")>; + +def : InstRW<[CyWriteV3], (instregex "SUQADDv","USQADDv")>; + +def : InstRW<[CyWriteV4], (instregex "ADDHNv","RADDHNv", "RSUBHNv", "SUBHNv")>; + +// WriteV includes: +// AND,BIC,CMTST,EOR,ORN,ORR +// ADDP +// SHADD,SHSUB,SRHADD,UHADD,UHSUB,URHADD +// SADDL,SSUBL,UADDL,USUBL +// SADDW,SSUBW,UADDW,USUBW + +def : InstRW<[CyWriteV3], (instregex "CMEQv","CMGEv","CMGTv", + "CMLEv","CMLTv", + "CMHIv","CMHSv")>; + +def : InstRW<[CyWriteV3], (instregex "SMAXv","SMINv","UMAXv","UMINv", + "SMAXPv","SMINPv","UMAXPv","UMINPv")>; + +def : InstRW<[CyWriteVABD], (instregex "SABDv","UABDv", + "SABDLv","UABDLv")>; + +//--- +// 7.9.3 Floating Point Arithmetic and Comparisons +//--- + +// FABS,FNEG are WriteF + +def : InstRW<[CyWriteV4], (instrs FADDPv2i32p)>; +def : InstRW<[CyWriteV5], (instrs FADDPv2i64p)>; + +def : InstRW<[CyWriteV3], (instregex "FMAXPv2i","FMAXNMPv2i", + "FMINPv2i","FMINNMPv2i")>; + +def : InstRW<[CyWriteV4], (instregex "FMAXVv","FMAXNMVv","FMINVv","FMINNMVv")>; + +def : InstRW<[CyWriteV4], (instrs FADDSrr,FADDv2f32,FADDv4f32, + FSUBSrr,FSUBv2f32,FSUBv4f32, + FADDPv2f32,FADDPv4f32, + FABD32,FABDv2f32,FABDv4f32)>; +def : InstRW<[CyWriteV5], (instrs FADDDrr,FADDv2f64, + FSUBDrr,FSUBv2f64, + FADDPv2f64, + FABD64,FABDv2f64)>; + +def : InstRW<[CyWriteV3], (instregex "FCMEQ","FCMGT","FCMLE","FCMLT")>; + +def : InstRW<[CyWriteV3], (instregex "FACGE","FACGT", + "FMAXS","FMAXD","FMAXv", + "FMINS","FMIND","FMINv", + "FMAXNMS","FMAXNMD","FMAXNMv", + "FMINNMS","FMINNMD","FMINNMv", + "FMAXPv2f","FMAXPv4f", + "FMINPv2f","FMINPv4f", + "FMAXNMPv2f","FMAXNMPv4f", + "FMINNMPv2f","FMINNMPv4f")>; + +// FCMP,FCMPE,FCCMP,FCCMPE +def : WriteRes {let Latency = 4;} + +// FCSEL is a WriteF. + +//--- +// 7.9.4 Shifts and Bitfield Operations +//--- + +// SHL is a WriteV + +def CyWriteVSHR : SchedWriteRes<[CyUnitV]> {let Latency = 2;} +def : InstRW<[CyWriteVSHR], (instregex "SSHRv","USHRv")>; + +def CyWriteVSRSHR : SchedWriteRes<[CyUnitV]> {let Latency = 3;} +def : InstRW<[CyWriteVSRSHR], (instregex "SRSHRv","URSHRv")>; + +// Shift and accumulate uses the vector multiply unit. +def CyWriteVShiftAcc : SchedWriteRes<[CyUnitVM]> {let Latency = 3;} +def CyReadVShiftAcc : SchedReadAdvance<1, + [CyWriteVShiftAcc, CyWriteVSHR, CyWriteVSRSHR]>; +def : InstRW<[CyWriteVShiftAcc, CyReadVShiftAcc], + (instregex "SRSRAv","SSRAv","URSRAv","USRAv")>; + +// SSHL,USHL are WriteV. + +def : InstRW<[CyWriteV3], (instregex "SRSHLv","URSHLv")>; + +// SQSHL,SQSHLU,UQSHL are WriteV. + +def : InstRW<[CyWriteV3], (instregex "SQRSHLv","UQRSHLv")>; + +// WriteV includes: +// SHLL,SSHLL,USHLL +// SLI,SRI +// BIF,BIT,BSL +// EXT +// CLS,CLZ,CNT,RBIT,REV16,REV32,REV64,XTN +// XTN2 + +def : InstRW<[CyWriteV4], + (instregex "RSHRNv","SHRNv", + "SQRSHRNv","SQRSHRUNv","SQSHRNv","SQSHRUNv", + "UQRSHRNv","UQSHRNv","SQXTNv","SQXTUNv","UQXTNv")>; + +//--- +// 7.9.5 Multiplication +//--- + +def CyWriteVMul : SchedWriteRes<[CyUnitVM]> { let Latency = 4;} +def : InstRW<[CyWriteVMul], (instregex "MULv","SMULLv","UMULLv", + "SQDMULLv","SQDMULHv","SQRDMULHv")>; + +// FMUL,FMULX,FNMUL default to WriteFMul. 
+def : WriteRes { let Latency = 4;} + +def CyWriteV64Mul : SchedWriteRes<[CyUnitVM]> { let Latency = 5;} +def : InstRW<[CyWriteV64Mul], (instrs FMULDrr,FMULv2f64,FMULv2i64_indexed, + FNMULDrr,FMULX64,FMULXv2f64,FMULXv2i64_indexed)>; + +def CyReadVMulAcc : SchedReadAdvance<1, [CyWriteVMul, CyWriteV64Mul]>; +def : InstRW<[CyWriteVMul, CyReadVMulAcc], + (instregex "MLA","MLS","SMLAL","SMLSL","UMLAL","UMLSL", + "SQDMLAL","SQDMLSL")>; + +def CyWriteSMul : SchedWriteRes<[CyUnitVM]> { let Latency = 8;} +def CyWriteDMul : SchedWriteRes<[CyUnitVM]> { let Latency = 10;} +def CyReadSMul : SchedReadAdvance<4, [CyWriteSMul]>; +def CyReadDMul : SchedReadAdvance<5, [CyWriteDMul]>; + +def : InstRW<[CyWriteSMul, CyReadSMul], + (instrs FMADDSrrr,FMSUBSrrr,FNMADDSrrr,FNMSUBSrrr, + FMLAv2f32,FMLAv4f32, + FMLAv1i32_indexed,FMLAv1i64_indexed,FMLAv2i32_indexed)>; +def : InstRW<[CyWriteDMul, CyReadDMul], + (instrs FMADDDrrr,FMSUBDrrr,FNMADDDrrr,FNMSUBDrrr, + FMLAv2f64,FMLAv2i64_indexed, + FMLSv2f64,FMLSv2i64_indexed)>; + +def CyWritePMUL : SchedWriteRes<[CyUnitVD]> { let Latency = 3; } +def : InstRW<[CyWritePMUL], (instregex "PMULv", "PMULLv")>; + +//--- +// 7.9.6 Divide and Square Root +//--- + +// FDIV,FSQRT +// TODO: Add 64-bit variant with 19 cycle latency. +// TODO: Specialize FSQRT for longer latency. +def : WriteRes { + let Latency = 17; + let ResourceCycles = [2, 17]; +} + +def : InstRW<[CyWriteV4], (instregex "FRECPEv","FRECPXv","URECPEv","URSQRTEv")>; + +def WriteFRSQRTE : SchedWriteRes<[CyUnitVM]> { let Latency = 4; } +def : InstRW<[WriteFRSQRTE], (instregex "FRSQRTEv")>; + +def WriteFRECPS : SchedWriteRes<[CyUnitVM]> { let Latency = 8; } +def WriteFRSQRTS : SchedWriteRes<[CyUnitVM]> { let Latency = 10; } +def : InstRW<[WriteFRECPS], (instregex "FRECPSv")>; +def : InstRW<[WriteFRSQRTS], (instregex "FRSQRTSv")>; + +//--- +// 7.9.7 Integer-FP Conversions +//--- + +// FCVT lengthen f16/s32 +def : InstRW<[WriteV], (instrs FCVTSHr,FCVTDHr,FCVTDSr)>; + +// FCVT,FCVTN,FCVTXN +// SCVTF,UCVTF V,V +// FRINT(AIMNPXZ) V,V +def : WriteRes {let Latency = 4;} + +// SCVT/UCVT S/D, Rd = VLD5+V4: 9 cycles. +def CyWriteCvtToFPR : WriteSequence<[WriteVLD, CyWriteV4]>; +def : InstRW<[CyWriteCopyToFPR], (instregex "FCVT[AMNPZ][SU][SU][WX][SD]r")>; + +// FCVT Rd, S/D = V6+LD4: 10 cycles +def CyWriteCvtToGPR : WriteSequence<[CyWriteV6, WriteLD]>; +def : InstRW<[CyWriteCvtToGPR], (instregex "[SU]CVTF[SU][WX][SD]r")>; + +// FCVTL is a WriteV + +//--- +// 7.9.8-7.9.10 Cryptography, Data Transposition, Table Lookup +//--- + +def CyWriteCrypto2 : SchedWriteRes<[CyUnitVD]> {let Latency = 2;} +def : InstRW<[CyWriteCrypto2], (instrs AESIMCrr, AESMCrr, SHA1Hrr, + AESDrr, AESErr, SHA1SU1rr, SHA256SU0rr, + SHA1SU0rrr)>; + +def CyWriteCrypto3 : SchedWriteRes<[CyUnitVD]> {let Latency = 3;} +def : InstRW<[CyWriteCrypto3], (instrs SHA256SU1rrr)>; + +def CyWriteCrypto6 : SchedWriteRes<[CyUnitVD]> {let Latency = 6;} +def : InstRW<[CyWriteCrypto6], (instrs SHA1Crrr, SHA1Mrrr, SHA1Prrr, + SHA256Hrrr,SHA256H2rrr)>; + +// TRN,UZP,ZUP are WriteV. + +// TBL,TBX are WriteV. + +//--- +// 7.9.11-7.9.14 Load/Store, single element and paired +//--- + +// Loading into the vector unit takes 5 cycles vs 4 for integer loads. +def : WriteRes { + let Latency = 5; +} + +// Store-load forwarding is 4 cycles. +def : WriteRes { + let Latency = 4; +} + +// WriteVLDPair/VSTPair sequences are expanded by the target description. + +//--- +// 7.9.15 Load, element operations +//--- + +// Only the first WriteVLD and WriteAdr for writeback matches def operands. 
+// Subsequent WriteVLDs consume resources. Since all loaded values have the +// same latency, this is acceptable. + +// Vd is read 5 cycles after issuing the vector load. +def : ReadAdvance; + +def : InstRW<[WriteVLD], + (instregex "LD1Onev(8b|4h|2s|1d|16b|8h|4s|2d)$")>; +def : InstRW<[WriteVLD, WriteAdr], + (instregex "LD1Onev(8b|4h|2s|1d|16b|8h|4s|2d)_POST")>; + +// Register writes from the load's high half are fused micro-ops. +def : InstRW<[WriteVLD], + (instregex "LD1Twov(8b|4h|2s|1d)$")>; +def : InstRW<[WriteVLD, WriteAdr], + (instregex "LD1Twov(8b|4h|2s|1d)_POST")>; +def : InstRW<[WriteVLD, WriteVLD], + (instregex "LD1Twov(16b|8h|4s|2d)$")>; +def : InstRW<[WriteVLD, WriteAdr, WriteVLD], + (instregex "LD1Twov(16b|8h|4s|2d)_POST")>; + +def : InstRW<[WriteVLD, WriteVLD], + (instregex "LD1Threev(8b|4h|2s|1d)$")>; +def : InstRW<[WriteVLD, WriteAdr, WriteVLD], + (instregex "LD1Threev(8b|4h|2s|1d)_POST")>; +def : InstRW<[WriteVLD, WriteVLD, WriteVLD], + (instregex "LD1Threev(16b|8h|4s|2d)$")>; +def : InstRW<[WriteVLD, WriteAdr, WriteVLD, WriteVLD], + (instregex "LD1Threev(16b|8h|4s|2d)_POST")>; + +def : InstRW<[WriteVLD, WriteVLD], + (instregex "LD1Fourv(8b|4h|2s|1d)$")>; +def : InstRW<[WriteVLD, WriteAdr, WriteVLD], + (instregex "LD1Fourv(8b|4h|2s|1d)_POST")>; +def : InstRW<[WriteVLD, WriteVLD, WriteVLD, WriteVLD], + (instregex "LD1Fourv(16b|8h|4s|2d)$")>; +def : InstRW<[WriteVLD, WriteAdr, WriteVLD, WriteVLD, WriteVLD], + (instregex "LD1Fourv(16b|8h|4s|2d)_POST")>; + +def : InstRW<[WriteVLDShuffle, ReadVLD], + (instregex "LD1i(8|16|32)$")>; +def : InstRW<[WriteVLDShuffle, ReadVLD, WriteAdr], + (instregex "LD1i(8|16|32)_POST")>; + +def : InstRW<[WriteVLDShuffle, ReadVLD], (instrs LD1i64)>; +def : InstRW<[WriteVLDShuffle, ReadVLD, WriteAdr],(instrs LD1i64_POST)>; + +def : InstRW<[WriteVLDShuffle], + (instregex "LD1Rv(8b|4h|2s|1d|16b|8h|4s|2d)$")>; +def : InstRW<[WriteVLDShuffle, WriteAdr], + (instregex "LD1Rv(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>; + +def : InstRW<[WriteVLDShuffle, WriteV], + (instregex "LD2Twov(8b|4h|2s)$")>; +def : InstRW<[WriteVLDShuffle, WriteAdr, WriteV], + (instregex "LD2Twov(8b|4h|2s)_POST$")>; +def : InstRW<[WriteVLDShuffle, WriteVLDShuffle], + (instregex "LD2Twov(16b|8h|4s|2d)$")>; +def : InstRW<[WriteVLDShuffle, WriteAdr, WriteVLDShuffle], + (instregex "LD2Twov(16b|8h|4s|2d)_POST")>; + +def : InstRW<[WriteVLDShuffle, ReadVLD, WriteV], + (instregex "LD2i(8|16|32)$")>; +def : InstRW<[WriteVLDShuffle, ReadVLD, WriteAdr, WriteV], + (instregex "LD2i(8|16|32)_POST")>; +def : InstRW<[WriteVLDShuffle, ReadVLD, WriteV], + (instregex "LD2i64$")>; +def : InstRW<[WriteVLDShuffle, ReadVLD, WriteAdr, WriteV], + (instregex "LD2i64_POST")>; + +def : InstRW<[WriteVLDShuffle, WriteV], + (instregex "LD2Rv(8b|4h|2s|1d|16b|8h|4s|2d)$")>; +def : InstRW<[WriteVLDShuffle, WriteAdr, WriteV], + (instregex "LD2Rv(8b|4h|2s|1d|16b|8h|4s|2d)_POST")>; + +def : InstRW<[WriteVLDShuffle, WriteVLDShuffle, WriteV], + (instregex "LD3Threev(8b|4h|2s)$")>; +def : InstRW<[WriteVLDShuffle, WriteAdr, WriteVLDShuffle, WriteV], + (instregex "LD3Threev(8b|4h|2s)_POST")>; +def : InstRW<[WriteVLDShuffle, WriteVLDShuffle, WriteVLDShuffle], + (instregex "LD3Threev(16b|8h|4s|2d)$")>; +def : InstRW<[WriteVLDShuffle, WriteAdr, WriteVLDShuffle, WriteVLDShuffle], + (instregex "LD3Threev(16b|8h|4s|2d)_POST")>; + +def : InstRW<[WriteVLDShuffle, ReadVLD, WriteV, WriteV], + (instregex "LD3i(8|16|32)$")>; +def : InstRW<[WriteVLDShuffle, ReadVLD, WriteAdr, WriteV, WriteV], + (instregex "LD3i(8|16|32)_POST")>; + +def : 
InstRW<[WriteVLDShuffle, ReadVLD, WriteVLDShuffle, WriteV], + (instregex "LD3i64$")>; +def : InstRW<[WriteVLDShuffle, ReadVLD, WriteAdr, WriteVLDShuffle, WriteV], + (instregex "LD3i64_POST")>; + +def : InstRW<[WriteVLDShuffle, WriteV, WriteV], + (instregex "LD3Rv(8b|4h|2s|16b|8h|4s)$")>; +def : InstRW<[WriteVLDShuffle, WriteAdr, WriteV, WriteV], + (instregex "LD3Rv(8b|4h|2s|16b|8h|4s)_POST")>; + +def : InstRW<[WriteVLDShuffle, WriteVLDShuffle, WriteV], + (instrs LD3Rv1d,LD3Rv2d)>; +def : InstRW<[WriteVLDShuffle, WriteAdr, WriteVLDShuffle, WriteV], + (instrs LD3Rv2d_POST,LD3Rv2d_POST)>; + +def : InstRW<[WriteVLDShuffle, WriteVLDShuffle, WriteV, WriteV], + (instregex "LD4Fourv(8b|4h|2s)$")>; +def : InstRW<[WriteVLDShuffle, WriteAdr, WriteVLDShuffle, WriteV, WriteV], + (instregex "LD4Fourv(8b|4h|2s)_POST")>; +def : InstRW<[WriteVLDPairShuffle, WriteVLDPairShuffle, + WriteVLDPairShuffle, WriteVLDPairShuffle], + (instregex "LD4Fourv(16b|8h|4s|2d)$")>; +def : InstRW<[WriteVLDPairShuffle, WriteAdr, WriteVLDPairShuffle, + WriteVLDPairShuffle, WriteVLDPairShuffle], + (instregex "LD4Fourv(16b|8h|4s|2d)_POST")>; + +def : InstRW<[WriteVLDShuffle, ReadVLD, WriteV, WriteV, WriteV], + (instregex "LD4i(8|16|32)$")>; +def : InstRW<[WriteVLDShuffle, ReadVLD, WriteAdr, WriteV, WriteV, WriteV], + (instregex "LD4i(8|16|32)_POST")>; + + +def : InstRW<[WriteVLDShuffle, ReadVLD, WriteVLDShuffle, WriteV, WriteV], + (instrs LD4i64)>; +def : InstRW<[WriteVLDShuffle, ReadVLD, WriteAdr, WriteVLDShuffle, WriteV], + (instrs LD4i64_POST)>; + +def : InstRW<[WriteVLDShuffle, WriteV, WriteV, WriteV], + (instregex "LD4Rv(8b|4h|2s|16b|8h|4s)$")>; +def : InstRW<[WriteVLDShuffle, WriteAdr, WriteV, WriteV, WriteV], + (instregex "LD4Rv(8b|4h|2s|16b|8h|4s)_POST")>; + +def : InstRW<[WriteVLDShuffle, WriteVLDShuffle, WriteV, WriteV], + (instrs LD4Rv1d,LD4Rv2d)>; +def : InstRW<[WriteVLDShuffle, WriteAdr, WriteVLDShuffle, WriteV, WriteV], + (instrs LD4Rv1d_POST,LD4Rv2d_POST)>; + +//--- +// 7.9.16 Store, element operations +//--- + +// Only the WriteAdr for writeback matches a def operands. +// Subsequent WriteVLDs only consume resources. 
+ +def : InstRW<[WriteVST], + (instregex "ST1Onev(8b|4h|2s|1d|16b|8h|4s|2d)$")>; +def : InstRW<[WriteAdr, WriteVST], + (instregex "ST1Onev(8b|4h|2s|1d|16b|8h|4s|2d)_POST")>; + +def : InstRW<[WriteVSTShuffle], + (instregex "ST1Twov(8b|4h|2s|1d)$")>; +def : InstRW<[WriteAdr, WriteVSTShuffle], + (instregex "ST1Twov(8b|4h|2s|1d)_POST")>; +def : InstRW<[WriteVST, WriteVST], + (instregex "ST1Twov(16b|8h|4s|2d)$")>; +def : InstRW<[WriteAdr, WriteVST, WriteVST], + (instregex "ST1Twov(16b|8h|4s|2d)_POST")>; + +def : InstRW<[WriteVSTShuffle, WriteVST], + (instregex "ST1Threev(8b|4h|2s|1d)$")>; +def : InstRW<[WriteAdr, WriteVSTShuffle, WriteVST], + (instregex "ST1Threev(8b|4h|2s|1d)_POST")>; +def : InstRW<[WriteVST, WriteVST, WriteVST], + (instregex "ST1Threev(16b|8h|4s|2d)$")>; +def : InstRW<[WriteAdr, WriteVST, WriteVST, WriteVST], + (instregex "ST1Threev(16b|8h|4s|2d)_POST")>; + +def : InstRW<[WriteVSTShuffle, WriteVSTShuffle], + (instregex "ST1Fourv(8b|4h|2s|1d)$")>; +def : InstRW<[WriteAdr, WriteVSTShuffle, WriteVSTShuffle], + (instregex "ST1Fourv(8b|4h|2s|1d)_POST")>; +def : InstRW<[WriteVST, WriteVST, WriteVST, WriteVST], + (instregex "ST1Fourv(16b|8h|4s|2d)$")>; +def : InstRW<[WriteAdr, WriteVST, WriteVST, WriteVST, WriteVST], + (instregex "ST1Fourv(16b|8h|4s|2d)_POST")>; + +def : InstRW<[WriteVSTShuffle], (instregex "ST1i(8|16|32)$")>; +def : InstRW<[WriteAdr, WriteVSTShuffle], (instregex "ST1i(8|16|32)_POST")>; + +def : InstRW<[WriteVSTShuffle], (instrs ST1i64)>; +def : InstRW<[WriteAdr, WriteVSTShuffle], (instrs ST1i64_POST)>; + +def : InstRW<[WriteVSTShuffle], + (instregex "ST2Twov(8b|4h|2s)$")>; +def : InstRW<[WriteAdr, WriteVSTShuffle], + (instregex "ST2Twov(8b|4h|2s)_POST")>; +def : InstRW<[WriteVSTShuffle, WriteVSTShuffle], + (instregex "ST2Twov(16b|8h|4s|2d)$")>; +def : InstRW<[WriteAdr, WriteVSTShuffle, WriteVSTShuffle], + (instregex "ST2Twov(16b|8h|4s|2d)_POST")>; + +def : InstRW<[WriteVSTShuffle], (instregex "ST2i(8|16|32)$")>; +def : InstRW<[WriteAdr, WriteVSTShuffle], (instregex "ST2i(8|16|32)_POST")>; +def : InstRW<[WriteVSTShuffle], (instrs ST2i64)>; +def : InstRW<[WriteAdr, WriteVSTShuffle], (instrs ST2i64_POST)>; + +def : InstRW<[WriteVSTShuffle, WriteVSTShuffle], + (instregex "ST3Threev(8b|4h|2s)$")>; +def : InstRW<[WriteAdr, WriteVSTShuffle, WriteVSTShuffle], + (instregex "ST3Threev(8b|4h|2s)_POST")>; +def : InstRW<[WriteVSTShuffle, WriteVSTShuffle, WriteVSTShuffle], + (instregex "ST3Threev(16b|8h|4s|2d)$")>; +def : InstRW<[WriteAdr, WriteVSTShuffle, WriteVSTShuffle, WriteVSTShuffle], + (instregex "ST3Threev(16b|8h|4s|2d)_POST")>; + +def : InstRW<[WriteVSTShuffle], (instregex "ST3i(8|16|32)$")>; +def : InstRW<[WriteAdr, WriteVSTShuffle], (instregex "ST3i(8|16|32)_POST")>; + +def :InstRW<[WriteVSTShuffle, WriteVSTShuffle], (instrs ST3i64)>; +def :InstRW<[WriteAdr, WriteVSTShuffle, WriteVSTShuffle], (instrs ST3i64_POST)>; + +def : InstRW<[WriteVSTPairShuffle, WriteVSTPairShuffle], + (instregex "ST4Fourv(8b|4h|2s|1d)$")>; +def : InstRW<[WriteAdr, WriteVSTPairShuffle, WriteVSTPairShuffle], + (instregex "ST4Fourv(8b|4h|2s|1d)_POST")>; +def : InstRW<[WriteVSTPairShuffle, WriteVSTPairShuffle, + WriteVSTPairShuffle, WriteVSTPairShuffle], + (instregex "ST4Fourv(16b|8h|4s|2d)$")>; +def : InstRW<[WriteAdr, WriteVSTPairShuffle, WriteVSTPairShuffle, + WriteVSTPairShuffle, WriteVSTPairShuffle], + (instregex "ST4Fourv(16b|8h|4s|2d)_POST")>; + +def : InstRW<[WriteVSTPairShuffle], (instregex "ST4i(8|16|32)$")>; +def : InstRW<[WriteAdr, WriteVSTPairShuffle], (instregex 
"ST4i(8|16|32)_POST")>; + +def : InstRW<[WriteVSTShuffle, WriteVSTShuffle], (instrs ST4i64)>; +def : InstRW<[WriteAdr, WriteVSTShuffle, WriteVSTShuffle],(instrs ST4i64_POST)>; + +//--- +// Unused SchedRead types +//--- + +def : ReadAdvance; +def : ReadAdvance; +def : ReadAdvance; +def : ReadAdvance; +def : ReadAdvance; +def : ReadAdvance; + +} // SchedModel = CycloneModel diff --git a/lib/Target/AArch64/AArch64Schedule.td b/lib/Target/AArch64/AArch64Schedule.td new file mode 100644 index 00000000000..eaa9110ab1b --- /dev/null +++ b/lib/Target/AArch64/AArch64Schedule.td @@ -0,0 +1,104 @@ +//==-- AArch64Schedule.td - AArch64 Scheduling Definitions -*- tablegen -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// + +// Define TII for use in SchedVariant Predicates. +// const MachineInstr *MI and const TargetSchedModel *SchedModel +// are defined by default. +def : PredicateProlog<[{ + const AArch64InstrInfo *TII = + static_cast(SchedModel->getInstrInfo()); + (void)TII; +}]>; + +// AArch64 Scheduler Definitions + +def WriteImm : SchedWrite; // MOVN, MOVZ +// TODO: Provide variants for MOV32/64imm Pseudos that dynamically +// select the correct sequence of WriteImms. + +def WriteI : SchedWrite; // ALU +def WriteISReg : SchedWrite; // ALU of Shifted-Reg +def WriteIEReg : SchedWrite; // ALU of Extended-Reg +def ReadI : SchedRead; // ALU +def ReadISReg : SchedRead; // ALU of Shifted-Reg +def ReadIEReg : SchedRead; // ALU of Extended-Reg +def WriteExtr : SchedWrite; // EXTR shifts a reg pair +def ReadExtrHi : SchedRead; // Read the high reg of the EXTR pair +def WriteIS : SchedWrite; // Shift/Scale +def WriteID32 : SchedWrite; // 32-bit Divide +def WriteID64 : SchedWrite; // 64-bit Divide +def ReadID : SchedRead; // 32/64-bit Divide +def WriteIM32 : SchedWrite; // 32-bit Multiply +def WriteIM64 : SchedWrite; // 64-bit Multiply +def ReadIM : SchedRead; // 32/64-bit Multiply +def ReadIMA : SchedRead; // 32/64-bit Multiply Accumulate +def WriteBr : SchedWrite; // Branch +def WriteBrReg : SchedWrite; // Indirect Branch + +def WriteLD : SchedWrite; // Load from base addr plus immediate offset +def WriteST : SchedWrite; // Store to base addr plus immediate offset +def WriteSTP : SchedWrite; // Store a register pair. +def WriteAdr : SchedWrite; // Address pre/post increment. + +def WriteLDIdx : SchedWrite; // Load from a register index (maybe scaled). +def WriteSTIdx : SchedWrite; // Store to a register index (maybe scaled). +def ReadAdrBase : SchedRead; // Read the base resister of a reg-offset LD/ST. + +// Predicate for determining when a shiftable register is shifted. +def RegShiftedPred : SchedPredicate<[{TII->hasShiftedReg(MI)}]>; + +// Predicate for determining when a extendedable register is extended. +def RegExtendedPred : SchedPredicate<[{TII->hasExtendedReg(MI)}]>; + +// ScaledIdxPred is true if a WriteLDIdx operand will be +// scaled. Subtargets can use this to dynamically select resources and +// latency for WriteLDIdx and ReadAdrBase. +def ScaledIdxPred : SchedPredicate<[{TII->isScaledAddr(MI)}]>; + +// Serialized two-level address load. +// EXAMPLE: LOADGot +def WriteLDAdr : WriteSequence<[WriteAdr, WriteLD]>; + +// Serialized two-level address lookup. +// EXAMPLE: MOVaddr... +def WriteAdrAdr : WriteSequence<[WriteAdr, WriteAdr]>; + +// The second register of a load-pair. 
+// LDP,LDPSW,LDNP,LDXP,LDAXP +def WriteLDHi : SchedWrite; + +// Store-exclusive is a store followed by a dependent load. +def WriteSTX : WriteSequence<[WriteST, WriteLD]>; + +def WriteSys : SchedWrite; // Long, variable latency system ops. +def WriteBarrier : SchedWrite; // Memory barrier. +def WriteHint : SchedWrite; // Hint instruction. + +def WriteF : SchedWrite; // General floating-point ops. +def WriteFCmp : SchedWrite; // Floating-point compare. +def WriteFCvt : SchedWrite; // Float conversion. +def WriteFCopy : SchedWrite; // Float-int register copy. +def WriteFImm : SchedWrite; // Floating-point immediate. +def WriteFMul : SchedWrite; // Floating-point multiply. +def WriteFDiv : SchedWrite; // Floating-point division. + +def WriteV : SchedWrite; // Vector ops. +def WriteVLD : SchedWrite; // Vector loads. +def WriteVST : SchedWrite; // Vector stores. + +// Read the unwritten lanes of the VLD's destination registers. +def ReadVLD : SchedRead; + +// Sequential vector load and shuffle. +def WriteVLDShuffle : WriteSequence<[WriteVLD, WriteV]>; +def WriteVLDPairShuffle : WriteSequence<[WriteVLD, WriteV, WriteV]>; + +// Store a shuffled vector. +def WriteVSTShuffle : WriteSequence<[WriteV, WriteVST]>; +def WriteVSTPairShuffle : WriteSequence<[WriteV, WriteV, WriteVST]>; diff --git a/lib/Target/AArch64/AArch64SelectionDAGInfo.cpp b/lib/Target/AArch64/AArch64SelectionDAGInfo.cpp new file mode 100644 index 00000000000..5c65b750ee5 --- /dev/null +++ b/lib/Target/AArch64/AArch64SelectionDAGInfo.cpp @@ -0,0 +1,59 @@ +//===-- AArch64SelectionDAGInfo.cpp - AArch64 SelectionDAG Info -----------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file implements the AArch64SelectionDAGInfo class. +// +//===----------------------------------------------------------------------===// + +#include "AArch64TargetMachine.h" +using namespace llvm; + +#define DEBUG_TYPE "aarch64-selectiondag-info" + +AArch64SelectionDAGInfo::AArch64SelectionDAGInfo(const TargetMachine &TM) + : TargetSelectionDAGInfo(TM), + Subtarget(&TM.getSubtarget()) {} + +AArch64SelectionDAGInfo::~AArch64SelectionDAGInfo() {} + +SDValue AArch64SelectionDAGInfo::EmitTargetCodeForMemset( + SelectionDAG &DAG, SDLoc dl, SDValue Chain, SDValue Dst, SDValue Src, + SDValue Size, unsigned Align, bool isVolatile, + MachinePointerInfo DstPtrInfo) const { + // Check to see if there is a specialized entry-point for memory zeroing. + ConstantSDNode *V = dyn_cast(Src); + ConstantSDNode *SizeValue = dyn_cast(Size); + const char *bzeroEntry = + (V && V->isNullValue()) ? Subtarget->getBZeroEntry() : nullptr; + // For small size (< 256), it is not beneficial to use bzero + // instead of memset. 
+ if (bzeroEntry && (!SizeValue || SizeValue->getZExtValue() > 256)) { + const AArch64TargetLowering &TLI = + *static_cast( + DAG.getTarget().getTargetLowering()); + + EVT IntPtr = TLI.getPointerTy(); + Type *IntPtrTy = getDataLayout()->getIntPtrType(*DAG.getContext()); + TargetLowering::ArgListTy Args; + TargetLowering::ArgListEntry Entry; + Entry.Node = Dst; + Entry.Ty = IntPtrTy; + Args.push_back(Entry); + Entry.Node = Size; + Args.push_back(Entry); + TargetLowering::CallLoweringInfo CLI(DAG); + CLI.setDebugLoc(dl).setChain(Chain) + .setCallee(CallingConv::C, Type::getVoidTy(*DAG.getContext()), + DAG.getExternalSymbol(bzeroEntry, IntPtr), &Args, 0) + .setDiscardResult(); + std::pair CallResult = TLI.LowerCallTo(CLI); + return CallResult.second; + } + return SDValue(); +} diff --git a/lib/Target/AArch64/AArch64SelectionDAGInfo.h b/lib/Target/AArch64/AArch64SelectionDAGInfo.h new file mode 100644 index 00000000000..8381f9916a8 --- /dev/null +++ b/lib/Target/AArch64/AArch64SelectionDAGInfo.h @@ -0,0 +1,37 @@ +//===-- AArch64SelectionDAGInfo.h - AArch64 SelectionDAG Info ---*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file defines the AArch64 subclass for TargetSelectionDAGInfo. +// +//===----------------------------------------------------------------------===// + +#ifndef AArch64SELECTIONDAGINFO_H +#define AArch64SELECTIONDAGINFO_H + +#include "llvm/Target/TargetSelectionDAGInfo.h" + +namespace llvm { + +class AArch64SelectionDAGInfo : public TargetSelectionDAGInfo { + /// Subtarget - Keep a pointer to the ARMSubtarget around so that we can + /// make the right decision when generating code for different targets. + const AArch64Subtarget *Subtarget; + +public: + explicit AArch64SelectionDAGInfo(const TargetMachine &TM); + ~AArch64SelectionDAGInfo(); + + SDValue EmitTargetCodeForMemset(SelectionDAG &DAG, SDLoc dl, SDValue Chain, + SDValue Dst, SDValue Src, SDValue Size, + unsigned Align, bool isVolatile, + MachinePointerInfo DstPtrInfo) const override; +}; +} + +#endif diff --git a/lib/Target/AArch64/AArch64StorePairSuppress.cpp b/lib/Target/AArch64/AArch64StorePairSuppress.cpp new file mode 100644 index 00000000000..45f8ddbd2d8 --- /dev/null +++ b/lib/Target/AArch64/AArch64StorePairSuppress.cpp @@ -0,0 +1,168 @@ +//===--- AArch64StorePairSuppress.cpp --- Suppress store pair formation ---===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This pass identifies floating point stores that should not be combined into +// store pairs. Later we may do the same for floating point loads. 
+// ===---------------------------------------------------------------------===// + +#include "AArch64InstrInfo.h" +#include "llvm/CodeGen/MachineFunction.h" +#include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/CodeGen/MachineInstr.h" +#include "llvm/CodeGen/MachineTraceMetrics.h" +#include "llvm/CodeGen/TargetSchedule.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/raw_ostream.h" +#include "llvm/Target/TargetInstrInfo.h" + +using namespace llvm; + +#define DEBUG_TYPE "aarch64-stp-suppress" + +namespace { +class AArch64StorePairSuppress : public MachineFunctionPass { + const AArch64InstrInfo *TII; + const TargetRegisterInfo *TRI; + const MachineRegisterInfo *MRI; + MachineFunction *MF; + TargetSchedModel SchedModel; + MachineTraceMetrics *Traces; + MachineTraceMetrics::Ensemble *MinInstr; + +public: + static char ID; + AArch64StorePairSuppress() : MachineFunctionPass(ID) {} + + virtual const char *getPassName() const override { + return "AArch64 Store Pair Suppression"; + } + + bool runOnMachineFunction(MachineFunction &F) override; + +private: + bool shouldAddSTPToBlock(const MachineBasicBlock *BB); + + bool isNarrowFPStore(const MachineInstr &MI); + + virtual void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.setPreservesCFG(); + AU.addRequired(); + AU.addPreserved(); + MachineFunctionPass::getAnalysisUsage(AU); + } +}; +char AArch64StorePairSuppress::ID = 0; +} // anonymous + +FunctionPass *llvm::createAArch64StorePairSuppressPass() { + return new AArch64StorePairSuppress(); +} + +/// Return true if an STP can be added to this block without increasing the +/// critical resource height. STP is good to form in Ld/St limited blocks and +/// bad to form in float-point limited blocks. This is true independent of the +/// critical path. If the critical path is longer than the resource height, the +/// extra vector ops can limit physreg renaming. Otherwise, it could simply +/// oversaturate the vector units. +bool AArch64StorePairSuppress::shouldAddSTPToBlock(const MachineBasicBlock *BB) { + if (!MinInstr) + MinInstr = Traces->getEnsemble(MachineTraceMetrics::TS_MinInstrCount); + + MachineTraceMetrics::Trace BBTrace = MinInstr->getTrace(BB); + unsigned ResLength = BBTrace.getResourceLength(); + + // Get the machine model's scheduling class for STPQi. + // Bypass TargetSchedule's SchedClass resolution since we only have an opcode. + unsigned SCIdx = TII->get(AArch64::STPDi).getSchedClass(); + const MCSchedClassDesc *SCDesc = + SchedModel.getMCSchedModel()->getSchedClassDesc(SCIdx); + + // If a subtarget does not define resources for STPQi, bail here. + if (SCDesc->isValid() && !SCDesc->isVariant()) { + unsigned ResLenWithSTP = BBTrace.getResourceLength( + ArrayRef(), SCDesc); + if (ResLenWithSTP > ResLength) { + DEBUG(dbgs() << " Suppress STP in BB: " << BB->getNumber() + << " resources " << ResLength << " -> " << ResLenWithSTP + << "\n"); + return false; + } + } + return true; +} + +/// Return true if this is a floating-point store smaller than the V reg. On +/// cyclone, these require a vector shuffle before storing a pair. +/// Ideally we would call getMatchingPairOpcode() and have the machine model +/// tell us if it's profitable with no cpu knowledge here. +/// +/// FIXME: We plan to develop a decent Target abstraction for simple loads and +/// stores. Until then use a nasty switch similar to AArch64LoadStoreOptimizer. 
+bool AArch64StorePairSuppress::isNarrowFPStore(const MachineInstr &MI) { + switch (MI.getOpcode()) { + default: + return false; + case AArch64::STRSui: + case AArch64::STRDui: + case AArch64::STURSi: + case AArch64::STURDi: + return true; + } +} + +bool AArch64StorePairSuppress::runOnMachineFunction(MachineFunction &mf) { + MF = &mf; + TII = static_cast(MF->getTarget().getInstrInfo()); + TRI = MF->getTarget().getRegisterInfo(); + MRI = &MF->getRegInfo(); + const TargetSubtargetInfo &ST = + MF->getTarget().getSubtarget(); + SchedModel.init(*ST.getSchedModel(), &ST, TII); + + Traces = &getAnalysis(); + MinInstr = nullptr; + + DEBUG(dbgs() << "*** " << getPassName() << ": " << MF->getName() << '\n'); + + if (!SchedModel.hasInstrSchedModel()) { + DEBUG(dbgs() << " Skipping pass: no machine model present.\n"); + return false; + } + + // Check for a sequence of stores to the same base address. We don't need to + // precisely determine whether a store pair can be formed. But we do want to + // filter out most situations where we can't form store pairs to avoid + // computing trace metrics in those cases. + for (auto &MBB : *MF) { + bool SuppressSTP = false; + unsigned PrevBaseReg = 0; + for (auto &MI : MBB) { + if (!isNarrowFPStore(MI)) + continue; + unsigned BaseReg; + unsigned Offset; + if (TII->getLdStBaseRegImmOfs(&MI, BaseReg, Offset, TRI)) { + if (PrevBaseReg == BaseReg) { + // If this block can take STPs, skip ahead to the next block. + if (!SuppressSTP && shouldAddSTPToBlock(MI.getParent())) + break; + // Otherwise, continue unpairing the stores in this block. + DEBUG(dbgs() << "Unpairing store " << MI << "\n"); + SuppressSTP = true; + TII->suppressLdStPair(&MI); + } + PrevBaseReg = BaseReg; + } else + PrevBaseReg = 0; + } + } + // This pass just sets some internal MachineMemOperand flags. It can't really + // invalidate anything. + return false; +} diff --git a/lib/Target/AArch64/AArch64Subtarget.cpp b/lib/Target/AArch64/AArch64Subtarget.cpp new file mode 100644 index 00000000000..cd69994620d --- /dev/null +++ b/lib/Target/AArch64/AArch64Subtarget.cpp @@ -0,0 +1,116 @@ +//===-- AArch64Subtarget.cpp - AArch64 Subtarget Information ----*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file implements the AArch64 specific subclass of TargetSubtarget. 
+// +//===----------------------------------------------------------------------===// + +#include "AArch64InstrInfo.h" +#include "AArch64Subtarget.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/CodeGen/MachineScheduler.h" +#include "llvm/IR/GlobalValue.h" +#include "llvm/Support/TargetRegistry.h" + +using namespace llvm; + +#define DEBUG_TYPE "aarch64-subtarget" + +#define GET_SUBTARGETINFO_CTOR +#define GET_SUBTARGETINFO_TARGET_DESC +#include "AArch64GenSubtargetInfo.inc" + +static cl::opt +EnableEarlyIfConvert("aarch64-early-ifcvt", cl::desc("Enable the early if " + "converter pass"), cl::init(true), cl::Hidden); + +AArch64Subtarget::AArch64Subtarget(const std::string &TT, + const std::string &CPU, + const std::string &FS, bool LittleEndian) + : AArch64GenSubtargetInfo(TT, CPU, FS), ARMProcFamily(Others), + HasFPARMv8(false), HasNEON(false), HasCrypto(false), HasCRC(false), + HasZeroCycleRegMove(false), HasZeroCycleZeroing(false), CPUString(CPU), + TargetTriple(TT), IsLittleEndian(LittleEndian) { + // Determine default and user-specified characteristics + + if (CPUString.empty()) + CPUString = "generic"; + + ParseSubtargetFeatures(CPUString, FS); +} + +/// ClassifyGlobalReference - Find the target operand flags that describe +/// how a global value should be referenced for the current subtarget. +unsigned char +AArch64Subtarget::ClassifyGlobalReference(const GlobalValue *GV, + const TargetMachine &TM) const { + + // Determine whether this is a reference to a definition or a declaration. + // Materializable GVs (in JIT lazy compilation mode) do not require an extra + // load from stub. + bool isDecl = GV->hasAvailableExternallyLinkage(); + if (GV->isDeclaration() && !GV->isMaterializable()) + isDecl = true; + + // MachO large model always goes via a GOT, simply to get a single 8-byte + // absolute relocation on all global addresses. + if (TM.getCodeModel() == CodeModel::Large && isTargetMachO()) + return AArch64II::MO_GOT; + + // The small code mode's direct accesses use ADRP, which cannot necessarily + // produce the value 0 (if the code is above 4GB). Therefore they must use the + // GOT. + if (TM.getCodeModel() == CodeModel::Small && GV->isWeakForLinker() && isDecl) + return AArch64II::MO_GOT; + + // If symbol visibility is hidden, the extra load is not needed if + // the symbol is definitely defined in the current translation unit. + + // The handling of non-hidden symbols in PIC mode is rather target-dependent: + // + On MachO, if the symbol is defined in this module the GOT can be + // skipped. + // + On ELF, the R_AARCH64_COPY relocation means that even symbols actually + // defined could end up in unexpected places. Use a GOT. + if (TM.getRelocationModel() != Reloc::Static && GV->hasDefaultVisibility()) { + if (isTargetMachO()) + return (isDecl || GV->isWeakForLinker()) ? AArch64II::MO_GOT + : AArch64II::MO_NO_FLAG; + else + // No need to go through the GOT for local symbols on ELF. + return GV->hasLocalLinkage() ? AArch64II::MO_NO_FLAG : AArch64II::MO_GOT; + } + + return AArch64II::MO_NO_FLAG; +} + +/// This function returns the name of a function which has an interface +/// like the non-standard bzero function, if such a function exists on +/// the current subtarget and it is considered prefereable over +/// memset with zero passed as the second argument. Otherwise it +/// returns null. +const char *AArch64Subtarget::getBZeroEntry() const { + // Prefer bzero on Darwin only. 
+ if(isTargetDarwin()) + return "bzero"; + + return nullptr; +} + +void AArch64Subtarget::overrideSchedPolicy(MachineSchedPolicy &Policy, + MachineInstr *begin, MachineInstr *end, + unsigned NumRegionInstrs) const { + // LNT run (at least on Cyclone) showed reasonably significant gains for + // bi-directional scheduling. 253.perlbmk. + Policy.OnlyTopDown = false; + Policy.OnlyBottomUp = false; +} + +bool AArch64Subtarget::enableEarlyIfConversion() const { + return EnableEarlyIfConvert; +} diff --git a/lib/Target/AArch64/AArch64Subtarget.h b/lib/Target/AArch64/AArch64Subtarget.h new file mode 100644 index 00000000000..590ea0580ea --- /dev/null +++ b/lib/Target/AArch64/AArch64Subtarget.h @@ -0,0 +1,110 @@ +//===--- AArch64Subtarget.h - Define Subtarget for the AArch64 -*- C++ -*--===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file declares the AArch64 specific subclass of TargetSubtarget. +// +//===----------------------------------------------------------------------===// + +#ifndef AArch64SUBTARGET_H +#define AArch64SUBTARGET_H + +#include "llvm/Target/TargetSubtargetInfo.h" +#include "AArch64RegisterInfo.h" +#include + +#define GET_SUBTARGETINFO_HEADER +#include "AArch64GenSubtargetInfo.inc" + +namespace llvm { +class GlobalValue; +class StringRef; + +class AArch64Subtarget : public AArch64GenSubtargetInfo { +protected: + enum ARMProcFamilyEnum {Others, CortexA53, CortexA57, Cyclone}; + + /// ARMProcFamily - ARM processor family: Cortex-A53, Cortex-A57, and others. + ARMProcFamilyEnum ARMProcFamily; + + bool HasFPARMv8; + bool HasNEON; + bool HasCrypto; + bool HasCRC; + + // HasZeroCycleRegMove - Has zero-cycle register mov instructions. + bool HasZeroCycleRegMove; + + // HasZeroCycleZeroing - Has zero-cycle zeroing instructions. + bool HasZeroCycleZeroing; + + /// CPUString - String name of used CPU. + std::string CPUString; + + /// TargetTriple - What processor and OS we're targeting. + Triple TargetTriple; + + /// IsLittleEndian - Is the target little endian? + bool IsLittleEndian; + +public: + /// This constructor initializes the data members to match that + /// of the specified triple. + AArch64Subtarget(const std::string &TT, const std::string &CPU, + const std::string &FS, bool LittleEndian); + + bool enableMachineScheduler() const override { return true; } + + bool hasZeroCycleRegMove() const { return HasZeroCycleRegMove; } + + bool hasZeroCycleZeroing() const { return HasZeroCycleZeroing; } + + bool hasFPARMv8() const { return HasFPARMv8; } + bool hasNEON() const { return HasNEON; } + bool hasCrypto() const { return HasCrypto; } + bool hasCRC() const { return HasCRC; } + + bool isLittleEndian() const { return IsLittleEndian; } + + bool isTargetDarwin() const { return TargetTriple.isOSDarwin(); } + + bool isTargetELF() const { return TargetTriple.isOSBinFormatELF(); } + + bool isTargetMachO() const { return TargetTriple.isOSBinFormatMachO(); } + + bool isCyclone() const { return CPUString == "cyclone"; } + + /// getMaxInlineSizeThreshold - Returns the maximum memset / memcpy size + /// that still makes it profitable to inline the call. + unsigned getMaxInlineSizeThreshold() const { return 64; } + + /// ParseSubtargetFeatures - Parses features string setting specified + /// subtarget options. Definition of function is auto generated by tblgen. 
+ void ParseSubtargetFeatures(StringRef CPU, StringRef FS); + + /// ClassifyGlobalReference - Find the target operand flags that describe + /// how a global value should be referenced for the current subtarget. + unsigned char ClassifyGlobalReference(const GlobalValue *GV, + const TargetMachine &TM) const; + + /// This function returns the name of a function which has an interface + /// like the non-standard bzero function, if such a function exists on + /// the current subtarget and it is considered prefereable over + /// memset with zero passed as the second argument. Otherwise it + /// returns null. + const char *getBZeroEntry() const; + + void overrideSchedPolicy(MachineSchedPolicy &Policy, MachineInstr *begin, + MachineInstr *end, + unsigned NumRegionInstrs) const override; + + bool enableEarlyIfConversion() const override; +}; +} // End llvm namespace + +#endif // AArch64SUBTARGET_H diff --git a/lib/Target/AArch64/AArch64TargetMachine.cpp b/lib/Target/AArch64/AArch64TargetMachine.cpp new file mode 100644 index 00000000000..0b5dd2f067e --- /dev/null +++ b/lib/Target/AArch64/AArch64TargetMachine.cpp @@ -0,0 +1,208 @@ +//===-- AArch64TargetMachine.cpp - Define TargetMachine for AArch64 -------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// +//===----------------------------------------------------------------------===// + +#include "AArch64.h" +#include "AArch64TargetMachine.h" +#include "llvm/PassManager.h" +#include "llvm/CodeGen/Passes.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Support/TargetRegistry.h" +#include "llvm/Target/TargetOptions.h" +#include "llvm/Transforms/Scalar.h" +using namespace llvm; + +static cl::opt +EnableCCMP("aarch64-ccmp", cl::desc("Enable the CCMP formation pass"), + cl::init(true), cl::Hidden); + +static cl::opt +EnableStPairSuppress("aarch64-stp-suppress", cl::desc("Suppress STP for AArch64"), + cl::init(true), cl::Hidden); + +static cl::opt +EnableAdvSIMDScalar("aarch64-simd-scalar", cl::desc("Enable use of AdvSIMD scalar" + " integer instructions"), cl::init(false), cl::Hidden); + +static cl::opt +EnablePromoteConstant("aarch64-promote-const", cl::desc("Enable the promote " + "constant pass"), cl::init(true), cl::Hidden); + +static cl::opt +EnableCollectLOH("aarch64-collect-loh", cl::desc("Enable the pass that emits the" + " linker optimization hints (LOH)"), cl::init(true), + cl::Hidden); + +static cl::opt +EnableDeadRegisterElimination("aarch64-dead-def-elimination", cl::Hidden, + cl::desc("Enable the pass that removes dead" + " definitons and replaces stores to" + " them with stores to the zero" + " register"), + cl::init(true)); + +static cl::opt +EnableLoadStoreOpt("aarch64-load-store-opt", cl::desc("Enable the load/store pair" + " optimization pass"), cl::init(true), cl::Hidden); + +extern "C" void LLVMInitializeAArch64Target() { + // Register the target. + RegisterTargetMachine X(TheAArch64leTarget); + RegisterTargetMachine Y(TheAArch64beTarget); + + RegisterTargetMachine Z(TheARM64leTarget); + RegisterTargetMachine W(TheARM64beTarget); +} + +/// TargetMachine ctor - Create an AArch64 architecture model. 
+/// +AArch64TargetMachine::AArch64TargetMachine(const Target &T, StringRef TT, + StringRef CPU, StringRef FS, + const TargetOptions &Options, + Reloc::Model RM, CodeModel::Model CM, + CodeGenOpt::Level OL, + bool LittleEndian) + : LLVMTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL), + Subtarget(TT, CPU, FS, LittleEndian), + // This nested ternary is horrible, but DL needs to be properly + // initialized + // before TLInfo is constructed. + DL(Subtarget.isTargetMachO() + ? "e-m:o-i64:64-i128:128-n32:64-S128" + : (LittleEndian ? "e-m:e-i64:64-i128:128-n32:64-S128" + : "E-m:e-i64:64-i128:128-n32:64-S128")), + InstrInfo(Subtarget), TLInfo(*this), FrameLowering(*this, Subtarget), + TSInfo(*this) { + initAsmInfo(); +} + +void AArch64leTargetMachine::anchor() { } + +AArch64leTargetMachine:: +AArch64leTargetMachine(const Target &T, StringRef TT, + StringRef CPU, StringRef FS, const TargetOptions &Options, + Reloc::Model RM, CodeModel::Model CM, + CodeGenOpt::Level OL) + : AArch64TargetMachine(T, TT, CPU, FS, Options, RM, CM, OL, true) {} + +void AArch64beTargetMachine::anchor() { } + +AArch64beTargetMachine:: +AArch64beTargetMachine(const Target &T, StringRef TT, + StringRef CPU, StringRef FS, const TargetOptions &Options, + Reloc::Model RM, CodeModel::Model CM, + CodeGenOpt::Level OL) + : AArch64TargetMachine(T, TT, CPU, FS, Options, RM, CM, OL, false) {} + +namespace { +/// AArch64 Code Generator Pass Configuration Options. +class AArch64PassConfig : public TargetPassConfig { +public: + AArch64PassConfig(AArch64TargetMachine *TM, PassManagerBase &PM) + : TargetPassConfig(TM, PM) {} + + AArch64TargetMachine &getAArch64TargetMachine() const { + return getTM(); + } + + bool addPreISel() override; + bool addInstSelector() override; + bool addILPOpts() override; + bool addPreRegAlloc() override; + bool addPostRegAlloc() override; + bool addPreSched2() override; + bool addPreEmitPass() override; +}; +} // namespace + +void AArch64TargetMachine::addAnalysisPasses(PassManagerBase &PM) { + // Add first the target-independent BasicTTI pass, then our AArch64 pass. This + // allows the AArch64 pass to delegate to the target independent layer when + // appropriate. + PM.add(createBasicTargetTransformInfoPass(this)); + PM.add(createAArch64TargetTransformInfoPass(this)); +} + +TargetPassConfig *AArch64TargetMachine::createPassConfig(PassManagerBase &PM) { + return new AArch64PassConfig(this, PM); +} + +// Pass Pipeline Configuration +bool AArch64PassConfig::addPreISel() { + // Run promote constant before global merge, so that the promoted constants + // get a chance to be merged + if (TM->getOptLevel() != CodeGenOpt::None && EnablePromoteConstant) + addPass(createAArch64PromoteConstantPass()); + if (TM->getOptLevel() != CodeGenOpt::None) + addPass(createGlobalMergePass(TM)); + if (TM->getOptLevel() != CodeGenOpt::None) + addPass(createAArch64AddressTypePromotionPass()); + + // Always expand atomic operations, we don't deal with atomicrmw or cmpxchg + // ourselves. + addPass(createAtomicExpandLoadLinkedPass(TM)); + + return false; +} + +bool AArch64PassConfig::addInstSelector() { + addPass(createAArch64ISelDag(getAArch64TargetMachine(), getOptLevel())); + + // For ELF, cleanup any local-dynamic TLS accesses (i.e. combine as many + // references to _TLS_MODULE_BASE_ as possible. 
+ if (TM->getSubtarget().isTargetELF() && + getOptLevel() != CodeGenOpt::None) + addPass(createAArch64CleanupLocalDynamicTLSPass()); + + return false; +} + +bool AArch64PassConfig::addILPOpts() { + if (EnableCCMP) + addPass(createAArch64ConditionalCompares()); + addPass(&EarlyIfConverterID); + if (EnableStPairSuppress) + addPass(createAArch64StorePairSuppressPass()); + return true; +} + +bool AArch64PassConfig::addPreRegAlloc() { + // Use AdvSIMD scalar instructions whenever profitable. + if (TM->getOptLevel() != CodeGenOpt::None && EnableAdvSIMDScalar) + addPass(createAArch64AdvSIMDScalar()); + return true; +} + +bool AArch64PassConfig::addPostRegAlloc() { + // Change dead register definitions to refer to the zero register. + if (TM->getOptLevel() != CodeGenOpt::None && EnableDeadRegisterElimination) + addPass(createAArch64DeadRegisterDefinitions()); + return true; +} + +bool AArch64PassConfig::addPreSched2() { + // Expand some pseudo instructions to allow proper scheduling. + addPass(createAArch64ExpandPseudoPass()); + // Use load/store pair instructions when possible. + if (TM->getOptLevel() != CodeGenOpt::None && EnableLoadStoreOpt) + addPass(createAArch64LoadStoreOptimizationPass()); + return true; +} + +bool AArch64PassConfig::addPreEmitPass() { + // Relax conditional branch instructions if they're otherwise out of + // range of their destination. + addPass(createAArch64BranchRelaxation()); + if (TM->getOptLevel() != CodeGenOpt::None && EnableCollectLOH && + TM->getSubtarget().isTargetMachO()) + addPass(createAArch64CollectLOHPass()); + return true; +} diff --git a/lib/Target/AArch64/AArch64TargetMachine.h b/lib/Target/AArch64/AArch64TargetMachine.h new file mode 100644 index 00000000000..079b19b23bb --- /dev/null +++ b/lib/Target/AArch64/AArch64TargetMachine.h @@ -0,0 +1,94 @@ +//==-- AArch64TargetMachine.h - Define TargetMachine for AArch64 -*- C++ -*-==// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file declares the AArch64 specific subclass of TargetMachine. 
+// +//===----------------------------------------------------------------------===// + +#ifndef AArch64TARGETMACHINE_H +#define AArch64TARGETMACHINE_H + +#include "AArch64InstrInfo.h" +#include "AArch64ISelLowering.h" +#include "AArch64Subtarget.h" +#include "AArch64FrameLowering.h" +#include "AArch64SelectionDAGInfo.h" +#include "llvm/IR/DataLayout.h" +#include "llvm/Target/TargetMachine.h" +#include "llvm/MC/MCStreamer.h" + +namespace llvm { + +class AArch64TargetMachine : public LLVMTargetMachine { +protected: + AArch64Subtarget Subtarget; + +private: + const DataLayout DL; + AArch64InstrInfo InstrInfo; + AArch64TargetLowering TLInfo; + AArch64FrameLowering FrameLowering; + AArch64SelectionDAGInfo TSInfo; + +public: + AArch64TargetMachine(const Target &T, StringRef TT, StringRef CPU, + StringRef FS, const TargetOptions &Options, + Reloc::Model RM, CodeModel::Model CM, + CodeGenOpt::Level OL, bool IsLittleEndian); + + const AArch64Subtarget *getSubtargetImpl() const override { + return &Subtarget; + } + const AArch64TargetLowering *getTargetLowering() const override { + return &TLInfo; + } + const DataLayout *getDataLayout() const override { return &DL; } + const AArch64FrameLowering *getFrameLowering() const override { + return &FrameLowering; + } + const AArch64InstrInfo *getInstrInfo() const override { return &InstrInfo; } + const AArch64RegisterInfo *getRegisterInfo() const override { + return &InstrInfo.getRegisterInfo(); + } + const AArch64SelectionDAGInfo *getSelectionDAGInfo() const override { + return &TSInfo; + } + + // Pass Pipeline Configuration + TargetPassConfig *createPassConfig(PassManagerBase &PM) override; + + /// \brief Register AArch64 analysis passes with a pass manager. + void addAnalysisPasses(PassManagerBase &PM) override; +}; + +// AArch64leTargetMachine - AArch64 little endian target machine. +// +class AArch64leTargetMachine : public AArch64TargetMachine { + virtual void anchor(); +public: + AArch64leTargetMachine(const Target &T, StringRef TT, StringRef CPU, + StringRef FS, const TargetOptions &Options, + Reloc::Model RM, CodeModel::Model CM, + CodeGenOpt::Level OL); +}; + +// AArch64beTargetMachine - AArch64 big endian target machine. +// +class AArch64beTargetMachine : public AArch64TargetMachine { + virtual void anchor(); +public: + AArch64beTargetMachine(const Target &T, StringRef TT, StringRef CPU, + StringRef FS, const TargetOptions &Options, + Reloc::Model RM, CodeModel::Model CM, + CodeGenOpt::Level OL); +}; + +} // end namespace llvm + +#endif diff --git a/lib/Target/AArch64/AArch64TargetObjectFile.cpp b/lib/Target/AArch64/AArch64TargetObjectFile.cpp new file mode 100644 index 00000000000..4069038dffe --- /dev/null +++ b/lib/Target/AArch64/AArch64TargetObjectFile.cpp @@ -0,0 +1,52 @@ +//===-- AArch64TargetObjectFile.cpp - AArch64 Object Info -----------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+// +//===----------------------------------------------------------------------===// + +#include "AArch64TargetObjectFile.h" +#include "AArch64TargetMachine.h" +#include "llvm/IR/Mangler.h" +#include "llvm/MC/MCContext.h" +#include "llvm/MC/MCExpr.h" +#include "llvm/MC/MCStreamer.h" +#include "llvm/Support/Dwarf.h" +using namespace llvm; +using namespace dwarf; + +void AArch64_ELFTargetObjectFile::Initialize(MCContext &Ctx, + const TargetMachine &TM) { + TargetLoweringObjectFileELF::Initialize(Ctx, TM); + InitializeELF(TM.Options.UseInitArray); +} + +const MCExpr *AArch64_MachoTargetObjectFile::getTTypeGlobalReference( + const GlobalValue *GV, unsigned Encoding, Mangler &Mang, + const TargetMachine &TM, MachineModuleInfo *MMI, + MCStreamer &Streamer) const { + // On Darwin, we can reference dwarf symbols with foo@GOT-., which + // is an indirect pc-relative reference. The default implementation + // won't reference using the GOT, so we need this target-specific + // version. + if (Encoding & (DW_EH_PE_indirect | DW_EH_PE_pcrel)) { + const MCSymbol *Sym = TM.getSymbol(GV, Mang); + const MCExpr *Res = + MCSymbolRefExpr::Create(Sym, MCSymbolRefExpr::VK_GOT, getContext()); + MCSymbol *PCSym = getContext().CreateTempSymbol(); + Streamer.EmitLabel(PCSym); + const MCExpr *PC = MCSymbolRefExpr::Create(PCSym, getContext()); + return MCBinaryExpr::CreateSub(Res, PC, getContext()); + } + + return TargetLoweringObjectFileMachO::getTTypeGlobalReference( + GV, Encoding, Mang, TM, MMI, Streamer); +} + +MCSymbol *AArch64_MachoTargetObjectFile::getCFIPersonalitySymbol( + const GlobalValue *GV, Mangler &Mang, const TargetMachine &TM, + MachineModuleInfo *MMI) const { + return TM.getSymbol(GV, Mang); +} diff --git a/lib/Target/AArch64/AArch64TargetObjectFile.h b/lib/Target/AArch64/AArch64TargetObjectFile.h new file mode 100644 index 00000000000..de63cb42542 --- /dev/null +++ b/lib/Target/AArch64/AArch64TargetObjectFile.h @@ -0,0 +1,40 @@ +//===-- AArch64TargetObjectFile.h - AArch64 Object Info -*- C++ ---------*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_TARGET_AArch64_TARGETOBJECTFILE_H +#define LLVM_TARGET_AArch64_TARGETOBJECTFILE_H + +#include "llvm/CodeGen/TargetLoweringObjectFileImpl.h" +#include "llvm/Target/TargetLoweringObjectFile.h" + +namespace llvm { +class AArch64TargetMachine; + +/// This implementation is used for AArch64 ELF targets (Linux in particular). +class AArch64_ELFTargetObjectFile : public TargetLoweringObjectFileELF { + void Initialize(MCContext &Ctx, const TargetMachine &TM) override; +}; + +/// AArch64_MachoTargetObjectFile - This TLOF implementation is used for Darwin. 
+class AArch64_MachoTargetObjectFile : public TargetLoweringObjectFileMachO { +public: + const MCExpr *getTTypeGlobalReference(const GlobalValue *GV, + unsigned Encoding, Mangler &Mang, + const TargetMachine &TM, + MachineModuleInfo *MMI, + MCStreamer &Streamer) const override; + + MCSymbol *getCFIPersonalitySymbol(const GlobalValue *GV, Mangler &Mang, + const TargetMachine &TM, + MachineModuleInfo *MMI) const override; +}; + +} // end namespace llvm + +#endif diff --git a/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/lib/Target/AArch64/AArch64TargetTransformInfo.cpp new file mode 100644 index 00000000000..33e482a53a4 --- /dev/null +++ b/lib/Target/AArch64/AArch64TargetTransformInfo.cpp @@ -0,0 +1,464 @@ +//===-- AArch64TargetTransformInfo.cpp - AArch64 specific TTI pass --------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +/// \file +/// This file implements a TargetTransformInfo analysis pass specific to the +/// AArch64 target machine. It uses the target's detailed information to provide +/// more precise answers to certain TTI queries, while letting the target +/// independent and default TTI implementations handle the rest. +/// +//===----------------------------------------------------------------------===// + +#include "AArch64.h" +#include "AArch64TargetMachine.h" +#include "MCTargetDesc/AArch64AddressingModes.h" +#include "llvm/Analysis/TargetTransformInfo.h" +#include "llvm/Support/Debug.h" +#include "llvm/Target/CostTable.h" +#include "llvm/Target/TargetLowering.h" +#include +using namespace llvm; + +#define DEBUG_TYPE "aarch64tti" + +// Declare the pass initialization routine locally as target-specific passes +// don't have a target-wide initialization entry point, and so we rely on the +// pass constructor initialization. +namespace llvm { +void initializeAArch64TTIPass(PassRegistry &); +} + +namespace { + +class AArch64TTI final : public ImmutablePass, public TargetTransformInfo { + const AArch64TargetMachine *TM; + const AArch64Subtarget *ST; + const AArch64TargetLowering *TLI; + + /// Estimate the overhead of scalarizing an instruction. Insert and Extract + /// are set if the result needs to be inserted and/or extracted from vectors. + unsigned getScalarizationOverhead(Type *Ty, bool Insert, bool Extract) const; + +public: + AArch64TTI() : ImmutablePass(ID), TM(nullptr), ST(nullptr), TLI(nullptr) { + llvm_unreachable("This pass cannot be directly constructed"); + } + + AArch64TTI(const AArch64TargetMachine *TM) + : ImmutablePass(ID), TM(TM), ST(TM->getSubtargetImpl()), + TLI(TM->getTargetLowering()) { + initializeAArch64TTIPass(*PassRegistry::getPassRegistry()); + } + + void initializePass() override { pushTTIStack(this); } + + void getAnalysisUsage(AnalysisUsage &AU) const override { + TargetTransformInfo::getAnalysisUsage(AU); + } + + /// Pass identification. + static char ID; + + /// Provide necessary pointer adjustments for the two base classes. 
+  void *getAdjustedAnalysisPointer(const void *ID) override {
+    if (ID == &TargetTransformInfo::ID)
+      return (TargetTransformInfo *)this;
+    return this;
+  }
+
+  /// \name Scalar TTI Implementations
+  /// @{
+  unsigned getIntImmCost(int64_t Val) const;
+  unsigned getIntImmCost(const APInt &Imm, Type *Ty) const override;
+  unsigned getIntImmCost(unsigned Opcode, unsigned Idx, const APInt &Imm,
+                         Type *Ty) const override;
+  unsigned getIntImmCost(Intrinsic::ID IID, unsigned Idx, const APInt &Imm,
+                         Type *Ty) const override;
+  PopcntSupportKind getPopcntSupport(unsigned TyWidth) const override;
+
+  /// @}
+
+  /// \name Vector TTI Implementations
+  /// @{
+
+  unsigned getNumberOfRegisters(bool Vector) const override {
+    if (Vector) {
+      if (ST->hasNEON())
+        return 32;
+      return 0;
+    }
+    return 31;
+  }
+
+  unsigned getRegisterBitWidth(bool Vector) const override {
+    if (Vector) {
+      if (ST->hasNEON())
+        return 128;
+      return 0;
+    }
+    return 64;
+  }
+
+  unsigned getMaximumUnrollFactor() const override { return 2; }
+
+  unsigned getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src) const
+      override;
+
+  unsigned getVectorInstrCost(unsigned Opcode, Type *Val, unsigned Index) const
+      override;
+
+  unsigned getArithmeticInstrCost(unsigned Opcode, Type *Ty,
+                                  OperandValueKind Opd1Info = OK_AnyValue,
+                                  OperandValueKind Opd2Info = OK_AnyValue) const
+      override;
+
+  unsigned getAddressComputationCost(Type *Ty, bool IsComplex) const override;
+
+  unsigned getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy) const
+      override;
+
+  unsigned getMemoryOpCost(unsigned Opcode, Type *Src, unsigned Alignment,
+                           unsigned AddressSpace) const override;
+  /// @}
+};
+
+} // end anonymous namespace
+
+INITIALIZE_AG_PASS(AArch64TTI, TargetTransformInfo, "aarch64tti",
+                   "AArch64 Target Transform Info", true, true, false)
+char AArch64TTI::ID = 0;
+
+ImmutablePass *
+llvm::createAArch64TargetTransformInfoPass(const AArch64TargetMachine *TM) {
+  return new AArch64TTI(TM);
+}
+
+/// \brief Calculate the cost of materializing a 64-bit value. This helper
+/// method might only calculate a fraction of a larger immediate. Therefore it
+/// is valid to return a cost of ZERO.
+unsigned AArch64TTI::getIntImmCost(int64_t Val) const {
+  // Check if the immediate can be encoded within an instruction.
+  if (Val == 0 || AArch64_AM::isLogicalImmediate(Val, 64))
+    return 0;
+
+  if (Val < 0)
+    Val = ~Val;
+
+  // Calculate how many moves we will need to materialize this constant.
+  unsigned LZ = countLeadingZeros((uint64_t)Val);
+  return (64 - LZ + 15) / 16;
+}
+
+/// \brief Calculate the cost of materializing the given constant.
+unsigned AArch64TTI::getIntImmCost(const APInt &Imm, Type *Ty) const {
+  assert(Ty->isIntegerTy());
+
+  unsigned BitSize = Ty->getPrimitiveSizeInBits();
+  if (BitSize == 0)
+    return ~0U;
+
+  // Sign-extend all constants to a multiple of 64-bit.
+  APInt ImmVal = Imm;
+  if (BitSize & 0x3f)
+    ImmVal = Imm.sext((BitSize + 63) & ~0x3fU);
+
+  // Split the constant into 64-bit chunks and calculate the cost for each
+  // chunk.
+  unsigned Cost = 0;
+  for (unsigned ShiftVal = 0; ShiftVal < BitSize; ShiftVal += 64) {
+    APInt Tmp = ImmVal.ashr(ShiftVal).sextOrTrunc(64);
+    int64_t Val = Tmp.getSExtValue();
+    Cost += getIntImmCost(Val);
+  }
+  // We need at least one instruction to materialize the constant.
+  return std::max(1U, Cost);
+}
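The chunk counting above is easier to see in isolation. The following is an
illustrative sketch only (not part of the patch, and halfwordMoveCost is a
made-up name); it ignores the logical-immediate check and the MOVN handling
of negative values that getIntImmCost performs first:

    #include <cstdint>

    // Cost of building a plain positive 64-bit constant out of 16-bit
    // pieces: one MOVZ for the top chunk plus one MOVK per remaining
    // chunk, i.e. ceil(significant_bits / 16).
    static unsigned halfwordMoveCost(uint64_t Val) {
      unsigned LZ = 0;
      for (uint64_t Bit = 1ULL << 63; Bit && !(Val & Bit); Bit >>= 1)
        ++LZ;                              // count leading zero bits
      return (64 - LZ + 15) / 16;
    }
    // halfwordMoveCost(0x1234567890abcdefULL) == 4  (MOVZ + three MOVKs)
    // halfwordMoveCost(0xffff)                == 1  (a single MOVZ)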
+unsigned AArch64TTI::getIntImmCost(unsigned Opcode, unsigned Idx,
+                                   const APInt &Imm, Type *Ty) const {
+  assert(Ty->isIntegerTy());
+
+  unsigned BitSize = Ty->getPrimitiveSizeInBits();
+  // There is no cost model for constants with a bit size of 0. Return TCC_Free
+  // here, so that constant hoisting will ignore this constant.
+  if (BitSize == 0)
+    return TCC_Free;
+
+  unsigned ImmIdx = ~0U;
+  switch (Opcode) {
+  default:
+    return TCC_Free;
+  case Instruction::GetElementPtr:
+    // Always hoist the base address of a GetElementPtr.
+    if (Idx == 0)
+      return 2 * TCC_Basic;
+    return TCC_Free;
+  case Instruction::Store:
+    ImmIdx = 0;
+    break;
+  case Instruction::Add:
+  case Instruction::Sub:
+  case Instruction::Mul:
+  case Instruction::UDiv:
+  case Instruction::SDiv:
+  case Instruction::URem:
+  case Instruction::SRem:
+  case Instruction::And:
+  case Instruction::Or:
+  case Instruction::Xor:
+  case Instruction::ICmp:
+    ImmIdx = 1;
+    break;
+  // Always return TCC_Free for the shift value of a shift instruction.
+  case Instruction::Shl:
+  case Instruction::LShr:
+  case Instruction::AShr:
+    if (Idx == 1)
+      return TCC_Free;
+    break;
+  case Instruction::Trunc:
+  case Instruction::ZExt:
+  case Instruction::SExt:
+  case Instruction::IntToPtr:
+  case Instruction::PtrToInt:
+  case Instruction::BitCast:
+  case Instruction::PHI:
+  case Instruction::Call:
+  case Instruction::Select:
+  case Instruction::Ret:
+  case Instruction::Load:
+    break;
+  }
+
+  if (Idx == ImmIdx) {
+    unsigned NumConstants = (BitSize + 63) / 64;
+    unsigned Cost = AArch64TTI::getIntImmCost(Imm, Ty);
+    return (Cost <= NumConstants * TCC_Basic)
+               ? static_cast<unsigned>(TCC_Free) : Cost;
+  }
+  return AArch64TTI::getIntImmCost(Imm, Ty);
+}
+
+unsigned AArch64TTI::getIntImmCost(Intrinsic::ID IID, unsigned Idx,
+                                   const APInt &Imm, Type *Ty) const {
+  assert(Ty->isIntegerTy());
+
+  unsigned BitSize = Ty->getPrimitiveSizeInBits();
+  // There is no cost model for constants with a bit size of 0. Return TCC_Free
+  // here, so that constant hoisting will ignore this constant.
+  if (BitSize == 0)
+    return TCC_Free;
+
+  switch (IID) {
+  default:
+    return TCC_Free;
+  case Intrinsic::sadd_with_overflow:
+  case Intrinsic::uadd_with_overflow:
+  case Intrinsic::ssub_with_overflow:
+  case Intrinsic::usub_with_overflow:
+  case Intrinsic::smul_with_overflow:
+  case Intrinsic::umul_with_overflow:
+    if (Idx == 1) {
+      unsigned NumConstants = (BitSize + 63) / 64;
+      unsigned Cost = AArch64TTI::getIntImmCost(Imm, Ty);
+      return (Cost <= NumConstants * TCC_Basic)
+                 ? static_cast<unsigned>(TCC_Free) : Cost;
+    }
+    break;
+  case Intrinsic::experimental_stackmap:
+    if ((Idx < 2) || (Imm.getBitWidth() <= 64 && isInt<64>(Imm.getSExtValue())))
+      return TCC_Free;
+    break;
+  case Intrinsic::experimental_patchpoint_void:
+  case Intrinsic::experimental_patchpoint_i64:
+    if ((Idx < 4) || (Imm.getBitWidth() <= 64 && isInt<64>(Imm.getSExtValue())))
+      return TCC_Free;
+    break;
+  }
+  return AArch64TTI::getIntImmCost(Imm, Ty);
+}
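Both overloads above apply the same rule once the immediate sits in a
hoistable operand position: report it as free when it can be rematerialized
in at most one instruction per 64-bit chunk, otherwise report its real cost
so that constant hoisting considers pulling it out. As a standalone sketch
(classifyImmCost is an illustrative name; TCC_Free and TCC_Basic take the
values defined in TargetTransformInfo):

    static unsigned classifyImmCost(unsigned MaterializeCost, unsigned BitSize) {
      const unsigned TCC_Free = 0, TCC_Basic = 1;  // generic TTI cost buckets
      unsigned NumChunks = (BitSize + 63) / 64;    // 64-bit pieces of the constant
      return MaterializeCost <= NumChunks * TCC_Basic ? TCC_Free
                                                      : MaterializeCost;
    }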
+AArch64TTI::PopcntSupportKind
+AArch64TTI::getPopcntSupport(unsigned TyWidth) const {
+  assert(isPowerOf2_32(TyWidth) && "Ty width must be power of 2");
+  if (TyWidth == 32 || TyWidth == 64)
+    return PSK_FastHardware;
+  // TODO: AArch64TargetLowering::LowerCTPOP() supports 128bit popcount.
+  return PSK_Software;
+}
+
+unsigned AArch64TTI::getCastInstrCost(unsigned Opcode, Type *Dst,
+                                      Type *Src) const {
+  int ISD = TLI->InstructionOpcodeToISD(Opcode);
+  assert(ISD && "Invalid opcode");
+
+  EVT SrcTy = TLI->getValueType(Src);
+  EVT DstTy = TLI->getValueType(Dst);
+
+  if (!SrcTy.isSimple() || !DstTy.isSimple())
+    return TargetTransformInfo::getCastInstrCost(Opcode, Dst, Src);
+
+  static const TypeConversionCostTblEntry<MVT::SimpleValueType>
+  ConversionTbl[] = {
+    // LowerVectorINT_TO_FP:
+    { ISD::SINT_TO_FP, MVT::v2f32, MVT::v2i32, 1 },
+    { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i8, 1 },
+    { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i16, 1 },
+    { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i32, 1 },
+    { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i64, 1 },
+    { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i32, 1 },
+    { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i8, 1 },
+    { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i16, 1 },
+    { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i32, 1 },
+    { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i64, 1 },
+    // LowerVectorFP_TO_INT
+    { ISD::FP_TO_SINT, MVT::v4i32, MVT::v4f32, 1 },
+    { ISD::FP_TO_SINT, MVT::v2i64, MVT::v2f64, 1 },
+    { ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f32, 1 },
+    { ISD::FP_TO_UINT, MVT::v2i64, MVT::v2f64, 1 },
+    { ISD::FP_TO_UINT, MVT::v2i32, MVT::v2f64, 1 },
+    { ISD::FP_TO_SINT, MVT::v2i32, MVT::v2f64, 1 },
+    { ISD::FP_TO_UINT, MVT::v2i64, MVT::v2f32, 4 },
+    { ISD::FP_TO_SINT, MVT::v2i64, MVT::v2f32, 4 },
+    { ISD::FP_TO_UINT, MVT::v4i16, MVT::v4f32, 4 },
+    { ISD::FP_TO_SINT, MVT::v4i16, MVT::v4f32, 4 },
+    { ISD::FP_TO_UINT, MVT::v2i64, MVT::v2f64, 4 },
+    { ISD::FP_TO_SINT, MVT::v2i64, MVT::v2f64, 4 },
+  };
+
+  int Idx = ConvertCostTableLookup(
+      ConversionTbl, array_lengthof(ConversionTbl), ISD, DstTy.getSimpleVT(),
+      SrcTy.getSimpleVT());
+  if (Idx != -1)
+    return ConversionTbl[Idx].Cost;
+
+  return TargetTransformInfo::getCastInstrCost(Opcode, Dst, Src);
+}
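The cast costs are answered entirely from the static table above, keyed on
(ISD opcode, destination type, source type); anything not listed falls back
to the generic implementation. A minimal equivalent of the lookup helper is
shown below for illustration only (the real ConvertCostTableLookup comes from
llvm/Target/CostTable.h, included at the top of this file):

    struct ConvEntry { int ISD; int Dst; int Src; unsigned Cost; };

    static int lookupConversion(const ConvEntry *Tbl, unsigned Len,
                                int ISD, int Dst, int Src) {
      for (unsigned I = 0; I != Len; ++I)
        if (Tbl[I].ISD == ISD && Tbl[I].Dst == Dst && Tbl[I].Src == Src)
          return I;              // index of the matching entry
      return -1;                 // caller falls back to the default cost
    }

Reading straight from the table: a v2i32 -> v2f64 sint_to_fp is modelled as
cost 1, while a v2f32 -> v2i64 fp_to_sint is modelled as cost 4.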
+unsigned AArch64TTI::getVectorInstrCost(unsigned Opcode, Type *Val,
+                                        unsigned Index) const {
+  assert(Val->isVectorTy() && "This must be a vector type");
+
+  if (Index != -1U) {
+    // Legalize the type.
+    std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(Val);
+
+    // This type is legalized to a scalar type.
+    if (!LT.second.isVector())
+      return 0;
+
+    // The type may be split. Normalize the index to the new type.
+    unsigned Width = LT.second.getVectorNumElements();
+    Index = Index % Width;
+
+    // The element at index zero is already inside the vector.
+    if (Index == 0)
+      return 0;
+  }
+
+  // All other insert/extracts cost this much.
+  return 2;
+}
+
+unsigned AArch64TTI::getArithmeticInstrCost(unsigned Opcode, Type *Ty,
+                                            OperandValueKind Opd1Info,
+                                            OperandValueKind Opd2Info) const {
+  // Legalize the type.
+  std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(Ty);
+
+  int ISD = TLI->InstructionOpcodeToISD(Opcode);
+
+  switch (ISD) {
+  default:
+    return TargetTransformInfo::getArithmeticInstrCost(Opcode, Ty, Opd1Info,
+                                                       Opd2Info);
+  case ISD::ADD:
+  case ISD::MUL:
+  case ISD::XOR:
+  case ISD::OR:
+  case ISD::AND:
+    // These nodes are marked as 'custom' for combining purposes only.
+    // We know that they are legal. See LowerAdd in ISelLowering.
+    return 1 * LT.first;
+  }
+}
+
+unsigned AArch64TTI::getAddressComputationCost(Type *Ty, bool IsComplex) const {
+  // Address computations in vectorized code with non-consecutive addresses
+  // will likely result in more instructions compared to scalar code where
+  // the computation can more often be merged into the index mode. The
+  // resulting extra micro-ops can significantly decrease throughput.
+  unsigned NumVectorInstToHideOverhead = 10;
+
+  if (Ty->isVectorTy() && IsComplex)
+    return NumVectorInstToHideOverhead;
+
+  // In many cases the address computation is not merged into the instruction
+  // addressing mode.
+  return 1;
+}
+
+unsigned AArch64TTI::getCmpSelInstrCost(unsigned Opcode, Type *ValTy,
+                                        Type *CondTy) const {
+
+  int ISD = TLI->InstructionOpcodeToISD(Opcode);
+  // We don't lower vector selects well that are wider than the register width.
+  if (ValTy->isVectorTy() && ISD == ISD::SELECT) {
+    // We would need this many instructions to hide the scalarization happening.
+    unsigned AmortizationCost = 20;
+    static const TypeConversionCostTblEntry<MVT::SimpleValueType>
+    VectorSelectTbl[] = {
+      { ISD::SELECT, MVT::v16i1, MVT::v16i16, 16 * AmortizationCost },
+      { ISD::SELECT, MVT::v8i1, MVT::v8i32, 8 * AmortizationCost },
+      { ISD::SELECT, MVT::v16i1, MVT::v16i32, 16 * AmortizationCost },
+      { ISD::SELECT, MVT::v4i1, MVT::v4i64, 4 * AmortizationCost },
+      { ISD::SELECT, MVT::v8i1, MVT::v8i64, 8 * AmortizationCost },
+      { ISD::SELECT, MVT::v16i1, MVT::v16i64, 16 * AmortizationCost }
+    };
+
+    EVT SelCondTy = TLI->getValueType(CondTy);
+    EVT SelValTy = TLI->getValueType(ValTy);
+    if (SelCondTy.isSimple() && SelValTy.isSimple()) {
+      int Idx =
+          ConvertCostTableLookup(VectorSelectTbl, ISD, SelCondTy.getSimpleVT(),
+                                 SelValTy.getSimpleVT());
+      if (Idx != -1)
+        return VectorSelectTbl[Idx].Cost;
+    }
+  }
+  return TargetTransformInfo::getCmpSelInstrCost(Opcode, ValTy, CondTy);
+}
+
+unsigned AArch64TTI::getMemoryOpCost(unsigned Opcode, Type *Src,
+                                     unsigned Alignment,
+                                     unsigned AddressSpace) const {
+  std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(Src);
+
+  if (Opcode == Instruction::Store && Src->isVectorTy() && Alignment != 16 &&
+      Src->getVectorElementType()->isIntegerTy(64)) {
+    // Unaligned stores are extremely inefficient. We don't split unaligned
+    // v2i64 stores because of the negative impact that has shown in practice
+    // on inlined memcpy code.
+    // We make v2i64 stores expensive so that we will only vectorize if there
+    // are 6 other instructions getting vectorized.
+    unsigned AmortizationCost = 6;
+
+    return LT.first * 2 * AmortizationCost;
+  }
+
+  if (Src->isVectorTy() && Src->getVectorElementType()->isIntegerTy(8) &&
+      Src->getVectorNumElements() < 8) {
+    // We scalarize the loads/stores because there is no v.4b register and we
+    // have to promote the elements to v.4h.
+    unsigned NumVecElts = Src->getVectorNumElements();
+    unsigned NumVectorizableInstsToAmortize = NumVecElts * 2;
+    // We generate 2 instructions per vector element.
+    return NumVectorizableInstsToAmortize * NumVecElts * 2;
+  }
+
+  return LT.first;
+}
diff --git a/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp b/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp
new file mode 100644
index 00000000000..65b77c547dc
--- /dev/null
+++ b/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp
@@ -0,0 +1,4047 @@
+//==- AArch64AsmParser.cpp - Parse AArch64 assembly to MCInst instructions -==//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+// +//===----------------------------------------------------------------------===// + +#include "MCTargetDesc/AArch64AddressingModes.h" +#include "MCTargetDesc/AArch64MCExpr.h" +#include "Utils/AArch64BaseInfo.h" +#include "llvm/MC/MCParser/MCAsmLexer.h" +#include "llvm/MC/MCParser/MCAsmParser.h" +#include "llvm/MC/MCParser/MCParsedAsmOperand.h" +#include "llvm/MC/MCContext.h" +#include "llvm/MC/MCExpr.h" +#include "llvm/MC/MCInst.h" +#include "llvm/MC/MCRegisterInfo.h" +#include "llvm/MC/MCStreamer.h" +#include "llvm/MC/MCSubtargetInfo.h" +#include "llvm/MC/MCSymbol.h" +#include "llvm/MC/MCTargetAsmParser.h" +#include "llvm/Support/SourceMgr.h" +#include "llvm/Support/TargetRegistry.h" +#include "llvm/Support/ErrorHandling.h" +#include "llvm/Support/raw_ostream.h" +#include "llvm/ADT/SmallString.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/StringSwitch.h" +#include "llvm/ADT/Twine.h" +#include +using namespace llvm; + +namespace { + +class AArch64Operand; + +class AArch64AsmParser : public MCTargetAsmParser { +public: + typedef SmallVectorImpl OperandVector; + +private: + StringRef Mnemonic; ///< Instruction mnemonic. + MCSubtargetInfo &STI; + MCAsmParser &Parser; + + MCAsmParser &getParser() const { return Parser; } + MCAsmLexer &getLexer() const { return Parser.getLexer(); } + + SMLoc getLoc() const { return Parser.getTok().getLoc(); } + + bool parseSysAlias(StringRef Name, SMLoc NameLoc, OperandVector &Operands); + AArch64CC::CondCode parseCondCodeString(StringRef Cond); + bool parseCondCode(OperandVector &Operands, bool invertCondCode); + int tryParseRegister(); + int tryMatchVectorRegister(StringRef &Kind, bool expected); + bool parseRegister(OperandVector &Operands); + bool parseSymbolicImmVal(const MCExpr *&ImmVal); + bool parseVectorList(OperandVector &Operands); + bool parseOperand(OperandVector &Operands, bool isCondCode, + bool invertCondCode); + + void Warning(SMLoc L, const Twine &Msg) { Parser.Warning(L, Msg); } + bool Error(SMLoc L, const Twine &Msg) { return Parser.Error(L, Msg); } + bool showMatchError(SMLoc Loc, unsigned ErrCode); + + bool parseDirectiveWord(unsigned Size, SMLoc L); + bool parseDirectiveTLSDescCall(SMLoc L); + + bool parseDirectiveLOH(StringRef LOH, SMLoc L); + + bool validateInstruction(MCInst &Inst, SmallVectorImpl &Loc); + bool MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode, + OperandVector &Operands, MCStreamer &Out, + unsigned &ErrorInfo, + bool MatchingInlineAsm) override; +/// @name Auto-generated Match Functions +/// { + +#define GET_ASSEMBLER_HEADER +#include "AArch64GenAsmMatcher.inc" + + /// } + + OperandMatchResultTy tryParseOptionalShiftExtend(OperandVector &Operands); + OperandMatchResultTy tryParseBarrierOperand(OperandVector &Operands); + OperandMatchResultTy tryParseMRSSystemRegister(OperandVector &Operands); + OperandMatchResultTy tryParseSysReg(OperandVector &Operands); + OperandMatchResultTy tryParseSysCROperand(OperandVector &Operands); + OperandMatchResultTy tryParsePrefetch(OperandVector &Operands); + OperandMatchResultTy tryParseAdrpLabel(OperandVector &Operands); + OperandMatchResultTy tryParseAdrLabel(OperandVector &Operands); + OperandMatchResultTy tryParseFPImm(OperandVector &Operands); + OperandMatchResultTy tryParseAddSubImm(OperandVector &Operands); + OperandMatchResultTy tryParseGPR64sp0Operand(OperandVector &Operands); + bool tryParseVectorRegister(OperandVector &Operands); + +public: + enum AArch64MatchResultTy { + Match_InvalidSuffix = FIRST_TARGET_MATCH_RESULT_TY, 
+#define GET_OPERAND_DIAGNOSTIC_TYPES +#include "AArch64GenAsmMatcher.inc" + }; + AArch64AsmParser(MCSubtargetInfo &_STI, MCAsmParser &_Parser, + const MCInstrInfo &MII, + const MCTargetOptions &Options) + : MCTargetAsmParser(), STI(_STI), Parser(_Parser) { + MCAsmParserExtension::Initialize(_Parser); + + // Initialize the set of available features. + setAvailableFeatures(ComputeAvailableFeatures(STI.getFeatureBits())); + } + + bool ParseInstruction(ParseInstructionInfo &Info, StringRef Name, + SMLoc NameLoc, OperandVector &Operands) override; + bool ParseRegister(unsigned &RegNo, SMLoc &StartLoc, SMLoc &EndLoc) override; + bool ParseDirective(AsmToken DirectiveID) override; + unsigned validateTargetOperandClass(MCParsedAsmOperand *Op, + unsigned Kind) override; + + static bool classifySymbolRef(const MCExpr *Expr, + AArch64MCExpr::VariantKind &ELFRefKind, + MCSymbolRefExpr::VariantKind &DarwinRefKind, + int64_t &Addend); +}; +} // end anonymous namespace + +namespace { + +/// AArch64Operand - Instances of this class represent a parsed AArch64 machine +/// instruction. +class AArch64Operand : public MCParsedAsmOperand { +private: + enum KindTy { + k_Immediate, + k_ShiftedImm, + k_CondCode, + k_Register, + k_VectorList, + k_VectorIndex, + k_Token, + k_SysReg, + k_SysCR, + k_Prefetch, + k_ShiftExtend, + k_FPImm, + k_Barrier + } Kind; + + SMLoc StartLoc, EndLoc; + + struct TokOp { + const char *Data; + unsigned Length; + bool IsSuffix; // Is the operand actually a suffix on the mnemonic. + }; + + struct RegOp { + unsigned RegNum; + bool isVector; + }; + + struct VectorListOp { + unsigned RegNum; + unsigned Count; + unsigned NumElements; + unsigned ElementKind; + }; + + struct VectorIndexOp { + unsigned Val; + }; + + struct ImmOp { + const MCExpr *Val; + }; + + struct ShiftedImmOp { + const MCExpr *Val; + unsigned ShiftAmount; + }; + + struct CondCodeOp { + AArch64CC::CondCode Code; + }; + + struct FPImmOp { + unsigned Val; // Encoded 8-bit representation. + }; + + struct BarrierOp { + unsigned Val; // Not the enum since not all values have names. + }; + + struct SysRegOp { + const char *Data; + unsigned Length; + uint64_t FeatureBits; // We need to pass through information about which + // core we are compiling for so that the SysReg + // Mappers can appropriately conditionalize. + }; + + struct SysCRImmOp { + unsigned Val; + }; + + struct PrefetchOp { + unsigned Val; + }; + + struct ShiftExtendOp { + AArch64_AM::ShiftExtendType Type; + unsigned Amount; + bool HasExplicitAmount; + }; + + struct ExtendOp { + unsigned Val; + }; + + union { + struct TokOp Tok; + struct RegOp Reg; + struct VectorListOp VectorList; + struct VectorIndexOp VectorIndex; + struct ImmOp Imm; + struct ShiftedImmOp ShiftedImm; + struct CondCodeOp CondCode; + struct FPImmOp FPImm; + struct BarrierOp Barrier; + struct SysRegOp SysReg; + struct SysCRImmOp SysCRImm; + struct PrefetchOp Prefetch; + struct ShiftExtendOp ShiftExtend; + }; + + // Keep the MCContext around as the MCExprs may need manipulated during + // the add<>Operands() calls. 
+ MCContext &Ctx; + + AArch64Operand(KindTy K, MCContext &_Ctx) + : MCParsedAsmOperand(), Kind(K), Ctx(_Ctx) {} + +public: + AArch64Operand(const AArch64Operand &o) : MCParsedAsmOperand(), Ctx(o.Ctx) { + Kind = o.Kind; + StartLoc = o.StartLoc; + EndLoc = o.EndLoc; + switch (Kind) { + case k_Token: + Tok = o.Tok; + break; + case k_Immediate: + Imm = o.Imm; + break; + case k_ShiftedImm: + ShiftedImm = o.ShiftedImm; + break; + case k_CondCode: + CondCode = o.CondCode; + break; + case k_FPImm: + FPImm = o.FPImm; + break; + case k_Barrier: + Barrier = o.Barrier; + break; + case k_Register: + Reg = o.Reg; + break; + case k_VectorList: + VectorList = o.VectorList; + break; + case k_VectorIndex: + VectorIndex = o.VectorIndex; + break; + case k_SysReg: + SysReg = o.SysReg; + break; + case k_SysCR: + SysCRImm = o.SysCRImm; + break; + case k_Prefetch: + Prefetch = o.Prefetch; + break; + case k_ShiftExtend: + ShiftExtend = o.ShiftExtend; + break; + } + } + + /// getStartLoc - Get the location of the first token of this operand. + SMLoc getStartLoc() const override { return StartLoc; } + /// getEndLoc - Get the location of the last token of this operand. + SMLoc getEndLoc() const override { return EndLoc; } + + StringRef getToken() const { + assert(Kind == k_Token && "Invalid access!"); + return StringRef(Tok.Data, Tok.Length); + } + + bool isTokenSuffix() const { + assert(Kind == k_Token && "Invalid access!"); + return Tok.IsSuffix; + } + + const MCExpr *getImm() const { + assert(Kind == k_Immediate && "Invalid access!"); + return Imm.Val; + } + + const MCExpr *getShiftedImmVal() const { + assert(Kind == k_ShiftedImm && "Invalid access!"); + return ShiftedImm.Val; + } + + unsigned getShiftedImmShift() const { + assert(Kind == k_ShiftedImm && "Invalid access!"); + return ShiftedImm.ShiftAmount; + } + + AArch64CC::CondCode getCondCode() const { + assert(Kind == k_CondCode && "Invalid access!"); + return CondCode.Code; + } + + unsigned getFPImm() const { + assert(Kind == k_FPImm && "Invalid access!"); + return FPImm.Val; + } + + unsigned getBarrier() const { + assert(Kind == k_Barrier && "Invalid access!"); + return Barrier.Val; + } + + unsigned getReg() const override { + assert(Kind == k_Register && "Invalid access!"); + return Reg.RegNum; + } + + unsigned getVectorListStart() const { + assert(Kind == k_VectorList && "Invalid access!"); + return VectorList.RegNum; + } + + unsigned getVectorListCount() const { + assert(Kind == k_VectorList && "Invalid access!"); + return VectorList.Count; + } + + unsigned getVectorIndex() const { + assert(Kind == k_VectorIndex && "Invalid access!"); + return VectorIndex.Val; + } + + StringRef getSysReg() const { + assert(Kind == k_SysReg && "Invalid access!"); + return StringRef(SysReg.Data, SysReg.Length); + } + + uint64_t getSysRegFeatureBits() const { + assert(Kind == k_SysReg && "Invalid access!"); + return SysReg.FeatureBits; + } + + unsigned getSysCR() const { + assert(Kind == k_SysCR && "Invalid access!"); + return SysCRImm.Val; + } + + unsigned getPrefetch() const { + assert(Kind == k_Prefetch && "Invalid access!"); + return Prefetch.Val; + } + + AArch64_AM::ShiftExtendType getShiftExtendType() const { + assert(Kind == k_ShiftExtend && "Invalid access!"); + return ShiftExtend.Type; + } + + unsigned getShiftExtendAmount() const { + assert(Kind == k_ShiftExtend && "Invalid access!"); + return ShiftExtend.Amount; + } + + bool hasShiftExtendAmount() const { + assert(Kind == k_ShiftExtend && "Invalid access!"); + return ShiftExtend.HasExplicitAmount; + } + + bool 
isImm() const override { return Kind == k_Immediate; } + bool isMem() const override { return false; } + bool isSImm9() const { + if (!isImm()) + return false; + const MCConstantExpr *MCE = dyn_cast(getImm()); + if (!MCE) + return false; + int64_t Val = MCE->getValue(); + return (Val >= -256 && Val < 256); + } + bool isSImm7s4() const { + if (!isImm()) + return false; + const MCConstantExpr *MCE = dyn_cast(getImm()); + if (!MCE) + return false; + int64_t Val = MCE->getValue(); + return (Val >= -256 && Val <= 252 && (Val & 3) == 0); + } + bool isSImm7s8() const { + if (!isImm()) + return false; + const MCConstantExpr *MCE = dyn_cast(getImm()); + if (!MCE) + return false; + int64_t Val = MCE->getValue(); + return (Val >= -512 && Val <= 504 && (Val & 7) == 0); + } + bool isSImm7s16() const { + if (!isImm()) + return false; + const MCConstantExpr *MCE = dyn_cast(getImm()); + if (!MCE) + return false; + int64_t Val = MCE->getValue(); + return (Val >= -1024 && Val <= 1008 && (Val & 15) == 0); + } + + bool isSymbolicUImm12Offset(const MCExpr *Expr, unsigned Scale) const { + AArch64MCExpr::VariantKind ELFRefKind; + MCSymbolRefExpr::VariantKind DarwinRefKind; + int64_t Addend; + if (!AArch64AsmParser::classifySymbolRef(Expr, ELFRefKind, DarwinRefKind, + Addend)) { + // If we don't understand the expression, assume the best and + // let the fixup and relocation code deal with it. + return true; + } + + if (DarwinRefKind == MCSymbolRefExpr::VK_PAGEOFF || + ELFRefKind == AArch64MCExpr::VK_LO12 || + ELFRefKind == AArch64MCExpr::VK_GOT_LO12 || + ELFRefKind == AArch64MCExpr::VK_DTPREL_LO12 || + ELFRefKind == AArch64MCExpr::VK_DTPREL_LO12_NC || + ELFRefKind == AArch64MCExpr::VK_TPREL_LO12 || + ELFRefKind == AArch64MCExpr::VK_TPREL_LO12_NC || + ELFRefKind == AArch64MCExpr::VK_GOTTPREL_LO12_NC || + ELFRefKind == AArch64MCExpr::VK_TLSDESC_LO12) { + // Note that we don't range-check the addend. It's adjusted modulo page + // size when converted, so there is no "out of range" condition when using + // @pageoff. + return Addend >= 0 && (Addend % Scale) == 0; + } else if (DarwinRefKind == MCSymbolRefExpr::VK_GOTPAGEOFF || + DarwinRefKind == MCSymbolRefExpr::VK_TLVPPAGEOFF) { + // @gotpageoff/@tlvppageoff can only be used directly, not with an addend. 
+ return Addend == 0; + } + + return false; + } + + template bool isUImm12Offset() const { + if (!isImm()) + return false; + + const MCConstantExpr *MCE = dyn_cast(getImm()); + if (!MCE) + return isSymbolicUImm12Offset(getImm(), Scale); + + int64_t Val = MCE->getValue(); + return (Val % Scale) == 0 && Val >= 0 && (Val / Scale) < 0x1000; + } + + bool isImm0_7() const { + if (!isImm()) + return false; + const MCConstantExpr *MCE = dyn_cast(getImm()); + if (!MCE) + return false; + int64_t Val = MCE->getValue(); + return (Val >= 0 && Val < 8); + } + bool isImm1_8() const { + if (!isImm()) + return false; + const MCConstantExpr *MCE = dyn_cast(getImm()); + if (!MCE) + return false; + int64_t Val = MCE->getValue(); + return (Val > 0 && Val < 9); + } + bool isImm0_15() const { + if (!isImm()) + return false; + const MCConstantExpr *MCE = dyn_cast(getImm()); + if (!MCE) + return false; + int64_t Val = MCE->getValue(); + return (Val >= 0 && Val < 16); + } + bool isImm1_16() const { + if (!isImm()) + return false; + const MCConstantExpr *MCE = dyn_cast(getImm()); + if (!MCE) + return false; + int64_t Val = MCE->getValue(); + return (Val > 0 && Val < 17); + } + bool isImm0_31() const { + if (!isImm()) + return false; + const MCConstantExpr *MCE = dyn_cast(getImm()); + if (!MCE) + return false; + int64_t Val = MCE->getValue(); + return (Val >= 0 && Val < 32); + } + bool isImm1_31() const { + if (!isImm()) + return false; + const MCConstantExpr *MCE = dyn_cast(getImm()); + if (!MCE) + return false; + int64_t Val = MCE->getValue(); + return (Val >= 1 && Val < 32); + } + bool isImm1_32() const { + if (!isImm()) + return false; + const MCConstantExpr *MCE = dyn_cast(getImm()); + if (!MCE) + return false; + int64_t Val = MCE->getValue(); + return (Val >= 1 && Val < 33); + } + bool isImm0_63() const { + if (!isImm()) + return false; + const MCConstantExpr *MCE = dyn_cast(getImm()); + if (!MCE) + return false; + int64_t Val = MCE->getValue(); + return (Val >= 0 && Val < 64); + } + bool isImm1_63() const { + if (!isImm()) + return false; + const MCConstantExpr *MCE = dyn_cast(getImm()); + if (!MCE) + return false; + int64_t Val = MCE->getValue(); + return (Val >= 1 && Val < 64); + } + bool isImm1_64() const { + if (!isImm()) + return false; + const MCConstantExpr *MCE = dyn_cast(getImm()); + if (!MCE) + return false; + int64_t Val = MCE->getValue(); + return (Val >= 1 && Val < 65); + } + bool isImm0_127() const { + if (!isImm()) + return false; + const MCConstantExpr *MCE = dyn_cast(getImm()); + if (!MCE) + return false; + int64_t Val = MCE->getValue(); + return (Val >= 0 && Val < 128); + } + bool isImm0_255() const { + if (!isImm()) + return false; + const MCConstantExpr *MCE = dyn_cast(getImm()); + if (!MCE) + return false; + int64_t Val = MCE->getValue(); + return (Val >= 0 && Val < 256); + } + bool isImm0_65535() const { + if (!isImm()) + return false; + const MCConstantExpr *MCE = dyn_cast(getImm()); + if (!MCE) + return false; + int64_t Val = MCE->getValue(); + return (Val >= 0 && Val < 65536); + } + bool isImm32_63() const { + if (!isImm()) + return false; + const MCConstantExpr *MCE = dyn_cast(getImm()); + if (!MCE) + return false; + int64_t Val = MCE->getValue(); + return (Val >= 32 && Val < 64); + } + bool isLogicalImm32() const { + if (!isImm()) + return false; + const MCConstantExpr *MCE = dyn_cast(getImm()); + if (!MCE) + return false; + return AArch64_AM::isLogicalImmediate(MCE->getValue(), 32); + } + bool isLogicalImm64() const { + if (!isImm()) + return false; + const MCConstantExpr *MCE = 
dyn_cast(getImm()); + if (!MCE) + return false; + return AArch64_AM::isLogicalImmediate(MCE->getValue(), 64); + } + bool isShiftedImm() const { return Kind == k_ShiftedImm; } + bool isAddSubImm() const { + if (!isShiftedImm() && !isImm()) + return false; + + const MCExpr *Expr; + + // An ADD/SUB shifter is either 'lsl #0' or 'lsl #12'. + if (isShiftedImm()) { + unsigned Shift = ShiftedImm.ShiftAmount; + Expr = ShiftedImm.Val; + if (Shift != 0 && Shift != 12) + return false; + } else { + Expr = getImm(); + } + + AArch64MCExpr::VariantKind ELFRefKind; + MCSymbolRefExpr::VariantKind DarwinRefKind; + int64_t Addend; + if (AArch64AsmParser::classifySymbolRef(Expr, ELFRefKind, + DarwinRefKind, Addend)) { + return DarwinRefKind == MCSymbolRefExpr::VK_PAGEOFF + || DarwinRefKind == MCSymbolRefExpr::VK_TLVPPAGEOFF + || (DarwinRefKind == MCSymbolRefExpr::VK_GOTPAGEOFF && Addend == 0) + || ELFRefKind == AArch64MCExpr::VK_LO12 + || ELFRefKind == AArch64MCExpr::VK_DTPREL_HI12 + || ELFRefKind == AArch64MCExpr::VK_DTPREL_LO12 + || ELFRefKind == AArch64MCExpr::VK_DTPREL_LO12_NC + || ELFRefKind == AArch64MCExpr::VK_TPREL_HI12 + || ELFRefKind == AArch64MCExpr::VK_TPREL_LO12 + || ELFRefKind == AArch64MCExpr::VK_TPREL_LO12_NC + || ELFRefKind == AArch64MCExpr::VK_TLSDESC_LO12; + } + + // Otherwise it should be a real immediate in range: + const MCConstantExpr *CE = cast(Expr); + return CE->getValue() >= 0 && CE->getValue() <= 0xfff; + } + bool isCondCode() const { return Kind == k_CondCode; } + bool isSIMDImmType10() const { + if (!isImm()) + return false; + const MCConstantExpr *MCE = dyn_cast(getImm()); + if (!MCE) + return false; + return AArch64_AM::isAdvSIMDModImmType10(MCE->getValue()); + } + bool isBranchTarget26() const { + if (!isImm()) + return false; + const MCConstantExpr *MCE = dyn_cast(getImm()); + if (!MCE) + return true; + int64_t Val = MCE->getValue(); + if (Val & 0x3) + return false; + return (Val >= -(0x2000000 << 2) && Val <= (0x1ffffff << 2)); + } + bool isPCRelLabel19() const { + if (!isImm()) + return false; + const MCConstantExpr *MCE = dyn_cast(getImm()); + if (!MCE) + return true; + int64_t Val = MCE->getValue(); + if (Val & 0x3) + return false; + return (Val >= -(0x40000 << 2) && Val <= (0x3ffff << 2)); + } + bool isBranchTarget14() const { + if (!isImm()) + return false; + const MCConstantExpr *MCE = dyn_cast(getImm()); + if (!MCE) + return true; + int64_t Val = MCE->getValue(); + if (Val & 0x3) + return false; + return (Val >= -(0x2000 << 2) && Val <= (0x1fff << 2)); + } + + bool + isMovWSymbol(ArrayRef AllowedModifiers) const { + if (!isImm()) + return false; + + AArch64MCExpr::VariantKind ELFRefKind; + MCSymbolRefExpr::VariantKind DarwinRefKind; + int64_t Addend; + if (!AArch64AsmParser::classifySymbolRef(getImm(), ELFRefKind, + DarwinRefKind, Addend)) { + return false; + } + if (DarwinRefKind != MCSymbolRefExpr::VK_None) + return false; + + for (unsigned i = 0; i != AllowedModifiers.size(); ++i) { + if (ELFRefKind == AllowedModifiers[i]) + return Addend == 0; + } + + return false; + } + + bool isMovZSymbolG3() const { + static AArch64MCExpr::VariantKind Variants[] = { AArch64MCExpr::VK_ABS_G3 }; + return isMovWSymbol(Variants); + } + + bool isMovZSymbolG2() const { + static AArch64MCExpr::VariantKind Variants[] = { + AArch64MCExpr::VK_ABS_G2, AArch64MCExpr::VK_ABS_G2_S, + AArch64MCExpr::VK_TPREL_G2, AArch64MCExpr::VK_DTPREL_G2}; + return isMovWSymbol(Variants); + } + + bool isMovZSymbolG1() const { + static AArch64MCExpr::VariantKind Variants[] = { + AArch64MCExpr::VK_ABS_G1, 
AArch64MCExpr::VK_ABS_G1_S, + AArch64MCExpr::VK_GOTTPREL_G1, AArch64MCExpr::VK_TPREL_G1, + AArch64MCExpr::VK_DTPREL_G1, + }; + return isMovWSymbol(Variants); + } + + bool isMovZSymbolG0() const { + static AArch64MCExpr::VariantKind Variants[] = { + AArch64MCExpr::VK_ABS_G0, AArch64MCExpr::VK_ABS_G0_S, + AArch64MCExpr::VK_TPREL_G0, AArch64MCExpr::VK_DTPREL_G0}; + return isMovWSymbol(Variants); + } + + bool isMovKSymbolG3() const { + static AArch64MCExpr::VariantKind Variants[] = { AArch64MCExpr::VK_ABS_G3 }; + return isMovWSymbol(Variants); + } + + bool isMovKSymbolG2() const { + static AArch64MCExpr::VariantKind Variants[] = { + AArch64MCExpr::VK_ABS_G2_NC}; + return isMovWSymbol(Variants); + } + + bool isMovKSymbolG1() const { + static AArch64MCExpr::VariantKind Variants[] = { + AArch64MCExpr::VK_ABS_G1_NC, AArch64MCExpr::VK_TPREL_G1_NC, + AArch64MCExpr::VK_DTPREL_G1_NC + }; + return isMovWSymbol(Variants); + } + + bool isMovKSymbolG0() const { + static AArch64MCExpr::VariantKind Variants[] = { + AArch64MCExpr::VK_ABS_G0_NC, AArch64MCExpr::VK_GOTTPREL_G0_NC, + AArch64MCExpr::VK_TPREL_G0_NC, AArch64MCExpr::VK_DTPREL_G0_NC + }; + return isMovWSymbol(Variants); + } + + template + bool isMOVZMovAlias() const { + if (!isImm()) return false; + + const MCConstantExpr *CE = dyn_cast(getImm()); + if (!CE) return false; + uint64_t Value = CE->getValue(); + + if (RegWidth == 32) + Value &= 0xffffffffULL; + + // "lsl #0" takes precedence: in practice this only affects "#0, lsl #0". + if (Value == 0 && Shift != 0) + return false; + + return (Value & ~(0xffffULL << Shift)) == 0; + } + + template + bool isMOVNMovAlias() const { + if (!isImm()) return false; + + const MCConstantExpr *CE = dyn_cast(getImm()); + if (!CE) return false; + uint64_t Value = CE->getValue(); + + // MOVZ takes precedence over MOVN. 
+ for (int MOVZShift = 0; MOVZShift <= 48; MOVZShift += 16) + if ((Value & ~(0xffffULL << MOVZShift)) == 0) + return false; + + Value = ~Value; + if (RegWidth == 32) + Value &= 0xffffffffULL; + + return (Value & ~(0xffffULL << Shift)) == 0; + } + + bool isFPImm() const { return Kind == k_FPImm; } + bool isBarrier() const { return Kind == k_Barrier; } + bool isSysReg() const { return Kind == k_SysReg; } + bool isMRSSystemRegister() const { + if (!isSysReg()) return false; + + bool IsKnownRegister; + auto Mapper = AArch64SysReg::MRSMapper(getSysRegFeatureBits()); + Mapper.fromString(getSysReg(), IsKnownRegister); + + return IsKnownRegister; + } + bool isMSRSystemRegister() const { + if (!isSysReg()) return false; + + bool IsKnownRegister; + auto Mapper = AArch64SysReg::MSRMapper(getSysRegFeatureBits()); + Mapper.fromString(getSysReg(), IsKnownRegister); + + return IsKnownRegister; + } + bool isSystemPStateField() const { + if (!isSysReg()) return false; + + bool IsKnownRegister; + AArch64PState::PStateMapper().fromString(getSysReg(), IsKnownRegister); + + return IsKnownRegister; + } + bool isReg() const override { return Kind == k_Register && !Reg.isVector; } + bool isVectorReg() const { return Kind == k_Register && Reg.isVector; } + bool isVectorRegLo() const { + return Kind == k_Register && Reg.isVector && + AArch64MCRegisterClasses[AArch64::FPR128_loRegClassID].contains( + Reg.RegNum); + } + bool isGPR32as64() const { + return Kind == k_Register && !Reg.isVector && + AArch64MCRegisterClasses[AArch64::GPR64RegClassID].contains(Reg.RegNum); + } + + bool isGPR64sp0() const { + return Kind == k_Register && !Reg.isVector && + AArch64MCRegisterClasses[AArch64::GPR64spRegClassID].contains(Reg.RegNum); + } + + /// Is this a vector list with the type implicit (presumably attached to the + /// instruction itself)? 
+ template bool isImplicitlyTypedVectorList() const { + return Kind == k_VectorList && VectorList.Count == NumRegs && + !VectorList.ElementKind; + } + + template + bool isTypedVectorList() const { + if (Kind != k_VectorList) + return false; + if (VectorList.Count != NumRegs) + return false; + if (VectorList.ElementKind != ElementKind) + return false; + return VectorList.NumElements == NumElements; + } + + bool isVectorIndex1() const { + return Kind == k_VectorIndex && VectorIndex.Val == 1; + } + bool isVectorIndexB() const { + return Kind == k_VectorIndex && VectorIndex.Val < 16; + } + bool isVectorIndexH() const { + return Kind == k_VectorIndex && VectorIndex.Val < 8; + } + bool isVectorIndexS() const { + return Kind == k_VectorIndex && VectorIndex.Val < 4; + } + bool isVectorIndexD() const { + return Kind == k_VectorIndex && VectorIndex.Val < 2; + } + bool isToken() const override { return Kind == k_Token; } + bool isTokenEqual(StringRef Str) const { + return Kind == k_Token && getToken() == Str; + } + bool isSysCR() const { return Kind == k_SysCR; } + bool isPrefetch() const { return Kind == k_Prefetch; } + bool isShiftExtend() const { return Kind == k_ShiftExtend; } + bool isShifter() const { + if (!isShiftExtend()) + return false; + + AArch64_AM::ShiftExtendType ST = getShiftExtendType(); + return (ST == AArch64_AM::LSL || ST == AArch64_AM::LSR || + ST == AArch64_AM::ASR || ST == AArch64_AM::ROR || + ST == AArch64_AM::MSL); + } + bool isExtend() const { + if (!isShiftExtend()) + return false; + + AArch64_AM::ShiftExtendType ET = getShiftExtendType(); + return (ET == AArch64_AM::UXTB || ET == AArch64_AM::SXTB || + ET == AArch64_AM::UXTH || ET == AArch64_AM::SXTH || + ET == AArch64_AM::UXTW || ET == AArch64_AM::SXTW || + ET == AArch64_AM::UXTX || ET == AArch64_AM::SXTX || + ET == AArch64_AM::LSL) && + getShiftExtendAmount() <= 4; + } + + bool isExtend64() const { + if (!isExtend()) + return false; + // UXTX and SXTX require a 64-bit source register (the ExtendLSL64 class). + AArch64_AM::ShiftExtendType ET = getShiftExtendType(); + return ET != AArch64_AM::UXTX && ET != AArch64_AM::SXTX; + } + bool isExtendLSL64() const { + if (!isExtend()) + return false; + AArch64_AM::ShiftExtendType ET = getShiftExtendType(); + return (ET == AArch64_AM::UXTX || ET == AArch64_AM::SXTX || + ET == AArch64_AM::LSL) && + getShiftExtendAmount() <= 4; + } + + template bool isMemXExtend() const { + if (!isExtend()) + return false; + AArch64_AM::ShiftExtendType ET = getShiftExtendType(); + return (ET == AArch64_AM::LSL || ET == AArch64_AM::SXTX) && + (getShiftExtendAmount() == Log2_32(Width / 8) || + getShiftExtendAmount() == 0); + } + + template bool isMemWExtend() const { + if (!isExtend()) + return false; + AArch64_AM::ShiftExtendType ET = getShiftExtendType(); + return (ET == AArch64_AM::UXTW || ET == AArch64_AM::SXTW) && + (getShiftExtendAmount() == Log2_32(Width / 8) || + getShiftExtendAmount() == 0); + } + + template + bool isArithmeticShifter() const { + if (!isShifter()) + return false; + + // An arithmetic shifter is LSL, LSR, or ASR. + AArch64_AM::ShiftExtendType ST = getShiftExtendType(); + return (ST == AArch64_AM::LSL || ST == AArch64_AM::LSR || + ST == AArch64_AM::ASR) && getShiftExtendAmount() < width; + } + + template + bool isLogicalShifter() const { + if (!isShifter()) + return false; + + // A logical shifter is LSL, LSR, ASR or ROR. 
+ AArch64_AM::ShiftExtendType ST = getShiftExtendType(); + return (ST == AArch64_AM::LSL || ST == AArch64_AM::LSR || + ST == AArch64_AM::ASR || ST == AArch64_AM::ROR) && + getShiftExtendAmount() < width; + } + + bool isMovImm32Shifter() const { + if (!isShifter()) + return false; + + // A MOVi shifter is LSL of 0, 16, 32, or 48. + AArch64_AM::ShiftExtendType ST = getShiftExtendType(); + if (ST != AArch64_AM::LSL) + return false; + uint64_t Val = getShiftExtendAmount(); + return (Val == 0 || Val == 16); + } + + bool isMovImm64Shifter() const { + if (!isShifter()) + return false; + + // A MOVi shifter is LSL of 0 or 16. + AArch64_AM::ShiftExtendType ST = getShiftExtendType(); + if (ST != AArch64_AM::LSL) + return false; + uint64_t Val = getShiftExtendAmount(); + return (Val == 0 || Val == 16 || Val == 32 || Val == 48); + } + + bool isLogicalVecShifter() const { + if (!isShifter()) + return false; + + // A logical vector shifter is a left shift by 0, 8, 16, or 24. + unsigned Shift = getShiftExtendAmount(); + return getShiftExtendType() == AArch64_AM::LSL && + (Shift == 0 || Shift == 8 || Shift == 16 || Shift == 24); + } + + bool isLogicalVecHalfWordShifter() const { + if (!isLogicalVecShifter()) + return false; + + // A logical vector shifter is a left shift by 0 or 8. + unsigned Shift = getShiftExtendAmount(); + return getShiftExtendType() == AArch64_AM::LSL && + (Shift == 0 || Shift == 8); + } + + bool isMoveVecShifter() const { + if (!isShiftExtend()) + return false; + + // A logical vector shifter is a left shift by 8 or 16. + unsigned Shift = getShiftExtendAmount(); + return getShiftExtendType() == AArch64_AM::MSL && + (Shift == 8 || Shift == 16); + } + + // Fallback unscaled operands are for aliases of LDR/STR that fall back + // to LDUR/STUR when the offset is not legal for the former but is for + // the latter. As such, in addition to checking for being a legal unscaled + // address, also check that it is not a legal scaled address. This avoids + // ambiguity in the matcher. + template + bool isSImm9OffsetFB() const { + return isSImm9() && !isUImm12Offset(); + } + + bool isAdrpLabel() const { + // Validation was handled during parsing, so we just sanity check that + // something didn't go haywire. + if (!isImm()) + return false; + + if (const MCConstantExpr *CE = dyn_cast(Imm.Val)) { + int64_t Val = CE->getValue(); + int64_t Min = - (4096 * (1LL << (21 - 1))); + int64_t Max = 4096 * ((1LL << (21 - 1)) - 1); + return (Val % 4096) == 0 && Val >= Min && Val <= Max; + } + + return true; + } + + bool isAdrLabel() const { + // Validation was handled during parsing, so we just sanity check that + // something didn't go haywire. + if (!isImm()) + return false; + + if (const MCConstantExpr *CE = dyn_cast(Imm.Val)) { + int64_t Val = CE->getValue(); + int64_t Min = - (1LL << (21 - 1)); + int64_t Max = ((1LL << (21 - 1)) - 1); + return Val >= Min && Val <= Max; + } + + return true; + } + + void addExpr(MCInst &Inst, const MCExpr *Expr) const { + // Add as immediates when possible. Null MCExpr = 0. 
+ if (!Expr) + Inst.addOperand(MCOperand::CreateImm(0)); + else if (const MCConstantExpr *CE = dyn_cast(Expr)) + Inst.addOperand(MCOperand::CreateImm(CE->getValue())); + else + Inst.addOperand(MCOperand::CreateExpr(Expr)); + } + + void addRegOperands(MCInst &Inst, unsigned N) const { + assert(N == 1 && "Invalid number of operands!"); + Inst.addOperand(MCOperand::CreateReg(getReg())); + } + + void addGPR32as64Operands(MCInst &Inst, unsigned N) const { + assert(N == 1 && "Invalid number of operands!"); + assert( + AArch64MCRegisterClasses[AArch64::GPR64RegClassID].contains(getReg())); + + const MCRegisterInfo *RI = Ctx.getRegisterInfo(); + uint32_t Reg = RI->getRegClass(AArch64::GPR32RegClassID).getRegister( + RI->getEncodingValue(getReg())); + + Inst.addOperand(MCOperand::CreateReg(Reg)); + } + + void addVectorReg64Operands(MCInst &Inst, unsigned N) const { + assert(N == 1 && "Invalid number of operands!"); + assert( + AArch64MCRegisterClasses[AArch64::FPR128RegClassID].contains(getReg())); + Inst.addOperand(MCOperand::CreateReg(AArch64::D0 + getReg() - AArch64::Q0)); + } + + void addVectorReg128Operands(MCInst &Inst, unsigned N) const { + assert(N == 1 && "Invalid number of operands!"); + assert( + AArch64MCRegisterClasses[AArch64::FPR128RegClassID].contains(getReg())); + Inst.addOperand(MCOperand::CreateReg(getReg())); + } + + void addVectorRegLoOperands(MCInst &Inst, unsigned N) const { + assert(N == 1 && "Invalid number of operands!"); + Inst.addOperand(MCOperand::CreateReg(getReg())); + } + + template + void addVectorList64Operands(MCInst &Inst, unsigned N) const { + assert(N == 1 && "Invalid number of operands!"); + static unsigned FirstRegs[] = { AArch64::D0, AArch64::D0_D1, + AArch64::D0_D1_D2, AArch64::D0_D1_D2_D3 }; + unsigned FirstReg = FirstRegs[NumRegs - 1]; + + Inst.addOperand( + MCOperand::CreateReg(FirstReg + getVectorListStart() - AArch64::Q0)); + } + + template + void addVectorList128Operands(MCInst &Inst, unsigned N) const { + assert(N == 1 && "Invalid number of operands!"); + static unsigned FirstRegs[] = { AArch64::Q0, AArch64::Q0_Q1, + AArch64::Q0_Q1_Q2, AArch64::Q0_Q1_Q2_Q3 }; + unsigned FirstReg = FirstRegs[NumRegs - 1]; + + Inst.addOperand( + MCOperand::CreateReg(FirstReg + getVectorListStart() - AArch64::Q0)); + } + + void addVectorIndex1Operands(MCInst &Inst, unsigned N) const { + assert(N == 1 && "Invalid number of operands!"); + Inst.addOperand(MCOperand::CreateImm(getVectorIndex())); + } + + void addVectorIndexBOperands(MCInst &Inst, unsigned N) const { + assert(N == 1 && "Invalid number of operands!"); + Inst.addOperand(MCOperand::CreateImm(getVectorIndex())); + } + + void addVectorIndexHOperands(MCInst &Inst, unsigned N) const { + assert(N == 1 && "Invalid number of operands!"); + Inst.addOperand(MCOperand::CreateImm(getVectorIndex())); + } + + void addVectorIndexSOperands(MCInst &Inst, unsigned N) const { + assert(N == 1 && "Invalid number of operands!"); + Inst.addOperand(MCOperand::CreateImm(getVectorIndex())); + } + + void addVectorIndexDOperands(MCInst &Inst, unsigned N) const { + assert(N == 1 && "Invalid number of operands!"); + Inst.addOperand(MCOperand::CreateImm(getVectorIndex())); + } + + void addImmOperands(MCInst &Inst, unsigned N) const { + assert(N == 1 && "Invalid number of operands!"); + // If this is a pageoff symrefexpr with an addend, adjust the addend + // to be only the page-offset portion. Otherwise, just add the expr + // as-is. 
+ addExpr(Inst, getImm()); + } + + void addAddSubImmOperands(MCInst &Inst, unsigned N) const { + assert(N == 2 && "Invalid number of operands!"); + if (isShiftedImm()) { + addExpr(Inst, getShiftedImmVal()); + Inst.addOperand(MCOperand::CreateImm(getShiftedImmShift())); + } else { + addExpr(Inst, getImm()); + Inst.addOperand(MCOperand::CreateImm(0)); + } + } + + void addCondCodeOperands(MCInst &Inst, unsigned N) const { + assert(N == 1 && "Invalid number of operands!"); + Inst.addOperand(MCOperand::CreateImm(getCondCode())); + } + + void addAdrpLabelOperands(MCInst &Inst, unsigned N) const { + assert(N == 1 && "Invalid number of operands!"); + const MCConstantExpr *MCE = dyn_cast(getImm()); + if (!MCE) + addExpr(Inst, getImm()); + else + Inst.addOperand(MCOperand::CreateImm(MCE->getValue() >> 12)); + } + + void addAdrLabelOperands(MCInst &Inst, unsigned N) const { + addImmOperands(Inst, N); + } + + template + void addUImm12OffsetOperands(MCInst &Inst, unsigned N) const { + assert(N == 1 && "Invalid number of operands!"); + const MCConstantExpr *MCE = dyn_cast(getImm()); + + if (!MCE) { + Inst.addOperand(MCOperand::CreateExpr(getImm())); + return; + } + Inst.addOperand(MCOperand::CreateImm(MCE->getValue() / Scale)); + } + + void addSImm9Operands(MCInst &Inst, unsigned N) const { + assert(N == 1 && "Invalid number of operands!"); + const MCConstantExpr *MCE = dyn_cast(getImm()); + assert(MCE && "Invalid constant immediate operand!"); + Inst.addOperand(MCOperand::CreateImm(MCE->getValue())); + } + + void addSImm7s4Operands(MCInst &Inst, unsigned N) const { + assert(N == 1 && "Invalid number of operands!"); + const MCConstantExpr *MCE = dyn_cast(getImm()); + assert(MCE && "Invalid constant immediate operand!"); + Inst.addOperand(MCOperand::CreateImm(MCE->getValue() / 4)); + } + + void addSImm7s8Operands(MCInst &Inst, unsigned N) const { + assert(N == 1 && "Invalid number of operands!"); + const MCConstantExpr *MCE = dyn_cast(getImm()); + assert(MCE && "Invalid constant immediate operand!"); + Inst.addOperand(MCOperand::CreateImm(MCE->getValue() / 8)); + } + + void addSImm7s16Operands(MCInst &Inst, unsigned N) const { + assert(N == 1 && "Invalid number of operands!"); + const MCConstantExpr *MCE = dyn_cast(getImm()); + assert(MCE && "Invalid constant immediate operand!"); + Inst.addOperand(MCOperand::CreateImm(MCE->getValue() / 16)); + } + + void addImm0_7Operands(MCInst &Inst, unsigned N) const { + assert(N == 1 && "Invalid number of operands!"); + const MCConstantExpr *MCE = dyn_cast(getImm()); + assert(MCE && "Invalid constant immediate operand!"); + Inst.addOperand(MCOperand::CreateImm(MCE->getValue())); + } + + void addImm1_8Operands(MCInst &Inst, unsigned N) const { + assert(N == 1 && "Invalid number of operands!"); + const MCConstantExpr *MCE = dyn_cast(getImm()); + assert(MCE && "Invalid constant immediate operand!"); + Inst.addOperand(MCOperand::CreateImm(MCE->getValue())); + } + + void addImm0_15Operands(MCInst &Inst, unsigned N) const { + assert(N == 1 && "Invalid number of operands!"); + const MCConstantExpr *MCE = dyn_cast(getImm()); + assert(MCE && "Invalid constant immediate operand!"); + Inst.addOperand(MCOperand::CreateImm(MCE->getValue())); + } + + void addImm1_16Operands(MCInst &Inst, unsigned N) const { + assert(N == 1 && "Invalid number of operands!"); + const MCConstantExpr *MCE = dyn_cast(getImm()); + assert(MCE && "Invalid constant immediate operand!"); + Inst.addOperand(MCOperand::CreateImm(MCE->getValue())); + } + + void addImm0_31Operands(MCInst &Inst, unsigned N) 
const { + assert(N == 1 && "Invalid number of operands!"); + const MCConstantExpr *MCE = dyn_cast(getImm()); + assert(MCE && "Invalid constant immediate operand!"); + Inst.addOperand(MCOperand::CreateImm(MCE->getValue())); + } + + void addImm1_31Operands(MCInst &Inst, unsigned N) const { + assert(N == 1 && "Invalid number of operands!"); + const MCConstantExpr *MCE = dyn_cast(getImm()); + assert(MCE && "Invalid constant immediate operand!"); + Inst.addOperand(MCOperand::CreateImm(MCE->getValue())); + } + + void addImm1_32Operands(MCInst &Inst, unsigned N) const { + assert(N == 1 && "Invalid number of operands!"); + const MCConstantExpr *MCE = dyn_cast(getImm()); + assert(MCE && "Invalid constant immediate operand!"); + Inst.addOperand(MCOperand::CreateImm(MCE->getValue())); + } + + void addImm0_63Operands(MCInst &Inst, unsigned N) const { + assert(N == 1 && "Invalid number of operands!"); + const MCConstantExpr *MCE = dyn_cast(getImm()); + assert(MCE && "Invalid constant immediate operand!"); + Inst.addOperand(MCOperand::CreateImm(MCE->getValue())); + } + + void addImm1_63Operands(MCInst &Inst, unsigned N) const { + assert(N == 1 && "Invalid number of operands!"); + const MCConstantExpr *MCE = dyn_cast(getImm()); + assert(MCE && "Invalid constant immediate operand!"); + Inst.addOperand(MCOperand::CreateImm(MCE->getValue())); + } + + void addImm1_64Operands(MCInst &Inst, unsigned N) const { + assert(N == 1 && "Invalid number of operands!"); + const MCConstantExpr *MCE = dyn_cast(getImm()); + assert(MCE && "Invalid constant immediate operand!"); + Inst.addOperand(MCOperand::CreateImm(MCE->getValue())); + } + + void addImm0_127Operands(MCInst &Inst, unsigned N) const { + assert(N == 1 && "Invalid number of operands!"); + const MCConstantExpr *MCE = dyn_cast(getImm()); + assert(MCE && "Invalid constant immediate operand!"); + Inst.addOperand(MCOperand::CreateImm(MCE->getValue())); + } + + void addImm0_255Operands(MCInst &Inst, unsigned N) const { + assert(N == 1 && "Invalid number of operands!"); + const MCConstantExpr *MCE = dyn_cast(getImm()); + assert(MCE && "Invalid constant immediate operand!"); + Inst.addOperand(MCOperand::CreateImm(MCE->getValue())); + } + + void addImm0_65535Operands(MCInst &Inst, unsigned N) const { + assert(N == 1 && "Invalid number of operands!"); + const MCConstantExpr *MCE = dyn_cast(getImm()); + assert(MCE && "Invalid constant immediate operand!"); + Inst.addOperand(MCOperand::CreateImm(MCE->getValue())); + } + + void addImm32_63Operands(MCInst &Inst, unsigned N) const { + assert(N == 1 && "Invalid number of operands!"); + const MCConstantExpr *MCE = dyn_cast(getImm()); + assert(MCE && "Invalid constant immediate operand!"); + Inst.addOperand(MCOperand::CreateImm(MCE->getValue())); + } + + void addLogicalImm32Operands(MCInst &Inst, unsigned N) const { + assert(N == 1 && "Invalid number of operands!"); + const MCConstantExpr *MCE = dyn_cast(getImm()); + assert(MCE && "Invalid logical immediate operand!"); + uint64_t encoding = AArch64_AM::encodeLogicalImmediate(MCE->getValue(), 32); + Inst.addOperand(MCOperand::CreateImm(encoding)); + } + + void addLogicalImm64Operands(MCInst &Inst, unsigned N) const { + assert(N == 1 && "Invalid number of operands!"); + const MCConstantExpr *MCE = dyn_cast(getImm()); + assert(MCE && "Invalid logical immediate operand!"); + uint64_t encoding = AArch64_AM::encodeLogicalImmediate(MCE->getValue(), 64); + Inst.addOperand(MCOperand::CreateImm(encoding)); + } + + void addSIMDImmType10Operands(MCInst &Inst, unsigned N) const { + assert(N 
== 1 && "Invalid number of operands!"); + const MCConstantExpr *MCE = dyn_cast(getImm()); + assert(MCE && "Invalid immediate operand!"); + uint64_t encoding = AArch64_AM::encodeAdvSIMDModImmType10(MCE->getValue()); + Inst.addOperand(MCOperand::CreateImm(encoding)); + } + + void addBranchTarget26Operands(MCInst &Inst, unsigned N) const { + // Branch operands don't encode the low bits, so shift them off + // here. If it's a label, however, just put it on directly as there's + // not enough information now to do anything. + assert(N == 1 && "Invalid number of operands!"); + const MCConstantExpr *MCE = dyn_cast(getImm()); + if (!MCE) { + addExpr(Inst, getImm()); + return; + } + assert(MCE && "Invalid constant immediate operand!"); + Inst.addOperand(MCOperand::CreateImm(MCE->getValue() >> 2)); + } + + void addPCRelLabel19Operands(MCInst &Inst, unsigned N) const { + // Branch operands don't encode the low bits, so shift them off + // here. If it's a label, however, just put it on directly as there's + // not enough information now to do anything. + assert(N == 1 && "Invalid number of operands!"); + const MCConstantExpr *MCE = dyn_cast(getImm()); + if (!MCE) { + addExpr(Inst, getImm()); + return; + } + assert(MCE && "Invalid constant immediate operand!"); + Inst.addOperand(MCOperand::CreateImm(MCE->getValue() >> 2)); + } + + void addBranchTarget14Operands(MCInst &Inst, unsigned N) const { + // Branch operands don't encode the low bits, so shift them off + // here. If it's a label, however, just put it on directly as there's + // not enough information now to do anything. + assert(N == 1 && "Invalid number of operands!"); + const MCConstantExpr *MCE = dyn_cast(getImm()); + if (!MCE) { + addExpr(Inst, getImm()); + return; + } + assert(MCE && "Invalid constant immediate operand!"); + Inst.addOperand(MCOperand::CreateImm(MCE->getValue() >> 2)); + } + + void addFPImmOperands(MCInst &Inst, unsigned N) const { + assert(N == 1 && "Invalid number of operands!"); + Inst.addOperand(MCOperand::CreateImm(getFPImm())); + } + + void addBarrierOperands(MCInst &Inst, unsigned N) const { + assert(N == 1 && "Invalid number of operands!"); + Inst.addOperand(MCOperand::CreateImm(getBarrier())); + } + + void addMRSSystemRegisterOperands(MCInst &Inst, unsigned N) const { + assert(N == 1 && "Invalid number of operands!"); + + bool Valid; + auto Mapper = AArch64SysReg::MRSMapper(getSysRegFeatureBits()); + uint32_t Bits = Mapper.fromString(getSysReg(), Valid); + + Inst.addOperand(MCOperand::CreateImm(Bits)); + } + + void addMSRSystemRegisterOperands(MCInst &Inst, unsigned N) const { + assert(N == 1 && "Invalid number of operands!"); + + bool Valid; + auto Mapper = AArch64SysReg::MSRMapper(getSysRegFeatureBits()); + uint32_t Bits = Mapper.fromString(getSysReg(), Valid); + + Inst.addOperand(MCOperand::CreateImm(Bits)); + } + + void addSystemPStateFieldOperands(MCInst &Inst, unsigned N) const { + assert(N == 1 && "Invalid number of operands!"); + + bool Valid; + uint32_t Bits = + AArch64PState::PStateMapper().fromString(getSysReg(), Valid); + + Inst.addOperand(MCOperand::CreateImm(Bits)); + } + + void addSysCROperands(MCInst &Inst, unsigned N) const { + assert(N == 1 && "Invalid number of operands!"); + Inst.addOperand(MCOperand::CreateImm(getSysCR())); + } + + void addPrefetchOperands(MCInst &Inst, unsigned N) const { + assert(N == 1 && "Invalid number of operands!"); + Inst.addOperand(MCOperand::CreateImm(getPrefetch())); + } + + void addShifterOperands(MCInst &Inst, unsigned N) const { + assert(N == 1 && "Invalid number 
of operands!"); + unsigned Imm = + AArch64_AM::getShifterImm(getShiftExtendType(), getShiftExtendAmount()); + Inst.addOperand(MCOperand::CreateImm(Imm)); + } + + void addExtendOperands(MCInst &Inst, unsigned N) const { + assert(N == 1 && "Invalid number of operands!"); + AArch64_AM::ShiftExtendType ET = getShiftExtendType(); + if (ET == AArch64_AM::LSL) ET = AArch64_AM::UXTW; + unsigned Imm = AArch64_AM::getArithExtendImm(ET, getShiftExtendAmount()); + Inst.addOperand(MCOperand::CreateImm(Imm)); + } + + void addExtend64Operands(MCInst &Inst, unsigned N) const { + assert(N == 1 && "Invalid number of operands!"); + AArch64_AM::ShiftExtendType ET = getShiftExtendType(); + if (ET == AArch64_AM::LSL) ET = AArch64_AM::UXTX; + unsigned Imm = AArch64_AM::getArithExtendImm(ET, getShiftExtendAmount()); + Inst.addOperand(MCOperand::CreateImm(Imm)); + } + + void addMemExtendOperands(MCInst &Inst, unsigned N) const { + assert(N == 2 && "Invalid number of operands!"); + AArch64_AM::ShiftExtendType ET = getShiftExtendType(); + bool IsSigned = ET == AArch64_AM::SXTW || ET == AArch64_AM::SXTX; + Inst.addOperand(MCOperand::CreateImm(IsSigned)); + Inst.addOperand(MCOperand::CreateImm(getShiftExtendAmount() != 0)); + } + + // For 8-bit load/store instructions with a register offset, both the + // "DoShift" and "NoShift" variants have a shift of 0. Because of this, + // they're disambiguated by whether the shift was explicit or implicit rather + // than its size. + void addMemExtend8Operands(MCInst &Inst, unsigned N) const { + assert(N == 2 && "Invalid number of operands!"); + AArch64_AM::ShiftExtendType ET = getShiftExtendType(); + bool IsSigned = ET == AArch64_AM::SXTW || ET == AArch64_AM::SXTX; + Inst.addOperand(MCOperand::CreateImm(IsSigned)); + Inst.addOperand(MCOperand::CreateImm(hasShiftExtendAmount())); + } + + template + void addMOVZMovAliasOperands(MCInst &Inst, unsigned N) const { + assert(N == 1 && "Invalid number of operands!"); + + const MCConstantExpr *CE = cast(getImm()); + uint64_t Value = CE->getValue(); + Inst.addOperand(MCOperand::CreateImm((Value >> Shift) & 0xffff)); + } + + template + void addMOVNMovAliasOperands(MCInst &Inst, unsigned N) const { + assert(N == 1 && "Invalid number of operands!"); + + const MCConstantExpr *CE = cast(getImm()); + uint64_t Value = CE->getValue(); + Inst.addOperand(MCOperand::CreateImm((~Value >> Shift) & 0xffff)); + } + + void print(raw_ostream &OS) const override; + + static AArch64Operand *CreateToken(StringRef Str, bool IsSuffix, SMLoc S, + MCContext &Ctx) { + AArch64Operand *Op = new AArch64Operand(k_Token, Ctx); + Op->Tok.Data = Str.data(); + Op->Tok.Length = Str.size(); + Op->Tok.IsSuffix = IsSuffix; + Op->StartLoc = S; + Op->EndLoc = S; + return Op; + } + + static AArch64Operand *CreateReg(unsigned RegNum, bool isVector, SMLoc S, + SMLoc E, MCContext &Ctx) { + AArch64Operand *Op = new AArch64Operand(k_Register, Ctx); + Op->Reg.RegNum = RegNum; + Op->Reg.isVector = isVector; + Op->StartLoc = S; + Op->EndLoc = E; + return Op; + } + + static AArch64Operand *CreateVectorList(unsigned RegNum, unsigned Count, + unsigned NumElements, char ElementKind, + SMLoc S, SMLoc E, MCContext &Ctx) { + AArch64Operand *Op = new AArch64Operand(k_VectorList, Ctx); + Op->VectorList.RegNum = RegNum; + Op->VectorList.Count = Count; + Op->VectorList.NumElements = NumElements; + Op->VectorList.ElementKind = ElementKind; + Op->StartLoc = S; + Op->EndLoc = E; + return Op; + } + + static AArch64Operand *CreateVectorIndex(unsigned Idx, SMLoc S, SMLoc E, + MCContext &Ctx) { + 
AArch64Operand *Op = new AArch64Operand(k_VectorIndex, Ctx); + Op->VectorIndex.Val = Idx; + Op->StartLoc = S; + Op->EndLoc = E; + return Op; + } + + static AArch64Operand *CreateImm(const MCExpr *Val, SMLoc S, SMLoc E, + MCContext &Ctx) { + AArch64Operand *Op = new AArch64Operand(k_Immediate, Ctx); + Op->Imm.Val = Val; + Op->StartLoc = S; + Op->EndLoc = E; + return Op; + } + + static AArch64Operand *CreateShiftedImm(const MCExpr *Val, + unsigned ShiftAmount, SMLoc S, + SMLoc E, MCContext &Ctx) { + AArch64Operand *Op = new AArch64Operand(k_ShiftedImm, Ctx); + Op->ShiftedImm .Val = Val; + Op->ShiftedImm.ShiftAmount = ShiftAmount; + Op->StartLoc = S; + Op->EndLoc = E; + return Op; + } + + static AArch64Operand *CreateCondCode(AArch64CC::CondCode Code, SMLoc S, + SMLoc E, MCContext &Ctx) { + AArch64Operand *Op = new AArch64Operand(k_CondCode, Ctx); + Op->CondCode.Code = Code; + Op->StartLoc = S; + Op->EndLoc = E; + return Op; + } + + static AArch64Operand *CreateFPImm(unsigned Val, SMLoc S, MCContext &Ctx) { + AArch64Operand *Op = new AArch64Operand(k_FPImm, Ctx); + Op->FPImm.Val = Val; + Op->StartLoc = S; + Op->EndLoc = S; + return Op; + } + + static AArch64Operand *CreateBarrier(unsigned Val, SMLoc S, MCContext &Ctx) { + AArch64Operand *Op = new AArch64Operand(k_Barrier, Ctx); + Op->Barrier.Val = Val; + Op->StartLoc = S; + Op->EndLoc = S; + return Op; + } + + static AArch64Operand *CreateSysReg(StringRef Str, SMLoc S, + uint64_t FeatureBits, MCContext &Ctx) { + AArch64Operand *Op = new AArch64Operand(k_SysReg, Ctx); + Op->SysReg.Data = Str.data(); + Op->SysReg.Length = Str.size(); + Op->SysReg.FeatureBits = FeatureBits; + Op->StartLoc = S; + Op->EndLoc = S; + return Op; + } + + static AArch64Operand *CreateSysCR(unsigned Val, SMLoc S, SMLoc E, + MCContext &Ctx) { + AArch64Operand *Op = new AArch64Operand(k_SysCR, Ctx); + Op->SysCRImm.Val = Val; + Op->StartLoc = S; + Op->EndLoc = E; + return Op; + } + + static AArch64Operand *CreatePrefetch(unsigned Val, SMLoc S, MCContext &Ctx) { + AArch64Operand *Op = new AArch64Operand(k_Prefetch, Ctx); + Op->Prefetch.Val = Val; + Op->StartLoc = S; + Op->EndLoc = S; + return Op; + } + + static AArch64Operand *CreateShiftExtend(AArch64_AM::ShiftExtendType ShOp, + unsigned Val, bool HasExplicitAmount, + SMLoc S, SMLoc E, MCContext &Ctx) { + AArch64Operand *Op = new AArch64Operand(k_ShiftExtend, Ctx); + Op->ShiftExtend.Type = ShOp; + Op->ShiftExtend.Amount = Val; + Op->ShiftExtend.HasExplicitAmount = HasExplicitAmount; + Op->StartLoc = S; + Op->EndLoc = E; + return Op; + } +}; + +} // end anonymous namespace. 
+
+void AArch64Operand::print(raw_ostream &OS) const {
+  switch (Kind) {
+  case k_FPImm:
+    OS << "<fpimm " << getFPImm() << "("
+       << AArch64_AM::getFPImmFloat(getFPImm()) << ") >";
+    break;
+  case k_Barrier: {
+    bool Valid;
+    StringRef Name = AArch64DB::DBarrierMapper().toString(getBarrier(), Valid);
+    if (Valid)
+      OS << "<barrier " << Name << ">";
+    else
+      OS << "<barrier invalid #" << getBarrier() << ">";
+    break;
+  }
+  case k_Immediate:
+    getImm()->print(OS);
+    break;
+  case k_ShiftedImm: {
+    unsigned Shift = getShiftedImmShift();
+    OS << "<shiftedimm "; getShiftedImmVal()->print(OS);
+    OS << ", lsl #" << AArch64_AM::getShiftValue(Shift) << ">";
+    break;
+  }
+  case k_CondCode:
+    OS << "<condcode " << getCondCode() << ">";
+    break;
+  case k_Register:
+    OS << "<register " << getReg() << ">";
+    break;
+  case k_VectorList: {
+    OS << "<vectorlist ";
+    unsigned Reg = getVectorListStart();
+    for (unsigned i = 0, e = getVectorListCount(); i != e; ++i)
+      OS << Reg + i << " ";
+    OS << ">";
+    break;
+  }
+  case k_VectorIndex:
+    OS << "<vectorindex " << getVectorIndex() << ">";
+    break;
+  case k_SysReg:
+    OS << "<sysreg: " << getSysReg() << '>';
+    break;
+  case k_Token:
+    OS << "'" << getToken() << "'";
+    break;
+  case k_SysCR:
+    OS << "c" << getSysCR();
+    break;
+  case k_Prefetch: {
+    bool Valid;
+    StringRef Name = AArch64PRFM::PRFMMapper().toString(getPrefetch(), Valid);
+    if (Valid)
+      OS << "<prfop " << Name << ">";
+    else
+      OS << "<prfop invalid #" << getPrefetch() << ">";
+    break;
+  }
+  case k_ShiftExtend: {
+    OS << "<" << AArch64_AM::getShiftExtendName(getShiftExtendType()) << " #"
+       << getShiftExtendAmount();
+    if (!hasShiftExtendAmount())
+      OS << "<imp>";
+    OS << '>';
+    break;
+  }
+  }
+}
+
+/// @name Auto-generated Match Functions
+/// {
+
+static unsigned MatchRegisterName(StringRef Name);
+
+/// }
+
+static unsigned matchVectorRegName(StringRef Name) {
+  return StringSwitch<unsigned>(Name)
+      .Case("v0", AArch64::Q0)
+      .Case("v1", AArch64::Q1)
+      .Case("v2", AArch64::Q2)
+      .Case("v3", AArch64::Q3)
+      .Case("v4", AArch64::Q4)
+      .Case("v5", AArch64::Q5)
+      .Case("v6", AArch64::Q6)
+      .Case("v7", AArch64::Q7)
+      .Case("v8", AArch64::Q8)
+      .Case("v9", AArch64::Q9)
+      .Case("v10", AArch64::Q10)
+      .Case("v11", AArch64::Q11)
+      .Case("v12", AArch64::Q12)
+      .Case("v13", AArch64::Q13)
+      .Case("v14", AArch64::Q14)
+      .Case("v15", AArch64::Q15)
+      .Case("v16", AArch64::Q16)
+      .Case("v17", AArch64::Q17)
+      .Case("v18", AArch64::Q18)
+      .Case("v19", AArch64::Q19)
+      .Case("v20", AArch64::Q20)
+      .Case("v21", AArch64::Q21)
+      .Case("v22", AArch64::Q22)
+      .Case("v23", AArch64::Q23)
+      .Case("v24", AArch64::Q24)
+      .Case("v25", AArch64::Q25)
+      .Case("v26", AArch64::Q26)
+      .Case("v27", AArch64::Q27)
+      .Case("v28", AArch64::Q28)
+      .Case("v29", AArch64::Q29)
+      .Case("v30", AArch64::Q30)
+      .Case("v31", AArch64::Q31)
+      .Default(0);
+}
+
+static bool isValidVectorKind(StringRef Name) {
+  return StringSwitch<bool>(Name.lower())
+      .Case(".8b", true)
+      .Case(".16b", true)
+      .Case(".4h", true)
+      .Case(".8h", true)
+      .Case(".2s", true)
+      .Case(".4s", true)
+      .Case(".1d", true)
+      .Case(".2d", true)
+      .Case(".1q", true)
+      // Accept the width neutral ones, too, for verbose syntax. If those
+      // aren't used in the right places, the token operand won't match so
+      // all will work out.
+ .Case(".b", true) + .Case(".h", true) + .Case(".s", true) + .Case(".d", true) + .Default(false); +} + +static void parseValidVectorKind(StringRef Name, unsigned &NumElements, + char &ElementKind) { + assert(isValidVectorKind(Name)); + + ElementKind = Name.lower()[Name.size() - 1]; + NumElements = 0; + + if (Name.size() == 2) + return; + + // Parse the lane count + Name = Name.drop_front(); + while (isdigit(Name.front())) { + NumElements = 10 * NumElements + (Name.front() - '0'); + Name = Name.drop_front(); + } +} + +bool AArch64AsmParser::ParseRegister(unsigned &RegNo, SMLoc &StartLoc, + SMLoc &EndLoc) { + StartLoc = getLoc(); + RegNo = tryParseRegister(); + EndLoc = SMLoc::getFromPointer(getLoc().getPointer() - 1); + return (RegNo == (unsigned)-1); +} + +/// tryParseRegister - Try to parse a register name. The token must be an +/// Identifier when called, and if it is a register name the token is eaten and +/// the register is added to the operand list. +int AArch64AsmParser::tryParseRegister() { + const AsmToken &Tok = Parser.getTok(); + assert(Tok.is(AsmToken::Identifier) && "Token is not an Identifier"); + + std::string lowerCase = Tok.getString().lower(); + unsigned RegNum = MatchRegisterName(lowerCase); + // Also handle a few aliases of registers. + if (RegNum == 0) + RegNum = StringSwitch(lowerCase) + .Case("fp", AArch64::FP) + .Case("lr", AArch64::LR) + .Case("x31", AArch64::XZR) + .Case("w31", AArch64::WZR) + .Default(0); + + if (RegNum == 0) + return -1; + + Parser.Lex(); // Eat identifier token. + return RegNum; +} + +/// tryMatchVectorRegister - Try to parse a vector register name with optional +/// kind specifier. If it is a register specifier, eat the token and return it. +int AArch64AsmParser::tryMatchVectorRegister(StringRef &Kind, bool expected) { + if (Parser.getTok().isNot(AsmToken::Identifier)) { + TokError("vector register expected"); + return -1; + } + + StringRef Name = Parser.getTok().getString(); + // If there is a kind specifier, it's separated from the register name by + // a '.'. + size_t Start = 0, Next = Name.find('.'); + StringRef Head = Name.slice(Start, Next); + unsigned RegNum = matchVectorRegName(Head); + if (RegNum) { + if (Next != StringRef::npos) { + Kind = Name.slice(Next, StringRef::npos); + if (!isValidVectorKind(Kind)) { + TokError("invalid vector kind qualifier"); + return -1; + } + } + Parser.Lex(); // Eat the register token. + return RegNum; + } + + if (expected) + TokError("vector register expected"); + return -1; +} + +/// tryParseSysCROperand - Try to parse a system instruction CR operand name. +AArch64AsmParser::OperandMatchResultTy +AArch64AsmParser::tryParseSysCROperand(OperandVector &Operands) { + SMLoc S = getLoc(); + + if (Parser.getTok().isNot(AsmToken::Identifier)) { + Error(S, "Expected cN operand where 0 <= N <= 15"); + return MatchOperand_ParseFail; + } + + StringRef Tok = Parser.getTok().getIdentifier(); + if (Tok[0] != 'c' && Tok[0] != 'C') { + Error(S, "Expected cN operand where 0 <= N <= 15"); + return MatchOperand_ParseFail; + } + + uint32_t CRNum; + bool BadNum = Tok.drop_front().getAsInteger(10, CRNum); + if (BadNum || CRNum > 15) { + Error(S, "Expected cN operand where 0 <= N <= 15"); + return MatchOperand_ParseFail; + } + + Parser.Lex(); // Eat identifier token. + Operands.push_back( + AArch64Operand::CreateSysCR(CRNum, S, getLoc(), getContext())); + return MatchOperand_Success; +} + +/// tryParsePrefetch - Try to parse a prefetch operand. 
+AArch64AsmParser::OperandMatchResultTy +AArch64AsmParser::tryParsePrefetch(OperandVector &Operands) { + SMLoc S = getLoc(); + const AsmToken &Tok = Parser.getTok(); + // Either an identifier for named values or a 5-bit immediate. + bool Hash = Tok.is(AsmToken::Hash); + if (Hash || Tok.is(AsmToken::Integer)) { + if (Hash) + Parser.Lex(); // Eat hash token. + const MCExpr *ImmVal; + if (getParser().parseExpression(ImmVal)) + return MatchOperand_ParseFail; + + const MCConstantExpr *MCE = dyn_cast(ImmVal); + if (!MCE) { + TokError("immediate value expected for prefetch operand"); + return MatchOperand_ParseFail; + } + unsigned prfop = MCE->getValue(); + if (prfop > 31) { + TokError("prefetch operand out of range, [0,31] expected"); + return MatchOperand_ParseFail; + } + + Operands.push_back(AArch64Operand::CreatePrefetch(prfop, S, getContext())); + return MatchOperand_Success; + } + + if (Tok.isNot(AsmToken::Identifier)) { + TokError("pre-fetch hint expected"); + return MatchOperand_ParseFail; + } + + bool Valid; + unsigned prfop = AArch64PRFM::PRFMMapper().fromString(Tok.getString(), Valid); + if (!Valid) { + TokError("pre-fetch hint expected"); + return MatchOperand_ParseFail; + } + + Parser.Lex(); // Eat identifier token. + Operands.push_back(AArch64Operand::CreatePrefetch(prfop, S, getContext())); + return MatchOperand_Success; +} + +/// tryParseAdrpLabel - Parse and validate a source label for the ADRP +/// instruction. +AArch64AsmParser::OperandMatchResultTy +AArch64AsmParser::tryParseAdrpLabel(OperandVector &Operands) { + SMLoc S = getLoc(); + const MCExpr *Expr; + + if (Parser.getTok().is(AsmToken::Hash)) { + Parser.Lex(); // Eat hash token. + } + + if (parseSymbolicImmVal(Expr)) + return MatchOperand_ParseFail; + + AArch64MCExpr::VariantKind ELFRefKind; + MCSymbolRefExpr::VariantKind DarwinRefKind; + int64_t Addend; + if (classifySymbolRef(Expr, ELFRefKind, DarwinRefKind, Addend)) { + if (DarwinRefKind == MCSymbolRefExpr::VK_None && + ELFRefKind == AArch64MCExpr::VK_INVALID) { + // No modifier was specified at all; this is the syntax for an ELF basic + // ADRP relocation (unfortunately). + Expr = + AArch64MCExpr::Create(Expr, AArch64MCExpr::VK_ABS_PAGE, getContext()); + } else if ((DarwinRefKind == MCSymbolRefExpr::VK_GOTPAGE || + DarwinRefKind == MCSymbolRefExpr::VK_TLVPPAGE) && + Addend != 0) { + Error(S, "gotpage label reference not allowed an addend"); + return MatchOperand_ParseFail; + } else if (DarwinRefKind != MCSymbolRefExpr::VK_PAGE && + DarwinRefKind != MCSymbolRefExpr::VK_GOTPAGE && + DarwinRefKind != MCSymbolRefExpr::VK_TLVPPAGE && + ELFRefKind != AArch64MCExpr::VK_GOT_PAGE && + ELFRefKind != AArch64MCExpr::VK_GOTTPREL_PAGE && + ELFRefKind != AArch64MCExpr::VK_TLSDESC_PAGE) { + // The operand must be an @page or @gotpage qualified symbolref. + Error(S, "page or gotpage label reference expected"); + return MatchOperand_ParseFail; + } + } + + // We have either a label reference possibly with addend or an immediate. The + // addend is a raw value here. The linker will adjust it to only reference the + // page. + SMLoc E = SMLoc::getFromPointer(getLoc().getPointer() - 1); + Operands.push_back(AArch64Operand::CreateImm(Expr, S, E, getContext())); + + return MatchOperand_Success; +} + +/// tryParseAdrLabel - Parse and validate a source label for the ADR +/// instruction. 
+AArch64AsmParser::OperandMatchResultTy +AArch64AsmParser::tryParseAdrLabel(OperandVector &Operands) { + SMLoc S = getLoc(); + const MCExpr *Expr; + + if (Parser.getTok().is(AsmToken::Hash)) { + Parser.Lex(); // Eat hash token. + } + + if (getParser().parseExpression(Expr)) + return MatchOperand_ParseFail; + + SMLoc E = SMLoc::getFromPointer(getLoc().getPointer() - 1); + Operands.push_back(AArch64Operand::CreateImm(Expr, S, E, getContext())); + + return MatchOperand_Success; +} + +/// tryParseFPImm - A floating point immediate expression operand. +AArch64AsmParser::OperandMatchResultTy +AArch64AsmParser::tryParseFPImm(OperandVector &Operands) { + SMLoc S = getLoc(); + + bool Hash = false; + if (Parser.getTok().is(AsmToken::Hash)) { + Parser.Lex(); // Eat '#' + Hash = true; + } + + // Handle negation, as that still comes through as a separate token. + bool isNegative = false; + if (Parser.getTok().is(AsmToken::Minus)) { + isNegative = true; + Parser.Lex(); + } + const AsmToken &Tok = Parser.getTok(); + if (Tok.is(AsmToken::Real)) { + APFloat RealVal(APFloat::IEEEdouble, Tok.getString()); + uint64_t IntVal = RealVal.bitcastToAPInt().getZExtValue(); + // If we had a '-' in front, toggle the sign bit. + IntVal ^= (uint64_t)isNegative << 63; + int Val = AArch64_AM::getFP64Imm(APInt(64, IntVal)); + Parser.Lex(); // Eat the token. + // Check for out of range values. As an exception, we let Zero through, + // as we handle that special case in post-processing before matching in + // order to use the zero register for it. + if (Val == -1 && !RealVal.isZero()) { + TokError("expected compatible register or floating-point constant"); + return MatchOperand_ParseFail; + } + Operands.push_back(AArch64Operand::CreateFPImm(Val, S, getContext())); + return MatchOperand_Success; + } + if (Tok.is(AsmToken::Integer)) { + int64_t Val; + if (!isNegative && Tok.getString().startswith("0x")) { + Val = Tok.getIntVal(); + if (Val > 255 || Val < 0) { + TokError("encoded floating point value out of range"); + return MatchOperand_ParseFail; + } + } else { + APFloat RealVal(APFloat::IEEEdouble, Tok.getString()); + uint64_t IntVal = RealVal.bitcastToAPInt().getZExtValue(); + // If we had a '-' in front, toggle the sign bit. + IntVal ^= (uint64_t)isNegative << 63; + Val = AArch64_AM::getFP64Imm(APInt(64, IntVal)); + } + Parser.Lex(); // Eat the token. + Operands.push_back(AArch64Operand::CreateFPImm(Val, S, getContext())); + return MatchOperand_Success; + } + + if (!Hash) + return MatchOperand_NoMatch; + + TokError("invalid floating point immediate"); + return MatchOperand_ParseFail; +} + +/// tryParseAddSubImm - Parse ADD/SUB shifted immediate operand +AArch64AsmParser::OperandMatchResultTy +AArch64AsmParser::tryParseAddSubImm(OperandVector &Operands) { + SMLoc S = getLoc(); + + if (Parser.getTok().is(AsmToken::Hash)) + Parser.Lex(); // Eat '#' + else if (Parser.getTok().isNot(AsmToken::Integer)) + // Operand should start from # or should be integer, emit error otherwise. 
+ return MatchOperand_NoMatch; + + const MCExpr *Imm; + if (parseSymbolicImmVal(Imm)) + return MatchOperand_ParseFail; + else if (Parser.getTok().isNot(AsmToken::Comma)) { + uint64_t ShiftAmount = 0; + const MCConstantExpr *MCE = dyn_cast(Imm); + if (MCE) { + int64_t Val = MCE->getValue(); + if (Val > 0xfff && (Val & 0xfff) == 0) { + Imm = MCConstantExpr::Create(Val >> 12, getContext()); + ShiftAmount = 12; + } + } + SMLoc E = Parser.getTok().getLoc(); + Operands.push_back(AArch64Operand::CreateShiftedImm(Imm, ShiftAmount, S, E, + getContext())); + return MatchOperand_Success; + } + + // Eat ',' + Parser.Lex(); + + // The optional operand must be "lsl #N" where N is non-negative. + if (!Parser.getTok().is(AsmToken::Identifier) || + !Parser.getTok().getIdentifier().equals_lower("lsl")) { + Error(Parser.getTok().getLoc(), "only 'lsl #+N' valid after immediate"); + return MatchOperand_ParseFail; + } + + // Eat 'lsl' + Parser.Lex(); + + if (Parser.getTok().is(AsmToken::Hash)) { + Parser.Lex(); + } + + if (Parser.getTok().isNot(AsmToken::Integer)) { + Error(Parser.getTok().getLoc(), "only 'lsl #+N' valid after immediate"); + return MatchOperand_ParseFail; + } + + int64_t ShiftAmount = Parser.getTok().getIntVal(); + + if (ShiftAmount < 0) { + Error(Parser.getTok().getLoc(), "positive shift amount required"); + return MatchOperand_ParseFail; + } + Parser.Lex(); // Eat the number + + SMLoc E = Parser.getTok().getLoc(); + Operands.push_back(AArch64Operand::CreateShiftedImm(Imm, ShiftAmount, + S, E, getContext())); + return MatchOperand_Success; +} + +/// parseCondCodeString - Parse a Condition Code string. +AArch64CC::CondCode AArch64AsmParser::parseCondCodeString(StringRef Cond) { + AArch64CC::CondCode CC = StringSwitch(Cond.lower()) + .Case("eq", AArch64CC::EQ) + .Case("ne", AArch64CC::NE) + .Case("cs", AArch64CC::HS) + .Case("hs", AArch64CC::HS) + .Case("cc", AArch64CC::LO) + .Case("lo", AArch64CC::LO) + .Case("mi", AArch64CC::MI) + .Case("pl", AArch64CC::PL) + .Case("vs", AArch64CC::VS) + .Case("vc", AArch64CC::VC) + .Case("hi", AArch64CC::HI) + .Case("ls", AArch64CC::LS) + .Case("ge", AArch64CC::GE) + .Case("lt", AArch64CC::LT) + .Case("gt", AArch64CC::GT) + .Case("le", AArch64CC::LE) + .Case("al", AArch64CC::AL) + .Case("nv", AArch64CC::NV) + .Default(AArch64CC::Invalid); + return CC; +} + +/// parseCondCode - Parse a Condition Code operand. +bool AArch64AsmParser::parseCondCode(OperandVector &Operands, + bool invertCondCode) { + SMLoc S = getLoc(); + const AsmToken &Tok = Parser.getTok(); + assert(Tok.is(AsmToken::Identifier) && "Token is not an Identifier"); + + StringRef Cond = Tok.getString(); + AArch64CC::CondCode CC = parseCondCodeString(Cond); + if (CC == AArch64CC::Invalid) + return TokError("invalid condition code"); + Parser.Lex(); // Eat identifier token. + + if (invertCondCode) + CC = AArch64CC::getInvertedCondCode(AArch64CC::CondCode(CC)); + + Operands.push_back( + AArch64Operand::CreateCondCode(CC, S, getLoc(), getContext())); + return false; +} + +/// tryParseOptionalShift - Some operands take an optional shift argument. Parse +/// them if present. 
+AArch64AsmParser::OperandMatchResultTy +AArch64AsmParser::tryParseOptionalShiftExtend(OperandVector &Operands) { + const AsmToken &Tok = Parser.getTok(); + std::string LowerID = Tok.getString().lower(); + AArch64_AM::ShiftExtendType ShOp = + StringSwitch(LowerID) + .Case("lsl", AArch64_AM::LSL) + .Case("lsr", AArch64_AM::LSR) + .Case("asr", AArch64_AM::ASR) + .Case("ror", AArch64_AM::ROR) + .Case("msl", AArch64_AM::MSL) + .Case("uxtb", AArch64_AM::UXTB) + .Case("uxth", AArch64_AM::UXTH) + .Case("uxtw", AArch64_AM::UXTW) + .Case("uxtx", AArch64_AM::UXTX) + .Case("sxtb", AArch64_AM::SXTB) + .Case("sxth", AArch64_AM::SXTH) + .Case("sxtw", AArch64_AM::SXTW) + .Case("sxtx", AArch64_AM::SXTX) + .Default(AArch64_AM::InvalidShiftExtend); + + if (ShOp == AArch64_AM::InvalidShiftExtend) + return MatchOperand_NoMatch; + + SMLoc S = Tok.getLoc(); + Parser.Lex(); + + bool Hash = getLexer().is(AsmToken::Hash); + if (!Hash && getLexer().isNot(AsmToken::Integer)) { + if (ShOp == AArch64_AM::LSL || ShOp == AArch64_AM::LSR || + ShOp == AArch64_AM::ASR || ShOp == AArch64_AM::ROR || + ShOp == AArch64_AM::MSL) { + // We expect a number here. + TokError("expected #imm after shift specifier"); + return MatchOperand_ParseFail; + } + + // "extend" type operatoins don't need an immediate, #0 is implicit. + SMLoc E = SMLoc::getFromPointer(getLoc().getPointer() - 1); + Operands.push_back( + AArch64Operand::CreateShiftExtend(ShOp, 0, false, S, E, getContext())); + return MatchOperand_Success; + } + + if (Hash) + Parser.Lex(); // Eat the '#'. + + // Make sure we do actually have a number + if (!Parser.getTok().is(AsmToken::Integer)) { + Error(Parser.getTok().getLoc(), + "expected integer shift amount"); + return MatchOperand_ParseFail; + } + + const MCExpr *ImmVal; + if (getParser().parseExpression(ImmVal)) + return MatchOperand_ParseFail; + + const MCConstantExpr *MCE = dyn_cast(ImmVal); + if (!MCE) { + TokError("expected #imm after shift specifier"); + return MatchOperand_ParseFail; + } + + SMLoc E = SMLoc::getFromPointer(getLoc().getPointer() - 1); + Operands.push_back(AArch64Operand::CreateShiftExtend( + ShOp, MCE->getValue(), true, S, E, getContext())); + return MatchOperand_Success; +} + +/// parseSysAlias - The IC, DC, AT, and TLBI instructions are simple aliases for +/// the SYS instruction. Parse them specially so that we create a SYS MCInst. 
+bool AArch64AsmParser::parseSysAlias(StringRef Name, SMLoc NameLoc, + OperandVector &Operands) { + if (Name.find('.') != StringRef::npos) + return TokError("invalid operand"); + + Mnemonic = Name; + Operands.push_back( + AArch64Operand::CreateToken("sys", false, NameLoc, getContext())); + + const AsmToken &Tok = Parser.getTok(); + StringRef Op = Tok.getString(); + SMLoc S = Tok.getLoc(); + + const MCExpr *Expr = nullptr; + +#define SYS_ALIAS(op1, Cn, Cm, op2) \ + do { \ + Expr = MCConstantExpr::Create(op1, getContext()); \ + Operands.push_back( \ + AArch64Operand::CreateImm(Expr, S, getLoc(), getContext())); \ + Operands.push_back( \ + AArch64Operand::CreateSysCR(Cn, S, getLoc(), getContext())); \ + Operands.push_back( \ + AArch64Operand::CreateSysCR(Cm, S, getLoc(), getContext())); \ + Expr = MCConstantExpr::Create(op2, getContext()); \ + Operands.push_back( \ + AArch64Operand::CreateImm(Expr, S, getLoc(), getContext())); \ + } while (0) + + if (Mnemonic == "ic") { + if (!Op.compare_lower("ialluis")) { + // SYS #0, C7, C1, #0 + SYS_ALIAS(0, 7, 1, 0); + } else if (!Op.compare_lower("iallu")) { + // SYS #0, C7, C5, #0 + SYS_ALIAS(0, 7, 5, 0); + } else if (!Op.compare_lower("ivau")) { + // SYS #3, C7, C5, #1 + SYS_ALIAS(3, 7, 5, 1); + } else { + return TokError("invalid operand for IC instruction"); + } + } else if (Mnemonic == "dc") { + if (!Op.compare_lower("zva")) { + // SYS #3, C7, C4, #1 + SYS_ALIAS(3, 7, 4, 1); + } else if (!Op.compare_lower("ivac")) { + // SYS #3, C7, C6, #1 + SYS_ALIAS(0, 7, 6, 1); + } else if (!Op.compare_lower("isw")) { + // SYS #0, C7, C6, #2 + SYS_ALIAS(0, 7, 6, 2); + } else if (!Op.compare_lower("cvac")) { + // SYS #3, C7, C10, #1 + SYS_ALIAS(3, 7, 10, 1); + } else if (!Op.compare_lower("csw")) { + // SYS #0, C7, C10, #2 + SYS_ALIAS(0, 7, 10, 2); + } else if (!Op.compare_lower("cvau")) { + // SYS #3, C7, C11, #1 + SYS_ALIAS(3, 7, 11, 1); + } else if (!Op.compare_lower("civac")) { + // SYS #3, C7, C14, #1 + SYS_ALIAS(3, 7, 14, 1); + } else if (!Op.compare_lower("cisw")) { + // SYS #0, C7, C14, #2 + SYS_ALIAS(0, 7, 14, 2); + } else { + return TokError("invalid operand for DC instruction"); + } + } else if (Mnemonic == "at") { + if (!Op.compare_lower("s1e1r")) { + // SYS #0, C7, C8, #0 + SYS_ALIAS(0, 7, 8, 0); + } else if (!Op.compare_lower("s1e2r")) { + // SYS #4, C7, C8, #0 + SYS_ALIAS(4, 7, 8, 0); + } else if (!Op.compare_lower("s1e3r")) { + // SYS #6, C7, C8, #0 + SYS_ALIAS(6, 7, 8, 0); + } else if (!Op.compare_lower("s1e1w")) { + // SYS #0, C7, C8, #1 + SYS_ALIAS(0, 7, 8, 1); + } else if (!Op.compare_lower("s1e2w")) { + // SYS #4, C7, C8, #1 + SYS_ALIAS(4, 7, 8, 1); + } else if (!Op.compare_lower("s1e3w")) { + // SYS #6, C7, C8, #1 + SYS_ALIAS(6, 7, 8, 1); + } else if (!Op.compare_lower("s1e0r")) { + // SYS #0, C7, C8, #3 + SYS_ALIAS(0, 7, 8, 2); + } else if (!Op.compare_lower("s1e0w")) { + // SYS #0, C7, C8, #3 + SYS_ALIAS(0, 7, 8, 3); + } else if (!Op.compare_lower("s12e1r")) { + // SYS #4, C7, C8, #4 + SYS_ALIAS(4, 7, 8, 4); + } else if (!Op.compare_lower("s12e1w")) { + // SYS #4, C7, C8, #5 + SYS_ALIAS(4, 7, 8, 5); + } else if (!Op.compare_lower("s12e0r")) { + // SYS #4, C7, C8, #6 + SYS_ALIAS(4, 7, 8, 6); + } else if (!Op.compare_lower("s12e0w")) { + // SYS #4, C7, C8, #7 + SYS_ALIAS(4, 7, 8, 7); + } else { + return TokError("invalid operand for AT instruction"); + } + } else if (Mnemonic == "tlbi") { + if (!Op.compare_lower("vmalle1is")) { + // SYS #0, C8, C3, #0 + SYS_ALIAS(0, 8, 3, 0); + } else if (!Op.compare_lower("alle2is")) { + // SYS #4, C8, C3, 
#0 + SYS_ALIAS(4, 8, 3, 0); + } else if (!Op.compare_lower("alle3is")) { + // SYS #6, C8, C3, #0 + SYS_ALIAS(6, 8, 3, 0); + } else if (!Op.compare_lower("vae1is")) { + // SYS #0, C8, C3, #1 + SYS_ALIAS(0, 8, 3, 1); + } else if (!Op.compare_lower("vae2is")) { + // SYS #4, C8, C3, #1 + SYS_ALIAS(4, 8, 3, 1); + } else if (!Op.compare_lower("vae3is")) { + // SYS #6, C8, C3, #1 + SYS_ALIAS(6, 8, 3, 1); + } else if (!Op.compare_lower("aside1is")) { + // SYS #0, C8, C3, #2 + SYS_ALIAS(0, 8, 3, 2); + } else if (!Op.compare_lower("vaae1is")) { + // SYS #0, C8, C3, #3 + SYS_ALIAS(0, 8, 3, 3); + } else if (!Op.compare_lower("alle1is")) { + // SYS #4, C8, C3, #4 + SYS_ALIAS(4, 8, 3, 4); + } else if (!Op.compare_lower("vale1is")) { + // SYS #0, C8, C3, #5 + SYS_ALIAS(0, 8, 3, 5); + } else if (!Op.compare_lower("vaale1is")) { + // SYS #0, C8, C3, #7 + SYS_ALIAS(0, 8, 3, 7); + } else if (!Op.compare_lower("vmalle1")) { + // SYS #0, C8, C7, #0 + SYS_ALIAS(0, 8, 7, 0); + } else if (!Op.compare_lower("alle2")) { + // SYS #4, C8, C7, #0 + SYS_ALIAS(4, 8, 7, 0); + } else if (!Op.compare_lower("vale2is")) { + // SYS #4, C8, C3, #5 + SYS_ALIAS(4, 8, 3, 5); + } else if (!Op.compare_lower("vale3is")) { + // SYS #6, C8, C3, #5 + SYS_ALIAS(6, 8, 3, 5); + } else if (!Op.compare_lower("alle3")) { + // SYS #6, C8, C7, #0 + SYS_ALIAS(6, 8, 7, 0); + } else if (!Op.compare_lower("vae1")) { + // SYS #0, C8, C7, #1 + SYS_ALIAS(0, 8, 7, 1); + } else if (!Op.compare_lower("vae2")) { + // SYS #4, C8, C7, #1 + SYS_ALIAS(4, 8, 7, 1); + } else if (!Op.compare_lower("vae3")) { + // SYS #6, C8, C7, #1 + SYS_ALIAS(6, 8, 7, 1); + } else if (!Op.compare_lower("aside1")) { + // SYS #0, C8, C7, #2 + SYS_ALIAS(0, 8, 7, 2); + } else if (!Op.compare_lower("vaae1")) { + // SYS #0, C8, C7, #3 + SYS_ALIAS(0, 8, 7, 3); + } else if (!Op.compare_lower("alle1")) { + // SYS #4, C8, C7, #4 + SYS_ALIAS(4, 8, 7, 4); + } else if (!Op.compare_lower("vale1")) { + // SYS #0, C8, C7, #5 + SYS_ALIAS(0, 8, 7, 5); + } else if (!Op.compare_lower("vale2")) { + // SYS #4, C8, C7, #5 + SYS_ALIAS(4, 8, 7, 5); + } else if (!Op.compare_lower("vale3")) { + // SYS #6, C8, C7, #5 + SYS_ALIAS(6, 8, 7, 5); + } else if (!Op.compare_lower("vaale1")) { + // SYS #0, C8, C7, #7 + SYS_ALIAS(0, 8, 7, 7); + } else if (!Op.compare_lower("ipas2e1")) { + // SYS #4, C8, C4, #1 + SYS_ALIAS(4, 8, 4, 1); + } else if (!Op.compare_lower("ipas2le1")) { + // SYS #4, C8, C4, #5 + SYS_ALIAS(4, 8, 4, 5); + } else if (!Op.compare_lower("ipas2e1is")) { + // SYS #4, C8, C4, #1 + SYS_ALIAS(4, 8, 0, 1); + } else if (!Op.compare_lower("ipas2le1is")) { + // SYS #4, C8, C4, #5 + SYS_ALIAS(4, 8, 0, 5); + } else if (!Op.compare_lower("vmalls12e1")) { + // SYS #4, C8, C7, #6 + SYS_ALIAS(4, 8, 7, 6); + } else if (!Op.compare_lower("vmalls12e1is")) { + // SYS #4, C8, C3, #6 + SYS_ALIAS(4, 8, 3, 6); + } else { + return TokError("invalid operand for TLBI instruction"); + } + } + +#undef SYS_ALIAS + + Parser.Lex(); // Eat operand. + + bool ExpectRegister = (Op.lower().find("all") == StringRef::npos); + bool HasRegister = false; + + // Check for the optional register operand. + if (getLexer().is(AsmToken::Comma)) { + Parser.Lex(); // Eat comma. 
+ + if (Tok.isNot(AsmToken::Identifier) || parseRegister(Operands)) + return TokError("expected register operand"); + + HasRegister = true; + } + + if (getLexer().isNot(AsmToken::EndOfStatement)) { + Parser.eatToEndOfStatement(); + return TokError("unexpected token in argument list"); + } + + if (ExpectRegister && !HasRegister) { + return TokError("specified " + Mnemonic + " op requires a register"); + } + else if (!ExpectRegister && HasRegister) { + return TokError("specified " + Mnemonic + " op does not use a register"); + } + + Parser.Lex(); // Consume the EndOfStatement + return false; +} + +AArch64AsmParser::OperandMatchResultTy +AArch64AsmParser::tryParseBarrierOperand(OperandVector &Operands) { + const AsmToken &Tok = Parser.getTok(); + + // Can be either a #imm style literal or an option name + bool Hash = Tok.is(AsmToken::Hash); + if (Hash || Tok.is(AsmToken::Integer)) { + // Immediate operand. + if (Hash) + Parser.Lex(); // Eat the '#' + const MCExpr *ImmVal; + SMLoc ExprLoc = getLoc(); + if (getParser().parseExpression(ImmVal)) + return MatchOperand_ParseFail; + const MCConstantExpr *MCE = dyn_cast(ImmVal); + if (!MCE) { + Error(ExprLoc, "immediate value expected for barrier operand"); + return MatchOperand_ParseFail; + } + if (MCE->getValue() < 0 || MCE->getValue() > 15) { + Error(ExprLoc, "barrier operand out of range"); + return MatchOperand_ParseFail; + } + Operands.push_back( + AArch64Operand::CreateBarrier(MCE->getValue(), ExprLoc, getContext())); + return MatchOperand_Success; + } + + if (Tok.isNot(AsmToken::Identifier)) { + TokError("invalid operand for instruction"); + return MatchOperand_ParseFail; + } + + bool Valid; + unsigned Opt = AArch64DB::DBarrierMapper().fromString(Tok.getString(), Valid); + if (!Valid) { + TokError("invalid barrier option name"); + return MatchOperand_ParseFail; + } + + // The only valid named option for ISB is 'sy' + if (Mnemonic == "isb" && Opt != AArch64DB::SY) { + TokError("'sy' or #imm operand expected"); + return MatchOperand_ParseFail; + } + + Operands.push_back( + AArch64Operand::CreateBarrier(Opt, getLoc(), getContext())); + Parser.Lex(); // Consume the option + + return MatchOperand_Success; +} + +AArch64AsmParser::OperandMatchResultTy +AArch64AsmParser::tryParseSysReg(OperandVector &Operands) { + const AsmToken &Tok = Parser.getTok(); + + if (Tok.isNot(AsmToken::Identifier)) + return MatchOperand_NoMatch; + + Operands.push_back(AArch64Operand::CreateSysReg(Tok.getString(), getLoc(), + STI.getFeatureBits(), getContext())); + Parser.Lex(); // Eat identifier + + return MatchOperand_Success; +} + +/// tryParseVectorRegister - Parse a vector register operand. +bool AArch64AsmParser::tryParseVectorRegister(OperandVector &Operands) { + if (Parser.getTok().isNot(AsmToken::Identifier)) + return true; + + SMLoc S = getLoc(); + // Check for a vector register specifier first. + StringRef Kind; + int64_t Reg = tryMatchVectorRegister(Kind, false); + if (Reg == -1) + return true; + Operands.push_back( + AArch64Operand::CreateReg(Reg, true, S, getLoc(), getContext())); + // If there was an explicit qualifier, that goes on as a literal text + // operand. + if (!Kind.empty()) + Operands.push_back( + AArch64Operand::CreateToken(Kind, false, S, getContext())); + + // If there is an index specifier following the register, parse that too. + if (Parser.getTok().is(AsmToken::LBrac)) { + SMLoc SIdx = getLoc(); + Parser.Lex(); // Eat left bracket token. 
+ + const MCExpr *ImmVal; + if (getParser().parseExpression(ImmVal)) + return false; + const MCConstantExpr *MCE = dyn_cast(ImmVal); + if (!MCE) { + TokError("immediate value expected for vector index"); + return false; + } + + SMLoc E = getLoc(); + if (Parser.getTok().isNot(AsmToken::RBrac)) { + Error(E, "']' expected"); + return false; + } + + Parser.Lex(); // Eat right bracket token. + + Operands.push_back(AArch64Operand::CreateVectorIndex(MCE->getValue(), SIdx, + E, getContext())); + } + + return false; +} + +/// parseRegister - Parse a non-vector register operand. +bool AArch64AsmParser::parseRegister(OperandVector &Operands) { + SMLoc S = getLoc(); + // Try for a vector register. + if (!tryParseVectorRegister(Operands)) + return false; + + // Try for a scalar register. + int64_t Reg = tryParseRegister(); + if (Reg == -1) + return true; + Operands.push_back( + AArch64Operand::CreateReg(Reg, false, S, getLoc(), getContext())); + + // A small number of instructions (FMOVXDhighr, for example) have "[1]" + // as a string token in the instruction itself. + if (getLexer().getKind() == AsmToken::LBrac) { + SMLoc LBracS = getLoc(); + Parser.Lex(); + const AsmToken &Tok = Parser.getTok(); + if (Tok.is(AsmToken::Integer)) { + SMLoc IntS = getLoc(); + int64_t Val = Tok.getIntVal(); + if (Val == 1) { + Parser.Lex(); + if (getLexer().getKind() == AsmToken::RBrac) { + SMLoc RBracS = getLoc(); + Parser.Lex(); + Operands.push_back( + AArch64Operand::CreateToken("[", false, LBracS, getContext())); + Operands.push_back( + AArch64Operand::CreateToken("1", false, IntS, getContext())); + Operands.push_back( + AArch64Operand::CreateToken("]", false, RBracS, getContext())); + return false; + } + } + } + } + + return false; +} + +bool AArch64AsmParser::parseSymbolicImmVal(const MCExpr *&ImmVal) { + bool HasELFModifier = false; + AArch64MCExpr::VariantKind RefKind; + + if (Parser.getTok().is(AsmToken::Colon)) { + Parser.Lex(); // Eat ':" + HasELFModifier = true; + + if (Parser.getTok().isNot(AsmToken::Identifier)) { + Error(Parser.getTok().getLoc(), + "expect relocation specifier in operand after ':'"); + return true; + } + + std::string LowerCase = Parser.getTok().getIdentifier().lower(); + RefKind = StringSwitch(LowerCase) + .Case("lo12", AArch64MCExpr::VK_LO12) + .Case("abs_g3", AArch64MCExpr::VK_ABS_G3) + .Case("abs_g2", AArch64MCExpr::VK_ABS_G2) + .Case("abs_g2_s", AArch64MCExpr::VK_ABS_G2_S) + .Case("abs_g2_nc", AArch64MCExpr::VK_ABS_G2_NC) + .Case("abs_g1", AArch64MCExpr::VK_ABS_G1) + .Case("abs_g1_s", AArch64MCExpr::VK_ABS_G1_S) + .Case("abs_g1_nc", AArch64MCExpr::VK_ABS_G1_NC) + .Case("abs_g0", AArch64MCExpr::VK_ABS_G0) + .Case("abs_g0_s", AArch64MCExpr::VK_ABS_G0_S) + .Case("abs_g0_nc", AArch64MCExpr::VK_ABS_G0_NC) + .Case("dtprel_g2", AArch64MCExpr::VK_DTPREL_G2) + .Case("dtprel_g1", AArch64MCExpr::VK_DTPREL_G1) + .Case("dtprel_g1_nc", AArch64MCExpr::VK_DTPREL_G1_NC) + .Case("dtprel_g0", AArch64MCExpr::VK_DTPREL_G0) + .Case("dtprel_g0_nc", AArch64MCExpr::VK_DTPREL_G0_NC) + .Case("dtprel_hi12", AArch64MCExpr::VK_DTPREL_HI12) + .Case("dtprel_lo12", AArch64MCExpr::VK_DTPREL_LO12) + .Case("dtprel_lo12_nc", AArch64MCExpr::VK_DTPREL_LO12_NC) + .Case("tprel_g2", AArch64MCExpr::VK_TPREL_G2) + .Case("tprel_g1", AArch64MCExpr::VK_TPREL_G1) + .Case("tprel_g1_nc", AArch64MCExpr::VK_TPREL_G1_NC) + .Case("tprel_g0", AArch64MCExpr::VK_TPREL_G0) + .Case("tprel_g0_nc", AArch64MCExpr::VK_TPREL_G0_NC) + .Case("tprel_hi12", AArch64MCExpr::VK_TPREL_HI12) + .Case("tprel_lo12", AArch64MCExpr::VK_TPREL_LO12) + 
.Case("tprel_lo12_nc", AArch64MCExpr::VK_TPREL_LO12_NC) + .Case("tlsdesc_lo12", AArch64MCExpr::VK_TLSDESC_LO12) + .Case("got", AArch64MCExpr::VK_GOT_PAGE) + .Case("got_lo12", AArch64MCExpr::VK_GOT_LO12) + .Case("gottprel", AArch64MCExpr::VK_GOTTPREL_PAGE) + .Case("gottprel_lo12", AArch64MCExpr::VK_GOTTPREL_LO12_NC) + .Case("gottprel_g1", AArch64MCExpr::VK_GOTTPREL_G1) + .Case("gottprel_g0_nc", AArch64MCExpr::VK_GOTTPREL_G0_NC) + .Case("tlsdesc", AArch64MCExpr::VK_TLSDESC_PAGE) + .Default(AArch64MCExpr::VK_INVALID); + + if (RefKind == AArch64MCExpr::VK_INVALID) { + Error(Parser.getTok().getLoc(), + "expect relocation specifier in operand after ':'"); + return true; + } + + Parser.Lex(); // Eat identifier + + if (Parser.getTok().isNot(AsmToken::Colon)) { + Error(Parser.getTok().getLoc(), "expect ':' after relocation specifier"); + return true; + } + Parser.Lex(); // Eat ':' + } + + if (getParser().parseExpression(ImmVal)) + return true; + + if (HasELFModifier) + ImmVal = AArch64MCExpr::Create(ImmVal, RefKind, getContext()); + + return false; +} + +/// parseVectorList - Parse a vector list operand for AdvSIMD instructions. +bool AArch64AsmParser::parseVectorList(OperandVector &Operands) { + assert(Parser.getTok().is(AsmToken::LCurly) && "Token is not a Left Bracket"); + SMLoc S = getLoc(); + Parser.Lex(); // Eat left bracket token. + StringRef Kind; + int64_t FirstReg = tryMatchVectorRegister(Kind, true); + if (FirstReg == -1) + return true; + int64_t PrevReg = FirstReg; + unsigned Count = 1; + + if (Parser.getTok().is(AsmToken::Minus)) { + Parser.Lex(); // Eat the minus. + + SMLoc Loc = getLoc(); + StringRef NextKind; + int64_t Reg = tryMatchVectorRegister(NextKind, true); + if (Reg == -1) + return true; + // Any Kind suffices must match on all regs in the list. + if (Kind != NextKind) + return Error(Loc, "mismatched register size suffix"); + + unsigned Space = (PrevReg < Reg) ? (Reg - PrevReg) : (Reg + 32 - PrevReg); + + if (Space == 0 || Space > 3) { + return Error(Loc, "invalid number of vectors"); + } + + Count += Space; + } + else { + while (Parser.getTok().is(AsmToken::Comma)) { + Parser.Lex(); // Eat the comma token. + + SMLoc Loc = getLoc(); + StringRef NextKind; + int64_t Reg = tryMatchVectorRegister(NextKind, true); + if (Reg == -1) + return true; + // Any Kind suffices must match on all regs in the list. + if (Kind != NextKind) + return Error(Loc, "mismatched register size suffix"); + + // Registers must be incremental (with wraparound at 31) + if (getContext().getRegisterInfo()->getEncodingValue(Reg) != + (getContext().getRegisterInfo()->getEncodingValue(PrevReg) + 1) % 32) + return Error(Loc, "registers must be sequential"); + + PrevReg = Reg; + ++Count; + } + } + + if (Parser.getTok().isNot(AsmToken::RCurly)) + return Error(getLoc(), "'}' expected"); + Parser.Lex(); // Eat the '}' token. + + if (Count > 4) + return Error(S, "invalid number of vectors"); + + unsigned NumElements = 0; + char ElementKind = 0; + if (!Kind.empty()) + parseValidVectorKind(Kind, NumElements, ElementKind); + + Operands.push_back(AArch64Operand::CreateVectorList( + FirstReg, Count, NumElements, ElementKind, S, getLoc(), getContext())); + + // If there is an index specifier following the list, parse that too. + if (Parser.getTok().is(AsmToken::LBrac)) { + SMLoc SIdx = getLoc(); + Parser.Lex(); // Eat left bracket token. 
+ + const MCExpr *ImmVal; + if (getParser().parseExpression(ImmVal)) + return false; + const MCConstantExpr *MCE = dyn_cast(ImmVal); + if (!MCE) { + TokError("immediate value expected for vector index"); + return false; + } + + SMLoc E = getLoc(); + if (Parser.getTok().isNot(AsmToken::RBrac)) { + Error(E, "']' expected"); + return false; + } + + Parser.Lex(); // Eat right bracket token. + + Operands.push_back(AArch64Operand::CreateVectorIndex(MCE->getValue(), SIdx, + E, getContext())); + } + return false; +} + +AArch64AsmParser::OperandMatchResultTy +AArch64AsmParser::tryParseGPR64sp0Operand(OperandVector &Operands) { + const AsmToken &Tok = Parser.getTok(); + if (!Tok.is(AsmToken::Identifier)) + return MatchOperand_NoMatch; + + unsigned RegNum = MatchRegisterName(Tok.getString().lower()); + + MCContext &Ctx = getContext(); + const MCRegisterInfo *RI = Ctx.getRegisterInfo(); + if (!RI->getRegClass(AArch64::GPR64spRegClassID).contains(RegNum)) + return MatchOperand_NoMatch; + + SMLoc S = getLoc(); + Parser.Lex(); // Eat register + + if (Parser.getTok().isNot(AsmToken::Comma)) { + Operands.push_back( + AArch64Operand::CreateReg(RegNum, false, S, getLoc(), Ctx)); + return MatchOperand_Success; + } + Parser.Lex(); // Eat comma. + + if (Parser.getTok().is(AsmToken::Hash)) + Parser.Lex(); // Eat hash + + if (Parser.getTok().isNot(AsmToken::Integer)) { + Error(getLoc(), "index must be absent or #0"); + return MatchOperand_ParseFail; + } + + const MCExpr *ImmVal; + if (Parser.parseExpression(ImmVal) || !isa(ImmVal) || + cast(ImmVal)->getValue() != 0) { + Error(getLoc(), "index must be absent or #0"); + return MatchOperand_ParseFail; + } + + Operands.push_back( + AArch64Operand::CreateReg(RegNum, false, S, getLoc(), Ctx)); + return MatchOperand_Success; +} + +/// parseOperand - Parse a arm instruction operand. For now this parses the +/// operand regardless of the mnemonic. +bool AArch64AsmParser::parseOperand(OperandVector &Operands, bool isCondCode, + bool invertCondCode) { + // Check if the current operand has a custom associated parser, if so, try to + // custom parse the operand, or fallback to the general approach. + OperandMatchResultTy ResTy = MatchOperandParserImpl(Operands, Mnemonic); + if (ResTy == MatchOperand_Success) + return false; + // If there wasn't a custom match, try the generic matcher below. Otherwise, + // there was a match, but an error occurred, in which case, just return that + // the operand parsing failed. + if (ResTy == MatchOperand_ParseFail) + return true; + + // Nothing custom, so do general case parsing. + SMLoc S, E; + switch (getLexer().getKind()) { + default: { + SMLoc S = getLoc(); + const MCExpr *Expr; + if (parseSymbolicImmVal(Expr)) + return Error(S, "invalid operand"); + + SMLoc E = SMLoc::getFromPointer(getLoc().getPointer() - 1); + Operands.push_back(AArch64Operand::CreateImm(Expr, S, E, getContext())); + return false; + } + case AsmToken::LBrac: { + SMLoc Loc = Parser.getTok().getLoc(); + Operands.push_back(AArch64Operand::CreateToken("[", false, Loc, + getContext())); + Parser.Lex(); // Eat '[' + + // There's no comma after a '[', so we can parse the next operand + // immediately. + return parseOperand(Operands, false, false); + } + case AsmToken::LCurly: + return parseVectorList(Operands); + case AsmToken::Identifier: { + // If we're expecting a Condition Code operand, then just parse that. + if (isCondCode) + return parseCondCode(Operands, invertCondCode); + + // If it's a register name, parse it. 
+ if (!parseRegister(Operands)) + return false; + + // This could be an optional "shift" or "extend" operand. + OperandMatchResultTy GotShift = tryParseOptionalShiftExtend(Operands); + // We can only continue if no tokens were eaten. + if (GotShift != MatchOperand_NoMatch) + return GotShift; + + // This was not a register so parse other operands that start with an + // identifier (like labels) as expressions and create them as immediates. + const MCExpr *IdVal; + S = getLoc(); + if (getParser().parseExpression(IdVal)) + return true; + + E = SMLoc::getFromPointer(getLoc().getPointer() - 1); + Operands.push_back(AArch64Operand::CreateImm(IdVal, S, E, getContext())); + return false; + } + case AsmToken::Integer: + case AsmToken::Real: + case AsmToken::Hash: { + // #42 -> immediate. + S = getLoc(); + if (getLexer().is(AsmToken::Hash)) + Parser.Lex(); + + // Parse a negative sign + bool isNegative = false; + if (Parser.getTok().is(AsmToken::Minus)) { + isNegative = true; + // We need to consume this token only when we have a Real, otherwise + // we let parseSymbolicImmVal take care of it + if (Parser.getLexer().peekTok().is(AsmToken::Real)) + Parser.Lex(); + } + + // The only Real that should come through here is a literal #0.0 for + // the fcmp[e] r, #0.0 instructions. They expect raw token operands, + // so convert the value. + const AsmToken &Tok = Parser.getTok(); + if (Tok.is(AsmToken::Real)) { + APFloat RealVal(APFloat::IEEEdouble, Tok.getString()); + uint64_t IntVal = RealVal.bitcastToAPInt().getZExtValue(); + if (Mnemonic != "fcmp" && Mnemonic != "fcmpe" && Mnemonic != "fcmeq" && + Mnemonic != "fcmge" && Mnemonic != "fcmgt" && Mnemonic != "fcmle" && + Mnemonic != "fcmlt") + return TokError("unexpected floating point literal"); + else if (IntVal != 0 || isNegative) + return TokError("expected floating-point constant #0.0"); + Parser.Lex(); // Eat the token. + + Operands.push_back( + AArch64Operand::CreateToken("#0", false, S, getContext())); + Operands.push_back( + AArch64Operand::CreateToken(".0", false, S, getContext())); + return false; + } + + const MCExpr *ImmVal; + if (parseSymbolicImmVal(ImmVal)) + return true; + + E = SMLoc::getFromPointer(getLoc().getPointer() - 1); + Operands.push_back(AArch64Operand::CreateImm(ImmVal, S, E, getContext())); + return false; + } + } +} + +/// ParseInstruction - Parse an AArch64 instruction mnemonic followed by its +/// operands. +bool AArch64AsmParser::ParseInstruction(ParseInstructionInfo &Info, + StringRef Name, SMLoc NameLoc, + OperandVector &Operands) { + Name = StringSwitch(Name.lower()) + .Case("beq", "b.eq") + .Case("bne", "b.ne") + .Case("bhs", "b.hs") + .Case("bcs", "b.cs") + .Case("blo", "b.lo") + .Case("bcc", "b.cc") + .Case("bmi", "b.mi") + .Case("bpl", "b.pl") + .Case("bvs", "b.vs") + .Case("bvc", "b.vc") + .Case("bhi", "b.hi") + .Case("bls", "b.ls") + .Case("bge", "b.ge") + .Case("blt", "b.lt") + .Case("bgt", "b.gt") + .Case("ble", "b.le") + .Case("bal", "b.al") + .Case("bnv", "b.nv") + .Default(Name); + + // Create the leading tokens for the mnemonic, split by '.' characters. + size_t Start = 0, Next = Name.find('.'); + StringRef Head = Name.slice(Start, Next); + + // IC, DC, AT, and TLBI instructions are aliases for the SYS instruction. 
+ if (Head == "ic" || Head == "dc" || Head == "at" || Head == "tlbi") { + bool IsError = parseSysAlias(Head, NameLoc, Operands); + if (IsError && getLexer().isNot(AsmToken::EndOfStatement)) + Parser.eatToEndOfStatement(); + return IsError; + } + + Operands.push_back( + AArch64Operand::CreateToken(Head, false, NameLoc, getContext())); + Mnemonic = Head; + + // Handle condition codes for a branch mnemonic + if (Head == "b" && Next != StringRef::npos) { + Start = Next; + Next = Name.find('.', Start + 1); + Head = Name.slice(Start + 1, Next); + + SMLoc SuffixLoc = SMLoc::getFromPointer(NameLoc.getPointer() + + (Head.data() - Name.data())); + AArch64CC::CondCode CC = parseCondCodeString(Head); + if (CC == AArch64CC::Invalid) + return Error(SuffixLoc, "invalid condition code"); + Operands.push_back( + AArch64Operand::CreateToken(".", true, SuffixLoc, getContext())); + Operands.push_back( + AArch64Operand::CreateCondCode(CC, NameLoc, NameLoc, getContext())); + } + + // Add the remaining tokens in the mnemonic. + while (Next != StringRef::npos) { + Start = Next; + Next = Name.find('.', Start + 1); + Head = Name.slice(Start, Next); + SMLoc SuffixLoc = SMLoc::getFromPointer(NameLoc.getPointer() + + (Head.data() - Name.data()) + 1); + Operands.push_back( + AArch64Operand::CreateToken(Head, true, SuffixLoc, getContext())); + } + + // Conditional compare instructions have a Condition Code operand, which needs + // to be parsed and an immediate operand created. + bool condCodeFourthOperand = + (Head == "ccmp" || Head == "ccmn" || Head == "fccmp" || + Head == "fccmpe" || Head == "fcsel" || Head == "csel" || + Head == "csinc" || Head == "csinv" || Head == "csneg"); + + // These instructions are aliases to some of the conditional select + // instructions. However, the condition code is inverted in the aliased + // instruction. + // + // FIXME: Is this the correct way to handle these? Or should the parser + // generate the aliased instructions directly? + bool condCodeSecondOperand = (Head == "cset" || Head == "csetm"); + bool condCodeThirdOperand = + (Head == "cinc" || Head == "cinv" || Head == "cneg"); + + // Read the remaining operands. + if (getLexer().isNot(AsmToken::EndOfStatement)) { + // Read the first operand. + if (parseOperand(Operands, false, false)) { + Parser.eatToEndOfStatement(); + return true; + } + + unsigned N = 2; + while (getLexer().is(AsmToken::Comma)) { + Parser.Lex(); // Eat the comma. + + // Parse and remember the operand. + if (parseOperand(Operands, (N == 4 && condCodeFourthOperand) || + (N == 3 && condCodeThirdOperand) || + (N == 2 && condCodeSecondOperand), + condCodeSecondOperand || condCodeThirdOperand)) { + Parser.eatToEndOfStatement(); + return true; + } + + // After successfully parsing some operands there are two special cases to + // consider (i.e. notional operands not separated by commas). Both are due + // to memory specifiers: + // + An RBrac will end an address for load/store/prefetch + // + An '!' will indicate a pre-indexed operation. + // + // It's someone else's responsibility to make sure these tokens are sane + // in the given context! 
+ if (Parser.getTok().is(AsmToken::RBrac)) { + SMLoc Loc = Parser.getTok().getLoc(); + Operands.push_back(AArch64Operand::CreateToken("]", false, Loc, + getContext())); + Parser.Lex(); + } + + if (Parser.getTok().is(AsmToken::Exclaim)) { + SMLoc Loc = Parser.getTok().getLoc(); + Operands.push_back(AArch64Operand::CreateToken("!", false, Loc, + getContext())); + Parser.Lex(); + } + + ++N; + } + } + + if (getLexer().isNot(AsmToken::EndOfStatement)) { + SMLoc Loc = Parser.getTok().getLoc(); + Parser.eatToEndOfStatement(); + return Error(Loc, "unexpected token in argument list"); + } + + Parser.Lex(); // Consume the EndOfStatement + return false; +} + +// FIXME: This entire function is a giant hack to provide us with decent +// operand range validation/diagnostics until TableGen/MC can be extended +// to support autogeneration of this kind of validation. +bool AArch64AsmParser::validateInstruction(MCInst &Inst, + SmallVectorImpl &Loc) { + const MCRegisterInfo *RI = getContext().getRegisterInfo(); + // Check for indexed addressing modes w/ the base register being the + // same as a destination/source register or pair load where + // the Rt == Rt2. All of those are undefined behaviour. + switch (Inst.getOpcode()) { + case AArch64::LDPSWpre: + case AArch64::LDPWpost: + case AArch64::LDPWpre: + case AArch64::LDPXpost: + case AArch64::LDPXpre: { + unsigned Rt = Inst.getOperand(1).getReg(); + unsigned Rt2 = Inst.getOperand(2).getReg(); + unsigned Rn = Inst.getOperand(3).getReg(); + if (RI->isSubRegisterEq(Rn, Rt)) + return Error(Loc[0], "unpredictable LDP instruction, writeback base " + "is also a destination"); + if (RI->isSubRegisterEq(Rn, Rt2)) + return Error(Loc[1], "unpredictable LDP instruction, writeback base " + "is also a destination"); + // FALLTHROUGH + } + case AArch64::LDPDi: + case AArch64::LDPQi: + case AArch64::LDPSi: + case AArch64::LDPSWi: + case AArch64::LDPWi: + case AArch64::LDPXi: { + unsigned Rt = Inst.getOperand(0).getReg(); + unsigned Rt2 = Inst.getOperand(1).getReg(); + if (Rt == Rt2) + return Error(Loc[1], "unpredictable LDP instruction, Rt2==Rt"); + break; + } + case AArch64::LDPDpost: + case AArch64::LDPDpre: + case AArch64::LDPQpost: + case AArch64::LDPQpre: + case AArch64::LDPSpost: + case AArch64::LDPSpre: + case AArch64::LDPSWpost: { + unsigned Rt = Inst.getOperand(1).getReg(); + unsigned Rt2 = Inst.getOperand(2).getReg(); + if (Rt == Rt2) + return Error(Loc[1], "unpredictable LDP instruction, Rt2==Rt"); + break; + } + case AArch64::STPDpost: + case AArch64::STPDpre: + case AArch64::STPQpost: + case AArch64::STPQpre: + case AArch64::STPSpost: + case AArch64::STPSpre: + case AArch64::STPWpost: + case AArch64::STPWpre: + case AArch64::STPXpost: + case AArch64::STPXpre: { + unsigned Rt = Inst.getOperand(1).getReg(); + unsigned Rt2 = Inst.getOperand(2).getReg(); + unsigned Rn = Inst.getOperand(3).getReg(); + if (RI->isSubRegisterEq(Rn, Rt)) + return Error(Loc[0], "unpredictable STP instruction, writeback base " + "is also a source"); + if (RI->isSubRegisterEq(Rn, Rt2)) + return Error(Loc[1], "unpredictable STP instruction, writeback base " + "is also a source"); + break; + } + case AArch64::LDRBBpre: + case AArch64::LDRBpre: + case AArch64::LDRHHpre: + case AArch64::LDRHpre: + case AArch64::LDRSBWpre: + case AArch64::LDRSBXpre: + case AArch64::LDRSHWpre: + case AArch64::LDRSHXpre: + case AArch64::LDRSWpre: + case AArch64::LDRWpre: + case AArch64::LDRXpre: + case AArch64::LDRBBpost: + case AArch64::LDRBpost: + case AArch64::LDRHHpost: + case AArch64::LDRHpost: + case 
AArch64::LDRSBWpost: + case AArch64::LDRSBXpost: + case AArch64::LDRSHWpost: + case AArch64::LDRSHXpost: + case AArch64::LDRSWpost: + case AArch64::LDRWpost: + case AArch64::LDRXpost: { + unsigned Rt = Inst.getOperand(1).getReg(); + unsigned Rn = Inst.getOperand(2).getReg(); + if (RI->isSubRegisterEq(Rn, Rt)) + return Error(Loc[0], "unpredictable LDR instruction, writeback base " + "is also a source"); + break; + } + case AArch64::STRBBpost: + case AArch64::STRBpost: + case AArch64::STRHHpost: + case AArch64::STRHpost: + case AArch64::STRWpost: + case AArch64::STRXpost: + case AArch64::STRBBpre: + case AArch64::STRBpre: + case AArch64::STRHHpre: + case AArch64::STRHpre: + case AArch64::STRWpre: + case AArch64::STRXpre: { + unsigned Rt = Inst.getOperand(1).getReg(); + unsigned Rn = Inst.getOperand(2).getReg(); + if (RI->isSubRegisterEq(Rn, Rt)) + return Error(Loc[0], "unpredictable STR instruction, writeback base " + "is also a source"); + break; + } + } + + // Now check immediate ranges. Separate from the above as there is overlap + // in the instructions being checked and this keeps the nested conditionals + // to a minimum. + switch (Inst.getOpcode()) { + case AArch64::ADDSWri: + case AArch64::ADDSXri: + case AArch64::ADDWri: + case AArch64::ADDXri: + case AArch64::SUBSWri: + case AArch64::SUBSXri: + case AArch64::SUBWri: + case AArch64::SUBXri: { + // Annoyingly we can't do this in the isAddSubImm predicate, so there is + // some slight duplication here. + if (Inst.getOperand(2).isExpr()) { + const MCExpr *Expr = Inst.getOperand(2).getExpr(); + AArch64MCExpr::VariantKind ELFRefKind; + MCSymbolRefExpr::VariantKind DarwinRefKind; + int64_t Addend; + if (!classifySymbolRef(Expr, ELFRefKind, DarwinRefKind, Addend)) { + return Error(Loc[2], "invalid immediate expression"); + } + + // Only allow these with ADDXri. 
+ if ((DarwinRefKind == MCSymbolRefExpr::VK_PAGEOFF || + DarwinRefKind == MCSymbolRefExpr::VK_TLVPPAGEOFF) && + Inst.getOpcode() == AArch64::ADDXri) + return false; + + // Only allow these with ADDXri/ADDWri + if ((ELFRefKind == AArch64MCExpr::VK_LO12 || + ELFRefKind == AArch64MCExpr::VK_DTPREL_HI12 || + ELFRefKind == AArch64MCExpr::VK_DTPREL_LO12 || + ELFRefKind == AArch64MCExpr::VK_DTPREL_LO12_NC || + ELFRefKind == AArch64MCExpr::VK_TPREL_HI12 || + ELFRefKind == AArch64MCExpr::VK_TPREL_LO12 || + ELFRefKind == AArch64MCExpr::VK_TPREL_LO12_NC || + ELFRefKind == AArch64MCExpr::VK_TLSDESC_LO12) && + (Inst.getOpcode() == AArch64::ADDXri || + Inst.getOpcode() == AArch64::ADDWri)) + return false; + + // Don't allow expressions in the immediate field otherwise + return Error(Loc[2], "invalid immediate expression"); + } + return false; + } + default: + return false; + } +} + +bool AArch64AsmParser::showMatchError(SMLoc Loc, unsigned ErrCode) { + switch (ErrCode) { + case Match_MissingFeature: + return Error(Loc, + "instruction requires a CPU feature not currently enabled"); + case Match_InvalidOperand: + return Error(Loc, "invalid operand for instruction"); + case Match_InvalidSuffix: + return Error(Loc, "invalid type suffix for instruction"); + case Match_InvalidCondCode: + return Error(Loc, "expected AArch64 condition code"); + case Match_AddSubRegExtendSmall: + return Error(Loc, + "expected '[su]xt[bhw]' or 'lsl' with optional integer in range [0, 4]"); + case Match_AddSubRegExtendLarge: + return Error(Loc, + "expected 'sxtx' 'uxtx' or 'lsl' with optional integer in range [0, 4]"); + case Match_AddSubSecondSource: + return Error(Loc, + "expected compatible register, symbol or integer in range [0, 4095]"); + case Match_LogicalSecondSource: + return Error(Loc, "expected compatible register or logical immediate"); + case Match_InvalidMovImm32Shift: + return Error(Loc, "expected 'lsl' with optional integer 0 or 16"); + case Match_InvalidMovImm64Shift: + return Error(Loc, "expected 'lsl' with optional integer 0, 16, 32 or 48"); + case Match_AddSubRegShift32: + return Error(Loc, + "expected 'lsl', 'lsr' or 'asr' with optional integer in range [0, 31]"); + case Match_AddSubRegShift64: + return Error(Loc, + "expected 'lsl', 'lsr' or 'asr' with optional integer in range [0, 63]"); + case Match_InvalidFPImm: + return Error(Loc, + "expected compatible register or floating-point constant"); + case Match_InvalidMemoryIndexedSImm9: + return Error(Loc, "index must be an integer in range [-256, 255]."); + case Match_InvalidMemoryIndexed4SImm7: + return Error(Loc, "index must be a multiple of 4 in range [-256, 252]."); + case Match_InvalidMemoryIndexed8SImm7: + return Error(Loc, "index must be a multiple of 8 in range [-512, 504]."); + case Match_InvalidMemoryIndexed16SImm7: + return Error(Loc, "index must be a multiple of 16 in range [-1024, 1008]."); + case Match_InvalidMemoryWExtend8: + return Error(Loc, + "expected 'uxtw' or 'sxtw' with optional shift of #0"); + case Match_InvalidMemoryWExtend16: + return Error(Loc, + "expected 'uxtw' or 'sxtw' with optional shift of #0 or #1"); + case Match_InvalidMemoryWExtend32: + return Error(Loc, + "expected 'uxtw' or 'sxtw' with optional shift of #0 or #2"); + case Match_InvalidMemoryWExtend64: + return Error(Loc, + "expected 'uxtw' or 'sxtw' with optional shift of #0 or #3"); + case Match_InvalidMemoryWExtend128: + return Error(Loc, + "expected 'uxtw' or 'sxtw' with optional shift of #0 or #4"); + case Match_InvalidMemoryXExtend8: + return Error(Loc, + "expected 
'lsl' or 'sxtx' with optional shift of #0"); + case Match_InvalidMemoryXExtend16: + return Error(Loc, + "expected 'lsl' or 'sxtx' with optional shift of #0 or #1"); + case Match_InvalidMemoryXExtend32: + return Error(Loc, + "expected 'lsl' or 'sxtx' with optional shift of #0 or #2"); + case Match_InvalidMemoryXExtend64: + return Error(Loc, + "expected 'lsl' or 'sxtx' with optional shift of #0 or #3"); + case Match_InvalidMemoryXExtend128: + return Error(Loc, + "expected 'lsl' or 'sxtx' with optional shift of #0 or #4"); + case Match_InvalidMemoryIndexed1: + return Error(Loc, "index must be an integer in range [0, 4095]."); + case Match_InvalidMemoryIndexed2: + return Error(Loc, "index must be a multiple of 2 in range [0, 8190]."); + case Match_InvalidMemoryIndexed4: + return Error(Loc, "index must be a multiple of 4 in range [0, 16380]."); + case Match_InvalidMemoryIndexed8: + return Error(Loc, "index must be a multiple of 8 in range [0, 32760]."); + case Match_InvalidMemoryIndexed16: + return Error(Loc, "index must be a multiple of 16 in range [0, 65520]."); + case Match_InvalidImm0_7: + return Error(Loc, "immediate must be an integer in range [0, 7]."); + case Match_InvalidImm0_15: + return Error(Loc, "immediate must be an integer in range [0, 15]."); + case Match_InvalidImm0_31: + return Error(Loc, "immediate must be an integer in range [0, 31]."); + case Match_InvalidImm0_63: + return Error(Loc, "immediate must be an integer in range [0, 63]."); + case Match_InvalidImm0_127: + return Error(Loc, "immediate must be an integer in range [0, 127]."); + case Match_InvalidImm0_65535: + return Error(Loc, "immediate must be an integer in range [0, 65535]."); + case Match_InvalidImm1_8: + return Error(Loc, "immediate must be an integer in range [1, 8]."); + case Match_InvalidImm1_16: + return Error(Loc, "immediate must be an integer in range [1, 16]."); + case Match_InvalidImm1_32: + return Error(Loc, "immediate must be an integer in range [1, 32]."); + case Match_InvalidImm1_64: + return Error(Loc, "immediate must be an integer in range [1, 64]."); + case Match_InvalidIndex1: + return Error(Loc, "expected lane specifier '[1]'"); + case Match_InvalidIndexB: + return Error(Loc, "vector lane must be an integer in range [0, 15]."); + case Match_InvalidIndexH: + return Error(Loc, "vector lane must be an integer in range [0, 7]."); + case Match_InvalidIndexS: + return Error(Loc, "vector lane must be an integer in range [0, 3]."); + case Match_InvalidIndexD: + return Error(Loc, "vector lane must be an integer in range [0, 1]."); + case Match_InvalidLabel: + return Error(Loc, "expected label or encodable integer pc offset"); + case Match_MRS: + return Error(Loc, "expected readable system register"); + case Match_MSR: + return Error(Loc, "expected writable system register or pstate"); + case Match_MnemonicFail: + return Error(Loc, "unrecognized instruction mnemonic"); + default: + assert(0 && "unexpected error code!"); + return Error(Loc, "invalid instruction format"); + } +} + +static const char *getSubtargetFeatureName(unsigned Val); + +bool AArch64AsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode, + OperandVector &Operands, + MCStreamer &Out, + unsigned &ErrorInfo, + bool MatchingInlineAsm) { + assert(!Operands.empty() && "Unexpect empty operand list!"); + AArch64Operand *Op = static_cast(Operands[0]); + assert(Op->isToken() && "Leading operand should always be a mnemonic!"); + + StringRef Tok = Op->getToken(); + unsigned NumOperands = Operands.size(); + + if (NumOperands == 4 && Tok 
== "lsl") {
+    AArch64Operand *Op2 = static_cast<AArch64Operand *>(Operands[2]);
+    AArch64Operand *Op3 = static_cast<AArch64Operand *>(Operands[3]);
+    if (Op2->isReg() && Op3->isImm()) {
+      const MCConstantExpr *Op3CE = dyn_cast<MCConstantExpr>(Op3->getImm());
+      if (Op3CE) {
+        uint64_t Op3Val = Op3CE->getValue();
+        uint64_t NewOp3Val = 0;
+        uint64_t NewOp4Val = 0;
+        if (AArch64MCRegisterClasses[AArch64::GPR32allRegClassID].contains(
+                Op2->getReg())) {
+          NewOp3Val = (32 - Op3Val) & 0x1f;
+          NewOp4Val = 31 - Op3Val;
+        } else {
+          NewOp3Val = (64 - Op3Val) & 0x3f;
+          NewOp4Val = 63 - Op3Val;
+        }
+
+        const MCExpr *NewOp3 = MCConstantExpr::Create(NewOp3Val, getContext());
+        const MCExpr *NewOp4 = MCConstantExpr::Create(NewOp4Val, getContext());
+
+        Operands[0] = AArch64Operand::CreateToken(
+            "ubfm", false, Op->getStartLoc(), getContext());
+        Operands[3] = AArch64Operand::CreateImm(NewOp3, Op3->getStartLoc(),
+                                                Op3->getEndLoc(), getContext());
+        Operands.push_back(AArch64Operand::CreateImm(
+            NewOp4, Op3->getStartLoc(), Op3->getEndLoc(), getContext()));
+        delete Op3;
+        delete Op;
+      }
+    }
+  } else if (NumOperands == 5) {
+    // FIXME: Horrible hack to handle the BFI -> BFM, SBFIZ->SBFM, and
+    // UBFIZ -> UBFM aliases.
+    if (Tok == "bfi" || Tok == "sbfiz" || Tok == "ubfiz") {
+      AArch64Operand *Op1 = static_cast<AArch64Operand *>(Operands[1]);
+      AArch64Operand *Op3 = static_cast<AArch64Operand *>(Operands[3]);
+      AArch64Operand *Op4 = static_cast<AArch64Operand *>(Operands[4]);
+
+      if (Op1->isReg() && Op3->isImm() && Op4->isImm()) {
+        const MCConstantExpr *Op3CE = dyn_cast<MCConstantExpr>(Op3->getImm());
+        const MCConstantExpr *Op4CE = dyn_cast<MCConstantExpr>(Op4->getImm());
+
+        if (Op3CE && Op4CE) {
+          uint64_t Op3Val = Op3CE->getValue();
+          uint64_t Op4Val = Op4CE->getValue();
+
+          uint64_t RegWidth = 0;
+          if (AArch64MCRegisterClasses[AArch64::GPR64allRegClassID].contains(
+                  Op1->getReg()))
+            RegWidth = 64;
+          else
+            RegWidth = 32;
+
+          if (Op3Val >= RegWidth)
+            return Error(Op3->getStartLoc(),
+                         "expected integer in range [0, 31]");
+          if (Op4Val < 1 || Op4Val > RegWidth)
+            return Error(Op4->getStartLoc(),
+                         "expected integer in range [1, 32]");
+
+          uint64_t NewOp3Val = 0;
+          if (AArch64MCRegisterClasses[AArch64::GPR32allRegClassID].contains(
+                  Op1->getReg()))
+            NewOp3Val = (32 - Op3Val) & 0x1f;
+          else
+            NewOp3Val = (64 - Op3Val) & 0x3f;
+
+          uint64_t NewOp4Val = Op4Val - 1;
+
+          if (NewOp3Val != 0 && NewOp4Val >= NewOp3Val)
+            return Error(Op4->getStartLoc(),
+                         "requested insert overflows register");
+
+          const MCExpr *NewOp3 =
+              MCConstantExpr::Create(NewOp3Val, getContext());
+          const MCExpr *NewOp4 =
+              MCConstantExpr::Create(NewOp4Val, getContext());
+          Operands[3] = AArch64Operand::CreateImm(
+              NewOp3, Op3->getStartLoc(), Op3->getEndLoc(), getContext());
+          Operands[4] = AArch64Operand::CreateImm(
+              NewOp4, Op4->getStartLoc(), Op4->getEndLoc(), getContext());
+          if (Tok == "bfi")
+            Operands[0] = AArch64Operand::CreateToken(
+                "bfm", false, Op->getStartLoc(), getContext());
+          else if (Tok == "sbfiz")
+            Operands[0] = AArch64Operand::CreateToken(
+                "sbfm", false, Op->getStartLoc(), getContext());
+          else if (Tok == "ubfiz")
+            Operands[0] = AArch64Operand::CreateToken(
+                "ubfm", false, Op->getStartLoc(), getContext());
+          else
+            llvm_unreachable("No valid mnemonic for alias?");
+
+          delete Op;
+          delete Op3;
+          delete Op4;
+        }
+      }
+
+      // FIXME: Horrible hack to handle the BFXIL->BFM, SBFX->SBFM, and
+      // UBFX -> UBFM aliases.
+    } else if (NumOperands == 5 &&
+               (Tok == "bfxil" || Tok == "sbfx" || Tok == "ubfx")) {
+      AArch64Operand *Op1 = static_cast<AArch64Operand *>(Operands[1]);
+      AArch64Operand *Op3 = static_cast<AArch64Operand *>(Operands[3]);
+      AArch64Operand *Op4 = static_cast<AArch64Operand *>(Operands[4]);
+
+      if (Op1->isReg() && Op3->isImm() && Op4->isImm()) {
+        const MCConstantExpr *Op3CE = dyn_cast<MCConstantExpr>(Op3->getImm());
+        const MCConstantExpr *Op4CE = dyn_cast<MCConstantExpr>(Op4->getImm());
+
+        if (Op3CE && Op4CE) {
+          uint64_t Op3Val = Op3CE->getValue();
+          uint64_t Op4Val = Op4CE->getValue();
+
+          uint64_t RegWidth = 0;
+          if (AArch64MCRegisterClasses[AArch64::GPR64allRegClassID].contains(
+                  Op1->getReg()))
+            RegWidth = 64;
+          else
+            RegWidth = 32;
+
+          if (Op3Val >= RegWidth)
+            return Error(Op3->getStartLoc(),
+                         "expected integer in range [0, 31]");
+          if (Op4Val < 1 || Op4Val > RegWidth)
+            return Error(Op4->getStartLoc(),
+                         "expected integer in range [1, 32]");
+
+          uint64_t NewOp4Val = Op3Val + Op4Val - 1;
+
+          if (NewOp4Val >= RegWidth || NewOp4Val < Op3Val)
+            return Error(Op4->getStartLoc(),
+                         "requested extract overflows register");
+
+          const MCExpr *NewOp4 =
+              MCConstantExpr::Create(NewOp4Val, getContext());
+          Operands[4] = AArch64Operand::CreateImm(
+              NewOp4, Op4->getStartLoc(), Op4->getEndLoc(), getContext());
+          if (Tok == "bfxil")
+            Operands[0] = AArch64Operand::CreateToken(
+                "bfm", false, Op->getStartLoc(), getContext());
+          else if (Tok == "sbfx")
+            Operands[0] = AArch64Operand::CreateToken(
+                "sbfm", false, Op->getStartLoc(), getContext());
+          else if (Tok == "ubfx")
+            Operands[0] = AArch64Operand::CreateToken(
+                "ubfm", false, Op->getStartLoc(), getContext());
+          else
+            llvm_unreachable("No valid mnemonic for alias?");
+
+          delete Op;
+          delete Op4;
+        }
+      }
+    }
+  }
+  // FIXME: Horrible hack for sxtw and uxtw with Wn src and Xd dst operands.
+  // InstAlias can't quite handle this since the reg classes aren't
+  // subclasses.
+  if (NumOperands == 3 && (Tok == "sxtw" || Tok == "uxtw")) {
+    // The source register can be Wn here, but the matcher expects a
+    // GPR64. Twiddle it here if necessary.
+    AArch64Operand *Op = static_cast<AArch64Operand *>(Operands[2]);
+    if (Op->isReg()) {
+      unsigned Reg = getXRegFromWReg(Op->getReg());
+      Operands[2] = AArch64Operand::CreateReg(Reg, false, Op->getStartLoc(),
+                                              Op->getEndLoc(), getContext());
+      delete Op;
+    }
+  }
+  // FIXME: Likewise for sxt[bh] with a Xd dst operand
+  else if (NumOperands == 3 && (Tok == "sxtb" || Tok == "sxth")) {
+    AArch64Operand *Op = static_cast<AArch64Operand *>(Operands[1]);
+    if (Op->isReg() &&
+        AArch64MCRegisterClasses[AArch64::GPR64allRegClassID].contains(
+            Op->getReg())) {
+      // The source register can be Wn here, but the matcher expects a
+      // GPR64. Twiddle it here if necessary.
+      AArch64Operand *Op = static_cast<AArch64Operand *>(Operands[2]);
+      if (Op->isReg()) {
+        unsigned Reg = getXRegFromWReg(Op->getReg());
+        Operands[2] = AArch64Operand::CreateReg(Reg, false, Op->getStartLoc(),
+                                                Op->getEndLoc(), getContext());
+        delete Op;
+      }
+    }
+  }
+  // FIXME: Likewise for uxt[bh] with a Xd dst operand
+  else if (NumOperands == 3 && (Tok == "uxtb" || Tok == "uxth")) {
+    AArch64Operand *Op = static_cast<AArch64Operand *>(Operands[1]);
+    if (Op->isReg() &&
+        AArch64MCRegisterClasses[AArch64::GPR64allRegClassID].contains(
+            Op->getReg())) {
+      // The source register can be Wn here, but the matcher expects a
+      // GPR32. Twiddle it here if necessary.
+ AArch64Operand *Op = static_cast(Operands[1]); + if (Op->isReg()) { + unsigned Reg = getWRegFromXReg(Op->getReg()); + Operands[1] = AArch64Operand::CreateReg(Reg, false, Op->getStartLoc(), + Op->getEndLoc(), getContext()); + delete Op; + } + } + } + + // Yet another horrible hack to handle FMOV Rd, #0.0 using [WX]ZR. + if (NumOperands == 3 && Tok == "fmov") { + AArch64Operand *RegOp = static_cast(Operands[1]); + AArch64Operand *ImmOp = static_cast(Operands[2]); + if (RegOp->isReg() && ImmOp->isFPImm() && + ImmOp->getFPImm() == (unsigned)-1) { + unsigned zreg = + AArch64MCRegisterClasses[AArch64::FPR32RegClassID].contains( + RegOp->getReg()) + ? AArch64::WZR + : AArch64::XZR; + Operands[2] = AArch64Operand::CreateReg(zreg, false, Op->getStartLoc(), + Op->getEndLoc(), getContext()); + delete ImmOp; + } + } + + MCInst Inst; + // First try to match against the secondary set of tables containing the + // short-form NEON instructions (e.g. "fadd.2s v0, v1, v2"). + unsigned MatchResult = + MatchInstructionImpl(Operands, Inst, ErrorInfo, MatchingInlineAsm, 1); + + // If that fails, try against the alternate table containing long-form NEON: + // "fadd v0.2s, v1.2s, v2.2s" + if (MatchResult != Match_Success) + MatchResult = + MatchInstructionImpl(Operands, Inst, ErrorInfo, MatchingInlineAsm, 0); + + switch (MatchResult) { + case Match_Success: { + // Perform range checking and other semantic validations + SmallVector OperandLocs; + NumOperands = Operands.size(); + for (unsigned i = 1; i < NumOperands; ++i) + OperandLocs.push_back(Operands[i]->getStartLoc()); + if (validateInstruction(Inst, OperandLocs)) + return true; + + Inst.setLoc(IDLoc); + Out.EmitInstruction(Inst, STI); + return false; + } + case Match_MissingFeature: { + assert(ErrorInfo && "Unknown missing feature!"); + // Special case the error message for the very common case where only + // a single subtarget feature is missing (neon, e.g.). + std::string Msg = "instruction requires:"; + unsigned Mask = 1; + for (unsigned i = 0; i < (sizeof(ErrorInfo)*8-1); ++i) { + if (ErrorInfo & Mask) { + Msg += " "; + Msg += getSubtargetFeatureName(ErrorInfo & Mask); + } + Mask <<= 1; + } + return Error(IDLoc, Msg); + } + case Match_MnemonicFail: + return showMatchError(IDLoc, MatchResult); + case Match_InvalidOperand: { + SMLoc ErrorLoc = IDLoc; + if (ErrorInfo != ~0U) { + if (ErrorInfo >= Operands.size()) + return Error(IDLoc, "too few operands for instruction"); + + ErrorLoc = ((AArch64Operand *)Operands[ErrorInfo])->getStartLoc(); + if (ErrorLoc == SMLoc()) + ErrorLoc = IDLoc; + } + // If the match failed on a suffix token operand, tweak the diagnostic + // accordingly. 
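+ // (A "suffix" token is the arrangement suffix split off a short-form
+ // mnemonic, e.g. the ".2s" in "fadd.2s".)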
+ if (((AArch64Operand *)Operands[ErrorInfo])->isToken() && + ((AArch64Operand *)Operands[ErrorInfo])->isTokenSuffix()) + MatchResult = Match_InvalidSuffix; + + return showMatchError(ErrorLoc, MatchResult); + } + case Match_InvalidMemoryIndexed1: + case Match_InvalidMemoryIndexed2: + case Match_InvalidMemoryIndexed4: + case Match_InvalidMemoryIndexed8: + case Match_InvalidMemoryIndexed16: + case Match_InvalidCondCode: + case Match_AddSubRegExtendSmall: + case Match_AddSubRegExtendLarge: + case Match_AddSubSecondSource: + case Match_LogicalSecondSource: + case Match_AddSubRegShift32: + case Match_AddSubRegShift64: + case Match_InvalidMovImm32Shift: + case Match_InvalidMovImm64Shift: + case Match_InvalidFPImm: + case Match_InvalidMemoryWExtend8: + case Match_InvalidMemoryWExtend16: + case Match_InvalidMemoryWExtend32: + case Match_InvalidMemoryWExtend64: + case Match_InvalidMemoryWExtend128: + case Match_InvalidMemoryXExtend8: + case Match_InvalidMemoryXExtend16: + case Match_InvalidMemoryXExtend32: + case Match_InvalidMemoryXExtend64: + case Match_InvalidMemoryXExtend128: + case Match_InvalidMemoryIndexed4SImm7: + case Match_InvalidMemoryIndexed8SImm7: + case Match_InvalidMemoryIndexed16SImm7: + case Match_InvalidMemoryIndexedSImm9: + case Match_InvalidImm0_7: + case Match_InvalidImm0_15: + case Match_InvalidImm0_31: + case Match_InvalidImm0_63: + case Match_InvalidImm0_127: + case Match_InvalidImm0_65535: + case Match_InvalidImm1_8: + case Match_InvalidImm1_16: + case Match_InvalidImm1_32: + case Match_InvalidImm1_64: + case Match_InvalidIndex1: + case Match_InvalidIndexB: + case Match_InvalidIndexH: + case Match_InvalidIndexS: + case Match_InvalidIndexD: + case Match_InvalidLabel: + case Match_MSR: + case Match_MRS: { + // Any time we get here, there's nothing fancy to do. Just get the + // operand SMLoc and display the diagnostic. + SMLoc ErrorLoc = ((AArch64Operand *)Operands[ErrorInfo])->getStartLoc(); + if (ErrorLoc == SMLoc()) + ErrorLoc = IDLoc; + return showMatchError(ErrorLoc, MatchResult); + } + } + + llvm_unreachable("Implement any new match types added!"); + return true; +} + +/// ParseDirective parses the arm specific directives +bool AArch64AsmParser::ParseDirective(AsmToken DirectiveID) { + StringRef IDVal = DirectiveID.getIdentifier(); + SMLoc Loc = DirectiveID.getLoc(); + if (IDVal == ".hword") + return parseDirectiveWord(2, Loc); + if (IDVal == ".word") + return parseDirectiveWord(4, Loc); + if (IDVal == ".xword") + return parseDirectiveWord(8, Loc); + if (IDVal == ".tlsdesccall") + return parseDirectiveTLSDescCall(Loc); + + return parseDirectiveLOH(IDVal, Loc); +} + +/// parseDirectiveWord +/// ::= .word [ expression (, expression)* ] +bool AArch64AsmParser::parseDirectiveWord(unsigned Size, SMLoc L) { + if (getLexer().isNot(AsmToken::EndOfStatement)) { + for (;;) { + const MCExpr *Value; + if (getParser().parseExpression(Value)) + return true; + + getParser().getStreamer().EmitValue(Value, Size); + + if (getLexer().is(AsmToken::EndOfStatement)) + break; + + // FIXME: Improve diagnostic. 
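+ // (The location used below is the start of the directive rather than the
+ // offending token.)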
+      if (getLexer().isNot(AsmToken::Comma))
+        return Error(L, "unexpected token in directive");
+      Parser.Lex();
+    }
+  }
+
+  Parser.Lex();
+  return false;
+}
+
+// parseDirectiveTLSDescCall:
+//   ::= .tlsdesccall symbol
+bool AArch64AsmParser::parseDirectiveTLSDescCall(SMLoc L) {
+  StringRef Name;
+  if (getParser().parseIdentifier(Name))
+    return Error(L, "expected symbol after directive");
+
+  MCSymbol *Sym = getContext().GetOrCreateSymbol(Name);
+  const MCExpr *Expr = MCSymbolRefExpr::Create(Sym, getContext());
+  Expr = AArch64MCExpr::Create(Expr, AArch64MCExpr::VK_TLSDESC, getContext());
+
+  MCInst Inst;
+  Inst.setOpcode(AArch64::TLSDESCCALL);
+  Inst.addOperand(MCOperand::CreateExpr(Expr));
+
+  getParser().getStreamer().EmitInstruction(Inst, STI);
+  return false;
+}
+
+/// ::= .loh label1, ..., labelN
+/// The number of arguments depends on the loh identifier.
+bool AArch64AsmParser::parseDirectiveLOH(StringRef IDVal, SMLoc Loc) {
+  if (IDVal != MCLOHDirectiveName())
+    return true;
+  MCLOHType Kind;
+  if (getParser().getTok().isNot(AsmToken::Identifier)) {
+    if (getParser().getTok().isNot(AsmToken::Integer))
+      return TokError("expected an identifier or a number in directive");
+    // We successfully get a numeric value for the identifier.
+    // Check if it is valid.
+    int64_t Id = getParser().getTok().getIntVal();
+    Kind = (MCLOHType)Id;
+    // Check that Id does not overflow MCLOHType.
+    if (!isValidMCLOHType(Kind) || Id != Kind)
+      return TokError("invalid numeric identifier in directive");
+  } else {
+    StringRef Name = getTok().getIdentifier();
+    // We successfully parse an identifier.
+    // Check if it is a recognized one.
+    int Id = MCLOHNameToId(Name);
+
+    if (Id == -1)
+      return TokError("invalid identifier in directive");
+    Kind = (MCLOHType)Id;
+  }
+  // Consume the identifier.
+  Lex();
+  // Get the number of arguments of this LOH.
+  int NbArgs = MCLOHIdToNbArgs(Kind);
+
+  assert(NbArgs != -1 && "Invalid number of arguments");
+
+  SmallVector<MCSymbol *, 3> Args;
+  for (int Idx = 0; Idx < NbArgs; ++Idx) {
+    StringRef Name;
+    if (getParser().parseIdentifier(Name))
+      return TokError("expected identifier in directive");
+    Args.push_back(getContext().GetOrCreateSymbol(Name));
+
+    if (Idx + 1 == NbArgs)
+      break;
+    if (getLexer().isNot(AsmToken::Comma))
+      return TokError("unexpected token in '" + Twine(IDVal) + "' directive");
+    Lex();
+  }
+  if (getLexer().isNot(AsmToken::EndOfStatement))
+    return TokError("unexpected token in '" + Twine(IDVal) + "' directive");
+
+  getStreamer().EmitLOHDirective((MCLOHType)Kind, Args);
+  return false;
+}
+
+bool
+AArch64AsmParser::classifySymbolRef(const MCExpr *Expr,
+                                    AArch64MCExpr::VariantKind &ELFRefKind,
+                                    MCSymbolRefExpr::VariantKind &DarwinRefKind,
+                                    int64_t &Addend) {
+  ELFRefKind = AArch64MCExpr::VK_INVALID;
+  DarwinRefKind = MCSymbolRefExpr::VK_None;
+  Addend = 0;
+
+  if (const AArch64MCExpr *AE = dyn_cast<AArch64MCExpr>(Expr)) {
+    ELFRefKind = AE->getKind();
+    Expr = AE->getSubExpr();
+  }
+
+  const MCSymbolRefExpr *SE = dyn_cast<MCSymbolRefExpr>(Expr);
+  if (SE) {
+    // It's a simple symbol reference with no addend.
+    DarwinRefKind = SE->getKind();
+    return true;
+  }
+
+  const MCBinaryExpr *BE = dyn_cast<MCBinaryExpr>(Expr);
+  if (!BE)
+    return false;
+
+  SE = dyn_cast<MCSymbolRefExpr>(BE->getLHS());
+  if (!SE)
+    return false;
+  DarwinRefKind = SE->getKind();
+
+  if (BE->getOpcode() != MCBinaryExpr::Add &&
+      BE->getOpcode() != MCBinaryExpr::Sub)
+    return false;
+
+  // See if the addend is a constant, otherwise there's more going
+  // on here than we can deal with.
+  auto AddendExpr = dyn_cast<MCConstantExpr>(BE->getRHS());
+  if (!AddendExpr)
+    return false;
+
+  Addend = AddendExpr->getValue();
+  if (BE->getOpcode() == MCBinaryExpr::Sub)
+    Addend = -Addend;
+
+  // It's some symbol reference + a constant addend, but really
+  // shouldn't use both Darwin and ELF syntax.
+  return ELFRefKind == AArch64MCExpr::VK_INVALID ||
+         DarwinRefKind == MCSymbolRefExpr::VK_None;
+}
+
+/// Force static initialization.
+extern "C" void LLVMInitializeAArch64AsmParser() {
+  RegisterMCAsmParser<AArch64AsmParser> X(TheAArch64leTarget);
+  RegisterMCAsmParser<AArch64AsmParser> Y(TheAArch64beTarget);
+
+  RegisterMCAsmParser<AArch64AsmParser> Z(TheARM64leTarget);
+  RegisterMCAsmParser<AArch64AsmParser> W(TheARM64beTarget);
+}
+
+#define GET_REGISTER_MATCHER
+#define GET_SUBTARGET_FEATURE_NAME
+#define GET_MATCHER_IMPLEMENTATION
+#include "AArch64GenAsmMatcher.inc"
+
+// Define this matcher function after the auto-generated include so we
+// have the match class enum definitions.
+unsigned AArch64AsmParser::validateTargetOperandClass(MCParsedAsmOperand *AsmOp,
+                                                      unsigned Kind) {
+  AArch64Operand *Op = static_cast<AArch64Operand *>(AsmOp);
+  // If the kind is a token for a literal immediate, check if our asm
+  // operand matches. This is for InstAliases which have a fixed-value
+  // immediate in the syntax.
+  int64_t ExpectedVal;
+  switch (Kind) {
+  default:
+    return Match_InvalidOperand;
+  case MCK__35_0:
+    ExpectedVal = 0;
+    break;
+  case MCK__35_1:
+    ExpectedVal = 1;
+    break;
+  case MCK__35_12:
+    ExpectedVal = 12;
+    break;
+  case MCK__35_16:
+    ExpectedVal = 16;
+    break;
+  case MCK__35_2:
+    ExpectedVal = 2;
+    break;
+  case MCK__35_24:
+    ExpectedVal = 24;
+    break;
+  case MCK__35_3:
+    ExpectedVal = 3;
+    break;
+  case MCK__35_32:
+    ExpectedVal = 32;
+    break;
+  case MCK__35_4:
+    ExpectedVal = 4;
+    break;
+  case MCK__35_48:
+    ExpectedVal = 48;
+    break;
+  case MCK__35_6:
+    ExpectedVal = 6;
+    break;
+  case MCK__35_64:
+    ExpectedVal = 64;
+    break;
+  case MCK__35_8:
+    ExpectedVal = 8;
+    break;
+  }
+  if (!Op->isImm())
+    return Match_InvalidOperand;
+  const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(Op->getImm());
+  if (!CE)
+    return Match_InvalidOperand;
+  if (CE->getValue() == ExpectedVal)
+    return Match_Success;
+  return Match_InvalidOperand;
+}
diff --git a/lib/Target/AArch64/AsmParser/CMakeLists.txt b/lib/Target/AArch64/AsmParser/CMakeLists.txt
new file mode 100644
index 00000000000..cc0a9d86a14
--- /dev/null
+++ b/lib/Target/AArch64/AsmParser/CMakeLists.txt
@@ -0,0 +1,6 @@
+include_directories( ${CMAKE_CURRENT_BINARY_DIR}/.. ${CMAKE_CURRENT_SOURCE_DIR}/.. )
+
+add_llvm_library(LLVMAArch64AsmParser
+  AArch64AsmParser.cpp
+  )
+
diff --git a/lib/Target/AArch64/AsmParser/LLVMBuild.txt b/lib/Target/AArch64/AsmParser/LLVMBuild.txt
new file mode 100644
index 00000000000..11eb9d55f61
--- /dev/null
+++ b/lib/Target/AArch64/AsmParser/LLVMBuild.txt
@@ -0,0 +1,23 @@
+;===- ./lib/Target/AArch64/AsmParser/LLVMBuild.txt ---------------*- Conf -*--===;
+;
+; The LLVM Compiler Infrastructure
+;
+; This file is distributed under the University of Illinois Open Source
+; License. See LICENSE.TXT for details.
+;
+;===------------------------------------------------------------------------===;
+;
+; This is an LLVMBuild description file for the components in this subdirectory.
+; +; For more information on the LLVMBuild system, please see: +; +; http://llvm.org/docs/LLVMBuild.html +; +;===------------------------------------------------------------------------===; + +[component_0] +type = Library +name = AArch64AsmParser +parent = AArch64 +required_libraries = AArch64Desc AArch64Info AArch64Utils MC MCParser Support +add_to_library_groups = AArch64 diff --git a/lib/Target/AArch64/AsmParser/Makefile b/lib/Target/AArch64/AsmParser/Makefile new file mode 100644 index 00000000000..00268c76f8e --- /dev/null +++ b/lib/Target/AArch64/AsmParser/Makefile @@ -0,0 +1,15 @@ +##===- lib/Target/AArch64/AsmParser/Makefile ---------------*- Makefile -*-===## +# +# The LLVM Compiler Infrastructure +# +# This file is distributed under the University of Illinois Open Source +# License. See LICENSE.TXT for details. +# +##===----------------------------------------------------------------------===## +LEVEL = ../../../.. +LIBRARYNAME = LLVMAArch64AsmParser + +# Hack: we need to include 'main' ARM target directory to grab private headers +CPP.Flags += -I$(PROJ_OBJ_DIR)/.. -I$(PROJ_SRC_DIR)/.. + +include $(LEVEL)/Makefile.common diff --git a/lib/Target/AArch64/CMakeLists.txt b/lib/Target/AArch64/CMakeLists.txt new file mode 100644 index 00000000000..789d549bb15 --- /dev/null +++ b/lib/Target/AArch64/CMakeLists.txt @@ -0,0 +1,51 @@ +set(LLVM_TARGET_DEFINITIONS AArch64.td) + +tablegen(LLVM AArch64GenRegisterInfo.inc -gen-register-info) +tablegen(LLVM AArch64GenInstrInfo.inc -gen-instr-info) +tablegen(LLVM AArch64GenMCCodeEmitter.inc -gen-emitter -mc-emitter) +tablegen(LLVM AArch64GenMCPseudoLowering.inc -gen-pseudo-lowering) +tablegen(LLVM AArch64GenAsmWriter.inc -gen-asm-writer) +tablegen(LLVM AArch64GenAsmWriter1.inc -gen-asm-writer -asmwriternum=1) +tablegen(LLVM AArch64GenAsmMatcher.inc -gen-asm-matcher) +tablegen(LLVM AArch64GenDAGISel.inc -gen-dag-isel) +tablegen(LLVM AArch64GenFastISel.inc -gen-fast-isel) +tablegen(LLVM AArch64GenCallingConv.inc -gen-callingconv) +tablegen(LLVM AArch64GenSubtargetInfo.inc -gen-subtarget) +tablegen(LLVM AArch64GenDisassemblerTables.inc -gen-disassembler) +add_public_tablegen_target(AArch64CommonTableGen) + +add_llvm_target(AArch64CodeGen + AArch64AddressTypePromotion.cpp + AArch64AdvSIMDScalarPass.cpp + AArch64AsmPrinter.cpp + AArch64BranchRelaxation.cpp + AArch64CleanupLocalDynamicTLSPass.cpp + AArch64CollectLOH.cpp + AArch64ConditionalCompares.cpp + AArch64DeadRegisterDefinitionsPass.cpp + AArch64ExpandPseudoInsts.cpp + AArch64FastISel.cpp + AArch64FrameLowering.cpp + AArch64ISelDAGToDAG.cpp + AArch64ISelLowering.cpp + AArch64InstrInfo.cpp + AArch64LoadStoreOptimizer.cpp + AArch64MCInstLower.cpp + AArch64PromoteConstant.cpp + AArch64RegisterInfo.cpp + AArch64SelectionDAGInfo.cpp + AArch64StorePairSuppress.cpp + AArch64Subtarget.cpp + AArch64TargetMachine.cpp + AArch64TargetObjectFile.cpp + AArch64TargetTransformInfo.cpp +) + +add_dependencies(LLVMAArch64CodeGen intrinsics_gen) + +add_subdirectory(TargetInfo) +add_subdirectory(AsmParser) +add_subdirectory(Disassembler) +add_subdirectory(InstPrinter) +add_subdirectory(MCTargetDesc) +add_subdirectory(Utils) diff --git a/lib/Target/AArch64/Disassembler/AArch64Disassembler.cpp b/lib/Target/AArch64/Disassembler/AArch64Disassembler.cpp new file mode 100644 index 00000000000..6de27d6d51a --- /dev/null +++ b/lib/Target/AArch64/Disassembler/AArch64Disassembler.cpp @@ -0,0 +1,1559 @@ +//===- AArch64Disassembler.cpp - Disassembler for AArch64 -------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure 
+// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// +//===----------------------------------------------------------------------===// + +#include "AArch64Disassembler.h" +#include "AArch64ExternalSymbolizer.h" +#include "AArch64Subtarget.h" +#include "MCTargetDesc/AArch64AddressingModes.h" +#include "Utils/AArch64BaseInfo.h" +#include "llvm/MC/MCInst.h" +#include "llvm/MC/MCFixedLenDisassembler.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/MemoryObject.h" +#include "llvm/Support/TargetRegistry.h" +#include "llvm/Support/ErrorHandling.h" + +using namespace llvm; + +#define DEBUG_TYPE "aarch64-disassembler" + +// Pull DecodeStatus and its enum values into the global namespace. +typedef llvm::MCDisassembler::DecodeStatus DecodeStatus; + +// Forward declare these because the autogenerated code will reference them. +// Definitions are further down. +static DecodeStatus DecodeFPR128RegisterClass(llvm::MCInst &Inst, + unsigned RegNo, uint64_t Address, + const void *Decoder); +static DecodeStatus DecodeFPR128_loRegisterClass(llvm::MCInst &Inst, + unsigned RegNo, + uint64_t Address, + const void *Decoder); +static DecodeStatus DecodeFPR64RegisterClass(llvm::MCInst &Inst, unsigned RegNo, + uint64_t Address, + const void *Decoder); +static DecodeStatus DecodeFPR32RegisterClass(llvm::MCInst &Inst, unsigned RegNo, + uint64_t Address, + const void *Decoder); +static DecodeStatus DecodeFPR16RegisterClass(llvm::MCInst &Inst, unsigned RegNo, + uint64_t Address, + const void *Decoder); +static DecodeStatus DecodeFPR8RegisterClass(llvm::MCInst &Inst, unsigned RegNo, + uint64_t Address, + const void *Decoder); +static DecodeStatus DecodeGPR64RegisterClass(llvm::MCInst &Inst, unsigned RegNo, + uint64_t Address, + const void *Decoder); +static DecodeStatus DecodeGPR64spRegisterClass(llvm::MCInst &Inst, + unsigned RegNo, uint64_t Address, + const void *Decoder); +static DecodeStatus DecodeGPR32RegisterClass(llvm::MCInst &Inst, unsigned RegNo, + uint64_t Address, + const void *Decoder); +static DecodeStatus DecodeGPR32spRegisterClass(llvm::MCInst &Inst, + unsigned RegNo, uint64_t Address, + const void *Decoder); +static DecodeStatus DecodeQQRegisterClass(llvm::MCInst &Inst, unsigned RegNo, + uint64_t Address, + const void *Decoder); +static DecodeStatus DecodeQQQRegisterClass(llvm::MCInst &Inst, unsigned RegNo, + uint64_t Address, + const void *Decoder); +static DecodeStatus DecodeQQQQRegisterClass(llvm::MCInst &Inst, unsigned RegNo, + uint64_t Address, + const void *Decoder); +static DecodeStatus DecodeDDRegisterClass(llvm::MCInst &Inst, unsigned RegNo, + uint64_t Address, + const void *Decoder); +static DecodeStatus DecodeDDDRegisterClass(llvm::MCInst &Inst, unsigned RegNo, + uint64_t Address, + const void *Decoder); +static DecodeStatus DecodeDDDDRegisterClass(llvm::MCInst &Inst, unsigned RegNo, + uint64_t Address, + const void *Decoder); + +static DecodeStatus DecodeFixedPointScaleImm32(llvm::MCInst &Inst, unsigned Imm, + uint64_t Address, + const void *Decoder); +static DecodeStatus DecodeFixedPointScaleImm64(llvm::MCInst &Inst, unsigned Imm, + uint64_t Address, + const void *Decoder); +static DecodeStatus DecodePCRelLabel19(llvm::MCInst &Inst, unsigned Imm, + uint64_t Address, const void *Decoder); +static DecodeStatus DecodeMemExtend(llvm::MCInst &Inst, unsigned Imm, + uint64_t Address, const void *Decoder); +static DecodeStatus 
DecodeMRSSystemRegister(llvm::MCInst &Inst, unsigned Imm, + uint64_t Address, const void *Decoder); +static DecodeStatus DecodeMSRSystemRegister(llvm::MCInst &Inst, unsigned Imm, + uint64_t Address, const void *Decoder); +static DecodeStatus DecodeThreeAddrSRegInstruction(llvm::MCInst &Inst, + uint32_t insn, + uint64_t Address, + const void *Decoder); +static DecodeStatus DecodeMoveImmInstruction(llvm::MCInst &Inst, uint32_t insn, + uint64_t Address, + const void *Decoder); +static DecodeStatus DecodeUnsignedLdStInstruction(llvm::MCInst &Inst, + uint32_t insn, + uint64_t Address, + const void *Decoder); +static DecodeStatus DecodeSignedLdStInstruction(llvm::MCInst &Inst, + uint32_t insn, uint64_t Address, + const void *Decoder); +static DecodeStatus DecodeExclusiveLdStInstruction(llvm::MCInst &Inst, + uint32_t insn, + uint64_t Address, + const void *Decoder); +static DecodeStatus DecodePairLdStInstruction(llvm::MCInst &Inst, uint32_t insn, + uint64_t Address, + const void *Decoder); +static DecodeStatus DecodeAddSubERegInstruction(llvm::MCInst &Inst, + uint32_t insn, uint64_t Address, + const void *Decoder); +static DecodeStatus DecodeLogicalImmInstruction(llvm::MCInst &Inst, + uint32_t insn, uint64_t Address, + const void *Decoder); +static DecodeStatus DecodeModImmInstruction(llvm::MCInst &Inst, uint32_t insn, + uint64_t Address, + const void *Decoder); +static DecodeStatus DecodeModImmTiedInstruction(llvm::MCInst &Inst, + uint32_t insn, uint64_t Address, + const void *Decoder); +static DecodeStatus DecodeAdrInstruction(llvm::MCInst &Inst, uint32_t insn, + uint64_t Address, const void *Decoder); +static DecodeStatus DecodeBaseAddSubImm(llvm::MCInst &Inst, uint32_t insn, + uint64_t Address, const void *Decoder); +static DecodeStatus DecodeUnconditionalBranch(llvm::MCInst &Inst, uint32_t insn, + uint64_t Address, + const void *Decoder); +static DecodeStatus DecodeSystemPStateInstruction(llvm::MCInst &Inst, + uint32_t insn, + uint64_t Address, + const void *Decoder); +static DecodeStatus DecodeTestAndBranch(llvm::MCInst &Inst, uint32_t insn, + uint64_t Address, const void *Decoder); + +static DecodeStatus DecodeFMOVLaneInstruction(llvm::MCInst &Inst, unsigned Insn, + uint64_t Address, + const void *Decoder); +static DecodeStatus DecodeVecShiftR64Imm(llvm::MCInst &Inst, unsigned Imm, + uint64_t Addr, const void *Decoder); +static DecodeStatus DecodeVecShiftR64ImmNarrow(llvm::MCInst &Inst, unsigned Imm, + uint64_t Addr, + const void *Decoder); +static DecodeStatus DecodeVecShiftR32Imm(llvm::MCInst &Inst, unsigned Imm, + uint64_t Addr, const void *Decoder); +static DecodeStatus DecodeVecShiftR32ImmNarrow(llvm::MCInst &Inst, unsigned Imm, + uint64_t Addr, + const void *Decoder); +static DecodeStatus DecodeVecShiftR16Imm(llvm::MCInst &Inst, unsigned Imm, + uint64_t Addr, const void *Decoder); +static DecodeStatus DecodeVecShiftR16ImmNarrow(llvm::MCInst &Inst, unsigned Imm, + uint64_t Addr, + const void *Decoder); +static DecodeStatus DecodeVecShiftR8Imm(llvm::MCInst &Inst, unsigned Imm, + uint64_t Addr, const void *Decoder); +static DecodeStatus DecodeVecShiftL64Imm(llvm::MCInst &Inst, unsigned Imm, + uint64_t Addr, const void *Decoder); +static DecodeStatus DecodeVecShiftL32Imm(llvm::MCInst &Inst, unsigned Imm, + uint64_t Addr, const void *Decoder); +static DecodeStatus DecodeVecShiftL16Imm(llvm::MCInst &Inst, unsigned Imm, + uint64_t Addr, const void *Decoder); +static DecodeStatus DecodeVecShiftL8Imm(llvm::MCInst &Inst, unsigned Imm, + uint64_t Addr, const void *Decoder); + +static bool 
Check(DecodeStatus &Out, DecodeStatus In) { + switch (In) { + case MCDisassembler::Success: + // Out stays the same. + return true; + case MCDisassembler::SoftFail: + Out = In; + return true; + case MCDisassembler::Fail: + Out = In; + return false; + } + llvm_unreachable("Invalid DecodeStatus!"); +} + +#include "AArch64GenDisassemblerTables.inc" +#include "AArch64GenInstrInfo.inc" + +#define Success llvm::MCDisassembler::Success +#define Fail llvm::MCDisassembler::Fail +#define SoftFail llvm::MCDisassembler::SoftFail + +static MCDisassembler *createAArch64Disassembler(const Target &T, + const MCSubtargetInfo &STI, + MCContext &Ctx) { + return new AArch64Disassembler(STI, Ctx); +} + +DecodeStatus AArch64Disassembler::getInstruction(MCInst &MI, uint64_t &Size, + const MemoryObject &Region, + uint64_t Address, + raw_ostream &os, + raw_ostream &cs) const { + CommentStream = &cs; + + uint8_t bytes[4]; + + Size = 0; + // We want to read exactly 4 bytes of data. + if (Region.readBytes(Address, 4, (uint8_t *)bytes) == -1) + return Fail; + Size = 4; + + // Encoded as a small-endian 32-bit word in the stream. + uint32_t insn = + (bytes[3] << 24) | (bytes[2] << 16) | (bytes[1] << 8) | (bytes[0] << 0); + + // Calling the auto-generated decoder function. + return decodeInstruction(DecoderTable32, MI, insn, Address, this, STI); +} + +static MCSymbolizer * +createAArch64ExternalSymbolizer(StringRef TT, LLVMOpInfoCallback GetOpInfo, + LLVMSymbolLookupCallback SymbolLookUp, + void *DisInfo, MCContext *Ctx, + MCRelocationInfo *RelInfo) { + return new llvm::AArch64ExternalSymbolizer( + *Ctx, + std::unique_ptr(RelInfo), + GetOpInfo, SymbolLookUp, DisInfo); +} + +extern "C" void LLVMInitializeAArch64Disassembler() { + TargetRegistry::RegisterMCDisassembler(TheAArch64leTarget, + createAArch64Disassembler); + TargetRegistry::RegisterMCDisassembler(TheAArch64beTarget, + createAArch64Disassembler); + TargetRegistry::RegisterMCSymbolizer(TheAArch64leTarget, + createAArch64ExternalSymbolizer); + TargetRegistry::RegisterMCSymbolizer(TheAArch64beTarget, + createAArch64ExternalSymbolizer); + + TargetRegistry::RegisterMCDisassembler(TheARM64leTarget, + createAArch64Disassembler); + TargetRegistry::RegisterMCDisassembler(TheARM64beTarget, + createAArch64Disassembler); + TargetRegistry::RegisterMCSymbolizer(TheARM64leTarget, + createAArch64ExternalSymbolizer); + TargetRegistry::RegisterMCSymbolizer(TheARM64beTarget, + createAArch64ExternalSymbolizer); +} + +static const unsigned FPR128DecoderTable[] = { + AArch64::Q0, AArch64::Q1, AArch64::Q2, AArch64::Q3, AArch64::Q4, + AArch64::Q5, AArch64::Q6, AArch64::Q7, AArch64::Q8, AArch64::Q9, + AArch64::Q10, AArch64::Q11, AArch64::Q12, AArch64::Q13, AArch64::Q14, + AArch64::Q15, AArch64::Q16, AArch64::Q17, AArch64::Q18, AArch64::Q19, + AArch64::Q20, AArch64::Q21, AArch64::Q22, AArch64::Q23, AArch64::Q24, + AArch64::Q25, AArch64::Q26, AArch64::Q27, AArch64::Q28, AArch64::Q29, + AArch64::Q30, AArch64::Q31 +}; + +static DecodeStatus DecodeFPR128RegisterClass(MCInst &Inst, unsigned RegNo, + uint64_t Addr, + const void *Decoder) { + if (RegNo > 31) + return Fail; + + unsigned Register = FPR128DecoderTable[RegNo]; + Inst.addOperand(MCOperand::CreateReg(Register)); + return Success; +} + +static DecodeStatus DecodeFPR128_loRegisterClass(MCInst &Inst, unsigned RegNo, + uint64_t Addr, + const void *Decoder) { + if (RegNo > 15) + return Fail; + return DecodeFPR128RegisterClass(Inst, RegNo, Addr, Decoder); +} + +static const unsigned FPR64DecoderTable[] = { + AArch64::D0, AArch64::D1, 
AArch64::D2, AArch64::D3, AArch64::D4, + AArch64::D5, AArch64::D6, AArch64::D7, AArch64::D8, AArch64::D9, + AArch64::D10, AArch64::D11, AArch64::D12, AArch64::D13, AArch64::D14, + AArch64::D15, AArch64::D16, AArch64::D17, AArch64::D18, AArch64::D19, + AArch64::D20, AArch64::D21, AArch64::D22, AArch64::D23, AArch64::D24, + AArch64::D25, AArch64::D26, AArch64::D27, AArch64::D28, AArch64::D29, + AArch64::D30, AArch64::D31 +}; + +static DecodeStatus DecodeFPR64RegisterClass(MCInst &Inst, unsigned RegNo, + uint64_t Addr, + const void *Decoder) { + if (RegNo > 31) + return Fail; + + unsigned Register = FPR64DecoderTable[RegNo]; + Inst.addOperand(MCOperand::CreateReg(Register)); + return Success; +} + +static const unsigned FPR32DecoderTable[] = { + AArch64::S0, AArch64::S1, AArch64::S2, AArch64::S3, AArch64::S4, + AArch64::S5, AArch64::S6, AArch64::S7, AArch64::S8, AArch64::S9, + AArch64::S10, AArch64::S11, AArch64::S12, AArch64::S13, AArch64::S14, + AArch64::S15, AArch64::S16, AArch64::S17, AArch64::S18, AArch64::S19, + AArch64::S20, AArch64::S21, AArch64::S22, AArch64::S23, AArch64::S24, + AArch64::S25, AArch64::S26, AArch64::S27, AArch64::S28, AArch64::S29, + AArch64::S30, AArch64::S31 +}; + +static DecodeStatus DecodeFPR32RegisterClass(MCInst &Inst, unsigned RegNo, + uint64_t Addr, + const void *Decoder) { + if (RegNo > 31) + return Fail; + + unsigned Register = FPR32DecoderTable[RegNo]; + Inst.addOperand(MCOperand::CreateReg(Register)); + return Success; +} + +static const unsigned FPR16DecoderTable[] = { + AArch64::H0, AArch64::H1, AArch64::H2, AArch64::H3, AArch64::H4, + AArch64::H5, AArch64::H6, AArch64::H7, AArch64::H8, AArch64::H9, + AArch64::H10, AArch64::H11, AArch64::H12, AArch64::H13, AArch64::H14, + AArch64::H15, AArch64::H16, AArch64::H17, AArch64::H18, AArch64::H19, + AArch64::H20, AArch64::H21, AArch64::H22, AArch64::H23, AArch64::H24, + AArch64::H25, AArch64::H26, AArch64::H27, AArch64::H28, AArch64::H29, + AArch64::H30, AArch64::H31 +}; + +static DecodeStatus DecodeFPR16RegisterClass(MCInst &Inst, unsigned RegNo, + uint64_t Addr, + const void *Decoder) { + if (RegNo > 31) + return Fail; + + unsigned Register = FPR16DecoderTable[RegNo]; + Inst.addOperand(MCOperand::CreateReg(Register)); + return Success; +} + +static const unsigned FPR8DecoderTable[] = { + AArch64::B0, AArch64::B1, AArch64::B2, AArch64::B3, AArch64::B4, + AArch64::B5, AArch64::B6, AArch64::B7, AArch64::B8, AArch64::B9, + AArch64::B10, AArch64::B11, AArch64::B12, AArch64::B13, AArch64::B14, + AArch64::B15, AArch64::B16, AArch64::B17, AArch64::B18, AArch64::B19, + AArch64::B20, AArch64::B21, AArch64::B22, AArch64::B23, AArch64::B24, + AArch64::B25, AArch64::B26, AArch64::B27, AArch64::B28, AArch64::B29, + AArch64::B30, AArch64::B31 +}; + +static DecodeStatus DecodeFPR8RegisterClass(MCInst &Inst, unsigned RegNo, + uint64_t Addr, + const void *Decoder) { + if (RegNo > 31) + return Fail; + + unsigned Register = FPR8DecoderTable[RegNo]; + Inst.addOperand(MCOperand::CreateReg(Register)); + return Success; +} + +static const unsigned GPR64DecoderTable[] = { + AArch64::X0, AArch64::X1, AArch64::X2, AArch64::X3, AArch64::X4, + AArch64::X5, AArch64::X6, AArch64::X7, AArch64::X8, AArch64::X9, + AArch64::X10, AArch64::X11, AArch64::X12, AArch64::X13, AArch64::X14, + AArch64::X15, AArch64::X16, AArch64::X17, AArch64::X18, AArch64::X19, + AArch64::X20, AArch64::X21, AArch64::X22, AArch64::X23, AArch64::X24, + AArch64::X25, AArch64::X26, AArch64::X27, AArch64::X28, AArch64::FP, + AArch64::LR, AArch64::XZR +}; + +static 
DecodeStatus DecodeGPR64RegisterClass(MCInst &Inst, unsigned RegNo, + uint64_t Addr, + const void *Decoder) { + if (RegNo > 31) + return Fail; + + unsigned Register = GPR64DecoderTable[RegNo]; + Inst.addOperand(MCOperand::CreateReg(Register)); + return Success; +} + +static DecodeStatus DecodeGPR64spRegisterClass(MCInst &Inst, unsigned RegNo, + uint64_t Addr, + const void *Decoder) { + if (RegNo > 31) + return Fail; + unsigned Register = GPR64DecoderTable[RegNo]; + if (Register == AArch64::XZR) + Register = AArch64::SP; + Inst.addOperand(MCOperand::CreateReg(Register)); + return Success; +} + +static const unsigned GPR32DecoderTable[] = { + AArch64::W0, AArch64::W1, AArch64::W2, AArch64::W3, AArch64::W4, + AArch64::W5, AArch64::W6, AArch64::W7, AArch64::W8, AArch64::W9, + AArch64::W10, AArch64::W11, AArch64::W12, AArch64::W13, AArch64::W14, + AArch64::W15, AArch64::W16, AArch64::W17, AArch64::W18, AArch64::W19, + AArch64::W20, AArch64::W21, AArch64::W22, AArch64::W23, AArch64::W24, + AArch64::W25, AArch64::W26, AArch64::W27, AArch64::W28, AArch64::W29, + AArch64::W30, AArch64::WZR +}; + +static DecodeStatus DecodeGPR32RegisterClass(MCInst &Inst, unsigned RegNo, + uint64_t Addr, + const void *Decoder) { + if (RegNo > 31) + return Fail; + + unsigned Register = GPR32DecoderTable[RegNo]; + Inst.addOperand(MCOperand::CreateReg(Register)); + return Success; +} + +static DecodeStatus DecodeGPR32spRegisterClass(MCInst &Inst, unsigned RegNo, + uint64_t Addr, + const void *Decoder) { + if (RegNo > 31) + return Fail; + + unsigned Register = GPR32DecoderTable[RegNo]; + if (Register == AArch64::WZR) + Register = AArch64::WSP; + Inst.addOperand(MCOperand::CreateReg(Register)); + return Success; +} + +static const unsigned VectorDecoderTable[] = { + AArch64::Q0, AArch64::Q1, AArch64::Q2, AArch64::Q3, AArch64::Q4, + AArch64::Q5, AArch64::Q6, AArch64::Q7, AArch64::Q8, AArch64::Q9, + AArch64::Q10, AArch64::Q11, AArch64::Q12, AArch64::Q13, AArch64::Q14, + AArch64::Q15, AArch64::Q16, AArch64::Q17, AArch64::Q18, AArch64::Q19, + AArch64::Q20, AArch64::Q21, AArch64::Q22, AArch64::Q23, AArch64::Q24, + AArch64::Q25, AArch64::Q26, AArch64::Q27, AArch64::Q28, AArch64::Q29, + AArch64::Q30, AArch64::Q31 +}; + +static DecodeStatus DecodeVectorRegisterClass(MCInst &Inst, unsigned RegNo, + uint64_t Addr, + const void *Decoder) { + if (RegNo > 31) + return Fail; + + unsigned Register = VectorDecoderTable[RegNo]; + Inst.addOperand(MCOperand::CreateReg(Register)); + return Success; +} + +static const unsigned QQDecoderTable[] = { + AArch64::Q0_Q1, AArch64::Q1_Q2, AArch64::Q2_Q3, AArch64::Q3_Q4, + AArch64::Q4_Q5, AArch64::Q5_Q6, AArch64::Q6_Q7, AArch64::Q7_Q8, + AArch64::Q8_Q9, AArch64::Q9_Q10, AArch64::Q10_Q11, AArch64::Q11_Q12, + AArch64::Q12_Q13, AArch64::Q13_Q14, AArch64::Q14_Q15, AArch64::Q15_Q16, + AArch64::Q16_Q17, AArch64::Q17_Q18, AArch64::Q18_Q19, AArch64::Q19_Q20, + AArch64::Q20_Q21, AArch64::Q21_Q22, AArch64::Q22_Q23, AArch64::Q23_Q24, + AArch64::Q24_Q25, AArch64::Q25_Q26, AArch64::Q26_Q27, AArch64::Q27_Q28, + AArch64::Q28_Q29, AArch64::Q29_Q30, AArch64::Q30_Q31, AArch64::Q31_Q0 +}; + +static DecodeStatus DecodeQQRegisterClass(MCInst &Inst, unsigned RegNo, + uint64_t Addr, const void *Decoder) { + if (RegNo > 31) + return Fail; + unsigned Register = QQDecoderTable[RegNo]; + Inst.addOperand(MCOperand::CreateReg(Register)); + return Success; +} + +static const unsigned QQQDecoderTable[] = { + AArch64::Q0_Q1_Q2, AArch64::Q1_Q2_Q3, AArch64::Q2_Q3_Q4, + AArch64::Q3_Q4_Q5, AArch64::Q4_Q5_Q6, AArch64::Q5_Q6_Q7, + 
AArch64::Q6_Q7_Q8, AArch64::Q7_Q8_Q9, AArch64::Q8_Q9_Q10, + AArch64::Q9_Q10_Q11, AArch64::Q10_Q11_Q12, AArch64::Q11_Q12_Q13, + AArch64::Q12_Q13_Q14, AArch64::Q13_Q14_Q15, AArch64::Q14_Q15_Q16, + AArch64::Q15_Q16_Q17, AArch64::Q16_Q17_Q18, AArch64::Q17_Q18_Q19, + AArch64::Q18_Q19_Q20, AArch64::Q19_Q20_Q21, AArch64::Q20_Q21_Q22, + AArch64::Q21_Q22_Q23, AArch64::Q22_Q23_Q24, AArch64::Q23_Q24_Q25, + AArch64::Q24_Q25_Q26, AArch64::Q25_Q26_Q27, AArch64::Q26_Q27_Q28, + AArch64::Q27_Q28_Q29, AArch64::Q28_Q29_Q30, AArch64::Q29_Q30_Q31, + AArch64::Q30_Q31_Q0, AArch64::Q31_Q0_Q1 +}; + +static DecodeStatus DecodeQQQRegisterClass(MCInst &Inst, unsigned RegNo, + uint64_t Addr, const void *Decoder) { + if (RegNo > 31) + return Fail; + unsigned Register = QQQDecoderTable[RegNo]; + Inst.addOperand(MCOperand::CreateReg(Register)); + return Success; +} + +static const unsigned QQQQDecoderTable[] = { + AArch64::Q0_Q1_Q2_Q3, AArch64::Q1_Q2_Q3_Q4, AArch64::Q2_Q3_Q4_Q5, + AArch64::Q3_Q4_Q5_Q6, AArch64::Q4_Q5_Q6_Q7, AArch64::Q5_Q6_Q7_Q8, + AArch64::Q6_Q7_Q8_Q9, AArch64::Q7_Q8_Q9_Q10, AArch64::Q8_Q9_Q10_Q11, + AArch64::Q9_Q10_Q11_Q12, AArch64::Q10_Q11_Q12_Q13, AArch64::Q11_Q12_Q13_Q14, + AArch64::Q12_Q13_Q14_Q15, AArch64::Q13_Q14_Q15_Q16, AArch64::Q14_Q15_Q16_Q17, + AArch64::Q15_Q16_Q17_Q18, AArch64::Q16_Q17_Q18_Q19, AArch64::Q17_Q18_Q19_Q20, + AArch64::Q18_Q19_Q20_Q21, AArch64::Q19_Q20_Q21_Q22, AArch64::Q20_Q21_Q22_Q23, + AArch64::Q21_Q22_Q23_Q24, AArch64::Q22_Q23_Q24_Q25, AArch64::Q23_Q24_Q25_Q26, + AArch64::Q24_Q25_Q26_Q27, AArch64::Q25_Q26_Q27_Q28, AArch64::Q26_Q27_Q28_Q29, + AArch64::Q27_Q28_Q29_Q30, AArch64::Q28_Q29_Q30_Q31, AArch64::Q29_Q30_Q31_Q0, + AArch64::Q30_Q31_Q0_Q1, AArch64::Q31_Q0_Q1_Q2 +}; + +static DecodeStatus DecodeQQQQRegisterClass(MCInst &Inst, unsigned RegNo, + uint64_t Addr, + const void *Decoder) { + if (RegNo > 31) + return Fail; + unsigned Register = QQQQDecoderTable[RegNo]; + Inst.addOperand(MCOperand::CreateReg(Register)); + return Success; +} + +static const unsigned DDDecoderTable[] = { + AArch64::D0_D1, AArch64::D1_D2, AArch64::D2_D3, AArch64::D3_D4, + AArch64::D4_D5, AArch64::D5_D6, AArch64::D6_D7, AArch64::D7_D8, + AArch64::D8_D9, AArch64::D9_D10, AArch64::D10_D11, AArch64::D11_D12, + AArch64::D12_D13, AArch64::D13_D14, AArch64::D14_D15, AArch64::D15_D16, + AArch64::D16_D17, AArch64::D17_D18, AArch64::D18_D19, AArch64::D19_D20, + AArch64::D20_D21, AArch64::D21_D22, AArch64::D22_D23, AArch64::D23_D24, + AArch64::D24_D25, AArch64::D25_D26, AArch64::D26_D27, AArch64::D27_D28, + AArch64::D28_D29, AArch64::D29_D30, AArch64::D30_D31, AArch64::D31_D0 +}; + +static DecodeStatus DecodeDDRegisterClass(MCInst &Inst, unsigned RegNo, + uint64_t Addr, const void *Decoder) { + if (RegNo > 31) + return Fail; + unsigned Register = DDDecoderTable[RegNo]; + Inst.addOperand(MCOperand::CreateReg(Register)); + return Success; +} + +static const unsigned DDDDecoderTable[] = { + AArch64::D0_D1_D2, AArch64::D1_D2_D3, AArch64::D2_D3_D4, + AArch64::D3_D4_D5, AArch64::D4_D5_D6, AArch64::D5_D6_D7, + AArch64::D6_D7_D8, AArch64::D7_D8_D9, AArch64::D8_D9_D10, + AArch64::D9_D10_D11, AArch64::D10_D11_D12, AArch64::D11_D12_D13, + AArch64::D12_D13_D14, AArch64::D13_D14_D15, AArch64::D14_D15_D16, + AArch64::D15_D16_D17, AArch64::D16_D17_D18, AArch64::D17_D18_D19, + AArch64::D18_D19_D20, AArch64::D19_D20_D21, AArch64::D20_D21_D22, + AArch64::D21_D22_D23, AArch64::D22_D23_D24, AArch64::D23_D24_D25, + AArch64::D24_D25_D26, AArch64::D25_D26_D27, AArch64::D26_D27_D28, + AArch64::D27_D28_D29, AArch64::D28_D29_D30, 
AArch64::D29_D30_D31, + AArch64::D30_D31_D0, AArch64::D31_D0_D1 +}; + +static DecodeStatus DecodeDDDRegisterClass(MCInst &Inst, unsigned RegNo, + uint64_t Addr, const void *Decoder) { + if (RegNo > 31) + return Fail; + unsigned Register = DDDDecoderTable[RegNo]; + Inst.addOperand(MCOperand::CreateReg(Register)); + return Success; +} + +static const unsigned DDDDDecoderTable[] = { + AArch64::D0_D1_D2_D3, AArch64::D1_D2_D3_D4, AArch64::D2_D3_D4_D5, + AArch64::D3_D4_D5_D6, AArch64::D4_D5_D6_D7, AArch64::D5_D6_D7_D8, + AArch64::D6_D7_D8_D9, AArch64::D7_D8_D9_D10, AArch64::D8_D9_D10_D11, + AArch64::D9_D10_D11_D12, AArch64::D10_D11_D12_D13, AArch64::D11_D12_D13_D14, + AArch64::D12_D13_D14_D15, AArch64::D13_D14_D15_D16, AArch64::D14_D15_D16_D17, + AArch64::D15_D16_D17_D18, AArch64::D16_D17_D18_D19, AArch64::D17_D18_D19_D20, + AArch64::D18_D19_D20_D21, AArch64::D19_D20_D21_D22, AArch64::D20_D21_D22_D23, + AArch64::D21_D22_D23_D24, AArch64::D22_D23_D24_D25, AArch64::D23_D24_D25_D26, + AArch64::D24_D25_D26_D27, AArch64::D25_D26_D27_D28, AArch64::D26_D27_D28_D29, + AArch64::D27_D28_D29_D30, AArch64::D28_D29_D30_D31, AArch64::D29_D30_D31_D0, + AArch64::D30_D31_D0_D1, AArch64::D31_D0_D1_D2 +}; + +static DecodeStatus DecodeDDDDRegisterClass(MCInst &Inst, unsigned RegNo, + uint64_t Addr, + const void *Decoder) { + if (RegNo > 31) + return Fail; + unsigned Register = DDDDDecoderTable[RegNo]; + Inst.addOperand(MCOperand::CreateReg(Register)); + return Success; +} + +static DecodeStatus DecodeFixedPointScaleImm32(llvm::MCInst &Inst, unsigned Imm, + uint64_t Addr, + const void *Decoder) { + // scale{5} is asserted as 1 in tblgen. + Imm |= 0x20; + Inst.addOperand(MCOperand::CreateImm(64 - Imm)); + return Success; +} + +static DecodeStatus DecodeFixedPointScaleImm64(llvm::MCInst &Inst, unsigned Imm, + uint64_t Addr, + const void *Decoder) { + Inst.addOperand(MCOperand::CreateImm(64 - Imm)); + return Success; +} + +static DecodeStatus DecodePCRelLabel19(llvm::MCInst &Inst, unsigned Imm, + uint64_t Addr, const void *Decoder) { + int64_t ImmVal = Imm; + const AArch64Disassembler *Dis = + static_cast(Decoder); + + // Sign-extend 19-bit immediate. + if (ImmVal & (1 << (19 - 1))) + ImmVal |= ~((1LL << 19) - 1); + + if (!Dis->tryAddingSymbolicOperand(Inst, ImmVal << 2, Addr, + Inst.getOpcode() != AArch64::LDRXl, 0, 4)) + Inst.addOperand(MCOperand::CreateImm(ImmVal)); + return Success; +} + +static DecodeStatus DecodeMemExtend(llvm::MCInst &Inst, unsigned Imm, + uint64_t Address, const void *Decoder) { + Inst.addOperand(MCOperand::CreateImm((Imm >> 1) & 1)); + Inst.addOperand(MCOperand::CreateImm(Imm & 1)); + return Success; +} + +static DecodeStatus DecodeMRSSystemRegister(llvm::MCInst &Inst, unsigned Imm, + uint64_t Address, + const void *Decoder) { + const AArch64Disassembler *Dis = + static_cast(Decoder); + const MCSubtargetInfo &STI = Dis->getSubtargetInfo(); + + Imm |= 0x8000; + Inst.addOperand(MCOperand::CreateImm(Imm)); + + bool ValidNamed; + (void)AArch64SysReg::MRSMapper(STI.getFeatureBits()) + .toString(Imm, ValidNamed); + + return ValidNamed ? Success : Fail; +} + +static DecodeStatus DecodeMSRSystemRegister(llvm::MCInst &Inst, unsigned Imm, + uint64_t Address, + const void *Decoder) { + const AArch64Disassembler *Dis = + static_cast(Decoder); + const MCSubtargetInfo &STI = Dis->getSubtargetInfo(); + + Imm |= 0x8000; + Inst.addOperand(MCOperand::CreateImm(Imm)); + + bool ValidNamed; + (void)AArch64SysReg::MSRMapper(STI.getFeatureBits()) + .toString(Imm, ValidNamed); + + return ValidNamed ? 
Success : Fail; +} + +static DecodeStatus DecodeFMOVLaneInstruction(llvm::MCInst &Inst, unsigned Insn, + uint64_t Address, + const void *Decoder) { + // This decoder exists to add the dummy Lane operand to the MCInst, which must + // be 1 in assembly but has no other real manifestation. + unsigned Rd = fieldFromInstruction(Insn, 0, 5); + unsigned Rn = fieldFromInstruction(Insn, 5, 5); + unsigned IsToVec = fieldFromInstruction(Insn, 16, 1); + + if (IsToVec) { + DecodeFPR128RegisterClass(Inst, Rd, Address, Decoder); + DecodeGPR64RegisterClass(Inst, Rn, Address, Decoder); + } else { + DecodeGPR64RegisterClass(Inst, Rd, Address, Decoder); + DecodeFPR128RegisterClass(Inst, Rn, Address, Decoder); + } + + // Add the lane + Inst.addOperand(MCOperand::CreateImm(1)); + + return Success; +} + +static DecodeStatus DecodeVecShiftRImm(llvm::MCInst &Inst, unsigned Imm, + unsigned Add) { + Inst.addOperand(MCOperand::CreateImm(Add - Imm)); + return Success; +} + +static DecodeStatus DecodeVecShiftLImm(llvm::MCInst &Inst, unsigned Imm, + unsigned Add) { + Inst.addOperand(MCOperand::CreateImm((Imm + Add) & (Add - 1))); + return Success; +} + +static DecodeStatus DecodeVecShiftR64Imm(llvm::MCInst &Inst, unsigned Imm, + uint64_t Addr, const void *Decoder) { + return DecodeVecShiftRImm(Inst, Imm, 64); +} + +static DecodeStatus DecodeVecShiftR64ImmNarrow(llvm::MCInst &Inst, unsigned Imm, + uint64_t Addr, + const void *Decoder) { + return DecodeVecShiftRImm(Inst, Imm | 0x20, 64); +} + +static DecodeStatus DecodeVecShiftR32Imm(llvm::MCInst &Inst, unsigned Imm, + uint64_t Addr, const void *Decoder) { + return DecodeVecShiftRImm(Inst, Imm, 32); +} + +static DecodeStatus DecodeVecShiftR32ImmNarrow(llvm::MCInst &Inst, unsigned Imm, + uint64_t Addr, + const void *Decoder) { + return DecodeVecShiftRImm(Inst, Imm | 0x10, 32); +} + +static DecodeStatus DecodeVecShiftR16Imm(llvm::MCInst &Inst, unsigned Imm, + uint64_t Addr, const void *Decoder) { + return DecodeVecShiftRImm(Inst, Imm, 16); +} + +static DecodeStatus DecodeVecShiftR16ImmNarrow(llvm::MCInst &Inst, unsigned Imm, + uint64_t Addr, + const void *Decoder) { + return DecodeVecShiftRImm(Inst, Imm | 0x8, 16); +} + +static DecodeStatus DecodeVecShiftR8Imm(llvm::MCInst &Inst, unsigned Imm, + uint64_t Addr, const void *Decoder) { + return DecodeVecShiftRImm(Inst, Imm, 8); +} + +static DecodeStatus DecodeVecShiftL64Imm(llvm::MCInst &Inst, unsigned Imm, + uint64_t Addr, const void *Decoder) { + return DecodeVecShiftLImm(Inst, Imm, 64); +} + +static DecodeStatus DecodeVecShiftL32Imm(llvm::MCInst &Inst, unsigned Imm, + uint64_t Addr, const void *Decoder) { + return DecodeVecShiftLImm(Inst, Imm, 32); +} + +static DecodeStatus DecodeVecShiftL16Imm(llvm::MCInst &Inst, unsigned Imm, + uint64_t Addr, const void *Decoder) { + return DecodeVecShiftLImm(Inst, Imm, 16); +} + +static DecodeStatus DecodeVecShiftL8Imm(llvm::MCInst &Inst, unsigned Imm, + uint64_t Addr, const void *Decoder) { + return DecodeVecShiftLImm(Inst, Imm, 8); +} + +static DecodeStatus DecodeThreeAddrSRegInstruction(llvm::MCInst &Inst, + uint32_t insn, uint64_t Addr, + const void *Decoder) { + unsigned Rd = fieldFromInstruction(insn, 0, 5); + unsigned Rn = fieldFromInstruction(insn, 5, 5); + unsigned Rm = fieldFromInstruction(insn, 16, 5); + unsigned shiftHi = fieldFromInstruction(insn, 22, 2); + unsigned shiftLo = fieldFromInstruction(insn, 10, 6); + unsigned shift = (shiftHi << 6) | shiftLo; + switch (Inst.getOpcode()) { + default: + return Fail; + case AArch64::ADDWrs: + case AArch64::ADDSWrs: + case 
AArch64::SUBWrs: + case AArch64::SUBSWrs: + // if shift == '11' then ReservedValue() + if (shiftHi == 0x3) + return Fail; + // Deliberate fallthrough + case AArch64::ANDWrs: + case AArch64::ANDSWrs: + case AArch64::BICWrs: + case AArch64::BICSWrs: + case AArch64::ORRWrs: + case AArch64::ORNWrs: + case AArch64::EORWrs: + case AArch64::EONWrs: { + // if sf == '0' and imm6<5> == '1' then ReservedValue() + if (shiftLo >> 5 == 1) + return Fail; + DecodeGPR32RegisterClass(Inst, Rd, Addr, Decoder); + DecodeGPR32RegisterClass(Inst, Rn, Addr, Decoder); + DecodeGPR32RegisterClass(Inst, Rm, Addr, Decoder); + break; + } + case AArch64::ADDXrs: + case AArch64::ADDSXrs: + case AArch64::SUBXrs: + case AArch64::SUBSXrs: + // if shift == '11' then ReservedValue() + if (shiftHi == 0x3) + return Fail; + // Deliberate fallthrough + case AArch64::ANDXrs: + case AArch64::ANDSXrs: + case AArch64::BICXrs: + case AArch64::BICSXrs: + case AArch64::ORRXrs: + case AArch64::ORNXrs: + case AArch64::EORXrs: + case AArch64::EONXrs: + DecodeGPR64RegisterClass(Inst, Rd, Addr, Decoder); + DecodeGPR64RegisterClass(Inst, Rn, Addr, Decoder); + DecodeGPR64RegisterClass(Inst, Rm, Addr, Decoder); + break; + } + + Inst.addOperand(MCOperand::CreateImm(shift)); + return Success; +} + +static DecodeStatus DecodeMoveImmInstruction(llvm::MCInst &Inst, uint32_t insn, + uint64_t Addr, + const void *Decoder) { + unsigned Rd = fieldFromInstruction(insn, 0, 5); + unsigned imm = fieldFromInstruction(insn, 5, 16); + unsigned shift = fieldFromInstruction(insn, 21, 2); + shift <<= 4; + switch (Inst.getOpcode()) { + default: + return Fail; + case AArch64::MOVZWi: + case AArch64::MOVNWi: + case AArch64::MOVKWi: + if (shift & (1U << 5)) + return Fail; + DecodeGPR32RegisterClass(Inst, Rd, Addr, Decoder); + break; + case AArch64::MOVZXi: + case AArch64::MOVNXi: + case AArch64::MOVKXi: + DecodeGPR64RegisterClass(Inst, Rd, Addr, Decoder); + break; + } + + if (Inst.getOpcode() == AArch64::MOVKWi || + Inst.getOpcode() == AArch64::MOVKXi) + Inst.addOperand(Inst.getOperand(0)); + + Inst.addOperand(MCOperand::CreateImm(imm)); + Inst.addOperand(MCOperand::CreateImm(shift)); + return Success; +} + +static DecodeStatus DecodeUnsignedLdStInstruction(llvm::MCInst &Inst, + uint32_t insn, uint64_t Addr, + const void *Decoder) { + unsigned Rt = fieldFromInstruction(insn, 0, 5); + unsigned Rn = fieldFromInstruction(insn, 5, 5); + unsigned offset = fieldFromInstruction(insn, 10, 12); + const AArch64Disassembler *Dis = + static_cast(Decoder); + + switch (Inst.getOpcode()) { + default: + return Fail; + case AArch64::PRFMui: + // Rt is an immediate in prefetch. 
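+ // (For PRFM the Rt field selects the prefetch operation, e.g. PLDL1KEEP,
+ // so it is decoded as a plain immediate.)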
+ Inst.addOperand(MCOperand::CreateImm(Rt)); + break; + case AArch64::STRBBui: + case AArch64::LDRBBui: + case AArch64::LDRSBWui: + case AArch64::STRHHui: + case AArch64::LDRHHui: + case AArch64::LDRSHWui: + case AArch64::STRWui: + case AArch64::LDRWui: + DecodeGPR32RegisterClass(Inst, Rt, Addr, Decoder); + break; + case AArch64::LDRSBXui: + case AArch64::LDRSHXui: + case AArch64::LDRSWui: + case AArch64::STRXui: + case AArch64::LDRXui: + DecodeGPR64RegisterClass(Inst, Rt, Addr, Decoder); + break; + case AArch64::LDRQui: + case AArch64::STRQui: + DecodeFPR128RegisterClass(Inst, Rt, Addr, Decoder); + break; + case AArch64::LDRDui: + case AArch64::STRDui: + DecodeFPR64RegisterClass(Inst, Rt, Addr, Decoder); + break; + case AArch64::LDRSui: + case AArch64::STRSui: + DecodeFPR32RegisterClass(Inst, Rt, Addr, Decoder); + break; + case AArch64::LDRHui: + case AArch64::STRHui: + DecodeFPR16RegisterClass(Inst, Rt, Addr, Decoder); + break; + case AArch64::LDRBui: + case AArch64::STRBui: + DecodeFPR8RegisterClass(Inst, Rt, Addr, Decoder); + break; + } + + DecodeGPR64spRegisterClass(Inst, Rn, Addr, Decoder); + if (!Dis->tryAddingSymbolicOperand(Inst, offset, Addr, Fail, 0, 4)) + Inst.addOperand(MCOperand::CreateImm(offset)); + return Success; +} + +static DecodeStatus DecodeSignedLdStInstruction(llvm::MCInst &Inst, + uint32_t insn, uint64_t Addr, + const void *Decoder) { + unsigned Rt = fieldFromInstruction(insn, 0, 5); + unsigned Rn = fieldFromInstruction(insn, 5, 5); + int64_t offset = fieldFromInstruction(insn, 12, 9); + + // offset is a 9-bit signed immediate, so sign extend it to + // fill the unsigned. + if (offset & (1 << (9 - 1))) + offset |= ~((1LL << 9) - 1); + + // First operand is always the writeback to the address register, if needed. + switch (Inst.getOpcode()) { + default: + break; + case AArch64::LDRSBWpre: + case AArch64::LDRSHWpre: + case AArch64::STRBBpre: + case AArch64::LDRBBpre: + case AArch64::STRHHpre: + case AArch64::LDRHHpre: + case AArch64::STRWpre: + case AArch64::LDRWpre: + case AArch64::LDRSBWpost: + case AArch64::LDRSHWpost: + case AArch64::STRBBpost: + case AArch64::LDRBBpost: + case AArch64::STRHHpost: + case AArch64::LDRHHpost: + case AArch64::STRWpost: + case AArch64::LDRWpost: + case AArch64::LDRSBXpre: + case AArch64::LDRSHXpre: + case AArch64::STRXpre: + case AArch64::LDRSWpre: + case AArch64::LDRXpre: + case AArch64::LDRSBXpost: + case AArch64::LDRSHXpost: + case AArch64::STRXpost: + case AArch64::LDRSWpost: + case AArch64::LDRXpost: + case AArch64::LDRQpre: + case AArch64::STRQpre: + case AArch64::LDRQpost: + case AArch64::STRQpost: + case AArch64::LDRDpre: + case AArch64::STRDpre: + case AArch64::LDRDpost: + case AArch64::STRDpost: + case AArch64::LDRSpre: + case AArch64::STRSpre: + case AArch64::LDRSpost: + case AArch64::STRSpost: + case AArch64::LDRHpre: + case AArch64::STRHpre: + case AArch64::LDRHpost: + case AArch64::STRHpost: + case AArch64::LDRBpre: + case AArch64::STRBpre: + case AArch64::LDRBpost: + case AArch64::STRBpost: + DecodeGPR64spRegisterClass(Inst, Rn, Addr, Decoder); + break; + } + + switch (Inst.getOpcode()) { + default: + return Fail; + case AArch64::PRFUMi: + // Rt is an immediate in prefetch. 
+ Inst.addOperand(MCOperand::CreateImm(Rt)); + break; + case AArch64::STURBBi: + case AArch64::LDURBBi: + case AArch64::LDURSBWi: + case AArch64::STURHHi: + case AArch64::LDURHHi: + case AArch64::LDURSHWi: + case AArch64::STURWi: + case AArch64::LDURWi: + case AArch64::LDTRSBWi: + case AArch64::LDTRSHWi: + case AArch64::STTRWi: + case AArch64::LDTRWi: + case AArch64::STTRHi: + case AArch64::LDTRHi: + case AArch64::LDTRBi: + case AArch64::STTRBi: + case AArch64::LDRSBWpre: + case AArch64::LDRSHWpre: + case AArch64::STRBBpre: + case AArch64::LDRBBpre: + case AArch64::STRHHpre: + case AArch64::LDRHHpre: + case AArch64::STRWpre: + case AArch64::LDRWpre: + case AArch64::LDRSBWpost: + case AArch64::LDRSHWpost: + case AArch64::STRBBpost: + case AArch64::LDRBBpost: + case AArch64::STRHHpost: + case AArch64::LDRHHpost: + case AArch64::STRWpost: + case AArch64::LDRWpost: + DecodeGPR32RegisterClass(Inst, Rt, Addr, Decoder); + break; + case AArch64::LDURSBXi: + case AArch64::LDURSHXi: + case AArch64::LDURSWi: + case AArch64::STURXi: + case AArch64::LDURXi: + case AArch64::LDTRSBXi: + case AArch64::LDTRSHXi: + case AArch64::LDTRSWi: + case AArch64::STTRXi: + case AArch64::LDTRXi: + case AArch64::LDRSBXpre: + case AArch64::LDRSHXpre: + case AArch64::STRXpre: + case AArch64::LDRSWpre: + case AArch64::LDRXpre: + case AArch64::LDRSBXpost: + case AArch64::LDRSHXpost: + case AArch64::STRXpost: + case AArch64::LDRSWpost: + case AArch64::LDRXpost: + DecodeGPR64RegisterClass(Inst, Rt, Addr, Decoder); + break; + case AArch64::LDURQi: + case AArch64::STURQi: + case AArch64::LDRQpre: + case AArch64::STRQpre: + case AArch64::LDRQpost: + case AArch64::STRQpost: + DecodeFPR128RegisterClass(Inst, Rt, Addr, Decoder); + break; + case AArch64::LDURDi: + case AArch64::STURDi: + case AArch64::LDRDpre: + case AArch64::STRDpre: + case AArch64::LDRDpost: + case AArch64::STRDpost: + DecodeFPR64RegisterClass(Inst, Rt, Addr, Decoder); + break; + case AArch64::LDURSi: + case AArch64::STURSi: + case AArch64::LDRSpre: + case AArch64::STRSpre: + case AArch64::LDRSpost: + case AArch64::STRSpost: + DecodeFPR32RegisterClass(Inst, Rt, Addr, Decoder); + break; + case AArch64::LDURHi: + case AArch64::STURHi: + case AArch64::LDRHpre: + case AArch64::STRHpre: + case AArch64::LDRHpost: + case AArch64::STRHpost: + DecodeFPR16RegisterClass(Inst, Rt, Addr, Decoder); + break; + case AArch64::LDURBi: + case AArch64::STURBi: + case AArch64::LDRBpre: + case AArch64::STRBpre: + case AArch64::LDRBpost: + case AArch64::STRBpost: + DecodeFPR8RegisterClass(Inst, Rt, Addr, Decoder); + break; + } + + DecodeGPR64spRegisterClass(Inst, Rn, Addr, Decoder); + Inst.addOperand(MCOperand::CreateImm(offset)); + + bool IsLoad = fieldFromInstruction(insn, 22, 1); + bool IsIndexed = fieldFromInstruction(insn, 10, 2) != 0; + bool IsFP = fieldFromInstruction(insn, 26, 1); + + // Cannot write back to a transfer register (but xzr != sp). 
+ if (IsLoad && IsIndexed && !IsFP && Rn != 31 && Rt == Rn) + return SoftFail; + + return Success; +} + +static DecodeStatus DecodeExclusiveLdStInstruction(llvm::MCInst &Inst, + uint32_t insn, uint64_t Addr, + const void *Decoder) { + unsigned Rt = fieldFromInstruction(insn, 0, 5); + unsigned Rn = fieldFromInstruction(insn, 5, 5); + unsigned Rt2 = fieldFromInstruction(insn, 10, 5); + unsigned Rs = fieldFromInstruction(insn, 16, 5); + + unsigned Opcode = Inst.getOpcode(); + switch (Opcode) { + default: + return Fail; + case AArch64::STLXRW: + case AArch64::STLXRB: + case AArch64::STLXRH: + case AArch64::STXRW: + case AArch64::STXRB: + case AArch64::STXRH: + DecodeGPR32RegisterClass(Inst, Rs, Addr, Decoder); + // FALLTHROUGH + case AArch64::LDARW: + case AArch64::LDARB: + case AArch64::LDARH: + case AArch64::LDAXRW: + case AArch64::LDAXRB: + case AArch64::LDAXRH: + case AArch64::LDXRW: + case AArch64::LDXRB: + case AArch64::LDXRH: + case AArch64::STLRW: + case AArch64::STLRB: + case AArch64::STLRH: + DecodeGPR32RegisterClass(Inst, Rt, Addr, Decoder); + break; + case AArch64::STLXRX: + case AArch64::STXRX: + DecodeGPR32RegisterClass(Inst, Rs, Addr, Decoder); + // FALLTHROUGH + case AArch64::LDARX: + case AArch64::LDAXRX: + case AArch64::LDXRX: + case AArch64::STLRX: + DecodeGPR64RegisterClass(Inst, Rt, Addr, Decoder); + break; + case AArch64::STLXPW: + case AArch64::STXPW: + DecodeGPR32RegisterClass(Inst, Rs, Addr, Decoder); + // FALLTHROUGH + case AArch64::LDAXPW: + case AArch64::LDXPW: + DecodeGPR32RegisterClass(Inst, Rt, Addr, Decoder); + DecodeGPR32RegisterClass(Inst, Rt2, Addr, Decoder); + break; + case AArch64::STLXPX: + case AArch64::STXPX: + DecodeGPR32RegisterClass(Inst, Rs, Addr, Decoder); + // FALLTHROUGH + case AArch64::LDAXPX: + case AArch64::LDXPX: + DecodeGPR64RegisterClass(Inst, Rt, Addr, Decoder); + DecodeGPR64RegisterClass(Inst, Rt2, Addr, Decoder); + break; + } + + DecodeGPR64spRegisterClass(Inst, Rn, Addr, Decoder); + + // You shouldn't load to the same register twice in an instruction... + if ((Opcode == AArch64::LDAXPW || Opcode == AArch64::LDXPW || + Opcode == AArch64::LDAXPX || Opcode == AArch64::LDXPX) && + Rt == Rt2) + return SoftFail; + + return Success; +} + +static DecodeStatus DecodePairLdStInstruction(llvm::MCInst &Inst, uint32_t insn, + uint64_t Addr, + const void *Decoder) { + unsigned Rt = fieldFromInstruction(insn, 0, 5); + unsigned Rn = fieldFromInstruction(insn, 5, 5); + unsigned Rt2 = fieldFromInstruction(insn, 10, 5); + int64_t offset = fieldFromInstruction(insn, 15, 7); + bool IsLoad = fieldFromInstruction(insn, 22, 1); + + // offset is a 7-bit signed immediate, so sign extend it to + // fill the unsigned. + if (offset & (1 << (7 - 1))) + offset |= ~((1LL << 7) - 1); + + unsigned Opcode = Inst.getOpcode(); + bool NeedsDisjointWritebackTransfer = false; + + // First operand is always writeback of base register. 
+ switch (Opcode) { + default: + break; + case AArch64::LDPXpost: + case AArch64::STPXpost: + case AArch64::LDPSWpost: + case AArch64::LDPXpre: + case AArch64::STPXpre: + case AArch64::LDPSWpre: + case AArch64::LDPWpost: + case AArch64::STPWpost: + case AArch64::LDPWpre: + case AArch64::STPWpre: + case AArch64::LDPQpost: + case AArch64::STPQpost: + case AArch64::LDPQpre: + case AArch64::STPQpre: + case AArch64::LDPDpost: + case AArch64::STPDpost: + case AArch64::LDPDpre: + case AArch64::STPDpre: + case AArch64::LDPSpost: + case AArch64::STPSpost: + case AArch64::LDPSpre: + case AArch64::STPSpre: + DecodeGPR64spRegisterClass(Inst, Rn, Addr, Decoder); + break; + } + + switch (Opcode) { + default: + return Fail; + case AArch64::LDPXpost: + case AArch64::STPXpost: + case AArch64::LDPSWpost: + case AArch64::LDPXpre: + case AArch64::STPXpre: + case AArch64::LDPSWpre: + NeedsDisjointWritebackTransfer = true; + // Fallthrough + case AArch64::LDNPXi: + case AArch64::STNPXi: + case AArch64::LDPXi: + case AArch64::STPXi: + case AArch64::LDPSWi: + DecodeGPR64RegisterClass(Inst, Rt, Addr, Decoder); + DecodeGPR64RegisterClass(Inst, Rt2, Addr, Decoder); + break; + case AArch64::LDPWpost: + case AArch64::STPWpost: + case AArch64::LDPWpre: + case AArch64::STPWpre: + NeedsDisjointWritebackTransfer = true; + // Fallthrough + case AArch64::LDNPWi: + case AArch64::STNPWi: + case AArch64::LDPWi: + case AArch64::STPWi: + DecodeGPR32RegisterClass(Inst, Rt, Addr, Decoder); + DecodeGPR32RegisterClass(Inst, Rt2, Addr, Decoder); + break; + case AArch64::LDNPQi: + case AArch64::STNPQi: + case AArch64::LDPQpost: + case AArch64::STPQpost: + case AArch64::LDPQi: + case AArch64::STPQi: + case AArch64::LDPQpre: + case AArch64::STPQpre: + DecodeFPR128RegisterClass(Inst, Rt, Addr, Decoder); + DecodeFPR128RegisterClass(Inst, Rt2, Addr, Decoder); + break; + case AArch64::LDNPDi: + case AArch64::STNPDi: + case AArch64::LDPDpost: + case AArch64::STPDpost: + case AArch64::LDPDi: + case AArch64::STPDi: + case AArch64::LDPDpre: + case AArch64::STPDpre: + DecodeFPR64RegisterClass(Inst, Rt, Addr, Decoder); + DecodeFPR64RegisterClass(Inst, Rt2, Addr, Decoder); + break; + case AArch64::LDNPSi: + case AArch64::STNPSi: + case AArch64::LDPSpost: + case AArch64::STPSpost: + case AArch64::LDPSi: + case AArch64::STPSi: + case AArch64::LDPSpre: + case AArch64::STPSpre: + DecodeFPR32RegisterClass(Inst, Rt, Addr, Decoder); + DecodeFPR32RegisterClass(Inst, Rt2, Addr, Decoder); + break; + } + + DecodeGPR64spRegisterClass(Inst, Rn, Addr, Decoder); + Inst.addOperand(MCOperand::CreateImm(offset)); + + // You shouldn't load to the same register twice in an instruction... + if (IsLoad && Rt == Rt2) + return SoftFail; + + // ... or do any operation that writes-back to a transfer register. But note + // that "stp xzr, xzr, [sp], #4" is fine because xzr and sp are different. 
+ if (NeedsDisjointWritebackTransfer && Rn != 31 && (Rt == Rn || Rt2 == Rn)) + return SoftFail; + + return Success; +} + +static DecodeStatus DecodeAddSubERegInstruction(llvm::MCInst &Inst, + uint32_t insn, uint64_t Addr, + const void *Decoder) { + unsigned Rd = fieldFromInstruction(insn, 0, 5); + unsigned Rn = fieldFromInstruction(insn, 5, 5); + unsigned Rm = fieldFromInstruction(insn, 16, 5); + unsigned extend = fieldFromInstruction(insn, 10, 6); + + unsigned shift = extend & 0x7; + if (shift > 4) + return Fail; + + switch (Inst.getOpcode()) { + default: + return Fail; + case AArch64::ADDWrx: + case AArch64::SUBWrx: + DecodeGPR32spRegisterClass(Inst, Rd, Addr, Decoder); + DecodeGPR32spRegisterClass(Inst, Rn, Addr, Decoder); + DecodeGPR32RegisterClass(Inst, Rm, Addr, Decoder); + break; + case AArch64::ADDSWrx: + case AArch64::SUBSWrx: + DecodeGPR32RegisterClass(Inst, Rd, Addr, Decoder); + DecodeGPR32spRegisterClass(Inst, Rn, Addr, Decoder); + DecodeGPR32RegisterClass(Inst, Rm, Addr, Decoder); + break; + case AArch64::ADDXrx: + case AArch64::SUBXrx: + DecodeGPR64spRegisterClass(Inst, Rd, Addr, Decoder); + DecodeGPR64spRegisterClass(Inst, Rn, Addr, Decoder); + DecodeGPR32RegisterClass(Inst, Rm, Addr, Decoder); + break; + case AArch64::ADDSXrx: + case AArch64::SUBSXrx: + DecodeGPR64RegisterClass(Inst, Rd, Addr, Decoder); + DecodeGPR64spRegisterClass(Inst, Rn, Addr, Decoder); + DecodeGPR32RegisterClass(Inst, Rm, Addr, Decoder); + break; + case AArch64::ADDXrx64: + case AArch64::SUBXrx64: + DecodeGPR64spRegisterClass(Inst, Rd, Addr, Decoder); + DecodeGPR64spRegisterClass(Inst, Rn, Addr, Decoder); + DecodeGPR64RegisterClass(Inst, Rm, Addr, Decoder); + break; + case AArch64::SUBSXrx64: + case AArch64::ADDSXrx64: + DecodeGPR64RegisterClass(Inst, Rd, Addr, Decoder); + DecodeGPR64spRegisterClass(Inst, Rn, Addr, Decoder); + DecodeGPR64RegisterClass(Inst, Rm, Addr, Decoder); + break; + } + + Inst.addOperand(MCOperand::CreateImm(extend)); + return Success; +} + +static DecodeStatus DecodeLogicalImmInstruction(llvm::MCInst &Inst, + uint32_t insn, uint64_t Addr, + const void *Decoder) { + unsigned Rd = fieldFromInstruction(insn, 0, 5); + unsigned Rn = fieldFromInstruction(insn, 5, 5); + unsigned Datasize = fieldFromInstruction(insn, 31, 1); + unsigned imm; + + if (Datasize) { + if (Inst.getOpcode() == AArch64::ANDSXri) + DecodeGPR64RegisterClass(Inst, Rd, Addr, Decoder); + else + DecodeGPR64spRegisterClass(Inst, Rd, Addr, Decoder); + DecodeGPR64RegisterClass(Inst, Rn, Addr, Decoder); + imm = fieldFromInstruction(insn, 10, 13); + if (!AArch64_AM::isValidDecodeLogicalImmediate(imm, 64)) + return Fail; + } else { + if (Inst.getOpcode() == AArch64::ANDSWri) + DecodeGPR32RegisterClass(Inst, Rd, Addr, Decoder); + else + DecodeGPR32spRegisterClass(Inst, Rd, Addr, Decoder); + DecodeGPR32RegisterClass(Inst, Rn, Addr, Decoder); + imm = fieldFromInstruction(insn, 10, 12); + if (!AArch64_AM::isValidDecodeLogicalImmediate(imm, 32)) + return Fail; + } + Inst.addOperand(MCOperand::CreateImm(imm)); + return Success; +} + +static DecodeStatus DecodeModImmInstruction(llvm::MCInst &Inst, uint32_t insn, + uint64_t Addr, + const void *Decoder) { + unsigned Rd = fieldFromInstruction(insn, 0, 5); + unsigned cmode = fieldFromInstruction(insn, 12, 4); + unsigned imm = fieldFromInstruction(insn, 16, 3) << 5; + imm |= fieldFromInstruction(insn, 5, 5); + + if (Inst.getOpcode() == AArch64::MOVID) + DecodeFPR64RegisterClass(Inst, Rd, Addr, Decoder); + else + DecodeVectorRegisterClass(Inst, Rd, Addr, Decoder); + + 
Inst.addOperand(MCOperand::CreateImm(imm)); + + switch (Inst.getOpcode()) { + default: + break; + case AArch64::MOVIv4i16: + case AArch64::MOVIv8i16: + case AArch64::MVNIv4i16: + case AArch64::MVNIv8i16: + case AArch64::MOVIv2i32: + case AArch64::MOVIv4i32: + case AArch64::MVNIv2i32: + case AArch64::MVNIv4i32: + Inst.addOperand(MCOperand::CreateImm((cmode & 6) << 2)); + break; + case AArch64::MOVIv2s_msl: + case AArch64::MOVIv4s_msl: + case AArch64::MVNIv2s_msl: + case AArch64::MVNIv4s_msl: + Inst.addOperand(MCOperand::CreateImm(cmode & 1 ? 0x110 : 0x108)); + break; + } + + return Success; +} + +static DecodeStatus DecodeModImmTiedInstruction(llvm::MCInst &Inst, + uint32_t insn, uint64_t Addr, + const void *Decoder) { + unsigned Rd = fieldFromInstruction(insn, 0, 5); + unsigned cmode = fieldFromInstruction(insn, 12, 4); + unsigned imm = fieldFromInstruction(insn, 16, 3) << 5; + imm |= fieldFromInstruction(insn, 5, 5); + + // Tied operands added twice. + DecodeVectorRegisterClass(Inst, Rd, Addr, Decoder); + DecodeVectorRegisterClass(Inst, Rd, Addr, Decoder); + + Inst.addOperand(MCOperand::CreateImm(imm)); + Inst.addOperand(MCOperand::CreateImm((cmode & 6) << 2)); + + return Success; +} + +static DecodeStatus DecodeAdrInstruction(llvm::MCInst &Inst, uint32_t insn, + uint64_t Addr, const void *Decoder) { + unsigned Rd = fieldFromInstruction(insn, 0, 5); + int64_t imm = fieldFromInstruction(insn, 5, 19) << 2; + imm |= fieldFromInstruction(insn, 29, 2); + const AArch64Disassembler *Dis = + static_cast<const AArch64Disassembler *>(Decoder); + + // Sign-extend the 21-bit immediate. + if (imm & (1 << (21 - 1))) + imm |= ~((1LL << 21) - 1); + + DecodeGPR64RegisterClass(Inst, Rd, Addr, Decoder); + if (!Dis->tryAddingSymbolicOperand(Inst, imm, Addr, Fail, 0, 4)) + Inst.addOperand(MCOperand::CreateImm(imm)); + + return Success; +} + +static DecodeStatus DecodeBaseAddSubImm(llvm::MCInst &Inst, uint32_t insn, + uint64_t Addr, const void *Decoder) { + unsigned Rd = fieldFromInstruction(insn, 0, 5); + unsigned Rn = fieldFromInstruction(insn, 5, 5); + unsigned Imm = fieldFromInstruction(insn, 10, 14); + unsigned S = fieldFromInstruction(insn, 29, 1); + unsigned Datasize = fieldFromInstruction(insn, 31, 1); + + unsigned ShifterVal = (Imm >> 12) & 3; + unsigned ImmVal = Imm & 0xFFF; + const AArch64Disassembler *Dis = + static_cast<const AArch64Disassembler *>(Decoder); + + if (ShifterVal != 0 && ShifterVal != 1) + return Fail; + + if (Datasize) { + if (Rd == 31 && !S) + DecodeGPR64spRegisterClass(Inst, Rd, Addr, Decoder); + else + DecodeGPR64RegisterClass(Inst, Rd, Addr, Decoder); + DecodeGPR64spRegisterClass(Inst, Rn, Addr, Decoder); + } else { + if (Rd == 31 && !S) + DecodeGPR32spRegisterClass(Inst, Rd, Addr, Decoder); + else + DecodeGPR32RegisterClass(Inst, Rd, Addr, Decoder); + DecodeGPR32spRegisterClass(Inst, Rn, Addr, Decoder); + } + + if (!Dis->tryAddingSymbolicOperand(Inst, Imm, Addr, Fail, 0, 4)) + Inst.addOperand(MCOperand::CreateImm(ImmVal)); + Inst.addOperand(MCOperand::CreateImm(12 * ShifterVal)); + return Success; +} + +static DecodeStatus DecodeUnconditionalBranch(llvm::MCInst &Inst, uint32_t insn, + uint64_t Addr, + const void *Decoder) { + int64_t imm = fieldFromInstruction(insn, 0, 26); + const AArch64Disassembler *Dis = + static_cast<const AArch64Disassembler *>(Decoder); + + // Sign-extend the 26-bit immediate.
+ if (imm & (1 << (26 - 1))) + imm |= ~((1LL << 26) - 1); + + if (!Dis->tryAddingSymbolicOperand(Inst, imm << 2, Addr, true, 0, 4)) + Inst.addOperand(MCOperand::CreateImm(imm)); + + return Success; +} + +static DecodeStatus DecodeSystemPStateInstruction(llvm::MCInst &Inst, + uint32_t insn, uint64_t Addr, + const void *Decoder) { + uint64_t op1 = fieldFromInstruction(insn, 16, 3); + uint64_t op2 = fieldFromInstruction(insn, 5, 3); + uint64_t crm = fieldFromInstruction(insn, 8, 4); + + uint64_t pstate_field = (op1 << 3) | op2; + + Inst.addOperand(MCOperand::CreateImm(pstate_field)); + Inst.addOperand(MCOperand::CreateImm(crm)); + + bool ValidNamed; + (void)AArch64PState::PStateMapper().toString(pstate_field, ValidNamed); + + return ValidNamed ? Success : Fail; +} + +static DecodeStatus DecodeTestAndBranch(llvm::MCInst &Inst, uint32_t insn, + uint64_t Addr, const void *Decoder) { + uint64_t Rt = fieldFromInstruction(insn, 0, 5); + uint64_t bit = fieldFromInstruction(insn, 31, 1) << 5; + bit |= fieldFromInstruction(insn, 19, 5); + int64_t dst = fieldFromInstruction(insn, 5, 14); + const AArch64Disassembler *Dis = + static_cast<const AArch64Disassembler *>(Decoder); + + // Sign-extend 14-bit immediate. + if (dst & (1 << (14 - 1))) + dst |= ~((1LL << 14) - 1); + + if (fieldFromInstruction(insn, 31, 1) == 0) + DecodeGPR32RegisterClass(Inst, Rt, Addr, Decoder); + else + DecodeGPR64RegisterClass(Inst, Rt, Addr, Decoder); + Inst.addOperand(MCOperand::CreateImm(bit)); + if (!Dis->tryAddingSymbolicOperand(Inst, dst << 2, Addr, true, 0, 4)) + Inst.addOperand(MCOperand::CreateImm(dst)); + + return Success; +} diff --git a/lib/Target/AArch64/Disassembler/AArch64Disassembler.h b/lib/Target/AArch64/Disassembler/AArch64Disassembler.h new file mode 100644 index 00000000000..68d4867977b --- /dev/null +++ b/lib/Target/AArch64/Disassembler/AArch64Disassembler.h @@ -0,0 +1,40 @@ +//===- AArch64Disassembler.h - Disassembler for AArch64 ---------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// +//===----------------------------------------------------------------------===// + +#ifndef AArch64DISASSEMBLER_H +#define AArch64DISASSEMBLER_H + +#include "llvm/MC/MCDisassembler.h" + +namespace llvm { + +class MCInst; +class MemoryObject; +class raw_ostream; + +class AArch64Disassembler : public MCDisassembler { +public: + AArch64Disassembler(const MCSubtargetInfo &STI, MCContext &Ctx) + : MCDisassembler(STI, Ctx) {} + + ~AArch64Disassembler() {} + + /// getInstruction - See MCDisassembler. + MCDisassembler::DecodeStatus + getInstruction(MCInst &instr, uint64_t &size, const MemoryObject &region, + uint64_t address, raw_ostream &vStream, + raw_ostream &cStream) const override; +}; + +} // namespace llvm + +#endif diff --git a/lib/Target/AArch64/Disassembler/AArch64ExternalSymbolizer.cpp b/lib/Target/AArch64/Disassembler/AArch64ExternalSymbolizer.cpp new file mode 100644 index 00000000000..24663684a3f --- /dev/null +++ b/lib/Target/AArch64/Disassembler/AArch64ExternalSymbolizer.cpp @@ -0,0 +1,221 @@ +//===- AArch64ExternalSymbolizer.cpp - Symbolizer for AArch64 ---*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details.
+// +//===----------------------------------------------------------------------===// + +#include "AArch64ExternalSymbolizer.h" +#include "AArch64Subtarget.h" +#include "MCTargetDesc/AArch64AddressingModes.h" +#include "Utils/AArch64BaseInfo.h" +#include "llvm/MC/MCContext.h" +#include "llvm/MC/MCExpr.h" +#include "llvm/MC/MCInst.h" +#include "llvm/Support/Format.h" +#include "llvm/Support/raw_ostream.h" + +using namespace llvm; + +#define DEBUG_TYPE "aarch64-disassembler" + +static MCSymbolRefExpr::VariantKind +getVariant(uint64_t LLVMDisassembler_VariantKind) { + switch (LLVMDisassembler_VariantKind) { + case LLVMDisassembler_VariantKind_None: + return MCSymbolRefExpr::VK_None; + case LLVMDisassembler_VariantKind_ARM64_PAGE: + return MCSymbolRefExpr::VK_PAGE; + case LLVMDisassembler_VariantKind_ARM64_PAGEOFF: + return MCSymbolRefExpr::VK_PAGEOFF; + case LLVMDisassembler_VariantKind_ARM64_GOTPAGE: + return MCSymbolRefExpr::VK_GOTPAGE; + case LLVMDisassembler_VariantKind_ARM64_GOTPAGEOFF: + return MCSymbolRefExpr::VK_GOTPAGEOFF; + case LLVMDisassembler_VariantKind_ARM64_TLVP: + case LLVMDisassembler_VariantKind_ARM64_TLVOFF: + default: + assert(0 && "bad LLVMDisassembler_VariantKind"); + return MCSymbolRefExpr::VK_None; + } +} + +/// tryAddingSymbolicOperand - tryAddingSymbolicOperand tries to add a symbolic +/// operand in place of the immediate Value in the MCInst. The immediate +/// Value has not had any PC adjustment made by the caller. If the instruction +/// is a branch that adds the PC to the immediate Value then isBranch is +/// true, else false. If GetOpInfo is non-null, then it is called to get any +/// symbolic information at the Address for this instruction. If that returns +/// non-zero then the symbolic information it returns is used to create an +/// MCExpr and that is added as an operand to the MCInst. If GetOpInfo() +/// returns zero and isBranch is true then a symbol look up for +/// Address + Value is done and if a symbol is found an MCExpr is created with +/// that, else an MCExpr with Address + Value is created. If GetOpInfo() +/// returns zero and isBranch is false then the Opcode of the MCInst is +/// tested, and for ADRP and other instructions that help to load pointers +/// a symbol look up is done to see if it returns a specific reference type +/// to add to the comment stream. This function returns true if it adds +/// an operand to the MCInst and false otherwise. +bool AArch64ExternalSymbolizer::tryAddingSymbolicOperand( + MCInst &MI, raw_ostream &CommentStream, int64_t Value, uint64_t Address, + bool IsBranch, uint64_t Offset, uint64_t InstSize) { + // FIXME: This method shares a lot of code with + // MCExternalSymbolizer::tryAddingSymbolicOperand. It may be possible to + // refactor the MCExternalSymbolizer interface to allow more of this + // implementation to be shared.
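The comment above spells out the contract the decoders in this patch rely on: branch decoders such as DecodeUnconditionalBranch and DecodeTestAndBranch sign-extend the raw immediate field, scale it to a byte offset, and fall back to a plain immediate operand only when no symbolic operand can be added. A rough standalone sketch of just the sign-extend-and-scale step in plain C++ (illustrative only, not part of the patch; the helper name extendBranchImm26 is invented):

#include <cassert>
#include <cstdint>

// Sketch: recover the byte offset encoded in a 26-bit branch immediate,
// mirroring the "if (imm & (1 << (26 - 1))) imm |= ~((1LL << 26) - 1);"
// pattern used by the decoders above.
static int64_t extendBranchImm26(uint32_t insn) {
  int64_t imm = insn & ((1u << 26) - 1); // like fieldFromInstruction(insn, 0, 26)
  if (imm & (1 << (26 - 1)))             // sign bit of the 26-bit field set?
    imm |= ~((1LL << 26) - 1);           // fill the upper bits with ones
  return imm * 4;                        // same value as the "imm << 2" that is
                                         // passed to tryAddingSymbolicOperand
}

int main() {
  // 0x03FFFFFF is -1 in 26-bit two's complement: a branch back by one
  // instruction, i.e. -4 bytes.
  assert(extendBranchImm26(0x03FFFFFFu) == -4);
  return 0;
}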
+ // + struct LLVMOpInfo1 SymbolicOp; + memset(&SymbolicOp, '\0', sizeof(struct LLVMOpInfo1)); + SymbolicOp.Value = Value; + uint64_t ReferenceType; + const char *ReferenceName; + if (!GetOpInfo || + !GetOpInfo(DisInfo, Address, 0 /* Offset */, InstSize, 1, &SymbolicOp)) { + if (IsBranch) { + ReferenceType = LLVMDisassembler_ReferenceType_In_Branch; + const char *Name = SymbolLookUp(DisInfo, Address + Value, &ReferenceType, + Address, &ReferenceName); + if (Name) { + SymbolicOp.AddSymbol.Name = Name; + SymbolicOp.AddSymbol.Present = true; + SymbolicOp.Value = 0; + } else { + SymbolicOp.Value = Address + Value; + } + if (ReferenceType == LLVMDisassembler_ReferenceType_Out_SymbolStub) + CommentStream << "symbol stub for: " << ReferenceName; + else if (ReferenceType == + LLVMDisassembler_ReferenceType_Out_Objc_Message) + CommentStream << "Objc message: " << ReferenceName; + } else if (MI.getOpcode() == AArch64::ADRP) { + ReferenceType = LLVMDisassembler_ReferenceType_In_ARM64_ADRP; + // otool expects the fully encoded ADRP instruction to be passed in as + // the value here, so reconstruct it: + const MCRegisterInfo &MCRI = *Ctx.getRegisterInfo(); + uint32_t EncodedInst = 0x90000000; + EncodedInst |= (Value & 0x3) << 29; // immlo + EncodedInst |= ((Value >> 2) & 0x7FFFF) << 5; // immhi + EncodedInst |= MCRI.getEncodingValue(MI.getOperand(0).getReg()); // reg + SymbolLookUp(DisInfo, EncodedInst, &ReferenceType, Address, + &ReferenceName); + CommentStream << format("0x%llx", + 0xfffffffffffff000LL & (Address + Value)); + } else if (MI.getOpcode() == AArch64::ADDXri || + MI.getOpcode() == AArch64::LDRXui || + MI.getOpcode() == AArch64::LDRXl || + MI.getOpcode() == AArch64::ADR) { + if (MI.getOpcode() == AArch64::ADDXri) + ReferenceType = LLVMDisassembler_ReferenceType_In_ARM64_ADDXri; + else if (MI.getOpcode() == AArch64::LDRXui) + ReferenceType = LLVMDisassembler_ReferenceType_In_ARM64_LDRXui; + if (MI.getOpcode() == AArch64::LDRXl) { + ReferenceType = LLVMDisassembler_ReferenceType_In_ARM64_LDRXl; + SymbolLookUp(DisInfo, Address + Value, &ReferenceType, Address, + &ReferenceName); + } else if (MI.getOpcode() == AArch64::ADR) { + ReferenceType = LLVMDisassembler_ReferenceType_In_ARM64_ADR; + SymbolLookUp(DisInfo, Address + Value, &ReferenceType, Address, + &ReferenceName); + } else { + const MCRegisterInfo &MCRI = *Ctx.getRegisterInfo(); + // otool expects the fully encoded ADD/LDR instruction to be passed in + // as the value here, so reconstruct it: + unsigned EncodedInst = + MI.getOpcode() == AArch64::ADDXri ? 
0x91000000: 0xF9400000; + EncodedInst |= Value << 10; // imm12 [+ shift:2 for ADD] + EncodedInst |= + MCRI.getEncodingValue(MI.getOperand(1).getReg()) << 5; // Rn + EncodedInst |= MCRI.getEncodingValue(MI.getOperand(0).getReg()); // Rd + + SymbolLookUp(DisInfo, EncodedInst, &ReferenceType, Address, + &ReferenceName); + } + if (ReferenceType == LLVMDisassembler_ReferenceType_Out_LitPool_SymAddr) + CommentStream << "literal pool symbol address: " << ReferenceName; + else if (ReferenceType == + LLVMDisassembler_ReferenceType_Out_LitPool_CstrAddr) + CommentStream << "literal pool for: \"" << ReferenceName << "\""; + else if (ReferenceType == + LLVMDisassembler_ReferenceType_Out_Objc_CFString_Ref) + CommentStream << "Objc cfstring ref: @\"" << ReferenceName << "\""; + else if (ReferenceType == + LLVMDisassembler_ReferenceType_Out_Objc_Message) + CommentStream << "Objc message: " << ReferenceName; + else if (ReferenceType == + LLVMDisassembler_ReferenceType_Out_Objc_Message_Ref) + CommentStream << "Objc message ref: " << ReferenceName; + else if (ReferenceType == + LLVMDisassembler_ReferenceType_Out_Objc_Selector_Ref) + CommentStream << "Objc selector ref: " << ReferenceName; + else if (ReferenceType == + LLVMDisassembler_ReferenceType_Out_Objc_Class_Ref) + CommentStream << "Objc class ref: " << ReferenceName; + // For these instructions, the SymbolLookUp() above is just to get the + // ReferenceType and ReferenceName. We want to make sure not to + // fall through so we don't build an MCExpr to leave the disassembly + // of the immediate values of these instructions to the InstPrinter. + return false; + } else { + return false; + } + } + + const MCExpr *Add = nullptr; + if (SymbolicOp.AddSymbol.Present) { + if (SymbolicOp.AddSymbol.Name) { + StringRef Name(SymbolicOp.AddSymbol.Name); + MCSymbol *Sym = Ctx.GetOrCreateSymbol(Name); + MCSymbolRefExpr::VariantKind Variant = getVariant(SymbolicOp.VariantKind); + if (Variant != MCSymbolRefExpr::VK_None) + Add = MCSymbolRefExpr::Create(Sym, Variant, Ctx); + else + Add = MCSymbolRefExpr::Create(Sym, Ctx); + } else { + Add = MCConstantExpr::Create(SymbolicOp.AddSymbol.Value, Ctx); + } + } + + const MCExpr *Sub = nullptr; + if (SymbolicOp.SubtractSymbol.Present) { + if (SymbolicOp.SubtractSymbol.Name) { + StringRef Name(SymbolicOp.SubtractSymbol.Name); + MCSymbol *Sym = Ctx.GetOrCreateSymbol(Name); + Sub = MCSymbolRefExpr::Create(Sym, Ctx); + } else { + Sub = MCConstantExpr::Create(SymbolicOp.SubtractSymbol.Value, Ctx); + } + } + + const MCExpr *Off = nullptr; + if (SymbolicOp.Value != 0) + Off = MCConstantExpr::Create(SymbolicOp.Value, Ctx); + + const MCExpr *Expr; + if (Sub) { + const MCExpr *LHS; + if (Add) + LHS = MCBinaryExpr::CreateSub(Add, Sub, Ctx); + else + LHS = MCUnaryExpr::CreateMinus(Sub, Ctx); + if (Off) + Expr = MCBinaryExpr::CreateAdd(LHS, Off, Ctx); + else + Expr = LHS; + } else if (Add) { + if (Off) + Expr = MCBinaryExpr::CreateAdd(Add, Off, Ctx); + else + Expr = Add; + } else { + if (Off) + Expr = Off; + else + Expr = MCConstantExpr::Create(0, Ctx); + } + + MI.addOperand(MCOperand::CreateExpr(Expr)); + + return true; +} diff --git a/lib/Target/AArch64/Disassembler/AArch64ExternalSymbolizer.h b/lib/Target/AArch64/Disassembler/AArch64ExternalSymbolizer.h new file mode 100644 index 00000000000..171d31c48cd --- /dev/null +++ b/lib/Target/AArch64/Disassembler/AArch64ExternalSymbolizer.h @@ -0,0 +1,38 @@ +//===- AArch64ExternalSymbolizer.h - Symbolizer for AArch64 -----*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file 
is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// Symbolize AArch64 assembly code during disassembly using callbacks. +// +//===----------------------------------------------------------------------===// + +#ifndef AArch64EXTERNALSYMBOLIZER_H +#define AArch64EXTERNALSYMBOLIZER_H + +#include "llvm/MC/MCExternalSymbolizer.h" + +namespace llvm { + +class AArch64ExternalSymbolizer : public MCExternalSymbolizer { +public: + AArch64ExternalSymbolizer(MCContext &Ctx, + std::unique_ptr<MCRelocationInfo> RelInfo, + LLVMOpInfoCallback GetOpInfo, + LLVMSymbolLookupCallback SymbolLookUp, + void *DisInfo) + : MCExternalSymbolizer(Ctx, std::move(RelInfo), GetOpInfo, SymbolLookUp, + DisInfo) {} + + bool tryAddingSymbolicOperand(MCInst &MI, raw_ostream &CommentStream, + int64_t Value, uint64_t Address, bool IsBranch, + uint64_t Offset, uint64_t InstSize) override; +}; + +} // namespace llvm + +#endif diff --git a/lib/Target/AArch64/Disassembler/CMakeLists.txt b/lib/Target/AArch64/Disassembler/CMakeLists.txt new file mode 100644 index 00000000000..be4ccad6d1b --- /dev/null +++ b/lib/Target/AArch64/Disassembler/CMakeLists.txt @@ -0,0 +1,14 @@ +include_directories( ${CMAKE_CURRENT_BINARY_DIR}/.. ${CMAKE_CURRENT_SOURCE_DIR}/.. ) + +add_llvm_library(LLVMAArch64Disassembler + AArch64Disassembler.cpp + AArch64ExternalSymbolizer.cpp + ) +# workaround for hanging compilation on MSVC8, 9 and 10 +#if( MSVC_VERSION EQUAL 1400 OR MSVC_VERSION EQUAL 1500 OR MSVC_VERSION EQUAL 1600 ) +#set_property( +# SOURCE ARMDisassembler.cpp +# PROPERTY COMPILE_FLAGS "/Od" +# ) +#endif() +add_dependencies(LLVMAArch64Disassembler AArch64CommonTableGen) diff --git a/lib/Target/AArch64/Disassembler/LLVMBuild.txt b/lib/Target/AArch64/Disassembler/LLVMBuild.txt new file mode 100644 index 00000000000..a4224f4a2f5 --- /dev/null +++ b/lib/Target/AArch64/Disassembler/LLVMBuild.txt @@ -0,0 +1,23 @@ +;===- ./lib/Target/AArch64/Disassembler/LLVMBuild.txt ------------*- Conf -*--===; +; +; The LLVM Compiler Infrastructure +; +; This file is distributed under the University of Illinois Open Source +; License. See LICENSE.TXT for details. +; +;===------------------------------------------------------------------------===; +; +; This is an LLVMBuild description file for the components in this subdirectory. +; +; For more information on the LLVMBuild system, please see: +; +; http://llvm.org/docs/LLVMBuild.html +; +;===------------------------------------------------------------------------===; + +[component_0] +type = Library +name = AArch64Disassembler +parent = AArch64 +required_libraries = AArch64Info AArch64Utils MC Support +add_to_library_groups = AArch64 diff --git a/lib/Target/AArch64/Disassembler/Makefile b/lib/Target/AArch64/Disassembler/Makefile new file mode 100644 index 00000000000..741bb817a63 --- /dev/null +++ b/lib/Target/AArch64/Disassembler/Makefile @@ -0,0 +1,16 @@ +##===- lib/Target/AArch64/Disassembler/Makefile ------------*- Makefile -*-===## +# +# The LLVM Compiler Infrastructure +# +# This file is distributed under the University of Illinois Open Source +# License. See LICENSE.TXT for details. +# +##===----------------------------------------------------------------------===## + +LEVEL = ../../../.. +LIBRARYNAME = LLVMAArch64Disassembler + +# Hack: we need to include 'main' arm target directory to grab private headers +CPPFLAGS = -I$(PROJ_OBJ_DIR)/.. -I$(PROJ_SRC_DIR)/..
+ +include $(LEVEL)/Makefile.common diff --git a/lib/Target/AArch64/InstPrinter/AArch64InstPrinter.cpp b/lib/Target/AArch64/InstPrinter/AArch64InstPrinter.cpp new file mode 100644 index 00000000000..f484a5b1bdc --- /dev/null +++ b/lib/Target/AArch64/InstPrinter/AArch64InstPrinter.cpp @@ -0,0 +1,1316 @@ +//==-- AArch64InstPrinter.cpp - Convert AArch64 MCInst to assembly syntax --==// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This class prints an AArch64 MCInst to a .s file. +// +//===----------------------------------------------------------------------===// + +#include "AArch64InstPrinter.h" +#include "MCTargetDesc/AArch64AddressingModes.h" +#include "Utils/AArch64BaseInfo.h" +#include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/StringExtras.h" +#include "llvm/MC/MCInst.h" +#include "llvm/MC/MCExpr.h" +#include "llvm/MC/MCRegisterInfo.h" +#include "llvm/Support/Format.h" +#include "llvm/Support/raw_ostream.h" +using namespace llvm; + +#define DEBUG_TYPE "asm-printer" + +#define GET_INSTRUCTION_NAME +#define PRINT_ALIAS_INSTR +#include "AArch64GenAsmWriter.inc" +#define GET_INSTRUCTION_NAME +#define PRINT_ALIAS_INSTR +#include "AArch64GenAsmWriter1.inc" + +AArch64InstPrinter::AArch64InstPrinter(const MCAsmInfo &MAI, + const MCInstrInfo &MII, + const MCRegisterInfo &MRI, + const MCSubtargetInfo &STI) + : MCInstPrinter(MAI, MII, MRI) { + // Initialize the set of available features. + setAvailableFeatures(STI.getFeatureBits()); +} + +AArch64AppleInstPrinter::AArch64AppleInstPrinter(const MCAsmInfo &MAI, + const MCInstrInfo &MII, + const MCRegisterInfo &MRI, + const MCSubtargetInfo &STI) + : AArch64InstPrinter(MAI, MII, MRI, STI) {} + +void AArch64InstPrinter::printRegName(raw_ostream &OS, unsigned RegNo) const { + // This is for .cfi directives. + OS << getRegisterName(RegNo); +} + +void AArch64InstPrinter::printInst(const MCInst *MI, raw_ostream &O, + StringRef Annot) { + // Check for special encodings and print the canonical alias instead. + + unsigned Opcode = MI->getOpcode(); + + if (Opcode == AArch64::SYSxt) + if (printSysAlias(MI, O)) { + printAnnotation(O, Annot); + return; + } + + // SBFM/UBFM should print to a nicer aliased form if possible. + if (Opcode == AArch64::SBFMXri || Opcode == AArch64::SBFMWri || + Opcode == AArch64::UBFMXri || Opcode == AArch64::UBFMWri) { + const MCOperand &Op0 = MI->getOperand(0); + const MCOperand &Op1 = MI->getOperand(1); + const MCOperand &Op2 = MI->getOperand(2); + const MCOperand &Op3 = MI->getOperand(3); + + bool IsSigned = (Opcode == AArch64::SBFMXri || Opcode == AArch64::SBFMWri); + bool Is64Bit = (Opcode == AArch64::SBFMXri || Opcode == AArch64::UBFMXri); + if (Op2.isImm() && Op2.getImm() == 0 && Op3.isImm()) { + const char *AsmMnemonic = nullptr; + + switch (Op3.getImm()) { + default: + break; + case 7: + if (IsSigned) + AsmMnemonic = "sxtb"; + else if (!Is64Bit) + AsmMnemonic = "uxtb"; + break; + case 15: + if (IsSigned) + AsmMnemonic = "sxth"; + else if (!Is64Bit) + AsmMnemonic = "uxth"; + break; + case 31: + // *xtw is only valid for signed 64-bit operations. 
+ if (Is64Bit && IsSigned) + AsmMnemonic = "sxtw"; + break; + } + + if (AsmMnemonic) { + O << '\t' << AsmMnemonic << '\t' << getRegisterName(Op0.getReg()) + << ", " << getRegisterName(getWRegFromXReg(Op1.getReg())); + printAnnotation(O, Annot); + return; + } + } + + // All immediate shifts are aliases, implemented using the Bitfield + // instruction. In all cases the immediate shift amount shift must be in + // the range 0 to (reg.size -1). + if (Op2.isImm() && Op3.isImm()) { + const char *AsmMnemonic = nullptr; + int shift = 0; + int64_t immr = Op2.getImm(); + int64_t imms = Op3.getImm(); + if (Opcode == AArch64::UBFMWri && imms != 0x1F && ((imms + 1) == immr)) { + AsmMnemonic = "lsl"; + shift = 31 - imms; + } else if (Opcode == AArch64::UBFMXri && imms != 0x3f && + ((imms + 1 == immr))) { + AsmMnemonic = "lsl"; + shift = 63 - imms; + } else if (Opcode == AArch64::UBFMWri && imms == 0x1f) { + AsmMnemonic = "lsr"; + shift = immr; + } else if (Opcode == AArch64::UBFMXri && imms == 0x3f) { + AsmMnemonic = "lsr"; + shift = immr; + } else if (Opcode == AArch64::SBFMWri && imms == 0x1f) { + AsmMnemonic = "asr"; + shift = immr; + } else if (Opcode == AArch64::SBFMXri && imms == 0x3f) { + AsmMnemonic = "asr"; + shift = immr; + } + if (AsmMnemonic) { + O << '\t' << AsmMnemonic << '\t' << getRegisterName(Op0.getReg()) + << ", " << getRegisterName(Op1.getReg()) << ", #" << shift; + printAnnotation(O, Annot); + return; + } + } + + // SBFIZ/UBFIZ aliases + if (Op2.getImm() > Op3.getImm()) { + O << '\t' << (IsSigned ? "sbfiz" : "ubfiz") << '\t' + << getRegisterName(Op0.getReg()) << ", " << getRegisterName(Op1.getReg()) + << ", #" << (Is64Bit ? 64 : 32) - Op2.getImm() << ", #" << Op3.getImm() + 1; + printAnnotation(O, Annot); + return; + } + + // Otherwise SBFX/UBFX is the preferred form + O << '\t' << (IsSigned ? "sbfx" : "ubfx") << '\t' + << getRegisterName(Op0.getReg()) << ", " << getRegisterName(Op1.getReg()) + << ", #" << Op2.getImm() << ", #" << Op3.getImm() - Op2.getImm() + 1; + printAnnotation(O, Annot); + return; + } + + if (Opcode == AArch64::BFMXri || Opcode == AArch64::BFMWri) { + const MCOperand &Op0 = MI->getOperand(0); // Op1 == Op0 + const MCOperand &Op2 = MI->getOperand(2); + int ImmR = MI->getOperand(3).getImm(); + int ImmS = MI->getOperand(4).getImm(); + + // BFI alias + if (ImmS < ImmR) { + int BitWidth = Opcode == AArch64::BFMXri ? 64 : 32; + int LSB = (BitWidth - ImmR) % BitWidth; + int Width = ImmS + 1; + O << "\tbfi\t" << getRegisterName(Op0.getReg()) << ", " + << getRegisterName(Op2.getReg()) << ", #" << LSB << ", #" << Width; + printAnnotation(O, Annot); + return; + } + + int LSB = ImmR; + int Width = ImmS - ImmR + 1; + // Otherwise BFXIL the preferred form + O << "\tbfxil\t" + << getRegisterName(Op0.getReg()) << ", " << getRegisterName(Op2.getReg()) + << ", #" << LSB << ", #" << Width; + printAnnotation(O, Annot); + return; + } + + // Symbolic operands for MOVZ, MOVN and MOVK already imply a shift + // (e.g. :gottprel_g1: is always going to be "lsl #16") so it should not be + // printed. 
+ if ((Opcode == AArch64::MOVZXi || Opcode == AArch64::MOVZWi || + Opcode == AArch64::MOVNXi || Opcode == AArch64::MOVNWi) && + MI->getOperand(1).isExpr()) { + if (Opcode == AArch64::MOVZXi || Opcode == AArch64::MOVZWi) + O << "\tmovz\t"; + else + O << "\tmovn\t"; + + O << getRegisterName(MI->getOperand(0).getReg()) << ", #" + << *MI->getOperand(1).getExpr(); + return; + } + + if ((Opcode == AArch64::MOVKXi || Opcode == AArch64::MOVKWi) && + MI->getOperand(2).isExpr()) { + O << "\tmovk\t" << getRegisterName(MI->getOperand(0).getReg()) << ", #" + << *MI->getOperand(2).getExpr(); + return; + } + + if (!printAliasInstr(MI, O)) + printInstruction(MI, O); + + printAnnotation(O, Annot); +} + +static bool isTblTbxInstruction(unsigned Opcode, StringRef &Layout, + bool &IsTbx) { + switch (Opcode) { + case AArch64::TBXv8i8One: + case AArch64::TBXv8i8Two: + case AArch64::TBXv8i8Three: + case AArch64::TBXv8i8Four: + IsTbx = true; + Layout = ".8b"; + return true; + case AArch64::TBLv8i8One: + case AArch64::TBLv8i8Two: + case AArch64::TBLv8i8Three: + case AArch64::TBLv8i8Four: + IsTbx = false; + Layout = ".8b"; + return true; + case AArch64::TBXv16i8One: + case AArch64::TBXv16i8Two: + case AArch64::TBXv16i8Three: + case AArch64::TBXv16i8Four: + IsTbx = true; + Layout = ".16b"; + return true; + case AArch64::TBLv16i8One: + case AArch64::TBLv16i8Two: + case AArch64::TBLv16i8Three: + case AArch64::TBLv16i8Four: + IsTbx = false; + Layout = ".16b"; + return true; + default: + return false; + } +} + +struct LdStNInstrDesc { + unsigned Opcode; + const char *Mnemonic; + const char *Layout; + int ListOperand; + bool HasLane; + int NaturalOffset; +}; + +static LdStNInstrDesc LdStNInstInfo[] = { + { AArch64::LD1i8, "ld1", ".b", 1, true, 0 }, + { AArch64::LD1i16, "ld1", ".h", 1, true, 0 }, + { AArch64::LD1i32, "ld1", ".s", 1, true, 0 }, + { AArch64::LD1i64, "ld1", ".d", 1, true, 0 }, + { AArch64::LD1i8_POST, "ld1", ".b", 2, true, 1 }, + { AArch64::LD1i16_POST, "ld1", ".h", 2, true, 2 }, + { AArch64::LD1i32_POST, "ld1", ".s", 2, true, 4 }, + { AArch64::LD1i64_POST, "ld1", ".d", 2, true, 8 }, + { AArch64::LD1Rv16b, "ld1r", ".16b", 0, false, 0 }, + { AArch64::LD1Rv8h, "ld1r", ".8h", 0, false, 0 }, + { AArch64::LD1Rv4s, "ld1r", ".4s", 0, false, 0 }, + { AArch64::LD1Rv2d, "ld1r", ".2d", 0, false, 0 }, + { AArch64::LD1Rv8b, "ld1r", ".8b", 0, false, 0 }, + { AArch64::LD1Rv4h, "ld1r", ".4h", 0, false, 0 }, + { AArch64::LD1Rv2s, "ld1r", ".2s", 0, false, 0 }, + { AArch64::LD1Rv1d, "ld1r", ".1d", 0, false, 0 }, + { AArch64::LD1Rv16b_POST, "ld1r", ".16b", 1, false, 1 }, + { AArch64::LD1Rv8h_POST, "ld1r", ".8h", 1, false, 2 }, + { AArch64::LD1Rv4s_POST, "ld1r", ".4s", 1, false, 4 }, + { AArch64::LD1Rv2d_POST, "ld1r", ".2d", 1, false, 8 }, + { AArch64::LD1Rv8b_POST, "ld1r", ".8b", 1, false, 1 }, + { AArch64::LD1Rv4h_POST, "ld1r", ".4h", 1, false, 2 }, + { AArch64::LD1Rv2s_POST, "ld1r", ".2s", 1, false, 4 }, + { AArch64::LD1Rv1d_POST, "ld1r", ".1d", 1, false, 8 }, + { AArch64::LD1Onev16b, "ld1", ".16b", 0, false, 0 }, + { AArch64::LD1Onev8h, "ld1", ".8h", 0, false, 0 }, + { AArch64::LD1Onev4s, "ld1", ".4s", 0, false, 0 }, + { AArch64::LD1Onev2d, "ld1", ".2d", 0, false, 0 }, + { AArch64::LD1Onev8b, "ld1", ".8b", 0, false, 0 }, + { AArch64::LD1Onev4h, "ld1", ".4h", 0, false, 0 }, + { AArch64::LD1Onev2s, "ld1", ".2s", 0, false, 0 }, + { AArch64::LD1Onev1d, "ld1", ".1d", 0, false, 0 }, + { AArch64::LD1Onev16b_POST, "ld1", ".16b", 1, false, 16 }, + { AArch64::LD1Onev8h_POST, "ld1", ".8h", 1, false, 16 }, + { AArch64::LD1Onev4s_POST, 
"ld1", ".4s", 1, false, 16 }, + { AArch64::LD1Onev2d_POST, "ld1", ".2d", 1, false, 16 }, + { AArch64::LD1Onev8b_POST, "ld1", ".8b", 1, false, 8 }, + { AArch64::LD1Onev4h_POST, "ld1", ".4h", 1, false, 8 }, + { AArch64::LD1Onev2s_POST, "ld1", ".2s", 1, false, 8 }, + { AArch64::LD1Onev1d_POST, "ld1", ".1d", 1, false, 8 }, + { AArch64::LD1Twov16b, "ld1", ".16b", 0, false, 0 }, + { AArch64::LD1Twov8h, "ld1", ".8h", 0, false, 0 }, + { AArch64::LD1Twov4s, "ld1", ".4s", 0, false, 0 }, + { AArch64::LD1Twov2d, "ld1", ".2d", 0, false, 0 }, + { AArch64::LD1Twov8b, "ld1", ".8b", 0, false, 0 }, + { AArch64::LD1Twov4h, "ld1", ".4h", 0, false, 0 }, + { AArch64::LD1Twov2s, "ld1", ".2s", 0, false, 0 }, + { AArch64::LD1Twov1d, "ld1", ".1d", 0, false, 0 }, + { AArch64::LD1Twov16b_POST, "ld1", ".16b", 1, false, 32 }, + { AArch64::LD1Twov8h_POST, "ld1", ".8h", 1, false, 32 }, + { AArch64::LD1Twov4s_POST, "ld1", ".4s", 1, false, 32 }, + { AArch64::LD1Twov2d_POST, "ld1", ".2d", 1, false, 32 }, + { AArch64::LD1Twov8b_POST, "ld1", ".8b", 1, false, 16 }, + { AArch64::LD1Twov4h_POST, "ld1", ".4h", 1, false, 16 }, + { AArch64::LD1Twov2s_POST, "ld1", ".2s", 1, false, 16 }, + { AArch64::LD1Twov1d_POST, "ld1", ".1d", 1, false, 16 }, + { AArch64::LD1Threev16b, "ld1", ".16b", 0, false, 0 }, + { AArch64::LD1Threev8h, "ld1", ".8h", 0, false, 0 }, + { AArch64::LD1Threev4s, "ld1", ".4s", 0, false, 0 }, + { AArch64::LD1Threev2d, "ld1", ".2d", 0, false, 0 }, + { AArch64::LD1Threev8b, "ld1", ".8b", 0, false, 0 }, + { AArch64::LD1Threev4h, "ld1", ".4h", 0, false, 0 }, + { AArch64::LD1Threev2s, "ld1", ".2s", 0, false, 0 }, + { AArch64::LD1Threev1d, "ld1", ".1d", 0, false, 0 }, + { AArch64::LD1Threev16b_POST, "ld1", ".16b", 1, false, 48 }, + { AArch64::LD1Threev8h_POST, "ld1", ".8h", 1, false, 48 }, + { AArch64::LD1Threev4s_POST, "ld1", ".4s", 1, false, 48 }, + { AArch64::LD1Threev2d_POST, "ld1", ".2d", 1, false, 48 }, + { AArch64::LD1Threev8b_POST, "ld1", ".8b", 1, false, 24 }, + { AArch64::LD1Threev4h_POST, "ld1", ".4h", 1, false, 24 }, + { AArch64::LD1Threev2s_POST, "ld1", ".2s", 1, false, 24 }, + { AArch64::LD1Threev1d_POST, "ld1", ".1d", 1, false, 24 }, + { AArch64::LD1Fourv16b, "ld1", ".16b", 0, false, 0 }, + { AArch64::LD1Fourv8h, "ld1", ".8h", 0, false, 0 }, + { AArch64::LD1Fourv4s, "ld1", ".4s", 0, false, 0 }, + { AArch64::LD1Fourv2d, "ld1", ".2d", 0, false, 0 }, + { AArch64::LD1Fourv8b, "ld1", ".8b", 0, false, 0 }, + { AArch64::LD1Fourv4h, "ld1", ".4h", 0, false, 0 }, + { AArch64::LD1Fourv2s, "ld1", ".2s", 0, false, 0 }, + { AArch64::LD1Fourv1d, "ld1", ".1d", 0, false, 0 }, + { AArch64::LD1Fourv16b_POST, "ld1", ".16b", 1, false, 64 }, + { AArch64::LD1Fourv8h_POST, "ld1", ".8h", 1, false, 64 }, + { AArch64::LD1Fourv4s_POST, "ld1", ".4s", 1, false, 64 }, + { AArch64::LD1Fourv2d_POST, "ld1", ".2d", 1, false, 64 }, + { AArch64::LD1Fourv8b_POST, "ld1", ".8b", 1, false, 32 }, + { AArch64::LD1Fourv4h_POST, "ld1", ".4h", 1, false, 32 }, + { AArch64::LD1Fourv2s_POST, "ld1", ".2s", 1, false, 32 }, + { AArch64::LD1Fourv1d_POST, "ld1", ".1d", 1, false, 32 }, + { AArch64::LD2i8, "ld2", ".b", 1, true, 0 }, + { AArch64::LD2i16, "ld2", ".h", 1, true, 0 }, + { AArch64::LD2i32, "ld2", ".s", 1, true, 0 }, + { AArch64::LD2i64, "ld2", ".d", 1, true, 0 }, + { AArch64::LD2i8_POST, "ld2", ".b", 2, true, 2 }, + { AArch64::LD2i16_POST, "ld2", ".h", 2, true, 4 }, + { AArch64::LD2i32_POST, "ld2", ".s", 2, true, 8 }, + { AArch64::LD2i64_POST, "ld2", ".d", 2, true, 16 }, + { AArch64::LD2Rv16b, "ld2r", ".16b", 0, false, 0 }, + { AArch64::LD2Rv8h, 
"ld2r", ".8h", 0, false, 0 }, + { AArch64::LD2Rv4s, "ld2r", ".4s", 0, false, 0 }, + { AArch64::LD2Rv2d, "ld2r", ".2d", 0, false, 0 }, + { AArch64::LD2Rv8b, "ld2r", ".8b", 0, false, 0 }, + { AArch64::LD2Rv4h, "ld2r", ".4h", 0, false, 0 }, + { AArch64::LD2Rv2s, "ld2r", ".2s", 0, false, 0 }, + { AArch64::LD2Rv1d, "ld2r", ".1d", 0, false, 0 }, + { AArch64::LD2Rv16b_POST, "ld2r", ".16b", 1, false, 2 }, + { AArch64::LD2Rv8h_POST, "ld2r", ".8h", 1, false, 4 }, + { AArch64::LD2Rv4s_POST, "ld2r", ".4s", 1, false, 8 }, + { AArch64::LD2Rv2d_POST, "ld2r", ".2d", 1, false, 16 }, + { AArch64::LD2Rv8b_POST, "ld2r", ".8b", 1, false, 2 }, + { AArch64::LD2Rv4h_POST, "ld2r", ".4h", 1, false, 4 }, + { AArch64::LD2Rv2s_POST, "ld2r", ".2s", 1, false, 8 }, + { AArch64::LD2Rv1d_POST, "ld2r", ".1d", 1, false, 16 }, + { AArch64::LD2Twov16b, "ld2", ".16b", 0, false, 0 }, + { AArch64::LD2Twov8h, "ld2", ".8h", 0, false, 0 }, + { AArch64::LD2Twov4s, "ld2", ".4s", 0, false, 0 }, + { AArch64::LD2Twov2d, "ld2", ".2d", 0, false, 0 }, + { AArch64::LD2Twov8b, "ld2", ".8b", 0, false, 0 }, + { AArch64::LD2Twov4h, "ld2", ".4h", 0, false, 0 }, + { AArch64::LD2Twov2s, "ld2", ".2s", 0, false, 0 }, + { AArch64::LD2Twov16b_POST, "ld2", ".16b", 1, false, 32 }, + { AArch64::LD2Twov8h_POST, "ld2", ".8h", 1, false, 32 }, + { AArch64::LD2Twov4s_POST, "ld2", ".4s", 1, false, 32 }, + { AArch64::LD2Twov2d_POST, "ld2", ".2d", 1, false, 32 }, + { AArch64::LD2Twov8b_POST, "ld2", ".8b", 1, false, 16 }, + { AArch64::LD2Twov4h_POST, "ld2", ".4h", 1, false, 16 }, + { AArch64::LD2Twov2s_POST, "ld2", ".2s", 1, false, 16 }, + { AArch64::LD3i8, "ld3", ".b", 1, true, 0 }, + { AArch64::LD3i16, "ld3", ".h", 1, true, 0 }, + { AArch64::LD3i32, "ld3", ".s", 1, true, 0 }, + { AArch64::LD3i64, "ld3", ".d", 1, true, 0 }, + { AArch64::LD3i8_POST, "ld3", ".b", 2, true, 3 }, + { AArch64::LD3i16_POST, "ld3", ".h", 2, true, 6 }, + { AArch64::LD3i32_POST, "ld3", ".s", 2, true, 12 }, + { AArch64::LD3i64_POST, "ld3", ".d", 2, true, 24 }, + { AArch64::LD3Rv16b, "ld3r", ".16b", 0, false, 0 }, + { AArch64::LD3Rv8h, "ld3r", ".8h", 0, false, 0 }, + { AArch64::LD3Rv4s, "ld3r", ".4s", 0, false, 0 }, + { AArch64::LD3Rv2d, "ld3r", ".2d", 0, false, 0 }, + { AArch64::LD3Rv8b, "ld3r", ".8b", 0, false, 0 }, + { AArch64::LD3Rv4h, "ld3r", ".4h", 0, false, 0 }, + { AArch64::LD3Rv2s, "ld3r", ".2s", 0, false, 0 }, + { AArch64::LD3Rv1d, "ld3r", ".1d", 0, false, 0 }, + { AArch64::LD3Rv16b_POST, "ld3r", ".16b", 1, false, 3 }, + { AArch64::LD3Rv8h_POST, "ld3r", ".8h", 1, false, 6 }, + { AArch64::LD3Rv4s_POST, "ld3r", ".4s", 1, false, 12 }, + { AArch64::LD3Rv2d_POST, "ld3r", ".2d", 1, false, 24 }, + { AArch64::LD3Rv8b_POST, "ld3r", ".8b", 1, false, 3 }, + { AArch64::LD3Rv4h_POST, "ld3r", ".4h", 1, false, 6 }, + { AArch64::LD3Rv2s_POST, "ld3r", ".2s", 1, false, 12 }, + { AArch64::LD3Rv1d_POST, "ld3r", ".1d", 1, false, 24 }, + { AArch64::LD3Threev16b, "ld3", ".16b", 0, false, 0 }, + { AArch64::LD3Threev8h, "ld3", ".8h", 0, false, 0 }, + { AArch64::LD3Threev4s, "ld3", ".4s", 0, false, 0 }, + { AArch64::LD3Threev2d, "ld3", ".2d", 0, false, 0 }, + { AArch64::LD3Threev8b, "ld3", ".8b", 0, false, 0 }, + { AArch64::LD3Threev4h, "ld3", ".4h", 0, false, 0 }, + { AArch64::LD3Threev2s, "ld3", ".2s", 0, false, 0 }, + { AArch64::LD3Threev16b_POST, "ld3", ".16b", 1, false, 48 }, + { AArch64::LD3Threev8h_POST, "ld3", ".8h", 1, false, 48 }, + { AArch64::LD3Threev4s_POST, "ld3", ".4s", 1, false, 48 }, + { AArch64::LD3Threev2d_POST, "ld3", ".2d", 1, false, 48 }, + { AArch64::LD3Threev8b_POST, "ld3", ".8b", 1, 
false, 24 }, + { AArch64::LD3Threev4h_POST, "ld3", ".4h", 1, false, 24 }, + { AArch64::LD3Threev2s_POST, "ld3", ".2s", 1, false, 24 }, + { AArch64::LD4i8, "ld4", ".b", 1, true, 0 }, + { AArch64::LD4i16, "ld4", ".h", 1, true, 0 }, + { AArch64::LD4i32, "ld4", ".s", 1, true, 0 }, + { AArch64::LD4i64, "ld4", ".d", 1, true, 0 }, + { AArch64::LD4i8_POST, "ld4", ".b", 2, true, 4 }, + { AArch64::LD4i16_POST, "ld4", ".h", 2, true, 8 }, + { AArch64::LD4i32_POST, "ld4", ".s", 2, true, 16 }, + { AArch64::LD4i64_POST, "ld4", ".d", 2, true, 32 }, + { AArch64::LD4Rv16b, "ld4r", ".16b", 0, false, 0 }, + { AArch64::LD4Rv8h, "ld4r", ".8h", 0, false, 0 }, + { AArch64::LD4Rv4s, "ld4r", ".4s", 0, false, 0 }, + { AArch64::LD4Rv2d, "ld4r", ".2d", 0, false, 0 }, + { AArch64::LD4Rv8b, "ld4r", ".8b", 0, false, 0 }, + { AArch64::LD4Rv4h, "ld4r", ".4h", 0, false, 0 }, + { AArch64::LD4Rv2s, "ld4r", ".2s", 0, false, 0 }, + { AArch64::LD4Rv1d, "ld4r", ".1d", 0, false, 0 }, + { AArch64::LD4Rv16b_POST, "ld4r", ".16b", 1, false, 4 }, + { AArch64::LD4Rv8h_POST, "ld4r", ".8h", 1, false, 8 }, + { AArch64::LD4Rv4s_POST, "ld4r", ".4s", 1, false, 16 }, + { AArch64::LD4Rv2d_POST, "ld4r", ".2d", 1, false, 32 }, + { AArch64::LD4Rv8b_POST, "ld4r", ".8b", 1, false, 4 }, + { AArch64::LD4Rv4h_POST, "ld4r", ".4h", 1, false, 8 }, + { AArch64::LD4Rv2s_POST, "ld4r", ".2s", 1, false, 16 }, + { AArch64::LD4Rv1d_POST, "ld4r", ".1d", 1, false, 32 }, + { AArch64::LD4Fourv16b, "ld4", ".16b", 0, false, 0 }, + { AArch64::LD4Fourv8h, "ld4", ".8h", 0, false, 0 }, + { AArch64::LD4Fourv4s, "ld4", ".4s", 0, false, 0 }, + { AArch64::LD4Fourv2d, "ld4", ".2d", 0, false, 0 }, + { AArch64::LD4Fourv8b, "ld4", ".8b", 0, false, 0 }, + { AArch64::LD4Fourv4h, "ld4", ".4h", 0, false, 0 }, + { AArch64::LD4Fourv2s, "ld4", ".2s", 0, false, 0 }, + { AArch64::LD4Fourv16b_POST, "ld4", ".16b", 1, false, 64 }, + { AArch64::LD4Fourv8h_POST, "ld4", ".8h", 1, false, 64 }, + { AArch64::LD4Fourv4s_POST, "ld4", ".4s", 1, false, 64 }, + { AArch64::LD4Fourv2d_POST, "ld4", ".2d", 1, false, 64 }, + { AArch64::LD4Fourv8b_POST, "ld4", ".8b", 1, false, 32 }, + { AArch64::LD4Fourv4h_POST, "ld4", ".4h", 1, false, 32 }, + { AArch64::LD4Fourv2s_POST, "ld4", ".2s", 1, false, 32 }, + { AArch64::ST1i8, "st1", ".b", 0, true, 0 }, + { AArch64::ST1i16, "st1", ".h", 0, true, 0 }, + { AArch64::ST1i32, "st1", ".s", 0, true, 0 }, + { AArch64::ST1i64, "st1", ".d", 0, true, 0 }, + { AArch64::ST1i8_POST, "st1", ".b", 1, true, 1 }, + { AArch64::ST1i16_POST, "st1", ".h", 1, true, 2 }, + { AArch64::ST1i32_POST, "st1", ".s", 1, true, 4 }, + { AArch64::ST1i64_POST, "st1", ".d", 1, true, 8 }, + { AArch64::ST1Onev16b, "st1", ".16b", 0, false, 0 }, + { AArch64::ST1Onev8h, "st1", ".8h", 0, false, 0 }, + { AArch64::ST1Onev4s, "st1", ".4s", 0, false, 0 }, + { AArch64::ST1Onev2d, "st1", ".2d", 0, false, 0 }, + { AArch64::ST1Onev8b, "st1", ".8b", 0, false, 0 }, + { AArch64::ST1Onev4h, "st1", ".4h", 0, false, 0 }, + { AArch64::ST1Onev2s, "st1", ".2s", 0, false, 0 }, + { AArch64::ST1Onev1d, "st1", ".1d", 0, false, 0 }, + { AArch64::ST1Onev16b_POST, "st1", ".16b", 1, false, 16 }, + { AArch64::ST1Onev8h_POST, "st1", ".8h", 1, false, 16 }, + { AArch64::ST1Onev4s_POST, "st1", ".4s", 1, false, 16 }, + { AArch64::ST1Onev2d_POST, "st1", ".2d", 1, false, 16 }, + { AArch64::ST1Onev8b_POST, "st1", ".8b", 1, false, 8 }, + { AArch64::ST1Onev4h_POST, "st1", ".4h", 1, false, 8 }, + { AArch64::ST1Onev2s_POST, "st1", ".2s", 1, false, 8 }, + { AArch64::ST1Onev1d_POST, "st1", ".1d", 1, false, 8 }, + { AArch64::ST1Twov16b, "st1", 
".16b", 0, false, 0 }, + { AArch64::ST1Twov8h, "st1", ".8h", 0, false, 0 }, + { AArch64::ST1Twov4s, "st1", ".4s", 0, false, 0 }, + { AArch64::ST1Twov2d, "st1", ".2d", 0, false, 0 }, + { AArch64::ST1Twov8b, "st1", ".8b", 0, false, 0 }, + { AArch64::ST1Twov4h, "st1", ".4h", 0, false, 0 }, + { AArch64::ST1Twov2s, "st1", ".2s", 0, false, 0 }, + { AArch64::ST1Twov1d, "st1", ".1d", 0, false, 0 }, + { AArch64::ST1Twov16b_POST, "st1", ".16b", 1, false, 32 }, + { AArch64::ST1Twov8h_POST, "st1", ".8h", 1, false, 32 }, + { AArch64::ST1Twov4s_POST, "st1", ".4s", 1, false, 32 }, + { AArch64::ST1Twov2d_POST, "st1", ".2d", 1, false, 32 }, + { AArch64::ST1Twov8b_POST, "st1", ".8b", 1, false, 16 }, + { AArch64::ST1Twov4h_POST, "st1", ".4h", 1, false, 16 }, + { AArch64::ST1Twov2s_POST, "st1", ".2s", 1, false, 16 }, + { AArch64::ST1Twov1d_POST, "st1", ".1d", 1, false, 16 }, + { AArch64::ST1Threev16b, "st1", ".16b", 0, false, 0 }, + { AArch64::ST1Threev8h, "st1", ".8h", 0, false, 0 }, + { AArch64::ST1Threev4s, "st1", ".4s", 0, false, 0 }, + { AArch64::ST1Threev2d, "st1", ".2d", 0, false, 0 }, + { AArch64::ST1Threev8b, "st1", ".8b", 0, false, 0 }, + { AArch64::ST1Threev4h, "st1", ".4h", 0, false, 0 }, + { AArch64::ST1Threev2s, "st1", ".2s", 0, false, 0 }, + { AArch64::ST1Threev1d, "st1", ".1d", 0, false, 0 }, + { AArch64::ST1Threev16b_POST, "st1", ".16b", 1, false, 48 }, + { AArch64::ST1Threev8h_POST, "st1", ".8h", 1, false, 48 }, + { AArch64::ST1Threev4s_POST, "st1", ".4s", 1, false, 48 }, + { AArch64::ST1Threev2d_POST, "st1", ".2d", 1, false, 48 }, + { AArch64::ST1Threev8b_POST, "st1", ".8b", 1, false, 24 }, + { AArch64::ST1Threev4h_POST, "st1", ".4h", 1, false, 24 }, + { AArch64::ST1Threev2s_POST, "st1", ".2s", 1, false, 24 }, + { AArch64::ST1Threev1d_POST, "st1", ".1d", 1, false, 24 }, + { AArch64::ST1Fourv16b, "st1", ".16b", 0, false, 0 }, + { AArch64::ST1Fourv8h, "st1", ".8h", 0, false, 0 }, + { AArch64::ST1Fourv4s, "st1", ".4s", 0, false, 0 }, + { AArch64::ST1Fourv2d, "st1", ".2d", 0, false, 0 }, + { AArch64::ST1Fourv8b, "st1", ".8b", 0, false, 0 }, + { AArch64::ST1Fourv4h, "st1", ".4h", 0, false, 0 }, + { AArch64::ST1Fourv2s, "st1", ".2s", 0, false, 0 }, + { AArch64::ST1Fourv1d, "st1", ".1d", 0, false, 0 }, + { AArch64::ST1Fourv16b_POST, "st1", ".16b", 1, false, 64 }, + { AArch64::ST1Fourv8h_POST, "st1", ".8h", 1, false, 64 }, + { AArch64::ST1Fourv4s_POST, "st1", ".4s", 1, false, 64 }, + { AArch64::ST1Fourv2d_POST, "st1", ".2d", 1, false, 64 }, + { AArch64::ST1Fourv8b_POST, "st1", ".8b", 1, false, 32 }, + { AArch64::ST1Fourv4h_POST, "st1", ".4h", 1, false, 32 }, + { AArch64::ST1Fourv2s_POST, "st1", ".2s", 1, false, 32 }, + { AArch64::ST1Fourv1d_POST, "st1", ".1d", 1, false, 32 }, + { AArch64::ST2i8, "st2", ".b", 0, true, 0 }, + { AArch64::ST2i16, "st2", ".h", 0, true, 0 }, + { AArch64::ST2i32, "st2", ".s", 0, true, 0 }, + { AArch64::ST2i64, "st2", ".d", 0, true, 0 }, + { AArch64::ST2i8_POST, "st2", ".b", 1, true, 2 }, + { AArch64::ST2i16_POST, "st2", ".h", 1, true, 4 }, + { AArch64::ST2i32_POST, "st2", ".s", 1, true, 8 }, + { AArch64::ST2i64_POST, "st2", ".d", 1, true, 16 }, + { AArch64::ST2Twov16b, "st2", ".16b", 0, false, 0 }, + { AArch64::ST2Twov8h, "st2", ".8h", 0, false, 0 }, + { AArch64::ST2Twov4s, "st2", ".4s", 0, false, 0 }, + { AArch64::ST2Twov2d, "st2", ".2d", 0, false, 0 }, + { AArch64::ST2Twov8b, "st2", ".8b", 0, false, 0 }, + { AArch64::ST2Twov4h, "st2", ".4h", 0, false, 0 }, + { AArch64::ST2Twov2s, "st2", ".2s", 0, false, 0 }, + { AArch64::ST2Twov16b_POST, "st2", ".16b", 1, false, 32 }, + 
{ AArch64::ST2Twov8h_POST, "st2", ".8h", 1, false, 32 }, + { AArch64::ST2Twov4s_POST, "st2", ".4s", 1, false, 32 }, + { AArch64::ST2Twov2d_POST, "st2", ".2d", 1, false, 32 }, + { AArch64::ST2Twov8b_POST, "st2", ".8b", 1, false, 16 }, + { AArch64::ST2Twov4h_POST, "st2", ".4h", 1, false, 16 }, + { AArch64::ST2Twov2s_POST, "st2", ".2s", 1, false, 16 }, + { AArch64::ST3i8, "st3", ".b", 0, true, 0 }, + { AArch64::ST3i16, "st3", ".h", 0, true, 0 }, + { AArch64::ST3i32, "st3", ".s", 0, true, 0 }, + { AArch64::ST3i64, "st3", ".d", 0, true, 0 }, + { AArch64::ST3i8_POST, "st3", ".b", 1, true, 3 }, + { AArch64::ST3i16_POST, "st3", ".h", 1, true, 6 }, + { AArch64::ST3i32_POST, "st3", ".s", 1, true, 12 }, + { AArch64::ST3i64_POST, "st3", ".d", 1, true, 24 }, + { AArch64::ST3Threev16b, "st3", ".16b", 0, false, 0 }, + { AArch64::ST3Threev8h, "st3", ".8h", 0, false, 0 }, + { AArch64::ST3Threev4s, "st3", ".4s", 0, false, 0 }, + { AArch64::ST3Threev2d, "st3", ".2d", 0, false, 0 }, + { AArch64::ST3Threev8b, "st3", ".8b", 0, false, 0 }, + { AArch64::ST3Threev4h, "st3", ".4h", 0, false, 0 }, + { AArch64::ST3Threev2s, "st3", ".2s", 0, false, 0 }, + { AArch64::ST3Threev16b_POST, "st3", ".16b", 1, false, 48 }, + { AArch64::ST3Threev8h_POST, "st3", ".8h", 1, false, 48 }, + { AArch64::ST3Threev4s_POST, "st3", ".4s", 1, false, 48 }, + { AArch64::ST3Threev2d_POST, "st3", ".2d", 1, false, 48 }, + { AArch64::ST3Threev8b_POST, "st3", ".8b", 1, false, 24 }, + { AArch64::ST3Threev4h_POST, "st3", ".4h", 1, false, 24 }, + { AArch64::ST3Threev2s_POST, "st3", ".2s", 1, false, 24 }, + { AArch64::ST4i8, "st4", ".b", 0, true, 0 }, + { AArch64::ST4i16, "st4", ".h", 0, true, 0 }, + { AArch64::ST4i32, "st4", ".s", 0, true, 0 }, + { AArch64::ST4i64, "st4", ".d", 0, true, 0 }, + { AArch64::ST4i8_POST, "st4", ".b", 1, true, 4 }, + { AArch64::ST4i16_POST, "st4", ".h", 1, true, 8 }, + { AArch64::ST4i32_POST, "st4", ".s", 1, true, 16 }, + { AArch64::ST4i64_POST, "st4", ".d", 1, true, 32 }, + { AArch64::ST4Fourv16b, "st4", ".16b", 0, false, 0 }, + { AArch64::ST4Fourv8h, "st4", ".8h", 0, false, 0 }, + { AArch64::ST4Fourv4s, "st4", ".4s", 0, false, 0 }, + { AArch64::ST4Fourv2d, "st4", ".2d", 0, false, 0 }, + { AArch64::ST4Fourv8b, "st4", ".8b", 0, false, 0 }, + { AArch64::ST4Fourv4h, "st4", ".4h", 0, false, 0 }, + { AArch64::ST4Fourv2s, "st4", ".2s", 0, false, 0 }, + { AArch64::ST4Fourv16b_POST, "st4", ".16b", 1, false, 64 }, + { AArch64::ST4Fourv8h_POST, "st4", ".8h", 1, false, 64 }, + { AArch64::ST4Fourv4s_POST, "st4", ".4s", 1, false, 64 }, + { AArch64::ST4Fourv2d_POST, "st4", ".2d", 1, false, 64 }, + { AArch64::ST4Fourv8b_POST, "st4", ".8b", 1, false, 32 }, + { AArch64::ST4Fourv4h_POST, "st4", ".4h", 1, false, 32 }, + { AArch64::ST4Fourv2s_POST, "st4", ".2s", 1, false, 32 }, +}; + +static LdStNInstrDesc *getLdStNInstrDesc(unsigned Opcode) { + unsigned Idx; + for (Idx = 0; Idx != array_lengthof(LdStNInstInfo); ++Idx) + if (LdStNInstInfo[Idx].Opcode == Opcode) + return &LdStNInstInfo[Idx]; + + return nullptr; +} + +void AArch64AppleInstPrinter::printInst(const MCInst *MI, raw_ostream &O, + StringRef Annot) { + unsigned Opcode = MI->getOpcode(); + StringRef Layout, Mnemonic; + + bool IsTbx; + if (isTblTbxInstruction(MI->getOpcode(), Layout, IsTbx)) { + O << "\t" << (IsTbx ? "tbx" : "tbl") << Layout << '\t' + << getRegisterName(MI->getOperand(0).getReg(), AArch64::vreg) << ", "; + + unsigned ListOpNum = IsTbx ? 
2 : 1; + printVectorList(MI, ListOpNum, O, ""); + + O << ", " + << getRegisterName(MI->getOperand(ListOpNum + 1).getReg(), AArch64::vreg); + printAnnotation(O, Annot); + return; + } + + if (LdStNInstrDesc *LdStDesc = getLdStNInstrDesc(Opcode)) { + O << "\t" << LdStDesc->Mnemonic << LdStDesc->Layout << '\t'; + + // Now onto the operands: first a vector list with possible lane + // specifier. E.g. { v0 }[2] + int OpNum = LdStDesc->ListOperand; + printVectorList(MI, OpNum++, O, ""); + + if (LdStDesc->HasLane) + O << '[' << MI->getOperand(OpNum++).getImm() << ']'; + + // Next the address: [xN] + unsigned AddrReg = MI->getOperand(OpNum++).getReg(); + O << ", [" << getRegisterName(AddrReg) << ']'; + + // Finally, there might be a post-indexed offset. + if (LdStDesc->NaturalOffset != 0) { + unsigned Reg = MI->getOperand(OpNum++).getReg(); + if (Reg != AArch64::XZR) + O << ", " << getRegisterName(Reg); + else { + assert(LdStDesc->NaturalOffset && "no offset on post-inc instruction?"); + O << ", #" << LdStDesc->NaturalOffset; + } + } + + printAnnotation(O, Annot); + return; + } + + AArch64InstPrinter::printInst(MI, O, Annot); +} + +bool AArch64InstPrinter::printSysAlias(const MCInst *MI, raw_ostream &O) { +#ifndef NDEBUG + unsigned Opcode = MI->getOpcode(); + assert(Opcode == AArch64::SYSxt && "Invalid opcode for SYS alias!"); +#endif + + const char *Asm = nullptr; + const MCOperand &Op1 = MI->getOperand(0); + const MCOperand &Cn = MI->getOperand(1); + const MCOperand &Cm = MI->getOperand(2); + const MCOperand &Op2 = MI->getOperand(3); + + unsigned Op1Val = Op1.getImm(); + unsigned CnVal = Cn.getImm(); + unsigned CmVal = Cm.getImm(); + unsigned Op2Val = Op2.getImm(); + + if (CnVal == 7) { + switch (CmVal) { + default: + break; + + // IC aliases + case 1: + if (Op1Val == 0 && Op2Val == 0) + Asm = "ic\tialluis"; + break; + case 5: + if (Op1Val == 0 && Op2Val == 0) + Asm = "ic\tiallu"; + else if (Op1Val == 3 && Op2Val == 1) + Asm = "ic\tivau"; + break; + + // DC aliases + case 4: + if (Op1Val == 3 && Op2Val == 1) + Asm = "dc\tzva"; + break; + case 6: + if (Op1Val == 0 && Op2Val == 1) + Asm = "dc\tivac"; + if (Op1Val == 0 && Op2Val == 2) + Asm = "dc\tisw"; + break; + case 10: + if (Op1Val == 3 && Op2Val == 1) + Asm = "dc\tcvac"; + else if (Op1Val == 0 && Op2Val == 2) + Asm = "dc\tcsw"; + break; + case 11: + if (Op1Val == 3 && Op2Val == 1) + Asm = "dc\tcvau"; + break; + case 14: + if (Op1Val == 3 && Op2Val == 1) + Asm = "dc\tcivac"; + else if (Op1Val == 0 && Op2Val == 2) + Asm = "dc\tcisw"; + break; + + // AT aliases + case 8: + switch (Op1Val) { + default: + break; + case 0: + switch (Op2Val) { + default: + break; + case 0: Asm = "at\ts1e1r"; break; + case 1: Asm = "at\ts1e1w"; break; + case 2: Asm = "at\ts1e0r"; break; + case 3: Asm = "at\ts1e0w"; break; + } + break; + case 4: + switch (Op2Val) { + default: + break; + case 0: Asm = "at\ts1e2r"; break; + case 1: Asm = "at\ts1e2w"; break; + case 4: Asm = "at\ts12e1r"; break; + case 5: Asm = "at\ts12e1w"; break; + case 6: Asm = "at\ts12e0r"; break; + case 7: Asm = "at\ts12e0w"; break; + } + break; + case 6: + switch (Op2Val) { + default: + break; + case 0: Asm = "at\ts1e3r"; break; + case 1: Asm = "at\ts1e3w"; break; + } + break; + } + break; + } + } else if (CnVal == 8) { + // TLBI aliases + switch (CmVal) { + default: + break; + case 3: + switch (Op1Val) { + default: + break; + case 0: + switch (Op2Val) { + default: + break; + case 0: Asm = "tlbi\tvmalle1is"; break; + case 1: Asm = "tlbi\tvae1is"; break; + case 2: Asm = "tlbi\taside1is"; break; + case 
3: Asm = "tlbi\tvaae1is"; break; + case 5: Asm = "tlbi\tvale1is"; break; + case 7: Asm = "tlbi\tvaale1is"; break; + } + break; + case 4: + switch (Op2Val) { + default: + break; + case 0: Asm = "tlbi\talle2is"; break; + case 1: Asm = "tlbi\tvae2is"; break; + case 4: Asm = "tlbi\talle1is"; break; + case 5: Asm = "tlbi\tvale2is"; break; + case 6: Asm = "tlbi\tvmalls12e1is"; break; + } + break; + case 6: + switch (Op2Val) { + default: + break; + case 0: Asm = "tlbi\talle3is"; break; + case 1: Asm = "tlbi\tvae3is"; break; + case 5: Asm = "tlbi\tvale3is"; break; + } + break; + } + break; + case 0: + switch (Op1Val) { + default: + break; + case 4: + switch (Op2Val) { + default: + break; + case 1: Asm = "tlbi\tipas2e1is"; break; + case 5: Asm = "tlbi\tipas2le1is"; break; + } + break; + } + break; + case 4: + switch (Op1Val) { + default: + break; + case 4: + switch (Op2Val) { + default: + break; + case 1: Asm = "tlbi\tipas2e1"; break; + case 5: Asm = "tlbi\tipas2le1"; break; + } + break; + } + break; + case 7: + switch (Op1Val) { + default: + break; + case 0: + switch (Op2Val) { + default: + break; + case 0: Asm = "tlbi\tvmalle1"; break; + case 1: Asm = "tlbi\tvae1"; break; + case 2: Asm = "tlbi\taside1"; break; + case 3: Asm = "tlbi\tvaae1"; break; + case 5: Asm = "tlbi\tvale1"; break; + case 7: Asm = "tlbi\tvaale1"; break; + } + break; + case 4: + switch (Op2Val) { + default: + break; + case 0: Asm = "tlbi\talle2"; break; + case 1: Asm = "tlbi\tvae2"; break; + case 4: Asm = "tlbi\talle1"; break; + case 5: Asm = "tlbi\tvale2"; break; + case 6: Asm = "tlbi\tvmalls12e1"; break; + } + break; + case 6: + switch (Op2Val) { + default: + break; + case 0: Asm = "tlbi\talle3"; break; + case 1: Asm = "tlbi\tvae3"; break; + case 5: Asm = "tlbi\tvale3"; break; + } + break; + } + break; + } + } + + if (Asm) { + unsigned Reg = MI->getOperand(4).getReg(); + + O << '\t' << Asm; + if (StringRef(Asm).lower().find("all") == StringRef::npos) + O << ", " << getRegisterName(Reg); + } + + return Asm != nullptr; +} + +void AArch64InstPrinter::printOperand(const MCInst *MI, unsigned OpNo, + raw_ostream &O) { + const MCOperand &Op = MI->getOperand(OpNo); + if (Op.isReg()) { + unsigned Reg = Op.getReg(); + O << getRegisterName(Reg); + } else if (Op.isImm()) { + O << '#' << Op.getImm(); + } else { + assert(Op.isExpr() && "unknown operand kind in printOperand"); + O << *Op.getExpr(); + } +} + +void AArch64InstPrinter::printHexImm(const MCInst *MI, unsigned OpNo, + raw_ostream &O) { + const MCOperand &Op = MI->getOperand(OpNo); + O << format("#%#llx", Op.getImm()); +} + +void AArch64InstPrinter::printPostIncOperand(const MCInst *MI, unsigned OpNo, + unsigned Imm, raw_ostream &O) { + const MCOperand &Op = MI->getOperand(OpNo); + if (Op.isReg()) { + unsigned Reg = Op.getReg(); + if (Reg == AArch64::XZR) + O << "#" << Imm; + else + O << getRegisterName(Reg); + } else + assert(0 && "unknown operand kind in printPostIncOperand64"); +} + +void AArch64InstPrinter::printVRegOperand(const MCInst *MI, unsigned OpNo, + raw_ostream &O) { + const MCOperand &Op = MI->getOperand(OpNo); + assert(Op.isReg() && "Non-register vreg operand!"); + unsigned Reg = Op.getReg(); + O << getRegisterName(Reg, AArch64::vreg); +} + +void AArch64InstPrinter::printSysCROperand(const MCInst *MI, unsigned OpNo, + raw_ostream &O) { + const MCOperand &Op = MI->getOperand(OpNo); + assert(Op.isImm() && "System instruction C[nm] operands must be immediates!"); + O << "c" << Op.getImm(); +} + +void AArch64InstPrinter::printAddSubImm(const MCInst *MI, unsigned OpNum, 
+ raw_ostream &O) { + const MCOperand &MO = MI->getOperand(OpNum); + if (MO.isImm()) { + unsigned Val = (MO.getImm() & 0xfff); + assert(Val == MO.getImm() && "Add/sub immediate out of range!"); + unsigned Shift = + AArch64_AM::getShiftValue(MI->getOperand(OpNum + 1).getImm()); + O << '#' << Val; + if (Shift != 0) + printShifter(MI, OpNum + 1, O); + + if (CommentStream) + *CommentStream << '=' << (Val << Shift) << '\n'; + } else { + assert(MO.isExpr() && "Unexpected operand type!"); + O << *MO.getExpr(); + printShifter(MI, OpNum + 1, O); + } +} + +void AArch64InstPrinter::printLogicalImm32(const MCInst *MI, unsigned OpNum, + raw_ostream &O) { + uint64_t Val = MI->getOperand(OpNum).getImm(); + O << "#0x"; + O.write_hex(AArch64_AM::decodeLogicalImmediate(Val, 32)); +} + +void AArch64InstPrinter::printLogicalImm64(const MCInst *MI, unsigned OpNum, + raw_ostream &O) { + uint64_t Val = MI->getOperand(OpNum).getImm(); + O << "#0x"; + O.write_hex(AArch64_AM::decodeLogicalImmediate(Val, 64)); +} + +void AArch64InstPrinter::printShifter(const MCInst *MI, unsigned OpNum, + raw_ostream &O) { + unsigned Val = MI->getOperand(OpNum).getImm(); + // LSL #0 should not be printed. + if (AArch64_AM::getShiftType(Val) == AArch64_AM::LSL && + AArch64_AM::getShiftValue(Val) == 0) + return; + O << ", " << AArch64_AM::getShiftExtendName(AArch64_AM::getShiftType(Val)) + << " #" << AArch64_AM::getShiftValue(Val); +} + +void AArch64InstPrinter::printShiftedRegister(const MCInst *MI, unsigned OpNum, + raw_ostream &O) { + O << getRegisterName(MI->getOperand(OpNum).getReg()); + printShifter(MI, OpNum + 1, O); +} + +void AArch64InstPrinter::printExtendedRegister(const MCInst *MI, unsigned OpNum, + raw_ostream &O) { + O << getRegisterName(MI->getOperand(OpNum).getReg()); + printArithExtend(MI, OpNum + 1, O); +} + +void AArch64InstPrinter::printArithExtend(const MCInst *MI, unsigned OpNum, + raw_ostream &O) { + unsigned Val = MI->getOperand(OpNum).getImm(); + AArch64_AM::ShiftExtendType ExtType = AArch64_AM::getArithExtendType(Val); + unsigned ShiftVal = AArch64_AM::getArithShiftValue(Val); + + // If the destination or first source register operand is [W]SP, print + // UXTW/UXTX as LSL, and if the shift amount is also zero, print nothing at + // all. + if (ExtType == AArch64_AM::UXTW || ExtType == AArch64_AM::UXTX) { + unsigned Dest = MI->getOperand(0).getReg(); + unsigned Src1 = MI->getOperand(1).getReg(); + if ( ((Dest == AArch64::SP || Src1 == AArch64::SP) && + ExtType == AArch64_AM::UXTX) || + ((Dest == AArch64::WSP || Src1 == AArch64::WSP) && + ExtType == AArch64_AM::UXTW) ) { + if (ShiftVal != 0) + O << ", lsl #" << ShiftVal; + return; + } + } + O << ", " << AArch64_AM::getShiftExtendName(ExtType); + if (ShiftVal != 0) + O << " #" << ShiftVal; +} + +void AArch64InstPrinter::printMemExtend(const MCInst *MI, unsigned OpNum, + raw_ostream &O, char SrcRegKind, + unsigned Width) { + unsigned SignExtend = MI->getOperand(OpNum).getImm(); + unsigned DoShift = MI->getOperand(OpNum + 1).getImm(); + + // sxtw, sxtx, uxtw or lsl (== uxtx) + bool IsLSL = !SignExtend && SrcRegKind == 'x'; + if (IsLSL) + O << "lsl"; + else + O << (SignExtend ? 
's' : 'u') << "xt" << SrcRegKind; + + if (DoShift || IsLSL) + O << " #" << Log2_32(Width / 8); +} + +void AArch64InstPrinter::printCondCode(const MCInst *MI, unsigned OpNum, + raw_ostream &O) { + AArch64CC::CondCode CC = (AArch64CC::CondCode)MI->getOperand(OpNum).getImm(); + O << AArch64CC::getCondCodeName(CC); +} + +void AArch64InstPrinter::printInverseCondCode(const MCInst *MI, unsigned OpNum, + raw_ostream &O) { + AArch64CC::CondCode CC = (AArch64CC::CondCode)MI->getOperand(OpNum).getImm(); + O << AArch64CC::getCondCodeName(AArch64CC::getInvertedCondCode(CC)); +} + +void AArch64InstPrinter::printAMNoIndex(const MCInst *MI, unsigned OpNum, + raw_ostream &O) { + O << '[' << getRegisterName(MI->getOperand(OpNum).getReg()) << ']'; +} + +template +void AArch64InstPrinter::printImmScale(const MCInst *MI, unsigned OpNum, + raw_ostream &O) { + O << '#' << Scale * MI->getOperand(OpNum).getImm(); +} + +void AArch64InstPrinter::printUImm12Offset(const MCInst *MI, unsigned OpNum, + unsigned Scale, raw_ostream &O) { + const MCOperand MO = MI->getOperand(OpNum); + if (MO.isImm()) { + O << "#" << (MO.getImm() * Scale); + } else { + assert(MO.isExpr() && "Unexpected operand type!"); + O << *MO.getExpr(); + } +} + +void AArch64InstPrinter::printAMIndexedWB(const MCInst *MI, unsigned OpNum, + unsigned Scale, raw_ostream &O) { + const MCOperand MO1 = MI->getOperand(OpNum + 1); + O << '[' << getRegisterName(MI->getOperand(OpNum).getReg()); + if (MO1.isImm()) { + O << ", #" << (MO1.getImm() * Scale); + } else { + assert(MO1.isExpr() && "Unexpected operand type!"); + O << ", " << *MO1.getExpr(); + } + O << ']'; +} + +void AArch64InstPrinter::printPrefetchOp(const MCInst *MI, unsigned OpNum, + raw_ostream &O) { + unsigned prfop = MI->getOperand(OpNum).getImm(); + bool Valid; + StringRef Name = AArch64PRFM::PRFMMapper().toString(prfop, Valid); + if (Valid) + O << Name; + else + O << '#' << prfop; +} + +void AArch64InstPrinter::printFPImmOperand(const MCInst *MI, unsigned OpNum, + raw_ostream &O) { + const MCOperand &MO = MI->getOperand(OpNum); + float FPImm = + MO.isFPImm() ? MO.getFPImm() : AArch64_AM::getFPImmFloat(MO.getImm()); + + // 8 decimal places are enough to perfectly represent permitted floats. 
+ O << format("#%.8f", FPImm); +} + +static unsigned getNextVectorRegister(unsigned Reg, unsigned Stride = 1) { + while (Stride--) { + switch (Reg) { + default: + assert(0 && "Vector register expected!"); + case AArch64::Q0: Reg = AArch64::Q1; break; + case AArch64::Q1: Reg = AArch64::Q2; break; + case AArch64::Q2: Reg = AArch64::Q3; break; + case AArch64::Q3: Reg = AArch64::Q4; break; + case AArch64::Q4: Reg = AArch64::Q5; break; + case AArch64::Q5: Reg = AArch64::Q6; break; + case AArch64::Q6: Reg = AArch64::Q7; break; + case AArch64::Q7: Reg = AArch64::Q8; break; + case AArch64::Q8: Reg = AArch64::Q9; break; + case AArch64::Q9: Reg = AArch64::Q10; break; + case AArch64::Q10: Reg = AArch64::Q11; break; + case AArch64::Q11: Reg = AArch64::Q12; break; + case AArch64::Q12: Reg = AArch64::Q13; break; + case AArch64::Q13: Reg = AArch64::Q14; break; + case AArch64::Q14: Reg = AArch64::Q15; break; + case AArch64::Q15: Reg = AArch64::Q16; break; + case AArch64::Q16: Reg = AArch64::Q17; break; + case AArch64::Q17: Reg = AArch64::Q18; break; + case AArch64::Q18: Reg = AArch64::Q19; break; + case AArch64::Q19: Reg = AArch64::Q20; break; + case AArch64::Q20: Reg = AArch64::Q21; break; + case AArch64::Q21: Reg = AArch64::Q22; break; + case AArch64::Q22: Reg = AArch64::Q23; break; + case AArch64::Q23: Reg = AArch64::Q24; break; + case AArch64::Q24: Reg = AArch64::Q25; break; + case AArch64::Q25: Reg = AArch64::Q26; break; + case AArch64::Q26: Reg = AArch64::Q27; break; + case AArch64::Q27: Reg = AArch64::Q28; break; + case AArch64::Q28: Reg = AArch64::Q29; break; + case AArch64::Q29: Reg = AArch64::Q30; break; + case AArch64::Q30: Reg = AArch64::Q31; break; + // Vector lists can wrap around. + case AArch64::Q31: + Reg = AArch64::Q0; + break; + } + } + return Reg; +} + +void AArch64InstPrinter::printVectorList(const MCInst *MI, unsigned OpNum, + raw_ostream &O, + StringRef LayoutSuffix) { + unsigned Reg = MI->getOperand(OpNum).getReg(); + + O << "{ "; + + // Work out how many registers there are in the list (if there is an actual + // list). + unsigned NumRegs = 1; + if (MRI.getRegClass(AArch64::DDRegClassID).contains(Reg) || + MRI.getRegClass(AArch64::QQRegClassID).contains(Reg)) + NumRegs = 2; + else if (MRI.getRegClass(AArch64::DDDRegClassID).contains(Reg) || + MRI.getRegClass(AArch64::QQQRegClassID).contains(Reg)) + NumRegs = 3; + else if (MRI.getRegClass(AArch64::DDDDRegClassID).contains(Reg) || + MRI.getRegClass(AArch64::QQQQRegClassID).contains(Reg)) + NumRegs = 4; + + // Now forget about the list and find out what the first register is. + if (unsigned FirstReg = MRI.getSubReg(Reg, AArch64::dsub0)) + Reg = FirstReg; + else if (unsigned FirstReg = MRI.getSubReg(Reg, AArch64::qsub0)) + Reg = FirstReg; + + // If it's a D-reg, we need to promote it to the equivalent Q-reg before + // printing (otherwise getRegisterName fails). 
+  if (MRI.getRegClass(AArch64::FPR64RegClassID).contains(Reg)) {
+    const MCRegisterClass &FPR128RC =
+        MRI.getRegClass(AArch64::FPR128RegClassID);
+    Reg = MRI.getMatchingSuperReg(Reg, AArch64::dsub, &FPR128RC);
+  }
+
+  for (unsigned i = 0; i < NumRegs; ++i, Reg = getNextVectorRegister(Reg)) {
+    O << getRegisterName(Reg, AArch64::vreg) << LayoutSuffix;
+    if (i + 1 != NumRegs)
+      O << ", ";
+  }
+
+  O << " }";
+}
+
+void AArch64InstPrinter::printImplicitlyTypedVectorList(const MCInst *MI,
+                                                        unsigned OpNum,
+                                                        raw_ostream &O) {
+  printVectorList(MI, OpNum, O, "");
+}
+
+template <unsigned NumLanes, char LaneKind>
+void AArch64InstPrinter::printTypedVectorList(const MCInst *MI, unsigned OpNum,
+                                              raw_ostream &O) {
+  std::string Suffix(".");
+  if (NumLanes)
+    Suffix += itostr(NumLanes) + LaneKind;
+  else
+    Suffix += LaneKind;
+
+  printVectorList(MI, OpNum, O, Suffix);
+}
+
+void AArch64InstPrinter::printVectorIndex(const MCInst *MI, unsigned OpNum,
+                                          raw_ostream &O) {
+  O << "[" << MI->getOperand(OpNum).getImm() << "]";
+}
+
+void AArch64InstPrinter::printAlignedLabel(const MCInst *MI, unsigned OpNum,
+                                           raw_ostream &O) {
+  const MCOperand &Op = MI->getOperand(OpNum);
+
+  // If the label has already been resolved to an immediate offset (say, when
+  // we're running the disassembler), just print the immediate.
+  if (Op.isImm()) {
+    O << "#" << (Op.getImm() << 2);
+    return;
+  }
+
+  // If the branch target is simply an address then print it in hex.
+  const MCConstantExpr *BranchTarget =
+      dyn_cast<MCConstantExpr>(MI->getOperand(OpNum).getExpr());
+  int64_t Address;
+  if (BranchTarget && BranchTarget->EvaluateAsAbsolute(Address)) {
+    O << "0x";
+    O.write_hex(Address);
+  } else {
+    // Otherwise, just print the expression.
+    O << *MI->getOperand(OpNum).getExpr();
+  }
+}
+
+void AArch64InstPrinter::printAdrpLabel(const MCInst *MI, unsigned OpNum,
+                                        raw_ostream &O) {
+  const MCOperand &Op = MI->getOperand(OpNum);
+
+  // If the label has already been resolved to an immediate offset (say, when
+  // we're running the disassembler), just print the immediate.
+  if (Op.isImm()) {
+    O << "#" << (Op.getImm() << 12);
+    return;
+  }
+
+  // Otherwise, just print the expression.
+ O << *MI->getOperand(OpNum).getExpr(); +} + +void AArch64InstPrinter::printBarrierOption(const MCInst *MI, unsigned OpNo, + raw_ostream &O) { + unsigned Val = MI->getOperand(OpNo).getImm(); + unsigned Opcode = MI->getOpcode(); + + bool Valid; + StringRef Name; + if (Opcode == AArch64::ISB) + Name = AArch64ISB::ISBMapper().toString(Val, Valid); + else + Name = AArch64DB::DBarrierMapper().toString(Val, Valid); + if (Valid) + O << Name; + else + O << "#" << Val; +} + +void AArch64InstPrinter::printMRSSystemRegister(const MCInst *MI, unsigned OpNo, + raw_ostream &O) { + unsigned Val = MI->getOperand(OpNo).getImm(); + + bool Valid; + auto Mapper = AArch64SysReg::MRSMapper(getAvailableFeatures()); + std::string Name = Mapper.toString(Val, Valid); + + if (Valid) + O << StringRef(Name).upper(); +} + +void AArch64InstPrinter::printMSRSystemRegister(const MCInst *MI, unsigned OpNo, + raw_ostream &O) { + unsigned Val = MI->getOperand(OpNo).getImm(); + + bool Valid; + auto Mapper = AArch64SysReg::MSRMapper(getAvailableFeatures()); + std::string Name = Mapper.toString(Val, Valid); + + if (Valid) + O << StringRef(Name).upper(); +} + +void AArch64InstPrinter::printSystemPStateField(const MCInst *MI, unsigned OpNo, + raw_ostream &O) { + unsigned Val = MI->getOperand(OpNo).getImm(); + + bool Valid; + StringRef Name = AArch64PState::PStateMapper().toString(Val, Valid); + if (Valid) + O << StringRef(Name.str()).upper(); + else + O << "#" << Val; +} + +void AArch64InstPrinter::printSIMDType10Operand(const MCInst *MI, unsigned OpNo, + raw_ostream &O) { + unsigned RawVal = MI->getOperand(OpNo).getImm(); + uint64_t Val = AArch64_AM::decodeAdvSIMDModImmType10(RawVal); + O << format("#%#016llx", Val); +} diff --git a/lib/Target/AArch64/InstPrinter/AArch64InstPrinter.h b/lib/Target/AArch64/InstPrinter/AArch64InstPrinter.h new file mode 100644 index 00000000000..fe7666e5cad --- /dev/null +++ b/lib/Target/AArch64/InstPrinter/AArch64InstPrinter.h @@ -0,0 +1,140 @@ +//===-- AArch64InstPrinter.h - Convert AArch64 MCInst to assembly syntax --===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This class prints an AArch64 MCInst to a .s file. +// +//===----------------------------------------------------------------------===// + +#ifndef AArch64INSTPRINTER_H +#define AArch64INSTPRINTER_H + +#include "MCTargetDesc/AArch64MCTargetDesc.h" +#include "llvm/ADT/StringRef.h" +#include "llvm/MC/MCInstPrinter.h" +#include "llvm/MC/MCSubtargetInfo.h" + +namespace llvm { + +class MCOperand; + +class AArch64InstPrinter : public MCInstPrinter { +public: + AArch64InstPrinter(const MCAsmInfo &MAI, const MCInstrInfo &MII, + const MCRegisterInfo &MRI, const MCSubtargetInfo &STI); + + void printInst(const MCInst *MI, raw_ostream &O, StringRef Annot) override; + void printRegName(raw_ostream &OS, unsigned RegNo) const override; + + // Autogenerated by tblgen. 
+  virtual void printInstruction(const MCInst *MI, raw_ostream &O);
+  virtual bool printAliasInstr(const MCInst *MI, raw_ostream &O);
+  virtual void printCustomAliasOperand(const MCInst *MI, unsigned OpIdx,
+                                       unsigned PrintMethodIdx, raw_ostream &O);
+  virtual StringRef getRegName(unsigned RegNo) const {
+    return getRegisterName(RegNo);
+  }
+  static const char *getRegisterName(unsigned RegNo,
+                                     unsigned AltIdx = AArch64::NoRegAltName);
+
+protected:
+  bool printSysAlias(const MCInst *MI, raw_ostream &O);
+  // Operand printers
+  void printOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O);
+  void printHexImm(const MCInst *MI, unsigned OpNo, raw_ostream &O);
+  void printPostIncOperand(const MCInst *MI, unsigned OpNo, unsigned Imm,
+                           raw_ostream &O);
+  template <int Amount>
+  void printPostIncOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O) {
+    printPostIncOperand(MI, OpNo, Amount, O);
+  }
+
+  void printVRegOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O);
+  void printSysCROperand(const MCInst *MI, unsigned OpNo, raw_ostream &O);
+  void printAddSubImm(const MCInst *MI, unsigned OpNum, raw_ostream &O);
+  void printLogicalImm32(const MCInst *MI, unsigned OpNum, raw_ostream &O);
+  void printLogicalImm64(const MCInst *MI, unsigned OpNum, raw_ostream &O);
+  void printShifter(const MCInst *MI, unsigned OpNum, raw_ostream &O);
+  void printShiftedRegister(const MCInst *MI, unsigned OpNum, raw_ostream &O);
+  void printExtendedRegister(const MCInst *MI, unsigned OpNum, raw_ostream &O);
+  void printArithExtend(const MCInst *MI, unsigned OpNum, raw_ostream &O);
+
+  void printMemExtend(const MCInst *MI, unsigned OpNum, raw_ostream &O,
+                      char SrcRegKind, unsigned Width);
+  template <char SrcRegKind, unsigned Width>
+  void printMemExtend(const MCInst *MI, unsigned OpNum, raw_ostream &O) {
+    printMemExtend(MI, OpNum, O, SrcRegKind, Width);
+  }
+
+  void printCondCode(const MCInst *MI, unsigned OpNum, raw_ostream &O);
+  void printInverseCondCode(const MCInst *MI, unsigned OpNum, raw_ostream &O);
+  void printAlignedLabel(const MCInst *MI, unsigned OpNum, raw_ostream &O);
+  void printUImm12Offset(const MCInst *MI, unsigned OpNum, unsigned Scale,
+                         raw_ostream &O);
+  void printAMIndexedWB(const MCInst *MI, unsigned OpNum, unsigned Scale,
+                        raw_ostream &O);
+
+  template <int Scale>
+  void printUImm12Offset(const MCInst *MI, unsigned OpNum, raw_ostream &O) {
+    printUImm12Offset(MI, OpNum, Scale, O);
+  }
+
+  template <int BitWidth>
+  void printAMIndexedWB(const MCInst *MI, unsigned OpNum, raw_ostream &O) {
+    printAMIndexedWB(MI, OpNum, BitWidth / 8, O);
+  }
+
+  void printAMNoIndex(const MCInst *MI, unsigned OpNum, raw_ostream &O);
+
+  template <int Scale>
+  void printImmScale(const MCInst *MI, unsigned OpNum, raw_ostream &O);
+
+  void printPrefetchOp(const MCInst *MI, unsigned OpNum, raw_ostream &O);
+
+  void printFPImmOperand(const MCInst *MI, unsigned OpNum, raw_ostream &O);
+
+  void printVectorList(const MCInst *MI, unsigned OpNum, raw_ostream &O,
+                       StringRef LayoutSuffix);
+
+  /// Print a list of vector registers where the type suffix is implicit
+  /// (i.e. attached to the instruction rather than the registers).
+ void printImplicitlyTypedVectorList(const MCInst *MI, unsigned OpNum, + raw_ostream &O); + + template + void printTypedVectorList(const MCInst *MI, unsigned OpNum, raw_ostream &O); + + void printVectorIndex(const MCInst *MI, unsigned OpNum, raw_ostream &O); + void printAdrpLabel(const MCInst *MI, unsigned OpNum, raw_ostream &O); + void printBarrierOption(const MCInst *MI, unsigned OpNum, raw_ostream &O); + void printMSRSystemRegister(const MCInst *MI, unsigned OpNum, raw_ostream &O); + void printMRSSystemRegister(const MCInst *MI, unsigned OpNum, raw_ostream &O); + void printSystemPStateField(const MCInst *MI, unsigned OpNum, raw_ostream &O); + void printSIMDType10Operand(const MCInst *MI, unsigned OpNum, raw_ostream &O); +}; + +class AArch64AppleInstPrinter : public AArch64InstPrinter { +public: + AArch64AppleInstPrinter(const MCAsmInfo &MAI, const MCInstrInfo &MII, + const MCRegisterInfo &MRI, const MCSubtargetInfo &STI); + + void printInst(const MCInst *MI, raw_ostream &O, StringRef Annot) override; + + void printInstruction(const MCInst *MI, raw_ostream &O) override; + bool printAliasInstr(const MCInst *MI, raw_ostream &O) override; + virtual void printCustomAliasOperand(const MCInst *MI, unsigned OpIdx, + unsigned PrintMethodIdx, raw_ostream &O); + StringRef getRegName(unsigned RegNo) const override { + return getRegisterName(RegNo); + } + static const char *getRegisterName(unsigned RegNo, + unsigned AltIdx = AArch64::NoRegAltName); +}; +} + +#endif diff --git a/lib/Target/AArch64/InstPrinter/CMakeLists.txt b/lib/Target/AArch64/InstPrinter/CMakeLists.txt new file mode 100644 index 00000000000..363f50258d7 --- /dev/null +++ b/lib/Target/AArch64/InstPrinter/CMakeLists.txt @@ -0,0 +1,7 @@ +include_directories( ${CMAKE_CURRENT_BINARY_DIR}/.. ${CMAKE_CURRENT_SOURCE_DIR}/.. ) + +add_llvm_library(LLVMAArch64AsmPrinter + AArch64InstPrinter.cpp + ) + +add_dependencies(LLVMAArch64AsmPrinter AArch64CommonTableGen) diff --git a/lib/Target/AArch64/InstPrinter/LLVMBuild.txt b/lib/Target/AArch64/InstPrinter/LLVMBuild.txt new file mode 100644 index 00000000000..a13e842cdd3 --- /dev/null +++ b/lib/Target/AArch64/InstPrinter/LLVMBuild.txt @@ -0,0 +1,24 @@ +;===- ./lib/Target/AArch64/InstPrinter/LLVMBuild.txt -------------*- Conf -*--===; +; +; The LLVM Compiler Infrastructure +; +; This file is distributed under the University of Illinois Open Source +; License. See LICENSE.TXT for details. +; +;===------------------------------------------------------------------------===; +; +; This is an LLVMBuild description file for the components in this subdirectory. +; +; For more information on the LLVMBuild system, please see: +; +; http://llvm.org/docs/LLVMBuild.html +; +;===------------------------------------------------------------------------===; + +[component_0] +type = Library +name = AArch64AsmPrinter +parent = AArch64 +required_libraries = AArch64Utils MC Support +add_to_library_groups = AArch64 + diff --git a/lib/Target/AArch64/InstPrinter/Makefile b/lib/Target/AArch64/InstPrinter/Makefile new file mode 100644 index 00000000000..b17e8d08011 --- /dev/null +++ b/lib/Target/AArch64/InstPrinter/Makefile @@ -0,0 +1,15 @@ +##===- lib/Target/AArch64/AsmPrinter/Makefile --------------*- Makefile -*-===## +# +# The LLVM Compiler Infrastructure +# +# This file is distributed under the University of Illinois Open Source +# License. See LICENSE.TXT for details. +# +##===----------------------------------------------------------------------===## +LEVEL = ../../../.. 
+LIBRARYNAME = LLVMAArch64AsmPrinter + +# Hack: we need to include 'main' arm target directory to grab private headers +CPP.Flags += -I$(PROJ_OBJ_DIR)/.. -I$(PROJ_SRC_DIR)/.. + +include $(LEVEL)/Makefile.common diff --git a/lib/Target/AArch64/LLVMBuild.txt b/lib/Target/AArch64/LLVMBuild.txt new file mode 100644 index 00000000000..642c18394a6 --- /dev/null +++ b/lib/Target/AArch64/LLVMBuild.txt @@ -0,0 +1,35 @@ +;===- ./lib/Target/AArch64/LLVMBuild.txt -------------------------*- Conf -*--===; +; +; The LLVM Compiler Infrastructure +; +; This file is distributed under the University of Illinois Open Source +; License. See LICENSE.TXT for details. +; +;===------------------------------------------------------------------------===; +; +; This is an LLVMBuild description file for the components in this subdirectory. +; +; For more information on the LLVMBuild system, please see: +; +; http://llvm.org/docs/LLVMBuild.html +; +;===------------------------------------------------------------------------===; + +[common] +subdirectories = AsmParser Disassembler InstPrinter MCTargetDesc TargetInfo Utils + +[component_0] +type = TargetGroup +name = AArch64 +parent = Target +has_asmparser = 1 +has_asmprinter = 1 +has_disassembler = 1 +has_jit = 1 + +[component_1] +type = Library +name = AArch64CodeGen +parent = AArch64 +required_libraries = AArch64AsmPrinter AArch64Desc AArch64Info AArch64Utils Analysis AsmPrinter CodeGen Core MC Scalar SelectionDAG Support Target +add_to_library_groups = AArch64 diff --git a/lib/Target/AArch64/MCTargetDesc/AArch64AddressingModes.h b/lib/Target/AArch64/MCTargetDesc/AArch64AddressingModes.h new file mode 100644 index 00000000000..8b1e44e26e9 --- /dev/null +++ b/lib/Target/AArch64/MCTargetDesc/AArch64AddressingModes.h @@ -0,0 +1,738 @@ +//===- AArch64AddressingModes.h - AArch64 Addressing Modes ------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file contains the AArch64 addressing mode implementation stuff. +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_TARGET_AArch64_AArch64ADDRESSINGMODES_H +#define LLVM_TARGET_AArch64_AArch64ADDRESSINGMODES_H + +#include "llvm/ADT/APFloat.h" +#include "llvm/ADT/APInt.h" +#include "llvm/Support/ErrorHandling.h" +#include "llvm/Support/MathExtras.h" +#include + +namespace llvm { + +/// AArch64_AM - AArch64 Addressing Mode Stuff +namespace AArch64_AM { + +//===----------------------------------------------------------------------===// +// Shifts +// + +enum ShiftExtendType { + InvalidShiftExtend = -1, + LSL = 0, + LSR, + ASR, + ROR, + MSL, + + UXTB, + UXTH, + UXTW, + UXTX, + + SXTB, + SXTH, + SXTW, + SXTX, +}; + +/// getShiftName - Get the string encoding for the shift type. 
+static inline const char *getShiftExtendName(AArch64_AM::ShiftExtendType ST) { + switch (ST) { + default: assert(false && "unhandled shift type!"); + case AArch64_AM::LSL: return "lsl"; + case AArch64_AM::LSR: return "lsr"; + case AArch64_AM::ASR: return "asr"; + case AArch64_AM::ROR: return "ror"; + case AArch64_AM::MSL: return "msl"; + case AArch64_AM::UXTB: return "uxtb"; + case AArch64_AM::UXTH: return "uxth"; + case AArch64_AM::UXTW: return "uxtw"; + case AArch64_AM::UXTX: return "uxtx"; + case AArch64_AM::SXTB: return "sxtb"; + case AArch64_AM::SXTH: return "sxth"; + case AArch64_AM::SXTW: return "sxtw"; + case AArch64_AM::SXTX: return "sxtx"; + } + return nullptr; +} + +/// getShiftType - Extract the shift type. +static inline AArch64_AM::ShiftExtendType getShiftType(unsigned Imm) { + switch ((Imm >> 6) & 0x7) { + default: return AArch64_AM::InvalidShiftExtend; + case 0: return AArch64_AM::LSL; + case 1: return AArch64_AM::LSR; + case 2: return AArch64_AM::ASR; + case 3: return AArch64_AM::ROR; + case 4: return AArch64_AM::MSL; + } +} + +/// getShiftValue - Extract the shift value. +static inline unsigned getShiftValue(unsigned Imm) { + return Imm & 0x3f; +} + +/// getShifterImm - Encode the shift type and amount: +/// imm: 6-bit shift amount +/// shifter: 000 ==> lsl +/// 001 ==> lsr +/// 010 ==> asr +/// 011 ==> ror +/// 100 ==> msl +/// {8-6} = shifter +/// {5-0} = imm +static inline unsigned getShifterImm(AArch64_AM::ShiftExtendType ST, + unsigned Imm) { + assert((Imm & 0x3f) == Imm && "Illegal shifted immedate value!"); + unsigned STEnc = 0; + switch (ST) { + default: llvm_unreachable("Invalid shift requested"); + case AArch64_AM::LSL: STEnc = 0; break; + case AArch64_AM::LSR: STEnc = 1; break; + case AArch64_AM::ASR: STEnc = 2; break; + case AArch64_AM::ROR: STEnc = 3; break; + case AArch64_AM::MSL: STEnc = 4; break; + } + return (STEnc << 6) | (Imm & 0x3f); +} + +//===----------------------------------------------------------------------===// +// Extends +// + +/// getArithShiftValue - get the arithmetic shift value. +static inline unsigned getArithShiftValue(unsigned Imm) { + return Imm & 0x7; +} + +/// getExtendType - Extract the extend type for operands of arithmetic ops. 
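// --- Illustrative aside, not part of this patch -----------------------------
// A quick sanity check of the shift helpers defined above (hypothetical test
// code, assuming only the functions in this header):
//
//   unsigned Enc = AArch64_AM::getShifterImm(AArch64_AM::ROR, 3);
//   // Enc == (3 << 6) | 3 == 0xC3
//   assert(AArch64_AM::getShiftType(Enc) == AArch64_AM::ROR);
//   assert(AArch64_AM::getShiftValue(Enc) == 3);
// ----------------------------------------------------------------------------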
+static inline AArch64_AM::ShiftExtendType getExtendType(unsigned Imm) { + assert((Imm & 0x7) == Imm && "invalid immediate!"); + switch (Imm) { + default: llvm_unreachable("Compiler bug!"); + case 0: return AArch64_AM::UXTB; + case 1: return AArch64_AM::UXTH; + case 2: return AArch64_AM::UXTW; + case 3: return AArch64_AM::UXTX; + case 4: return AArch64_AM::SXTB; + case 5: return AArch64_AM::SXTH; + case 6: return AArch64_AM::SXTW; + case 7: return AArch64_AM::SXTX; + } +} + +static inline AArch64_AM::ShiftExtendType getArithExtendType(unsigned Imm) { + return getExtendType((Imm >> 3) & 0x7); +} + +/// Mapping from extend bits to required operation: +/// shifter: 000 ==> uxtb +/// 001 ==> uxth +/// 010 ==> uxtw +/// 011 ==> uxtx +/// 100 ==> sxtb +/// 101 ==> sxth +/// 110 ==> sxtw +/// 111 ==> sxtx +inline unsigned getExtendEncoding(AArch64_AM::ShiftExtendType ET) { + switch (ET) { + default: llvm_unreachable("Invalid extend type requested"); + case AArch64_AM::UXTB: return 0; break; + case AArch64_AM::UXTH: return 1; break; + case AArch64_AM::UXTW: return 2; break; + case AArch64_AM::UXTX: return 3; break; + case AArch64_AM::SXTB: return 4; break; + case AArch64_AM::SXTH: return 5; break; + case AArch64_AM::SXTW: return 6; break; + case AArch64_AM::SXTX: return 7; break; + } +} + +/// getArithExtendImm - Encode the extend type and shift amount for an +/// arithmetic instruction: +/// imm: 3-bit extend amount +/// {5-3} = shifter +/// {2-0} = imm3 +static inline unsigned getArithExtendImm(AArch64_AM::ShiftExtendType ET, + unsigned Imm) { + assert((Imm & 0x7) == Imm && "Illegal shifted immedate value!"); + return (getExtendEncoding(ET) << 3) | (Imm & 0x7); +} + +/// getMemDoShift - Extract the "do shift" flag value for load/store +/// instructions. +static inline bool getMemDoShift(unsigned Imm) { + return (Imm & 0x1) != 0; +} + +/// getExtendType - Extract the extend type for the offset operand of +/// loads/stores. +static inline AArch64_AM::ShiftExtendType getMemExtendType(unsigned Imm) { + return getExtendType((Imm >> 1) & 0x7); +} + +/// getExtendImm - Encode the extend type and amount for a load/store inst: +/// doshift: should the offset be scaled by the access size +/// shifter: 000 ==> uxtb +/// 001 ==> uxth +/// 010 ==> uxtw +/// 011 ==> uxtx +/// 100 ==> sxtb +/// 101 ==> sxth +/// 110 ==> sxtw +/// 111 ==> sxtx +/// {3-1} = shifter +/// {0} = doshift +static inline unsigned getMemExtendImm(AArch64_AM::ShiftExtendType ET, + bool DoShift) { + return (getExtendEncoding(ET) << 1) | unsigned(DoShift); +} + +static inline uint64_t ror(uint64_t elt, unsigned size) { + return ((elt & 1) << (size-1)) | (elt >> 1); +} + +/// processLogicalImmediate - Determine if an immediate value can be encoded +/// as the immediate operand of a logical instruction for the given register +/// size. If so, return true with "encoding" set to the encoded value in +/// the form N:immr:imms. +static inline bool processLogicalImmediate(uint64_t imm, unsigned regSize, + uint64_t &encoding) { + if (imm == 0ULL || imm == ~0ULL || + (regSize != 64 && (imm >> regSize != 0 || imm == ~0U))) + return false; + + unsigned size = 2; + uint64_t eltVal = imm; + + // First, determine the element size. 
+ while (size < regSize) { + unsigned numElts = regSize / size; + unsigned mask = (1ULL << size) - 1; + uint64_t lowestEltVal = imm & mask; + + bool allMatched = true; + for (unsigned i = 1; i < numElts; ++i) { + uint64_t currEltVal = (imm >> (i*size)) & mask; + if (currEltVal != lowestEltVal) { + allMatched = false; + break; + } + } + + if (allMatched) { + eltVal = lowestEltVal; + break; + } + + size *= 2; + } + + // Second, determine the rotation to make the element be: 0^m 1^n. + for (unsigned i = 0; i < size; ++i) { + eltVal = ror(eltVal, size); + uint32_t clz = countLeadingZeros(eltVal) - (64 - size); + uint32_t cto = CountTrailingOnes_64(eltVal); + + if (clz + cto == size) { + // Encode in immr the number of RORs it would take to get *from* this + // element value to our target value, where i+1 is the number of RORs + // to go the opposite direction. + unsigned immr = size - (i + 1); + + // If size has a 1 in the n'th bit, create a value that has zeroes in + // bits [0, n] and ones above that. + uint64_t nimms = ~(size-1) << 1; + + // Or the CTO value into the low bits, which must be below the Nth bit + // bit mentioned above. + nimms |= (cto-1); + + // Extract the seventh bit and toggle it to create the N field. + unsigned N = ((nimms >> 6) & 1) ^ 1; + + encoding = (N << 12) | (immr << 6) | (nimms & 0x3f); + return true; + } + } + + return false; +} + +/// isLogicalImmediate - Return true if the immediate is valid for a logical +/// immediate instruction of the given register size. Return false otherwise. +static inline bool isLogicalImmediate(uint64_t imm, unsigned regSize) { + uint64_t encoding; + return processLogicalImmediate(imm, regSize, encoding); +} + +/// encodeLogicalImmediate - Return the encoded immediate value for a logical +/// immediate instruction of the given register size. +static inline uint64_t encodeLogicalImmediate(uint64_t imm, unsigned regSize) { + uint64_t encoding = 0; + bool res = processLogicalImmediate(imm, regSize, encoding); + assert(res && "invalid logical immediate"); + (void)res; + return encoding; +} + +/// decodeLogicalImmediate - Decode a logical immediate value in the form +/// "N:immr:imms" (where the immr and imms fields are each 6 bits) into the +/// integer value it represents with regSize bits. +static inline uint64_t decodeLogicalImmediate(uint64_t val, unsigned regSize) { + // Extract the N, imms, and immr fields. + unsigned N = (val >> 12) & 1; + unsigned immr = (val >> 6) & 0x3f; + unsigned imms = val & 0x3f; + + assert((regSize == 64 || N == 0) && "undefined logical immediate encoding"); + int len = 31 - countLeadingZeros((N << 6) | (~imms & 0x3f)); + assert(len >= 0 && "undefined logical immediate encoding"); + unsigned size = (1 << len); + unsigned R = immr & (size - 1); + unsigned S = imms & (size - 1); + assert(S != size - 1 && "undefined logical immediate encoding"); + uint64_t pattern = (1ULL << (S + 1)) - 1; + for (unsigned i = 0; i < R; ++i) + pattern = ror(pattern, size); + + // Replicate the pattern to fill the regSize. + while (size != regSize) { + pattern |= (pattern << size); + size *= 2; + } + return pattern; +} + +/// isValidDecodeLogicalImmediate - Check to see if the logical immediate value +/// in the form "N:immr:imms" (where the immr and imms fields are each 6 bits) +/// is a valid encoding for an integer value with regSize bits. +static inline bool isValidDecodeLogicalImmediate(uint64_t val, + unsigned regSize) { + // Extract the N and imms fields needed for checking. 
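// --- Illustrative aside, not part of this patch -----------------------------
// Worked example for the logical-immediate helpers above: the repeating
// 16-bit pattern 0x00FF00FF00FF00FF (eight ones per element, no rotation)
// encodes as N:immr:imms = 0:000000:100111:
//
//   uint64_t Enc =
//       AArch64_AM::encodeLogicalImmediate(0x00FF00FF00FF00FFULL, 64);
//   // Enc == 0x027
//   assert(AArch64_AM::decodeLogicalImmediate(Enc, 64) ==
//          0x00FF00FF00FF00FFULL);
// ----------------------------------------------------------------------------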
+ unsigned N = (val >> 12) & 1; + unsigned imms = val & 0x3f; + + if (regSize == 32 && N != 0) // undefined logical immediate encoding + return false; + int len = 31 - countLeadingZeros((N << 6) | (~imms & 0x3f)); + if (len < 0) // undefined logical immediate encoding + return false; + unsigned size = (1 << len); + unsigned S = imms & (size - 1); + if (S == size - 1) // undefined logical immediate encoding + return false; + + return true; +} + +//===----------------------------------------------------------------------===// +// Floating-point Immediates +// +static inline float getFPImmFloat(unsigned Imm) { + // We expect an 8-bit binary encoding of a floating-point number here. + union { + uint32_t I; + float F; + } FPUnion; + + uint8_t Sign = (Imm >> 7) & 0x1; + uint8_t Exp = (Imm >> 4) & 0x7; + uint8_t Mantissa = Imm & 0xf; + + // 8-bit FP iEEEE Float Encoding + // abcd efgh aBbbbbbc defgh000 00000000 00000000 + // + // where B = NOT(b); + + FPUnion.I = 0; + FPUnion.I |= Sign << 31; + FPUnion.I |= ((Exp & 0x4) != 0 ? 0 : 1) << 30; + FPUnion.I |= ((Exp & 0x4) != 0 ? 0x1f : 0) << 25; + FPUnion.I |= (Exp & 0x3) << 23; + FPUnion.I |= Mantissa << 19; + return FPUnion.F; +} + +/// getFP32Imm - Return an 8-bit floating-point version of the 32-bit +/// floating-point value. If the value cannot be represented as an 8-bit +/// floating-point value, then return -1. +static inline int getFP32Imm(const APInt &Imm) { + uint32_t Sign = Imm.lshr(31).getZExtValue() & 1; + int32_t Exp = (Imm.lshr(23).getSExtValue() & 0xff) - 127; // -126 to 127 + int64_t Mantissa = Imm.getZExtValue() & 0x7fffff; // 23 bits + + // We can handle 4 bits of mantissa. + // mantissa = (16+UInt(e:f:g:h))/16. + if (Mantissa & 0x7ffff) + return -1; + Mantissa >>= 19; + if ((Mantissa & 0xf) != Mantissa) + return -1; + + // We can handle 3 bits of exponent: exp == UInt(NOT(b):c:d)-3 + if (Exp < -3 || Exp > 4) + return -1; + Exp = ((Exp+3) & 0x7) ^ 4; + + return ((int)Sign << 7) | (Exp << 4) | Mantissa; +} + +static inline int getFP32Imm(const APFloat &FPImm) { + return getFP32Imm(FPImm.bitcastToAPInt()); +} + +/// getFP64Imm - Return an 8-bit floating-point version of the 64-bit +/// floating-point value. If the value cannot be represented as an 8-bit +/// floating-point value, then return -1. +static inline int getFP64Imm(const APInt &Imm) { + uint64_t Sign = Imm.lshr(63).getZExtValue() & 1; + int64_t Exp = (Imm.lshr(52).getSExtValue() & 0x7ff) - 1023; // -1022 to 1023 + uint64_t Mantissa = Imm.getZExtValue() & 0xfffffffffffffULL; + + // We can handle 4 bits of mantissa. + // mantissa = (16+UInt(e:f:g:h))/16. 
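// --- Illustrative aside, not part of this patch -----------------------------
// Same scheme as the 32-bit case above: for instance, 2.0f has sign 0,
// exponent +1 and an all-zero mantissa, so getFP32Imm() returns 0x00, while
// -1.0f maps to 0xF0; getFPImmFloat() reverses both mappings.
// ----------------------------------------------------------------------------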
+ if (Mantissa & 0xffffffffffffULL) + return -1; + Mantissa >>= 48; + if ((Mantissa & 0xf) != Mantissa) + return -1; + + // We can handle 3 bits of exponent: exp == UInt(NOT(b):c:d)-3 + if (Exp < -3 || Exp > 4) + return -1; + Exp = ((Exp+3) & 0x7) ^ 4; + + return ((int)Sign << 7) | (Exp << 4) | Mantissa; +} + +static inline int getFP64Imm(const APFloat &FPImm) { + return getFP64Imm(FPImm.bitcastToAPInt()); +} + +//===--------------------------------------------------------------------===// +// AdvSIMD Modified Immediates +//===--------------------------------------------------------------------===// + +// 0x00 0x00 0x00 abcdefgh 0x00 0x00 0x00 abcdefgh +static inline bool isAdvSIMDModImmType1(uint64_t Imm) { + return ((Imm >> 32) == (Imm & 0xffffffffULL)) && + ((Imm & 0xffffff00ffffff00ULL) == 0); +} + +static inline uint8_t encodeAdvSIMDModImmType1(uint64_t Imm) { + return (Imm & 0xffULL); +} + +static inline uint64_t decodeAdvSIMDModImmType1(uint8_t Imm) { + uint64_t EncVal = Imm; + return (EncVal << 32) | EncVal; +} + +// 0x00 0x00 abcdefgh 0x00 0x00 0x00 abcdefgh 0x00 +static inline bool isAdvSIMDModImmType2(uint64_t Imm) { + return ((Imm >> 32) == (Imm & 0xffffffffULL)) && + ((Imm & 0xffff00ffffff00ffULL) == 0); +} + +static inline uint8_t encodeAdvSIMDModImmType2(uint64_t Imm) { + return (Imm & 0xff00ULL) >> 8; +} + +static inline uint64_t decodeAdvSIMDModImmType2(uint8_t Imm) { + uint64_t EncVal = Imm; + return (EncVal << 40) | (EncVal << 8); +} + +// 0x00 abcdefgh 0x00 0x00 0x00 abcdefgh 0x00 0x00 +static inline bool isAdvSIMDModImmType3(uint64_t Imm) { + return ((Imm >> 32) == (Imm & 0xffffffffULL)) && + ((Imm & 0xff00ffffff00ffffULL) == 0); +} + +static inline uint8_t encodeAdvSIMDModImmType3(uint64_t Imm) { + return (Imm & 0xff0000ULL) >> 16; +} + +static inline uint64_t decodeAdvSIMDModImmType3(uint8_t Imm) { + uint64_t EncVal = Imm; + return (EncVal << 48) | (EncVal << 16); +} + +// abcdefgh 0x00 0x00 0x00 abcdefgh 0x00 0x00 0x00 +static inline bool isAdvSIMDModImmType4(uint64_t Imm) { + return ((Imm >> 32) == (Imm & 0xffffffffULL)) && + ((Imm & 0x00ffffff00ffffffULL) == 0); +} + +static inline uint8_t encodeAdvSIMDModImmType4(uint64_t Imm) { + return (Imm & 0xff000000ULL) >> 24; +} + +static inline uint64_t decodeAdvSIMDModImmType4(uint8_t Imm) { + uint64_t EncVal = Imm; + return (EncVal << 56) | (EncVal << 24); +} + +// 0x00 abcdefgh 0x00 abcdefgh 0x00 abcdefgh 0x00 abcdefgh +static inline bool isAdvSIMDModImmType5(uint64_t Imm) { + return ((Imm >> 32) == (Imm & 0xffffffffULL)) && + (((Imm & 0x00ff0000ULL) >> 16) == (Imm & 0x000000ffULL)) && + ((Imm & 0xff00ff00ff00ff00ULL) == 0); +} + +static inline uint8_t encodeAdvSIMDModImmType5(uint64_t Imm) { + return (Imm & 0xffULL); +} + +static inline uint64_t decodeAdvSIMDModImmType5(uint8_t Imm) { + uint64_t EncVal = Imm; + return (EncVal << 48) | (EncVal << 32) | (EncVal << 16) | EncVal; +} + +// abcdefgh 0x00 abcdefgh 0x00 abcdefgh 0x00 abcdefgh 0x00 +static inline bool isAdvSIMDModImmType6(uint64_t Imm) { + return ((Imm >> 32) == (Imm & 0xffffffffULL)) && + (((Imm & 0xff000000ULL) >> 16) == (Imm & 0x0000ff00ULL)) && + ((Imm & 0x00ff00ff00ff00ffULL) == 0); +} + +static inline uint8_t encodeAdvSIMDModImmType6(uint64_t Imm) { + return (Imm & 0xff00ULL) >> 8; +} + +static inline uint64_t decodeAdvSIMDModImmType6(uint8_t Imm) { + uint64_t EncVal = Imm; + return (EncVal << 56) | (EncVal << 40) | (EncVal << 24) | (EncVal << 8); +} + +// 0x00 0x00 abcdefgh 0xFF 0x00 0x00 abcdefgh 0xFF +static inline bool isAdvSIMDModImmType7(uint64_t 
Imm) { + return ((Imm >> 32) == (Imm & 0xffffffffULL)) && + ((Imm & 0xffff00ffffff00ffULL) == 0x000000ff000000ffULL); +} + +static inline uint8_t encodeAdvSIMDModImmType7(uint64_t Imm) { + return (Imm & 0xff00ULL) >> 8; +} + +static inline uint64_t decodeAdvSIMDModImmType7(uint8_t Imm) { + uint64_t EncVal = Imm; + return (EncVal << 40) | (EncVal << 8) | 0x000000ff000000ffULL; +} + +// 0x00 abcdefgh 0xFF 0xFF 0x00 abcdefgh 0xFF 0xFF +static inline bool isAdvSIMDModImmType8(uint64_t Imm) { + return ((Imm >> 32) == (Imm & 0xffffffffULL)) && + ((Imm & 0xff00ffffff00ffffULL) == 0x0000ffff0000ffffULL); +} + +static inline uint64_t decodeAdvSIMDModImmType8(uint8_t Imm) { + uint64_t EncVal = Imm; + return (EncVal << 48) | (EncVal << 16) | 0x0000ffff0000ffffULL; +} + +static inline uint8_t encodeAdvSIMDModImmType8(uint64_t Imm) { + return (Imm & 0x00ff0000ULL) >> 16; +} + +// abcdefgh abcdefgh abcdefgh abcdefgh abcdefgh abcdefgh abcdefgh abcdefgh +static inline bool isAdvSIMDModImmType9(uint64_t Imm) { + return ((Imm >> 32) == (Imm & 0xffffffffULL)) && + ((Imm >> 48) == (Imm & 0x0000ffffULL)) && + ((Imm >> 56) == (Imm & 0x000000ffULL)); +} + +static inline uint8_t encodeAdvSIMDModImmType9(uint64_t Imm) { + return (Imm & 0xffULL); +} + +static inline uint64_t decodeAdvSIMDModImmType9(uint8_t Imm) { + uint64_t EncVal = Imm; + EncVal |= (EncVal << 8); + EncVal |= (EncVal << 16); + EncVal |= (EncVal << 32); + return EncVal; +} + +// aaaaaaaa bbbbbbbb cccccccc dddddddd eeeeeeee ffffffff gggggggg hhhhhhhh +// cmode: 1110, op: 1 +static inline bool isAdvSIMDModImmType10(uint64_t Imm) { + uint64_t ByteA = Imm & 0xff00000000000000ULL; + uint64_t ByteB = Imm & 0x00ff000000000000ULL; + uint64_t ByteC = Imm & 0x0000ff0000000000ULL; + uint64_t ByteD = Imm & 0x000000ff00000000ULL; + uint64_t ByteE = Imm & 0x00000000ff000000ULL; + uint64_t ByteF = Imm & 0x0000000000ff0000ULL; + uint64_t ByteG = Imm & 0x000000000000ff00ULL; + uint64_t ByteH = Imm & 0x00000000000000ffULL; + + return (ByteA == 0ULL || ByteA == 0xff00000000000000ULL) && + (ByteB == 0ULL || ByteB == 0x00ff000000000000ULL) && + (ByteC == 0ULL || ByteC == 0x0000ff0000000000ULL) && + (ByteD == 0ULL || ByteD == 0x000000ff00000000ULL) && + (ByteE == 0ULL || ByteE == 0x00000000ff000000ULL) && + (ByteF == 0ULL || ByteF == 0x0000000000ff0000ULL) && + (ByteG == 0ULL || ByteG == 0x000000000000ff00ULL) && + (ByteH == 0ULL || ByteH == 0x00000000000000ffULL); +} + +static inline uint8_t encodeAdvSIMDModImmType10(uint64_t Imm) { + uint8_t BitA = (Imm & 0xff00000000000000ULL) != 0; + uint8_t BitB = (Imm & 0x00ff000000000000ULL) != 0; + uint8_t BitC = (Imm & 0x0000ff0000000000ULL) != 0; + uint8_t BitD = (Imm & 0x000000ff00000000ULL) != 0; + uint8_t BitE = (Imm & 0x00000000ff000000ULL) != 0; + uint8_t BitF = (Imm & 0x0000000000ff0000ULL) != 0; + uint8_t BitG = (Imm & 0x000000000000ff00ULL) != 0; + uint8_t BitH = (Imm & 0x00000000000000ffULL) != 0; + + uint8_t EncVal = BitA; + EncVal <<= 1; + EncVal |= BitB; + EncVal <<= 1; + EncVal |= BitC; + EncVal <<= 1; + EncVal |= BitD; + EncVal <<= 1; + EncVal |= BitE; + EncVal <<= 1; + EncVal |= BitF; + EncVal <<= 1; + EncVal |= BitG; + EncVal <<= 1; + EncVal |= BitH; + return EncVal; +} + +static inline uint64_t decodeAdvSIMDModImmType10(uint8_t Imm) { + uint64_t EncVal = 0; + if (Imm & 0x80) EncVal |= 0xff00000000000000ULL; + if (Imm & 0x40) EncVal |= 0x00ff000000000000ULL; + if (Imm & 0x20) EncVal |= 0x0000ff0000000000ULL; + if (Imm & 0x10) EncVal |= 0x000000ff00000000ULL; + if (Imm & 0x08) EncVal |= 0x00000000ff000000ULL; + 
if (Imm & 0x04) EncVal |= 0x0000000000ff0000ULL; + if (Imm & 0x02) EncVal |= 0x000000000000ff00ULL; + if (Imm & 0x01) EncVal |= 0x00000000000000ffULL; + return EncVal; +} + +// aBbbbbbc defgh000 0x00 0x00 aBbbbbbc defgh000 0x00 0x00 +static inline bool isAdvSIMDModImmType11(uint64_t Imm) { + uint64_t BString = (Imm & 0x7E000000ULL) >> 25; + return ((Imm >> 32) == (Imm & 0xffffffffULL)) && + (BString == 0x1f || BString == 0x20) && + ((Imm & 0x0007ffff0007ffffULL) == 0); +} + +static inline uint8_t encodeAdvSIMDModImmType11(uint64_t Imm) { + uint8_t BitA = (Imm & 0x80000000ULL) != 0; + uint8_t BitB = (Imm & 0x20000000ULL) != 0; + uint8_t BitC = (Imm & 0x01000000ULL) != 0; + uint8_t BitD = (Imm & 0x00800000ULL) != 0; + uint8_t BitE = (Imm & 0x00400000ULL) != 0; + uint8_t BitF = (Imm & 0x00200000ULL) != 0; + uint8_t BitG = (Imm & 0x00100000ULL) != 0; + uint8_t BitH = (Imm & 0x00080000ULL) != 0; + + uint8_t EncVal = BitA; + EncVal <<= 1; + EncVal |= BitB; + EncVal <<= 1; + EncVal |= BitC; + EncVal <<= 1; + EncVal |= BitD; + EncVal <<= 1; + EncVal |= BitE; + EncVal <<= 1; + EncVal |= BitF; + EncVal <<= 1; + EncVal |= BitG; + EncVal <<= 1; + EncVal |= BitH; + return EncVal; +} + +static inline uint64_t decodeAdvSIMDModImmType11(uint8_t Imm) { + uint64_t EncVal = 0; + if (Imm & 0x80) EncVal |= 0x80000000ULL; + if (Imm & 0x40) EncVal |= 0x3e000000ULL; + else EncVal |= 0x40000000ULL; + if (Imm & 0x20) EncVal |= 0x01000000ULL; + if (Imm & 0x10) EncVal |= 0x00800000ULL; + if (Imm & 0x08) EncVal |= 0x00400000ULL; + if (Imm & 0x04) EncVal |= 0x00200000ULL; + if (Imm & 0x02) EncVal |= 0x00100000ULL; + if (Imm & 0x01) EncVal |= 0x00080000ULL; + return (EncVal << 32) | EncVal; +} + +// aBbbbbbb bbcdefgh 0x00 0x00 0x00 0x00 0x00 0x00 +static inline bool isAdvSIMDModImmType12(uint64_t Imm) { + uint64_t BString = (Imm & 0x7fc0000000000000ULL) >> 54; + return ((BString == 0xff || BString == 0x100) && + ((Imm & 0x0000ffffffffffffULL) == 0)); +} + +static inline uint8_t encodeAdvSIMDModImmType12(uint64_t Imm) { + uint8_t BitA = (Imm & 0x8000000000000000ULL) != 0; + uint8_t BitB = (Imm & 0x0040000000000000ULL) != 0; + uint8_t BitC = (Imm & 0x0020000000000000ULL) != 0; + uint8_t BitD = (Imm & 0x0010000000000000ULL) != 0; + uint8_t BitE = (Imm & 0x0008000000000000ULL) != 0; + uint8_t BitF = (Imm & 0x0004000000000000ULL) != 0; + uint8_t BitG = (Imm & 0x0002000000000000ULL) != 0; + uint8_t BitH = (Imm & 0x0001000000000000ULL) != 0; + + uint8_t EncVal = BitA; + EncVal <<= 1; + EncVal |= BitB; + EncVal <<= 1; + EncVal |= BitC; + EncVal <<= 1; + EncVal |= BitD; + EncVal <<= 1; + EncVal |= BitE; + EncVal <<= 1; + EncVal |= BitF; + EncVal <<= 1; + EncVal |= BitG; + EncVal <<= 1; + EncVal |= BitH; + return EncVal; +} + +static inline uint64_t decodeAdvSIMDModImmType12(uint8_t Imm) { + uint64_t EncVal = 0; + if (Imm & 0x80) EncVal |= 0x8000000000000000ULL; + if (Imm & 0x40) EncVal |= 0x3fc0000000000000ULL; + else EncVal |= 0x4000000000000000ULL; + if (Imm & 0x20) EncVal |= 0x0020000000000000ULL; + if (Imm & 0x10) EncVal |= 0x0010000000000000ULL; + if (Imm & 0x08) EncVal |= 0x0008000000000000ULL; + if (Imm & 0x04) EncVal |= 0x0004000000000000ULL; + if (Imm & 0x02) EncVal |= 0x0002000000000000ULL; + if (Imm & 0x01) EncVal |= 0x0001000000000000ULL; + return (EncVal << 32) | EncVal; +} + +} // end namespace AArch64_AM + +} // end namespace llvm + +#endif diff --git a/lib/Target/AArch64/MCTargetDesc/AArch64AsmBackend.cpp b/lib/Target/AArch64/MCTargetDesc/AArch64AsmBackend.cpp new file mode 100644 index 00000000000..d8900d4fceb 
--- /dev/null +++ b/lib/Target/AArch64/MCTargetDesc/AArch64AsmBackend.cpp @@ -0,0 +1,566 @@ +//===-- AArch64AsmBackend.cpp - AArch64 Assembler Backend -----------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// + +#include "AArch64.h" +#include "AArch64RegisterInfo.h" +#include "MCTargetDesc/AArch64FixupKinds.h" +#include "llvm/ADT/Triple.h" +#include "llvm/MC/MCAsmBackend.h" +#include "llvm/MC/MCDirectives.h" +#include "llvm/MC/MCFixupKindInfo.h" +#include "llvm/MC/MCObjectWriter.h" +#include "llvm/MC/MCSectionMachO.h" +#include "llvm/MC/MCSectionELF.h" +#include "llvm/Support/ErrorHandling.h" +#include "llvm/Support/MachO.h" +using namespace llvm; + +namespace { + +class AArch64AsmBackend : public MCAsmBackend { + static const unsigned PCRelFlagVal = + MCFixupKindInfo::FKF_IsAlignedDownTo32Bits | MCFixupKindInfo::FKF_IsPCRel; + +public: + AArch64AsmBackend(const Target &T) : MCAsmBackend() {} + + unsigned getNumFixupKinds() const override { + return AArch64::NumTargetFixupKinds; + } + + const MCFixupKindInfo &getFixupKindInfo(MCFixupKind Kind) const override { + const static MCFixupKindInfo Infos[AArch64::NumTargetFixupKinds] = { + // This table *must* be in the order that the fixup_* kinds are defined in + // AArch64FixupKinds.h. + // + // Name Offset (bits) Size (bits) Flags + { "fixup_aarch64_pcrel_adr_imm21", 0, 32, PCRelFlagVal }, + { "fixup_aarch64_pcrel_adrp_imm21", 0, 32, PCRelFlagVal }, + { "fixup_aarch64_add_imm12", 10, 12, 0 }, + { "fixup_aarch64_ldst_imm12_scale1", 10, 12, 0 }, + { "fixup_aarch64_ldst_imm12_scale2", 10, 12, 0 }, + { "fixup_aarch64_ldst_imm12_scale4", 10, 12, 0 }, + { "fixup_aarch64_ldst_imm12_scale8", 10, 12, 0 }, + { "fixup_aarch64_ldst_imm12_scale16", 10, 12, 0 }, + { "fixup_aarch64_ldr_pcrel_imm19", 5, 19, PCRelFlagVal }, + { "fixup_aarch64_movw", 5, 16, 0 }, + { "fixup_aarch64_pcrel_branch14", 5, 14, PCRelFlagVal }, + { "fixup_aarch64_pcrel_branch19", 5, 19, PCRelFlagVal }, + { "fixup_aarch64_pcrel_branch26", 0, 26, PCRelFlagVal }, + { "fixup_aarch64_pcrel_call26", 0, 26, PCRelFlagVal }, + { "fixup_aarch64_tlsdesc_call", 0, 0, 0 } + }; + + if (Kind < FirstTargetFixupKind) + return MCAsmBackend::getFixupKindInfo(Kind); + + assert(unsigned(Kind - FirstTargetFixupKind) < getNumFixupKinds() && + "Invalid kind!"); + return Infos[Kind - FirstTargetFixupKind]; + } + + void applyFixup(const MCFixup &Fixup, char *Data, unsigned DataSize, + uint64_t Value, bool IsPCRel) const override; + + bool mayNeedRelaxation(const MCInst &Inst) const override; + bool fixupNeedsRelaxation(const MCFixup &Fixup, uint64_t Value, + const MCRelaxableFragment *DF, + const MCAsmLayout &Layout) const override; + void relaxInstruction(const MCInst &Inst, MCInst &Res) const override; + bool writeNopData(uint64_t Count, MCObjectWriter *OW) const override; + + void HandleAssemblerFlag(MCAssemblerFlag Flag) {} + + unsigned getPointerSize() const { return 8; } +}; + +} // end anonymous namespace + +/// \brief The number of bytes the fixup may change. 
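// --- Illustrative aside, not part of this patch -----------------------------
// How the fixup table above is consumed: for fixup_aarch64_add_imm12 the kind
// info records offset 10 and width 12, so applyFixup() takes the adjusted
// 12-bit value, shifts it left by 10, and ORs it byte-by-byte (little-endian)
// into the instruction word, i.e. the immediate lands in bits [21:10] of the
// 32-bit ADD encoding.
// ----------------------------------------------------------------------------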
+static unsigned getFixupKindNumBytes(unsigned Kind) {
+  switch (Kind) {
+  default:
+    assert(0 && "Unknown fixup kind!");
+
+  case AArch64::fixup_aarch64_tlsdesc_call:
+    return 0;
+
+  case FK_Data_1:
+    return 1;
+
+  case FK_Data_2:
+  case AArch64::fixup_aarch64_movw:
+    return 2;
+
+  case AArch64::fixup_aarch64_pcrel_branch14:
+  case AArch64::fixup_aarch64_add_imm12:
+  case AArch64::fixup_aarch64_ldst_imm12_scale1:
+  case AArch64::fixup_aarch64_ldst_imm12_scale2:
+  case AArch64::fixup_aarch64_ldst_imm12_scale4:
+  case AArch64::fixup_aarch64_ldst_imm12_scale8:
+  case AArch64::fixup_aarch64_ldst_imm12_scale16:
+  case AArch64::fixup_aarch64_ldr_pcrel_imm19:
+  case AArch64::fixup_aarch64_pcrel_branch19:
+    return 3;
+
+  case AArch64::fixup_aarch64_pcrel_adr_imm21:
+  case AArch64::fixup_aarch64_pcrel_adrp_imm21:
+  case AArch64::fixup_aarch64_pcrel_branch26:
+  case AArch64::fixup_aarch64_pcrel_call26:
+  case FK_Data_4:
+    return 4;
+
+  case FK_Data_8:
+    return 8;
+  }
+}
+
+static unsigned AdrImmBits(unsigned Value) {
+  unsigned lo2 = Value & 0x3;
+  unsigned hi19 = (Value & 0x1ffffc) >> 2;
+  return (hi19 << 5) | (lo2 << 29);
+}
+
+static uint64_t adjustFixupValue(unsigned Kind, uint64_t Value) {
+  int64_t SignedValue = static_cast<int64_t>(Value);
+  switch (Kind) {
+  default:
+    assert(false && "Unknown fixup kind!");
+  case AArch64::fixup_aarch64_pcrel_adr_imm21:
+    if (SignedValue > 2097151 || SignedValue < -2097152)
+      report_fatal_error("fixup value out of range");
+    return AdrImmBits(Value & 0x1fffffULL);
+  case AArch64::fixup_aarch64_pcrel_adrp_imm21:
+    return AdrImmBits((Value & 0x1fffff000ULL) >> 12);
+  case AArch64::fixup_aarch64_ldr_pcrel_imm19:
+  case AArch64::fixup_aarch64_pcrel_branch19:
+    // Signed 21-bit immediate
+    if (SignedValue > 2097151 || SignedValue < -2097152)
+      report_fatal_error("fixup value out of range");
+    // Low two bits are not encoded.
+    return (Value >> 2) & 0x7ffff;
+  case AArch64::fixup_aarch64_add_imm12:
+  case AArch64::fixup_aarch64_ldst_imm12_scale1:
+    // Unsigned 12-bit immediate
+    if (Value >= 0x1000)
+      report_fatal_error("invalid imm12 fixup value");
+    return Value;
+  case AArch64::fixup_aarch64_ldst_imm12_scale2:
+    // Unsigned 12-bit immediate which gets multiplied by 2
+    if (Value & 1 || Value >= 0x2000)
+      report_fatal_error("invalid imm12 fixup value");
+    return Value >> 1;
+  case AArch64::fixup_aarch64_ldst_imm12_scale4:
+    // Unsigned 12-bit immediate which gets multiplied by 4
+    if (Value & 3 || Value >= 0x4000)
+      report_fatal_error("invalid imm12 fixup value");
+    return Value >> 2;
+  case AArch64::fixup_aarch64_ldst_imm12_scale8:
+    // Unsigned 12-bit immediate which gets multiplied by 8
+    if (Value & 7 || Value >= 0x8000)
+      report_fatal_error("invalid imm12 fixup value");
+    return Value >> 3;
+  case AArch64::fixup_aarch64_ldst_imm12_scale16:
+    // Unsigned 12-bit immediate which gets multiplied by 16
+    if (Value & 15 || Value >= 0x10000)
+      report_fatal_error("invalid imm12 fixup value");
+    return Value >> 4;
+  case AArch64::fixup_aarch64_movw:
+    report_fatal_error("no resolvable MOVZ/MOVK fixups supported yet");
+    return Value;
+  case AArch64::fixup_aarch64_pcrel_branch14:
+    // Signed 16-bit immediate
+    if (SignedValue > 32767 || SignedValue < -32768)
+      report_fatal_error("fixup value out of range");
+    // Low two bits are not encoded (4-byte alignment assumed).
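// Illustrative example (not from the original commit): a TBZ/TBNZ whose
// target is 0x1000 bytes ahead reaches this point with Value == 0x1000; the
// checks below accept it and the encoded field is
//
//   adjustFixupValue(AArch64::fixup_aarch64_pcrel_branch14, 0x1000) == 0x400
//
// i.e. a 14-bit branch immediate of 0x400.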
+ if (Value & 0x3) + report_fatal_error("fixup not sufficiently aligned"); + return (Value >> 2) & 0x3fff; + case AArch64::fixup_aarch64_pcrel_branch26: + case AArch64::fixup_aarch64_pcrel_call26: + // Signed 28-bit immediate + if (SignedValue > 134217727 || SignedValue < -134217728) + report_fatal_error("fixup value out of range"); + // Low two bits are not encoded (4-byte alignment assumed). + if (Value & 0x3) + report_fatal_error("fixup not sufficiently aligned"); + return (Value >> 2) & 0x3ffffff; + case FK_Data_1: + case FK_Data_2: + case FK_Data_4: + case FK_Data_8: + return Value; + } +} + +void AArch64AsmBackend::applyFixup(const MCFixup &Fixup, char *Data, + unsigned DataSize, uint64_t Value, + bool IsPCRel) const { + unsigned NumBytes = getFixupKindNumBytes(Fixup.getKind()); + if (!Value) + return; // Doesn't change encoding. + MCFixupKindInfo Info = getFixupKindInfo(Fixup.getKind()); + // Apply any target-specific value adjustments. + Value = adjustFixupValue(Fixup.getKind(), Value); + + // Shift the value into position. + Value <<= Info.TargetOffset; + + unsigned Offset = Fixup.getOffset(); + assert(Offset + NumBytes <= DataSize && "Invalid fixup offset!"); + + // For each byte of the fragment that the fixup touches, mask in the + // bits from the fixup value. + for (unsigned i = 0; i != NumBytes; ++i) + Data[Offset + i] |= uint8_t((Value >> (i * 8)) & 0xff); +} + +bool AArch64AsmBackend::mayNeedRelaxation(const MCInst &Inst) const { + return false; +} + +bool AArch64AsmBackend::fixupNeedsRelaxation(const MCFixup &Fixup, + uint64_t Value, + const MCRelaxableFragment *DF, + const MCAsmLayout &Layout) const { + // FIXME: This isn't correct for AArch64. Just moving the "generic" logic + // into the targets for now. + // + // Relax if the value is too big for a (signed) i8. + return int64_t(Value) != int64_t(int8_t(Value)); +} + +void AArch64AsmBackend::relaxInstruction(const MCInst &Inst, + MCInst &Res) const { + assert(false && "AArch64AsmBackend::relaxInstruction() unimplemented"); +} + +bool AArch64AsmBackend::writeNopData(uint64_t Count, MCObjectWriter *OW) const { + // If the count is not 4-byte aligned, we must be writing data into the text + // section (otherwise we have unaligned instructions, and thus have far + // bigger problems), so just write zeros instead. + if ((Count & 3) != 0) { + for (uint64_t i = 0, e = (Count & 3); i != e; ++i) + OW->Write8(0); + } + + // We are properly aligned, so write NOPs as requested. + Count /= 4; + for (uint64_t i = 0; i != Count; ++i) + OW->Write32(0xd503201f); + return true; +} + +namespace { + +namespace CU { + +/// \brief Compact unwind encoding values. +enum CompactUnwindEncodings { + /// \brief A "frameless" leaf function, where no non-volatile registers are + /// saved. The return remains in LR throughout the function. + UNWIND_AArch64_MODE_FRAMELESS = 0x02000000, + + /// \brief No compact unwind encoding available. Instead the low 23-bits of + /// the compact unwind encoding is the offset of the DWARF FDE in the + /// __eh_frame section. This mode is never used in object files. It is only + /// generated by the linker in final linked images, which have only DWARF info + /// for a function. + UNWIND_AArch64_MODE_DWARF = 0x03000000, + + /// \brief This is a standard arm64 prologue where FP/LR are immediately + /// pushed on the stack, then SP is copied to FP. If there are any + /// non-volatile register saved, they are copied into the stack fame in pairs + /// in a contiguous ranger right below the saved FP/LR pair. 
Any subset of the + /// five X pairs and four D pairs can be saved, but the memory layout must be + /// in register number order. + UNWIND_AArch64_MODE_FRAME = 0x04000000, + + /// \brief Frame register pair encodings. + UNWIND_AArch64_FRAME_X19_X20_PAIR = 0x00000001, + UNWIND_AArch64_FRAME_X21_X22_PAIR = 0x00000002, + UNWIND_AArch64_FRAME_X23_X24_PAIR = 0x00000004, + UNWIND_AArch64_FRAME_X25_X26_PAIR = 0x00000008, + UNWIND_AArch64_FRAME_X27_X28_PAIR = 0x00000010, + UNWIND_AArch64_FRAME_D8_D9_PAIR = 0x00000100, + UNWIND_AArch64_FRAME_D10_D11_PAIR = 0x00000200, + UNWIND_AArch64_FRAME_D12_D13_PAIR = 0x00000400, + UNWIND_AArch64_FRAME_D14_D15_PAIR = 0x00000800 +}; + +} // end CU namespace + +// FIXME: This should be in a separate file. +class DarwinAArch64AsmBackend : public AArch64AsmBackend { + const MCRegisterInfo &MRI; + + /// \brief Encode compact unwind stack adjustment for frameless functions. + /// See UNWIND_AArch64_FRAMELESS_STACK_SIZE_MASK in compact_unwind_encoding.h. + /// The stack size always needs to be 16 byte aligned. + uint32_t encodeStackAdjustment(uint32_t StackSize) const { + return (StackSize / 16) << 12; + } + +public: + DarwinAArch64AsmBackend(const Target &T, const MCRegisterInfo &MRI) + : AArch64AsmBackend(T), MRI(MRI) {} + + MCObjectWriter *createObjectWriter(raw_ostream &OS) const override { + return createAArch64MachObjectWriter(OS, MachO::CPU_TYPE_ARM64, + MachO::CPU_SUBTYPE_ARM64_ALL); + } + + bool doesSectionRequireSymbols(const MCSection &Section) const override { + // Any section for which the linker breaks things into atoms needs to + // preserve symbols, including assembler local symbols, to identify + // those atoms. These sections are: + // Sections of type: + // + // S_CSTRING_LITERALS (e.g. __cstring) + // S_LITERAL_POINTERS (e.g. objc selector pointers) + // S_16BYTE_LITERALS, S_8BYTE_LITERALS, S_4BYTE_LITERALS + // + // Sections named: + // + // __TEXT,__eh_frame + // __TEXT,__ustring + // __DATA,__cfstring + // __DATA,__objc_classrefs + // __DATA,__objc_catlist + // + // FIXME: It would be better if the compiler used actual linker local + // symbols for each of these sections rather than preserving what + // are ostensibly assembler local symbols. + const MCSectionMachO &SMO = static_cast(Section); + return (SMO.getType() == MachO::S_CSTRING_LITERALS || + SMO.getType() == MachO::S_4BYTE_LITERALS || + SMO.getType() == MachO::S_8BYTE_LITERALS || + SMO.getType() == MachO::S_16BYTE_LITERALS || + SMO.getType() == MachO::S_LITERAL_POINTERS || + (SMO.getSegmentName() == "__TEXT" && + (SMO.getSectionName() == "__eh_frame" || + SMO.getSectionName() == "__ustring")) || + (SMO.getSegmentName() == "__DATA" && + (SMO.getSectionName() == "__cfstring" || + SMO.getSectionName() == "__objc_classrefs" || + SMO.getSectionName() == "__objc_catlist"))); + } + + /// \brief Generate the compact unwind encoding from the CFI directives. + uint32_t generateCompactUnwindEncoding( + ArrayRef Instrs) const override { + if (Instrs.empty()) + return CU::UNWIND_AArch64_MODE_FRAMELESS; + + bool HasFP = false; + unsigned StackSize = 0; + + uint32_t CompactUnwindEncoding = 0; + for (size_t i = 0, e = Instrs.size(); i != e; ++i) { + const MCCFIInstruction &Inst = Instrs[i]; + + switch (Inst.getOperation()) { + default: + // Cannot handle this directive: bail out. + return CU::UNWIND_AArch64_MODE_DWARF; + case MCCFIInstruction::OpDefCfa: { + // Defines a frame pointer. 
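// Illustrative example (not from the original commit): the prologue this
// case expects is the usual AArch64 frame setup, whose CFI stream looks like
//
//   .cfi_def_cfa w29, 16       // handled here
//   .cfi_offset w30, -8        // LRPush, consumed below
//   .cfi_offset w29, -16       // FPPush, consumed below
//
// so the two .cfi_offset directives for LR and FP must immediately follow,
// which is what the asserts below check before UNWIND_AArch64_MODE_FRAME is
// set.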
+ assert(getXRegFromWReg(MRI.getLLVMRegNum(Inst.getRegister(), true)) == + AArch64::FP && + "Invalid frame pointer!"); + assert(i + 2 < e && "Insufficient CFI instructions to define a frame!"); + + const MCCFIInstruction &LRPush = Instrs[++i]; + assert(LRPush.getOperation() == MCCFIInstruction::OpOffset && + "Link register not pushed!"); + const MCCFIInstruction &FPPush = Instrs[++i]; + assert(FPPush.getOperation() == MCCFIInstruction::OpOffset && + "Frame pointer not pushed!"); + + unsigned LRReg = MRI.getLLVMRegNum(LRPush.getRegister(), true); + unsigned FPReg = MRI.getLLVMRegNum(FPPush.getRegister(), true); + + LRReg = getXRegFromWReg(LRReg); + FPReg = getXRegFromWReg(FPReg); + + assert(LRReg == AArch64::LR && FPReg == AArch64::FP && + "Pushing invalid registers for frame!"); + + // Indicate that the function has a frame. + CompactUnwindEncoding |= CU::UNWIND_AArch64_MODE_FRAME; + HasFP = true; + break; + } + case MCCFIInstruction::OpDefCfaOffset: { + assert(StackSize == 0 && "We already have the CFA offset!"); + StackSize = std::abs(Inst.getOffset()); + break; + } + case MCCFIInstruction::OpOffset: { + // Registers are saved in pairs. We expect there to be two consecutive + // `.cfi_offset' instructions with the appropriate registers specified. + unsigned Reg1 = MRI.getLLVMRegNum(Inst.getRegister(), true); + if (i + 1 == e) + return CU::UNWIND_AArch64_MODE_DWARF; + + const MCCFIInstruction &Inst2 = Instrs[++i]; + if (Inst2.getOperation() != MCCFIInstruction::OpOffset) + return CU::UNWIND_AArch64_MODE_DWARF; + unsigned Reg2 = MRI.getLLVMRegNum(Inst2.getRegister(), true); + + // N.B. The encodings must be in register number order, and the X + // registers before the D registers. + + // X19/X20 pair = 0x00000001, + // X21/X22 pair = 0x00000002, + // X23/X24 pair = 0x00000004, + // X25/X26 pair = 0x00000008, + // X27/X28 pair = 0x00000010 + Reg1 = getXRegFromWReg(Reg1); + Reg2 = getXRegFromWReg(Reg2); + + if (Reg1 == AArch64::X19 && Reg2 == AArch64::X20 && + (CompactUnwindEncoding & 0xF1E) == 0) + CompactUnwindEncoding |= CU::UNWIND_AArch64_FRAME_X19_X20_PAIR; + else if (Reg1 == AArch64::X21 && Reg2 == AArch64::X22 && + (CompactUnwindEncoding & 0xF1C) == 0) + CompactUnwindEncoding |= CU::UNWIND_AArch64_FRAME_X21_X22_PAIR; + else if (Reg1 == AArch64::X23 && Reg2 == AArch64::X24 && + (CompactUnwindEncoding & 0xF18) == 0) + CompactUnwindEncoding |= CU::UNWIND_AArch64_FRAME_X23_X24_PAIR; + else if (Reg1 == AArch64::X25 && Reg2 == AArch64::X26 && + (CompactUnwindEncoding & 0xF10) == 0) + CompactUnwindEncoding |= CU::UNWIND_AArch64_FRAME_X25_X26_PAIR; + else if (Reg1 == AArch64::X27 && Reg2 == AArch64::X28 && + (CompactUnwindEncoding & 0xF00) == 0) + CompactUnwindEncoding |= CU::UNWIND_AArch64_FRAME_X27_X28_PAIR; + else { + Reg1 = getDRegFromBReg(Reg1); + Reg2 = getDRegFromBReg(Reg2); + + // D8/D9 pair = 0x00000100, + // D10/D11 pair = 0x00000200, + // D12/D13 pair = 0x00000400, + // D14/D15 pair = 0x00000800 + if (Reg1 == AArch64::D8 && Reg2 == AArch64::D9 && + (CompactUnwindEncoding & 0xE00) == 0) + CompactUnwindEncoding |= CU::UNWIND_AArch64_FRAME_D8_D9_PAIR; + else if (Reg1 == AArch64::D10 && Reg2 == AArch64::D11 && + (CompactUnwindEncoding & 0xC00) == 0) + CompactUnwindEncoding |= CU::UNWIND_AArch64_FRAME_D10_D11_PAIR; + else if (Reg1 == AArch64::D12 && Reg2 == AArch64::D13 && + (CompactUnwindEncoding & 0x800) == 0) + CompactUnwindEncoding |= CU::UNWIND_AArch64_FRAME_D12_D13_PAIR; + else if (Reg1 == AArch64::D14 && Reg2 == AArch64::D15) + CompactUnwindEncoding |= 
CU::UNWIND_AArch64_FRAME_D14_D15_PAIR; + else + // A pair was pushed which we cannot handle. + return CU::UNWIND_AArch64_MODE_DWARF; + } + + break; + } + } + } + + if (!HasFP) { + // With compact unwind info we can only represent stack adjustments of up + // to 65520 bytes. + if (StackSize > 65520) + return CU::UNWIND_AArch64_MODE_DWARF; + + CompactUnwindEncoding |= CU::UNWIND_AArch64_MODE_FRAMELESS; + CompactUnwindEncoding |= encodeStackAdjustment(StackSize); + } + + return CompactUnwindEncoding; + } +}; + +} // end anonymous namespace + +namespace { + +class ELFAArch64AsmBackend : public AArch64AsmBackend { +public: + uint8_t OSABI; + bool IsLittleEndian; + + ELFAArch64AsmBackend(const Target &T, uint8_t OSABI, bool IsLittleEndian) + : AArch64AsmBackend(T), OSABI(OSABI), IsLittleEndian(IsLittleEndian) {} + + MCObjectWriter *createObjectWriter(raw_ostream &OS) const override { + return createAArch64ELFObjectWriter(OS, OSABI, IsLittleEndian); + } + + void processFixupValue(const MCAssembler &Asm, const MCAsmLayout &Layout, + const MCFixup &Fixup, const MCFragment *DF, + const MCValue &Target, uint64_t &Value, + bool &IsResolved) override; + + void applyFixup(const MCFixup &Fixup, char *Data, unsigned DataSize, + uint64_t Value, bool IsPCRel) const override; +}; + +void ELFAArch64AsmBackend::processFixupValue( + const MCAssembler &Asm, const MCAsmLayout &Layout, const MCFixup &Fixup, + const MCFragment *DF, const MCValue &Target, uint64_t &Value, + bool &IsResolved) { + // The ADRP instruction adds some multiple of 0x1000 to the current PC & + // ~0xfff. This means that the required offset to reach a symbol can vary by + // up to one step depending on where the ADRP is in memory. For example: + // + // ADRP x0, there + // there: + // + // If the ADRP occurs at address 0xffc then "there" will be at 0x1000 and + // we'll need that as an offset. At any other address "there" will be in the + // same page as the ADRP and the instruction should encode 0x0. Assuming the + // section isn't 0x1000-aligned, we therefore need to delegate this decision + // to the linker -- a relocation! 
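// Worked example (not from the original commit), with a hypothetical label:
//
//   0x0ffc:  adrp x0, there
//   0x1000: there:
//
// page(0x1000) - page(0x0ffc) is one 4KiB page, but had the ADRP sat at
// 0x1000 the delta would be zero. Only the linker knows the final layout, so
// the check below keeps the fixup unresolved and an R_AARCH64_ADR_PREL_PG_HI21
// (or GOT/TLS) relocation is emitted instead.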
+ if ((uint32_t)Fixup.getKind() == AArch64::fixup_aarch64_pcrel_adrp_imm21) + IsResolved = false; +} + +void ELFAArch64AsmBackend::applyFixup(const MCFixup &Fixup, char *Data, + unsigned DataSize, uint64_t Value, + bool IsPCRel) const { + // store fixups in .eh_frame section in big endian order + if (!IsLittleEndian && Fixup.getKind() == FK_Data_4) { + const MCSection *Sec = Fixup.getValue()->FindAssociatedSection(); + const MCSectionELF *SecELF = static_cast(Sec); + if (SecELF->getSectionName() == ".eh_frame") + Value = ByteSwap_32(unsigned(Value)); + } + AArch64AsmBackend::applyFixup (Fixup, Data, DataSize, Value, IsPCRel); +} +} + +MCAsmBackend *llvm::createAArch64leAsmBackend(const Target &T, + const MCRegisterInfo &MRI, + StringRef TT, StringRef CPU) { + Triple TheTriple(TT); + + if (TheTriple.isOSDarwin()) + return new DarwinAArch64AsmBackend(T, MRI); + + assert(TheTriple.isOSBinFormatELF() && "Expect either MachO or ELF target"); + return new ELFAArch64AsmBackend(T, TheTriple.getOS(), /*IsLittleEndian=*/true); +} + +MCAsmBackend *llvm::createAArch64beAsmBackend(const Target &T, + const MCRegisterInfo &MRI, + StringRef TT, StringRef CPU) { + Triple TheTriple(TT); + + assert(TheTriple.isOSBinFormatELF() && + "Big endian is only supported for ELF targets!"); + return new ELFAArch64AsmBackend(T, TheTriple.getOS(), + /*IsLittleEndian=*/false); +} diff --git a/lib/Target/AArch64/MCTargetDesc/AArch64ELFObjectWriter.cpp b/lib/Target/AArch64/MCTargetDesc/AArch64ELFObjectWriter.cpp new file mode 100644 index 00000000000..e05191eaf3e --- /dev/null +++ b/lib/Target/AArch64/MCTargetDesc/AArch64ELFObjectWriter.cpp @@ -0,0 +1,257 @@ +//===-- AArch64ELFObjectWriter.cpp - AArch64 ELF Writer -------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file handles ELF-specific object emission, converting LLVM's internal +// fixups into the appropriate relocations. 
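// Illustrative example (not from the original commit): for a GOT-indirect
// access such as
//
//   adrp x0, :got:var
//   ldr  x0, [x0, :got_lo12:var]
//
// GetRelocType() below maps the fixup_aarch64_pcrel_adrp_imm21 fixup carrying
// a GOT modifier to R_AARCH64_ADR_GOT_PAGE, and the scale-8 load/store fixup
// to R_AARCH64_LD64_GOT_LO12_NC.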
+// +//===----------------------------------------------------------------------===// + +#include "MCTargetDesc/AArch64FixupKinds.h" +#include "MCTargetDesc/AArch64MCExpr.h" +#include "MCTargetDesc/AArch64MCTargetDesc.h" +#include "llvm/MC/MCELFObjectWriter.h" +#include "llvm/MC/MCValue.h" +#include "llvm/Support/ErrorHandling.h" + +using namespace llvm; + +namespace { +class AArch64ELFObjectWriter : public MCELFObjectTargetWriter { +public: + AArch64ELFObjectWriter(uint8_t OSABI, bool IsLittleEndian); + + virtual ~AArch64ELFObjectWriter(); + +protected: + unsigned GetRelocType(const MCValue &Target, const MCFixup &Fixup, + bool IsPCRel) const override; + +private: +}; +} + +AArch64ELFObjectWriter::AArch64ELFObjectWriter(uint8_t OSABI, + bool IsLittleEndian) + : MCELFObjectTargetWriter(/*Is64Bit*/ true, OSABI, ELF::EM_AARCH64, + /*HasRelocationAddend*/ true) {} + +AArch64ELFObjectWriter::~AArch64ELFObjectWriter() {} + +unsigned AArch64ELFObjectWriter::GetRelocType(const MCValue &Target, + const MCFixup &Fixup, + bool IsPCRel) const { + AArch64MCExpr::VariantKind RefKind = + static_cast(Target.getRefKind()); + AArch64MCExpr::VariantKind SymLoc = AArch64MCExpr::getSymbolLoc(RefKind); + bool IsNC = AArch64MCExpr::isNotChecked(RefKind); + + assert((!Target.getSymA() || + Target.getSymA()->getKind() == MCSymbolRefExpr::VK_None) && + "Should only be expression-level modifiers here"); + + assert((!Target.getSymB() || + Target.getSymB()->getKind() == MCSymbolRefExpr::VK_None) && + "Should only be expression-level modifiers here"); + + if (IsPCRel) { + switch ((unsigned)Fixup.getKind()) { + case FK_Data_2: + return ELF::R_AARCH64_PREL16; + case FK_Data_4: + return ELF::R_AARCH64_PREL32; + case FK_Data_8: + return ELF::R_AARCH64_PREL64; + case AArch64::fixup_aarch64_pcrel_adr_imm21: + assert(SymLoc == AArch64MCExpr::VK_NONE && "unexpected ADR relocation"); + return ELF::R_AARCH64_ADR_PREL_LO21; + case AArch64::fixup_aarch64_pcrel_adrp_imm21: + if (SymLoc == AArch64MCExpr::VK_ABS && !IsNC) + return ELF::R_AARCH64_ADR_PREL_PG_HI21; + if (SymLoc == AArch64MCExpr::VK_GOT && !IsNC) + return ELF::R_AARCH64_ADR_GOT_PAGE; + if (SymLoc == AArch64MCExpr::VK_GOTTPREL && !IsNC) + return ELF::R_AARCH64_TLSIE_ADR_GOTTPREL_PAGE21; + if (SymLoc == AArch64MCExpr::VK_TLSDESC && !IsNC) + return ELF::R_AARCH64_TLSDESC_ADR_PAGE; + llvm_unreachable("invalid symbol kind for ADRP relocation"); + case AArch64::fixup_aarch64_pcrel_branch26: + return ELF::R_AARCH64_JUMP26; + case AArch64::fixup_aarch64_pcrel_call26: + return ELF::R_AARCH64_CALL26; + case AArch64::fixup_aarch64_ldr_pcrel_imm19: + if (SymLoc == AArch64MCExpr::VK_GOTTPREL) + return ELF::R_AARCH64_TLSIE_LD_GOTTPREL_PREL19; + return ELF::R_AARCH64_LD_PREL_LO19; + case AArch64::fixup_aarch64_pcrel_branch14: + return ELF::R_AARCH64_TSTBR14; + case AArch64::fixup_aarch64_pcrel_branch19: + return ELF::R_AARCH64_CONDBR19; + default: + llvm_unreachable("Unsupported pc-relative fixup kind"); + } + } else { + switch ((unsigned)Fixup.getKind()) { + case FK_Data_2: + return ELF::R_AARCH64_ABS16; + case FK_Data_4: + return ELF::R_AARCH64_ABS32; + case FK_Data_8: + return ELF::R_AARCH64_ABS64; + case AArch64::fixup_aarch64_add_imm12: + if (RefKind == AArch64MCExpr::VK_DTPREL_HI12) + return ELF::R_AARCH64_TLSLD_ADD_DTPREL_HI12; + if (RefKind == AArch64MCExpr::VK_TPREL_HI12) + return ELF::R_AARCH64_TLSLE_ADD_TPREL_HI12; + if (RefKind == AArch64MCExpr::VK_DTPREL_LO12_NC) + return ELF::R_AARCH64_TLSLD_ADD_DTPREL_LO12_NC; + if (RefKind == AArch64MCExpr::VK_DTPREL_LO12) + return 
ELF::R_AARCH64_TLSLD_ADD_DTPREL_LO12; + if (RefKind == AArch64MCExpr::VK_TPREL_LO12_NC) + return ELF::R_AARCH64_TLSLE_ADD_TPREL_LO12_NC; + if (RefKind == AArch64MCExpr::VK_TPREL_LO12) + return ELF::R_AARCH64_TLSLE_ADD_TPREL_LO12; + if (RefKind == AArch64MCExpr::VK_TLSDESC_LO12) + return ELF::R_AARCH64_TLSDESC_ADD_LO12_NC; + if (SymLoc == AArch64MCExpr::VK_ABS && IsNC) + return ELF::R_AARCH64_ADD_ABS_LO12_NC; + + report_fatal_error("invalid fixup for add (uimm12) instruction"); + return 0; + case AArch64::fixup_aarch64_ldst_imm12_scale1: + if (SymLoc == AArch64MCExpr::VK_ABS && IsNC) + return ELF::R_AARCH64_LDST8_ABS_LO12_NC; + if (SymLoc == AArch64MCExpr::VK_DTPREL && !IsNC) + return ELF::R_AARCH64_TLSLD_LDST8_DTPREL_LO12; + if (SymLoc == AArch64MCExpr::VK_DTPREL && IsNC) + return ELF::R_AARCH64_TLSLD_LDST8_DTPREL_LO12_NC; + if (SymLoc == AArch64MCExpr::VK_TPREL && !IsNC) + return ELF::R_AARCH64_TLSLE_LDST8_TPREL_LO12; + if (SymLoc == AArch64MCExpr::VK_TPREL && IsNC) + return ELF::R_AARCH64_TLSLE_LDST8_TPREL_LO12_NC; + + report_fatal_error("invalid fixup for 8-bit load/store instruction"); + return 0; + case AArch64::fixup_aarch64_ldst_imm12_scale2: + if (SymLoc == AArch64MCExpr::VK_ABS && IsNC) + return ELF::R_AARCH64_LDST16_ABS_LO12_NC; + if (SymLoc == AArch64MCExpr::VK_DTPREL && !IsNC) + return ELF::R_AARCH64_TLSLD_LDST16_DTPREL_LO12; + if (SymLoc == AArch64MCExpr::VK_DTPREL && IsNC) + return ELF::R_AARCH64_TLSLD_LDST16_DTPREL_LO12_NC; + if (SymLoc == AArch64MCExpr::VK_TPREL && !IsNC) + return ELF::R_AARCH64_TLSLE_LDST16_TPREL_LO12; + if (SymLoc == AArch64MCExpr::VK_TPREL && IsNC) + return ELF::R_AARCH64_TLSLE_LDST16_TPREL_LO12_NC; + + report_fatal_error("invalid fixup for 16-bit load/store instruction"); + return 0; + case AArch64::fixup_aarch64_ldst_imm12_scale4: + if (SymLoc == AArch64MCExpr::VK_ABS && IsNC) + return ELF::R_AARCH64_LDST32_ABS_LO12_NC; + if (SymLoc == AArch64MCExpr::VK_DTPREL && !IsNC) + return ELF::R_AARCH64_TLSLD_LDST32_DTPREL_LO12; + if (SymLoc == AArch64MCExpr::VK_DTPREL && IsNC) + return ELF::R_AARCH64_TLSLD_LDST32_DTPREL_LO12_NC; + if (SymLoc == AArch64MCExpr::VK_TPREL && !IsNC) + return ELF::R_AARCH64_TLSLE_LDST32_TPREL_LO12; + if (SymLoc == AArch64MCExpr::VK_TPREL && IsNC) + return ELF::R_AARCH64_TLSLE_LDST32_TPREL_LO12_NC; + + report_fatal_error("invalid fixup for 32-bit load/store instruction"); + return 0; + case AArch64::fixup_aarch64_ldst_imm12_scale8: + if (SymLoc == AArch64MCExpr::VK_ABS && IsNC) + return ELF::R_AARCH64_LDST64_ABS_LO12_NC; + if (SymLoc == AArch64MCExpr::VK_GOT && IsNC) + return ELF::R_AARCH64_LD64_GOT_LO12_NC; + if (SymLoc == AArch64MCExpr::VK_DTPREL && !IsNC) + return ELF::R_AARCH64_TLSLD_LDST64_DTPREL_LO12; + if (SymLoc == AArch64MCExpr::VK_DTPREL && IsNC) + return ELF::R_AARCH64_TLSLD_LDST64_DTPREL_LO12_NC; + if (SymLoc == AArch64MCExpr::VK_TPREL && !IsNC) + return ELF::R_AARCH64_TLSLE_LDST64_TPREL_LO12; + if (SymLoc == AArch64MCExpr::VK_TPREL && IsNC) + return ELF::R_AARCH64_TLSLE_LDST64_TPREL_LO12_NC; + if (SymLoc == AArch64MCExpr::VK_GOTTPREL && IsNC) + return ELF::R_AARCH64_TLSIE_LD64_GOTTPREL_LO12_NC; + if (SymLoc == AArch64MCExpr::VK_TLSDESC && IsNC) + return ELF::R_AARCH64_TLSDESC_LD64_LO12_NC; + + report_fatal_error("invalid fixup for 64-bit load/store instruction"); + return 0; + case AArch64::fixup_aarch64_ldst_imm12_scale16: + if (SymLoc == AArch64MCExpr::VK_ABS && IsNC) + return ELF::R_AARCH64_LDST128_ABS_LO12_NC; + + report_fatal_error("invalid fixup for 128-bit load/store instruction"); + return 0; + case 
AArch64::fixup_aarch64_movw: + if (RefKind == AArch64MCExpr::VK_ABS_G3) + return ELF::R_AARCH64_MOVW_UABS_G3; + if (RefKind == AArch64MCExpr::VK_ABS_G2) + return ELF::R_AARCH64_MOVW_UABS_G2; + if (RefKind == AArch64MCExpr::VK_ABS_G2_S) + return ELF::R_AARCH64_MOVW_SABS_G2; + if (RefKind == AArch64MCExpr::VK_ABS_G2_NC) + return ELF::R_AARCH64_MOVW_UABS_G2_NC; + if (RefKind == AArch64MCExpr::VK_ABS_G1) + return ELF::R_AARCH64_MOVW_UABS_G1; + if (RefKind == AArch64MCExpr::VK_ABS_G1_S) + return ELF::R_AARCH64_MOVW_SABS_G1; + if (RefKind == AArch64MCExpr::VK_ABS_G1_NC) + return ELF::R_AARCH64_MOVW_UABS_G1_NC; + if (RefKind == AArch64MCExpr::VK_ABS_G0) + return ELF::R_AARCH64_MOVW_UABS_G0; + if (RefKind == AArch64MCExpr::VK_ABS_G0_S) + return ELF::R_AARCH64_MOVW_SABS_G0; + if (RefKind == AArch64MCExpr::VK_ABS_G0_NC) + return ELF::R_AARCH64_MOVW_UABS_G0_NC; + if (RefKind == AArch64MCExpr::VK_DTPREL_G2) + return ELF::R_AARCH64_TLSLD_MOVW_DTPREL_G2; + if (RefKind == AArch64MCExpr::VK_DTPREL_G1) + return ELF::R_AARCH64_TLSLD_MOVW_DTPREL_G1; + if (RefKind == AArch64MCExpr::VK_DTPREL_G1_NC) + return ELF::R_AARCH64_TLSLD_MOVW_DTPREL_G1_NC; + if (RefKind == AArch64MCExpr::VK_DTPREL_G0) + return ELF::R_AARCH64_TLSLD_MOVW_DTPREL_G0; + if (RefKind == AArch64MCExpr::VK_DTPREL_G0_NC) + return ELF::R_AARCH64_TLSLD_MOVW_DTPREL_G0_NC; + if (RefKind == AArch64MCExpr::VK_TPREL_G2) + return ELF::R_AARCH64_TLSLE_MOVW_TPREL_G2; + if (RefKind == AArch64MCExpr::VK_TPREL_G1) + return ELF::R_AARCH64_TLSLE_MOVW_TPREL_G1; + if (RefKind == AArch64MCExpr::VK_TPREL_G1_NC) + return ELF::R_AARCH64_TLSLE_MOVW_TPREL_G1_NC; + if (RefKind == AArch64MCExpr::VK_TPREL_G0) + return ELF::R_AARCH64_TLSLE_MOVW_TPREL_G0; + if (RefKind == AArch64MCExpr::VK_TPREL_G0_NC) + return ELF::R_AARCH64_TLSLE_MOVW_TPREL_G0_NC; + if (RefKind == AArch64MCExpr::VK_GOTTPREL_G1) + return ELF::R_AARCH64_TLSIE_MOVW_GOTTPREL_G1; + if (RefKind == AArch64MCExpr::VK_GOTTPREL_G0_NC) + return ELF::R_AARCH64_TLSIE_MOVW_GOTTPREL_G0_NC; + report_fatal_error("invalid fixup for movz/movk instruction"); + return 0; + case AArch64::fixup_aarch64_tlsdesc_call: + return ELF::R_AARCH64_TLSDESC_CALL; + default: + llvm_unreachable("Unknown ELF relocation type"); + } + } + + llvm_unreachable("Unimplemented fixup -> relocation"); +} + +MCObjectWriter *llvm::createAArch64ELFObjectWriter(raw_ostream &OS, + uint8_t OSABI, + bool IsLittleEndian) { + MCELFObjectTargetWriter *MOTW = + new AArch64ELFObjectWriter(OSABI, IsLittleEndian); + return createELFObjectWriter(MOTW, OS, IsLittleEndian); +} diff --git a/lib/Target/AArch64/MCTargetDesc/AArch64ELFStreamer.cpp b/lib/Target/AArch64/MCTargetDesc/AArch64ELFStreamer.cpp new file mode 100644 index 00000000000..a79406d9d1f --- /dev/null +++ b/lib/Target/AArch64/MCTargetDesc/AArch64ELFStreamer.cpp @@ -0,0 +1,160 @@ +//===- lib/MC/AArch64ELFStreamer.cpp - ELF Object Output for AArch64 ------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file assembles .s files and emits AArch64 ELF .o object files. Different +// from generic ELF streamer in emitting mapping symbols ($x and $d) to delimit +// regions of data and code. 
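// Illustrative example (not from the original commit): assembling
//
//   foo:
//     add   x0, x0, #1
//     .word 0x1234
//
// emits a $x mapping symbol at foo (start of A64 code) and a $d mapping
// symbol in front of the .word, so tools can tell the inline data apart from
// instructions.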
+// +//===----------------------------------------------------------------------===// + +#include "llvm/MC/MCELFStreamer.h" +#include "llvm/ADT/SmallPtrSet.h" +#include "llvm/ADT/Twine.h" +#include "llvm/MC/MCAsmBackend.h" +#include "llvm/MC/MCAssembler.h" +#include "llvm/MC/MCCodeEmitter.h" +#include "llvm/MC/MCContext.h" +#include "llvm/MC/MCELF.h" +#include "llvm/MC/MCELFStreamer.h" +#include "llvm/MC/MCELFSymbolFlags.h" +#include "llvm/MC/MCExpr.h" +#include "llvm/MC/MCInst.h" +#include "llvm/MC/MCObjectStreamer.h" +#include "llvm/MC/MCSection.h" +#include "llvm/MC/MCSectionELF.h" +#include "llvm/MC/MCStreamer.h" +#include "llvm/MC/MCSymbol.h" +#include "llvm/MC/MCValue.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/ELF.h" +#include "llvm/Support/ErrorHandling.h" +#include "llvm/Support/raw_ostream.h" + +using namespace llvm; + +namespace { + +/// Extend the generic ELFStreamer class so that it can emit mapping symbols at +/// the appropriate points in the object files. These symbols are defined in the +/// AArch64 ELF ABI: +/// infocenter.arm.com/help/topic/com.arm.doc.ihi0056a/IHI0056A_aaelf64.pdf +/// +/// In brief: $x or $d should be emitted at the start of each contiguous region +/// of A64 code or data in a section. In practice, this emission does not rely +/// on explicit assembler directives but on inherent properties of the +/// directives doing the emission (e.g. ".byte" is data, "add x0, x0, x0" an +/// instruction). +/// +/// As a result this system is orthogonal to the DataRegion infrastructure used +/// by MachO. Beware! +class AArch64ELFStreamer : public MCELFStreamer { +public: + AArch64ELFStreamer(MCContext &Context, MCAsmBackend &TAB, raw_ostream &OS, + MCCodeEmitter *Emitter) + : MCELFStreamer(Context, TAB, OS, Emitter), MappingSymbolCounter(0), + LastEMS(EMS_None) {} + + ~AArch64ELFStreamer() {} + + void ChangeSection(const MCSection *Section, + const MCExpr *Subsection) override { + // We have to keep track of the mapping symbol state of any sections we + // use. Each one should start off as EMS_None, which is provided as the + // default constructor by DenseMap::lookup. + LastMappingSymbols[getPreviousSection().first] = LastEMS; + LastEMS = LastMappingSymbols.lookup(Section); + + MCELFStreamer::ChangeSection(Section, Subsection); + } + + /// This function is the one used to emit instruction data into the ELF + /// streamer. We override it to add the appropriate mapping symbol if + /// necessary. + void EmitInstruction(const MCInst &Inst, + const MCSubtargetInfo &STI) override { + EmitA64MappingSymbol(); + MCELFStreamer::EmitInstruction(Inst, STI); + } + + /// This is one of the functions used to emit data into an ELF section, so the + /// AArch64 streamer overrides it to add the appropriate mapping symbol ($d) + /// if necessary. + void EmitBytes(StringRef Data) override { + EmitDataMappingSymbol(); + MCELFStreamer::EmitBytes(Data); + } + + /// This is one of the functions used to emit data into an ELF section, so the + /// AArch64 streamer overrides it to add the appropriate mapping symbol ($d) + /// if necessary. 
+ void EmitValueImpl(const MCExpr *Value, unsigned Size, + const SMLoc &Loc) override { + EmitDataMappingSymbol(); + MCELFStreamer::EmitValueImpl(Value, Size); + } + +private: + enum ElfMappingSymbol { + EMS_None, + EMS_A64, + EMS_Data + }; + + void EmitDataMappingSymbol() { + if (LastEMS == EMS_Data) + return; + EmitMappingSymbol("$d"); + LastEMS = EMS_Data; + } + + void EmitA64MappingSymbol() { + if (LastEMS == EMS_A64) + return; + EmitMappingSymbol("$x"); + LastEMS = EMS_A64; + } + + void EmitMappingSymbol(StringRef Name) { + MCSymbol *Start = getContext().CreateTempSymbol(); + EmitLabel(Start); + + MCSymbol *Symbol = getContext().GetOrCreateSymbol( + Name + "." + Twine(MappingSymbolCounter++)); + + MCSymbolData &SD = getAssembler().getOrCreateSymbolData(*Symbol); + MCELF::SetType(SD, ELF::STT_NOTYPE); + MCELF::SetBinding(SD, ELF::STB_LOCAL); + SD.setExternal(false); + Symbol->setSection(*getCurrentSection().first); + + const MCExpr *Value = MCSymbolRefExpr::Create(Start, getContext()); + Symbol->setVariableValue(Value); + } + + int64_t MappingSymbolCounter; + + DenseMap LastMappingSymbols; + ElfMappingSymbol LastEMS; + + /// @} +}; +} + +namespace llvm { +MCELFStreamer *createAArch64ELFStreamer(MCContext &Context, MCAsmBackend &TAB, + raw_ostream &OS, MCCodeEmitter *Emitter, + bool RelaxAll, bool NoExecStack) { + AArch64ELFStreamer *S = new AArch64ELFStreamer(Context, TAB, OS, Emitter); + if (RelaxAll) + S->getAssembler().setRelaxAll(true); + if (NoExecStack) + S->getAssembler().setNoExecStack(true); + return S; +} +} diff --git a/lib/Target/AArch64/MCTargetDesc/AArch64ELFStreamer.h b/lib/Target/AArch64/MCTargetDesc/AArch64ELFStreamer.h new file mode 100644 index 00000000000..bc6973bd5f8 --- /dev/null +++ b/lib/Target/AArch64/MCTargetDesc/AArch64ELFStreamer.h @@ -0,0 +1,26 @@ +//===-- AArch64ELFStreamer.h - ELF Streamer for AArch64 ---------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file implements ELF streamer information for the AArch64 backend. +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_AARCH64_ELF_STREAMER_H +#define LLVM_AARCH64_ELF_STREAMER_H + +#include "llvm/MC/MCELFStreamer.h" + +namespace llvm { + +MCELFStreamer *createAArch64ELFStreamer(MCContext &Context, MCAsmBackend &TAB, + raw_ostream &OS, MCCodeEmitter *Emitter, + bool RelaxAll, bool NoExecStack); +} + +#endif // AArch64_ELF_STREAMER_H diff --git a/lib/Target/AArch64/MCTargetDesc/AArch64FixupKinds.h b/lib/Target/AArch64/MCTargetDesc/AArch64FixupKinds.h new file mode 100644 index 00000000000..bf405fbac77 --- /dev/null +++ b/lib/Target/AArch64/MCTargetDesc/AArch64FixupKinds.h @@ -0,0 +1,76 @@ +//===-- AArch64FixupKinds.h - AArch64 Specific Fixup Entries ----*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_AArch64FIXUPKINDS_H +#define LLVM_AArch64FIXUPKINDS_H + +#include "llvm/MC/MCFixup.h" + +namespace llvm { +namespace AArch64 { + +enum Fixups { + // fixup_aarch64_pcrel_adr_imm21 - A 21-bit pc-relative immediate inserted into + // an ADR instruction. 
+ fixup_aarch64_pcrel_adr_imm21 = FirstTargetFixupKind, + + // fixup_aarch64_pcrel_adrp_imm21 - A 21-bit pc-relative immediate inserted into + // an ADRP instruction. + fixup_aarch64_pcrel_adrp_imm21, + + // fixup_aarch64_imm12 - 12-bit fixup for add/sub instructions. + // No alignment adjustment. All value bits are encoded. + fixup_aarch64_add_imm12, + + // fixup_aarch64_ldst_imm12_* - unsigned 12-bit fixups for load and + // store instructions. + fixup_aarch64_ldst_imm12_scale1, + fixup_aarch64_ldst_imm12_scale2, + fixup_aarch64_ldst_imm12_scale4, + fixup_aarch64_ldst_imm12_scale8, + fixup_aarch64_ldst_imm12_scale16, + + // fixup_aarch64_ldr_pcrel_imm19 - The high 19 bits of a 21-bit pc-relative + // immediate. Same encoding as fixup_aarch64_pcrel_adrhi, except this is used by + // pc-relative loads and generates relocations directly when necessary. + fixup_aarch64_ldr_pcrel_imm19, + + // FIXME: comment + fixup_aarch64_movw, + + // fixup_aarch64_pcrel_imm14 - The high 14 bits of a 21-bit pc-relative + // immediate. + fixup_aarch64_pcrel_branch14, + + // fixup_aarch64_pcrel_branch19 - The high 19 bits of a 21-bit pc-relative + // immediate. Same encoding as fixup_aarch64_pcrel_adrhi, except this is use by + // b.cc and generates relocations directly when necessary. + fixup_aarch64_pcrel_branch19, + + // fixup_aarch64_pcrel_branch26 - The high 26 bits of a 28-bit pc-relative + // immediate. + fixup_aarch64_pcrel_branch26, + + // fixup_aarch64_pcrel_call26 - The high 26 bits of a 28-bit pc-relative + // immediate. Distinguished from branch26 only on ELF. + fixup_aarch64_pcrel_call26, + + // fixup_aarch64_tlsdesc_call - zero-space placeholder for the ELF + // R_AARCH64_TLSDESC_CALL relocation. + fixup_aarch64_tlsdesc_call, + + // Marker + LastTargetFixupKind, + NumTargetFixupKinds = LastTargetFixupKind - FirstTargetFixupKind +}; + +} // end namespace AArch64 +} // end namespace llvm + +#endif diff --git a/lib/Target/AArch64/MCTargetDesc/AArch64MCAsmInfo.cpp b/lib/Target/AArch64/MCTargetDesc/AArch64MCAsmInfo.cpp new file mode 100644 index 00000000000..dc4a8bf6c9a --- /dev/null +++ b/lib/Target/AArch64/MCTargetDesc/AArch64MCAsmInfo.cpp @@ -0,0 +1,99 @@ +//===-- AArch64MCAsmInfo.cpp - AArch64 asm properties ---------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file contains the declarations of the AArch64MCAsmInfo properties. +// +//===----------------------------------------------------------------------===// + +#include "AArch64MCAsmInfo.h" +#include "llvm/ADT/Triple.h" +#include "llvm/MC/MCExpr.h" +#include "llvm/MC/MCContext.h" +#include "llvm/MC/MCStreamer.h" +#include "llvm/Support/CommandLine.h" +using namespace llvm; + +enum AsmWriterVariantTy { + Default = -1, + Generic = 0, + Apple = 1 +}; + +static cl::opt AsmWriterVariant( + "aarch64-neon-syntax", cl::init(Default), + cl::desc("Choose style of NEON code to emit from AArch64 backend:"), + cl::values(clEnumValN(Generic, "generic", "Emit generic NEON assembly"), + clEnumValN(Apple, "apple", "Emit Apple-style NEON assembly"), + clEnumValEnd)); + +AArch64MCAsmInfoDarwin::AArch64MCAsmInfoDarwin() { + // We prefer NEON instructions to be printed in the short form. + AssemblerDialect = AsmWriterVariant == Default ? 
1 : AsmWriterVariant; + + PrivateGlobalPrefix = "L"; + SeparatorString = "%%"; + CommentString = ";"; + PointerSize = CalleeSaveStackSlotSize = 8; + + AlignmentIsInBytes = false; + UsesELFSectionDirectiveForBSS = true; + SupportsDebugInformation = true; + UseDataRegionDirectives = true; + + ExceptionsType = ExceptionHandling::DwarfCFI; +} + +const MCExpr *AArch64MCAsmInfoDarwin::getExprForPersonalitySymbol( + const MCSymbol *Sym, unsigned Encoding, MCStreamer &Streamer) const { + // On Darwin, we can reference dwarf symbols with foo@GOT-., which + // is an indirect pc-relative reference. The default implementation + // won't reference using the GOT, so we need this target-specific + // version. + MCContext &Context = Streamer.getContext(); + const MCExpr *Res = + MCSymbolRefExpr::Create(Sym, MCSymbolRefExpr::VK_GOT, Context); + MCSymbol *PCSym = Context.CreateTempSymbol(); + Streamer.EmitLabel(PCSym); + const MCExpr *PC = MCSymbolRefExpr::Create(PCSym, Context); + return MCBinaryExpr::CreateSub(Res, PC, Context); +} + +AArch64MCAsmInfoELF::AArch64MCAsmInfoELF(StringRef TT) { + Triple T(TT); + if (T.getArch() == Triple::arm64_be || T.getArch() == Triple::aarch64_be) + IsLittleEndian = false; + + // We prefer NEON instructions to be printed in the short form. + AssemblerDialect = AsmWriterVariant == Default ? 0 : AsmWriterVariant; + + PointerSize = 8; + + // ".comm align is in bytes but .align is pow-2." + AlignmentIsInBytes = false; + + CommentString = "//"; + PrivateGlobalPrefix = ".L"; + Code32Directive = ".code\t32"; + + Data16bitsDirective = "\t.hword\t"; + Data32bitsDirective = "\t.word\t"; + Data64bitsDirective = "\t.xword\t"; + + UseDataRegionDirectives = false; + + WeakRefDirective = "\t.weak\t"; + + HasLEB128 = true; + SupportsDebugInformation = true; + + // Exceptions handling + ExceptionsType = ExceptionHandling::DwarfCFI; + + UseIntegratedAssembler = true; +} diff --git a/lib/Target/AArch64/MCTargetDesc/AArch64MCAsmInfo.h b/lib/Target/AArch64/MCTargetDesc/AArch64MCAsmInfo.h new file mode 100644 index 00000000000..42a031d7c2c --- /dev/null +++ b/lib/Target/AArch64/MCTargetDesc/AArch64MCAsmInfo.h @@ -0,0 +1,36 @@ +//=====-- AArch64MCAsmInfo.h - AArch64 asm properties ---------*- C++ -*--====// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file contains the declaration of the AArch64MCAsmInfo class. 
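// Illustrative example (not from the original commit): the Darwin and ELF
// flavours above mainly differ in dialect and directives; with
// -aarch64-neon-syntax=apple the printer is expected to produce
//
//   fadd.4s v0, v1, v2          // Apple style (dialect 1)
//
// rather than the generic
//
//   fadd v0.4s, v1.4s, v2.4s    // generic style (dialect 0)
//
// while the ELF variant also switches the comment string to "//" and the
// data directives to .hword/.word/.xword.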
+// +//===----------------------------------------------------------------------===// + +#ifndef AArch64TARGETASMINFO_H +#define AArch64TARGETASMINFO_H + +#include "llvm/MC/MCAsmInfoDarwin.h" + +namespace llvm { +class Target; +class StringRef; +class MCStreamer; +struct AArch64MCAsmInfoDarwin : public MCAsmInfoDarwin { + explicit AArch64MCAsmInfoDarwin(); + const MCExpr * + getExprForPersonalitySymbol(const MCSymbol *Sym, unsigned Encoding, + MCStreamer &Streamer) const override; +}; + +struct AArch64MCAsmInfoELF : public MCAsmInfo { + explicit AArch64MCAsmInfoELF(StringRef TT); +}; + +} // namespace llvm + +#endif diff --git a/lib/Target/AArch64/MCTargetDesc/AArch64MCCodeEmitter.cpp b/lib/Target/AArch64/MCTargetDesc/AArch64MCCodeEmitter.cpp new file mode 100644 index 00000000000..464a18cdbc0 --- /dev/null +++ b/lib/Target/AArch64/MCTargetDesc/AArch64MCCodeEmitter.cpp @@ -0,0 +1,654 @@ +//=- AArch64/AArch64MCCodeEmitter.cpp - Convert AArch64 code to machine code-=// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file implements the AArch64MCCodeEmitter class. +// +//===----------------------------------------------------------------------===// + +#include "MCTargetDesc/AArch64AddressingModes.h" +#include "MCTargetDesc/AArch64FixupKinds.h" +#include "MCTargetDesc/AArch64MCExpr.h" +#include "Utils/AArch64BaseInfo.h" +#include "llvm/MC/MCCodeEmitter.h" +#include "llvm/MC/MCContext.h" +#include "llvm/MC/MCInst.h" +#include "llvm/MC/MCInstrInfo.h" +#include "llvm/MC/MCRegisterInfo.h" +#include "llvm/MC/MCSubtargetInfo.h" +#include "llvm/ADT/Statistic.h" +#include "llvm/Support/raw_ostream.h" +using namespace llvm; + +#define DEBUG_TYPE "mccodeemitter" + +STATISTIC(MCNumEmitted, "Number of MC instructions emitted."); +STATISTIC(MCNumFixups, "Number of MC fixups created."); + +namespace { + +class AArch64MCCodeEmitter : public MCCodeEmitter { + MCContext &Ctx; + + AArch64MCCodeEmitter(const AArch64MCCodeEmitter &); // DO NOT IMPLEMENT + void operator=(const AArch64MCCodeEmitter &); // DO NOT IMPLEMENT +public: + AArch64MCCodeEmitter(const MCInstrInfo &mcii, const MCSubtargetInfo &sti, + MCContext &ctx) + : Ctx(ctx) {} + + ~AArch64MCCodeEmitter() {} + + // getBinaryCodeForInstr - TableGen'erated function for getting the + // binary encoding for an instruction. + uint64_t getBinaryCodeForInstr(const MCInst &MI, + SmallVectorImpl &Fixups, + const MCSubtargetInfo &STI) const; + + /// getMachineOpValue - Return binary encoding of operand. If the machine + /// operand requires relocation, record the relocation and return zero. + unsigned getMachineOpValue(const MCInst &MI, const MCOperand &MO, + SmallVectorImpl &Fixups, + const MCSubtargetInfo &STI) const; + + /// getLdStUImm12OpValue - Return encoding info for 12-bit unsigned immediate + /// attached to a load, store or prfm instruction. If operand requires a + /// relocation, record it and return zero in that part of the encoding. + template + uint32_t getLdStUImm12OpValue(const MCInst &MI, unsigned OpIdx, + SmallVectorImpl &Fixups, + const MCSubtargetInfo &STI) const; + + /// getAdrLabelOpValue - Return encoding info for 21-bit immediate ADR label + /// target. 
+ uint32_t getAdrLabelOpValue(const MCInst &MI, unsigned OpIdx, + SmallVectorImpl &Fixups, + const MCSubtargetInfo &STI) const; + + /// getAddSubImmOpValue - Return encoding for the 12-bit immediate value and + /// the 2-bit shift field. + uint32_t getAddSubImmOpValue(const MCInst &MI, unsigned OpIdx, + SmallVectorImpl &Fixups, + const MCSubtargetInfo &STI) const; + + /// getCondBranchTargetOpValue - Return the encoded value for a conditional + /// branch target. + uint32_t getCondBranchTargetOpValue(const MCInst &MI, unsigned OpIdx, + SmallVectorImpl &Fixups, + const MCSubtargetInfo &STI) const; + + /// getLoadLiteralOpValue - Return the encoded value for a load-literal + /// pc-relative address. + uint32_t getLoadLiteralOpValue(const MCInst &MI, unsigned OpIdx, + SmallVectorImpl &Fixups, + const MCSubtargetInfo &STI) const; + + /// getMemExtendOpValue - Return the encoded value for a reg-extend load/store + /// instruction: bit 0 is whether a shift is present, bit 1 is whether the + /// operation is a sign extend (as opposed to a zero extend). + uint32_t getMemExtendOpValue(const MCInst &MI, unsigned OpIdx, + SmallVectorImpl &Fixups, + const MCSubtargetInfo &STI) const; + + /// getTestBranchTargetOpValue - Return the encoded value for a test-bit-and- + /// branch target. + uint32_t getTestBranchTargetOpValue(const MCInst &MI, unsigned OpIdx, + SmallVectorImpl &Fixups, + const MCSubtargetInfo &STI) const; + + /// getBranchTargetOpValue - Return the encoded value for an unconditional + /// branch target. + uint32_t getBranchTargetOpValue(const MCInst &MI, unsigned OpIdx, + SmallVectorImpl &Fixups, + const MCSubtargetInfo &STI) const; + + /// getMoveWideImmOpValue - Return the encoded value for the immediate operand + /// of a MOVZ or MOVK instruction. + uint32_t getMoveWideImmOpValue(const MCInst &MI, unsigned OpIdx, + SmallVectorImpl &Fixups, + const MCSubtargetInfo &STI) const; + + /// getVecShifterOpValue - Return the encoded value for the vector shifter. + uint32_t getVecShifterOpValue(const MCInst &MI, unsigned OpIdx, + SmallVectorImpl &Fixups, + const MCSubtargetInfo &STI) const; + + /// getMoveVecShifterOpValue - Return the encoded value for the vector move + /// shifter (MSL). + uint32_t getMoveVecShifterOpValue(const MCInst &MI, unsigned OpIdx, + SmallVectorImpl &Fixups, + const MCSubtargetInfo &STI) const; + + /// getFixedPointScaleOpValue - Return the encoded value for the + // FP-to-fixed-point scale factor. 
+ uint32_t getFixedPointScaleOpValue(const MCInst &MI, unsigned OpIdx, + SmallVectorImpl &Fixups, + const MCSubtargetInfo &STI) const; + + uint32_t getVecShiftR64OpValue(const MCInst &MI, unsigned OpIdx, + SmallVectorImpl &Fixups, + const MCSubtargetInfo &STI) const; + uint32_t getVecShiftR32OpValue(const MCInst &MI, unsigned OpIdx, + SmallVectorImpl &Fixups, + const MCSubtargetInfo &STI) const; + uint32_t getVecShiftR16OpValue(const MCInst &MI, unsigned OpIdx, + SmallVectorImpl &Fixups, + const MCSubtargetInfo &STI) const; + uint32_t getVecShiftR8OpValue(const MCInst &MI, unsigned OpIdx, + SmallVectorImpl &Fixups, + const MCSubtargetInfo &STI) const; + uint32_t getVecShiftL64OpValue(const MCInst &MI, unsigned OpIdx, + SmallVectorImpl &Fixups, + const MCSubtargetInfo &STI) const; + uint32_t getVecShiftL32OpValue(const MCInst &MI, unsigned OpIdx, + SmallVectorImpl &Fixups, + const MCSubtargetInfo &STI) const; + uint32_t getVecShiftL16OpValue(const MCInst &MI, unsigned OpIdx, + SmallVectorImpl &Fixups, + const MCSubtargetInfo &STI) const; + uint32_t getVecShiftL8OpValue(const MCInst &MI, unsigned OpIdx, + SmallVectorImpl &Fixups, + const MCSubtargetInfo &STI) const; + + /// getSIMDShift64OpValue - Return the encoded value for the + // shift-by-immediate AdvSIMD instructions. + uint32_t getSIMDShift64OpValue(const MCInst &MI, unsigned OpIdx, + SmallVectorImpl &Fixups, + const MCSubtargetInfo &STI) const; + + uint32_t getSIMDShift64_32OpValue(const MCInst &MI, unsigned OpIdx, + SmallVectorImpl &Fixups, + const MCSubtargetInfo &STI) const; + + uint32_t getSIMDShift32OpValue(const MCInst &MI, unsigned OpIdx, + SmallVectorImpl &Fixups, + const MCSubtargetInfo &STI) const; + + uint32_t getSIMDShift16OpValue(const MCInst &MI, unsigned OpIdx, + SmallVectorImpl &Fixups, + const MCSubtargetInfo &STI) const; + + unsigned fixMOVZ(const MCInst &MI, unsigned EncodedValue, + const MCSubtargetInfo &STI) const; + + void EmitByte(unsigned char C, raw_ostream &OS) const { OS << (char)C; } + + void EmitConstant(uint64_t Val, unsigned Size, raw_ostream &OS) const { + // Output the constant in little endian byte order. + for (unsigned i = 0; i != Size; ++i) { + EmitByte(Val & 255, OS); + Val >>= 8; + } + } + + void EncodeInstruction(const MCInst &MI, raw_ostream &OS, + SmallVectorImpl &Fixups, + const MCSubtargetInfo &STI) const override; + + unsigned fixMulHigh(const MCInst &MI, unsigned EncodedValue, + const MCSubtargetInfo &STI) const; + + template unsigned + fixLoadStoreExclusive(const MCInst &MI, unsigned EncodedValue, + const MCSubtargetInfo &STI) const; + + unsigned fixOneOperandFPComparison(const MCInst &MI, unsigned EncodedValue, + const MCSubtargetInfo &STI) const; +}; + +} // end anonymous namespace + +MCCodeEmitter *llvm::createAArch64MCCodeEmitter(const MCInstrInfo &MCII, + const MCRegisterInfo &MRI, + const MCSubtargetInfo &STI, + MCContext &Ctx) { + return new AArch64MCCodeEmitter(MCII, STI, Ctx); +} + +/// getMachineOpValue - Return binary encoding of operand. If the machine +/// operand requires relocation, record the relocation and return zero. 
+unsigned
+AArch64MCCodeEmitter::getMachineOpValue(const MCInst &MI, const MCOperand &MO,
+                                        SmallVectorImpl<MCFixup> &Fixups,
+                                        const MCSubtargetInfo &STI) const {
+  if (MO.isReg())
+    return Ctx.getRegisterInfo()->getEncodingValue(MO.getReg());
+  else {
+    assert(MO.isImm() && "did not expect relocated expression");
+    return static_cast<unsigned>(MO.getImm());
+  }
+
+  assert(0 && "Unable to encode MCOperand!");
+  return 0;
+}
+
+template<unsigned FixupKind> uint32_t
+AArch64MCCodeEmitter::getLdStUImm12OpValue(const MCInst &MI, unsigned OpIdx,
+                                           SmallVectorImpl<MCFixup> &Fixups,
+                                           const MCSubtargetInfo &STI) const {
+  const MCOperand &MO = MI.getOperand(OpIdx);
+  uint32_t ImmVal = 0;
+
+  if (MO.isImm())
+    ImmVal = static_cast<uint32_t>(MO.getImm());
+  else {
+    assert(MO.isExpr() && "unable to encode load/store imm operand");
+    MCFixupKind Kind = MCFixupKind(FixupKind);
+    Fixups.push_back(MCFixup::Create(0, MO.getExpr(), Kind, MI.getLoc()));
+    ++MCNumFixups;
+  }
+
+  return ImmVal;
+}
+
+/// getAdrLabelOpValue - Return encoding info for 21-bit immediate ADR label
+/// target.
+uint32_t
+AArch64MCCodeEmitter::getAdrLabelOpValue(const MCInst &MI, unsigned OpIdx,
+                                         SmallVectorImpl<MCFixup> &Fixups,
+                                         const MCSubtargetInfo &STI) const {
+  const MCOperand &MO = MI.getOperand(OpIdx);
+
+  // If the destination is an immediate, we have nothing to do.
+  if (MO.isImm())
+    return MO.getImm();
+  assert(MO.isExpr() && "Unexpected target type!");
+  const MCExpr *Expr = MO.getExpr();
+
+  MCFixupKind Kind = MI.getOpcode() == AArch64::ADR
+                         ? MCFixupKind(AArch64::fixup_aarch64_pcrel_adr_imm21)
+                         : MCFixupKind(AArch64::fixup_aarch64_pcrel_adrp_imm21);
+  Fixups.push_back(MCFixup::Create(0, Expr, Kind, MI.getLoc()));
+
+  MCNumFixups += 1;
+
+  // All of the information is in the fixup.
+  return 0;
+}
+
+/// getAddSubImmOpValue - Return encoding for the 12-bit immediate value and
+/// the 2-bit shift field. The shift field is stored in bits 13-14 of the
+/// return value.
+uint32_t
+AArch64MCCodeEmitter::getAddSubImmOpValue(const MCInst &MI, unsigned OpIdx,
+                                          SmallVectorImpl<MCFixup> &Fixups,
+                                          const MCSubtargetInfo &STI) const {
+  // Suboperands are [imm, shifter].
+  const MCOperand &MO = MI.getOperand(OpIdx);
+  const MCOperand &MO1 = MI.getOperand(OpIdx + 1);
+  assert(AArch64_AM::getShiftType(MO1.getImm()) == AArch64_AM::LSL &&
+         "unexpected shift type for add/sub immediate");
+  unsigned ShiftVal = AArch64_AM::getShiftValue(MO1.getImm());
+  assert((ShiftVal == 0 || ShiftVal == 12) &&
+         "unexpected shift value for add/sub immediate");
+  if (MO.isImm())
+    return MO.getImm() | (ShiftVal == 0 ? 0 : (1 << 12));
+  assert(MO.isExpr() && "Unable to encode MCOperand!");
+  const MCExpr *Expr = MO.getExpr();
+
+  // Encode the 12 bits of the fixup.
+  MCFixupKind Kind = MCFixupKind(AArch64::fixup_aarch64_add_imm12);
+  Fixups.push_back(MCFixup::Create(0, Expr, Kind, MI.getLoc()));
+
+  ++MCNumFixups;
+
+  return 0;
+}
+
+/// getCondBranchTargetOpValue - Return the encoded value for a conditional
+/// branch target.
+uint32_t AArch64MCCodeEmitter::getCondBranchTargetOpValue(
+    const MCInst &MI, unsigned OpIdx, SmallVectorImpl<MCFixup> &Fixups,
+    const MCSubtargetInfo &STI) const {
+  const MCOperand &MO = MI.getOperand(OpIdx);
+
+  // If the destination is an immediate, we have nothing to do.
+  if (MO.isImm())
+    return MO.getImm();
+  assert(MO.isExpr() && "Unexpected target type!");
+
+  MCFixupKind Kind = MCFixupKind(AArch64::fixup_aarch64_pcrel_branch19);
+  Fixups.push_back(MCFixup::Create(0, MO.getExpr(), Kind, MI.getLoc()));
+
+  ++MCNumFixups;
+
+  // All of the information is in the fixup.
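// Illustrative example (not from the original commit): for "b.ne .Lskip"
// with .Lskip not yet laid out, the operand is an MCExpr, so the code above
// records a fixup_aarch64_pcrel_branch19 against it and returns 0; the 19-bit
// field is filled in later by AArch64AsmBackend::applyFixup(), or by an
// R_AARCH64_CONDBR19 relocation if the target lands in another section.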
+ return 0; +} + +/// getLoadLiteralOpValue - Return the encoded value for a load-literal +/// pc-relative address. +uint32_t +AArch64MCCodeEmitter::getLoadLiteralOpValue(const MCInst &MI, unsigned OpIdx, + SmallVectorImpl &Fixups, + const MCSubtargetInfo &STI) const { + const MCOperand &MO = MI.getOperand(OpIdx); + + // If the destination is an immediate, we have nothing to do. + if (MO.isImm()) + return MO.getImm(); + assert(MO.isExpr() && "Unexpected target type!"); + + MCFixupKind Kind = MCFixupKind(AArch64::fixup_aarch64_ldr_pcrel_imm19); + Fixups.push_back(MCFixup::Create(0, MO.getExpr(), Kind, MI.getLoc())); + + ++MCNumFixups; + + // All of the information is in the fixup. + return 0; +} + +uint32_t +AArch64MCCodeEmitter::getMemExtendOpValue(const MCInst &MI, unsigned OpIdx, + SmallVectorImpl &Fixups, + const MCSubtargetInfo &STI) const { + unsigned SignExtend = MI.getOperand(OpIdx).getImm(); + unsigned DoShift = MI.getOperand(OpIdx + 1).getImm(); + return (SignExtend << 1) | DoShift; +} + +uint32_t +AArch64MCCodeEmitter::getMoveWideImmOpValue(const MCInst &MI, unsigned OpIdx, + SmallVectorImpl &Fixups, + const MCSubtargetInfo &STI) const { + const MCOperand &MO = MI.getOperand(OpIdx); + + if (MO.isImm()) + return MO.getImm(); + assert(MO.isExpr() && "Unexpected movz/movk immediate"); + + Fixups.push_back(MCFixup::Create( + 0, MO.getExpr(), MCFixupKind(AArch64::fixup_aarch64_movw), MI.getLoc())); + + ++MCNumFixups; + + return 0; +} + +/// getTestBranchTargetOpValue - Return the encoded value for a test-bit-and- +/// branch target. +uint32_t AArch64MCCodeEmitter::getTestBranchTargetOpValue( + const MCInst &MI, unsigned OpIdx, SmallVectorImpl &Fixups, + const MCSubtargetInfo &STI) const { + const MCOperand &MO = MI.getOperand(OpIdx); + + // If the destination is an immediate, we have nothing to do. + if (MO.isImm()) + return MO.getImm(); + assert(MO.isExpr() && "Unexpected ADR target type!"); + + MCFixupKind Kind = MCFixupKind(AArch64::fixup_aarch64_pcrel_branch14); + Fixups.push_back(MCFixup::Create(0, MO.getExpr(), Kind, MI.getLoc())); + + ++MCNumFixups; + + // All of the information is in the fixup. + return 0; +} + +/// getBranchTargetOpValue - Return the encoded value for an unconditional +/// branch target. +uint32_t +AArch64MCCodeEmitter::getBranchTargetOpValue(const MCInst &MI, unsigned OpIdx, + SmallVectorImpl &Fixups, + const MCSubtargetInfo &STI) const { + const MCOperand &MO = MI.getOperand(OpIdx); + + // If the destination is an immediate, we have nothing to do. + if (MO.isImm()) + return MO.getImm(); + assert(MO.isExpr() && "Unexpected ADR target type!"); + + MCFixupKind Kind = MI.getOpcode() == AArch64::BL + ? MCFixupKind(AArch64::fixup_aarch64_pcrel_call26) + : MCFixupKind(AArch64::fixup_aarch64_pcrel_branch26); + Fixups.push_back(MCFixup::Create(0, MO.getExpr(), Kind, MI.getLoc())); + + ++MCNumFixups; + + // All of the information is in the fixup. 
+ return 0; +} + +/// getVecShifterOpValue - Return the encoded value for the vector shifter: +/// +/// 00 -> 0 +/// 01 -> 8 +/// 10 -> 16 +/// 11 -> 24 +uint32_t +AArch64MCCodeEmitter::getVecShifterOpValue(const MCInst &MI, unsigned OpIdx, + SmallVectorImpl &Fixups, + const MCSubtargetInfo &STI) const { + const MCOperand &MO = MI.getOperand(OpIdx); + assert(MO.isImm() && "Expected an immediate value for the shift amount!"); + + switch (MO.getImm()) { + default: + break; + case 0: + return 0; + case 8: + return 1; + case 16: + return 2; + case 24: + return 3; + } + + assert(false && "Invalid value for vector shift amount!"); + return 0; +} + +uint32_t +AArch64MCCodeEmitter::getSIMDShift64OpValue(const MCInst &MI, unsigned OpIdx, + SmallVectorImpl &Fixups, + const MCSubtargetInfo &STI) const { + const MCOperand &MO = MI.getOperand(OpIdx); + assert(MO.isImm() && "Expected an immediate value for the shift amount!"); + return 64 - (MO.getImm()); +} + +uint32_t AArch64MCCodeEmitter::getSIMDShift64_32OpValue( + const MCInst &MI, unsigned OpIdx, SmallVectorImpl &Fixups, + const MCSubtargetInfo &STI) const { + const MCOperand &MO = MI.getOperand(OpIdx); + assert(MO.isImm() && "Expected an immediate value for the shift amount!"); + return 64 - (MO.getImm() | 32); +} + +uint32_t +AArch64MCCodeEmitter::getSIMDShift32OpValue(const MCInst &MI, unsigned OpIdx, + SmallVectorImpl &Fixups, + const MCSubtargetInfo &STI) const { + const MCOperand &MO = MI.getOperand(OpIdx); + assert(MO.isImm() && "Expected an immediate value for the shift amount!"); + return 32 - (MO.getImm() | 16); +} + +uint32_t +AArch64MCCodeEmitter::getSIMDShift16OpValue(const MCInst &MI, unsigned OpIdx, + SmallVectorImpl &Fixups, + const MCSubtargetInfo &STI) const { + const MCOperand &MO = MI.getOperand(OpIdx); + assert(MO.isImm() && "Expected an immediate value for the shift amount!"); + return 16 - (MO.getImm() | 8); +} + +/// getFixedPointScaleOpValue - Return the encoded value for the +// FP-to-fixed-point scale factor. 
+uint32_t AArch64MCCodeEmitter::getFixedPointScaleOpValue( + const MCInst &MI, unsigned OpIdx, SmallVectorImpl<MCFixup> &Fixups, + const MCSubtargetInfo &STI) const { + const MCOperand &MO = MI.getOperand(OpIdx); + assert(MO.isImm() && "Expected an immediate value for the scale amount!"); + return 64 - MO.getImm(); +} + +uint32_t +AArch64MCCodeEmitter::getVecShiftR64OpValue(const MCInst &MI, unsigned OpIdx, + SmallVectorImpl<MCFixup> &Fixups, + const MCSubtargetInfo &STI) const { + const MCOperand &MO = MI.getOperand(OpIdx); + assert(MO.isImm() && "Expected an immediate value for the scale amount!"); + return 64 - MO.getImm(); +} + +uint32_t +AArch64MCCodeEmitter::getVecShiftR32OpValue(const MCInst &MI, unsigned OpIdx, + SmallVectorImpl<MCFixup> &Fixups, + const MCSubtargetInfo &STI) const { + const MCOperand &MO = MI.getOperand(OpIdx); + assert(MO.isImm() && "Expected an immediate value for the scale amount!"); + return 32 - MO.getImm(); +} + +uint32_t +AArch64MCCodeEmitter::getVecShiftR16OpValue(const MCInst &MI, unsigned OpIdx, + SmallVectorImpl<MCFixup> &Fixups, + const MCSubtargetInfo &STI) const { + const MCOperand &MO = MI.getOperand(OpIdx); + assert(MO.isImm() && "Expected an immediate value for the scale amount!"); + return 16 - MO.getImm(); +} + +uint32_t +AArch64MCCodeEmitter::getVecShiftR8OpValue(const MCInst &MI, unsigned OpIdx, + SmallVectorImpl<MCFixup> &Fixups, + const MCSubtargetInfo &STI) const { + const MCOperand &MO = MI.getOperand(OpIdx); + assert(MO.isImm() && "Expected an immediate value for the scale amount!"); + return 8 - MO.getImm(); +} + +uint32_t +AArch64MCCodeEmitter::getVecShiftL64OpValue(const MCInst &MI, unsigned OpIdx, + SmallVectorImpl<MCFixup> &Fixups, + const MCSubtargetInfo &STI) const { + const MCOperand &MO = MI.getOperand(OpIdx); + assert(MO.isImm() && "Expected an immediate value for the scale amount!"); + return MO.getImm() - 64; +} + +uint32_t +AArch64MCCodeEmitter::getVecShiftL32OpValue(const MCInst &MI, unsigned OpIdx, + SmallVectorImpl<MCFixup> &Fixups, + const MCSubtargetInfo &STI) const { + const MCOperand &MO = MI.getOperand(OpIdx); + assert(MO.isImm() && "Expected an immediate value for the scale amount!"); + return MO.getImm() - 32; +} + +uint32_t +AArch64MCCodeEmitter::getVecShiftL16OpValue(const MCInst &MI, unsigned OpIdx, + SmallVectorImpl<MCFixup> &Fixups, + const MCSubtargetInfo &STI) const { + const MCOperand &MO = MI.getOperand(OpIdx); + assert(MO.isImm() && "Expected an immediate value for the scale amount!"); + return MO.getImm() - 16; +} + +uint32_t +AArch64MCCodeEmitter::getVecShiftL8OpValue(const MCInst &MI, unsigned OpIdx, + SmallVectorImpl<MCFixup> &Fixups, + const MCSubtargetInfo &STI) const { + const MCOperand &MO = MI.getOperand(OpIdx); + assert(MO.isImm() && "Expected an immediate value for the scale amount!"); + return MO.getImm() - 8; +} + +/// getMoveVecShifterOpValue - Return the encoded value for the vector move +/// shifter (MSL). +uint32_t AArch64MCCodeEmitter::getMoveVecShifterOpValue( + const MCInst &MI, unsigned OpIdx, SmallVectorImpl<MCFixup> &Fixups, + const MCSubtargetInfo &STI) const { + const MCOperand &MO = MI.getOperand(OpIdx); + assert(MO.isImm() && + "Expected an immediate value for the move shift amount!"); + unsigned ShiftVal = AArch64_AM::getShiftValue(MO.getImm()); + assert((ShiftVal == 8 || ShiftVal == 16) && "Invalid shift amount!"); + return ShiftVal == 8 ?
0 : 1; +} + +unsigned AArch64MCCodeEmitter::fixMOVZ(const MCInst &MI, unsigned EncodedValue, + const MCSubtargetInfo &STI) const { + // If one of the signed fixup kinds is applied to a MOVZ instruction, the + // eventual result could be either a MOVZ or a MOVN. It's the MCCodeEmitter's + // job to ensure that any bits possibly affected by this are 0. This means we + // must zero out bit 30 (essentially emitting a MOVN). + MCOperand UImm16MO = MI.getOperand(1); + + // Nothing to do if there's no fixup. + if (UImm16MO.isImm()) + return EncodedValue; + + const AArch64MCExpr *A64E = cast(UImm16MO.getExpr()); + switch (A64E->getKind()) { + case AArch64MCExpr::VK_DTPREL_G2: + case AArch64MCExpr::VK_DTPREL_G1: + case AArch64MCExpr::VK_DTPREL_G0: + case AArch64MCExpr::VK_GOTTPREL_G1: + case AArch64MCExpr::VK_TPREL_G2: + case AArch64MCExpr::VK_TPREL_G1: + case AArch64MCExpr::VK_TPREL_G0: + return EncodedValue & ~(1u << 30); + default: + // Nothing to do for an unsigned fixup. + return EncodedValue; + } + + + return EncodedValue & ~(1u << 30); +} + +void AArch64MCCodeEmitter::EncodeInstruction(const MCInst &MI, raw_ostream &OS, + SmallVectorImpl &Fixups, + const MCSubtargetInfo &STI) const { + if (MI.getOpcode() == AArch64::TLSDESCCALL) { + // This is a directive which applies an R_AARCH64_TLSDESC_CALL to the + // following (BLR) instruction. It doesn't emit any code itself so it + // doesn't go through the normal TableGenerated channels. + MCFixupKind Fixup = MCFixupKind(AArch64::fixup_aarch64_tlsdesc_call); + Fixups.push_back(MCFixup::Create(0, MI.getOperand(0).getExpr(), Fixup)); + return; + } + + uint64_t Binary = getBinaryCodeForInstr(MI, Fixups, STI); + EmitConstant(Binary, 4, OS); + ++MCNumEmitted; // Keep track of the # of mi's emitted. +} + +unsigned +AArch64MCCodeEmitter::fixMulHigh(const MCInst &MI, + unsigned EncodedValue, + const MCSubtargetInfo &STI) const { + // The Ra field of SMULH and UMULH is unused: it should be assembled as 31 + // (i.e. all bits 1) but is ignored by the processor. + EncodedValue |= 0x1f << 10; + return EncodedValue; +} + +template unsigned +AArch64MCCodeEmitter::fixLoadStoreExclusive(const MCInst &MI, + unsigned EncodedValue, + const MCSubtargetInfo &STI) const { + if (!hasRs) EncodedValue |= 0x001F0000; + if (!hasRt2) EncodedValue |= 0x00007C00; + + return EncodedValue; +} + +unsigned AArch64MCCodeEmitter::fixOneOperandFPComparison( + const MCInst &MI, unsigned EncodedValue, const MCSubtargetInfo &STI) const { + // The Rm field of FCMP and friends is unused - it should be assembled + // as 0, but is ignored by the processor. + EncodedValue &= ~(0x1f << 16); + return EncodedValue; +} + +#include "AArch64GenMCCodeEmitter.inc" diff --git a/lib/Target/AArch64/MCTargetDesc/AArch64MCExpr.cpp b/lib/Target/AArch64/MCTargetDesc/AArch64MCExpr.cpp new file mode 100644 index 00000000000..85c3ec7a55f --- /dev/null +++ b/lib/Target/AArch64/MCTargetDesc/AArch64MCExpr.cpp @@ -0,0 +1,174 @@ +//===-- AArch64MCExpr.cpp - AArch64 specific MC expression classes --------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file contains the implementation of the assembly expression modifiers +// accepted by the AArch64 architecture (e.g. ":lo12:", ":gottprel_g1:", ...). 
+// +//===----------------------------------------------------------------------===// + +#include "AArch64MCExpr.h" +#include "llvm/MC/MCAssembler.h" +#include "llvm/MC/MCContext.h" +#include "llvm/MC/MCELF.h" +#include "llvm/MC/MCSymbol.h" +#include "llvm/MC/MCValue.h" +#include "llvm/Object/ELF.h" +#include "llvm/Support/ErrorHandling.h" + +using namespace llvm; + +#define DEBUG_TYPE "aarch64symbolrefexpr" + +const AArch64MCExpr *AArch64MCExpr::Create(const MCExpr *Expr, VariantKind Kind, + MCContext &Ctx) { + return new (Ctx) AArch64MCExpr(Expr, Kind); +} + +StringRef AArch64MCExpr::getVariantKindName() const { + switch (static_cast(getKind())) { + case VK_CALL: return ""; + case VK_LO12: return ":lo12:"; + case VK_ABS_G3: return ":abs_g3:"; + case VK_ABS_G2: return ":abs_g2:"; + case VK_ABS_G2_S: return ":abs_g2_s:"; + case VK_ABS_G2_NC: return ":abs_g2_nc:"; + case VK_ABS_G1: return ":abs_g1:"; + case VK_ABS_G1_S: return ":abs_g1_s:"; + case VK_ABS_G1_NC: return ":abs_g1_nc:"; + case VK_ABS_G0: return ":abs_g0:"; + case VK_ABS_G0_S: return ":abs_g0_s:"; + case VK_ABS_G0_NC: return ":abs_g0_nc:"; + case VK_DTPREL_G2: return ":dtprel_g2:"; + case VK_DTPREL_G1: return ":dtprel_g1:"; + case VK_DTPREL_G1_NC: return ":dtprel_g1_nc:"; + case VK_DTPREL_G0: return ":dtprel_g0:"; + case VK_DTPREL_G0_NC: return ":dtprel_g0_nc:"; + case VK_DTPREL_HI12: return ":dtprel_hi12:"; + case VK_DTPREL_LO12: return ":dtprel_lo12:"; + case VK_DTPREL_LO12_NC: return ":dtprel_lo12_nc:"; + case VK_TPREL_G2: return ":tprel_g2:"; + case VK_TPREL_G1: return ":tprel_g1:"; + case VK_TPREL_G1_NC: return ":tprel_g1_nc:"; + case VK_TPREL_G0: return ":tprel_g0:"; + case VK_TPREL_G0_NC: return ":tprel_g0_nc:"; + case VK_TPREL_HI12: return ":tprel_hi12:"; + case VK_TPREL_LO12: return ":tprel_lo12:"; + case VK_TPREL_LO12_NC: return ":tprel_lo12_nc:"; + case VK_TLSDESC_LO12: return ":tlsdesc_lo12:"; + case VK_ABS_PAGE: return ""; + case VK_GOT_PAGE: return ":got:"; + case VK_GOT_LO12: return ":got_lo12:"; + case VK_GOTTPREL_PAGE: return ":gottprel:"; + case VK_GOTTPREL_LO12_NC: return ":gottprel_lo12:"; + case VK_GOTTPREL_G1: return ":gottprel_g1:"; + case VK_GOTTPREL_G0_NC: return ":gottprel_g0_nc:"; + case VK_TLSDESC: return ""; + case VK_TLSDESC_PAGE: return ":tlsdesc:"; + default: + llvm_unreachable("Invalid ELF symbol kind"); + } +} + +void AArch64MCExpr::PrintImpl(raw_ostream &OS) const { + if (getKind() != VK_NONE) + OS << getVariantKindName(); + OS << *Expr; +} + +// FIXME: This basically copies MCObjectStreamer::AddValueSymbols. Perhaps +// that method should be made public? +// FIXME: really do above: now that two backends are using it. 
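// Usage sketch (hypothetical symbol "var"; assumes an MCContext Ctx and the
// usual MC headers are in scope):
//   const MCExpr *Ref =
//       MCSymbolRefExpr::Create(Ctx.GetOrCreateSymbol("var"), Ctx);
//   const AArch64MCExpr *Lo12 =
//       AArch64MCExpr::Create(Ref, AArch64MCExpr::VK_LO12, Ctx);
// Printing Lo12 yields ":lo12:var": getVariantKindName() above supplies the
// ":lo12:" prefix and PrintImpl() then prints the wrapped expression.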
+static void AddValueSymbolsImpl(const MCExpr *Value, MCAssembler *Asm) { + switch (Value->getKind()) { + case MCExpr::Target: + llvm_unreachable("Can't handle nested target expr!"); + break; + + case MCExpr::Constant: + break; + + case MCExpr::Binary: { + const MCBinaryExpr *BE = cast(Value); + AddValueSymbolsImpl(BE->getLHS(), Asm); + AddValueSymbolsImpl(BE->getRHS(), Asm); + break; + } + + case MCExpr::SymbolRef: + Asm->getOrCreateSymbolData(cast(Value)->getSymbol()); + break; + + case MCExpr::Unary: + AddValueSymbolsImpl(cast(Value)->getSubExpr(), Asm); + break; + } +} + +void AArch64MCExpr::AddValueSymbols(MCAssembler *Asm) const { + AddValueSymbolsImpl(getSubExpr(), Asm); +} + +const MCSection *AArch64MCExpr::FindAssociatedSection() const { + llvm_unreachable("FIXME: what goes here?"); +} + +bool AArch64MCExpr::EvaluateAsRelocatableImpl(MCValue &Res, + const MCAsmLayout *Layout) const { + if (!getSubExpr()->EvaluateAsRelocatable(Res, Layout)) + return false; + + Res = + MCValue::get(Res.getSymA(), Res.getSymB(), Res.getConstant(), getKind()); + + return true; +} + +static void fixELFSymbolsInTLSFixupsImpl(const MCExpr *Expr, MCAssembler &Asm) { + switch (Expr->getKind()) { + case MCExpr::Target: + llvm_unreachable("Can't handle nested target expression"); + break; + case MCExpr::Constant: + break; + + case MCExpr::Binary: { + const MCBinaryExpr *BE = cast(Expr); + fixELFSymbolsInTLSFixupsImpl(BE->getLHS(), Asm); + fixELFSymbolsInTLSFixupsImpl(BE->getRHS(), Asm); + break; + } + + case MCExpr::SymbolRef: { + // We're known to be under a TLS fixup, so any symbol should be + // modified. There should be only one. + const MCSymbolRefExpr &SymRef = *cast(Expr); + MCSymbolData &SD = Asm.getOrCreateSymbolData(SymRef.getSymbol()); + MCELF::SetType(SD, ELF::STT_TLS); + break; + } + + case MCExpr::Unary: + fixELFSymbolsInTLSFixupsImpl(cast(Expr)->getSubExpr(), Asm); + break; + } +} + +void AArch64MCExpr::fixELFSymbolsInTLSFixups(MCAssembler &Asm) const { + switch (getSymbolLoc(Kind)) { + default: + return; + case VK_DTPREL: + case VK_GOTTPREL: + case VK_TPREL: + case VK_TLSDESC: + break; + } + + fixELFSymbolsInTLSFixupsImpl(getSubExpr(), Asm); +} diff --git a/lib/Target/AArch64/MCTargetDesc/AArch64MCExpr.h b/lib/Target/AArch64/MCTargetDesc/AArch64MCExpr.h new file mode 100644 index 00000000000..e869ed0a26a --- /dev/null +++ b/lib/Target/AArch64/MCTargetDesc/AArch64MCExpr.h @@ -0,0 +1,168 @@ +//=--- AArch64MCExpr.h - AArch64 specific MC expression classes ---*- C++ -*-=// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file describes AArch64-specific MCExprs, used for modifiers like +// ":lo12:" or ":gottprel_g1:". +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_AArch64MCEXPR_H +#define LLVM_AArch64MCEXPR_H + +#include "llvm/MC/MCExpr.h" +#include "llvm/Support/ErrorHandling.h" + +namespace llvm { + +class AArch64MCExpr : public MCTargetExpr { +public: + enum VariantKind { + VK_NONE = 0x000, + + // Symbol locations specifying (roughly speaking) what calculation should be + // performed to construct the final address for the relocated + // symbol. E.g. direct, via the GOT, ... 
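    // Worked example of how the three fields defined in this enum combine:
    //   VK_DTPREL_G1_NC = VK_DTPREL | VK_G1 | VK_NC
    //                   = 0x004 | 0x050 | 0x100 = 0x154
    // and the extractors declared further down recover each piece:
    //   getSymbolLoc(VK_DTPREL_G1_NC)   == VK_DTPREL  (mask 0x00f)
    //   getAddressFrag(VK_DTPREL_G1_NC) == VK_G1      (mask 0x0f0)
    //   isNotChecked(VK_DTPREL_G1_NC)   == true       (bit  0x100)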
+ VK_ABS = 0x001, + VK_SABS = 0x002, + VK_GOT = 0x003, + VK_DTPREL = 0x004, + VK_GOTTPREL = 0x005, + VK_TPREL = 0x006, + VK_TLSDESC = 0x007, + VK_SymLocBits = 0x00f, + + // Variants specifying which part of the final address calculation is + // used. E.g. the low 12 bits for an ADD/LDR, the middle 16 bits for a + // MOVZ/MOVK. + VK_PAGE = 0x010, + VK_PAGEOFF = 0x020, + VK_HI12 = 0x030, + VK_G0 = 0x040, + VK_G1 = 0x050, + VK_G2 = 0x060, + VK_G3 = 0x070, + VK_AddressFragBits = 0x0f0, + + // Whether the final relocation is a checked one (where a linker should + // perform a range-check on the final address) or not. Note that this field + // is unfortunately sometimes omitted from the assembly syntax. E.g. :lo12: + // on its own is a non-checked relocation. We side with ELF on being + // explicit about this! + VK_NC = 0x100, + + // Convenience definitions for referring to specific textual representations + // of relocation specifiers. Note that this means the "_NC" is sometimes + // omitted in line with assembly syntax here (VK_LO12 rather than VK_LO12_NC + // since a user would write ":lo12:"). + VK_CALL = VK_ABS, + VK_ABS_PAGE = VK_ABS | VK_PAGE, + VK_ABS_G3 = VK_ABS | VK_G3, + VK_ABS_G2 = VK_ABS | VK_G2, + VK_ABS_G2_S = VK_SABS | VK_G2, + VK_ABS_G2_NC = VK_ABS | VK_G2 | VK_NC, + VK_ABS_G1 = VK_ABS | VK_G1, + VK_ABS_G1_S = VK_SABS | VK_G1, + VK_ABS_G1_NC = VK_ABS | VK_G1 | VK_NC, + VK_ABS_G0 = VK_ABS | VK_G0, + VK_ABS_G0_S = VK_SABS | VK_G0, + VK_ABS_G0_NC = VK_ABS | VK_G0 | VK_NC, + VK_LO12 = VK_ABS | VK_PAGEOFF | VK_NC, + VK_GOT_LO12 = VK_GOT | VK_PAGEOFF | VK_NC, + VK_GOT_PAGE = VK_GOT | VK_PAGE, + VK_DTPREL_G2 = VK_DTPREL | VK_G2, + VK_DTPREL_G1 = VK_DTPREL | VK_G1, + VK_DTPREL_G1_NC = VK_DTPREL | VK_G1 | VK_NC, + VK_DTPREL_G0 = VK_DTPREL | VK_G0, + VK_DTPREL_G0_NC = VK_DTPREL | VK_G0 | VK_NC, + VK_DTPREL_HI12 = VK_DTPREL | VK_HI12, + VK_DTPREL_LO12 = VK_DTPREL | VK_PAGEOFF, + VK_DTPREL_LO12_NC = VK_DTPREL | VK_PAGEOFF | VK_NC, + VK_GOTTPREL_PAGE = VK_GOTTPREL | VK_PAGE, + VK_GOTTPREL_LO12_NC = VK_GOTTPREL | VK_PAGEOFF | VK_NC, + VK_GOTTPREL_G1 = VK_GOTTPREL | VK_G1, + VK_GOTTPREL_G0_NC = VK_GOTTPREL | VK_G0 | VK_NC, + VK_TPREL_G2 = VK_TPREL | VK_G2, + VK_TPREL_G1 = VK_TPREL | VK_G1, + VK_TPREL_G1_NC = VK_TPREL | VK_G1 | VK_NC, + VK_TPREL_G0 = VK_TPREL | VK_G0, + VK_TPREL_G0_NC = VK_TPREL | VK_G0 | VK_NC, + VK_TPREL_HI12 = VK_TPREL | VK_HI12, + VK_TPREL_LO12 = VK_TPREL | VK_PAGEOFF, + VK_TPREL_LO12_NC = VK_TPREL | VK_PAGEOFF | VK_NC, + VK_TLSDESC_LO12 = VK_TLSDESC | VK_PAGEOFF | VK_NC, + VK_TLSDESC_PAGE = VK_TLSDESC | VK_PAGE, + + VK_INVALID = 0xfff + }; + +private: + const MCExpr *Expr; + const VariantKind Kind; + + explicit AArch64MCExpr(const MCExpr *Expr, VariantKind Kind) + : Expr(Expr), Kind(Kind) {} + +public: + /// @name Construction + /// @{ + + static const AArch64MCExpr *Create(const MCExpr *Expr, VariantKind Kind, + MCContext &Ctx); + + /// @} + /// @name Accessors + /// @{ + + /// Get the kind of this expression. + VariantKind getKind() const { return static_cast(Kind); } + + /// Get the expression this modifier applies to. + const MCExpr *getSubExpr() const { return Expr; } + + /// @} + /// @name VariantKind information extractors. 
+ /// @{ + + static VariantKind getSymbolLoc(VariantKind Kind) { + return static_cast(Kind & VK_SymLocBits); + } + + static VariantKind getAddressFrag(VariantKind Kind) { + return static_cast(Kind & VK_AddressFragBits); + } + + static bool isNotChecked(VariantKind Kind) { return Kind & VK_NC; } + + /// @} + + /// Convert the variant kind into an ELF-appropriate modifier + /// (e.g. ":got:", ":lo12:"). + StringRef getVariantKindName() const; + + void PrintImpl(raw_ostream &OS) const override; + + void AddValueSymbols(MCAssembler *) const override; + + const MCSection *FindAssociatedSection() const override; + + bool EvaluateAsRelocatableImpl(MCValue &Res, + const MCAsmLayout *Layout) const override; + + void fixELFSymbolsInTLSFixups(MCAssembler &Asm) const override; + + static bool classof(const MCExpr *E) { + return E->getKind() == MCExpr::Target; + } + + static bool classof(const AArch64MCExpr *) { return true; } + +}; +} // end namespace llvm + +#endif diff --git a/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.cpp b/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.cpp new file mode 100644 index 00000000000..ae698c59f6c --- /dev/null +++ b/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.cpp @@ -0,0 +1,225 @@ +//===-- AArch64MCTargetDesc.cpp - AArch64 Target Descriptions ---*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file provides AArch64 specific target descriptions. +// +//===----------------------------------------------------------------------===// + +#include "AArch64MCTargetDesc.h" +#include "AArch64ELFStreamer.h" +#include "AArch64MCAsmInfo.h" +#include "InstPrinter/AArch64InstPrinter.h" +#include "llvm/MC/MCCodeGenInfo.h" +#include "llvm/MC/MCInstrInfo.h" +#include "llvm/MC/MCRegisterInfo.h" +#include "llvm/MC/MCStreamer.h" +#include "llvm/MC/MCSubtargetInfo.h" +#include "llvm/Support/ErrorHandling.h" +#include "llvm/Support/TargetRegistry.h" + +using namespace llvm; + +#define GET_INSTRINFO_MC_DESC +#include "AArch64GenInstrInfo.inc" + +#define GET_SUBTARGETINFO_MC_DESC +#include "AArch64GenSubtargetInfo.inc" + +#define GET_REGINFO_MC_DESC +#include "AArch64GenRegisterInfo.inc" + +static MCInstrInfo *createAArch64MCInstrInfo() { + MCInstrInfo *X = new MCInstrInfo(); + InitAArch64MCInstrInfo(X); + return X; +} + +static MCSubtargetInfo * +createAArch64MCSubtargetInfo(StringRef TT, StringRef CPU, StringRef FS) { + MCSubtargetInfo *X = new MCSubtargetInfo(); + + if (CPU.empty()) + CPU = "generic"; + + InitAArch64MCSubtargetInfo(X, TT, CPU, FS); + return X; +} + +static MCRegisterInfo *createAArch64MCRegisterInfo(StringRef Triple) { + MCRegisterInfo *X = new MCRegisterInfo(); + InitAArch64MCRegisterInfo(X, AArch64::LR); + return X; +} + +static MCAsmInfo *createAArch64MCAsmInfo(const MCRegisterInfo &MRI, + StringRef TT) { + Triple TheTriple(TT); + + MCAsmInfo *MAI; + if (TheTriple.isOSDarwin()) + MAI = new AArch64MCAsmInfoDarwin(); + else { + assert(TheTriple.isOSBinFormatELF() && "Only expect Darwin or ELF"); + MAI = new AArch64MCAsmInfoELF(TT); + } + + // Initial state of the frame pointer is SP. 
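  // (In CFI terms this seeds the rule "CFA = SP + 0"; prologue CFI emitted
  // during codegen overrides it once a frame is set up. AArch64::SP is mapped
  // to its DWARF register number below so the rule survives into
  // .eh_frame / .debug_frame.)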
+ unsigned Reg = MRI.getDwarfRegNum(AArch64::SP, true); + MCCFIInstruction Inst = MCCFIInstruction::createDefCfa(nullptr, Reg, 0); + MAI->addInitialFrameState(Inst); + + return MAI; +} + +static MCCodeGenInfo *createAArch64MCCodeGenInfo(StringRef TT, Reloc::Model RM, + CodeModel::Model CM, + CodeGenOpt::Level OL) { + Triple TheTriple(TT); + assert((TheTriple.isOSBinFormatELF() || TheTriple.isOSBinFormatMachO()) && + "Only expect Darwin and ELF targets"); + + if (CM == CodeModel::Default) + CM = CodeModel::Small; + // The default MCJIT memory managers make no guarantees about where they can + // find an executable page; JITed code needs to be able to refer to globals + // no matter how far away they are. + else if (CM == CodeModel::JITDefault) + CM = CodeModel::Large; + else if (CM != CodeModel::Small && CM != CodeModel::Large) + report_fatal_error( + "Only small and large code models are allowed on AArch64"); + + // AArch64 Darwin is always PIC. + if (TheTriple.isOSDarwin()) + RM = Reloc::PIC_; + // On ELF platforms the default static relocation model has a smart enough + // linker to cope with referencing external symbols defined in a shared + // library. Hence DynamicNoPIC doesn't need to be promoted to PIC. + else if (RM == Reloc::Default || RM == Reloc::DynamicNoPIC) + RM = Reloc::Static; + + MCCodeGenInfo *X = new MCCodeGenInfo(); + X->InitMCCodeGenInfo(RM, CM, OL); + return X; +} + +static MCInstPrinter *createAArch64MCInstPrinter(const Target &T, + unsigned SyntaxVariant, + const MCAsmInfo &MAI, + const MCInstrInfo &MII, + const MCRegisterInfo &MRI, + const MCSubtargetInfo &STI) { + if (SyntaxVariant == 0) + return new AArch64InstPrinter(MAI, MII, MRI, STI); + if (SyntaxVariant == 1) + return new AArch64AppleInstPrinter(MAI, MII, MRI, STI); + + return nullptr; +} + +static MCStreamer *createMCStreamer(const Target &T, StringRef TT, + MCContext &Ctx, MCAsmBackend &TAB, + raw_ostream &OS, MCCodeEmitter *Emitter, + const MCSubtargetInfo &STI, bool RelaxAll, + bool NoExecStack) { + Triple TheTriple(TT); + + if (TheTriple.isOSDarwin()) + return createMachOStreamer(Ctx, TAB, OS, Emitter, RelaxAll, + /*LabelSections*/ true); + + return createAArch64ELFStreamer(Ctx, TAB, OS, Emitter, RelaxAll, NoExecStack); +} + +// Force static initialization. +extern "C" void LLVMInitializeAArch64TargetMC() { + // Register the MC asm info. + RegisterMCAsmInfoFn X(TheAArch64leTarget, createAArch64MCAsmInfo); + RegisterMCAsmInfoFn Y(TheAArch64beTarget, createAArch64MCAsmInfo); + RegisterMCAsmInfoFn Z(TheARM64leTarget, createAArch64MCAsmInfo); + RegisterMCAsmInfoFn W(TheARM64beTarget, createAArch64MCAsmInfo); + + // Register the MC codegen info. + TargetRegistry::RegisterMCCodeGenInfo(TheAArch64leTarget, + createAArch64MCCodeGenInfo); + TargetRegistry::RegisterMCCodeGenInfo(TheAArch64beTarget, + createAArch64MCCodeGenInfo); + TargetRegistry::RegisterMCCodeGenInfo(TheARM64leTarget, + createAArch64MCCodeGenInfo); + TargetRegistry::RegisterMCCodeGenInfo(TheARM64beTarget, + createAArch64MCCodeGenInfo); + + // Register the MC instruction info. + TargetRegistry::RegisterMCInstrInfo(TheAArch64leTarget, + createAArch64MCInstrInfo); + TargetRegistry::RegisterMCInstrInfo(TheAArch64beTarget, + createAArch64MCInstrInfo); + TargetRegistry::RegisterMCInstrInfo(TheARM64leTarget, + createAArch64MCInstrInfo); + TargetRegistry::RegisterMCInstrInfo(TheARM64beTarget, + createAArch64MCInstrInfo); + + // Register the MC register info. 
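  // Consumer-side sketch (standalone MC client; error handling elided; the
  // triple string is illustrative): once these registrations have run, a tool
  // can do
  //   std::string Err;
  //   const Target *T = TargetRegistry::lookupTarget("aarch64-linux-gnu", Err);
  //   MCRegisterInfo *MRI = T->createMCRegInfo("aarch64-linux-gnu");
  //   MCAsmInfo *MAI = T->createMCAsmInfo(*MRI, "aarch64-linux-gnu");
  // and both the "aarch64"/"aarch64_be" and the legacy "arm64"/"arm64_be"
  // target entries resolve to these same AArch64 MC components.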
+ TargetRegistry::RegisterMCRegInfo(TheAArch64leTarget, + createAArch64MCRegisterInfo); + TargetRegistry::RegisterMCRegInfo(TheAArch64beTarget, + createAArch64MCRegisterInfo); + TargetRegistry::RegisterMCRegInfo(TheARM64leTarget, + createAArch64MCRegisterInfo); + TargetRegistry::RegisterMCRegInfo(TheARM64beTarget, + createAArch64MCRegisterInfo); + + // Register the MC subtarget info. + TargetRegistry::RegisterMCSubtargetInfo(TheAArch64leTarget, + createAArch64MCSubtargetInfo); + TargetRegistry::RegisterMCSubtargetInfo(TheAArch64beTarget, + createAArch64MCSubtargetInfo); + TargetRegistry::RegisterMCSubtargetInfo(TheARM64leTarget, + createAArch64MCSubtargetInfo); + TargetRegistry::RegisterMCSubtargetInfo(TheARM64beTarget, + createAArch64MCSubtargetInfo); + + // Register the asm backend. + TargetRegistry::RegisterMCAsmBackend(TheAArch64leTarget, + createAArch64leAsmBackend); + TargetRegistry::RegisterMCAsmBackend(TheAArch64beTarget, + createAArch64beAsmBackend); + TargetRegistry::RegisterMCAsmBackend(TheARM64leTarget, + createAArch64leAsmBackend); + TargetRegistry::RegisterMCAsmBackend(TheARM64beTarget, + createAArch64beAsmBackend); + + // Register the MC Code Emitter + TargetRegistry::RegisterMCCodeEmitter(TheAArch64leTarget, + createAArch64MCCodeEmitter); + TargetRegistry::RegisterMCCodeEmitter(TheAArch64beTarget, + createAArch64MCCodeEmitter); + TargetRegistry::RegisterMCCodeEmitter(TheARM64leTarget, + createAArch64MCCodeEmitter); + TargetRegistry::RegisterMCCodeEmitter(TheARM64beTarget, + createAArch64MCCodeEmitter); + + // Register the object streamer. + TargetRegistry::RegisterMCObjectStreamer(TheAArch64leTarget, + createMCStreamer); + TargetRegistry::RegisterMCObjectStreamer(TheAArch64beTarget, + createMCStreamer); + TargetRegistry::RegisterMCObjectStreamer(TheARM64leTarget, createMCStreamer); + TargetRegistry::RegisterMCObjectStreamer(TheARM64beTarget, createMCStreamer); + + // Register the MCInstPrinter. + TargetRegistry::RegisterMCInstPrinter(TheAArch64leTarget, + createAArch64MCInstPrinter); + TargetRegistry::RegisterMCInstPrinter(TheAArch64beTarget, + createAArch64MCInstPrinter); + TargetRegistry::RegisterMCInstPrinter(TheARM64leTarget, + createAArch64MCInstPrinter); + TargetRegistry::RegisterMCInstPrinter(TheARM64beTarget, + createAArch64MCInstPrinter); +} diff --git a/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.h b/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.h new file mode 100644 index 00000000000..d886ea23c13 --- /dev/null +++ b/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.h @@ -0,0 +1,70 @@ +//===-- AArch64MCTargetDesc.h - AArch64 Target Descriptions -----*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file provides AArch64 specific target descriptions. 
+// +//===----------------------------------------------------------------------===// + +#ifndef AArch64MCTARGETDESC_H +#define AArch64MCTARGETDESC_H + +#include "llvm/Support/DataTypes.h" +#include + +namespace llvm { +class MCAsmBackend; +class MCCodeEmitter; +class MCContext; +class MCInstrInfo; +class MCRegisterInfo; +class MCObjectWriter; +class MCSubtargetInfo; +class StringRef; +class Target; +class raw_ostream; + +extern Target TheAArch64leTarget; +extern Target TheAArch64beTarget; +extern Target TheARM64leTarget; +extern Target TheARM64beTarget; + +MCCodeEmitter *createAArch64MCCodeEmitter(const MCInstrInfo &MCII, + const MCRegisterInfo &MRI, + const MCSubtargetInfo &STI, + MCContext &Ctx); +MCAsmBackend *createAArch64leAsmBackend(const Target &T, + const MCRegisterInfo &MRI, StringRef TT, + StringRef CPU); +MCAsmBackend *createAArch64beAsmBackend(const Target &T, + const MCRegisterInfo &MRI, StringRef TT, + StringRef CPU); + +MCObjectWriter *createAArch64ELFObjectWriter(raw_ostream &OS, uint8_t OSABI, + bool IsLittleEndian); + +MCObjectWriter *createAArch64MachObjectWriter(raw_ostream &OS, uint32_t CPUType, + uint32_t CPUSubtype); + +} // End llvm namespace + +// Defines symbolic names for AArch64 registers. This defines a mapping from +// register name to register number. +// +#define GET_REGINFO_ENUM +#include "AArch64GenRegisterInfo.inc" + +// Defines symbolic names for the AArch64 instructions. +// +#define GET_INSTRINFO_ENUM +#include "AArch64GenInstrInfo.inc" + +#define GET_SUBTARGETINFO_ENUM +#include "AArch64GenSubtargetInfo.inc" + +#endif diff --git a/lib/Target/AArch64/MCTargetDesc/AArch64MachObjectWriter.cpp b/lib/Target/AArch64/MCTargetDesc/AArch64MachObjectWriter.cpp new file mode 100644 index 00000000000..5c86189a6ef --- /dev/null +++ b/lib/Target/AArch64/MCTargetDesc/AArch64MachObjectWriter.cpp @@ -0,0 +1,396 @@ +//===-- AArch64MachObjectWriter.cpp - ARM Mach Object Writer --------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+// +//===----------------------------------------------------------------------===// + +#include "MCTargetDesc/AArch64FixupKinds.h" +#include "MCTargetDesc/AArch64MCTargetDesc.h" +#include "llvm/MC/MCAssembler.h" +#include "llvm/MC/MCAsmLayout.h" +#include "llvm/MC/MCContext.h" +#include "llvm/MC/MCExpr.h" +#include "llvm/MC/MCFixup.h" +#include "llvm/MC/MCMachObjectWriter.h" +#include "llvm/MC/MCSectionMachO.h" +#include "llvm/MC/MCValue.h" +#include "llvm/ADT/Twine.h" +#include "llvm/Support/ErrorHandling.h" +#include "llvm/Support/MachO.h" +using namespace llvm; + +namespace { +class AArch64MachObjectWriter : public MCMachObjectTargetWriter { + bool getAArch64FixupKindMachOInfo(const MCFixup &Fixup, unsigned &RelocType, + const MCSymbolRefExpr *Sym, + unsigned &Log2Size, const MCAssembler &Asm); + +public: + AArch64MachObjectWriter(uint32_t CPUType, uint32_t CPUSubtype) + : MCMachObjectTargetWriter(true /* is64Bit */, CPUType, CPUSubtype, + /*UseAggressiveSymbolFolding=*/true) {} + + void RecordRelocation(MachObjectWriter *Writer, const MCAssembler &Asm, + const MCAsmLayout &Layout, const MCFragment *Fragment, + const MCFixup &Fixup, MCValue Target, + uint64_t &FixedValue) override; +}; +} + +bool AArch64MachObjectWriter::getAArch64FixupKindMachOInfo( + const MCFixup &Fixup, unsigned &RelocType, const MCSymbolRefExpr *Sym, + unsigned &Log2Size, const MCAssembler &Asm) { + RelocType = unsigned(MachO::ARM64_RELOC_UNSIGNED); + Log2Size = ~0U; + + switch ((unsigned)Fixup.getKind()) { + default: + return false; + + case FK_Data_1: + Log2Size = llvm::Log2_32(1); + return true; + case FK_Data_2: + Log2Size = llvm::Log2_32(2); + return true; + case FK_Data_4: + Log2Size = llvm::Log2_32(4); + if (Sym->getKind() == MCSymbolRefExpr::VK_GOT) + RelocType = unsigned(MachO::ARM64_RELOC_POINTER_TO_GOT); + return true; + case FK_Data_8: + Log2Size = llvm::Log2_32(8); + if (Sym->getKind() == MCSymbolRefExpr::VK_GOT) + RelocType = unsigned(MachO::ARM64_RELOC_POINTER_TO_GOT); + return true; + case AArch64::fixup_aarch64_add_imm12: + case AArch64::fixup_aarch64_ldst_imm12_scale1: + case AArch64::fixup_aarch64_ldst_imm12_scale2: + case AArch64::fixup_aarch64_ldst_imm12_scale4: + case AArch64::fixup_aarch64_ldst_imm12_scale8: + case AArch64::fixup_aarch64_ldst_imm12_scale16: + Log2Size = llvm::Log2_32(4); + switch (Sym->getKind()) { + default: + assert(0 && "Unexpected symbol reference variant kind!"); + case MCSymbolRefExpr::VK_PAGEOFF: + RelocType = unsigned(MachO::ARM64_RELOC_PAGEOFF12); + return true; + case MCSymbolRefExpr::VK_GOTPAGEOFF: + RelocType = unsigned(MachO::ARM64_RELOC_GOT_LOAD_PAGEOFF12); + return true; + case MCSymbolRefExpr::VK_TLVPPAGEOFF: + RelocType = unsigned(MachO::ARM64_RELOC_TLVP_LOAD_PAGEOFF12); + return true; + } + case AArch64::fixup_aarch64_pcrel_adrp_imm21: + Log2Size = llvm::Log2_32(4); + // This encompasses the relocation for the whole 21-bit value. 
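    // For example (Darwin assembly sketch; "_var" is a hypothetical symbol),
    // the three modifiers handled in the switch below map as:
    //   adrp x8, _var@PAGE      -> ARM64_RELOC_PAGE21
    //   adrp x8, _var@GOTPAGE   -> ARM64_RELOC_GOT_LOAD_PAGE21
    //   adrp x8, _var@TLVPPAGE  -> ARM64_RELOC_TLVP_LOAD_PAGE21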
+ switch (Sym->getKind()) { + default: + Asm.getContext().FatalError(Fixup.getLoc(), + "ADR/ADRP relocations must be GOT relative"); + case MCSymbolRefExpr::VK_PAGE: + RelocType = unsigned(MachO::ARM64_RELOC_PAGE21); + return true; + case MCSymbolRefExpr::VK_GOTPAGE: + RelocType = unsigned(MachO::ARM64_RELOC_GOT_LOAD_PAGE21); + return true; + case MCSymbolRefExpr::VK_TLVPPAGE: + RelocType = unsigned(MachO::ARM64_RELOC_TLVP_LOAD_PAGE21); + return true; + } + return true; + case AArch64::fixup_aarch64_pcrel_branch26: + case AArch64::fixup_aarch64_pcrel_call26: + Log2Size = llvm::Log2_32(4); + RelocType = unsigned(MachO::ARM64_RELOC_BRANCH26); + return true; + } +} + +void AArch64MachObjectWriter::RecordRelocation( + MachObjectWriter *Writer, const MCAssembler &Asm, const MCAsmLayout &Layout, + const MCFragment *Fragment, const MCFixup &Fixup, MCValue Target, + uint64_t &FixedValue) { + unsigned IsPCRel = Writer->isFixupKindPCRel(Asm, Fixup.getKind()); + + // See . + uint32_t FixupOffset = Layout.getFragmentOffset(Fragment); + unsigned Log2Size = 0; + int64_t Value = 0; + unsigned Index = 0; + unsigned IsExtern = 0; + unsigned Type = 0; + unsigned Kind = Fixup.getKind(); + + FixupOffset += Fixup.getOffset(); + + // AArch64 pcrel relocation addends do not include the section offset. + if (IsPCRel) + FixedValue += FixupOffset; + + // ADRP fixups use relocations for the whole symbol value and only + // put the addend in the instruction itself. Clear out any value the + // generic code figured out from the sybmol definition. + if (Kind == AArch64::fixup_aarch64_pcrel_adrp_imm21) + FixedValue = 0; + + // imm19 relocations are for conditional branches, which require + // assembler local symbols. If we got here, that's not what we have, + // so complain loudly. + if (Kind == AArch64::fixup_aarch64_pcrel_branch19) { + Asm.getContext().FatalError(Fixup.getLoc(), + "conditional branch requires assembler-local" + " label. '" + + Target.getSymA()->getSymbol().getName() + + "' is external."); + return; + } + + // 14-bit branch relocations should only target internal labels, and so + // should never get here. + if (Kind == AArch64::fixup_aarch64_pcrel_branch14) { + Asm.getContext().FatalError(Fixup.getLoc(), + "Invalid relocation on conditional branch!"); + return; + } + + if (!getAArch64FixupKindMachOInfo(Fixup, Type, Target.getSymA(), Log2Size, + Asm)) { + Asm.getContext().FatalError(Fixup.getLoc(), "unknown AArch64 fixup kind!"); + return; + } + + Value = Target.getConstant(); + + if (Target.isAbsolute()) { // constant + // FIXME: Should this always be extern? + // SymbolNum of 0 indicates the absolute section. + Type = MachO::ARM64_RELOC_UNSIGNED; + Index = 0; + + if (IsPCRel) { + IsExtern = 1; + Asm.getContext().FatalError(Fixup.getLoc(), + "PC relative absolute relocation!"); + + // FIXME: x86_64 sets the type to a branch reloc here. Should we do + // something similar? + } + } else if (Target.getSymB()) { // A - B + constant + const MCSymbol *A = &Target.getSymA()->getSymbol(); + const MCSymbolData &A_SD = Asm.getSymbolData(*A); + const MCSymbolData *A_Base = Asm.getAtom(&A_SD); + + const MCSymbol *B = &Target.getSymB()->getSymbol(); + const MCSymbolData &B_SD = Asm.getSymbolData(*B); + const MCSymbolData *B_Base = Asm.getAtom(&B_SD); + + // Check for "_foo@got - .", which comes through here as: + // Ltmp0: + // ... 
_foo@got - Ltmp0 + if (Target.getSymA()->getKind() == MCSymbolRefExpr::VK_GOT && + Target.getSymB()->getKind() == MCSymbolRefExpr::VK_None && + Layout.getSymbolOffset(&B_SD) == + Layout.getFragmentOffset(Fragment) + Fixup.getOffset()) { + // SymB is the PC, so use a PC-rel pointer-to-GOT relocation. + Index = A_Base->getIndex(); + IsExtern = 1; + Type = MachO::ARM64_RELOC_POINTER_TO_GOT; + IsPCRel = 1; + MachO::any_relocation_info MRE; + MRE.r_word0 = FixupOffset; + MRE.r_word1 = ((Index << 0) | (IsPCRel << 24) | (Log2Size << 25) | + (IsExtern << 27) | (Type << 28)); + Writer->addRelocation(Fragment->getParent(), MRE); + return; + } else if (Target.getSymA()->getKind() != MCSymbolRefExpr::VK_None || + Target.getSymB()->getKind() != MCSymbolRefExpr::VK_None) + // Otherwise, neither symbol can be modified. + Asm.getContext().FatalError(Fixup.getLoc(), + "unsupported relocation of modified symbol"); + + // We don't support PCrel relocations of differences. + if (IsPCRel) + Asm.getContext().FatalError(Fixup.getLoc(), + "unsupported pc-relative relocation of " + "difference"); + + // AArch64 always uses external relocations. If there is no symbol to use as + // a base address (a local symbol with no preceding non-local symbol), + // error out. + // + // FIXME: We should probably just synthesize an external symbol and use + // that. + if (!A_Base) + Asm.getContext().FatalError( + Fixup.getLoc(), + "unsupported relocation of local symbol '" + A->getName() + + "'. Must have non-local symbol earlier in section."); + if (!B_Base) + Asm.getContext().FatalError( + Fixup.getLoc(), + "unsupported relocation of local symbol '" + B->getName() + + "'. Must have non-local symbol earlier in section."); + + if (A_Base == B_Base && A_Base) + Asm.getContext().FatalError(Fixup.getLoc(), + "unsupported relocation with identical base"); + + Value += (!A_SD.getFragment() ? 0 + : Writer->getSymbolAddress(&A_SD, Layout)) - + (!A_Base || !A_Base->getFragment() + ? 0 + : Writer->getSymbolAddress(A_Base, Layout)); + Value -= (!B_SD.getFragment() ? 0 + : Writer->getSymbolAddress(&B_SD, Layout)) - + (!B_Base || !B_Base->getFragment() + ? 0 + : Writer->getSymbolAddress(B_Base, Layout)); + + Index = A_Base->getIndex(); + IsExtern = 1; + Type = MachO::ARM64_RELOC_UNSIGNED; + + MachO::any_relocation_info MRE; + MRE.r_word0 = FixupOffset; + MRE.r_word1 = ((Index << 0) | (IsPCRel << 24) | (Log2Size << 25) | + (IsExtern << 27) | (Type << 28)); + Writer->addRelocation(Fragment->getParent(), MRE); + + Index = B_Base->getIndex(); + IsExtern = 1; + Type = MachO::ARM64_RELOC_SUBTRACTOR; + } else { // A + constant + const MCSymbol *Symbol = &Target.getSymA()->getSymbol(); + const MCSymbolData &SD = Asm.getSymbolData(*Symbol); + const MCSymbolData *Base = Asm.getAtom(&SD); + const MCSectionMachO &Section = static_cast( + Fragment->getParent()->getSection()); + + // If the symbol is a variable and we weren't able to get a Base for it + // (i.e., it's not in the symbol table associated with a section) resolve + // the relocation based its expansion instead. + if (Symbol->isVariable() && !Base) { + // If the evaluation is an absolute value, just use that directly + // to keep things easy. + int64_t Res; + if (SD.getSymbol().getVariableValue()->EvaluateAsAbsolute( + Res, Layout, Writer->getSectionAddressMap())) { + FixedValue = Res; + return; + } + + // FIXME: Will the Target we already have ever have any data in it + // we need to preserve and merge with the new Target? How about + // the FixedValue? 
+ if (!Symbol->getVariableValue()->EvaluateAsRelocatable(Target, &Layout)) + Asm.getContext().FatalError(Fixup.getLoc(), + "unable to resolve variable '" + + Symbol->getName() + "'"); + return RecordRelocation(Writer, Asm, Layout, Fragment, Fixup, Target, + FixedValue); + } + + // Relocations inside debug sections always use local relocations when + // possible. This seems to be done because the debugger doesn't fully + // understand relocation entries and expects to find values that + // have already been fixed up. + if (Symbol->isInSection()) { + if (Section.hasAttribute(MachO::S_ATTR_DEBUG)) + Base = nullptr; + } + + // AArch64 uses external relocations as much as possible. For debug + // sections, and for pointer-sized relocations (.quad), we allow section + // relocations. It's code sections that run into trouble. + if (Base) { + Index = Base->getIndex(); + IsExtern = 1; + + // Add the local offset, if needed. + if (Base != &SD) + Value += Layout.getSymbolOffset(&SD) - Layout.getSymbolOffset(Base); + } else if (Symbol->isInSection()) { + // Pointer-sized relocations can use a local relocation. Otherwise, + // we have to be in a debug info section. + if (!Section.hasAttribute(MachO::S_ATTR_DEBUG) && Log2Size != 3) + Asm.getContext().FatalError( + Fixup.getLoc(), + "unsupported relocation of local symbol '" + Symbol->getName() + + "'. Must have non-local symbol earlier in section."); + // Adjust the relocation to be section-relative. + // The index is the section ordinal (1-based). + const MCSectionData &SymSD = + Asm.getSectionData(SD.getSymbol().getSection()); + Index = SymSD.getOrdinal() + 1; + IsExtern = 0; + Value += Writer->getSymbolAddress(&SD, Layout); + + if (IsPCRel) + Value -= Writer->getFragmentAddress(Fragment, Layout) + + Fixup.getOffset() + (1ULL << Log2Size); + } else { + // Resolve constant variables. + if (SD.getSymbol().isVariable()) { + int64_t Res; + if (SD.getSymbol().getVariableValue()->EvaluateAsAbsolute( + Res, Layout, Writer->getSectionAddressMap())) { + FixedValue = Res; + return; + } + } + Asm.getContext().FatalError(Fixup.getLoc(), + "unsupported relocation of variable '" + + Symbol->getName() + "'"); + } + } + + // If the relocation kind is Branch26, Page21, or Pageoff12, any addend + // is represented via an Addend relocation, not encoded directly into + // the instruction. + if ((Type == MachO::ARM64_RELOC_BRANCH26 || + Type == MachO::ARM64_RELOC_PAGE21 || + Type == MachO::ARM64_RELOC_PAGEOFF12) && + Value) { + assert((Value & 0xff000000) == 0 && "Added relocation out of range!"); + + MachO::any_relocation_info MRE; + MRE.r_word0 = FixupOffset; + MRE.r_word1 = ((Index << 0) | (IsPCRel << 24) | (Log2Size << 25) | + (IsExtern << 27) | (Type << 28)); + Writer->addRelocation(Fragment->getParent(), MRE); + + // Now set up the Addend relocation. + Type = MachO::ARM64_RELOC_ADDEND; + Index = Value; + IsPCRel = 0; + Log2Size = 2; + IsExtern = 0; + + // Put zero into the instruction itself. The addend is in the relocation. + Value = 0; + } + + // If there's any addend left to handle, encode it in the instruction. 
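  // (Worked sketch for a hypothetical "bl _foo+20": the +20 cannot be encoded
  // in the BRANCH26 instruction word, so the block above has already emitted
  // the ARM64_RELOC_BRANCH26 entry and re-purposed Type/Index so that the
  // relocation written just below becomes an ARM64_RELOC_ADDEND carrying 20 in
  // its symbol-index field; Value was cleared, so zero is what ends up in the
  // instruction itself.)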
+ FixedValue = Value; + + // struct relocation_info (8 bytes) + MachO::any_relocation_info MRE; + MRE.r_word0 = FixupOffset; + MRE.r_word1 = ((Index << 0) | (IsPCRel << 24) | (Log2Size << 25) | + (IsExtern << 27) | (Type << 28)); + Writer->addRelocation(Fragment->getParent(), MRE); +} + +MCObjectWriter *llvm::createAArch64MachObjectWriter(raw_ostream &OS, + uint32_t CPUType, + uint32_t CPUSubtype) { + return createMachObjectWriter( + new AArch64MachObjectWriter(CPUType, CPUSubtype), OS, + /*IsLittleEndian=*/true); +} diff --git a/lib/Target/AArch64/MCTargetDesc/CMakeLists.txt b/lib/Target/AArch64/MCTargetDesc/CMakeLists.txt new file mode 100644 index 00000000000..7d5bced17a6 --- /dev/null +++ b/lib/Target/AArch64/MCTargetDesc/CMakeLists.txt @@ -0,0 +1,14 @@ +add_llvm_library(LLVMAArch64Desc + AArch64AsmBackend.cpp + AArch64ELFObjectWriter.cpp + AArch64ELFStreamer.cpp + AArch64MCAsmInfo.cpp + AArch64MCCodeEmitter.cpp + AArch64MCExpr.cpp + AArch64MCTargetDesc.cpp + AArch64MachObjectWriter.cpp +) +add_dependencies(LLVMAArch64Desc AArch64CommonTableGen) + +# Hack: we need to include 'main' target directory to grab private headers +include_directories(${CMAKE_CURRENT_SOURCE_DIR}/.. ${CMAKE_CURRENT_BINARY_DIR}/..) diff --git a/lib/Target/AArch64/MCTargetDesc/LLVMBuild.txt b/lib/Target/AArch64/MCTargetDesc/LLVMBuild.txt new file mode 100644 index 00000000000..70cff0b704f --- /dev/null +++ b/lib/Target/AArch64/MCTargetDesc/LLVMBuild.txt @@ -0,0 +1,24 @@ +;===- ./lib/Target/AArch64/MCTargetDesc/LLVMBuild.txt ------------*- Conf -*--===; +; +; The LLVM Compiler Infrastructure +; +; This file is distributed under the University of Illinois Open Source +; License. See LICENSE.TXT for details. +; +;===------------------------------------------------------------------------===; +; +; This is an LLVMBuild description file for the components in this subdirectory. +; +; For more information on the LLVMBuild system, please see: +; +; http://llvm.org/docs/LLVMBuild.html +; +;===------------------------------------------------------------------------===; + +[component_0] +type = Library +name = AArch64Desc +parent = AArch64 +required_libraries = AArch64AsmPrinter AArch64Info MC Support +add_to_library_groups = AArch64 + diff --git a/lib/Target/AArch64/MCTargetDesc/Makefile b/lib/Target/AArch64/MCTargetDesc/Makefile new file mode 100644 index 00000000000..5779ac5ac60 --- /dev/null +++ b/lib/Target/AArch64/MCTargetDesc/Makefile @@ -0,0 +1,16 @@ +##===- lib/Target/AArch64/TargetDesc/Makefile --------------*- Makefile -*-===## +# +# The LLVM Compiler Infrastructure +# +# This file is distributed under the University of Illinois Open Source +# License. See LICENSE.TXT for details. +# +##===----------------------------------------------------------------------===## + +LEVEL = ../../../.. +LIBRARYNAME = LLVMAArch64Desc + +# Hack: we need to include 'main' target directory to grab private headers +CPP.Flags += -I$(PROJ_OBJ_DIR)/.. -I$(PROJ_SRC_DIR)/.. + +include $(LEVEL)/Makefile.common diff --git a/lib/Target/AArch64/Makefile b/lib/Target/AArch64/Makefile new file mode 100644 index 00000000000..f356c585041 --- /dev/null +++ b/lib/Target/AArch64/Makefile @@ -0,0 +1,25 @@ +##===- lib/Target/AArch64/Makefile -------------------------*- Makefile -*-===## +# +# The LLVM Compiler Infrastructure +# +# This file is distributed under the University of Illinois Open Source +# License. See LICENSE.TXT for details. +# +##===----------------------------------------------------------------------===## + +LEVEL = ../../.. 
+LIBRARYNAME = LLVMAArch64CodeGen +TARGET = AArch64 + +# Make sure that tblgen is run, first thing. +BUILT_SOURCES = AArch64GenRegisterInfo.inc AArch64GenInstrInfo.inc \ + AArch64GenAsmWriter.inc AArch64GenAsmWriter1.inc \ + AArch64GenDAGISel.inc \ + AArch64GenCallingConv.inc AArch64GenAsmMatcher.inc \ + AArch64GenSubtargetInfo.inc AArch64GenMCCodeEmitter.inc \ + AArch64GenFastISel.inc AArch64GenDisassemblerTables.inc \ + AArch64GenMCPseudoLowering.inc + +DIRS = TargetInfo InstPrinter AsmParser Disassembler MCTargetDesc Utils + +include $(LEVEL)/Makefile.common diff --git a/lib/Target/AArch64/TargetInfo/AArch64TargetInfo.cpp b/lib/Target/AArch64/TargetInfo/AArch64TargetInfo.cpp new file mode 100644 index 00000000000..3a382c165e7 --- /dev/null +++ b/lib/Target/AArch64/TargetInfo/AArch64TargetInfo.cpp @@ -0,0 +1,31 @@ +//===-- AArch64TargetInfo.cpp - AArch64 Target Implementation -----------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// + +#include "llvm/ADT/Triple.h" +#include "llvm/Support/TargetRegistry.h" +using namespace llvm; + +namespace llvm { +Target TheAArch64leTarget; +Target TheAArch64beTarget; +Target TheARM64leTarget; +Target TheARM64beTarget; +} // end namespace llvm + +extern "C" void LLVMInitializeAArch64TargetInfo() { + RegisterTarget X(TheARM64leTarget, "arm64", + "AArch64 (little endian)"); + RegisterTarget Y(TheARM64beTarget, "arm64_be", + "AArch64 (big endian)"); + + RegisterTarget Z( + TheAArch64leTarget, "aarch64", "AArch64 (little endian)"); + RegisterTarget W( + TheAArch64beTarget, "aarch64_be", "AArch64 (big endian)"); +} diff --git a/lib/Target/AArch64/TargetInfo/CMakeLists.txt b/lib/Target/AArch64/TargetInfo/CMakeLists.txt new file mode 100644 index 00000000000..e236eed00be --- /dev/null +++ b/lib/Target/AArch64/TargetInfo/CMakeLists.txt @@ -0,0 +1,7 @@ +include_directories( ${CMAKE_CURRENT_BINARY_DIR}/.. ${CMAKE_CURRENT_SOURCE_DIR}/.. ) + +add_llvm_library(LLVMAArch64Info + AArch64TargetInfo.cpp + ) + +add_dependencies(LLVMAArch64Info AArch64CommonTableGen) diff --git a/lib/Target/AArch64/TargetInfo/LLVMBuild.txt b/lib/Target/AArch64/TargetInfo/LLVMBuild.txt new file mode 100644 index 00000000000..93c5407bb1f --- /dev/null +++ b/lib/Target/AArch64/TargetInfo/LLVMBuild.txt @@ -0,0 +1,23 @@ +;===- ./lib/Target/AArch64/TargetInfo/LLVMBuild.txt --------------*- Conf -*--===; +; +; The LLVM Compiler Infrastructure +; +; This file is distributed under the University of Illinois Open Source +; License. See LICENSE.TXT for details. +; +;===------------------------------------------------------------------------===; +; +; This is an LLVMBuild description file for the components in this subdirectory. 
+; +; For more information on the LLVMBuild system, please see: +; +; http://llvm.org/docs/LLVMBuild.html +; +;===------------------------------------------------------------------------===; + +[component_0] +type = Library +name = AArch64Info +parent = AArch64 +required_libraries = Support +add_to_library_groups = AArch64 diff --git a/lib/Target/AArch64/TargetInfo/Makefile b/lib/Target/AArch64/TargetInfo/Makefile new file mode 100644 index 00000000000..9dc9aa4bccf --- /dev/null +++ b/lib/Target/AArch64/TargetInfo/Makefile @@ -0,0 +1,15 @@ +##===- lib/Target/AArch64/TargetInfo/Makefile --------------*- Makefile -*-===## +# +# The LLVM Compiler Infrastructure +# +# This file is distributed under the University of Illinois Open Source +# License. See LICENSE.TXT for details. +# +##===----------------------------------------------------------------------===## +LEVEL = ../../../.. +LIBRARYNAME = LLVMAArch64Info + +# Hack: we need to include 'main' target directory to grab private headers +CPPFLAGS = -I$(PROJ_OBJ_DIR)/.. -I$(PROJ_SRC_DIR)/.. + +include $(LEVEL)/Makefile.common diff --git a/lib/Target/AArch64/Utils/AArch64BaseInfo.cpp b/lib/Target/AArch64/Utils/AArch64BaseInfo.cpp new file mode 100644 index 00000000000..3c24bb30a26 --- /dev/null +++ b/lib/Target/AArch64/Utils/AArch64BaseInfo.cpp @@ -0,0 +1,901 @@ +//===-- AArch64BaseInfo.cpp - AArch64 Base encoding information------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file provides basic encoding and assembly information for AArch64. +// +//===----------------------------------------------------------------------===// +#include "AArch64BaseInfo.h" +#include "llvm/ADT/APFloat.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/ADT/StringExtras.h" +#include "llvm/Support/Regex.h" + +using namespace llvm; + +StringRef AArch64NamedImmMapper::toString(uint32_t Value, bool &Valid) const { + for (unsigned i = 0; i < NumPairs; ++i) { + if (Pairs[i].Value == Value) { + Valid = true; + return Pairs[i].Name; + } + } + + Valid = false; + return StringRef(); +} + +uint32_t AArch64NamedImmMapper::fromString(StringRef Name, bool &Valid) const { + std::string LowerCaseName = Name.lower(); + for (unsigned i = 0; i < NumPairs; ++i) { + if (Pairs[i].Name == LowerCaseName) { + Valid = true; + return Pairs[i].Value; + } + } + + Valid = false; + return -1; +} + +bool AArch64NamedImmMapper::validImm(uint32_t Value) const { + return Value < TooBigImm; +} + +const AArch64NamedImmMapper::Mapping AArch64AT::ATMapper::ATPairs[] = { + {"s1e1r", S1E1R}, + {"s1e2r", S1E2R}, + {"s1e3r", S1E3R}, + {"s1e1w", S1E1W}, + {"s1e2w", S1E2W}, + {"s1e3w", S1E3W}, + {"s1e0r", S1E0R}, + {"s1e0w", S1E0W}, + {"s12e1r", S12E1R}, + {"s12e1w", S12E1W}, + {"s12e0r", S12E0R}, + {"s12e0w", S12E0W}, +}; + +AArch64AT::ATMapper::ATMapper() + : AArch64NamedImmMapper(ATPairs, 0) {} + +const AArch64NamedImmMapper::Mapping AArch64DB::DBarrierMapper::DBarrierPairs[] = { + {"oshld", OSHLD}, + {"oshst", OSHST}, + {"osh", OSH}, + {"nshld", NSHLD}, + {"nshst", NSHST}, + {"nsh", NSH}, + {"ishld", ISHLD}, + {"ishst", ISHST}, + {"ish", ISH}, + {"ld", LD}, + {"st", ST}, + {"sy", SY} +}; + +AArch64DB::DBarrierMapper::DBarrierMapper() + : AArch64NamedImmMapper(DBarrierPairs, 16u) {} + +const AArch64NamedImmMapper::Mapping AArch64DC::DCMapper::DCPairs[] = { + {"zva", ZVA}, + 
{"ivac", IVAC}, + {"isw", ISW}, + {"cvac", CVAC}, + {"csw", CSW}, + {"cvau", CVAU}, + {"civac", CIVAC}, + {"cisw", CISW} +}; + +AArch64DC::DCMapper::DCMapper() + : AArch64NamedImmMapper(DCPairs, 0) {} + +const AArch64NamedImmMapper::Mapping AArch64IC::ICMapper::ICPairs[] = { + {"ialluis", IALLUIS}, + {"iallu", IALLU}, + {"ivau", IVAU} +}; + +AArch64IC::ICMapper::ICMapper() + : AArch64NamedImmMapper(ICPairs, 0) {} + +const AArch64NamedImmMapper::Mapping AArch64ISB::ISBMapper::ISBPairs[] = { + {"sy", SY}, +}; + +AArch64ISB::ISBMapper::ISBMapper() + : AArch64NamedImmMapper(ISBPairs, 16) {} + +const AArch64NamedImmMapper::Mapping AArch64PRFM::PRFMMapper::PRFMPairs[] = { + {"pldl1keep", PLDL1KEEP}, + {"pldl1strm", PLDL1STRM}, + {"pldl2keep", PLDL2KEEP}, + {"pldl2strm", PLDL2STRM}, + {"pldl3keep", PLDL3KEEP}, + {"pldl3strm", PLDL3STRM}, + {"plil1keep", PLIL1KEEP}, + {"plil1strm", PLIL1STRM}, + {"plil2keep", PLIL2KEEP}, + {"plil2strm", PLIL2STRM}, + {"plil3keep", PLIL3KEEP}, + {"plil3strm", PLIL3STRM}, + {"pstl1keep", PSTL1KEEP}, + {"pstl1strm", PSTL1STRM}, + {"pstl2keep", PSTL2KEEP}, + {"pstl2strm", PSTL2STRM}, + {"pstl3keep", PSTL3KEEP}, + {"pstl3strm", PSTL3STRM} +}; + +AArch64PRFM::PRFMMapper::PRFMMapper() + : AArch64NamedImmMapper(PRFMPairs, 32) {} + +const AArch64NamedImmMapper::Mapping AArch64PState::PStateMapper::PStatePairs[] = { + {"spsel", SPSel}, + {"daifset", DAIFSet}, + {"daifclr", DAIFClr} +}; + +AArch64PState::PStateMapper::PStateMapper() + : AArch64NamedImmMapper(PStatePairs, 0) {} + +const AArch64NamedImmMapper::Mapping AArch64SysReg::MRSMapper::MRSPairs[] = { + {"mdccsr_el0", MDCCSR_EL0}, + {"dbgdtrrx_el0", DBGDTRRX_EL0}, + {"mdrar_el1", MDRAR_EL1}, + {"oslsr_el1", OSLSR_EL1}, + {"dbgauthstatus_el1", DBGAUTHSTATUS_EL1}, + {"pmceid0_el0", PMCEID0_EL0}, + {"pmceid1_el0", PMCEID1_EL0}, + {"midr_el1", MIDR_EL1}, + {"ccsidr_el1", CCSIDR_EL1}, + {"clidr_el1", CLIDR_EL1}, + {"ctr_el0", CTR_EL0}, + {"mpidr_el1", MPIDR_EL1}, + {"revidr_el1", REVIDR_EL1}, + {"aidr_el1", AIDR_EL1}, + {"dczid_el0", DCZID_EL0}, + {"id_pfr0_el1", ID_PFR0_EL1}, + {"id_pfr1_el1", ID_PFR1_EL1}, + {"id_dfr0_el1", ID_DFR0_EL1}, + {"id_afr0_el1", ID_AFR0_EL1}, + {"id_mmfr0_el1", ID_MMFR0_EL1}, + {"id_mmfr1_el1", ID_MMFR1_EL1}, + {"id_mmfr2_el1", ID_MMFR2_EL1}, + {"id_mmfr3_el1", ID_MMFR3_EL1}, + {"id_isar0_el1", ID_ISAR0_EL1}, + {"id_isar1_el1", ID_ISAR1_EL1}, + {"id_isar2_el1", ID_ISAR2_EL1}, + {"id_isar3_el1", ID_ISAR3_EL1}, + {"id_isar4_el1", ID_ISAR4_EL1}, + {"id_isar5_el1", ID_ISAR5_EL1}, + {"id_aa64pfr0_el1", ID_A64PFR0_EL1}, + {"id_aa64pfr1_el1", ID_A64PFR1_EL1}, + {"id_aa64dfr0_el1", ID_A64DFR0_EL1}, + {"id_aa64dfr1_el1", ID_A64DFR1_EL1}, + {"id_aa64afr0_el1", ID_A64AFR0_EL1}, + {"id_aa64afr1_el1", ID_A64AFR1_EL1}, + {"id_aa64isar0_el1", ID_A64ISAR0_EL1}, + {"id_aa64isar1_el1", ID_A64ISAR1_EL1}, + {"id_aa64mmfr0_el1", ID_A64MMFR0_EL1}, + {"id_aa64mmfr1_el1", ID_A64MMFR1_EL1}, + {"mvfr0_el1", MVFR0_EL1}, + {"mvfr1_el1", MVFR1_EL1}, + {"mvfr2_el1", MVFR2_EL1}, + {"rvbar_el1", RVBAR_EL1}, + {"rvbar_el2", RVBAR_EL2}, + {"rvbar_el3", RVBAR_EL3}, + {"isr_el1", ISR_EL1}, + {"cntpct_el0", CNTPCT_EL0}, + {"cntvct_el0", CNTVCT_EL0}, + + // Trace registers + {"trcstatr", TRCSTATR}, + {"trcidr8", TRCIDR8}, + {"trcidr9", TRCIDR9}, + {"trcidr10", TRCIDR10}, + {"trcidr11", TRCIDR11}, + {"trcidr12", TRCIDR12}, + {"trcidr13", TRCIDR13}, + {"trcidr0", TRCIDR0}, + {"trcidr1", TRCIDR1}, + {"trcidr2", TRCIDR2}, + {"trcidr3", TRCIDR3}, + {"trcidr4", TRCIDR4}, + {"trcidr5", TRCIDR5}, + {"trcidr6", TRCIDR6}, + {"trcidr7", 
TRCIDR7}, + {"trcoslsr", TRCOSLSR}, + {"trcpdsr", TRCPDSR}, + {"trcdevaff0", TRCDEVAFF0}, + {"trcdevaff1", TRCDEVAFF1}, + {"trclsr", TRCLSR}, + {"trcauthstatus", TRCAUTHSTATUS}, + {"trcdevarch", TRCDEVARCH}, + {"trcdevid", TRCDEVID}, + {"trcdevtype", TRCDEVTYPE}, + {"trcpidr4", TRCPIDR4}, + {"trcpidr5", TRCPIDR5}, + {"trcpidr6", TRCPIDR6}, + {"trcpidr7", TRCPIDR7}, + {"trcpidr0", TRCPIDR0}, + {"trcpidr1", TRCPIDR1}, + {"trcpidr2", TRCPIDR2}, + {"trcpidr3", TRCPIDR3}, + {"trccidr0", TRCCIDR0}, + {"trccidr1", TRCCIDR1}, + {"trccidr2", TRCCIDR2}, + {"trccidr3", TRCCIDR3}, + + // GICv3 registers + {"icc_iar1_el1", ICC_IAR1_EL1}, + {"icc_iar0_el1", ICC_IAR0_EL1}, + {"icc_hppir1_el1", ICC_HPPIR1_EL1}, + {"icc_hppir0_el1", ICC_HPPIR0_EL1}, + {"icc_rpr_el1", ICC_RPR_EL1}, + {"ich_vtr_el2", ICH_VTR_EL2}, + {"ich_eisr_el2", ICH_EISR_EL2}, + {"ich_elsr_el2", ICH_ELSR_EL2} +}; + +AArch64SysReg::MRSMapper::MRSMapper(uint64_t FeatureBits) + : SysRegMapper(FeatureBits) { + InstPairs = &MRSPairs[0]; + NumInstPairs = llvm::array_lengthof(MRSPairs); +} + +const AArch64NamedImmMapper::Mapping AArch64SysReg::MSRMapper::MSRPairs[] = { + {"dbgdtrtx_el0", DBGDTRTX_EL0}, + {"oslar_el1", OSLAR_EL1}, + {"pmswinc_el0", PMSWINC_EL0}, + + // Trace registers + {"trcoslar", TRCOSLAR}, + {"trclar", TRCLAR}, + + // GICv3 registers + {"icc_eoir1_el1", ICC_EOIR1_EL1}, + {"icc_eoir0_el1", ICC_EOIR0_EL1}, + {"icc_dir_el1", ICC_DIR_EL1}, + {"icc_sgi1r_el1", ICC_SGI1R_EL1}, + {"icc_asgi1r_el1", ICC_ASGI1R_EL1}, + {"icc_sgi0r_el1", ICC_SGI0R_EL1} +}; + +AArch64SysReg::MSRMapper::MSRMapper(uint64_t FeatureBits) + : SysRegMapper(FeatureBits) { + InstPairs = &MSRPairs[0]; + NumInstPairs = llvm::array_lengthof(MSRPairs); +} + + +const AArch64NamedImmMapper::Mapping AArch64SysReg::SysRegMapper::SysRegPairs[] = { + {"osdtrrx_el1", OSDTRRX_EL1}, + {"osdtrtx_el1", OSDTRTX_EL1}, + {"teecr32_el1", TEECR32_EL1}, + {"mdccint_el1", MDCCINT_EL1}, + {"mdscr_el1", MDSCR_EL1}, + {"dbgdtr_el0", DBGDTR_EL0}, + {"oseccr_el1", OSECCR_EL1}, + {"dbgvcr32_el2", DBGVCR32_EL2}, + {"dbgbvr0_el1", DBGBVR0_EL1}, + {"dbgbvr1_el1", DBGBVR1_EL1}, + {"dbgbvr2_el1", DBGBVR2_EL1}, + {"dbgbvr3_el1", DBGBVR3_EL1}, + {"dbgbvr4_el1", DBGBVR4_EL1}, + {"dbgbvr5_el1", DBGBVR5_EL1}, + {"dbgbvr6_el1", DBGBVR6_EL1}, + {"dbgbvr7_el1", DBGBVR7_EL1}, + {"dbgbvr8_el1", DBGBVR8_EL1}, + {"dbgbvr9_el1", DBGBVR9_EL1}, + {"dbgbvr10_el1", DBGBVR10_EL1}, + {"dbgbvr11_el1", DBGBVR11_EL1}, + {"dbgbvr12_el1", DBGBVR12_EL1}, + {"dbgbvr13_el1", DBGBVR13_EL1}, + {"dbgbvr14_el1", DBGBVR14_EL1}, + {"dbgbvr15_el1", DBGBVR15_EL1}, + {"dbgbcr0_el1", DBGBCR0_EL1}, + {"dbgbcr1_el1", DBGBCR1_EL1}, + {"dbgbcr2_el1", DBGBCR2_EL1}, + {"dbgbcr3_el1", DBGBCR3_EL1}, + {"dbgbcr4_el1", DBGBCR4_EL1}, + {"dbgbcr5_el1", DBGBCR5_EL1}, + {"dbgbcr6_el1", DBGBCR6_EL1}, + {"dbgbcr7_el1", DBGBCR7_EL1}, + {"dbgbcr8_el1", DBGBCR8_EL1}, + {"dbgbcr9_el1", DBGBCR9_EL1}, + {"dbgbcr10_el1", DBGBCR10_EL1}, + {"dbgbcr11_el1", DBGBCR11_EL1}, + {"dbgbcr12_el1", DBGBCR12_EL1}, + {"dbgbcr13_el1", DBGBCR13_EL1}, + {"dbgbcr14_el1", DBGBCR14_EL1}, + {"dbgbcr15_el1", DBGBCR15_EL1}, + {"dbgwvr0_el1", DBGWVR0_EL1}, + {"dbgwvr1_el1", DBGWVR1_EL1}, + {"dbgwvr2_el1", DBGWVR2_EL1}, + {"dbgwvr3_el1", DBGWVR3_EL1}, + {"dbgwvr4_el1", DBGWVR4_EL1}, + {"dbgwvr5_el1", DBGWVR5_EL1}, + {"dbgwvr6_el1", DBGWVR6_EL1}, + {"dbgwvr7_el1", DBGWVR7_EL1}, + {"dbgwvr8_el1", DBGWVR8_EL1}, + {"dbgwvr9_el1", DBGWVR9_EL1}, + {"dbgwvr10_el1", DBGWVR10_EL1}, + {"dbgwvr11_el1", DBGWVR11_EL1}, + {"dbgwvr12_el1", DBGWVR12_EL1}, + {"dbgwvr13_el1", DBGWVR13_EL1}, + 
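  // (Lookup sketch: MRS/MSR operands reach this shared table through
  //  AArch64SysReg::SysRegMapper::fromString(), which, like the
  //  AArch64NamedImmMapper::fromString() above, lower-cases the spelling and
  //  does a linear scan, so an operand written as DBGWVR13_EL1 or
  //  dbgwvr13_el1 resolves to the DBGWVR13_EL1 encoding listed here.)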
{"dbgwvr14_el1", DBGWVR14_EL1}, + {"dbgwvr15_el1", DBGWVR15_EL1}, + {"dbgwcr0_el1", DBGWCR0_EL1}, + {"dbgwcr1_el1", DBGWCR1_EL1}, + {"dbgwcr2_el1", DBGWCR2_EL1}, + {"dbgwcr3_el1", DBGWCR3_EL1}, + {"dbgwcr4_el1", DBGWCR4_EL1}, + {"dbgwcr5_el1", DBGWCR5_EL1}, + {"dbgwcr6_el1", DBGWCR6_EL1}, + {"dbgwcr7_el1", DBGWCR7_EL1}, + {"dbgwcr8_el1", DBGWCR8_EL1}, + {"dbgwcr9_el1", DBGWCR9_EL1}, + {"dbgwcr10_el1", DBGWCR10_EL1}, + {"dbgwcr11_el1", DBGWCR11_EL1}, + {"dbgwcr12_el1", DBGWCR12_EL1}, + {"dbgwcr13_el1", DBGWCR13_EL1}, + {"dbgwcr14_el1", DBGWCR14_EL1}, + {"dbgwcr15_el1", DBGWCR15_EL1}, + {"teehbr32_el1", TEEHBR32_EL1}, + {"osdlr_el1", OSDLR_EL1}, + {"dbgprcr_el1", DBGPRCR_EL1}, + {"dbgclaimset_el1", DBGCLAIMSET_EL1}, + {"dbgclaimclr_el1", DBGCLAIMCLR_EL1}, + {"csselr_el1", CSSELR_EL1}, + {"vpidr_el2", VPIDR_EL2}, + {"vmpidr_el2", VMPIDR_EL2}, + {"sctlr_el1", SCTLR_EL1}, + {"sctlr_el2", SCTLR_EL2}, + {"sctlr_el3", SCTLR_EL3}, + {"actlr_el1", ACTLR_EL1}, + {"actlr_el2", ACTLR_EL2}, + {"actlr_el3", ACTLR_EL3}, + {"cpacr_el1", CPACR_EL1}, + {"hcr_el2", HCR_EL2}, + {"scr_el3", SCR_EL3}, + {"mdcr_el2", MDCR_EL2}, + {"sder32_el3", SDER32_EL3}, + {"cptr_el2", CPTR_EL2}, + {"cptr_el3", CPTR_EL3}, + {"hstr_el2", HSTR_EL2}, + {"hacr_el2", HACR_EL2}, + {"mdcr_el3", MDCR_EL3}, + {"ttbr0_el1", TTBR0_EL1}, + {"ttbr0_el2", TTBR0_EL2}, + {"ttbr0_el3", TTBR0_EL3}, + {"ttbr1_el1", TTBR1_EL1}, + {"tcr_el1", TCR_EL1}, + {"tcr_el2", TCR_EL2}, + {"tcr_el3", TCR_EL3}, + {"vttbr_el2", VTTBR_EL2}, + {"vtcr_el2", VTCR_EL2}, + {"dacr32_el2", DACR32_EL2}, + {"spsr_el1", SPSR_EL1}, + {"spsr_el2", SPSR_EL2}, + {"spsr_el3", SPSR_EL3}, + {"elr_el1", ELR_EL1}, + {"elr_el2", ELR_EL2}, + {"elr_el3", ELR_EL3}, + {"sp_el0", SP_EL0}, + {"sp_el1", SP_EL1}, + {"sp_el2", SP_EL2}, + {"spsel", SPSel}, + {"nzcv", NZCV}, + {"daif", DAIF}, + {"currentel", CurrentEL}, + {"spsr_irq", SPSR_irq}, + {"spsr_abt", SPSR_abt}, + {"spsr_und", SPSR_und}, + {"spsr_fiq", SPSR_fiq}, + {"fpcr", FPCR}, + {"fpsr", FPSR}, + {"dspsr_el0", DSPSR_EL0}, + {"dlr_el0", DLR_EL0}, + {"ifsr32_el2", IFSR32_EL2}, + {"afsr0_el1", AFSR0_EL1}, + {"afsr0_el2", AFSR0_EL2}, + {"afsr0_el3", AFSR0_EL3}, + {"afsr1_el1", AFSR1_EL1}, + {"afsr1_el2", AFSR1_EL2}, + {"afsr1_el3", AFSR1_EL3}, + {"esr_el1", ESR_EL1}, + {"esr_el2", ESR_EL2}, + {"esr_el3", ESR_EL3}, + {"fpexc32_el2", FPEXC32_EL2}, + {"far_el1", FAR_EL1}, + {"far_el2", FAR_EL2}, + {"far_el3", FAR_EL3}, + {"hpfar_el2", HPFAR_EL2}, + {"par_el1", PAR_EL1}, + {"pmcr_el0", PMCR_EL0}, + {"pmcntenset_el0", PMCNTENSET_EL0}, + {"pmcntenclr_el0", PMCNTENCLR_EL0}, + {"pmovsclr_el0", PMOVSCLR_EL0}, + {"pmselr_el0", PMSELR_EL0}, + {"pmccntr_el0", PMCCNTR_EL0}, + {"pmxevtyper_el0", PMXEVTYPER_EL0}, + {"pmxevcntr_el0", PMXEVCNTR_EL0}, + {"pmuserenr_el0", PMUSERENR_EL0}, + {"pmintenset_el1", PMINTENSET_EL1}, + {"pmintenclr_el1", PMINTENCLR_EL1}, + {"pmovsset_el0", PMOVSSET_EL0}, + {"mair_el1", MAIR_EL1}, + {"mair_el2", MAIR_EL2}, + {"mair_el3", MAIR_EL3}, + {"amair_el1", AMAIR_EL1}, + {"amair_el2", AMAIR_EL2}, + {"amair_el3", AMAIR_EL3}, + {"vbar_el1", VBAR_EL1}, + {"vbar_el2", VBAR_EL2}, + {"vbar_el3", VBAR_EL3}, + {"rmr_el1", RMR_EL1}, + {"rmr_el2", RMR_EL2}, + {"rmr_el3", RMR_EL3}, + {"contextidr_el1", CONTEXTIDR_EL1}, + {"tpidr_el0", TPIDR_EL0}, + {"tpidr_el2", TPIDR_EL2}, + {"tpidr_el3", TPIDR_EL3}, + {"tpidrro_el0", TPIDRRO_EL0}, + {"tpidr_el1", TPIDR_EL1}, + {"cntfrq_el0", CNTFRQ_EL0}, + {"cntvoff_el2", CNTVOFF_EL2}, + {"cntkctl_el1", CNTKCTL_EL1}, + {"cnthctl_el2", CNTHCTL_EL2}, + {"cntp_tval_el0", CNTP_TVAL_EL0}, + 
{"cnthp_tval_el2", CNTHP_TVAL_EL2}, + {"cntps_tval_el1", CNTPS_TVAL_EL1}, + {"cntp_ctl_el0", CNTP_CTL_EL0}, + {"cnthp_ctl_el2", CNTHP_CTL_EL2}, + {"cntps_ctl_el1", CNTPS_CTL_EL1}, + {"cntp_cval_el0", CNTP_CVAL_EL0}, + {"cnthp_cval_el2", CNTHP_CVAL_EL2}, + {"cntps_cval_el1", CNTPS_CVAL_EL1}, + {"cntv_tval_el0", CNTV_TVAL_EL0}, + {"cntv_ctl_el0", CNTV_CTL_EL0}, + {"cntv_cval_el0", CNTV_CVAL_EL0}, + {"pmevcntr0_el0", PMEVCNTR0_EL0}, + {"pmevcntr1_el0", PMEVCNTR1_EL0}, + {"pmevcntr2_el0", PMEVCNTR2_EL0}, + {"pmevcntr3_el0", PMEVCNTR3_EL0}, + {"pmevcntr4_el0", PMEVCNTR4_EL0}, + {"pmevcntr5_el0", PMEVCNTR5_EL0}, + {"pmevcntr6_el0", PMEVCNTR6_EL0}, + {"pmevcntr7_el0", PMEVCNTR7_EL0}, + {"pmevcntr8_el0", PMEVCNTR8_EL0}, + {"pmevcntr9_el0", PMEVCNTR9_EL0}, + {"pmevcntr10_el0", PMEVCNTR10_EL0}, + {"pmevcntr11_el0", PMEVCNTR11_EL0}, + {"pmevcntr12_el0", PMEVCNTR12_EL0}, + {"pmevcntr13_el0", PMEVCNTR13_EL0}, + {"pmevcntr14_el0", PMEVCNTR14_EL0}, + {"pmevcntr15_el0", PMEVCNTR15_EL0}, + {"pmevcntr16_el0", PMEVCNTR16_EL0}, + {"pmevcntr17_el0", PMEVCNTR17_EL0}, + {"pmevcntr18_el0", PMEVCNTR18_EL0}, + {"pmevcntr19_el0", PMEVCNTR19_EL0}, + {"pmevcntr20_el0", PMEVCNTR20_EL0}, + {"pmevcntr21_el0", PMEVCNTR21_EL0}, + {"pmevcntr22_el0", PMEVCNTR22_EL0}, + {"pmevcntr23_el0", PMEVCNTR23_EL0}, + {"pmevcntr24_el0", PMEVCNTR24_EL0}, + {"pmevcntr25_el0", PMEVCNTR25_EL0}, + {"pmevcntr26_el0", PMEVCNTR26_EL0}, + {"pmevcntr27_el0", PMEVCNTR27_EL0}, + {"pmevcntr28_el0", PMEVCNTR28_EL0}, + {"pmevcntr29_el0", PMEVCNTR29_EL0}, + {"pmevcntr30_el0", PMEVCNTR30_EL0}, + {"pmccfiltr_el0", PMCCFILTR_EL0}, + {"pmevtyper0_el0", PMEVTYPER0_EL0}, + {"pmevtyper1_el0", PMEVTYPER1_EL0}, + {"pmevtyper2_el0", PMEVTYPER2_EL0}, + {"pmevtyper3_el0", PMEVTYPER3_EL0}, + {"pmevtyper4_el0", PMEVTYPER4_EL0}, + {"pmevtyper5_el0", PMEVTYPER5_EL0}, + {"pmevtyper6_el0", PMEVTYPER6_EL0}, + {"pmevtyper7_el0", PMEVTYPER7_EL0}, + {"pmevtyper8_el0", PMEVTYPER8_EL0}, + {"pmevtyper9_el0", PMEVTYPER9_EL0}, + {"pmevtyper10_el0", PMEVTYPER10_EL0}, + {"pmevtyper11_el0", PMEVTYPER11_EL0}, + {"pmevtyper12_el0", PMEVTYPER12_EL0}, + {"pmevtyper13_el0", PMEVTYPER13_EL0}, + {"pmevtyper14_el0", PMEVTYPER14_EL0}, + {"pmevtyper15_el0", PMEVTYPER15_EL0}, + {"pmevtyper16_el0", PMEVTYPER16_EL0}, + {"pmevtyper17_el0", PMEVTYPER17_EL0}, + {"pmevtyper18_el0", PMEVTYPER18_EL0}, + {"pmevtyper19_el0", PMEVTYPER19_EL0}, + {"pmevtyper20_el0", PMEVTYPER20_EL0}, + {"pmevtyper21_el0", PMEVTYPER21_EL0}, + {"pmevtyper22_el0", PMEVTYPER22_EL0}, + {"pmevtyper23_el0", PMEVTYPER23_EL0}, + {"pmevtyper24_el0", PMEVTYPER24_EL0}, + {"pmevtyper25_el0", PMEVTYPER25_EL0}, + {"pmevtyper26_el0", PMEVTYPER26_EL0}, + {"pmevtyper27_el0", PMEVTYPER27_EL0}, + {"pmevtyper28_el0", PMEVTYPER28_EL0}, + {"pmevtyper29_el0", PMEVTYPER29_EL0}, + {"pmevtyper30_el0", PMEVTYPER30_EL0}, + + // Trace registers + {"trcprgctlr", TRCPRGCTLR}, + {"trcprocselr", TRCPROCSELR}, + {"trcconfigr", TRCCONFIGR}, + {"trcauxctlr", TRCAUXCTLR}, + {"trceventctl0r", TRCEVENTCTL0R}, + {"trceventctl1r", TRCEVENTCTL1R}, + {"trcstallctlr", TRCSTALLCTLR}, + {"trctsctlr", TRCTSCTLR}, + {"trcsyncpr", TRCSYNCPR}, + {"trcccctlr", TRCCCCTLR}, + {"trcbbctlr", TRCBBCTLR}, + {"trctraceidr", TRCTRACEIDR}, + {"trcqctlr", TRCQCTLR}, + {"trcvictlr", TRCVICTLR}, + {"trcviiectlr", TRCVIIECTLR}, + {"trcvissctlr", TRCVISSCTLR}, + {"trcvipcssctlr", TRCVIPCSSCTLR}, + {"trcvdctlr", TRCVDCTLR}, + {"trcvdsacctlr", TRCVDSACCTLR}, + {"trcvdarcctlr", TRCVDARCCTLR}, + {"trcseqevr0", TRCSEQEVR0}, + {"trcseqevr1", TRCSEQEVR1}, + {"trcseqevr2", TRCSEQEVR2}, + 
{"trcseqrstevr", TRCSEQRSTEVR}, + {"trcseqstr", TRCSEQSTR}, + {"trcextinselr", TRCEXTINSELR}, + {"trccntrldvr0", TRCCNTRLDVR0}, + {"trccntrldvr1", TRCCNTRLDVR1}, + {"trccntrldvr2", TRCCNTRLDVR2}, + {"trccntrldvr3", TRCCNTRLDVR3}, + {"trccntctlr0", TRCCNTCTLR0}, + {"trccntctlr1", TRCCNTCTLR1}, + {"trccntctlr2", TRCCNTCTLR2}, + {"trccntctlr3", TRCCNTCTLR3}, + {"trccntvr0", TRCCNTVR0}, + {"trccntvr1", TRCCNTVR1}, + {"trccntvr2", TRCCNTVR2}, + {"trccntvr3", TRCCNTVR3}, + {"trcimspec0", TRCIMSPEC0}, + {"trcimspec1", TRCIMSPEC1}, + {"trcimspec2", TRCIMSPEC2}, + {"trcimspec3", TRCIMSPEC3}, + {"trcimspec4", TRCIMSPEC4}, + {"trcimspec5", TRCIMSPEC5}, + {"trcimspec6", TRCIMSPEC6}, + {"trcimspec7", TRCIMSPEC7}, + {"trcrsctlr2", TRCRSCTLR2}, + {"trcrsctlr3", TRCRSCTLR3}, + {"trcrsctlr4", TRCRSCTLR4}, + {"trcrsctlr5", TRCRSCTLR5}, + {"trcrsctlr6", TRCRSCTLR6}, + {"trcrsctlr7", TRCRSCTLR7}, + {"trcrsctlr8", TRCRSCTLR8}, + {"trcrsctlr9", TRCRSCTLR9}, + {"trcrsctlr10", TRCRSCTLR10}, + {"trcrsctlr11", TRCRSCTLR11}, + {"trcrsctlr12", TRCRSCTLR12}, + {"trcrsctlr13", TRCRSCTLR13}, + {"trcrsctlr14", TRCRSCTLR14}, + {"trcrsctlr15", TRCRSCTLR15}, + {"trcrsctlr16", TRCRSCTLR16}, + {"trcrsctlr17", TRCRSCTLR17}, + {"trcrsctlr18", TRCRSCTLR18}, + {"trcrsctlr19", TRCRSCTLR19}, + {"trcrsctlr20", TRCRSCTLR20}, + {"trcrsctlr21", TRCRSCTLR21}, + {"trcrsctlr22", TRCRSCTLR22}, + {"trcrsctlr23", TRCRSCTLR23}, + {"trcrsctlr24", TRCRSCTLR24}, + {"trcrsctlr25", TRCRSCTLR25}, + {"trcrsctlr26", TRCRSCTLR26}, + {"trcrsctlr27", TRCRSCTLR27}, + {"trcrsctlr28", TRCRSCTLR28}, + {"trcrsctlr29", TRCRSCTLR29}, + {"trcrsctlr30", TRCRSCTLR30}, + {"trcrsctlr31", TRCRSCTLR31}, + {"trcssccr0", TRCSSCCR0}, + {"trcssccr1", TRCSSCCR1}, + {"trcssccr2", TRCSSCCR2}, + {"trcssccr3", TRCSSCCR3}, + {"trcssccr4", TRCSSCCR4}, + {"trcssccr5", TRCSSCCR5}, + {"trcssccr6", TRCSSCCR6}, + {"trcssccr7", TRCSSCCR7}, + {"trcsscsr0", TRCSSCSR0}, + {"trcsscsr1", TRCSSCSR1}, + {"trcsscsr2", TRCSSCSR2}, + {"trcsscsr3", TRCSSCSR3}, + {"trcsscsr4", TRCSSCSR4}, + {"trcsscsr5", TRCSSCSR5}, + {"trcsscsr6", TRCSSCSR6}, + {"trcsscsr7", TRCSSCSR7}, + {"trcsspcicr0", TRCSSPCICR0}, + {"trcsspcicr1", TRCSSPCICR1}, + {"trcsspcicr2", TRCSSPCICR2}, + {"trcsspcicr3", TRCSSPCICR3}, + {"trcsspcicr4", TRCSSPCICR4}, + {"trcsspcicr5", TRCSSPCICR5}, + {"trcsspcicr6", TRCSSPCICR6}, + {"trcsspcicr7", TRCSSPCICR7}, + {"trcpdcr", TRCPDCR}, + {"trcacvr0", TRCACVR0}, + {"trcacvr1", TRCACVR1}, + {"trcacvr2", TRCACVR2}, + {"trcacvr3", TRCACVR3}, + {"trcacvr4", TRCACVR4}, + {"trcacvr5", TRCACVR5}, + {"trcacvr6", TRCACVR6}, + {"trcacvr7", TRCACVR7}, + {"trcacvr8", TRCACVR8}, + {"trcacvr9", TRCACVR9}, + {"trcacvr10", TRCACVR10}, + {"trcacvr11", TRCACVR11}, + {"trcacvr12", TRCACVR12}, + {"trcacvr13", TRCACVR13}, + {"trcacvr14", TRCACVR14}, + {"trcacvr15", TRCACVR15}, + {"trcacatr0", TRCACATR0}, + {"trcacatr1", TRCACATR1}, + {"trcacatr2", TRCACATR2}, + {"trcacatr3", TRCACATR3}, + {"trcacatr4", TRCACATR4}, + {"trcacatr5", TRCACATR5}, + {"trcacatr6", TRCACATR6}, + {"trcacatr7", TRCACATR7}, + {"trcacatr8", TRCACATR8}, + {"trcacatr9", TRCACATR9}, + {"trcacatr10", TRCACATR10}, + {"trcacatr11", TRCACATR11}, + {"trcacatr12", TRCACATR12}, + {"trcacatr13", TRCACATR13}, + {"trcacatr14", TRCACATR14}, + {"trcacatr15", TRCACATR15}, + {"trcdvcvr0", TRCDVCVR0}, + {"trcdvcvr1", TRCDVCVR1}, + {"trcdvcvr2", TRCDVCVR2}, + {"trcdvcvr3", TRCDVCVR3}, + {"trcdvcvr4", TRCDVCVR4}, + {"trcdvcvr5", TRCDVCVR5}, + {"trcdvcvr6", TRCDVCVR6}, + {"trcdvcvr7", TRCDVCVR7}, + {"trcdvcmr0", TRCDVCMR0}, + {"trcdvcmr1", TRCDVCMR1}, + 
{"trcdvcmr2", TRCDVCMR2}, + {"trcdvcmr3", TRCDVCMR3}, + {"trcdvcmr4", TRCDVCMR4}, + {"trcdvcmr5", TRCDVCMR5}, + {"trcdvcmr6", TRCDVCMR6}, + {"trcdvcmr7", TRCDVCMR7}, + {"trccidcvr0", TRCCIDCVR0}, + {"trccidcvr1", TRCCIDCVR1}, + {"trccidcvr2", TRCCIDCVR2}, + {"trccidcvr3", TRCCIDCVR3}, + {"trccidcvr4", TRCCIDCVR4}, + {"trccidcvr5", TRCCIDCVR5}, + {"trccidcvr6", TRCCIDCVR6}, + {"trccidcvr7", TRCCIDCVR7}, + {"trcvmidcvr0", TRCVMIDCVR0}, + {"trcvmidcvr1", TRCVMIDCVR1}, + {"trcvmidcvr2", TRCVMIDCVR2}, + {"trcvmidcvr3", TRCVMIDCVR3}, + {"trcvmidcvr4", TRCVMIDCVR4}, + {"trcvmidcvr5", TRCVMIDCVR5}, + {"trcvmidcvr6", TRCVMIDCVR6}, + {"trcvmidcvr7", TRCVMIDCVR7}, + {"trccidcctlr0", TRCCIDCCTLR0}, + {"trccidcctlr1", TRCCIDCCTLR1}, + {"trcvmidcctlr0", TRCVMIDCCTLR0}, + {"trcvmidcctlr1", TRCVMIDCCTLR1}, + {"trcitctrl", TRCITCTRL}, + {"trcclaimset", TRCCLAIMSET}, + {"trcclaimclr", TRCCLAIMCLR}, + + // GICv3 registers + {"icc_bpr1_el1", ICC_BPR1_EL1}, + {"icc_bpr0_el1", ICC_BPR0_EL1}, + {"icc_pmr_el1", ICC_PMR_EL1}, + {"icc_ctlr_el1", ICC_CTLR_EL1}, + {"icc_ctlr_el3", ICC_CTLR_EL3}, + {"icc_sre_el1", ICC_SRE_EL1}, + {"icc_sre_el2", ICC_SRE_EL2}, + {"icc_sre_el3", ICC_SRE_EL3}, + {"icc_igrpen0_el1", ICC_IGRPEN0_EL1}, + {"icc_igrpen1_el1", ICC_IGRPEN1_EL1}, + {"icc_igrpen1_el3", ICC_IGRPEN1_EL3}, + {"icc_seien_el1", ICC_SEIEN_EL1}, + {"icc_ap0r0_el1", ICC_AP0R0_EL1}, + {"icc_ap0r1_el1", ICC_AP0R1_EL1}, + {"icc_ap0r2_el1", ICC_AP0R2_EL1}, + {"icc_ap0r3_el1", ICC_AP0R3_EL1}, + {"icc_ap1r0_el1", ICC_AP1R0_EL1}, + {"icc_ap1r1_el1", ICC_AP1R1_EL1}, + {"icc_ap1r2_el1", ICC_AP1R2_EL1}, + {"icc_ap1r3_el1", ICC_AP1R3_EL1}, + {"ich_ap0r0_el2", ICH_AP0R0_EL2}, + {"ich_ap0r1_el2", ICH_AP0R1_EL2}, + {"ich_ap0r2_el2", ICH_AP0R2_EL2}, + {"ich_ap0r3_el2", ICH_AP0R3_EL2}, + {"ich_ap1r0_el2", ICH_AP1R0_EL2}, + {"ich_ap1r1_el2", ICH_AP1R1_EL2}, + {"ich_ap1r2_el2", ICH_AP1R2_EL2}, + {"ich_ap1r3_el2", ICH_AP1R3_EL2}, + {"ich_hcr_el2", ICH_HCR_EL2}, + {"ich_misr_el2", ICH_MISR_EL2}, + {"ich_vmcr_el2", ICH_VMCR_EL2}, + {"ich_vseir_el2", ICH_VSEIR_EL2}, + {"ich_lr0_el2", ICH_LR0_EL2}, + {"ich_lr1_el2", ICH_LR1_EL2}, + {"ich_lr2_el2", ICH_LR2_EL2}, + {"ich_lr3_el2", ICH_LR3_EL2}, + {"ich_lr4_el2", ICH_LR4_EL2}, + {"ich_lr5_el2", ICH_LR5_EL2}, + {"ich_lr6_el2", ICH_LR6_EL2}, + {"ich_lr7_el2", ICH_LR7_EL2}, + {"ich_lr8_el2", ICH_LR8_EL2}, + {"ich_lr9_el2", ICH_LR9_EL2}, + {"ich_lr10_el2", ICH_LR10_EL2}, + {"ich_lr11_el2", ICH_LR11_EL2}, + {"ich_lr12_el2", ICH_LR12_EL2}, + {"ich_lr13_el2", ICH_LR13_EL2}, + {"ich_lr14_el2", ICH_LR14_EL2}, + {"ich_lr15_el2", ICH_LR15_EL2} +}; + +const AArch64NamedImmMapper::Mapping +AArch64SysReg::SysRegMapper::CycloneSysRegPairs[] = { + {"cpm_ioacc_ctl_el3", CPM_IOACC_CTL_EL3} +}; + +uint32_t +AArch64SysReg::SysRegMapper::fromString(StringRef Name, bool &Valid) const { + std::string NameLower = Name.lower(); + + // First search the registers shared by all + for (unsigned i = 0; i < array_lengthof(SysRegPairs); ++i) { + if (SysRegPairs[i].Name == NameLower) { + Valid = true; + return SysRegPairs[i].Value; + } + } + + // Next search for target specific registers + if (FeatureBits & AArch64::ProcCyclone) { + for (unsigned i = 0; i < array_lengthof(CycloneSysRegPairs); ++i) { + if (CycloneSysRegPairs[i].Name == NameLower) { + Valid = true; + return CycloneSysRegPairs[i].Value; + } + } + } + + // Now try the instruction-specific registers (either read-only or + // write-only). 
+ for (unsigned i = 0; i < NumInstPairs; ++i) { + if (InstPairs[i].Name == NameLower) { + Valid = true; + return InstPairs[i].Value; + } + } + + // Try to parse an S____ register name, where the bits + // are: 11 xxx 1x11 xxxx xxx + Regex GenericRegPattern("^s3_([0-7])_c(1[15])_c([0-9]|1[0-5])_([0-7])$"); + + SmallVector Ops; + if (!GenericRegPattern.match(NameLower, &Ops)) { + Valid = false; + return -1; + } + + uint32_t Op0 = 3, Op1 = 0, CRn = 0, CRm = 0, Op2 = 0; + uint32_t Bits; + Ops[1].getAsInteger(10, Op1); + Ops[2].getAsInteger(10, CRn); + Ops[3].getAsInteger(10, CRm); + Ops[4].getAsInteger(10, Op2); + Bits = (Op0 << 14) | (Op1 << 11) | (CRn << 7) | (CRm << 3) | Op2; + + Valid = true; + return Bits; +} + +std::string +AArch64SysReg::SysRegMapper::toString(uint32_t Bits, bool &Valid) const { + // First search the registers shared by all + for (unsigned i = 0; i < array_lengthof(SysRegPairs); ++i) { + if (SysRegPairs[i].Value == Bits) { + Valid = true; + return SysRegPairs[i].Name; + } + } + + // Next search for target specific registers + if (FeatureBits & AArch64::ProcCyclone) { + for (unsigned i = 0; i < array_lengthof(CycloneSysRegPairs); ++i) { + if (CycloneSysRegPairs[i].Value == Bits) { + Valid = true; + return CycloneSysRegPairs[i].Name; + } + } + } + + // Now try the instruction-specific registers (either read-only or + // write-only). + for (unsigned i = 0; i < NumInstPairs; ++i) { + if (InstPairs[i].Value == Bits) { + Valid = true; + return InstPairs[i].Name; + } + } + + uint32_t Op0 = (Bits >> 14) & 0x3; + uint32_t Op1 = (Bits >> 11) & 0x7; + uint32_t CRn = (Bits >> 7) & 0xf; + uint32_t CRm = (Bits >> 3) & 0xf; + uint32_t Op2 = Bits & 0x7; + + // Only combinations matching: 11 xxx 1x11 xxxx xxx are valid for a generic + // name. + if (Op0 != 3 || (CRn != 11 && CRn != 15)) { + Valid = false; + return ""; + } + + assert(Op0 == 3 && (CRn == 11 || CRn == 15) && "Invalid generic sysreg"); + + Valid = true; + return "s3_" + utostr(Op1) + "_c" + utostr(CRn) + + "_c" + utostr(CRm) + "_" + utostr(Op2); +} + +const AArch64NamedImmMapper::Mapping AArch64TLBI::TLBIMapper::TLBIPairs[] = { + {"ipas2e1is", IPAS2E1IS}, + {"ipas2le1is", IPAS2LE1IS}, + {"vmalle1is", VMALLE1IS}, + {"alle2is", ALLE2IS}, + {"alle3is", ALLE3IS}, + {"vae1is", VAE1IS}, + {"vae2is", VAE2IS}, + {"vae3is", VAE3IS}, + {"aside1is", ASIDE1IS}, + {"vaae1is", VAAE1IS}, + {"alle1is", ALLE1IS}, + {"vale1is", VALE1IS}, + {"vale2is", VALE2IS}, + {"vale3is", VALE3IS}, + {"vmalls12e1is", VMALLS12E1IS}, + {"vaale1is", VAALE1IS}, + {"ipas2e1", IPAS2E1}, + {"ipas2le1", IPAS2LE1}, + {"vmalle1", VMALLE1}, + {"alle2", ALLE2}, + {"alle3", ALLE3}, + {"vae1", VAE1}, + {"vae2", VAE2}, + {"vae3", VAE3}, + {"aside1", ASIDE1}, + {"vaae1", VAAE1}, + {"alle1", ALLE1}, + {"vale1", VALE1}, + {"vale2", VALE2}, + {"vale3", VALE3}, + {"vmalls12e1", VMALLS12E1}, + {"vaale1", VAALE1} +}; + +AArch64TLBI::TLBIMapper::TLBIMapper() + : AArch64NamedImmMapper(TLBIPairs, 0) {} diff --git a/lib/Target/AArch64/Utils/AArch64BaseInfo.h b/lib/Target/AArch64/Utils/AArch64BaseInfo.h new file mode 100644 index 00000000000..9e4c389cc2e --- /dev/null +++ b/lib/Target/AArch64/Utils/AArch64BaseInfo.h @@ -0,0 +1,1294 @@ +//===-- AArch64BaseInfo.h - Top level definitions for AArch64 ---*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
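Both the named tables and the generic s3_<op1>_c<n>_c<m>_<op2> fallback above reduce to the same 16-bit packing of the five system-register operand fields, (Op0 << 14) | (Op1 << 11) | (CRn << 7) | (CRm << 3) | Op2, which is also the layout spelled out in the bit-pattern comments throughout AArch64BaseInfo.h. A minimal standalone sketch of that packing (the helper name packSysReg is illustrative only, not part of the tree):

    // Sketch of the 16-bit sysreg encoding used by fromString()/toString():
    // (Op0 << 14) | (Op1 << 11) | (CRn << 7) | (CRm << 3) | Op2.
    #include <cassert>
    #include <cstdint>

    static uint32_t packSysReg(uint32_t Op0, uint32_t Op1, uint32_t CRn,
                               uint32_t CRm, uint32_t Op2) {
      return (Op0 << 14) | (Op1 << 11) | (CRn << 7) | (CRm << 3) | Op2;
    }

    int main() {
      // NZCV is op0=3, op1=3, CRn=4, CRm=2, op2=0; compare the
      // "11 011 0100 0010 000" comment next to NZCV = 0xda10 below.
      assert(packSysReg(3, 3, 4, 2, 0) == 0xda10);

      // A name with no alias, e.g. "s3_0_c11_c5_1", uses the same packing:
      // op0=3, op1=0, CRn=11, CRm=5, op2=1.
      uint32_t Bits = packSysReg(3, 0, 11, 5, 1);
      assert(((Bits >> 14) & 0x3) == 3);  // Op0 field round-trips
      assert(((Bits >> 7) & 0xf) == 11);  // CRn field round-trips
      return 0;
    }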
+// +//===----------------------------------------------------------------------===// +// +// This file contains small standalone helper functions and enum definitions for +// the AArch64 target useful for the compiler back-end and the MC libraries. +// As such, it deliberately does not include references to LLVM core +// code gen types, passes, etc.. +// +//===----------------------------------------------------------------------===// + +#ifndef AArch64BASEINFO_H +#define AArch64BASEINFO_H + +// FIXME: Is it easiest to fix this layering violation by moving the .inc +// #includes from AArch64MCTargetDesc.h to here? +#include "MCTargetDesc/AArch64MCTargetDesc.h" // For AArch64::X0 and friends. +#include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/StringSwitch.h" +#include "llvm/Support/ErrorHandling.h" + +namespace llvm { + +inline static unsigned getWRegFromXReg(unsigned Reg) { + switch (Reg) { + case AArch64::X0: return AArch64::W0; + case AArch64::X1: return AArch64::W1; + case AArch64::X2: return AArch64::W2; + case AArch64::X3: return AArch64::W3; + case AArch64::X4: return AArch64::W4; + case AArch64::X5: return AArch64::W5; + case AArch64::X6: return AArch64::W6; + case AArch64::X7: return AArch64::W7; + case AArch64::X8: return AArch64::W8; + case AArch64::X9: return AArch64::W9; + case AArch64::X10: return AArch64::W10; + case AArch64::X11: return AArch64::W11; + case AArch64::X12: return AArch64::W12; + case AArch64::X13: return AArch64::W13; + case AArch64::X14: return AArch64::W14; + case AArch64::X15: return AArch64::W15; + case AArch64::X16: return AArch64::W16; + case AArch64::X17: return AArch64::W17; + case AArch64::X18: return AArch64::W18; + case AArch64::X19: return AArch64::W19; + case AArch64::X20: return AArch64::W20; + case AArch64::X21: return AArch64::W21; + case AArch64::X22: return AArch64::W22; + case AArch64::X23: return AArch64::W23; + case AArch64::X24: return AArch64::W24; + case AArch64::X25: return AArch64::W25; + case AArch64::X26: return AArch64::W26; + case AArch64::X27: return AArch64::W27; + case AArch64::X28: return AArch64::W28; + case AArch64::FP: return AArch64::W29; + case AArch64::LR: return AArch64::W30; + case AArch64::SP: return AArch64::WSP; + case AArch64::XZR: return AArch64::WZR; + } + // For anything else, return it unchanged. 
+ return Reg; +} + +inline static unsigned getXRegFromWReg(unsigned Reg) { + switch (Reg) { + case AArch64::W0: return AArch64::X0; + case AArch64::W1: return AArch64::X1; + case AArch64::W2: return AArch64::X2; + case AArch64::W3: return AArch64::X3; + case AArch64::W4: return AArch64::X4; + case AArch64::W5: return AArch64::X5; + case AArch64::W6: return AArch64::X6; + case AArch64::W7: return AArch64::X7; + case AArch64::W8: return AArch64::X8; + case AArch64::W9: return AArch64::X9; + case AArch64::W10: return AArch64::X10; + case AArch64::W11: return AArch64::X11; + case AArch64::W12: return AArch64::X12; + case AArch64::W13: return AArch64::X13; + case AArch64::W14: return AArch64::X14; + case AArch64::W15: return AArch64::X15; + case AArch64::W16: return AArch64::X16; + case AArch64::W17: return AArch64::X17; + case AArch64::W18: return AArch64::X18; + case AArch64::W19: return AArch64::X19; + case AArch64::W20: return AArch64::X20; + case AArch64::W21: return AArch64::X21; + case AArch64::W22: return AArch64::X22; + case AArch64::W23: return AArch64::X23; + case AArch64::W24: return AArch64::X24; + case AArch64::W25: return AArch64::X25; + case AArch64::W26: return AArch64::X26; + case AArch64::W27: return AArch64::X27; + case AArch64::W28: return AArch64::X28; + case AArch64::W29: return AArch64::FP; + case AArch64::W30: return AArch64::LR; + case AArch64::WSP: return AArch64::SP; + case AArch64::WZR: return AArch64::XZR; + } + // For anything else, return it unchanged. + return Reg; +} + +static inline unsigned getBRegFromDReg(unsigned Reg) { + switch (Reg) { + case AArch64::D0: return AArch64::B0; + case AArch64::D1: return AArch64::B1; + case AArch64::D2: return AArch64::B2; + case AArch64::D3: return AArch64::B3; + case AArch64::D4: return AArch64::B4; + case AArch64::D5: return AArch64::B5; + case AArch64::D6: return AArch64::B6; + case AArch64::D7: return AArch64::B7; + case AArch64::D8: return AArch64::B8; + case AArch64::D9: return AArch64::B9; + case AArch64::D10: return AArch64::B10; + case AArch64::D11: return AArch64::B11; + case AArch64::D12: return AArch64::B12; + case AArch64::D13: return AArch64::B13; + case AArch64::D14: return AArch64::B14; + case AArch64::D15: return AArch64::B15; + case AArch64::D16: return AArch64::B16; + case AArch64::D17: return AArch64::B17; + case AArch64::D18: return AArch64::B18; + case AArch64::D19: return AArch64::B19; + case AArch64::D20: return AArch64::B20; + case AArch64::D21: return AArch64::B21; + case AArch64::D22: return AArch64::B22; + case AArch64::D23: return AArch64::B23; + case AArch64::D24: return AArch64::B24; + case AArch64::D25: return AArch64::B25; + case AArch64::D26: return AArch64::B26; + case AArch64::D27: return AArch64::B27; + case AArch64::D28: return AArch64::B28; + case AArch64::D29: return AArch64::B29; + case AArch64::D30: return AArch64::B30; + case AArch64::D31: return AArch64::B31; + } + // For anything else, return it unchanged. 
+ return Reg; +} + + +static inline unsigned getDRegFromBReg(unsigned Reg) { + switch (Reg) { + case AArch64::B0: return AArch64::D0; + case AArch64::B1: return AArch64::D1; + case AArch64::B2: return AArch64::D2; + case AArch64::B3: return AArch64::D3; + case AArch64::B4: return AArch64::D4; + case AArch64::B5: return AArch64::D5; + case AArch64::B6: return AArch64::D6; + case AArch64::B7: return AArch64::D7; + case AArch64::B8: return AArch64::D8; + case AArch64::B9: return AArch64::D9; + case AArch64::B10: return AArch64::D10; + case AArch64::B11: return AArch64::D11; + case AArch64::B12: return AArch64::D12; + case AArch64::B13: return AArch64::D13; + case AArch64::B14: return AArch64::D14; + case AArch64::B15: return AArch64::D15; + case AArch64::B16: return AArch64::D16; + case AArch64::B17: return AArch64::D17; + case AArch64::B18: return AArch64::D18; + case AArch64::B19: return AArch64::D19; + case AArch64::B20: return AArch64::D20; + case AArch64::B21: return AArch64::D21; + case AArch64::B22: return AArch64::D22; + case AArch64::B23: return AArch64::D23; + case AArch64::B24: return AArch64::D24; + case AArch64::B25: return AArch64::D25; + case AArch64::B26: return AArch64::D26; + case AArch64::B27: return AArch64::D27; + case AArch64::B28: return AArch64::D28; + case AArch64::B29: return AArch64::D29; + case AArch64::B30: return AArch64::D30; + case AArch64::B31: return AArch64::D31; + } + // For anything else, return it unchanged. + return Reg; +} + +namespace AArch64CC { + +// The CondCodes constants map directly to the 4-bit encoding of the condition +// field for predicated instructions. +enum CondCode { // Meaning (integer) Meaning (floating-point) + EQ = 0x0, // Equal Equal + NE = 0x1, // Not equal Not equal, or unordered + HS = 0x2, // Unsigned higher or same >, ==, or unordered + LO = 0x3, // Unsigned lower Less than + MI = 0x4, // Minus, negative Less than + PL = 0x5, // Plus, positive or zero >, ==, or unordered + VS = 0x6, // Overflow Unordered + VC = 0x7, // No overflow Not unordered + HI = 0x8, // Unsigned higher Greater than, or unordered + LS = 0x9, // Unsigned lower or same Less than or equal + GE = 0xa, // Greater than or equal Greater than or equal + LT = 0xb, // Less than Less than, or unordered + GT = 0xc, // Greater than Greater than + LE = 0xd, // Less than or equal <, ==, or unordered + AL = 0xe, // Always (unconditional) Always (unconditional) + NV = 0xf, // Always (unconditional) Always (unconditional) + // Note the NV exists purely to disassemble 0b1111. Execution is "always". 
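One property of these 4-bit encodings worth noting: each condition and its logical inverse differ only in the low bit (EQ/NE, HS/LO, MI/PL, VS/VC, HI/LS, GE/LT, GT/LE), which is what the getInvertedCondCode() switch further down spells out case by case. A small standalone sketch of that property (the enum here is a trimmed illustrative copy, not the LLVM definition):

    // Inverting a condition code in the range EQ..LE toggles the low bit;
    // AL/NV (0xe/0xf) are excluded, as in getInvertedCondCode().
    #include <cassert>

    enum CondCode { EQ = 0x0, NE = 0x1, HS = 0x2, LO = 0x3, GE = 0xa, LT = 0xb };

    static CondCode invert(CondCode Code) {
      return static_cast<CondCode>(Code ^ 0x1); // valid for EQ..LE only
    }

    int main() {
      assert(invert(EQ) == NE);
      assert(invert(HS) == LO);
      assert(invert(GE) == LT);
      return 0;
    }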
+ Invalid +}; + +inline static const char *getCondCodeName(CondCode Code) { + switch (Code) { + default: llvm_unreachable("Unknown condition code"); + case EQ: return "eq"; + case NE: return "ne"; + case HS: return "hs"; + case LO: return "lo"; + case MI: return "mi"; + case PL: return "pl"; + case VS: return "vs"; + case VC: return "vc"; + case HI: return "hi"; + case LS: return "ls"; + case GE: return "ge"; + case LT: return "lt"; + case GT: return "gt"; + case LE: return "le"; + case AL: return "al"; + case NV: return "nv"; + } +} + +inline static CondCode getInvertedCondCode(CondCode Code) { + switch (Code) { + default: llvm_unreachable("Unknown condition code"); + case EQ: return NE; + case NE: return EQ; + case HS: return LO; + case LO: return HS; + case MI: return PL; + case PL: return MI; + case VS: return VC; + case VC: return VS; + case HI: return LS; + case LS: return HI; + case GE: return LT; + case LT: return GE; + case GT: return LE; + case LE: return GT; + } +} + +/// Given a condition code, return NZCV flags that would satisfy that condition. +/// The flag bits are in the format expected by the ccmp instructions. +/// Note that many different flag settings can satisfy a given condition code, +/// this function just returns one of them. +inline static unsigned getNZCVToSatisfyCondCode(CondCode Code) { + // NZCV flags encoded as expected by ccmp instructions, ARMv8 ISA 5.5.7. + enum { N = 8, Z = 4, C = 2, V = 1 }; + switch (Code) { + default: llvm_unreachable("Unknown condition code"); + case EQ: return Z; // Z == 1 + case NE: return 0; // Z == 0 + case HS: return C; // C == 1 + case LO: return 0; // C == 0 + case MI: return N; // N == 1 + case PL: return 0; // N == 0 + case VS: return V; // V == 1 + case VC: return 0; // V == 0 + case HI: return C; // C == 1 && Z == 0 + case LS: return 0; // C == 0 || Z == 1 + case GE: return 0; // N == V + case LT: return N; // N != V + case GT: return 0; // Z == 0 && N == V + case LE: return Z; // Z == 1 || N != V + } +} +} // end namespace AArch64CC + +/// Instances of this class can perform bidirectional mapping from random +/// identifier strings to operand encodings. For example "MSR" takes a named +/// system-register which must be encoded somehow and decoded for printing. This +/// central location means that the information for those transformations is not +/// duplicated and remains in sync. +/// +/// FIXME: currently the algorithm is a completely unoptimised linear +/// search. Obviously this could be improved, but we would probably want to work +/// out just how often these instructions are emitted before working on it. It +/// might even be optimal to just reorder the tables for the common instructions +/// rather than changing the algorithm. +struct AArch64NamedImmMapper { + struct Mapping { + const char *Name; + uint32_t Value; + }; + + template + AArch64NamedImmMapper(const Mapping (&Pairs)[N], uint32_t TooBigImm) + : Pairs(&Pairs[0]), NumPairs(N), TooBigImm(TooBigImm) {} + + StringRef toString(uint32_t Value, bool &Valid) const; + uint32_t fromString(StringRef Name, bool &Valid) const; + + /// Many of the instructions allow an alternative assembly form consisting of + /// a simple immediate. Currently the only valid forms are ranges [0, N) where + /// N being 0 indicates no immediate syntax-form is allowed. 
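As the comment above notes, each mapper is just a pair of linear searches over a small name/value table. A stripped-down standalone sketch of that scheme, using two real TLBI entries from this patch (vmalle1is = 0x4418, vae1is = 0x4419); the free functions here are illustrative and not part of the tree:

    // Bidirectional name <-> encoding lookup by linear search, mirroring the
    // AArch64NamedImmMapper approach on a two-entry table.
    #include <cassert>
    #include <cstdint>
    #include <cstring>

    struct Mapping { const char *Name; uint32_t Value; };

    static const Mapping TLBIPairs[] = {
      {"vmalle1is", 0x4418},
      {"vae1is",    0x4419},
    };

    static bool fromString(const char *Name, uint32_t &Value) {
      for (const Mapping &M : TLBIPairs)
        if (std::strcmp(M.Name, Name) == 0) { Value = M.Value; return true; }
      return false;
    }

    static const char *toString(uint32_t Value) {
      for (const Mapping &M : TLBIPairs)
        if (M.Value == Value) return M.Name;
      return nullptr;
    }

    int main() {
      uint32_t V;
      assert(fromString("vmalle1is", V) && V == 0x4418);
      assert(std::strcmp(toString(0x4419), "vae1is") == 0);
      return 0;
    }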
+ bool validImm(uint32_t Value) const; +protected: + const Mapping *Pairs; + size_t NumPairs; + uint32_t TooBigImm; +}; + +namespace AArch64AT { + enum ATValues { + Invalid = -1, // Op0 Op1 CRn CRm Op2 + S1E1R = 0x43c0, // 01 000 0111 1000 000 + S1E2R = 0x63c0, // 01 100 0111 1000 000 + S1E3R = 0x73c0, // 01 110 0111 1000 000 + S1E1W = 0x43c1, // 01 000 0111 1000 001 + S1E2W = 0x63c1, // 01 100 0111 1000 001 + S1E3W = 0x73c1, // 01 110 0111 1000 001 + S1E0R = 0x43c2, // 01 000 0111 1000 010 + S1E0W = 0x43c3, // 01 000 0111 1000 011 + S12E1R = 0x63c4, // 01 100 0111 1000 100 + S12E1W = 0x63c5, // 01 100 0111 1000 101 + S12E0R = 0x63c6, // 01 100 0111 1000 110 + S12E0W = 0x63c7 // 01 100 0111 1000 111 + }; + + struct ATMapper : AArch64NamedImmMapper { + const static Mapping ATPairs[]; + + ATMapper(); + }; + +} +namespace AArch64DB { + enum DBValues { + Invalid = -1, + OSHLD = 0x1, + OSHST = 0x2, + OSH = 0x3, + NSHLD = 0x5, + NSHST = 0x6, + NSH = 0x7, + ISHLD = 0x9, + ISHST = 0xa, + ISH = 0xb, + LD = 0xd, + ST = 0xe, + SY = 0xf + }; + + struct DBarrierMapper : AArch64NamedImmMapper { + const static Mapping DBarrierPairs[]; + + DBarrierMapper(); + }; +} + +namespace AArch64DC { + enum DCValues { + Invalid = -1, // Op1 CRn CRm Op2 + ZVA = 0x5ba1, // 01 011 0111 0100 001 + IVAC = 0x43b1, // 01 000 0111 0110 001 + ISW = 0x43b2, // 01 000 0111 0110 010 + CVAC = 0x5bd1, // 01 011 0111 1010 001 + CSW = 0x43d2, // 01 000 0111 1010 010 + CVAU = 0x5bd9, // 01 011 0111 1011 001 + CIVAC = 0x5bf1, // 01 011 0111 1110 001 + CISW = 0x43f2 // 01 000 0111 1110 010 + }; + + struct DCMapper : AArch64NamedImmMapper { + const static Mapping DCPairs[]; + + DCMapper(); + }; + +} + +namespace AArch64IC { + enum ICValues { + Invalid = -1, // Op1 CRn CRm Op2 + IALLUIS = 0x0388, // 000 0111 0001 000 + IALLU = 0x03a8, // 000 0111 0101 000 + IVAU = 0x1ba9 // 011 0111 0101 001 + }; + + + struct ICMapper : AArch64NamedImmMapper { + const static Mapping ICPairs[]; + + ICMapper(); + }; + + static inline bool NeedsRegister(ICValues Val) { + return Val == IVAU; + } +} + +namespace AArch64ISB { + enum ISBValues { + Invalid = -1, + SY = 0xf + }; + struct ISBMapper : AArch64NamedImmMapper { + const static Mapping ISBPairs[]; + + ISBMapper(); + }; +} + +namespace AArch64PRFM { + enum PRFMValues { + Invalid = -1, + PLDL1KEEP = 0x00, + PLDL1STRM = 0x01, + PLDL2KEEP = 0x02, + PLDL2STRM = 0x03, + PLDL3KEEP = 0x04, + PLDL3STRM = 0x05, + PLIL1KEEP = 0x08, + PLIL1STRM = 0x09, + PLIL2KEEP = 0x0a, + PLIL2STRM = 0x0b, + PLIL3KEEP = 0x0c, + PLIL3STRM = 0x0d, + PSTL1KEEP = 0x10, + PSTL1STRM = 0x11, + PSTL2KEEP = 0x12, + PSTL2STRM = 0x13, + PSTL3KEEP = 0x14, + PSTL3STRM = 0x15 + }; + + struct PRFMMapper : AArch64NamedImmMapper { + const static Mapping PRFMPairs[]; + + PRFMMapper(); + }; +} + +namespace AArch64PState { + enum PStateValues { + Invalid = -1, + SPSel = 0x05, + DAIFSet = 0x1e, + DAIFClr = 0x1f + }; + + struct PStateMapper : AArch64NamedImmMapper { + const static Mapping PStatePairs[]; + + PStateMapper(); + }; + +} + +namespace AArch64SE { + enum ShiftExtSpecifiers { + Invalid = -1, + LSL, + MSL, + LSR, + ASR, + ROR, + + UXTB, + UXTH, + UXTW, + UXTX, + + SXTB, + SXTH, + SXTW, + SXTX + }; +} + +namespace AArch64Layout { + enum VectorLayout { + Invalid = -1, + VL_8B, + VL_4H, + VL_2S, + VL_1D, + + VL_16B, + VL_8H, + VL_4S, + VL_2D, + + // Bare layout for the 128-bit vector + // (only show ".b", ".h", ".s", ".d" without vector number) + VL_B, + VL_H, + VL_S, + VL_D + }; +} + +inline static const char * 
+AArch64VectorLayoutToString(AArch64Layout::VectorLayout Layout) { + switch (Layout) { + case AArch64Layout::VL_8B: return ".8b"; + case AArch64Layout::VL_4H: return ".4h"; + case AArch64Layout::VL_2S: return ".2s"; + case AArch64Layout::VL_1D: return ".1d"; + case AArch64Layout::VL_16B: return ".16b"; + case AArch64Layout::VL_8H: return ".8h"; + case AArch64Layout::VL_4S: return ".4s"; + case AArch64Layout::VL_2D: return ".2d"; + case AArch64Layout::VL_B: return ".b"; + case AArch64Layout::VL_H: return ".h"; + case AArch64Layout::VL_S: return ".s"; + case AArch64Layout::VL_D: return ".d"; + default: llvm_unreachable("Unknown Vector Layout"); + } +} + +inline static AArch64Layout::VectorLayout +AArch64StringToVectorLayout(StringRef LayoutStr) { + return StringSwitch(LayoutStr) + .Case(".8b", AArch64Layout::VL_8B) + .Case(".4h", AArch64Layout::VL_4H) + .Case(".2s", AArch64Layout::VL_2S) + .Case(".1d", AArch64Layout::VL_1D) + .Case(".16b", AArch64Layout::VL_16B) + .Case(".8h", AArch64Layout::VL_8H) + .Case(".4s", AArch64Layout::VL_4S) + .Case(".2d", AArch64Layout::VL_2D) + .Case(".b", AArch64Layout::VL_B) + .Case(".h", AArch64Layout::VL_H) + .Case(".s", AArch64Layout::VL_S) + .Case(".d", AArch64Layout::VL_D) + .Default(AArch64Layout::Invalid); +} + +namespace AArch64SysReg { + enum SysRegROValues { + MDCCSR_EL0 = 0x9808, // 10 011 0000 0001 000 + DBGDTRRX_EL0 = 0x9828, // 10 011 0000 0101 000 + MDRAR_EL1 = 0x8080, // 10 000 0001 0000 000 + OSLSR_EL1 = 0x808c, // 10 000 0001 0001 100 + DBGAUTHSTATUS_EL1 = 0x83f6, // 10 000 0111 1110 110 + PMCEID0_EL0 = 0xdce6, // 11 011 1001 1100 110 + PMCEID1_EL0 = 0xdce7, // 11 011 1001 1100 111 + MIDR_EL1 = 0xc000, // 11 000 0000 0000 000 + CCSIDR_EL1 = 0xc800, // 11 001 0000 0000 000 + CLIDR_EL1 = 0xc801, // 11 001 0000 0000 001 + CTR_EL0 = 0xd801, // 11 011 0000 0000 001 + MPIDR_EL1 = 0xc005, // 11 000 0000 0000 101 + REVIDR_EL1 = 0xc006, // 11 000 0000 0000 110 + AIDR_EL1 = 0xc807, // 11 001 0000 0000 111 + DCZID_EL0 = 0xd807, // 11 011 0000 0000 111 + ID_PFR0_EL1 = 0xc008, // 11 000 0000 0001 000 + ID_PFR1_EL1 = 0xc009, // 11 000 0000 0001 001 + ID_DFR0_EL1 = 0xc00a, // 11 000 0000 0001 010 + ID_AFR0_EL1 = 0xc00b, // 11 000 0000 0001 011 + ID_MMFR0_EL1 = 0xc00c, // 11 000 0000 0001 100 + ID_MMFR1_EL1 = 0xc00d, // 11 000 0000 0001 101 + ID_MMFR2_EL1 = 0xc00e, // 11 000 0000 0001 110 + ID_MMFR3_EL1 = 0xc00f, // 11 000 0000 0001 111 + ID_ISAR0_EL1 = 0xc010, // 11 000 0000 0010 000 + ID_ISAR1_EL1 = 0xc011, // 11 000 0000 0010 001 + ID_ISAR2_EL1 = 0xc012, // 11 000 0000 0010 010 + ID_ISAR3_EL1 = 0xc013, // 11 000 0000 0010 011 + ID_ISAR4_EL1 = 0xc014, // 11 000 0000 0010 100 + ID_ISAR5_EL1 = 0xc015, // 11 000 0000 0010 101 + ID_A64PFR0_EL1 = 0xc020, // 11 000 0000 0100 000 + ID_A64PFR1_EL1 = 0xc021, // 11 000 0000 0100 001 + ID_A64DFR0_EL1 = 0xc028, // 11 000 0000 0101 000 + ID_A64DFR1_EL1 = 0xc029, // 11 000 0000 0101 001 + ID_A64AFR0_EL1 = 0xc02c, // 11 000 0000 0101 100 + ID_A64AFR1_EL1 = 0xc02d, // 11 000 0000 0101 101 + ID_A64ISAR0_EL1 = 0xc030, // 11 000 0000 0110 000 + ID_A64ISAR1_EL1 = 0xc031, // 11 000 0000 0110 001 + ID_A64MMFR0_EL1 = 0xc038, // 11 000 0000 0111 000 + ID_A64MMFR1_EL1 = 0xc039, // 11 000 0000 0111 001 + MVFR0_EL1 = 0xc018, // 11 000 0000 0011 000 + MVFR1_EL1 = 0xc019, // 11 000 0000 0011 001 + MVFR2_EL1 = 0xc01a, // 11 000 0000 0011 010 + RVBAR_EL1 = 0xc601, // 11 000 1100 0000 001 + RVBAR_EL2 = 0xe601, // 11 100 1100 0000 001 + RVBAR_EL3 = 0xf601, // 11 110 1100 0000 001 + ISR_EL1 = 0xc608, // 11 000 1100 0001 000 + CNTPCT_EL0 = 
0xdf01, // 11 011 1110 0000 001 + CNTVCT_EL0 = 0xdf02, // 11 011 1110 0000 010 + + // Trace registers + TRCSTATR = 0x8818, // 10 001 0000 0011 000 + TRCIDR8 = 0x8806, // 10 001 0000 0000 110 + TRCIDR9 = 0x880e, // 10 001 0000 0001 110 + TRCIDR10 = 0x8816, // 10 001 0000 0010 110 + TRCIDR11 = 0x881e, // 10 001 0000 0011 110 + TRCIDR12 = 0x8826, // 10 001 0000 0100 110 + TRCIDR13 = 0x882e, // 10 001 0000 0101 110 + TRCIDR0 = 0x8847, // 10 001 0000 1000 111 + TRCIDR1 = 0x884f, // 10 001 0000 1001 111 + TRCIDR2 = 0x8857, // 10 001 0000 1010 111 + TRCIDR3 = 0x885f, // 10 001 0000 1011 111 + TRCIDR4 = 0x8867, // 10 001 0000 1100 111 + TRCIDR5 = 0x886f, // 10 001 0000 1101 111 + TRCIDR6 = 0x8877, // 10 001 0000 1110 111 + TRCIDR7 = 0x887f, // 10 001 0000 1111 111 + TRCOSLSR = 0x888c, // 10 001 0001 0001 100 + TRCPDSR = 0x88ac, // 10 001 0001 0101 100 + TRCDEVAFF0 = 0x8bd6, // 10 001 0111 1010 110 + TRCDEVAFF1 = 0x8bde, // 10 001 0111 1011 110 + TRCLSR = 0x8bee, // 10 001 0111 1101 110 + TRCAUTHSTATUS = 0x8bf6, // 10 001 0111 1110 110 + TRCDEVARCH = 0x8bfe, // 10 001 0111 1111 110 + TRCDEVID = 0x8b97, // 10 001 0111 0010 111 + TRCDEVTYPE = 0x8b9f, // 10 001 0111 0011 111 + TRCPIDR4 = 0x8ba7, // 10 001 0111 0100 111 + TRCPIDR5 = 0x8baf, // 10 001 0111 0101 111 + TRCPIDR6 = 0x8bb7, // 10 001 0111 0110 111 + TRCPIDR7 = 0x8bbf, // 10 001 0111 0111 111 + TRCPIDR0 = 0x8bc7, // 10 001 0111 1000 111 + TRCPIDR1 = 0x8bcf, // 10 001 0111 1001 111 + TRCPIDR2 = 0x8bd7, // 10 001 0111 1010 111 + TRCPIDR3 = 0x8bdf, // 10 001 0111 1011 111 + TRCCIDR0 = 0x8be7, // 10 001 0111 1100 111 + TRCCIDR1 = 0x8bef, // 10 001 0111 1101 111 + TRCCIDR2 = 0x8bf7, // 10 001 0111 1110 111 + TRCCIDR3 = 0x8bff, // 10 001 0111 1111 111 + + // GICv3 registers + ICC_IAR1_EL1 = 0xc660, // 11 000 1100 1100 000 + ICC_IAR0_EL1 = 0xc640, // 11 000 1100 1000 000 + ICC_HPPIR1_EL1 = 0xc662, // 11 000 1100 1100 010 + ICC_HPPIR0_EL1 = 0xc642, // 11 000 1100 1000 010 + ICC_RPR_EL1 = 0xc65b, // 11 000 1100 1011 011 + ICH_VTR_EL2 = 0xe659, // 11 100 1100 1011 001 + ICH_EISR_EL2 = 0xe65b, // 11 100 1100 1011 011 + ICH_ELSR_EL2 = 0xe65d // 11 100 1100 1011 101 + }; + + enum SysRegWOValues { + DBGDTRTX_EL0 = 0x9828, // 10 011 0000 0101 000 + OSLAR_EL1 = 0x8084, // 10 000 0001 0000 100 + PMSWINC_EL0 = 0xdce4, // 11 011 1001 1100 100 + + // Trace Registers + TRCOSLAR = 0x8884, // 10 001 0001 0000 100 + TRCLAR = 0x8be6, // 10 001 0111 1100 110 + + // GICv3 registers + ICC_EOIR1_EL1 = 0xc661, // 11 000 1100 1100 001 + ICC_EOIR0_EL1 = 0xc641, // 11 000 1100 1000 001 + ICC_DIR_EL1 = 0xc659, // 11 000 1100 1011 001 + ICC_SGI1R_EL1 = 0xc65d, // 11 000 1100 1011 101 + ICC_ASGI1R_EL1 = 0xc65e, // 11 000 1100 1011 110 + ICC_SGI0R_EL1 = 0xc65f // 11 000 1100 1011 111 + }; + + enum SysRegValues { + Invalid = -1, // Op0 Op1 CRn CRm Op2 + OSDTRRX_EL1 = 0x8002, // 10 000 0000 0000 010 + OSDTRTX_EL1 = 0x801a, // 10 000 0000 0011 010 + TEECR32_EL1 = 0x9000, // 10 010 0000 0000 000 + MDCCINT_EL1 = 0x8010, // 10 000 0000 0010 000 + MDSCR_EL1 = 0x8012, // 10 000 0000 0010 010 + DBGDTR_EL0 = 0x9820, // 10 011 0000 0100 000 + OSECCR_EL1 = 0x8032, // 10 000 0000 0110 010 + DBGVCR32_EL2 = 0xa038, // 10 100 0000 0111 000 + DBGBVR0_EL1 = 0x8004, // 10 000 0000 0000 100 + DBGBVR1_EL1 = 0x800c, // 10 000 0000 0001 100 + DBGBVR2_EL1 = 0x8014, // 10 000 0000 0010 100 + DBGBVR3_EL1 = 0x801c, // 10 000 0000 0011 100 + DBGBVR4_EL1 = 0x8024, // 10 000 0000 0100 100 + DBGBVR5_EL1 = 0x802c, // 10 000 0000 0101 100 + DBGBVR6_EL1 = 0x8034, // 10 000 0000 0110 100 + DBGBVR7_EL1 = 0x803c, 
// 10 000 0000 0111 100 + DBGBVR8_EL1 = 0x8044, // 10 000 0000 1000 100 + DBGBVR9_EL1 = 0x804c, // 10 000 0000 1001 100 + DBGBVR10_EL1 = 0x8054, // 10 000 0000 1010 100 + DBGBVR11_EL1 = 0x805c, // 10 000 0000 1011 100 + DBGBVR12_EL1 = 0x8064, // 10 000 0000 1100 100 + DBGBVR13_EL1 = 0x806c, // 10 000 0000 1101 100 + DBGBVR14_EL1 = 0x8074, // 10 000 0000 1110 100 + DBGBVR15_EL1 = 0x807c, // 10 000 0000 1111 100 + DBGBCR0_EL1 = 0x8005, // 10 000 0000 0000 101 + DBGBCR1_EL1 = 0x800d, // 10 000 0000 0001 101 + DBGBCR2_EL1 = 0x8015, // 10 000 0000 0010 101 + DBGBCR3_EL1 = 0x801d, // 10 000 0000 0011 101 + DBGBCR4_EL1 = 0x8025, // 10 000 0000 0100 101 + DBGBCR5_EL1 = 0x802d, // 10 000 0000 0101 101 + DBGBCR6_EL1 = 0x8035, // 10 000 0000 0110 101 + DBGBCR7_EL1 = 0x803d, // 10 000 0000 0111 101 + DBGBCR8_EL1 = 0x8045, // 10 000 0000 1000 101 + DBGBCR9_EL1 = 0x804d, // 10 000 0000 1001 101 + DBGBCR10_EL1 = 0x8055, // 10 000 0000 1010 101 + DBGBCR11_EL1 = 0x805d, // 10 000 0000 1011 101 + DBGBCR12_EL1 = 0x8065, // 10 000 0000 1100 101 + DBGBCR13_EL1 = 0x806d, // 10 000 0000 1101 101 + DBGBCR14_EL1 = 0x8075, // 10 000 0000 1110 101 + DBGBCR15_EL1 = 0x807d, // 10 000 0000 1111 101 + DBGWVR0_EL1 = 0x8006, // 10 000 0000 0000 110 + DBGWVR1_EL1 = 0x800e, // 10 000 0000 0001 110 + DBGWVR2_EL1 = 0x8016, // 10 000 0000 0010 110 + DBGWVR3_EL1 = 0x801e, // 10 000 0000 0011 110 + DBGWVR4_EL1 = 0x8026, // 10 000 0000 0100 110 + DBGWVR5_EL1 = 0x802e, // 10 000 0000 0101 110 + DBGWVR6_EL1 = 0x8036, // 10 000 0000 0110 110 + DBGWVR7_EL1 = 0x803e, // 10 000 0000 0111 110 + DBGWVR8_EL1 = 0x8046, // 10 000 0000 1000 110 + DBGWVR9_EL1 = 0x804e, // 10 000 0000 1001 110 + DBGWVR10_EL1 = 0x8056, // 10 000 0000 1010 110 + DBGWVR11_EL1 = 0x805e, // 10 000 0000 1011 110 + DBGWVR12_EL1 = 0x8066, // 10 000 0000 1100 110 + DBGWVR13_EL1 = 0x806e, // 10 000 0000 1101 110 + DBGWVR14_EL1 = 0x8076, // 10 000 0000 1110 110 + DBGWVR15_EL1 = 0x807e, // 10 000 0000 1111 110 + DBGWCR0_EL1 = 0x8007, // 10 000 0000 0000 111 + DBGWCR1_EL1 = 0x800f, // 10 000 0000 0001 111 + DBGWCR2_EL1 = 0x8017, // 10 000 0000 0010 111 + DBGWCR3_EL1 = 0x801f, // 10 000 0000 0011 111 + DBGWCR4_EL1 = 0x8027, // 10 000 0000 0100 111 + DBGWCR5_EL1 = 0x802f, // 10 000 0000 0101 111 + DBGWCR6_EL1 = 0x8037, // 10 000 0000 0110 111 + DBGWCR7_EL1 = 0x803f, // 10 000 0000 0111 111 + DBGWCR8_EL1 = 0x8047, // 10 000 0000 1000 111 + DBGWCR9_EL1 = 0x804f, // 10 000 0000 1001 111 + DBGWCR10_EL1 = 0x8057, // 10 000 0000 1010 111 + DBGWCR11_EL1 = 0x805f, // 10 000 0000 1011 111 + DBGWCR12_EL1 = 0x8067, // 10 000 0000 1100 111 + DBGWCR13_EL1 = 0x806f, // 10 000 0000 1101 111 + DBGWCR14_EL1 = 0x8077, // 10 000 0000 1110 111 + DBGWCR15_EL1 = 0x807f, // 10 000 0000 1111 111 + TEEHBR32_EL1 = 0x9080, // 10 010 0001 0000 000 + OSDLR_EL1 = 0x809c, // 10 000 0001 0011 100 + DBGPRCR_EL1 = 0x80a4, // 10 000 0001 0100 100 + DBGCLAIMSET_EL1 = 0x83c6, // 10 000 0111 1000 110 + DBGCLAIMCLR_EL1 = 0x83ce, // 10 000 0111 1001 110 + CSSELR_EL1 = 0xd000, // 11 010 0000 0000 000 + VPIDR_EL2 = 0xe000, // 11 100 0000 0000 000 + VMPIDR_EL2 = 0xe005, // 11 100 0000 0000 101 + CPACR_EL1 = 0xc082, // 11 000 0001 0000 010 + SCTLR_EL1 = 0xc080, // 11 000 0001 0000 000 + SCTLR_EL2 = 0xe080, // 11 100 0001 0000 000 + SCTLR_EL3 = 0xf080, // 11 110 0001 0000 000 + ACTLR_EL1 = 0xc081, // 11 000 0001 0000 001 + ACTLR_EL2 = 0xe081, // 11 100 0001 0000 001 + ACTLR_EL3 = 0xf081, // 11 110 0001 0000 001 + HCR_EL2 = 0xe088, // 11 100 0001 0001 000 + SCR_EL3 = 0xf088, // 11 110 0001 0001 000 + MDCR_EL2 = 0xe089, 
// 11 100 0001 0001 001 + SDER32_EL3 = 0xf089, // 11 110 0001 0001 001 + CPTR_EL2 = 0xe08a, // 11 100 0001 0001 010 + CPTR_EL3 = 0xf08a, // 11 110 0001 0001 010 + HSTR_EL2 = 0xe08b, // 11 100 0001 0001 011 + HACR_EL2 = 0xe08f, // 11 100 0001 0001 111 + MDCR_EL3 = 0xf099, // 11 110 0001 0011 001 + TTBR0_EL1 = 0xc100, // 11 000 0010 0000 000 + TTBR0_EL2 = 0xe100, // 11 100 0010 0000 000 + TTBR0_EL3 = 0xf100, // 11 110 0010 0000 000 + TTBR1_EL1 = 0xc101, // 11 000 0010 0000 001 + TCR_EL1 = 0xc102, // 11 000 0010 0000 010 + TCR_EL2 = 0xe102, // 11 100 0010 0000 010 + TCR_EL3 = 0xf102, // 11 110 0010 0000 010 + VTTBR_EL2 = 0xe108, // 11 100 0010 0001 000 + VTCR_EL2 = 0xe10a, // 11 100 0010 0001 010 + DACR32_EL2 = 0xe180, // 11 100 0011 0000 000 + SPSR_EL1 = 0xc200, // 11 000 0100 0000 000 + SPSR_EL2 = 0xe200, // 11 100 0100 0000 000 + SPSR_EL3 = 0xf200, // 11 110 0100 0000 000 + ELR_EL1 = 0xc201, // 11 000 0100 0000 001 + ELR_EL2 = 0xe201, // 11 100 0100 0000 001 + ELR_EL3 = 0xf201, // 11 110 0100 0000 001 + SP_EL0 = 0xc208, // 11 000 0100 0001 000 + SP_EL1 = 0xe208, // 11 100 0100 0001 000 + SP_EL2 = 0xf208, // 11 110 0100 0001 000 + SPSel = 0xc210, // 11 000 0100 0010 000 + NZCV = 0xda10, // 11 011 0100 0010 000 + DAIF = 0xda11, // 11 011 0100 0010 001 + CurrentEL = 0xc212, // 11 000 0100 0010 010 + SPSR_irq = 0xe218, // 11 100 0100 0011 000 + SPSR_abt = 0xe219, // 11 100 0100 0011 001 + SPSR_und = 0xe21a, // 11 100 0100 0011 010 + SPSR_fiq = 0xe21b, // 11 100 0100 0011 011 + FPCR = 0xda20, // 11 011 0100 0100 000 + FPSR = 0xda21, // 11 011 0100 0100 001 + DSPSR_EL0 = 0xda28, // 11 011 0100 0101 000 + DLR_EL0 = 0xda29, // 11 011 0100 0101 001 + IFSR32_EL2 = 0xe281, // 11 100 0101 0000 001 + AFSR0_EL1 = 0xc288, // 11 000 0101 0001 000 + AFSR0_EL2 = 0xe288, // 11 100 0101 0001 000 + AFSR0_EL3 = 0xf288, // 11 110 0101 0001 000 + AFSR1_EL1 = 0xc289, // 11 000 0101 0001 001 + AFSR1_EL2 = 0xe289, // 11 100 0101 0001 001 + AFSR1_EL3 = 0xf289, // 11 110 0101 0001 001 + ESR_EL1 = 0xc290, // 11 000 0101 0010 000 + ESR_EL2 = 0xe290, // 11 100 0101 0010 000 + ESR_EL3 = 0xf290, // 11 110 0101 0010 000 + FPEXC32_EL2 = 0xe298, // 11 100 0101 0011 000 + FAR_EL1 = 0xc300, // 11 000 0110 0000 000 + FAR_EL2 = 0xe300, // 11 100 0110 0000 000 + FAR_EL3 = 0xf300, // 11 110 0110 0000 000 + HPFAR_EL2 = 0xe304, // 11 100 0110 0000 100 + PAR_EL1 = 0xc3a0, // 11 000 0111 0100 000 + PMCR_EL0 = 0xdce0, // 11 011 1001 1100 000 + PMCNTENSET_EL0 = 0xdce1, // 11 011 1001 1100 001 + PMCNTENCLR_EL0 = 0xdce2, // 11 011 1001 1100 010 + PMOVSCLR_EL0 = 0xdce3, // 11 011 1001 1100 011 + PMSELR_EL0 = 0xdce5, // 11 011 1001 1100 101 + PMCCNTR_EL0 = 0xdce8, // 11 011 1001 1101 000 + PMXEVTYPER_EL0 = 0xdce9, // 11 011 1001 1101 001 + PMXEVCNTR_EL0 = 0xdcea, // 11 011 1001 1101 010 + PMUSERENR_EL0 = 0xdcf0, // 11 011 1001 1110 000 + PMINTENSET_EL1 = 0xc4f1, // 11 000 1001 1110 001 + PMINTENCLR_EL1 = 0xc4f2, // 11 000 1001 1110 010 + PMOVSSET_EL0 = 0xdcf3, // 11 011 1001 1110 011 + MAIR_EL1 = 0xc510, // 11 000 1010 0010 000 + MAIR_EL2 = 0xe510, // 11 100 1010 0010 000 + MAIR_EL3 = 0xf510, // 11 110 1010 0010 000 + AMAIR_EL1 = 0xc518, // 11 000 1010 0011 000 + AMAIR_EL2 = 0xe518, // 11 100 1010 0011 000 + AMAIR_EL3 = 0xf518, // 11 110 1010 0011 000 + VBAR_EL1 = 0xc600, // 11 000 1100 0000 000 + VBAR_EL2 = 0xe600, // 11 100 1100 0000 000 + VBAR_EL3 = 0xf600, // 11 110 1100 0000 000 + RMR_EL1 = 0xc602, // 11 000 1100 0000 010 + RMR_EL2 = 0xe602, // 11 100 1100 0000 010 + RMR_EL3 = 0xf602, // 11 110 1100 0000 010 + CONTEXTIDR_EL1 = 0xc681, 
// 11 000 1101 0000 001 + TPIDR_EL0 = 0xde82, // 11 011 1101 0000 010 + TPIDR_EL2 = 0xe682, // 11 100 1101 0000 010 + TPIDR_EL3 = 0xf682, // 11 110 1101 0000 010 + TPIDRRO_EL0 = 0xde83, // 11 011 1101 0000 011 + TPIDR_EL1 = 0xc684, // 11 000 1101 0000 100 + CNTFRQ_EL0 = 0xdf00, // 11 011 1110 0000 000 + CNTVOFF_EL2 = 0xe703, // 11 100 1110 0000 011 + CNTKCTL_EL1 = 0xc708, // 11 000 1110 0001 000 + CNTHCTL_EL2 = 0xe708, // 11 100 1110 0001 000 + CNTP_TVAL_EL0 = 0xdf10, // 11 011 1110 0010 000 + CNTHP_TVAL_EL2 = 0xe710, // 11 100 1110 0010 000 + CNTPS_TVAL_EL1 = 0xff10, // 11 111 1110 0010 000 + CNTP_CTL_EL0 = 0xdf11, // 11 011 1110 0010 001 + CNTHP_CTL_EL2 = 0xe711, // 11 100 1110 0010 001 + CNTPS_CTL_EL1 = 0xff11, // 11 111 1110 0010 001 + CNTP_CVAL_EL0 = 0xdf12, // 11 011 1110 0010 010 + CNTHP_CVAL_EL2 = 0xe712, // 11 100 1110 0010 010 + CNTPS_CVAL_EL1 = 0xff12, // 11 111 1110 0010 010 + CNTV_TVAL_EL0 = 0xdf18, // 11 011 1110 0011 000 + CNTV_CTL_EL0 = 0xdf19, // 11 011 1110 0011 001 + CNTV_CVAL_EL0 = 0xdf1a, // 11 011 1110 0011 010 + PMEVCNTR0_EL0 = 0xdf40, // 11 011 1110 1000 000 + PMEVCNTR1_EL0 = 0xdf41, // 11 011 1110 1000 001 + PMEVCNTR2_EL0 = 0xdf42, // 11 011 1110 1000 010 + PMEVCNTR3_EL0 = 0xdf43, // 11 011 1110 1000 011 + PMEVCNTR4_EL0 = 0xdf44, // 11 011 1110 1000 100 + PMEVCNTR5_EL0 = 0xdf45, // 11 011 1110 1000 101 + PMEVCNTR6_EL0 = 0xdf46, // 11 011 1110 1000 110 + PMEVCNTR7_EL0 = 0xdf47, // 11 011 1110 1000 111 + PMEVCNTR8_EL0 = 0xdf48, // 11 011 1110 1001 000 + PMEVCNTR9_EL0 = 0xdf49, // 11 011 1110 1001 001 + PMEVCNTR10_EL0 = 0xdf4a, // 11 011 1110 1001 010 + PMEVCNTR11_EL0 = 0xdf4b, // 11 011 1110 1001 011 + PMEVCNTR12_EL0 = 0xdf4c, // 11 011 1110 1001 100 + PMEVCNTR13_EL0 = 0xdf4d, // 11 011 1110 1001 101 + PMEVCNTR14_EL0 = 0xdf4e, // 11 011 1110 1001 110 + PMEVCNTR15_EL0 = 0xdf4f, // 11 011 1110 1001 111 + PMEVCNTR16_EL0 = 0xdf50, // 11 011 1110 1010 000 + PMEVCNTR17_EL0 = 0xdf51, // 11 011 1110 1010 001 + PMEVCNTR18_EL0 = 0xdf52, // 11 011 1110 1010 010 + PMEVCNTR19_EL0 = 0xdf53, // 11 011 1110 1010 011 + PMEVCNTR20_EL0 = 0xdf54, // 11 011 1110 1010 100 + PMEVCNTR21_EL0 = 0xdf55, // 11 011 1110 1010 101 + PMEVCNTR22_EL0 = 0xdf56, // 11 011 1110 1010 110 + PMEVCNTR23_EL0 = 0xdf57, // 11 011 1110 1010 111 + PMEVCNTR24_EL0 = 0xdf58, // 11 011 1110 1011 000 + PMEVCNTR25_EL0 = 0xdf59, // 11 011 1110 1011 001 + PMEVCNTR26_EL0 = 0xdf5a, // 11 011 1110 1011 010 + PMEVCNTR27_EL0 = 0xdf5b, // 11 011 1110 1011 011 + PMEVCNTR28_EL0 = 0xdf5c, // 11 011 1110 1011 100 + PMEVCNTR29_EL0 = 0xdf5d, // 11 011 1110 1011 101 + PMEVCNTR30_EL0 = 0xdf5e, // 11 011 1110 1011 110 + PMCCFILTR_EL0 = 0xdf7f, // 11 011 1110 1111 111 + PMEVTYPER0_EL0 = 0xdf60, // 11 011 1110 1100 000 + PMEVTYPER1_EL0 = 0xdf61, // 11 011 1110 1100 001 + PMEVTYPER2_EL0 = 0xdf62, // 11 011 1110 1100 010 + PMEVTYPER3_EL0 = 0xdf63, // 11 011 1110 1100 011 + PMEVTYPER4_EL0 = 0xdf64, // 11 011 1110 1100 100 + PMEVTYPER5_EL0 = 0xdf65, // 11 011 1110 1100 101 + PMEVTYPER6_EL0 = 0xdf66, // 11 011 1110 1100 110 + PMEVTYPER7_EL0 = 0xdf67, // 11 011 1110 1100 111 + PMEVTYPER8_EL0 = 0xdf68, // 11 011 1110 1101 000 + PMEVTYPER9_EL0 = 0xdf69, // 11 011 1110 1101 001 + PMEVTYPER10_EL0 = 0xdf6a, // 11 011 1110 1101 010 + PMEVTYPER11_EL0 = 0xdf6b, // 11 011 1110 1101 011 + PMEVTYPER12_EL0 = 0xdf6c, // 11 011 1110 1101 100 + PMEVTYPER13_EL0 = 0xdf6d, // 11 011 1110 1101 101 + PMEVTYPER14_EL0 = 0xdf6e, // 11 011 1110 1101 110 + PMEVTYPER15_EL0 = 0xdf6f, // 11 011 1110 1101 111 + PMEVTYPER16_EL0 = 0xdf70, // 11 011 1110 1110 000 + 
PMEVTYPER17_EL0 = 0xdf71, // 11 011 1110 1110 001 + PMEVTYPER18_EL0 = 0xdf72, // 11 011 1110 1110 010 + PMEVTYPER19_EL0 = 0xdf73, // 11 011 1110 1110 011 + PMEVTYPER20_EL0 = 0xdf74, // 11 011 1110 1110 100 + PMEVTYPER21_EL0 = 0xdf75, // 11 011 1110 1110 101 + PMEVTYPER22_EL0 = 0xdf76, // 11 011 1110 1110 110 + PMEVTYPER23_EL0 = 0xdf77, // 11 011 1110 1110 111 + PMEVTYPER24_EL0 = 0xdf78, // 11 011 1110 1111 000 + PMEVTYPER25_EL0 = 0xdf79, // 11 011 1110 1111 001 + PMEVTYPER26_EL0 = 0xdf7a, // 11 011 1110 1111 010 + PMEVTYPER27_EL0 = 0xdf7b, // 11 011 1110 1111 011 + PMEVTYPER28_EL0 = 0xdf7c, // 11 011 1110 1111 100 + PMEVTYPER29_EL0 = 0xdf7d, // 11 011 1110 1111 101 + PMEVTYPER30_EL0 = 0xdf7e, // 11 011 1110 1111 110 + + // Trace registers + TRCPRGCTLR = 0x8808, // 10 001 0000 0001 000 + TRCPROCSELR = 0x8810, // 10 001 0000 0010 000 + TRCCONFIGR = 0x8820, // 10 001 0000 0100 000 + TRCAUXCTLR = 0x8830, // 10 001 0000 0110 000 + TRCEVENTCTL0R = 0x8840, // 10 001 0000 1000 000 + TRCEVENTCTL1R = 0x8848, // 10 001 0000 1001 000 + TRCSTALLCTLR = 0x8858, // 10 001 0000 1011 000 + TRCTSCTLR = 0x8860, // 10 001 0000 1100 000 + TRCSYNCPR = 0x8868, // 10 001 0000 1101 000 + TRCCCCTLR = 0x8870, // 10 001 0000 1110 000 + TRCBBCTLR = 0x8878, // 10 001 0000 1111 000 + TRCTRACEIDR = 0x8801, // 10 001 0000 0000 001 + TRCQCTLR = 0x8809, // 10 001 0000 0001 001 + TRCVICTLR = 0x8802, // 10 001 0000 0000 010 + TRCVIIECTLR = 0x880a, // 10 001 0000 0001 010 + TRCVISSCTLR = 0x8812, // 10 001 0000 0010 010 + TRCVIPCSSCTLR = 0x881a, // 10 001 0000 0011 010 + TRCVDCTLR = 0x8842, // 10 001 0000 1000 010 + TRCVDSACCTLR = 0x884a, // 10 001 0000 1001 010 + TRCVDARCCTLR = 0x8852, // 10 001 0000 1010 010 + TRCSEQEVR0 = 0x8804, // 10 001 0000 0000 100 + TRCSEQEVR1 = 0x880c, // 10 001 0000 0001 100 + TRCSEQEVR2 = 0x8814, // 10 001 0000 0010 100 + TRCSEQRSTEVR = 0x8834, // 10 001 0000 0110 100 + TRCSEQSTR = 0x883c, // 10 001 0000 0111 100 + TRCEXTINSELR = 0x8844, // 10 001 0000 1000 100 + TRCCNTRLDVR0 = 0x8805, // 10 001 0000 0000 101 + TRCCNTRLDVR1 = 0x880d, // 10 001 0000 0001 101 + TRCCNTRLDVR2 = 0x8815, // 10 001 0000 0010 101 + TRCCNTRLDVR3 = 0x881d, // 10 001 0000 0011 101 + TRCCNTCTLR0 = 0x8825, // 10 001 0000 0100 101 + TRCCNTCTLR1 = 0x882d, // 10 001 0000 0101 101 + TRCCNTCTLR2 = 0x8835, // 10 001 0000 0110 101 + TRCCNTCTLR3 = 0x883d, // 10 001 0000 0111 101 + TRCCNTVR0 = 0x8845, // 10 001 0000 1000 101 + TRCCNTVR1 = 0x884d, // 10 001 0000 1001 101 + TRCCNTVR2 = 0x8855, // 10 001 0000 1010 101 + TRCCNTVR3 = 0x885d, // 10 001 0000 1011 101 + TRCIMSPEC0 = 0x8807, // 10 001 0000 0000 111 + TRCIMSPEC1 = 0x880f, // 10 001 0000 0001 111 + TRCIMSPEC2 = 0x8817, // 10 001 0000 0010 111 + TRCIMSPEC3 = 0x881f, // 10 001 0000 0011 111 + TRCIMSPEC4 = 0x8827, // 10 001 0000 0100 111 + TRCIMSPEC5 = 0x882f, // 10 001 0000 0101 111 + TRCIMSPEC6 = 0x8837, // 10 001 0000 0110 111 + TRCIMSPEC7 = 0x883f, // 10 001 0000 0111 111 + TRCRSCTLR2 = 0x8890, // 10 001 0001 0010 000 + TRCRSCTLR3 = 0x8898, // 10 001 0001 0011 000 + TRCRSCTLR4 = 0x88a0, // 10 001 0001 0100 000 + TRCRSCTLR5 = 0x88a8, // 10 001 0001 0101 000 + TRCRSCTLR6 = 0x88b0, // 10 001 0001 0110 000 + TRCRSCTLR7 = 0x88b8, // 10 001 0001 0111 000 + TRCRSCTLR8 = 0x88c0, // 10 001 0001 1000 000 + TRCRSCTLR9 = 0x88c8, // 10 001 0001 1001 000 + TRCRSCTLR10 = 0x88d0, // 10 001 0001 1010 000 + TRCRSCTLR11 = 0x88d8, // 10 001 0001 1011 000 + TRCRSCTLR12 = 0x88e0, // 10 001 0001 1100 000 + TRCRSCTLR13 = 0x88e8, // 10 001 0001 1101 000 + TRCRSCTLR14 = 0x88f0, // 10 001 0001 1110 000 + 
TRCRSCTLR15 = 0x88f8, // 10 001 0001 1111 000 + TRCRSCTLR16 = 0x8881, // 10 001 0001 0000 001 + TRCRSCTLR17 = 0x8889, // 10 001 0001 0001 001 + TRCRSCTLR18 = 0x8891, // 10 001 0001 0010 001 + TRCRSCTLR19 = 0x8899, // 10 001 0001 0011 001 + TRCRSCTLR20 = 0x88a1, // 10 001 0001 0100 001 + TRCRSCTLR21 = 0x88a9, // 10 001 0001 0101 001 + TRCRSCTLR22 = 0x88b1, // 10 001 0001 0110 001 + TRCRSCTLR23 = 0x88b9, // 10 001 0001 0111 001 + TRCRSCTLR24 = 0x88c1, // 10 001 0001 1000 001 + TRCRSCTLR25 = 0x88c9, // 10 001 0001 1001 001 + TRCRSCTLR26 = 0x88d1, // 10 001 0001 1010 001 + TRCRSCTLR27 = 0x88d9, // 10 001 0001 1011 001 + TRCRSCTLR28 = 0x88e1, // 10 001 0001 1100 001 + TRCRSCTLR29 = 0x88e9, // 10 001 0001 1101 001 + TRCRSCTLR30 = 0x88f1, // 10 001 0001 1110 001 + TRCRSCTLR31 = 0x88f9, // 10 001 0001 1111 001 + TRCSSCCR0 = 0x8882, // 10 001 0001 0000 010 + TRCSSCCR1 = 0x888a, // 10 001 0001 0001 010 + TRCSSCCR2 = 0x8892, // 10 001 0001 0010 010 + TRCSSCCR3 = 0x889a, // 10 001 0001 0011 010 + TRCSSCCR4 = 0x88a2, // 10 001 0001 0100 010 + TRCSSCCR5 = 0x88aa, // 10 001 0001 0101 010 + TRCSSCCR6 = 0x88b2, // 10 001 0001 0110 010 + TRCSSCCR7 = 0x88ba, // 10 001 0001 0111 010 + TRCSSCSR0 = 0x88c2, // 10 001 0001 1000 010 + TRCSSCSR1 = 0x88ca, // 10 001 0001 1001 010 + TRCSSCSR2 = 0x88d2, // 10 001 0001 1010 010 + TRCSSCSR3 = 0x88da, // 10 001 0001 1011 010 + TRCSSCSR4 = 0x88e2, // 10 001 0001 1100 010 + TRCSSCSR5 = 0x88ea, // 10 001 0001 1101 010 + TRCSSCSR6 = 0x88f2, // 10 001 0001 1110 010 + TRCSSCSR7 = 0x88fa, // 10 001 0001 1111 010 + TRCSSPCICR0 = 0x8883, // 10 001 0001 0000 011 + TRCSSPCICR1 = 0x888b, // 10 001 0001 0001 011 + TRCSSPCICR2 = 0x8893, // 10 001 0001 0010 011 + TRCSSPCICR3 = 0x889b, // 10 001 0001 0011 011 + TRCSSPCICR4 = 0x88a3, // 10 001 0001 0100 011 + TRCSSPCICR5 = 0x88ab, // 10 001 0001 0101 011 + TRCSSPCICR6 = 0x88b3, // 10 001 0001 0110 011 + TRCSSPCICR7 = 0x88bb, // 10 001 0001 0111 011 + TRCPDCR = 0x88a4, // 10 001 0001 0100 100 + TRCACVR0 = 0x8900, // 10 001 0010 0000 000 + TRCACVR1 = 0x8910, // 10 001 0010 0010 000 + TRCACVR2 = 0x8920, // 10 001 0010 0100 000 + TRCACVR3 = 0x8930, // 10 001 0010 0110 000 + TRCACVR4 = 0x8940, // 10 001 0010 1000 000 + TRCACVR5 = 0x8950, // 10 001 0010 1010 000 + TRCACVR6 = 0x8960, // 10 001 0010 1100 000 + TRCACVR7 = 0x8970, // 10 001 0010 1110 000 + TRCACVR8 = 0x8901, // 10 001 0010 0000 001 + TRCACVR9 = 0x8911, // 10 001 0010 0010 001 + TRCACVR10 = 0x8921, // 10 001 0010 0100 001 + TRCACVR11 = 0x8931, // 10 001 0010 0110 001 + TRCACVR12 = 0x8941, // 10 001 0010 1000 001 + TRCACVR13 = 0x8951, // 10 001 0010 1010 001 + TRCACVR14 = 0x8961, // 10 001 0010 1100 001 + TRCACVR15 = 0x8971, // 10 001 0010 1110 001 + TRCACATR0 = 0x8902, // 10 001 0010 0000 010 + TRCACATR1 = 0x8912, // 10 001 0010 0010 010 + TRCACATR2 = 0x8922, // 10 001 0010 0100 010 + TRCACATR3 = 0x8932, // 10 001 0010 0110 010 + TRCACATR4 = 0x8942, // 10 001 0010 1000 010 + TRCACATR5 = 0x8952, // 10 001 0010 1010 010 + TRCACATR6 = 0x8962, // 10 001 0010 1100 010 + TRCACATR7 = 0x8972, // 10 001 0010 1110 010 + TRCACATR8 = 0x8903, // 10 001 0010 0000 011 + TRCACATR9 = 0x8913, // 10 001 0010 0010 011 + TRCACATR10 = 0x8923, // 10 001 0010 0100 011 + TRCACATR11 = 0x8933, // 10 001 0010 0110 011 + TRCACATR12 = 0x8943, // 10 001 0010 1000 011 + TRCACATR13 = 0x8953, // 10 001 0010 1010 011 + TRCACATR14 = 0x8963, // 10 001 0010 1100 011 + TRCACATR15 = 0x8973, // 10 001 0010 1110 011 + TRCDVCVR0 = 0x8904, // 10 001 0010 0000 100 + TRCDVCVR1 = 0x8924, // 10 001 0010 0100 100 + TRCDVCVR2 = 
0x8944, // 10 001 0010 1000 100 + TRCDVCVR3 = 0x8964, // 10 001 0010 1100 100 + TRCDVCVR4 = 0x8905, // 10 001 0010 0000 101 + TRCDVCVR5 = 0x8925, // 10 001 0010 0100 101 + TRCDVCVR6 = 0x8945, // 10 001 0010 1000 101 + TRCDVCVR7 = 0x8965, // 10 001 0010 1100 101 + TRCDVCMR0 = 0x8906, // 10 001 0010 0000 110 + TRCDVCMR1 = 0x8926, // 10 001 0010 0100 110 + TRCDVCMR2 = 0x8946, // 10 001 0010 1000 110 + TRCDVCMR3 = 0x8966, // 10 001 0010 1100 110 + TRCDVCMR4 = 0x8907, // 10 001 0010 0000 111 + TRCDVCMR5 = 0x8927, // 10 001 0010 0100 111 + TRCDVCMR6 = 0x8947, // 10 001 0010 1000 111 + TRCDVCMR7 = 0x8967, // 10 001 0010 1100 111 + TRCCIDCVR0 = 0x8980, // 10 001 0011 0000 000 + TRCCIDCVR1 = 0x8990, // 10 001 0011 0010 000 + TRCCIDCVR2 = 0x89a0, // 10 001 0011 0100 000 + TRCCIDCVR3 = 0x89b0, // 10 001 0011 0110 000 + TRCCIDCVR4 = 0x89c0, // 10 001 0011 1000 000 + TRCCIDCVR5 = 0x89d0, // 10 001 0011 1010 000 + TRCCIDCVR6 = 0x89e0, // 10 001 0011 1100 000 + TRCCIDCVR7 = 0x89f0, // 10 001 0011 1110 000 + TRCVMIDCVR0 = 0x8981, // 10 001 0011 0000 001 + TRCVMIDCVR1 = 0x8991, // 10 001 0011 0010 001 + TRCVMIDCVR2 = 0x89a1, // 10 001 0011 0100 001 + TRCVMIDCVR3 = 0x89b1, // 10 001 0011 0110 001 + TRCVMIDCVR4 = 0x89c1, // 10 001 0011 1000 001 + TRCVMIDCVR5 = 0x89d1, // 10 001 0011 1010 001 + TRCVMIDCVR6 = 0x89e1, // 10 001 0011 1100 001 + TRCVMIDCVR7 = 0x89f1, // 10 001 0011 1110 001 + TRCCIDCCTLR0 = 0x8982, // 10 001 0011 0000 010 + TRCCIDCCTLR1 = 0x898a, // 10 001 0011 0001 010 + TRCVMIDCCTLR0 = 0x8992, // 10 001 0011 0010 010 + TRCVMIDCCTLR1 = 0x899a, // 10 001 0011 0011 010 + TRCITCTRL = 0x8b84, // 10 001 0111 0000 100 + TRCCLAIMSET = 0x8bc6, // 10 001 0111 1000 110 + TRCCLAIMCLR = 0x8bce, // 10 001 0111 1001 110 + + // GICv3 registers + ICC_BPR1_EL1 = 0xc663, // 11 000 1100 1100 011 + ICC_BPR0_EL1 = 0xc643, // 11 000 1100 1000 011 + ICC_PMR_EL1 = 0xc230, // 11 000 0100 0110 000 + ICC_CTLR_EL1 = 0xc664, // 11 000 1100 1100 100 + ICC_CTLR_EL3 = 0xf664, // 11 110 1100 1100 100 + ICC_SRE_EL1 = 0xc665, // 11 000 1100 1100 101 + ICC_SRE_EL2 = 0xe64d, // 11 100 1100 1001 101 + ICC_SRE_EL3 = 0xf665, // 11 110 1100 1100 101 + ICC_IGRPEN0_EL1 = 0xc666, // 11 000 1100 1100 110 + ICC_IGRPEN1_EL1 = 0xc667, // 11 000 1100 1100 111 + ICC_IGRPEN1_EL3 = 0xf667, // 11 110 1100 1100 111 + ICC_SEIEN_EL1 = 0xc668, // 11 000 1100 1101 000 + ICC_AP0R0_EL1 = 0xc644, // 11 000 1100 1000 100 + ICC_AP0R1_EL1 = 0xc645, // 11 000 1100 1000 101 + ICC_AP0R2_EL1 = 0xc646, // 11 000 1100 1000 110 + ICC_AP0R3_EL1 = 0xc647, // 11 000 1100 1000 111 + ICC_AP1R0_EL1 = 0xc648, // 11 000 1100 1001 000 + ICC_AP1R1_EL1 = 0xc649, // 11 000 1100 1001 001 + ICC_AP1R2_EL1 = 0xc64a, // 11 000 1100 1001 010 + ICC_AP1R3_EL1 = 0xc64b, // 11 000 1100 1001 011 + ICH_AP0R0_EL2 = 0xe640, // 11 100 1100 1000 000 + ICH_AP0R1_EL2 = 0xe641, // 11 100 1100 1000 001 + ICH_AP0R2_EL2 = 0xe642, // 11 100 1100 1000 010 + ICH_AP0R3_EL2 = 0xe643, // 11 100 1100 1000 011 + ICH_AP1R0_EL2 = 0xe648, // 11 100 1100 1001 000 + ICH_AP1R1_EL2 = 0xe649, // 11 100 1100 1001 001 + ICH_AP1R2_EL2 = 0xe64a, // 11 100 1100 1001 010 + ICH_AP1R3_EL2 = 0xe64b, // 11 100 1100 1001 011 + ICH_HCR_EL2 = 0xe658, // 11 100 1100 1011 000 + ICH_MISR_EL2 = 0xe65a, // 11 100 1100 1011 010 + ICH_VMCR_EL2 = 0xe65f, // 11 100 1100 1011 111 + ICH_VSEIR_EL2 = 0xe64c, // 11 100 1100 1001 100 + ICH_LR0_EL2 = 0xe660, // 11 100 1100 1100 000 + ICH_LR1_EL2 = 0xe661, // 11 100 1100 1100 001 + ICH_LR2_EL2 = 0xe662, // 11 100 1100 1100 010 + ICH_LR3_EL2 = 0xe663, // 11 100 1100 1100 011 + ICH_LR4_EL2 = 
0xe664, // 11 100 1100 1100 100 + ICH_LR5_EL2 = 0xe665, // 11 100 1100 1100 101 + ICH_LR6_EL2 = 0xe666, // 11 100 1100 1100 110 + ICH_LR7_EL2 = 0xe667, // 11 100 1100 1100 111 + ICH_LR8_EL2 = 0xe668, // 11 100 1100 1101 000 + ICH_LR9_EL2 = 0xe669, // 11 100 1100 1101 001 + ICH_LR10_EL2 = 0xe66a, // 11 100 1100 1101 010 + ICH_LR11_EL2 = 0xe66b, // 11 100 1100 1101 011 + ICH_LR12_EL2 = 0xe66c, // 11 100 1100 1101 100 + ICH_LR13_EL2 = 0xe66d, // 11 100 1100 1101 101 + ICH_LR14_EL2 = 0xe66e, // 11 100 1100 1101 110 + ICH_LR15_EL2 = 0xe66f, // 11 100 1100 1101 111 + }; + + // Cyclone specific system registers + enum CycloneSysRegValues { + CPM_IOACC_CTL_EL3 = 0xff90 + }; + + // Note that these do not inherit from AArch64NamedImmMapper. This class is + // sufficiently different in its behaviour that I don't believe it's worth + // burdening the common AArch64NamedImmMapper with abstractions only needed in + // this one case. + struct SysRegMapper { + static const AArch64NamedImmMapper::Mapping SysRegPairs[]; + static const AArch64NamedImmMapper::Mapping CycloneSysRegPairs[]; + + const AArch64NamedImmMapper::Mapping *InstPairs; + size_t NumInstPairs; + uint64_t FeatureBits; + + SysRegMapper(uint64_t FeatureBits) : FeatureBits(FeatureBits) { } + uint32_t fromString(StringRef Name, bool &Valid) const; + std::string toString(uint32_t Bits, bool &Valid) const; + }; + + struct MSRMapper : SysRegMapper { + static const AArch64NamedImmMapper::Mapping MSRPairs[]; + MSRMapper(uint64_t FeatureBits); + }; + + struct MRSMapper : SysRegMapper { + static const AArch64NamedImmMapper::Mapping MRSPairs[]; + MRSMapper(uint64_t FeatureBits); + }; + + uint32_t ParseGenericRegister(StringRef Name, bool &Valid); +} + +namespace AArch64TLBI { + enum TLBIValues { + Invalid = -1, // Op0 Op1 CRn CRm Op2 + IPAS2E1IS = 0x6401, // 01 100 1000 0000 001 + IPAS2LE1IS = 0x6405, // 01 100 1000 0000 101 + VMALLE1IS = 0x4418, // 01 000 1000 0011 000 + ALLE2IS = 0x6418, // 01 100 1000 0011 000 + ALLE3IS = 0x7418, // 01 110 1000 0011 000 + VAE1IS = 0x4419, // 01 000 1000 0011 001 + VAE2IS = 0x6419, // 01 100 1000 0011 001 + VAE3IS = 0x7419, // 01 110 1000 0011 001 + ASIDE1IS = 0x441a, // 01 000 1000 0011 010 + VAAE1IS = 0x441b, // 01 000 1000 0011 011 + ALLE1IS = 0x641c, // 01 100 1000 0011 100 + VALE1IS = 0x441d, // 01 000 1000 0011 101 + VALE2IS = 0x641d, // 01 100 1000 0011 101 + VALE3IS = 0x741d, // 01 110 1000 0011 101 + VMALLS12E1IS = 0x641e, // 01 100 1000 0011 110 + VAALE1IS = 0x441f, // 01 000 1000 0011 111 + IPAS2E1 = 0x6421, // 01 100 1000 0100 001 + IPAS2LE1 = 0x6425, // 01 100 1000 0100 101 + VMALLE1 = 0x4438, // 01 000 1000 0111 000 + ALLE2 = 0x6438, // 01 100 1000 0111 000 + ALLE3 = 0x7438, // 01 110 1000 0111 000 + VAE1 = 0x4439, // 01 000 1000 0111 001 + VAE2 = 0x6439, // 01 100 1000 0111 001 + VAE3 = 0x7439, // 01 110 1000 0111 001 + ASIDE1 = 0x443a, // 01 000 1000 0111 010 + VAAE1 = 0x443b, // 01 000 1000 0111 011 + ALLE1 = 0x643c, // 01 100 1000 0111 100 + VALE1 = 0x443d, // 01 000 1000 0111 101 + VALE2 = 0x643d, // 01 100 1000 0111 101 + VALE3 = 0x743d, // 01 110 1000 0111 101 + VMALLS12E1 = 0x643e, // 01 100 1000 0111 110 + VAALE1 = 0x443f // 01 000 1000 0111 111 + }; + + struct TLBIMapper : AArch64NamedImmMapper { + const static Mapping TLBIPairs[]; + + TLBIMapper(); + }; + + static inline bool NeedsRegister(TLBIValues Val) { + switch (Val) { + case VMALLE1IS: + case ALLE2IS: + case ALLE3IS: + case ALLE1IS: + case VMALLS12E1IS: + case VMALLE1: + case ALLE2: + case ALLE3: + case ALLE1: + case VMALLS12E1: + 
return false; + default: + return true; + } + } +} + +namespace AArch64II { + /// Target Operand Flag enum. + enum TOF { + //===------------------------------------------------------------------===// + // AArch64 Specific MachineOperand flags. + + MO_NO_FLAG, + + MO_FRAGMENT = 0x7, + + /// MO_PAGE - A symbol operand with this flag represents the pc-relative + /// offset of the 4K page containing the symbol. This is used with the + /// ADRP instruction. + MO_PAGE = 1, + + /// MO_PAGEOFF - A symbol operand with this flag represents the offset of + /// that symbol within a 4K page. This offset is added to the page address + /// to produce the complete address. + MO_PAGEOFF = 2, + + /// MO_G3 - A symbol operand with this flag (granule 3) represents the high + /// 16-bits of a 64-bit address, used in a MOVZ or MOVK instruction + MO_G3 = 3, + + /// MO_G2 - A symbol operand with this flag (granule 2) represents the bits + /// 32-47 of a 64-bit address, used in a MOVZ or MOVK instruction + MO_G2 = 4, + + /// MO_G1 - A symbol operand with this flag (granule 1) represents the bits + /// 16-31 of a 64-bit address, used in a MOVZ or MOVK instruction + MO_G1 = 5, + + /// MO_G0 - A symbol operand with this flag (granule 0) represents the bits + /// 0-15 of a 64-bit address, used in a MOVZ or MOVK instruction + MO_G0 = 6, + + /// MO_GOT - This flag indicates that a symbol operand represents the + /// address of the GOT entry for the symbol, rather than the address of + /// the symbol itself. + MO_GOT = 8, + + /// MO_NC - Indicates whether the linker is expected to check the symbol + /// reference for overflow. For example in an ADRP/ADD pair of relocations + /// the ADRP usually does check, but not the ADD. + MO_NC = 0x10, + + /// MO_TLS - Indicates that the operand being accessed is some kind of + /// thread-local symbol. On Darwin, only one type of thread-local access + /// exists (pre linker-relaxation), but on ELF the TLSModel used for the + /// referee will affect interpretation. + MO_TLS = 0x20 + }; +} // end namespace AArch64II + +} // end namespace llvm + +#endif diff --git a/lib/Target/AArch64/Utils/CMakeLists.txt b/lib/Target/AArch64/Utils/CMakeLists.txt new file mode 100644 index 00000000000..8ee03a7571b --- /dev/null +++ b/lib/Target/AArch64/Utils/CMakeLists.txt @@ -0,0 +1,3 @@ +add_llvm_library(LLVMAArch64Utils + AArch64BaseInfo.cpp + ) diff --git a/lib/Target/AArch64/Utils/LLVMBuild.txt b/lib/Target/AArch64/Utils/LLVMBuild.txt new file mode 100644 index 00000000000..bcefeb672f7 --- /dev/null +++ b/lib/Target/AArch64/Utils/LLVMBuild.txt @@ -0,0 +1,23 @@ +;===- ./lib/Target/AArch64/Utils/LLVMBuild.txt ----------------*- Conf -*--===; +; +; The LLVM Compiler Infrastructure +; +; This file is distributed under the University of Illinois Open Source +; License. See LICENSE.TXT for details. +; +;===------------------------------------------------------------------------===; +; +; This is an LLVMBuild description file for the components in this subdirectory. 
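
A quick aside on the numeric values in the system-register and TLBI tables above: each encoding is simply the five fields from the trailing comment (Op0, Op1, CRn, CRm, Op2, which are 2, 3, 4, 4 and 3 bits wide) packed into a 16-bit value. A minimal sketch, with an invented helper name that is not part of AArch64BaseInfo:

#include <cstdint>

// Layout: Op0[15:14] Op1[13:11] CRn[10:7] CRm[6:3] Op2[2:0]
static inline uint16_t encodeSysReg(unsigned Op0, unsigned Op1, unsigned CRn,
                                    unsigned CRm, unsigned Op2) {
  return static_cast<uint16_t>((Op0 << 14) | (Op1 << 11) | (CRn << 7) |
                               (CRm << 3) | Op2);
}

// For example, encodeSysReg(0b11, 0b000, 0b0100, 0b0110, 0b000) == 0xc230,
// which matches ICC_PMR_EL1 above. The SysRegMapper declared above converts
// between such encodings and register names in both directions.
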
+; +; For more information on the LLVMBuild system, please see: +; +; http://llvm.org/docs/LLVMBuild.html +; +;===------------------------------------------------------------------------===; + +[component_0] +type = Library +name = AArch64Utils +parent = AArch64 +required_libraries = Support +add_to_library_groups = AArch64 diff --git a/lib/Target/AArch64/Utils/Makefile b/lib/Target/AArch64/Utils/Makefile new file mode 100644 index 00000000000..0b80f82f2b9 --- /dev/null +++ b/lib/Target/AArch64/Utils/Makefile @@ -0,0 +1,16 @@ +##===- lib/Target/AArch64/Utils/Makefile -------------------*- Makefile -*-===## +# +# The LLVM Compiler Infrastructure +# +# This file is distributed under the University of Illinois Open Source +# License. See LICENSE.TXT for details. +# +##===----------------------------------------------------------------------===## +LEVEL = ../../../.. +LIBRARYNAME = LLVMAArch64Utils + +# Hack: we need to include 'main' AArch64 target directory to grab private +# headers +CPP.Flags += -I$(PROJ_OBJ_DIR)/.. -I$(PROJ_SRC_DIR)/.. + +include $(LEVEL)/Makefile.common diff --git a/lib/Target/ARM64/ARM64.h b/lib/Target/ARM64/ARM64.h deleted file mode 100644 index debb9002eb4..00000000000 --- a/lib/Target/ARM64/ARM64.h +++ /dev/null @@ -1,48 +0,0 @@ -//===-- ARM64.h - Top-level interface for ARM64 representation --*- C++ -*-===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -// This file contains the entry points for global functions defined in the LLVM -// ARM64 back-end. -// -//===----------------------------------------------------------------------===// - -#ifndef TARGET_ARM64_H -#define TARGET_ARM64_H - -#include "Utils/ARM64BaseInfo.h" -#include "MCTargetDesc/ARM64MCTargetDesc.h" -#include "llvm/Target/TargetMachine.h" -#include "llvm/Support/DataTypes.h" - -namespace llvm { - -class ARM64TargetMachine; -class FunctionPass; -class MachineFunctionPass; - -FunctionPass *createARM64DeadRegisterDefinitions(); -FunctionPass *createARM64ConditionalCompares(); -FunctionPass *createARM64AdvSIMDScalar(); -FunctionPass *createARM64BranchRelaxation(); -FunctionPass *createARM64ISelDag(ARM64TargetMachine &TM, - CodeGenOpt::Level OptLevel); -FunctionPass *createARM64StorePairSuppressPass(); -FunctionPass *createARM64ExpandPseudoPass(); -FunctionPass *createARM64LoadStoreOptimizationPass(); -ModulePass *createARM64PromoteConstantPass(); -FunctionPass *createARM64AddressTypePromotionPass(); -/// \brief Creates an ARM-specific Target Transformation Info pass. -ImmutablePass *createARM64TargetTransformInfoPass(const ARM64TargetMachine *TM); - -FunctionPass *createARM64CleanupLocalDynamicTLSPass(); - -FunctionPass *createARM64CollectLOHPass(); -} // end namespace llvm - -#endif diff --git a/lib/Target/ARM64/ARM64.td b/lib/Target/ARM64/ARM64.td deleted file mode 100644 index c473205f17c..00000000000 --- a/lib/Target/ARM64/ARM64.td +++ /dev/null @@ -1,134 +0,0 @@ -//===- ARM64.td - Describe the ARM64 Target Machine --------*- tablegen -*-===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. 
-// -//===----------------------------------------------------------------------===// -// -// -//===----------------------------------------------------------------------===// - -//===----------------------------------------------------------------------===// -// Target-independent interfaces which we are implementing -//===----------------------------------------------------------------------===// - -include "llvm/Target/Target.td" - -//===----------------------------------------------------------------------===// -// ARM64 Subtarget features. -// - -def FeatureFPARMv8 : SubtargetFeature<"fp-armv8", "HasFPARMv8", "true", - "Enable ARMv8 FP">; - -def FeatureNEON : SubtargetFeature<"neon", "HasNEON", "true", - "Enable Advanced SIMD instructions", [FeatureFPARMv8]>; - -def FeatureCrypto : SubtargetFeature<"crypto", "HasCrypto", "true", - "Enable cryptographic instructions">; - -def FeatureCRC : SubtargetFeature<"crc", "HasCRC", "true", - "Enable ARMv8 CRC-32 checksum instructions">; - -/// Cyclone has register move instructions which are "free". -def FeatureZCRegMove : SubtargetFeature<"zcm", "HasZeroCycleRegMove", "true", - "Has zero-cycle register moves">; - -/// Cyclone has instructions which zero registers for "free". -def FeatureZCZeroing : SubtargetFeature<"zcz", "HasZeroCycleZeroing", "true", - "Has zero-cycle zeroing instructions">; - -//===----------------------------------------------------------------------===// -// Register File Description -//===----------------------------------------------------------------------===// - -include "ARM64RegisterInfo.td" -include "ARM64CallingConvention.td" - -//===----------------------------------------------------------------------===// -// Instruction Descriptions -//===----------------------------------------------------------------------===// - -include "ARM64Schedule.td" -include "ARM64InstrInfo.td" - -def ARM64InstrInfo : InstrInfo; - -//===----------------------------------------------------------------------===// -// ARM64 Processors supported. 
-// -include "ARM64SchedA53.td" -include "ARM64SchedCyclone.td" - -def ProcA53 : SubtargetFeature<"a53", "ARMProcFamily", "CortexA53", - "Cortex-A53 ARM processors", - [FeatureFPARMv8, - FeatureNEON, - FeatureCrypto, - FeatureCRC]>; - -def ProcA57 : SubtargetFeature<"a57", "ARMProcFamily", "CortexA57", - "Cortex-A57 ARM processors", - [FeatureFPARMv8, - FeatureNEON, - FeatureCrypto, - FeatureCRC]>; - -def ProcCyclone : SubtargetFeature<"cyclone", "ARMProcFamily", "Cyclone", - "Cyclone", - [FeatureFPARMv8, - FeatureNEON, - FeatureCrypto, - FeatureCRC, - FeatureZCRegMove, FeatureZCZeroing]>; - -def : ProcessorModel<"generic", NoSchedModel, [FeatureFPARMv8, - FeatureNEON, - FeatureCRC]>; - -def : ProcessorModel<"cortex-a53", CortexA53Model, [ProcA53]>; -def : ProcessorModel<"cortex-a57", NoSchedModel, [ProcA57]>; -def : ProcessorModel<"cyclone", CycloneModel, [ProcCyclone]>; - -//===----------------------------------------------------------------------===// -// Assembly parser -//===----------------------------------------------------------------------===// - -def GenericAsmParserVariant : AsmParserVariant { - int Variant = 0; - string Name = "generic"; -} - -def AppleAsmParserVariant : AsmParserVariant { - int Variant = 1; - string Name = "apple-neon"; -} - -//===----------------------------------------------------------------------===// -// Assembly printer -//===----------------------------------------------------------------------===// -// ARM64 Uses the MC printer for asm output, so make sure the TableGen -// AsmWriter bits get associated with the correct class. -def GenericAsmWriter : AsmWriter { - string AsmWriterClassName = "InstPrinter"; - int Variant = 0; - bit isMCAsmWriter = 1; -} - -def AppleAsmWriter : AsmWriter { - let AsmWriterClassName = "AppleInstPrinter"; - int Variant = 1; - int isMCAsmWriter = 1; -} - -//===----------------------------------------------------------------------===// -// Target Declaration -//===----------------------------------------------------------------------===// - -def ARM64 : Target { - let InstructionSet = ARM64InstrInfo; - let AssemblyParserVariants = [GenericAsmParserVariant, AppleAsmParserVariant]; - let AssemblyWriters = [GenericAsmWriter, AppleAsmWriter]; -} diff --git a/lib/Target/ARM64/ARM64AddressTypePromotion.cpp b/lib/Target/ARM64/ARM64AddressTypePromotion.cpp deleted file mode 100644 index be2b5eed2ad..00000000000 --- a/lib/Target/ARM64/ARM64AddressTypePromotion.cpp +++ /dev/null @@ -1,493 +0,0 @@ - -//===-- ARM64AddressTypePromotion.cpp --- Promote type for addr accesses -===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -// This pass tries to promote the computations use to obtained a sign extended -// value used into memory accesses. -// E.g. -// a = add nsw i32 b, 3 -// d = sext i32 a to i64 -// e = getelementptr ..., i64 d -// -// => -// f = sext i32 b to i64 -// a = add nsw i64 f, 3 -// e = getelementptr ..., i64 a -// -// This is legal to do so if the computations are markers with either nsw or nuw -// markers. -// Moreover, the current heuristic is simple: it does not create new sext -// operations, i.e., it gives up when a sext would have forked (e.g., if -// a = add i32 b, c, two sexts are required to promote the computation). -// -// FIXME: This pass may be useful for other targets too. 
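
One clarifying note on the legality condition stated in the pass header above: without the nsw/nuw guarantee, hoisting the sign extension can change the computed value, because the narrow add may wrap. A small standalone illustration in plain C++, not part of the pass:

#include <cassert>
#include <cstdint>

int main() {
  int32_t b = INT32_MAX; // 2147483647
  // Original order: the 32-bit add wraps (two's complement), then the result
  // is sign-extended to 64 bits.
  int64_t before = int64_t(int32_t(uint32_t(b) + 3u)); // -2147483646
  // After hoisting the sext: the add is done in 64 bits and cannot wrap.
  int64_t after = int64_t(b) + 3;                      // 2147483650
  assert(before != after); // hence the nsw/nuw requirement above
  return 0;
}
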
-// ===---------------------------------------------------------------------===// - -#include "ARM64.h" -#include "llvm/ADT/DenseMap.h" -#include "llvm/ADT/SmallPtrSet.h" -#include "llvm/ADT/SmallVector.h" -#include "llvm/IR/Constants.h" -#include "llvm/IR/Dominators.h" -#include "llvm/IR/Function.h" -#include "llvm/IR/Instructions.h" -#include "llvm/IR/Module.h" -#include "llvm/IR/Operator.h" -#include "llvm/Pass.h" -#include "llvm/Support/CommandLine.h" -#include "llvm/Support/Debug.h" - -using namespace llvm; - -#define DEBUG_TYPE "arm64-type-promotion" - -static cl::opt -EnableAddressTypePromotion("arm64-type-promotion", cl::Hidden, - cl::desc("Enable the type promotion pass"), - cl::init(true)); -static cl::opt -EnableMerge("arm64-type-promotion-merge", cl::Hidden, - cl::desc("Enable merging of redundant sexts when one is dominating" - " the other."), - cl::init(true)); - -//===----------------------------------------------------------------------===// -// ARM64AddressTypePromotion -//===----------------------------------------------------------------------===// - -namespace llvm { -void initializeARM64AddressTypePromotionPass(PassRegistry &); -} - -namespace { -class ARM64AddressTypePromotion : public FunctionPass { - -public: - static char ID; - ARM64AddressTypePromotion() - : FunctionPass(ID), Func(nullptr), ConsideredSExtType(nullptr) { - initializeARM64AddressTypePromotionPass(*PassRegistry::getPassRegistry()); - } - - const char *getPassName() const override { - return "ARM64 Address Type Promotion"; - } - - /// Iterate over the functions and promote the computation of interesting - // sext instructions. - bool runOnFunction(Function &F) override; - -private: - /// The current function. - Function *Func; - /// Filter out all sexts that does not have this type. - /// Currently initialized with Int64Ty. - Type *ConsideredSExtType; - - // This transformation requires dominator info. - void getAnalysisUsage(AnalysisUsage &AU) const override { - AU.setPreservesCFG(); - AU.addRequired(); - AU.addPreserved(); - FunctionPass::getAnalysisUsage(AU); - } - - typedef SmallPtrSet SetOfInstructions; - typedef SmallVector Instructions; - typedef DenseMap ValueToInsts; - - /// Check if it is profitable to move a sext through this instruction. - /// Currently, we consider it is profitable if: - /// - Inst is used only once (no need to insert truncate). - /// - Inst has only one operand that will require a sext operation (we do - /// do not create new sext operation). - bool shouldGetThrough(const Instruction *Inst); - - /// Check if it is possible and legal to move a sext through this - /// instruction. - /// Current heuristic considers that we can get through: - /// - Arithmetic operation marked with the nsw or nuw flag. - /// - Other sext operation. - /// - Truncate operation if it was just dropping sign extended bits. - bool canGetThrough(const Instruction *Inst); - - /// Move sext operations through safe to sext instructions. - bool propagateSignExtension(Instructions &SExtInsts); - - /// Is this sext should be considered for code motion. - /// We look for sext with ConsideredSExtType and uses in at least one - // GetElementPtrInst. - bool shouldConsiderSExt(const Instruction *SExt) const; - - /// Collect all interesting sext operations, i.e., the ones with the right - /// type and used in memory accesses. 
- /// More precisely, a sext instruction is considered as interesting if it - /// is used in a "complex" getelementptr or it exits at least another - /// sext instruction that sign extended the same initial value. - /// A getelementptr is considered as "complex" if it has more than 2 - // operands. - void analyzeSExtension(Instructions &SExtInsts); - - /// Merge redundant sign extension operations in common dominator. - void mergeSExts(ValueToInsts &ValToSExtendedUses, - SetOfInstructions &ToRemove); -}; -} // end anonymous namespace. - -char ARM64AddressTypePromotion::ID = 0; - -INITIALIZE_PASS_BEGIN(ARM64AddressTypePromotion, "arm64-type-promotion", - "ARM64 Type Promotion Pass", false, false) -INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) -INITIALIZE_PASS_END(ARM64AddressTypePromotion, "arm64-type-promotion", - "ARM64 Type Promotion Pass", false, false) - -FunctionPass *llvm::createARM64AddressTypePromotionPass() { - return new ARM64AddressTypePromotion(); -} - -bool ARM64AddressTypePromotion::canGetThrough(const Instruction *Inst) { - if (isa(Inst)) - return true; - - const BinaryOperator *BinOp = dyn_cast(Inst); - if (BinOp && isa(BinOp) && - (BinOp->hasNoUnsignedWrap() || BinOp->hasNoSignedWrap())) - return true; - - // sext(trunc(sext)) --> sext - if (isa(Inst) && isa(Inst->getOperand(0))) { - const Instruction *Opnd = cast(Inst->getOperand(0)); - // Check that the truncate just drop sign extended bits. - if (Inst->getType()->getIntegerBitWidth() >= - Opnd->getOperand(0)->getType()->getIntegerBitWidth() && - Inst->getOperand(0)->getType()->getIntegerBitWidth() <= - ConsideredSExtType->getIntegerBitWidth()) - return true; - } - - return false; -} - -bool ARM64AddressTypePromotion::shouldGetThrough(const Instruction *Inst) { - // If the type of the sext is the same as the considered one, this sext - // will become useless. - // Otherwise, we will have to do something to preserve the original value, - // unless it is used once. - if (isa(Inst) && - (Inst->getType() == ConsideredSExtType || Inst->hasOneUse())) - return true; - - // If the Inst is used more that once, we may need to insert truncate - // operations and we don't do that at the moment. - if (!Inst->hasOneUse()) - return false; - - // This truncate is used only once, thus if we can get thourgh, it will become - // useless. - if (isa(Inst)) - return true; - - // If both operands are not constant, a new sext will be created here. - // Current heuristic is: each step should be profitable. - // Therefore we don't allow to increase the number of sext even if it may - // be profitable later on. - if (isa(Inst) && isa(Inst->getOperand(1))) - return true; - - return false; -} - -static bool shouldSExtOperand(const Instruction *Inst, int OpIdx) { - if (isa(Inst) && OpIdx == 0) - return false; - return true; -} - -bool -ARM64AddressTypePromotion::shouldConsiderSExt(const Instruction *SExt) const { - if (SExt->getType() != ConsideredSExtType) - return false; - - for (const Use &U : SExt->uses()) { - if (isa(*U)) - return true; - } - - return false; -} - -// Input: -// - SExtInsts contains all the sext instructions that are use direclty in -// GetElementPtrInst, i.e., access to memory. -// Algorithm: -// - For each sext operation in SExtInsts: -// Let var be the operand of sext. -// while it is profitable (see shouldGetThrough), legal, and safe -// (see canGetThrough) to move sext through var's definition: -// * promote the type of var's definition. -// * fold var into sext uses. -// * move sext above var's definition. 
-// * update sext operand to use the operand of var that should be sign -// extended (by construction there is only one). -// -// E.g., -// a = ... i32 c, 3 -// b = sext i32 a to i64 <- is it legal/safe/profitable to get through 'a' -// ... -// = b -// => Yes, update the code -// b = sext i32 c to i64 -// a = ... i64 b, 3 -// ... -// = a -// Iterate on 'c'. -bool -ARM64AddressTypePromotion::propagateSignExtension(Instructions &SExtInsts) { - DEBUG(dbgs() << "*** Propagate Sign Extension ***\n"); - - bool LocalChange = false; - SetOfInstructions ToRemove; - ValueToInsts ValToSExtendedUses; - while (!SExtInsts.empty()) { - // Get through simple chain. - Instruction *SExt = SExtInsts.pop_back_val(); - - DEBUG(dbgs() << "Consider:\n" << *SExt << '\n'); - - // If this SExt has already been merged continue. - if (SExt->use_empty() && ToRemove.count(SExt)) { - DEBUG(dbgs() << "No uses => marked as delete\n"); - continue; - } - - // Now try to get through the chain of definitions. - while (isa(SExt->getOperand(0))) { - Instruction *Inst = dyn_cast(SExt->getOperand(0)); - DEBUG(dbgs() << "Try to get through:\n" << *Inst << '\n'); - if (!canGetThrough(Inst) || !shouldGetThrough(Inst)) { - // We cannot get through something that is not an Instruction - // or not safe to SExt. - DEBUG(dbgs() << "Cannot get through\n"); - break; - } - - LocalChange = true; - // If this is a sign extend, it becomes useless. - if (isa(Inst) || isa(Inst)) { - DEBUG(dbgs() << "SExt or trunc, mark it as to remove\n"); - // We cannot use replaceAllUsesWith here because we may trigger some - // assertion on the type as all involved sext operation may have not - // been moved yet. - while (!Inst->use_empty()) { - Value::use_iterator UseIt = Inst->use_begin(); - Instruction *UseInst = dyn_cast(*UseIt); - assert(UseInst && "Use of sext is not an Instruction!"); - UseInst->setOperand(UseIt->getOperandNo(), SExt); - } - ToRemove.insert(Inst); - SExt->setOperand(0, Inst->getOperand(0)); - SExt->moveBefore(Inst); - continue; - } - - // Get through the Instruction: - // 1. Update its type. - // 2. Replace the uses of SExt by Inst. - // 3. Sign extend each operand that needs to be sign extended. - - // Step #1. - Inst->mutateType(SExt->getType()); - // Step #2. - SExt->replaceAllUsesWith(Inst); - // Step #3. - Instruction *SExtForOpnd = SExt; - - DEBUG(dbgs() << "Propagate SExt to operands\n"); - for (int OpIdx = 0, EndOpIdx = Inst->getNumOperands(); OpIdx != EndOpIdx; - ++OpIdx) { - DEBUG(dbgs() << "Operand:\n" << *(Inst->getOperand(OpIdx)) << '\n'); - if (Inst->getOperand(OpIdx)->getType() == SExt->getType() || - !shouldSExtOperand(Inst, OpIdx)) { - DEBUG(dbgs() << "No need to propagate\n"); - continue; - } - // Check if we can statically sign extend the operand. - Value *Opnd = Inst->getOperand(OpIdx); - if (const ConstantInt *Cst = dyn_cast(Opnd)) { - DEBUG(dbgs() << "Statically sign extend\n"); - Inst->setOperand(OpIdx, ConstantInt::getSigned(SExt->getType(), - Cst->getSExtValue())); - continue; - } - // UndefValue are typed, so we have to statically sign extend them. - if (isa(Opnd)) { - DEBUG(dbgs() << "Statically sign extend\n"); - Inst->setOperand(OpIdx, UndefValue::get(SExt->getType())); - continue; - } - - // Otherwise we have to explicity sign extend it. - assert(SExtForOpnd && - "Only one operand should have been sign extended"); - - SExtForOpnd->setOperand(0, Opnd); - - DEBUG(dbgs() << "Move before:\n" << *Inst << "\nSign extend\n"); - // Move the sign extension before the insertion point. 
- SExtForOpnd->moveBefore(Inst); - Inst->setOperand(OpIdx, SExtForOpnd); - // If more sext are required, new instructions will have to be created. - SExtForOpnd = nullptr; - } - if (SExtForOpnd == SExt) { - DEBUG(dbgs() << "Sign extension is useless now\n"); - ToRemove.insert(SExt); - break; - } - } - - // If the use is already of the right type, connect its uses to its argument - // and delete it. - // This can happen for an Instruction which all uses are sign extended. - if (!ToRemove.count(SExt) && - SExt->getType() == SExt->getOperand(0)->getType()) { - DEBUG(dbgs() << "Sign extension is useless, attach its use to " - "its argument\n"); - SExt->replaceAllUsesWith(SExt->getOperand(0)); - ToRemove.insert(SExt); - } else - ValToSExtendedUses[SExt->getOperand(0)].push_back(SExt); - } - - if (EnableMerge) - mergeSExts(ValToSExtendedUses, ToRemove); - - // Remove all instructions marked as ToRemove. - for (Instruction *I: ToRemove) - I->eraseFromParent(); - return LocalChange; -} - -void ARM64AddressTypePromotion::mergeSExts(ValueToInsts &ValToSExtendedUses, - SetOfInstructions &ToRemove) { - DominatorTree &DT = getAnalysis().getDomTree(); - - for (auto &Entry : ValToSExtendedUses) { - Instructions &Insts = Entry.second; - Instructions CurPts; - for (Instruction *Inst : Insts) { - if (ToRemove.count(Inst)) - continue; - bool inserted = false; - for (auto Pt : CurPts) { - if (DT.dominates(Inst, Pt)) { - DEBUG(dbgs() << "Replace all uses of:\n" << *Pt << "\nwith:\n" - << *Inst << '\n'); - (Pt)->replaceAllUsesWith(Inst); - ToRemove.insert(Pt); - Pt = Inst; - inserted = true; - break; - } - if (!DT.dominates(Pt, Inst)) - // Give up if we need to merge in a common dominator as the - // expermients show it is not profitable. - continue; - - DEBUG(dbgs() << "Replace all uses of:\n" << *Inst << "\nwith:\n" - << *Pt << '\n'); - Inst->replaceAllUsesWith(Pt); - ToRemove.insert(Inst); - inserted = true; - break; - } - if (!inserted) - CurPts.push_back(Inst); - } - } -} - -void ARM64AddressTypePromotion::analyzeSExtension(Instructions &SExtInsts) { - DEBUG(dbgs() << "*** Analyze Sign Extensions ***\n"); - - DenseMap SeenChains; - - for (auto &BB : *Func) { - for (auto &II : BB) { - Instruction *SExt = &II; - - // Collect all sext operation per type. - if (!isa(SExt) || !shouldConsiderSExt(SExt)) - continue; - - DEBUG(dbgs() << "Found:\n" << (*SExt) << '\n'); - - // Cases where we actually perform the optimization: - // 1. SExt is used in a getelementptr with more than 2 operand => - // likely we can merge some computation if they are done on 64 bits. - // 2. The beginning of the SExt chain is SExt several time. => - // code sharing is possible. - - bool insert = false; - // #1. - for (const Use &U : SExt->uses()) { - const Instruction *Inst = dyn_cast(U); - if (Inst && Inst->getNumOperands() > 2) { - DEBUG(dbgs() << "Interesting use in GetElementPtrInst\n" << *Inst - << '\n'); - insert = true; - break; - } - } - - // #2. - // Check the head of the chain. 
- Instruction *Inst = SExt; - Value *Last; - do { - int OpdIdx = 0; - const BinaryOperator *BinOp = dyn_cast(Inst); - if (BinOp && isa(BinOp->getOperand(0))) - OpdIdx = 1; - Last = Inst->getOperand(OpdIdx); - Inst = dyn_cast(Last); - } while (Inst && canGetThrough(Inst) && shouldGetThrough(Inst)); - - DEBUG(dbgs() << "Head of the chain:\n" << *Last << '\n'); - DenseMap::iterator AlreadySeen = - SeenChains.find(Last); - if (insert || AlreadySeen != SeenChains.end()) { - DEBUG(dbgs() << "Insert\n"); - SExtInsts.push_back(SExt); - if (AlreadySeen != SeenChains.end() && AlreadySeen->second != nullptr) { - DEBUG(dbgs() << "Insert chain member\n"); - SExtInsts.push_back(AlreadySeen->second); - SeenChains[Last] = nullptr; - } - } else { - DEBUG(dbgs() << "Record its chain membership\n"); - SeenChains[Last] = SExt; - } - } - } -} - -bool ARM64AddressTypePromotion::runOnFunction(Function &F) { - if (!EnableAddressTypePromotion || F.isDeclaration()) - return false; - Func = &F; - ConsideredSExtType = Type::getInt64Ty(Func->getContext()); - - DEBUG(dbgs() << "*** " << getPassName() << ": " << Func->getName() << '\n'); - - Instructions SExtInsts; - analyzeSExtension(SExtInsts); - return propagateSignExtension(SExtInsts); -} diff --git a/lib/Target/ARM64/ARM64AdvSIMDScalarPass.cpp b/lib/Target/ARM64/ARM64AdvSIMDScalarPass.cpp deleted file mode 100644 index 5950a8f18e1..00000000000 --- a/lib/Target/ARM64/ARM64AdvSIMDScalarPass.cpp +++ /dev/null @@ -1,385 +0,0 @@ -//===-- ARM64AdvSIMDScalar.cpp - Replace dead defs w/ zero reg --===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// When profitable, replace GPR targeting i64 instructions with their -// AdvSIMD scalar equivalents. Generally speaking, "profitable" is defined -// as minimizing the number of cross-class register copies. -//===----------------------------------------------------------------------===// - -//===----------------------------------------------------------------------===// -// TODO: Graph based predicate heuristics. -// Walking the instruction list linearly will get many, perhaps most, of -// the cases, but to do a truly thorough job of this, we need a more -// wholistic approach. -// -// This optimization is very similar in spirit to the register allocator's -// spill placement, only here we're determining where to place cross-class -// register copies rather than spills. As such, a similar approach is -// called for. -// -// We want to build up a set of graphs of all instructions which are candidates -// for transformation along with instructions which generate their inputs and -// consume their outputs. For each edge in the graph, we assign a weight -// based on whether there is a copy required there (weight zero if not) and -// the block frequency of the block containing the defining or using -// instruction, whichever is less. Our optimization is then a graph problem -// to minimize the total weight of all the graphs, then transform instructions -// and add or remove copy instructions as called for to implement the -// solution. 
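
The graph formulation sketched in the TODO above is not implemented by this pass, but the edge weight it describes is easy to pin down. A hypothetical sketch with invented names, purely to make the description concrete:

#include <algorithm>
#include <cstdint>

// Weight of one edge in the proposed copy-placement graph: zero if the edge
// needs no cross-class copy, otherwise the block frequency of the cheaper of
// the defining and using blocks, as described in the TODO above.
static uint64_t crossClassCopyWeight(bool CopyRequired, uint64_t DefBlockFreq,
                                     uint64_t UseBlockFreq) {
  if (!CopyRequired)
    return 0;
  return std::min(DefBlockFreq, UseBlockFreq);
}
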
-//===----------------------------------------------------------------------===// - -#include "ARM64.h" -#include "ARM64InstrInfo.h" -#include "ARM64RegisterInfo.h" -#include "llvm/ADT/Statistic.h" -#include "llvm/CodeGen/MachineFunctionPass.h" -#include "llvm/CodeGen/MachineFunction.h" -#include "llvm/CodeGen/MachineInstr.h" -#include "llvm/CodeGen/MachineInstrBuilder.h" -#include "llvm/CodeGen/MachineRegisterInfo.h" -#include "llvm/Support/CommandLine.h" -#include "llvm/Support/Debug.h" -#include "llvm/Support/raw_ostream.h" -using namespace llvm; - -#define DEBUG_TYPE "arm64-simd-scalar" - -// Allow forcing all i64 operations with equivalent SIMD instructions to use -// them. For stress-testing the transformation function. -static cl::opt -TransformAll("arm64-simd-scalar-force-all", - cl::desc("Force use of AdvSIMD scalar instructions everywhere"), - cl::init(false), cl::Hidden); - -STATISTIC(NumScalarInsnsUsed, "Number of scalar instructions used"); -STATISTIC(NumCopiesDeleted, "Number of cross-class copies deleted"); -STATISTIC(NumCopiesInserted, "Number of cross-class copies inserted"); - -namespace { -class ARM64AdvSIMDScalar : public MachineFunctionPass { - MachineRegisterInfo *MRI; - const ARM64InstrInfo *TII; - -private: - // isProfitableToTransform - Predicate function to determine whether an - // instruction should be transformed to its equivalent AdvSIMD scalar - // instruction. "add Xd, Xn, Xm" ==> "add Dd, Da, Db", for example. - bool isProfitableToTransform(const MachineInstr *MI) const; - - // transformInstruction - Perform the transformation of an instruction - // to its equivalant AdvSIMD scalar instruction. Update inputs and outputs - // to be the correct register class, minimizing cross-class copies. - void transformInstruction(MachineInstr *MI); - - // processMachineBasicBlock - Main optimzation loop. - bool processMachineBasicBlock(MachineBasicBlock *MBB); - -public: - static char ID; // Pass identification, replacement for typeid. - explicit ARM64AdvSIMDScalar() : MachineFunctionPass(ID) {} - - bool runOnMachineFunction(MachineFunction &F) override; - - const char *getPassName() const override { - return "AdvSIMD Scalar Operation Optimization"; - } - - void getAnalysisUsage(AnalysisUsage &AU) const override { - AU.setPreservesCFG(); - MachineFunctionPass::getAnalysisUsage(AU); - } -}; -char ARM64AdvSIMDScalar::ID = 0; -} // end anonymous namespace - -static bool isGPR64(unsigned Reg, unsigned SubReg, - const MachineRegisterInfo *MRI) { - if (SubReg) - return false; - if (TargetRegisterInfo::isVirtualRegister(Reg)) - return MRI->getRegClass(Reg)->hasSuperClassEq(&ARM64::GPR64RegClass); - return ARM64::GPR64RegClass.contains(Reg); -} - -static bool isFPR64(unsigned Reg, unsigned SubReg, - const MachineRegisterInfo *MRI) { - if (TargetRegisterInfo::isVirtualRegister(Reg)) - return (MRI->getRegClass(Reg)->hasSuperClassEq(&ARM64::FPR64RegClass) && - SubReg == 0) || - (MRI->getRegClass(Reg)->hasSuperClassEq(&ARM64::FPR128RegClass) && - SubReg == ARM64::dsub); - // Physical register references just check the register class directly. - return (ARM64::FPR64RegClass.contains(Reg) && SubReg == 0) || - (ARM64::FPR128RegClass.contains(Reg) && SubReg == ARM64::dsub); -} - -// getSrcFromCopy - Get the original source register for a GPR64 <--> FPR64 -// copy instruction. Return zero_reg if the instruction is not a copy. 
-static unsigned getSrcFromCopy(const MachineInstr *MI, - const MachineRegisterInfo *MRI, - unsigned &SubReg) { - SubReg = 0; - // The "FMOV Xd, Dn" instruction is the typical form. - if (MI->getOpcode() == ARM64::FMOVDXr || MI->getOpcode() == ARM64::FMOVXDr) - return MI->getOperand(1).getReg(); - // A lane zero extract "UMOV.d Xd, Vn[0]" is equivalent. We shouldn't see - // these at this stage, but it's easy to check for. - if (MI->getOpcode() == ARM64::UMOVvi64 && MI->getOperand(2).getImm() == 0) { - SubReg = ARM64::dsub; - return MI->getOperand(1).getReg(); - } - // Or just a plain COPY instruction. This can be directly to/from FPR64, - // or it can be a dsub subreg reference to an FPR128. - if (MI->getOpcode() == ARM64::COPY) { - if (isFPR64(MI->getOperand(0).getReg(), MI->getOperand(0).getSubReg(), - MRI) && - isGPR64(MI->getOperand(1).getReg(), MI->getOperand(1).getSubReg(), MRI)) - return MI->getOperand(1).getReg(); - if (isGPR64(MI->getOperand(0).getReg(), MI->getOperand(0).getSubReg(), - MRI) && - isFPR64(MI->getOperand(1).getReg(), MI->getOperand(1).getSubReg(), - MRI)) { - SubReg = MI->getOperand(1).getSubReg(); - return MI->getOperand(1).getReg(); - } - } - - // Otherwise, this is some other kind of instruction. - return 0; -} - -// getTransformOpcode - For any opcode for which there is an AdvSIMD equivalent -// that we're considering transforming to, return that AdvSIMD opcode. For all -// others, return the original opcode. -static int getTransformOpcode(unsigned Opc) { - switch (Opc) { - default: - break; - // FIXME: Lots more possibilities. - case ARM64::ADDXrr: - return ARM64::ADDv1i64; - case ARM64::SUBXrr: - return ARM64::SUBv1i64; - } - // No AdvSIMD equivalent, so just return the original opcode. - return Opc; -} - -static bool isTransformable(const MachineInstr *MI) { - int Opc = MI->getOpcode(); - return Opc != getTransformOpcode(Opc); -} - -// isProfitableToTransform - Predicate function to determine whether an -// instruction should be transformed to its equivalent AdvSIMD scalar -// instruction. "add Xd, Xn, Xm" ==> "add Dd, Da, Db", for example. -bool ARM64AdvSIMDScalar::isProfitableToTransform(const MachineInstr *MI) const { - // If this instruction isn't eligible to be transformed (no SIMD equivalent), - // early exit since that's the common case. - if (!isTransformable(MI)) - return false; - - // Count the number of copies we'll need to add and approximate the number - // of copies that a transform will enable us to remove. - unsigned NumNewCopies = 3; - unsigned NumRemovableCopies = 0; - - unsigned OrigSrc0 = MI->getOperand(1).getReg(); - unsigned OrigSrc1 = MI->getOperand(2).getReg(); - unsigned Src0 = 0, SubReg0; - unsigned Src1 = 0, SubReg1; - if (!MRI->def_empty(OrigSrc0)) { - MachineRegisterInfo::def_instr_iterator Def = - MRI->def_instr_begin(OrigSrc0); - assert(std::next(Def) == MRI->def_instr_end() && "Multiple def in SSA!"); - Src0 = getSrcFromCopy(&*Def, MRI, SubReg0); - // If the source was from a copy, we don't need to insert a new copy. - if (Src0) - --NumNewCopies; - // If there are no other users of the original source, we can delete - // that instruction. 
- if (Src0 && MRI->hasOneNonDBGUse(OrigSrc0)) - ++NumRemovableCopies; - } - if (!MRI->def_empty(OrigSrc1)) { - MachineRegisterInfo::def_instr_iterator Def = - MRI->def_instr_begin(OrigSrc1); - assert(std::next(Def) == MRI->def_instr_end() && "Multiple def in SSA!"); - Src1 = getSrcFromCopy(&*Def, MRI, SubReg1); - if (Src1) - --NumNewCopies; - // If there are no other users of the original source, we can delete - // that instruction. - if (Src1 && MRI->hasOneNonDBGUse(OrigSrc1)) - ++NumRemovableCopies; - } - - // If any of the uses of the original instructions is a cross class copy, - // that's a copy that will be removable if we transform. Likewise, if - // any of the uses is a transformable instruction, it's likely the tranforms - // will chain, enabling us to save a copy there, too. This is an aggressive - // heuristic that approximates the graph based cost analysis described above. - unsigned Dst = MI->getOperand(0).getReg(); - bool AllUsesAreCopies = true; - for (MachineRegisterInfo::use_instr_nodbg_iterator - Use = MRI->use_instr_nodbg_begin(Dst), - E = MRI->use_instr_nodbg_end(); - Use != E; ++Use) { - unsigned SubReg; - if (getSrcFromCopy(&*Use, MRI, SubReg) || isTransformable(&*Use)) - ++NumRemovableCopies; - // If the use is an INSERT_SUBREG, that's still something that can - // directly use the FPR64, so we don't invalidate AllUsesAreCopies. It's - // preferable to have it use the FPR64 in most cases, as if the source - // vector is an IMPLICIT_DEF, the INSERT_SUBREG just goes away entirely. - // Ditto for a lane insert. - else if (Use->getOpcode() == ARM64::INSERT_SUBREG || - Use->getOpcode() == ARM64::INSvi64gpr) - ; - else - AllUsesAreCopies = false; - } - // If all of the uses of the original destination register are copies to - // FPR64, then we won't end up having a new copy back to GPR64 either. - if (AllUsesAreCopies) - --NumNewCopies; - - // If a transform will not increase the number of cross-class copies required, - // return true. - if (NumNewCopies <= NumRemovableCopies) - return true; - - // Finally, even if we otherwise wouldn't transform, check if we're forcing - // transformation of everything. - return TransformAll; -} - -static MachineInstr *insertCopy(const ARM64InstrInfo *TII, MachineInstr *MI, - unsigned Dst, unsigned Src, bool IsKill) { - MachineInstrBuilder MIB = - BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), TII->get(ARM64::COPY), - Dst) - .addReg(Src, getKillRegState(IsKill)); - DEBUG(dbgs() << " adding copy: " << *MIB); - ++NumCopiesInserted; - return MIB; -} - -// transformInstruction - Perform the transformation of an instruction -// to its equivalant AdvSIMD scalar instruction. Update inputs and outputs -// to be the correct register class, minimizing cross-class copies. -void ARM64AdvSIMDScalar::transformInstruction(MachineInstr *MI) { - DEBUG(dbgs() << "Scalar transform: " << *MI); - - MachineBasicBlock *MBB = MI->getParent(); - int OldOpc = MI->getOpcode(); - int NewOpc = getTransformOpcode(OldOpc); - assert(OldOpc != NewOpc && "transform an instruction to itself?!"); - - // Check if we need a copy for the source registers. 
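
The profitability test in isProfitableToTransform above is, in effect, simple copy accounting. A standalone distillation under that reading; the function and parameter names are invented and are not part of the pass:

// Start from the worst case of three new copies (two inputs plus the result),
// subtract the inputs that already arrive through GPR64 <--> FPR64 copies and,
// when every use of the result is itself such a copy, the result copy too;
// transform only if that does not exceed the copies the transform removes.
static bool worthTransforming(unsigned SrcsAlreadyFromCopies, // 0, 1 or 2
                              unsigned RemovableCopies,
                              bool AllUsesAreCopies) {
  unsigned NumNewCopies = 3 - SrcsAlreadyFromCopies;
  if (AllUsesAreCopies)
    --NumNewCopies;
  return NumNewCopies <= RemovableCopies;
}
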
- unsigned OrigSrc0 = MI->getOperand(1).getReg(); - unsigned OrigSrc1 = MI->getOperand(2).getReg(); - unsigned Src0 = 0, SubReg0; - unsigned Src1 = 0, SubReg1; - if (!MRI->def_empty(OrigSrc0)) { - MachineRegisterInfo::def_instr_iterator Def = - MRI->def_instr_begin(OrigSrc0); - assert(std::next(Def) == MRI->def_instr_end() && "Multiple def in SSA!"); - Src0 = getSrcFromCopy(&*Def, MRI, SubReg0); - // If there are no other users of the original source, we can delete - // that instruction. - if (Src0 && MRI->hasOneNonDBGUse(OrigSrc0)) { - assert(Src0 && "Can't delete copy w/o a valid original source!"); - Def->eraseFromParent(); - ++NumCopiesDeleted; - } - } - if (!MRI->def_empty(OrigSrc1)) { - MachineRegisterInfo::def_instr_iterator Def = - MRI->def_instr_begin(OrigSrc1); - assert(std::next(Def) == MRI->def_instr_end() && "Multiple def in SSA!"); - Src1 = getSrcFromCopy(&*Def, MRI, SubReg1); - // If there are no other users of the original source, we can delete - // that instruction. - if (Src1 && MRI->hasOneNonDBGUse(OrigSrc1)) { - assert(Src1 && "Can't delete copy w/o a valid original source!"); - Def->eraseFromParent(); - ++NumCopiesDeleted; - } - } - // If we weren't able to reference the original source directly, create a - // copy. - if (!Src0) { - SubReg0 = 0; - Src0 = MRI->createVirtualRegister(&ARM64::FPR64RegClass); - insertCopy(TII, MI, Src0, OrigSrc0, true); - } - if (!Src1) { - SubReg1 = 0; - Src1 = MRI->createVirtualRegister(&ARM64::FPR64RegClass); - insertCopy(TII, MI, Src1, OrigSrc1, true); - } - - // Create a vreg for the destination. - // FIXME: No need to do this if the ultimate user expects an FPR64. - // Check for that and avoid the copy if possible. - unsigned Dst = MRI->createVirtualRegister(&ARM64::FPR64RegClass); - - // For now, all of the new instructions have the same simple three-register - // form, so no need to special case based on what instruction we're - // building. - BuildMI(*MBB, MI, MI->getDebugLoc(), TII->get(NewOpc), Dst) - .addReg(Src0, getKillRegState(true), SubReg0) - .addReg(Src1, getKillRegState(true), SubReg1); - - // Now copy the result back out to a GPR. - // FIXME: Try to avoid this if all uses could actually just use the FPR64 - // directly. - insertCopy(TII, MI, MI->getOperand(0).getReg(), Dst, true); - - // Erase the old instruction. - MI->eraseFromParent(); - - ++NumScalarInsnsUsed; -} - -// processMachineBasicBlock - Main optimzation loop. -bool ARM64AdvSIMDScalar::processMachineBasicBlock(MachineBasicBlock *MBB) { - bool Changed = false; - for (MachineBasicBlock::iterator I = MBB->begin(), E = MBB->end(); I != E;) { - MachineInstr *MI = I; - ++I; - if (isProfitableToTransform(MI)) { - transformInstruction(MI); - Changed = true; - } - } - return Changed; -} - -// runOnMachineFunction - Pass entry point from PassManager. -bool ARM64AdvSIMDScalar::runOnMachineFunction(MachineFunction &mf) { - bool Changed = false; - DEBUG(dbgs() << "***** ARM64AdvSIMDScalar *****\n"); - - const TargetMachine &TM = mf.getTarget(); - MRI = &mf.getRegInfo(); - TII = static_cast(TM.getInstrInfo()); - - // Just check things on a one-block-at-a-time basis. - for (MachineFunction::iterator I = mf.begin(), E = mf.end(); I != E; ++I) - if (processMachineBasicBlock(I)) - Changed = true; - return Changed; -} - -// createARM64AdvSIMDScalar - Factory function used by ARM64TargetMachine -// to add the pass to the PassManager. 
-FunctionPass *llvm::createARM64AdvSIMDScalar() { - return new ARM64AdvSIMDScalar(); -} diff --git a/lib/Target/ARM64/ARM64AsmPrinter.cpp b/lib/Target/ARM64/ARM64AsmPrinter.cpp deleted file mode 100644 index 7e17985bf4a..00000000000 --- a/lib/Target/ARM64/ARM64AsmPrinter.cpp +++ /dev/null @@ -1,514 +0,0 @@ -//===-- ARM64AsmPrinter.cpp - ARM64 LLVM assembly writer ------------------===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -// This file contains a printer that converts from our internal representation -// of machine-dependent LLVM code to the ARM64 assembly language. -// -//===----------------------------------------------------------------------===// - -#include "ARM64.h" -#include "ARM64MachineFunctionInfo.h" -#include "ARM64MCInstLower.h" -#include "ARM64RegisterInfo.h" -#include "ARM64Subtarget.h" -#include "InstPrinter/ARM64InstPrinter.h" -#include "llvm/ADT/SmallString.h" -#include "llvm/ADT/StringSwitch.h" -#include "llvm/ADT/Twine.h" -#include "llvm/CodeGen/AsmPrinter.h" -#include "llvm/CodeGen/MachineInstr.h" -#include "llvm/CodeGen/StackMaps.h" -#include "llvm/CodeGen/MachineModuleInfoImpls.h" -#include "llvm/CodeGen/TargetLoweringObjectFileImpl.h" -#include "llvm/IR/DataLayout.h" -#include "llvm/IR/DebugInfo.h" -#include "llvm/MC/MCAsmInfo.h" -#include "llvm/MC/MCContext.h" -#include "llvm/MC/MCInst.h" -#include "llvm/MC/MCInstBuilder.h" -#include "llvm/MC/MCLinkerOptimizationHint.h" -#include "llvm/MC/MCStreamer.h" -#include "llvm/Support/Debug.h" -#include "llvm/Support/TargetRegistry.h" -using namespace llvm; - -#define DEBUG_TYPE "asm-printer" - -namespace { - -class ARM64AsmPrinter : public AsmPrinter { - /// Subtarget - Keep a pointer to the ARM64Subtarget around so that we can - /// make the right decision when printing asm code for different targets. - const ARM64Subtarget *Subtarget; - - ARM64MCInstLower MCInstLowering; - StackMaps SM; - -public: - ARM64AsmPrinter(TargetMachine &TM, MCStreamer &Streamer) - : AsmPrinter(TM, Streamer), Subtarget(&TM.getSubtarget()), - MCInstLowering(OutContext, *Mang, *this), SM(*this), ARM64FI(nullptr), - LOHLabelCounter(0) {} - - const char *getPassName() const override { return "ARM64 Assembly Printer"; } - - /// \brief Wrapper for MCInstLowering.lowerOperand() for the - /// tblgen'erated pseudo lowering. - bool lowerOperand(const MachineOperand &MO, MCOperand &MCOp) const { - return MCInstLowering.lowerOperand(MO, MCOp); - } - - void LowerSTACKMAP(MCStreamer &OutStreamer, StackMaps &SM, - const MachineInstr &MI); - void LowerPATCHPOINT(MCStreamer &OutStreamer, StackMaps &SM, - const MachineInstr &MI); - /// \brief tblgen'erated driver function for lowering simple MI->MC - /// pseudo instructions. 
- bool emitPseudoExpansionLowering(MCStreamer &OutStreamer, - const MachineInstr *MI); - - void EmitInstruction(const MachineInstr *MI) override; - - void getAnalysisUsage(AnalysisUsage &AU) const override { - AsmPrinter::getAnalysisUsage(AU); - AU.setPreservesAll(); - } - - bool runOnMachineFunction(MachineFunction &F) override { - ARM64FI = F.getInfo(); - return AsmPrinter::runOnMachineFunction(F); - } - -private: - MachineLocation getDebugValueLocation(const MachineInstr *MI) const; - void printOperand(const MachineInstr *MI, unsigned OpNum, raw_ostream &O); - bool printAsmMRegister(const MachineOperand &MO, char Mode, raw_ostream &O); - bool printAsmRegInClass(const MachineOperand &MO, - const TargetRegisterClass *RC, bool isVector, - raw_ostream &O); - - bool PrintAsmOperand(const MachineInstr *MI, unsigned OpNum, - unsigned AsmVariant, const char *ExtraCode, - raw_ostream &O) override; - bool PrintAsmMemoryOperand(const MachineInstr *MI, unsigned OpNum, - unsigned AsmVariant, const char *ExtraCode, - raw_ostream &O) override; - - void PrintDebugValueComment(const MachineInstr *MI, raw_ostream &OS); - - void EmitFunctionBodyEnd() override; - - MCSymbol *GetCPISymbol(unsigned CPID) const override; - void EmitEndOfAsmFile(Module &M) override; - ARM64FunctionInfo *ARM64FI; - - /// \brief Emit the LOHs contained in ARM64FI. - void EmitLOHs(); - - typedef std::map MInstToMCSymbol; - MInstToMCSymbol LOHInstToLabel; - unsigned LOHLabelCounter; -}; - -} // end of anonymous namespace - -//===----------------------------------------------------------------------===// - -void ARM64AsmPrinter::EmitEndOfAsmFile(Module &M) { - if (Subtarget->isTargetMachO()) { - // Funny Darwin hack: This flag tells the linker that no global symbols - // contain code that falls through to other global symbols (e.g. the obvious - // implementation of multiple entry points). If this doesn't occur, the - // linker can safely perform dead code stripping. Since LLVM never - // generates code that does this, it is always safe to set. - OutStreamer.EmitAssemblerFlag(MCAF_SubsectionsViaSymbols); - SM.serializeToStackMapSection(); - } - - // Emit a .data.rel section containing any stubs that were created. - if (Subtarget->isTargetELF()) { - const TargetLoweringObjectFileELF &TLOFELF = - static_cast(getObjFileLowering()); - - MachineModuleInfoELF &MMIELF = MMI->getObjFileInfo(); - - // Output stubs for external and common global variables. - MachineModuleInfoELF::SymbolListTy Stubs = MMIELF.GetGVStubList(); - if (!Stubs.empty()) { - OutStreamer.SwitchSection(TLOFELF.getDataRelSection()); - const DataLayout *TD = TM.getDataLayout(); - - for (unsigned i = 0, e = Stubs.size(); i != e; ++i) { - OutStreamer.EmitLabel(Stubs[i].first); - OutStreamer.EmitSymbolValue(Stubs[i].second.getPointer(), - TD->getPointerSize(0)); - } - Stubs.clear(); - } - } - -} - -MachineLocation -ARM64AsmPrinter::getDebugValueLocation(const MachineInstr *MI) const { - MachineLocation Location; - assert(MI->getNumOperands() == 4 && "Invalid no. of machine operands!"); - // Frame address. Currently handles register +- offset only. - if (MI->getOperand(0).isReg() && MI->getOperand(1).isImm()) - Location.set(MI->getOperand(0).getReg(), MI->getOperand(1).getImm()); - else { - DEBUG(dbgs() << "DBG_VALUE instruction ignored! 
" << *MI << "\n"); - } - return Location; -} - -void ARM64AsmPrinter::EmitLOHs() { - SmallVector MCArgs; - - for (const auto &D : ARM64FI->getLOHContainer()) { - for (const MachineInstr *MI : D.getArgs()) { - MInstToMCSymbol::iterator LabelIt = LOHInstToLabel.find(MI); - assert(LabelIt != LOHInstToLabel.end() && - "Label hasn't been inserted for LOH related instruction"); - MCArgs.push_back(LabelIt->second); - } - OutStreamer.EmitLOHDirective(D.getKind(), MCArgs); - MCArgs.clear(); - } -} - -void ARM64AsmPrinter::EmitFunctionBodyEnd() { - if (!ARM64FI->getLOHRelated().empty()) - EmitLOHs(); -} - -/// GetCPISymbol - Return the symbol for the specified constant pool entry. -MCSymbol *ARM64AsmPrinter::GetCPISymbol(unsigned CPID) const { - // Darwin uses a linker-private symbol name for constant-pools (to - // avoid addends on the relocation?), ELF has no such concept and - // uses a normal private symbol. - if (getDataLayout().getLinkerPrivateGlobalPrefix()[0]) - return OutContext.GetOrCreateSymbol( - Twine(getDataLayout().getLinkerPrivateGlobalPrefix()) + "CPI" + - Twine(getFunctionNumber()) + "_" + Twine(CPID)); - - return OutContext.GetOrCreateSymbol( - Twine(getDataLayout().getPrivateGlobalPrefix()) + "CPI" + - Twine(getFunctionNumber()) + "_" + Twine(CPID)); -} - -void ARM64AsmPrinter::printOperand(const MachineInstr *MI, unsigned OpNum, - raw_ostream &O) { - const MachineOperand &MO = MI->getOperand(OpNum); - switch (MO.getType()) { - default: - assert(0 && ""); - case MachineOperand::MO_Register: { - unsigned Reg = MO.getReg(); - assert(TargetRegisterInfo::isPhysicalRegister(Reg)); - assert(!MO.getSubReg() && "Subregs should be eliminated!"); - O << ARM64InstPrinter::getRegisterName(Reg); - break; - } - case MachineOperand::MO_Immediate: { - int64_t Imm = MO.getImm(); - O << '#' << Imm; - break; - } - } -} - -bool ARM64AsmPrinter::printAsmMRegister(const MachineOperand &MO, char Mode, - raw_ostream &O) { - unsigned Reg = MO.getReg(); - switch (Mode) { - default: - return true; // Unknown mode. - case 'w': - Reg = getWRegFromXReg(Reg); - break; - case 'x': - Reg = getXRegFromWReg(Reg); - break; - } - - O << ARM64InstPrinter::getRegisterName(Reg); - return false; -} - -// Prints the register in MO using class RC using the offset in the -// new register class. This should not be used for cross class -// printing. -bool ARM64AsmPrinter::printAsmRegInClass(const MachineOperand &MO, - const TargetRegisterClass *RC, - bool isVector, raw_ostream &O) { - assert(MO.isReg() && "Should only get here with a register!"); - const ARM64RegisterInfo *RI = - static_cast(TM.getRegisterInfo()); - unsigned Reg = MO.getReg(); - unsigned RegToPrint = RC->getRegister(RI->getEncodingValue(Reg)); - assert(RI->regsOverlap(RegToPrint, Reg)); - O << ARM64InstPrinter::getRegisterName( - RegToPrint, isVector ? ARM64::vreg : ARM64::NoRegAltName); - return false; -} - -bool ARM64AsmPrinter::PrintAsmOperand(const MachineInstr *MI, unsigned OpNum, - unsigned AsmVariant, - const char *ExtraCode, raw_ostream &O) { - const MachineOperand &MO = MI->getOperand(OpNum); - // Does this asm operand have a single letter operand modifier? - if (ExtraCode && ExtraCode[0]) { - if (ExtraCode[1] != 0) - return true; // Unknown modifier. - - switch (ExtraCode[0]) { - default: - return true; // Unknown modifier. - case 'w': // Print W register - case 'x': // Print X register - if (MO.isReg()) - return printAsmMRegister(MO, ExtraCode[0], O); - if (MO.isImm() && MO.getImm() == 0) { - unsigned Reg = ExtraCode[0] == 'w' ? 
ARM64::WZR : ARM64::XZR; - O << ARM64InstPrinter::getRegisterName(Reg); - return false; - } - printOperand(MI, OpNum, O); - return false; - case 'b': // Print B register. - case 'h': // Print H register. - case 's': // Print S register. - case 'd': // Print D register. - case 'q': // Print Q register. - if (MO.isReg()) { - const TargetRegisterClass *RC; - switch (ExtraCode[0]) { - case 'b': - RC = &ARM64::FPR8RegClass; - break; - case 'h': - RC = &ARM64::FPR16RegClass; - break; - case 's': - RC = &ARM64::FPR32RegClass; - break; - case 'd': - RC = &ARM64::FPR64RegClass; - break; - case 'q': - RC = &ARM64::FPR128RegClass; - break; - default: - return true; - } - return printAsmRegInClass(MO, RC, false /* vector */, O); - } - printOperand(MI, OpNum, O); - return false; - } - } - - // According to ARM, we should emit x and v registers unless we have a - // modifier. - if (MO.isReg()) { - unsigned Reg = MO.getReg(); - - // If this is a w or x register, print an x register. - if (ARM64::GPR32allRegClass.contains(Reg) || - ARM64::GPR64allRegClass.contains(Reg)) - return printAsmMRegister(MO, 'x', O); - - // If this is a b, h, s, d, or q register, print it as a v register. - return printAsmRegInClass(MO, &ARM64::FPR128RegClass, true /* vector */, O); - } - - printOperand(MI, OpNum, O); - return false; -} - -bool ARM64AsmPrinter::PrintAsmMemoryOperand(const MachineInstr *MI, - unsigned OpNum, unsigned AsmVariant, - const char *ExtraCode, - raw_ostream &O) { - if (ExtraCode && ExtraCode[0]) - return true; // Unknown modifier. - - const MachineOperand &MO = MI->getOperand(OpNum); - assert(MO.isReg() && "unexpected inline asm memory operand"); - O << "[" << ARM64InstPrinter::getRegisterName(MO.getReg()) << "]"; - return false; -} - -void ARM64AsmPrinter::PrintDebugValueComment(const MachineInstr *MI, - raw_ostream &OS) { - unsigned NOps = MI->getNumOperands(); - assert(NOps == 4); - OS << '\t' << MAI->getCommentString() << "DEBUG_VALUE: "; - // cast away const; DIetc do not take const operands for some reason. - DIVariable V(const_cast(MI->getOperand(NOps - 1).getMetadata())); - OS << V.getName(); - OS << " <- "; - // Frame address. Currently handles register +- offset only. - assert(MI->getOperand(0).isReg() && MI->getOperand(1).isImm()); - OS << '['; - printOperand(MI, 0, OS); - OS << '+'; - printOperand(MI, 1, OS); - OS << ']'; - OS << "+"; - printOperand(MI, NOps - 2, OS); -} - -void ARM64AsmPrinter::LowerSTACKMAP(MCStreamer &OutStreamer, StackMaps &SM, - const MachineInstr &MI) { - unsigned NumNOPBytes = MI.getOperand(1).getImm(); - - SM.recordStackMap(MI); - // Emit padding. 
- assert(NumNOPBytes % 4 == 0 && "Invalid number of NOP bytes requested!"); - for (unsigned i = 0; i < NumNOPBytes; i += 4) - EmitToStreamer(OutStreamer, MCInstBuilder(ARM64::HINT).addImm(0)); -} - -// Lower a patchpoint of the form: -// [], , , , -void ARM64AsmPrinter::LowerPATCHPOINT(MCStreamer &OutStreamer, StackMaps &SM, - const MachineInstr &MI) { - SM.recordPatchPoint(MI); - - PatchPointOpers Opers(&MI); - - int64_t CallTarget = Opers.getMetaOper(PatchPointOpers::TargetPos).getImm(); - unsigned EncodedBytes = 0; - if (CallTarget) { - assert((CallTarget & 0xFFFFFFFFFFFF) == CallTarget && - "High 16 bits of call target should be zero."); - unsigned ScratchReg = MI.getOperand(Opers.getNextScratchIdx()).getReg(); - EncodedBytes = 16; - // Materialize the jump address: - EmitToStreamer(OutStreamer, MCInstBuilder(ARM64::MOVZWi) - .addReg(ScratchReg) - .addImm((CallTarget >> 32) & 0xFFFF) - .addImm(32)); - EmitToStreamer(OutStreamer, MCInstBuilder(ARM64::MOVKWi) - .addReg(ScratchReg) - .addReg(ScratchReg) - .addImm((CallTarget >> 16) & 0xFFFF) - .addImm(16)); - EmitToStreamer(OutStreamer, MCInstBuilder(ARM64::MOVKWi) - .addReg(ScratchReg) - .addReg(ScratchReg) - .addImm(CallTarget & 0xFFFF) - .addImm(0)); - EmitToStreamer(OutStreamer, MCInstBuilder(ARM64::BLR).addReg(ScratchReg)); - } - // Emit padding. - unsigned NumBytes = Opers.getMetaOper(PatchPointOpers::NBytesPos).getImm(); - assert(NumBytes >= EncodedBytes && - "Patchpoint can't request size less than the length of a call."); - assert((NumBytes - EncodedBytes) % 4 == 0 && - "Invalid number of NOP bytes requested!"); - for (unsigned i = EncodedBytes; i < NumBytes; i += 4) - EmitToStreamer(OutStreamer, MCInstBuilder(ARM64::HINT).addImm(0)); -} - -// Simple pseudo-instructions have their lowering (with expansion to real -// instructions) auto-generated. -#include "ARM64GenMCPseudoLowering.inc" - -void ARM64AsmPrinter::EmitInstruction(const MachineInstr *MI) { - // Do any auto-generated pseudo lowerings. - if (emitPseudoExpansionLowering(OutStreamer, MI)) - return; - - if (ARM64FI->getLOHRelated().count(MI)) { - // Generate a label for LOH related instruction - MCSymbol *LOHLabel = GetTempSymbol("loh", LOHLabelCounter++); - // Associate the instruction with the label - LOHInstToLabel[MI] = LOHLabel; - OutStreamer.EmitLabel(LOHLabel); - } - - // Do any manual lowerings. - switch (MI->getOpcode()) { - default: - break; - case ARM64::DBG_VALUE: { - if (isVerbose() && OutStreamer.hasRawTextSupport()) { - SmallString<128> TmpStr; - raw_svector_ostream OS(TmpStr); - PrintDebugValueComment(MI, OS); - OutStreamer.EmitRawText(StringRef(OS.str())); - } - return; - } - - // Tail calls use pseudo instructions so they have the proper code-gen - // attributes (isCall, isReturn, etc.). We lower them to the real - // instruction here. - case ARM64::TCRETURNri: { - MCInst TmpInst; - TmpInst.setOpcode(ARM64::BR); - TmpInst.addOperand(MCOperand::CreateReg(MI->getOperand(0).getReg())); - EmitToStreamer(OutStreamer, TmpInst); - return; - } - case ARM64::TCRETURNdi: { - MCOperand Dest; - MCInstLowering.lowerOperand(MI->getOperand(0), Dest); - MCInst TmpInst; - TmpInst.setOpcode(ARM64::B); - TmpInst.addOperand(Dest); - EmitToStreamer(OutStreamer, TmpInst); - return; - } - case ARM64::TLSDESC_BLR: { - MCOperand Callee, Sym; - MCInstLowering.lowerOperand(MI->getOperand(0), Callee); - MCInstLowering.lowerOperand(MI->getOperand(1), Sym); - - // First emit a relocation-annotation. 
This expands to no code, but requests - // the following instruction gets an R_AARCH64_TLSDESC_CALL. - MCInst TLSDescCall; - TLSDescCall.setOpcode(ARM64::TLSDESCCALL); - TLSDescCall.addOperand(Sym); - EmitToStreamer(OutStreamer, TLSDescCall); - - // Other than that it's just a normal indirect call to the function loaded - // from the descriptor. - MCInst BLR; - BLR.setOpcode(ARM64::BLR); - BLR.addOperand(Callee); - EmitToStreamer(OutStreamer, BLR); - - return; - } - - case TargetOpcode::STACKMAP: - return LowerSTACKMAP(OutStreamer, SM, *MI); - - case TargetOpcode::PATCHPOINT: - return LowerPATCHPOINT(OutStreamer, SM, *MI); - } - - // Finally, do the automated lowerings for everything else. - MCInst TmpInst; - MCInstLowering.Lower(MI, TmpInst); - EmitToStreamer(OutStreamer, TmpInst); -} - -// Force static initialization. -extern "C" void LLVMInitializeARM64AsmPrinter() { - RegisterAsmPrinter X(TheARM64leTarget); - RegisterAsmPrinter Y(TheARM64beTarget); - - RegisterAsmPrinter Z(TheAArch64leTarget); - RegisterAsmPrinter W(TheAArch64beTarget); -} diff --git a/lib/Target/ARM64/ARM64BranchRelaxation.cpp b/lib/Target/ARM64/ARM64BranchRelaxation.cpp deleted file mode 100644 index 73be3504790..00000000000 --- a/lib/Target/ARM64/ARM64BranchRelaxation.cpp +++ /dev/null @@ -1,509 +0,0 @@ -//===-- ARM64BranchRelaxation.cpp - ARM64 branch relaxation ---------------===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -//===----------------------------------------------------------------------===// - -#include "ARM64.h" -#include "ARM64InstrInfo.h" -#include "ARM64MachineFunctionInfo.h" -#include "llvm/ADT/SmallVector.h" -#include "llvm/CodeGen/MachineFunctionPass.h" -#include "llvm/CodeGen/MachineInstrBuilder.h" -#include "llvm/Support/Debug.h" -#include "llvm/Support/ErrorHandling.h" -#include "llvm/Support/Format.h" -#include "llvm/Support/raw_ostream.h" -#include "llvm/ADT/Statistic.h" -#include "llvm/Support/CommandLine.h" -using namespace llvm; - -#define DEBUG_TYPE "arm64-branch-relax" - -static cl::opt -BranchRelaxation("arm64-branch-relax", cl::Hidden, cl::init(true), - cl::desc("Relax out of range conditional branches")); - -static cl::opt -TBZDisplacementBits("arm64-tbz-offset-bits", cl::Hidden, cl::init(14), - cl::desc("Restrict range of TB[N]Z instructions (DEBUG)")); - -static cl::opt -CBZDisplacementBits("arm64-cbz-offset-bits", cl::Hidden, cl::init(19), - cl::desc("Restrict range of CB[N]Z instructions (DEBUG)")); - -static cl::opt -BCCDisplacementBits("arm64-bcc-offset-bits", cl::Hidden, cl::init(19), - cl::desc("Restrict range of Bcc instructions (DEBUG)")); - -STATISTIC(NumSplit, "Number of basic blocks split"); -STATISTIC(NumRelaxed, "Number of conditional branches relaxed"); - -namespace { -class ARM64BranchRelaxation : public MachineFunctionPass { - /// BasicBlockInfo - Information about the offset and size of a single - /// basic block. - struct BasicBlockInfo { - /// Offset - Distance from the beginning of the function to the beginning - /// of this basic block. - /// - /// The offset is always aligned as required by the basic block. - unsigned Offset; - - /// Size - Size of the basic block in bytes. If the block contains - /// inline assembly, this is a worst case estimate. 
- /// - /// The size does not include any alignment padding whether from the - /// beginning of the block, or from an aligned jump table at the end. - unsigned Size; - - BasicBlockInfo() : Offset(0), Size(0) {} - - /// Compute the offset immediately following this block. If LogAlign is - /// specified, return the offset the successor block will get if it has - /// this alignment. - unsigned postOffset(unsigned LogAlign = 0) const { - unsigned PO = Offset + Size; - unsigned Align = 1 << LogAlign; - return (PO + Align - 1) / Align * Align; - } - }; - - SmallVector BlockInfo; - - MachineFunction *MF; - const ARM64InstrInfo *TII; - - bool relaxBranchInstructions(); - void scanFunction(); - MachineBasicBlock *splitBlockBeforeInstr(MachineInstr *MI); - void adjustBlockOffsets(MachineBasicBlock &MBB); - bool isBlockInRange(MachineInstr *MI, MachineBasicBlock *BB, unsigned Disp); - bool fixupConditionalBranch(MachineInstr *MI); - void computeBlockSize(const MachineBasicBlock &MBB); - unsigned getInstrOffset(MachineInstr *MI) const; - void dumpBBs(); - void verify(); - -public: - static char ID; - ARM64BranchRelaxation() : MachineFunctionPass(ID) {} - - bool runOnMachineFunction(MachineFunction &MF) override; - - const char *getPassName() const override { - return "ARM64 branch relaxation pass"; - } -}; -char ARM64BranchRelaxation::ID = 0; -} - -/// verify - check BBOffsets, BBSizes, alignment of islands -void ARM64BranchRelaxation::verify() { -#ifndef NDEBUG - unsigned PrevNum = MF->begin()->getNumber(); - for (MachineBasicBlock &MBB : *MF) { - unsigned Align = MBB.getAlignment(); - unsigned Num = MBB.getNumber(); - assert(BlockInfo[Num].Offset % (1u << Align) == 0); - assert(!Num || BlockInfo[PrevNum].postOffset() <= BlockInfo[Num].Offset); - PrevNum = Num; - } -#endif -} - -/// print block size and offset information - debugging -void ARM64BranchRelaxation::dumpBBs() { - for (auto &MBB : *MF) { - const BasicBlockInfo &BBI = BlockInfo[MBB.getNumber()]; - dbgs() << format("BB#%u\toffset=%08x\t", MBB.getNumber(), BBI.Offset) - << format("size=%#x\n", BBI.Size); - } -} - -/// BBHasFallthrough - Return true if the specified basic block can fallthrough -/// into the block immediately after it. -static bool BBHasFallthrough(MachineBasicBlock *MBB) { - // Get the next machine basic block in the function. - MachineFunction::iterator MBBI = MBB; - // Can't fall off end of function. - MachineBasicBlock *NextBB = std::next(MBBI); - if (NextBB == MBB->getParent()->end()) - return false; - - for (MachineBasicBlock *S : MBB->successors()) - if (S == NextBB) - return true; - - return false; -} - -/// scanFunction - Do the initial scan of the function, building up -/// information about each block. -void ARM64BranchRelaxation::scanFunction() { - BlockInfo.clear(); - BlockInfo.resize(MF->getNumBlockIDs()); - - // First thing, compute the size of all basic blocks, and see if the function - // has any inline assembly in it. If so, we have to be conservative about - // alignment assumptions, as we don't know for sure the size of any - // instructions in the inline assembly. - for (MachineBasicBlock &MBB : *MF) - computeBlockSize(MBB); - - // Compute block offsets and known bits. - adjustBlockOffsets(*MF->begin()); -} - -/// computeBlockSize - Compute the size for MBB. -/// This function updates BlockInfo directly. 
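The postOffset() helper in the deleted BasicBlockInfo struct rounds the end of a block up to the alignment its successor requires. A minimal standalone sketch of that rounding arithmetic, not part of the patch; the sample offsets and alignment are invented for illustration:

#include <cassert>
#include <cstdio>

// Round Offset + Size up to a 2^LogAlign boundary, mirroring
// BasicBlockInfo::postOffset() in the deleted pass.
static unsigned postOffset(unsigned Offset, unsigned Size, unsigned LogAlign = 0) {
  unsigned PO = Offset + Size;
  unsigned Align = 1u << LogAlign;
  return (PO + Align - 1) / Align * Align;
}

int main() {
  // A 20-byte block starting at offset 4, with a successor aligned to 16 bytes.
  assert(postOffset(4, 20) == 24);     // no extra alignment requested
  assert(postOffset(4, 20, 4) == 32);  // rounded up to the next 16-byte boundary
  std::printf("post offsets: %u %u\n", postOffset(4, 20), postOffset(4, 20, 4));
  return 0;
}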
-void ARM64BranchRelaxation::computeBlockSize(const MachineBasicBlock &MBB) { - unsigned Size = 0; - for (const MachineInstr &MI : MBB) - Size += TII->GetInstSizeInBytes(&MI); - BlockInfo[MBB.getNumber()].Size = Size; -} - -/// getInstrOffset - Return the current offset of the specified machine -/// instruction from the start of the function. This offset changes as stuff is -/// moved around inside the function. -unsigned ARM64BranchRelaxation::getInstrOffset(MachineInstr *MI) const { - MachineBasicBlock *MBB = MI->getParent(); - - // The offset is composed of two things: the sum of the sizes of all MBB's - // before this instruction's block, and the offset from the start of the block - // it is in. - unsigned Offset = BlockInfo[MBB->getNumber()].Offset; - - // Sum instructions before MI in MBB. - for (MachineBasicBlock::iterator I = MBB->begin(); &*I != MI; ++I) { - assert(I != MBB->end() && "Didn't find MI in its own basic block?"); - Offset += TII->GetInstSizeInBytes(I); - } - return Offset; -} - -void ARM64BranchRelaxation::adjustBlockOffsets(MachineBasicBlock &Start) { - unsigned PrevNum = Start.getNumber(); - for (auto &MBB : make_range(MachineFunction::iterator(Start), MF->end())) { - unsigned Num = MBB.getNumber(); - if (!Num) // block zero is never changed from offset zero. - continue; - // Get the offset and known bits at the end of the layout predecessor. - // Include the alignment of the current block. - unsigned LogAlign = MBB.getAlignment(); - BlockInfo[Num].Offset = BlockInfo[PrevNum].postOffset(LogAlign); - PrevNum = Num; - } -} - -/// Split the basic block containing MI into two blocks, which are joined by -/// an unconditional branch. Update data structures and renumber blocks to -/// account for this change and returns the newly created block. -/// NOTE: Successor list of the original BB is out of date after this function, -/// and must be updated by the caller! Other transforms follow using this -/// utility function, so no point updating now rather than waiting. -MachineBasicBlock * -ARM64BranchRelaxation::splitBlockBeforeInstr(MachineInstr *MI) { - MachineBasicBlock *OrigBB = MI->getParent(); - - // Create a new MBB for the code after the OrigBB. - MachineBasicBlock *NewBB = - MF->CreateMachineBasicBlock(OrigBB->getBasicBlock()); - MachineFunction::iterator MBBI = OrigBB; - ++MBBI; - MF->insert(MBBI, NewBB); - - // Splice the instructions starting with MI over to NewBB. - NewBB->splice(NewBB->end(), OrigBB, MI, OrigBB->end()); - - // Add an unconditional branch from OrigBB to NewBB. - // Note the new unconditional branch is not being recorded. - // There doesn't seem to be meaningful DebugInfo available; this doesn't - // correspond to anything in the source. - BuildMI(OrigBB, DebugLoc(), TII->get(ARM64::B)).addMBB(NewBB); - - // Insert an entry into BlockInfo to align it properly with the block numbers. - BlockInfo.insert(BlockInfo.begin() + NewBB->getNumber(), BasicBlockInfo()); - - // Figure out how large the OrigBB is. As the first half of the original - // block, it cannot contain a tablejump. The size includes - // the new jump we added. (It should be possible to do this without - // recounting everything, but it's very confusing, and this is rarely - // executed.) - computeBlockSize(*OrigBB); - - // Figure out how large the NewMBB is. As the second half of the original - // block, it may contain a tablejump. - computeBlockSize(*NewBB); - - // All BBOffsets following these blocks must be modified. 
- adjustBlockOffsets(*OrigBB); - - ++NumSplit; - - return NewBB; -} - -/// isBlockInRange - Returns true if the distance between specific MI and -/// specific BB can fit in MI's displacement field. -bool ARM64BranchRelaxation::isBlockInRange(MachineInstr *MI, - MachineBasicBlock *DestBB, - unsigned Bits) { - unsigned MaxOffs = ((1 << (Bits - 1)) - 1) << 2; - unsigned BrOffset = getInstrOffset(MI); - unsigned DestOffset = BlockInfo[DestBB->getNumber()].Offset; - - DEBUG(dbgs() << "Branch of destination BB#" << DestBB->getNumber() - << " from BB#" << MI->getParent()->getNumber() - << " max delta=" << MaxOffs << " from " << getInstrOffset(MI) - << " to " << DestOffset << " offset " - << int(DestOffset - BrOffset) << "\t" << *MI); - - // Branch before the Dest. - if (BrOffset <= DestOffset) - return (DestOffset - BrOffset <= MaxOffs); - return (BrOffset - DestOffset <= MaxOffs); -} - -static bool isConditionalBranch(unsigned Opc) { - switch (Opc) { - default: - return false; - case ARM64::TBZW: - case ARM64::TBNZW: - case ARM64::TBZX: - case ARM64::TBNZX: - case ARM64::CBZW: - case ARM64::CBNZW: - case ARM64::CBZX: - case ARM64::CBNZX: - case ARM64::Bcc: - return true; - } -} - -static MachineBasicBlock *getDestBlock(MachineInstr *MI) { - switch (MI->getOpcode()) { - default: - assert(0 && "unexpected opcode!"); - case ARM64::TBZW: - case ARM64::TBNZW: - case ARM64::TBZX: - case ARM64::TBNZX: - return MI->getOperand(2).getMBB(); - case ARM64::CBZW: - case ARM64::CBNZW: - case ARM64::CBZX: - case ARM64::CBNZX: - case ARM64::Bcc: - return MI->getOperand(1).getMBB(); - } -} - -static unsigned getOppositeConditionOpcode(unsigned Opc) { - switch (Opc) { - default: - assert(0 && "unexpected opcode!"); - case ARM64::TBNZW: return ARM64::TBZW; - case ARM64::TBNZX: return ARM64::TBZX; - case ARM64::TBZW: return ARM64::TBNZW; - case ARM64::TBZX: return ARM64::TBNZX; - case ARM64::CBNZW: return ARM64::CBZW; - case ARM64::CBNZX: return ARM64::CBZX; - case ARM64::CBZW: return ARM64::CBNZW; - case ARM64::CBZX: return ARM64::CBNZX; - case ARM64::Bcc: return ARM64::Bcc; // Condition is an operand for Bcc. - } -} - -static unsigned getBranchDisplacementBits(unsigned Opc) { - switch (Opc) { - default: - assert(0 && "unexpected opcode!"); - case ARM64::TBNZW: - case ARM64::TBZW: - case ARM64::TBNZX: - case ARM64::TBZX: - return TBZDisplacementBits; - case ARM64::CBNZW: - case ARM64::CBZW: - case ARM64::CBNZX: - case ARM64::CBZX: - return CBZDisplacementBits; - case ARM64::Bcc: - return BCCDisplacementBits; - } -} - -static inline void invertBccCondition(MachineInstr *MI) { - assert(MI->getOpcode() == ARM64::Bcc && "Unexpected opcode!"); - ARM64CC::CondCode CC = (ARM64CC::CondCode)MI->getOperand(0).getImm(); - CC = ARM64CC::getInvertedCondCode(CC); - MI->getOperand(0).setImm((int64_t)CC); -} - -/// fixupConditionalBranch - Fix up a conditional branch whose destination is -/// too far away to fit in its displacement field. It is converted to an inverse -/// conditional branch + an unconditional branch to the destination. -bool ARM64BranchRelaxation::fixupConditionalBranch(MachineInstr *MI) { - MachineBasicBlock *DestBB = getDestBlock(MI); - - // Add an unconditional branch to the destination and invert the branch - // condition to jump over it: - // tbz L1 - // => - // tbnz L2 - // b L1 - // L2: - - // If the branch is at the end of its MBB and that has a fall-through block, - // direct the updated conditional branch to the fall-through block. Otherwise, - // split the MBB before the next instruction. 
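The isBlockInRange() check above reduces to asking whether a byte displacement fits the branch's immediate field: the immediate encodes a signed word offset, so the pass computes the reach as ((1 << (Bits - 1)) - 1) << 2. A standalone sketch of that arithmetic using the default widths from the cl::opts above (14 bits for TB[N]Z, 19 for CB[N]Z and B.cc); the sample instruction and target offsets are made up for illustration:

#include <cstdint>
#include <cstdio>
#include <cstdlib>

// Maximum byte displacement encodable by a conditional branch whose immediate
// field holds a signed word offset of 'Bits' bits, as computed by the pass.
static uint64_t maxBranchOffset(unsigned Bits) {
  return ((1ull << (Bits - 1)) - 1) << 2;
}

static bool inRange(uint64_t BrOffset, uint64_t DestOffset, unsigned Bits) {
  uint64_t Max = maxBranchOffset(Bits);
  return BrOffset <= DestOffset ? DestOffset - BrOffset <= Max
                                : BrOffset - DestOffset <= Max;
}

int main() {
  std::printf("TB[N]Z reach:      +/-%llu bytes\n",
              (unsigned long long)maxBranchOffset(14)); // roughly 32 KiB
  std::printf("CB[N]Z/B.cc reach: +/-%llu bytes\n",
              (unsigned long long)maxBranchOffset(19)); // roughly 1 MiB
  // A TBZ at offset 0x1000 targeting 0x11000 (64 KiB away) is out of range,
  // so fixupConditionalBranch would invert it and add an unconditional B.
  std::printf("needs relaxation: %s\n",
              inRange(0x1000, 0x11000, 14) ? "no" : "yes");
  return EXIT_SUCCESS;
}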
- MachineBasicBlock *MBB = MI->getParent(); - MachineInstr *BMI = &MBB->back(); - bool NeedSplit = (BMI != MI) || !BBHasFallthrough(MBB); - - if (BMI != MI) { - if (std::next(MachineBasicBlock::iterator(MI)) == - std::prev(MBB->getLastNonDebugInstr()) && - BMI->getOpcode() == ARM64::B) { - // Last MI in the BB is an unconditional branch. Can we simply invert the - // condition and swap destinations: - // beq L1 - // b L2 - // => - // bne L2 - // b L1 - MachineBasicBlock *NewDest = BMI->getOperand(0).getMBB(); - if (isBlockInRange(MI, NewDest, - getBranchDisplacementBits(MI->getOpcode()))) { - DEBUG(dbgs() << " Invert condition and swap its destination with " - << *BMI); - BMI->getOperand(0).setMBB(DestBB); - unsigned OpNum = - (MI->getOpcode() == ARM64::TBZW || MI->getOpcode() == ARM64::TBNZW || - MI->getOpcode() == ARM64::TBZX || MI->getOpcode() == ARM64::TBNZX) - ? 2 - : 1; - MI->getOperand(OpNum).setMBB(NewDest); - MI->setDesc(TII->get(getOppositeConditionOpcode(MI->getOpcode()))); - if (MI->getOpcode() == ARM64::Bcc) - invertBccCondition(MI); - return true; - } - } - } - - if (NeedSplit) { - // Analyze the branch so we know how to update the successor lists. - MachineBasicBlock *TBB, *FBB; - SmallVector Cond; - TII->AnalyzeBranch(*MBB, TBB, FBB, Cond, false); - - MachineBasicBlock *NewBB = splitBlockBeforeInstr(MI); - // No need for the branch to the next block. We're adding an unconditional - // branch to the destination. - int delta = TII->GetInstSizeInBytes(&MBB->back()); - BlockInfo[MBB->getNumber()].Size -= delta; - MBB->back().eraseFromParent(); - // BlockInfo[SplitBB].Offset is wrong temporarily, fixed below - - // Update the successor lists according to the transformation to follow. - // Do it here since if there's no split, no update is needed. - MBB->replaceSuccessor(FBB, NewBB); - NewBB->addSuccessor(FBB); - } - MachineBasicBlock *NextBB = std::next(MachineFunction::iterator(MBB)); - - DEBUG(dbgs() << " Insert B to BB#" << DestBB->getNumber() - << ", invert condition and change dest. to BB#" - << NextBB->getNumber() << "\n"); - - // Insert a new conditional branch and a new unconditional branch. - MachineInstrBuilder MIB = BuildMI( - MBB, DebugLoc(), TII->get(getOppositeConditionOpcode(MI->getOpcode()))) - .addOperand(MI->getOperand(0)); - if (MI->getOpcode() == ARM64::TBZW || MI->getOpcode() == ARM64::TBNZW || - MI->getOpcode() == ARM64::TBZX || MI->getOpcode() == ARM64::TBNZX) - MIB.addOperand(MI->getOperand(1)); - if (MI->getOpcode() == ARM64::Bcc) - invertBccCondition(MIB); - MIB.addMBB(NextBB); - BlockInfo[MBB->getNumber()].Size += TII->GetInstSizeInBytes(&MBB->back()); - BuildMI(MBB, DebugLoc(), TII->get(ARM64::B)).addMBB(DestBB); - BlockInfo[MBB->getNumber()].Size += TII->GetInstSizeInBytes(&MBB->back()); - - // Remove the old conditional branch. It may or may not still be in MBB. - BlockInfo[MI->getParent()->getNumber()].Size -= TII->GetInstSizeInBytes(MI); - MI->eraseFromParent(); - - // Finally, keep the block offsets up to date. - adjustBlockOffsets(*MBB); - return true; -} - -bool ARM64BranchRelaxation::relaxBranchInstructions() { - bool Changed = false; - // Relaxing branches involves creating new basic blocks, so re-eval - // end() for termination. 
- for (auto &MBB : *MF) { - MachineInstr *MI = MBB.getFirstTerminator(); - if (isConditionalBranch(MI->getOpcode()) && - !isBlockInRange(MI, getDestBlock(MI), - getBranchDisplacementBits(MI->getOpcode()))) { - fixupConditionalBranch(MI); - ++NumRelaxed; - Changed = true; - } - } - return Changed; -} - -bool ARM64BranchRelaxation::runOnMachineFunction(MachineFunction &mf) { - MF = &mf; - - // If the pass is disabled, just bail early. - if (!BranchRelaxation) - return false; - - DEBUG(dbgs() << "***** ARM64BranchRelaxation *****\n"); - - TII = (const ARM64InstrInfo *)MF->getTarget().getInstrInfo(); - - // Renumber all of the machine basic blocks in the function, guaranteeing that - // the numbers agree with the position of the block in the function. - MF->RenumberBlocks(); - - // Do the initial scan of the function, building up information about the - // sizes of each block. - scanFunction(); - - DEBUG(dbgs() << " Basic blocks before relaxation\n"); - DEBUG(dumpBBs()); - - bool MadeChange = false; - while (relaxBranchInstructions()) - MadeChange = true; - - // After a while, this might be made debug-only, but it is not expensive. - verify(); - - DEBUG(dbgs() << " Basic blocks after relaxation\n"); - DEBUG(dbgs() << '\n'; dumpBBs()); - - BlockInfo.clear(); - - return MadeChange; -} - -/// createARM64BranchRelaxation - returns an instance of the constpool -/// island pass. -FunctionPass *llvm::createARM64BranchRelaxation() { - return new ARM64BranchRelaxation(); -} diff --git a/lib/Target/ARM64/ARM64CallingConv.h b/lib/Target/ARM64/ARM64CallingConv.h deleted file mode 100644 index f24ba59dfb9..00000000000 --- a/lib/Target/ARM64/ARM64CallingConv.h +++ /dev/null @@ -1,94 +0,0 @@ -//=== ARM64CallingConv.h - Custom Calling Convention Routines -*- C++ -*-===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -// This file contains the custom routines for the ARM64 Calling Convention that -// aren't done by tablegen. -// -//===----------------------------------------------------------------------===// - -#ifndef ARM64CALLINGCONV_H -#define ARM64CALLINGCONV_H - -#include "ARM64InstrInfo.h" -#include "llvm/IR/CallingConv.h" -#include "llvm/CodeGen/CallingConvLower.h" -#include "llvm/Target/TargetInstrInfo.h" - -namespace llvm { - -/// CC_ARM64_Custom_i1i8i16_Reg - customized handling of passing i1/i8/i16 via -/// register. Here, ValVT can be i1/i8/i16 or i32 depending on whether the -/// argument is already promoted and LocVT is i1/i8/i16. We only promote the -/// argument to i32 if we are sure this argument will be passed in register. -static bool CC_ARM64_Custom_i1i8i16_Reg(unsigned ValNo, MVT ValVT, MVT LocVT, - CCValAssign::LocInfo LocInfo, - ISD::ArgFlagsTy ArgFlags, - CCState &State, - bool IsWebKitJS = false) { - static const MCPhysReg RegList1[] = { ARM64::W0, ARM64::W1, ARM64::W2, - ARM64::W3, ARM64::W4, ARM64::W5, - ARM64::W6, ARM64::W7 }; - static const MCPhysReg RegList2[] = { ARM64::X0, ARM64::X1, ARM64::X2, - ARM64::X3, ARM64::X4, ARM64::X5, - ARM64::X6, ARM64::X7 }; - static const MCPhysReg WebKitRegList1[] = { ARM64::W0 }; - static const MCPhysReg WebKitRegList2[] = { ARM64::X0 }; - - const MCPhysReg *List1 = IsWebKitJS ? WebKitRegList1 : RegList1; - const MCPhysReg *List2 = IsWebKitJS ? 
WebKitRegList2 : RegList2; - - if (unsigned Reg = State.AllocateReg(List1, List2, 8)) { - // Customized extra section for handling i1/i8/i16: - // We need to promote the argument to i32 if it is not done already. - if (ValVT != MVT::i32) { - if (ArgFlags.isSExt()) - LocInfo = CCValAssign::SExt; - else if (ArgFlags.isZExt()) - LocInfo = CCValAssign::ZExt; - else - LocInfo = CCValAssign::AExt; - ValVT = MVT::i32; - } - // Set LocVT to i32 as well if passing via register. - LocVT = MVT::i32; - State.addLoc(CCValAssign::getReg(ValNo, ValVT, Reg, LocVT, LocInfo)); - return true; - } - return false; -} - -/// CC_ARM64_WebKit_JS_i1i8i16_Reg - customized handling of passing i1/i8/i16 -/// via register. This behaves the same as CC_ARM64_Custom_i1i8i16_Reg, but only -/// uses the first register. -static bool CC_ARM64_WebKit_JS_i1i8i16_Reg(unsigned ValNo, MVT ValVT, MVT LocVT, - CCValAssign::LocInfo LocInfo, - ISD::ArgFlagsTy ArgFlags, - CCState &State) { - return CC_ARM64_Custom_i1i8i16_Reg(ValNo, ValVT, LocVT, LocInfo, ArgFlags, - State, true); -} - -/// CC_ARM64_Custom_i1i8i16_Stack: customized handling of passing i1/i8/i16 on -/// stack. Here, ValVT can be i1/i8/i16 or i32 depending on whether the argument -/// is already promoted and LocVT is i1/i8/i16. If ValVT is already promoted, -/// it will be truncated back to i1/i8/i16. -static bool CC_ARM64_Custom_i1i8i16_Stack(unsigned ValNo, MVT ValVT, MVT LocVT, - CCValAssign::LocInfo LocInfo, - ISD::ArgFlagsTy ArgFlags, - CCState &State) { - unsigned Space = ((LocVT == MVT::i1 || LocVT == MVT::i8) ? 1 : 2); - unsigned Offset12 = State.AllocateStack(Space, Space); - ValVT = LocVT; - State.addLoc(CCValAssign::getMem(ValNo, ValVT, Offset12, LocVT, LocInfo)); - return true; -} - -} // End llvm namespace - -#endif diff --git a/lib/Target/ARM64/ARM64CallingConvention.td b/lib/Target/ARM64/ARM64CallingConvention.td deleted file mode 100644 index 0ef5601718d..00000000000 --- a/lib/Target/ARM64/ARM64CallingConvention.td +++ /dev/null @@ -1,236 +0,0 @@ -//===- ARM64CallingConv.td - Calling Conventions for ARM64 -*- tablegen -*-===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -// This describes the calling conventions for ARM64 architecture. -// -//===----------------------------------------------------------------------===// - -/// CCIfAlign - Match of the original alignment of the arg -class CCIfAlign : - CCIf; -/// CCIfBigEndian - Match only if we're in big endian mode. -class CCIfBigEndian : - CCIf<"State.getTarget().getDataLayout()->isBigEndian()", A>; - -//===----------------------------------------------------------------------===// -// ARM AAPCS64 Calling Convention -//===----------------------------------------------------------------------===// - -def CC_ARM64_AAPCS : CallingConv<[ - CCIfType<[v2f32], CCBitConvertToType>, - CCIfType<[v2f64, v4f32], CCBitConvertToType>, - - // Big endian vectors must be passed as if they were 1-element vectors so that - // their lanes are in a consistent order. - CCIfBigEndian>>, - CCIfBigEndian>>, - - // An SRet is passed in X8, not X0 like a normal pointer parameter. - CCIfSRet>>, - - // Put ByVal arguments directly on the stack. Minimum size and alignment of a - // slot is 64-bit. - CCIfByVal>, - - // Handle i1, i8, i16, i32, i64, f32, f64 and v2f64 by passing in registers, - // up to eight each of GPR and FPR. 
- CCIfType<[i1, i8, i16], CCCustom<"CC_ARM64_Custom_i1i8i16_Reg">>, - CCIfType<[i32], CCAssignToRegWithShadow<[W0, W1, W2, W3, W4, W5, W6, W7], - [X0, X1, X2, X3, X4, X5, X6, X7]>>, - // i128 is split to two i64s, we can't fit half to register X7. - CCIfType<[i64], CCIfSplit>>, - - // i128 is split to two i64s, and its stack alignment is 16 bytes. - CCIfType<[i64], CCIfSplit>>, - - CCIfType<[i64], CCAssignToRegWithShadow<[X0, X1, X2, X3, X4, X5, X6, X7], - [W0, W1, W2, W3, W4, W5, W6, W7]>>, - CCIfType<[f32], CCAssignToRegWithShadow<[S0, S1, S2, S3, S4, S5, S6, S7], - [Q0, Q1, Q2, Q3, Q4, Q5, Q6, Q7]>>, - CCIfType<[f64], CCAssignToRegWithShadow<[D0, D1, D2, D3, D4, D5, D6, D7], - [Q0, Q1, Q2, Q3, Q4, Q5, Q6, Q7]>>, - CCIfType<[v1i64, v2i32, v4i16, v8i8, v1f64, v2f32], - CCAssignToRegWithShadow<[D0, D1, D2, D3, D4, D5, D6, D7], - [Q0, Q1, Q2, Q3, Q4, Q5, Q6, Q7]>>, - CCIfType<[f128, v2i64, v4i32, v8i16, v16i8, v4f32, v2f64], - CCAssignToReg<[Q0, Q1, Q2, Q3, Q4, Q5, Q6, Q7]>>, - - // If more than will fit in registers, pass them on the stack instead. - CCIfType<[i1, i8, i16], CCAssignToStack<8, 8>>, - CCIfType<[i32, f32], CCAssignToStack<8, 8>>, - CCIfType<[i64, f64, v1f64, v2f32, v1i64, v2i32, v4i16, v8i8], - CCAssignToStack<8, 8>>, - CCIfType<[f128, v2i64, v4i32, v8i16, v16i8, v4f32, v2f64], - CCAssignToStack<16, 16>> -]>; - -def RetCC_ARM64_AAPCS : CallingConv<[ - CCIfType<[v2f32], CCBitConvertToType>, - CCIfType<[v2f64, v4f32], CCBitConvertToType>, - - // Big endian vectors must be passed as if they were 1-element vectors so that - // their lanes are in a consistent order. - CCIfBigEndian>>, - CCIfBigEndian>>, - - CCIfType<[i32], CCAssignToRegWithShadow<[W0, W1, W2, W3, W4, W5, W6, W7], - [X0, X1, X2, X3, X4, X5, X6, X7]>>, - CCIfType<[i64], CCAssignToRegWithShadow<[X0, X1, X2, X3, X4, X5, X6, X7], - [W0, W1, W2, W3, W4, W5, W6, W7]>>, - CCIfType<[f32], CCAssignToRegWithShadow<[S0, S1, S2, S3, S4, S5, S6, S7], - [Q0, Q1, Q2, Q3, Q4, Q5, Q6, Q7]>>, - CCIfType<[f64], CCAssignToRegWithShadow<[D0, D1, D2, D3, D4, D5, D6, D7], - [Q0, Q1, Q2, Q3, Q4, Q5, Q6, Q7]>>, - CCIfType<[v1i64, v2i32, v4i16, v8i8, v1f64, v2f32], - CCAssignToRegWithShadow<[D0, D1, D2, D3, D4, D5, D6, D7], - [Q0, Q1, Q2, Q3, Q4, Q5, Q6, Q7]>>, - CCIfType<[f128, v2i64, v4i32, v8i16, v16i8, v4f32, v2f64], - CCAssignToReg<[Q0, Q1, Q2, Q3, Q4, Q5, Q6, Q7]>> -]>; - - -// Darwin uses a calling convention which differs in only two ways -// from the standard one at this level: -// + i128s (i.e. split i64s) don't need even registers. -// + Stack slots are sized as needed rather than being at least 64-bit. -def CC_ARM64_DarwinPCS : CallingConv<[ - CCIfType<[v2f32], CCBitConvertToType>, - CCIfType<[v2f64, v4f32, f128], CCBitConvertToType>, - - // An SRet is passed in X8, not X0 like a normal pointer parameter. - CCIfSRet>>, - - // Put ByVal arguments directly on the stack. Minimum size and alignment of a - // slot is 64-bit. - CCIfByVal>, - - // Handle i1, i8, i16, i32, i64, f32, f64 and v2f64 by passing in registers, - // up to eight each of GPR and FPR. - CCIfType<[i1, i8, i16], CCCustom<"CC_ARM64_Custom_i1i8i16_Reg">>, - CCIfType<[i32], CCAssignToRegWithShadow<[W0, W1, W2, W3, W4, W5, W6, W7], - [X0, X1, X2, X3, X4, X5, X6, X7]>>, - // i128 is split to two i64s, we can't fit half to register X7. - CCIfType<[i64], - CCIfSplit>>, - // i128 is split to two i64s, and its stack alignment is 16 bytes. 
- CCIfType<[i64], CCIfSplit>>, - - CCIfType<[i64], CCAssignToRegWithShadow<[X0, X1, X2, X3, X4, X5, X6, X7], - [W0, W1, W2, W3, W4, W5, W6, W7]>>, - CCIfType<[f32], CCAssignToRegWithShadow<[S0, S1, S2, S3, S4, S5, S6, S7], - [Q0, Q1, Q2, Q3, Q4, Q5, Q6, Q7]>>, - CCIfType<[f64], CCAssignToRegWithShadow<[D0, D1, D2, D3, D4, D5, D6, D7], - [Q0, Q1, Q2, Q3, Q4, Q5, Q6, Q7]>>, - CCIfType<[v1i64, v2i32, v4i16, v8i8, v1f64, v2f32], - CCAssignToRegWithShadow<[D0, D1, D2, D3, D4, D5, D6, D7], - [Q0, Q1, Q2, Q3, Q4, Q5, Q6, Q7]>>, - CCIfType<[v2i64, v4i32, v8i16, v16i8, v4f32, v2f64], - CCAssignToReg<[Q0, Q1, Q2, Q3, Q4, Q5, Q6, Q7]>>, - - // If more than will fit in registers, pass them on the stack instead. - CCIfType<[i1, i8, i16], CCCustom<"CC_ARM64_Custom_i1i8i16_Stack">>, - CCIfType<[i32, f32], CCAssignToStack<4, 4>>, - CCIfType<[i64, f64, v1f64, v2f32, v1i64, v2i32, v4i16, v8i8], - CCAssignToStack<8, 8>>, - CCIfType<[v2i64, v4i32, v8i16, v16i8, v4f32, v2f64], CCAssignToStack<16, 16>> -]>; - -def CC_ARM64_DarwinPCS_VarArg : CallingConv<[ - CCIfType<[v2f32], CCBitConvertToType>, - CCIfType<[v2f64, v4f32, f128], CCBitConvertToType>, - - // Handle all scalar types as either i64 or f64. - CCIfType<[i8, i16, i32], CCPromoteToType>, - CCIfType<[f32], CCPromoteToType>, - - // Everything is on the stack. - // i128 is split to two i64s, and its stack alignment is 16 bytes. - CCIfType<[i64], CCIfSplit>>, - CCIfType<[i64, f64, v1i64, v2i32, v4i16, v8i8, v1f64, v2f32], CCAssignToStack<8, 8>>, - CCIfType<[v2i64, v4i32, v8i16, v16i8, v4f32, v2f64], CCAssignToStack<16, 16>> -]>; - -// The WebKit_JS calling convention only passes the first argument (the callee) -// in register and the remaining arguments on stack. We allow 32bit stack slots, -// so that WebKit can write partial values in the stack and define the other -// 32bit quantity as undef. -def CC_ARM64_WebKit_JS : CallingConv<[ - // Handle i1, i8, i16, i32, and i64 passing in register X0 (W0). - CCIfType<[i1, i8, i16], CCCustom<"CC_ARM64_WebKit_JS_i1i8i16_Reg">>, - CCIfType<[i32], CCAssignToRegWithShadow<[W0], [X0]>>, - CCIfType<[i64], CCAssignToRegWithShadow<[X0], [W0]>>, - - // Pass the remaining arguments on the stack instead. - CCIfType<[i1, i8, i16], CCAssignToStack<4, 4>>, - CCIfType<[i32, f32], CCAssignToStack<4, 4>>, - CCIfType<[i64, f64], CCAssignToStack<8, 8>> -]>; - -def RetCC_ARM64_WebKit_JS : CallingConv<[ - CCIfType<[i32], CCAssignToRegWithShadow<[W0, W1, W2, W3, W4, W5, W6, W7], - [X0, X1, X2, X3, X4, X5, X6, X7]>>, - CCIfType<[i64], CCAssignToRegWithShadow<[X0, X1, X2, X3, X4, X5, X6, X7], - [W0, W1, W2, W3, W4, W5, W6, W7]>>, - CCIfType<[f32], CCAssignToRegWithShadow<[S0, S1, S2, S3, S4, S5, S6, S7], - [Q0, Q1, Q2, Q3, Q4, Q5, Q6, Q7]>>, - CCIfType<[f64], CCAssignToRegWithShadow<[D0, D1, D2, D3, D4, D5, D6, D7], - [Q0, Q1, Q2, Q3, Q4, Q5, Q6, Q7]>> -]>; - -// FIXME: LR is only callee-saved in the sense that *we* preserve it and are -// presumably a callee to someone. External functions may not do so, but this -// is currently safe since BL has LR as an implicit-def and what happens after a -// tail call doesn't matter. -// -// It would be better to model its preservation semantics properly (create a -// vreg on entry, use it in RET & tail call generation; make that vreg def if we -// end up saving LR as part of a call frame). Watch this space... 
-def CSR_ARM64_AAPCS : CalleeSavedRegs<(add LR, FP, X19, X20, X21, X22, - X23, X24, X25, X26, X27, X28, - D8, D9, D10, D11, - D12, D13, D14, D15)>; - -// Constructors and destructors return 'this' in the iOS 64-bit C++ ABI; since -// 'this' and the pointer return value are both passed in X0 in these cases, -// this can be partially modelled by treating X0 as a callee-saved register; -// only the resulting RegMask is used; the SaveList is ignored -// -// (For generic ARM 64-bit ABI code, clang will not generate constructors or -// destructors with 'this' returns, so this RegMask will not be used in that -// case) -def CSR_ARM64_AAPCS_ThisReturn : CalleeSavedRegs<(add CSR_ARM64_AAPCS, X0)>; - -// The function used by Darwin to obtain the address of a thread-local variable -// guarantees more than a normal AAPCS function. x16 and x17 are used on the -// fast path for calculation, but other registers except X0 (argument/return) -// and LR (it is a call, after all) are preserved. -def CSR_ARM64_TLS_Darwin - : CalleeSavedRegs<(add (sub (sequence "X%u", 1, 28), X16, X17), - FP, - (sequence "Q%u", 0, 31))>; - -// The ELF stub used for TLS-descriptor access saves every feasible -// register. Only X0 and LR are clobbered. -def CSR_ARM64_TLS_ELF - : CalleeSavedRegs<(add (sequence "X%u", 1, 28), FP, - (sequence "Q%u", 0, 31))>; - -def CSR_ARM64_AllRegs - : CalleeSavedRegs<(add (sequence "W%u", 0, 30), WSP, - (sequence "X%u", 0, 28), FP, LR, SP, - (sequence "B%u", 0, 31), (sequence "H%u", 0, 31), - (sequence "S%u", 0, 31), (sequence "D%u", 0, 31), - (sequence "Q%u", 0, 31))>; - diff --git a/lib/Target/ARM64/ARM64CleanupLocalDynamicTLSPass.cpp b/lib/Target/ARM64/ARM64CleanupLocalDynamicTLSPass.cpp deleted file mode 100644 index dce1301b92e..00000000000 --- a/lib/Target/ARM64/ARM64CleanupLocalDynamicTLSPass.cpp +++ /dev/null @@ -1,147 +0,0 @@ -//===-- ARM64CleanupLocalDynamicTLSPass.cpp -----------------------*- C++ -*-=// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -// Local-dynamic access to thread-local variables proceeds in three stages. -// -// 1. The offset of this Module's thread-local area from TPIDR_EL0 is calculated -// in much the same way as a general-dynamic TLS-descriptor access against -// the special symbol _TLS_MODULE_BASE. -// 2. The variable's offset from _TLS_MODULE_BASE_ is calculated using -// instructions with "dtprel" modifiers. -// 3. These two are added, together with TPIDR_EL0, to obtain the variable's -// true address. -// -// This is only better than general-dynamic access to the variable if two or -// more of the first stage TLS-descriptor calculations can be combined. This -// pass looks through a function and performs such combinations. 
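The cleanup pass described in the header above only pays off when a function performs at least two local-dynamic TLS accesses, since the TLSDESC call that computes the _TLS_MODULE_BASE_ address can then be shared. A hypothetical source snippet, not taken from the patch, that would produce such a function when compiled as position-independent code for an ELF target with something like -fPIC -ftls-model=local-dynamic:

// Two internal thread-local variables: under the local-dynamic TLS model both
// accesses need the module's TLS base, so the pass can reuse the result of one
// TLSDESC_BLR call instead of emitting two.
static thread_local int Counter = 0;
static thread_local int HighWater = 0;

int bump(int N) {
  Counter += N;
  if (Counter > HighWater)
    HighWater = Counter;
  return HighWater;
}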
-// -//===----------------------------------------------------------------------===// -#include "ARM64.h" -#include "ARM64InstrInfo.h" -#include "ARM64MachineFunctionInfo.h" -#include "ARM64TargetMachine.h" -#include "llvm/CodeGen/MachineDominators.h" -#include "llvm/CodeGen/MachineFunction.h" -#include "llvm/CodeGen/MachineFunctionPass.h" -#include "llvm/CodeGen/MachineInstrBuilder.h" -#include "llvm/CodeGen/MachineRegisterInfo.h" -using namespace llvm; - -namespace { -struct LDTLSCleanup : public MachineFunctionPass { - static char ID; - LDTLSCleanup() : MachineFunctionPass(ID) {} - - bool runOnMachineFunction(MachineFunction &MF) override { - ARM64FunctionInfo *AFI = MF.getInfo(); - if (AFI->getNumLocalDynamicTLSAccesses() < 2) { - // No point folding accesses if there isn't at least two. - return false; - } - - MachineDominatorTree *DT = &getAnalysis(); - return VisitNode(DT->getRootNode(), 0); - } - - // Visit the dominator subtree rooted at Node in pre-order. - // If TLSBaseAddrReg is non-null, then use that to replace any - // TLS_base_addr instructions. Otherwise, create the register - // when the first such instruction is seen, and then use it - // as we encounter more instructions. - bool VisitNode(MachineDomTreeNode *Node, unsigned TLSBaseAddrReg) { - MachineBasicBlock *BB = Node->getBlock(); - bool Changed = false; - - // Traverse the current block. - for (MachineBasicBlock::iterator I = BB->begin(), E = BB->end(); I != E; - ++I) { - switch (I->getOpcode()) { - case ARM64::TLSDESC_BLR: - // Make sure it's a local dynamic access. - if (!I->getOperand(1).isSymbol() || - strcmp(I->getOperand(1).getSymbolName(), "_TLS_MODULE_BASE_")) - break; - - if (TLSBaseAddrReg) - I = replaceTLSBaseAddrCall(I, TLSBaseAddrReg); - else - I = setRegister(I, &TLSBaseAddrReg); - Changed = true; - break; - default: - break; - } - } - - // Visit the children of this block in the dominator tree. - for (MachineDomTreeNode *N : *Node) { - Changed |= VisitNode(N, TLSBaseAddrReg); - } - - return Changed; - } - - // Replace the TLS_base_addr instruction I with a copy from - // TLSBaseAddrReg, returning the new instruction. - MachineInstr *replaceTLSBaseAddrCall(MachineInstr *I, - unsigned TLSBaseAddrReg) { - MachineFunction *MF = I->getParent()->getParent(); - const ARM64TargetMachine *TM = - static_cast(&MF->getTarget()); - const ARM64InstrInfo *TII = TM->getInstrInfo(); - - // Insert a Copy from TLSBaseAddrReg to x0, which is where the rest of the - // code sequence assumes the address will be. - MachineInstr *Copy = - BuildMI(*I->getParent(), I, I->getDebugLoc(), - TII->get(TargetOpcode::COPY), ARM64::X0).addReg(TLSBaseAddrReg); - - // Erase the TLS_base_addr instruction. - I->eraseFromParent(); - - return Copy; - } - - // Create a virtal register in *TLSBaseAddrReg, and populate it by - // inserting a copy instruction after I. Returns the new instruction. - MachineInstr *setRegister(MachineInstr *I, unsigned *TLSBaseAddrReg) { - MachineFunction *MF = I->getParent()->getParent(); - const ARM64TargetMachine *TM = - static_cast(&MF->getTarget()); - const ARM64InstrInfo *TII = TM->getInstrInfo(); - - // Create a virtual register for the TLS base address. - MachineRegisterInfo &RegInfo = MF->getRegInfo(); - *TLSBaseAddrReg = RegInfo.createVirtualRegister(&ARM64::GPR64RegClass); - - // Insert a copy from X0 to TLSBaseAddrReg for later. 
- MachineInstr *Next = I->getNextNode(); - MachineInstr *Copy = BuildMI(*I->getParent(), Next, I->getDebugLoc(), - TII->get(TargetOpcode::COPY), - *TLSBaseAddrReg).addReg(ARM64::X0); - - return Copy; - } - - const char *getPassName() const override { - return "Local Dynamic TLS Access Clean-up"; - } - - void getAnalysisUsage(AnalysisUsage &AU) const override { - AU.setPreservesCFG(); - AU.addRequired(); - MachineFunctionPass::getAnalysisUsage(AU); - } -}; -} - -char LDTLSCleanup::ID = 0; -FunctionPass *llvm::createARM64CleanupLocalDynamicTLSPass() { - return new LDTLSCleanup(); -} diff --git a/lib/Target/ARM64/ARM64CollectLOH.cpp b/lib/Target/ARM64/ARM64CollectLOH.cpp deleted file mode 100644 index 8b48f3ae9b2..00000000000 --- a/lib/Target/ARM64/ARM64CollectLOH.cpp +++ /dev/null @@ -1,1117 +0,0 @@ -//===-------------- ARM64CollectLOH.cpp - ARM64 collect LOH pass --*- C++ -*-=// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -// This file contains a pass that collect the Linker Optimization Hint (LOH). -// This pass should be run at the very end of the compilation flow, just before -// assembly printer. -// To be useful for the linker, the LOH must be printed into the assembly file. -// -// A LOH describes a sequence of instructions that may be optimized by the -// linker. -// This same sequence cannot be optimized by the compiler because some of -// the information will be known at link time. -// For instance, consider the following sequence: -// L1: adrp xA, sym@PAGE -// L2: add xB, xA, sym@PAGEOFF -// L3: ldr xC, [xB, #imm] -// This sequence can be turned into: -// A literal load if sym@PAGE + sym@PAGEOFF + #imm - address(L3) is < 1MB: -// L3: ldr xC, sym+#imm -// It may also be turned into either the following more efficient -// code sequences: -// - If sym@PAGEOFF + #imm fits the encoding space of L3. -// L1: adrp xA, sym@PAGE -// L3: ldr xC, [xB, sym@PAGEOFF + #imm] -// - If sym@PAGE + sym@PAGEOFF - address(L1) < 1MB: -// L1: adr xA, sym -// L3: ldr xC, [xB, #imm] -// -// To be valid a LOH must meet all the requirements needed by all the related -// possible linker transformations. -// For instance, using the running example, the constraints to emit -// ".loh AdrpAddLdr" are: -// - L1, L2, and L3 instructions are of the expected type, i.e., -// respectively ADRP, ADD (immediate), and LD. -// - The result of L1 is used only by L2. -// - The register argument (xA) used in the ADD instruction is defined -// only by L1. -// - The result of L2 is used only by L3. -// - The base address (xB) in L3 is defined only L2. 
-// - The ADRP in L1 and the ADD in L2 must reference the same symbol using -// @PAGE/@PAGEOFF with no additional constants -// -// Currently supported LOHs are: -// * So called non-ADRP-related: -// - .loh AdrpAddLdr L1, L2, L3: -// L1: adrp xA, sym@PAGE -// L2: add xB, xA, sym@PAGEOFF -// L3: ldr xC, [xB, #imm] -// - .loh AdrpLdrGotLdr L1, L2, L3: -// L1: adrp xA, sym@GOTPAGE -// L2: ldr xB, [xA, sym@GOTPAGEOFF] -// L3: ldr xC, [xB, #imm] -// - .loh AdrpLdr L1, L3: -// L1: adrp xA, sym@PAGE -// L3: ldr xC, [xA, sym@PAGEOFF] -// - .loh AdrpAddStr L1, L2, L3: -// L1: adrp xA, sym@PAGE -// L2: add xB, xA, sym@PAGEOFF -// L3: str xC, [xB, #imm] -// - .loh AdrpLdrGotStr L1, L2, L3: -// L1: adrp xA, sym@GOTPAGE -// L2: ldr xB, [xA, sym@GOTPAGEOFF] -// L3: str xC, [xB, #imm] -// - .loh AdrpAdd L1, L2: -// L1: adrp xA, sym@PAGE -// L2: add xB, xA, sym@PAGEOFF -// For all these LOHs, L1, L2, L3 form a simple chain: -// L1 result is used only by L2 and L2 result by L3. -// L3 LOH-related argument is defined only by L2 and L2 LOH-related argument -// by L1. -// All these LOHs aim at using more efficient load/store patterns by folding -// some instructions used to compute the address directly into the load/store. -// -// * So called ADRP-related: -// - .loh AdrpAdrp L2, L1: -// L2: ADRP xA, sym1@PAGE -// L1: ADRP xA, sym2@PAGE -// L2 dominates L1 and xA is not redifined between L2 and L1 -// This LOH aims at getting rid of redundant ADRP instructions. -// -// The overall design for emitting the LOHs is: -// 1. ARM64CollectLOH (this pass) records the LOHs in the ARM64FunctionInfo. -// 2. ARM64AsmPrinter reads the LOHs from ARM64FunctionInfo and it: -// 1. Associates them a label. -// 2. Emits them in a MCStreamer (EmitLOHDirective). -// - The MCMachOStreamer records them into the MCAssembler. -// - The MCAsmStreamer prints them. -// - Other MCStreamers ignore them. -// 3. Closes the MCStreamer: -// - The MachObjectWriter gets them from the MCAssembler and writes -// them in the object file. -// - Other ObjectWriters ignore them. 
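The transformations these hints enable rest on the @PAGE/@PAGEOFF split: ADRP materializes the 4 KiB page of a symbol and ADD (or the load's immediate) supplies the low 12 bits, and the pair can be folded to a single ADR when the target is within about 1 MB of the instruction, as noted above. A standalone sketch of that address arithmetic, not part of the patch; the sample addresses are invented for illustration:

#include <cassert>
#include <cstdint>
#include <cstdio>

// sym@PAGE is the 4 KiB page base, sym@PAGEOFF the low 12 bits.
static uint64_t page(uint64_t Addr) { return Addr & ~UINT64_C(0xfff); }
static uint64_t pageOff(uint64_t Addr) { return Addr & UINT64_C(0xfff); }

// An ADR can replace an ADRP+ADD pair only if the target lies within +/-1 MiB
// of the ADR itself, one of the reachability constraints mentioned above.
static bool adrReachable(uint64_t From, uint64_t Target) {
  int64_t Delta = (int64_t)(Target - From);
  return Delta >= -(1 << 20) && Delta < (1 << 20);
}

int main() {
  uint64_t Sym = 0x100004a50, Adrp = 0x100001000;
  assert(page(Sym) + pageOff(Sym) == Sym); // ADRP+ADD rebuilds the full address
  std::printf("page=0x%llx off=0x%llx adr-ok=%d\n",
              (unsigned long long)page(Sym), (unsigned long long)pageOff(Sym),
              adrReachable(Adrp, Sym));
  return 0;
}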
-//===----------------------------------------------------------------------===// - -#include "ARM64.h" -#include "ARM64InstrInfo.h" -#include "ARM64MachineFunctionInfo.h" -#include "MCTargetDesc/ARM64AddressingModes.h" -#include "llvm/ADT/BitVector.h" -#include "llvm/ADT/DenseMap.h" -#include "llvm/ADT/MapVector.h" -#include "llvm/ADT/SetVector.h" -#include "llvm/ADT/SmallVector.h" -#include "llvm/CodeGen/MachineBasicBlock.h" -#include "llvm/CodeGen/MachineDominators.h" -#include "llvm/CodeGen/MachineFunctionPass.h" -#include "llvm/CodeGen/MachineInstr.h" -#include "llvm/CodeGen/MachineInstrBuilder.h" -#include "llvm/Target/TargetInstrInfo.h" -#include "llvm/Target/TargetMachine.h" -#include "llvm/Target/TargetRegisterInfo.h" -#include "llvm/Support/CommandLine.h" -#include "llvm/Support/Debug.h" -#include "llvm/Support/ErrorHandling.h" -#include "llvm/Support/raw_ostream.h" -#include "llvm/ADT/Statistic.h" -using namespace llvm; - -#define DEBUG_TYPE "arm64-collect-loh" - -static cl::opt -PreCollectRegister("arm64-collect-loh-pre-collect-register", cl::Hidden, - cl::desc("Restrict analysis to registers invovled" - " in LOHs"), - cl::init(true)); - -static cl::opt -BasicBlockScopeOnly("arm64-collect-loh-bb-only", cl::Hidden, - cl::desc("Restrict analysis at basic block scope"), - cl::init(true)); - -STATISTIC(NumADRPSimpleCandidate, - "Number of simplifiable ADRP dominate by another"); -STATISTIC(NumADRPComplexCandidate2, - "Number of simplifiable ADRP reachable by 2 defs"); -STATISTIC(NumADRPComplexCandidate3, - "Number of simplifiable ADRP reachable by 3 defs"); -STATISTIC(NumADRPComplexCandidateOther, - "Number of simplifiable ADRP reachable by 4 or more defs"); -STATISTIC(NumADDToSTRWithImm, - "Number of simplifiable STR with imm reachable by ADD"); -STATISTIC(NumLDRToSTRWithImm, - "Number of simplifiable STR with imm reachable by LDR"); -STATISTIC(NumADDToSTR, "Number of simplifiable STR reachable by ADD"); -STATISTIC(NumLDRToSTR, "Number of simplifiable STR reachable by LDR"); -STATISTIC(NumADDToLDRWithImm, - "Number of simplifiable LDR with imm reachable by ADD"); -STATISTIC(NumLDRToLDRWithImm, - "Number of simplifiable LDR with imm reachable by LDR"); -STATISTIC(NumADDToLDR, "Number of simplifiable LDR reachable by ADD"); -STATISTIC(NumLDRToLDR, "Number of simplifiable LDR reachable by LDR"); -STATISTIC(NumADRPToLDR, "Number of simplifiable LDR reachable by ADRP"); -STATISTIC(NumCplxLvl1, "Number of complex case of level 1"); -STATISTIC(NumTooCplxLvl1, "Number of too complex case of level 1"); -STATISTIC(NumCplxLvl2, "Number of complex case of level 2"); -STATISTIC(NumTooCplxLvl2, "Number of too complex case of level 2"); -STATISTIC(NumADRSimpleCandidate, "Number of simplifiable ADRP + ADD"); -STATISTIC(NumADRComplexCandidate, "Number of too complex ADRP + ADD"); - -namespace llvm { -void initializeARM64CollectLOHPass(PassRegistry &); -} - -namespace { -struct ARM64CollectLOH : public MachineFunctionPass { - static char ID; - ARM64CollectLOH() : MachineFunctionPass(ID) { - initializeARM64CollectLOHPass(*PassRegistry::getPassRegistry()); - } - - bool runOnMachineFunction(MachineFunction &MF) override; - - const char *getPassName() const override { - return "ARM64 Collect Linker Optimization Hint (LOH)"; - } - - void getAnalysisUsage(AnalysisUsage &AU) const override { - AU.setPreservesAll(); - MachineFunctionPass::getAnalysisUsage(AU); - AU.addRequired(); - } - -private: -}; - -/// A set of MachineInstruction. 
-typedef SetVector SetOfMachineInstr; -/// Map a basic block to a set of instructions per register. -/// This is used to represent the exposed uses of a basic block -/// per register. -typedef MapVector -BlockToSetOfInstrsPerColor; -/// Map a basic block to an instruction per register. -/// This is used to represent the live-out definitions of a basic block -/// per register. -typedef MapVector -BlockToInstrPerColor; -/// Map an instruction to a set of instructions. Used to represent the -/// mapping def to reachable uses or use to definitions. -typedef MapVector InstrToInstrs; -/// Map a basic block to a BitVector. -/// This is used to record the kill registers per basic block. -typedef MapVector BlockToRegSet; - -/// Map a register to a dense id. -typedef DenseMap MapRegToId; -/// Map a dense id to a register. Used for debug purposes. -typedef SmallVector MapIdToReg; -} // end anonymous namespace. - -char ARM64CollectLOH::ID = 0; - -INITIALIZE_PASS_BEGIN(ARM64CollectLOH, "arm64-collect-loh", - "ARM64 Collect Linker Optimization Hint (LOH)", false, - false) -INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree) -INITIALIZE_PASS_END(ARM64CollectLOH, "arm64-collect-loh", - "ARM64 Collect Linker Optimization Hint (LOH)", false, - false) - -/// Given a couple (MBB, reg) get the corresponding set of instruction from -/// the given "sets". -/// If this couple does not reference any set, an empty set is added to "sets" -/// for this couple and returned. -/// \param nbRegs is used internally allocate some memory. It must be consistent -/// with the way sets is used. -static SetOfMachineInstr &getSet(BlockToSetOfInstrsPerColor &sets, - const MachineBasicBlock &MBB, unsigned reg, - unsigned nbRegs) { - SetOfMachineInstr *result; - BlockToSetOfInstrsPerColor::iterator it = sets.find(&MBB); - if (it != sets.end()) - result = it->second; - else - result = sets[&MBB] = new SetOfMachineInstr[nbRegs]; - - return result[reg]; -} - -/// Given a couple (reg, MI) get the corresponding set of instructions from the -/// the given "sets". -/// This is used to get the uses record in sets of a definition identified by -/// MI and reg, i.e., MI defines reg. -/// If the couple does not reference anything, an empty set is added to -/// "sets[reg]". -/// \pre set[reg] is valid. -static SetOfMachineInstr &getUses(InstrToInstrs *sets, unsigned reg, - const MachineInstr &MI) { - return sets[reg][&MI]; -} - -/// Same as getUses but does not modify the input map: sets. -/// \return NULL if the couple (reg, MI) is not in sets. -static const SetOfMachineInstr *getUses(const InstrToInstrs *sets, unsigned reg, - const MachineInstr &MI) { - InstrToInstrs::const_iterator Res = sets[reg].find(&MI); - if (Res != sets[reg].end()) - return &(Res->second); - return nullptr; -} - -/// Initialize the reaching definition algorithm: -/// For each basic block BB in MF, record: -/// - its kill set. -/// - its reachable uses (uses that are exposed to BB's predecessors). -/// - its the generated definitions. -/// \param DummyOp if not NULL, specifies a Dummy Operation to be added to -/// the list of uses of exposed defintions. -/// \param ADRPMode specifies to only consider ADRP instructions for generated -/// definition. It also consider definitions of ADRP instructions as uses and -/// ignore other uses. The ADRPMode is used to collect the information for LHO -/// that involve ADRP operation only. 
-static void initReachingDef(MachineFunction &MF, - InstrToInstrs *ColorOpToReachedUses, - BlockToInstrPerColor &Gen, BlockToRegSet &Kill, - BlockToSetOfInstrsPerColor &ReachableUses, - const MapRegToId &RegToId, - const MachineInstr *DummyOp, bool ADRPMode) { - const TargetMachine &TM = MF.getTarget(); - const TargetRegisterInfo *TRI = TM.getRegisterInfo(); - - unsigned NbReg = RegToId.size(); - - for (MachineBasicBlock &MBB : MF) { - const MachineInstr **&BBGen = Gen[&MBB]; - BBGen = new const MachineInstr *[NbReg]; - memset(BBGen, 0, sizeof(const MachineInstr *) * NbReg); - - BitVector &BBKillSet = Kill[&MBB]; - BBKillSet.resize(NbReg); - for (const MachineInstr &MI : MBB) { - bool IsADRP = MI.getOpcode() == ARM64::ADRP; - - // Process uses first. - if (IsADRP || !ADRPMode) - for (const MachineOperand &MO : MI.operands()) { - // Treat ADRP def as use, as the goal of the analysis is to find - // ADRP defs reached by other ADRP defs. - if (!MO.isReg() || (!ADRPMode && !MO.isUse()) || - (ADRPMode && (!IsADRP || !MO.isDef()))) - continue; - unsigned CurReg = MO.getReg(); - MapRegToId::const_iterator ItCurRegId = RegToId.find(CurReg); - if (ItCurRegId == RegToId.end()) - continue; - CurReg = ItCurRegId->second; - - // if CurReg has not been defined, this use is reachable. - if (!BBGen[CurReg] && !BBKillSet.test(CurReg)) - getSet(ReachableUses, MBB, CurReg, NbReg).insert(&MI); - // current basic block definition for this color, if any, is in Gen. - if (BBGen[CurReg]) - getUses(ColorOpToReachedUses, CurReg, *BBGen[CurReg]).insert(&MI); - } - - // Process clobbers. - for (const MachineOperand &MO : MI.operands()) { - if (!MO.isRegMask()) - continue; - // Clobbers kill the related colors. - const uint32_t *PreservedRegs = MO.getRegMask(); - - // Set generated regs. - for (const auto Entry : RegToId) { - unsigned Reg = Entry.second; - // Use the global register ID when querying APIs external to this - // pass. - if (MachineOperand::clobbersPhysReg(PreservedRegs, Entry.first)) { - // Do not register clobbered definition for no ADRP. - // This definition is not used anyway (otherwise register - // allocation is wrong). - BBGen[Reg] = ADRPMode ? &MI : nullptr; - BBKillSet.set(Reg); - } - } - } - - // Process register defs. - for (const MachineOperand &MO : MI.operands()) { - if (!MO.isReg() || !MO.isDef()) - continue; - unsigned CurReg = MO.getReg(); - MapRegToId::const_iterator ItCurRegId = RegToId.find(CurReg); - if (ItCurRegId == RegToId.end()) - continue; - - for (MCRegAliasIterator AI(CurReg, TRI, true); AI.isValid(); ++AI) { - MapRegToId::const_iterator ItRegId = RegToId.find(*AI); - assert(ItRegId != RegToId.end() && - "Sub-register of an " - "involved register, not recorded as involved!"); - BBKillSet.set(ItRegId->second); - BBGen[ItRegId->second] = &MI; - } - BBGen[ItCurRegId->second] = &MI; - } - } - - // If we restrict our analysis to basic block scope, conservatively add a - // dummy - // use for each generated value. 
- if (!ADRPMode && DummyOp && !MBB.succ_empty()) - for (unsigned CurReg = 0; CurReg < NbReg; ++CurReg) - if (BBGen[CurReg]) - getUses(ColorOpToReachedUses, CurReg, *BBGen[CurReg]).insert(DummyOp); - } -} - -/// Reaching def core algorithm: -/// while an Out has changed -/// for each bb -/// for each color -/// In[bb][color] = U Out[bb.predecessors][color] -/// insert reachableUses[bb][color] in each in[bb][color] -/// op.reachedUses -/// -/// Out[bb] = Gen[bb] U (In[bb] - Kill[bb]) -static void reachingDefAlgorithm(MachineFunction &MF, - InstrToInstrs *ColorOpToReachedUses, - BlockToSetOfInstrsPerColor &In, - BlockToSetOfInstrsPerColor &Out, - BlockToInstrPerColor &Gen, BlockToRegSet &Kill, - BlockToSetOfInstrsPerColor &ReachableUses, - unsigned NbReg) { - bool HasChanged; - do { - HasChanged = false; - for (MachineBasicBlock &MBB : MF) { - unsigned CurReg; - for (CurReg = 0; CurReg < NbReg; ++CurReg) { - SetOfMachineInstr &BBInSet = getSet(In, MBB, CurReg, NbReg); - SetOfMachineInstr &BBReachableUses = - getSet(ReachableUses, MBB, CurReg, NbReg); - SetOfMachineInstr &BBOutSet = getSet(Out, MBB, CurReg, NbReg); - unsigned Size = BBOutSet.size(); - // In[bb][color] = U Out[bb.predecessors][color] - for (MachineBasicBlock *PredMBB : MBB.predecessors()) { - SetOfMachineInstr &PredOutSet = getSet(Out, *PredMBB, CurReg, NbReg); - BBInSet.insert(PredOutSet.begin(), PredOutSet.end()); - } - // insert reachableUses[bb][color] in each in[bb][color] op.reachedses - for (const MachineInstr *MI : BBInSet) { - SetOfMachineInstr &OpReachedUses = - getUses(ColorOpToReachedUses, CurReg, *MI); - OpReachedUses.insert(BBReachableUses.begin(), BBReachableUses.end()); - } - // Out[bb] = Gen[bb] U (In[bb] - Kill[bb]) - if (!Kill[&MBB].test(CurReg)) - BBOutSet.insert(BBInSet.begin(), BBInSet.end()); - if (Gen[&MBB][CurReg]) - BBOutSet.insert(Gen[&MBB][CurReg]); - HasChanged |= BBOutSet.size() != Size; - } - } - } while (HasChanged); -} - -/// Release all memory dynamically allocated during the reaching -/// definition algorithm. -static void finitReachingDef(BlockToSetOfInstrsPerColor &In, - BlockToSetOfInstrsPerColor &Out, - BlockToInstrPerColor &Gen, - BlockToSetOfInstrsPerColor &ReachableUses) { - for (auto &IT : Out) - delete[] IT.second; - for (auto &IT : In) - delete[] IT.second; - for (auto &IT : ReachableUses) - delete[] IT.second; - for (auto &IT : Gen) - delete[] IT.second; -} - -/// Reaching definition algorithm. -/// \param MF function on which the algorithm will operate. -/// \param[out] ColorOpToReachedUses will contain the result of the reaching -/// def algorithm. -/// \param ADRPMode specify whether the reaching def algorithm should be tuned -/// for ADRP optimization. \see initReachingDef for more details. -/// \param DummyOp if not NULL, the algorithm will work at -/// basic block scope and will set for every exposed definition a use to -/// @p DummyOp. -/// \pre ColorOpToReachedUses is an array of at least number of registers of -/// InstrToInstrs. -static void reachingDef(MachineFunction &MF, - InstrToInstrs *ColorOpToReachedUses, - const MapRegToId &RegToId, bool ADRPMode = false, - const MachineInstr *DummyOp = nullptr) { - // structures: - // For each basic block. - // Out: a set per color of definitions that reach the - // out boundary of this block. - // In: Same as Out but for in boundary. - // Gen: generated color in this block (one operation per color). - // Kill: register set of killed color in this block. 
- // ReachableUses: a set per color of uses (operation) reachable - // for "In" definitions. - BlockToSetOfInstrsPerColor Out, In, ReachableUses; - BlockToInstrPerColor Gen; - BlockToRegSet Kill; - - // Initialize Gen, kill and reachableUses. - initReachingDef(MF, ColorOpToReachedUses, Gen, Kill, ReachableUses, RegToId, - DummyOp, ADRPMode); - - // Algo. - if (!DummyOp) - reachingDefAlgorithm(MF, ColorOpToReachedUses, In, Out, Gen, Kill, - ReachableUses, RegToId.size()); - - // finit. - finitReachingDef(In, Out, Gen, ReachableUses); -} - -#ifndef NDEBUG -/// print the result of the reaching definition algorithm. -static void printReachingDef(const InstrToInstrs *ColorOpToReachedUses, - unsigned NbReg, const TargetRegisterInfo *TRI, - const MapIdToReg &IdToReg) { - unsigned CurReg; - for (CurReg = 0; CurReg < NbReg; ++CurReg) { - if (ColorOpToReachedUses[CurReg].empty()) - continue; - DEBUG(dbgs() << "*** Reg " << PrintReg(IdToReg[CurReg], TRI) << " ***\n"); - - for (const auto &DefsIt : ColorOpToReachedUses[CurReg]) { - DEBUG(dbgs() << "Def:\n"); - DEBUG(DefsIt.first->print(dbgs())); - DEBUG(dbgs() << "Reachable uses:\n"); - for (const MachineInstr *MI : DefsIt.second) { - DEBUG(MI->print(dbgs())); - } - } - } -} -#endif // NDEBUG - -/// Answer the following question: Can Def be one of the definition -/// involved in a part of a LOH? -static bool canDefBePartOfLOH(const MachineInstr *Def) { - unsigned Opc = Def->getOpcode(); - // Accept ADRP, ADDLow and LOADGot. - switch (Opc) { - default: - return false; - case ARM64::ADRP: - return true; - case ARM64::ADDXri: - // Check immediate to see if the immediate is an address. - switch (Def->getOperand(2).getType()) { - default: - return false; - case MachineOperand::MO_GlobalAddress: - case MachineOperand::MO_JumpTableIndex: - case MachineOperand::MO_ConstantPoolIndex: - case MachineOperand::MO_BlockAddress: - return true; - } - case ARM64::LDRXui: - // Check immediate to see if the immediate is an address. - switch (Def->getOperand(2).getType()) { - default: - return false; - case MachineOperand::MO_GlobalAddress: - return true; - } - } - // Unreachable. - return false; -} - -/// Check whether the given instruction can the end of a LOH chain involving a -/// store. -static bool isCandidateStore(const MachineInstr *Instr) { - switch (Instr->getOpcode()) { - default: - return false; - case ARM64::STRBui: - case ARM64::STRHui: - case ARM64::STRWui: - case ARM64::STRXui: - case ARM64::STRSui: - case ARM64::STRDui: - case ARM64::STRQui: - // In case we have str xA, [xA, #imm], this is two different uses - // of xA and we cannot fold, otherwise the xA stored may be wrong, - // even if #imm == 0. - if (Instr->getOperand(0).getReg() != Instr->getOperand(1).getReg()) - return true; - } - return false; -} - -/// Given the result of a reaching definition algorithm in ColorOpToReachedUses, -/// Build the Use to Defs information and filter out obvious non-LOH candidates. -/// In ADRPMode, non-LOH candidates are "uses" with non-ADRP definitions. -/// In non-ADRPMode, non-LOH candidates are "uses" with several definition, -/// i.e., no simple chain. -/// \param ADRPMode -- \see initReachingDef. 
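// For intuition, the shape of chain the filtering below tries to isolate
// (illustrative assembly; the register and symbol names are invented):
//
//     adrp x8, _sym@PAGE          ; ADRP                 -> L1
//     add  x8, x8, _sym@PAGEOFF   ; ADDXri on a global   -> L2
//     ldr  w0, [x8]               ; candidate load ending the chain
//
// When every link is reached by exactly one definition, the chain is later
// turned into a linker optimization hint (e.g. MCLOH_AdrpAddLdr) so the
// linker may relax the three-instruction sequence when the symbol turns out
// to be close enough; uses with several reaching definitions are discarded
// here as non-candidates.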
-static void reachedUsesToDefs(InstrToInstrs &UseToReachingDefs, - const InstrToInstrs *ColorOpToReachedUses, - const MapRegToId &RegToId, - bool ADRPMode = false) { - - SetOfMachineInstr NotCandidate; - unsigned NbReg = RegToId.size(); - MapRegToId::const_iterator EndIt = RegToId.end(); - for (unsigned CurReg = 0; CurReg < NbReg; ++CurReg) { - // If this color is never defined, continue. - if (ColorOpToReachedUses[CurReg].empty()) - continue; - - for (const auto &DefsIt : ColorOpToReachedUses[CurReg]) { - for (const MachineInstr *MI : DefsIt.second) { - const MachineInstr *Def = DefsIt.first; - MapRegToId::const_iterator It; - // if all the reaching defs are not adrp, this use will not be - // simplifiable. - if ((ADRPMode && Def->getOpcode() != ARM64::ADRP) || - (!ADRPMode && !canDefBePartOfLOH(Def)) || - (!ADRPMode && isCandidateStore(MI) && - // store are LOH candidate iff the end of the chain is used as - // base. - ((It = RegToId.find((MI)->getOperand(1).getReg())) == EndIt || - It->second != CurReg))) { - NotCandidate.insert(MI); - continue; - } - // Do not consider self reaching as a simplifiable case for ADRP. - if (!ADRPMode || MI != DefsIt.first) { - UseToReachingDefs[MI].insert(DefsIt.first); - // If UsesIt has several reaching definitions, it is not - // candidate for simplificaton in non-ADRPMode. - if (!ADRPMode && UseToReachingDefs[MI].size() > 1) - NotCandidate.insert(MI); - } - } - } - } - for (const MachineInstr *Elem : NotCandidate) { - DEBUG(dbgs() << "Too many reaching defs: " << *Elem << "\n"); - // It would have been better if we could just remove the entry - // from the map. Because of that, we have to filter the garbage - // (second.empty) in the subsequence analysis. - UseToReachingDefs[Elem].clear(); - } -} - -/// Based on the use to defs information (in ADRPMode), compute the -/// opportunities of LOH ADRP-related. -static void computeADRP(const InstrToInstrs &UseToDefs, - ARM64FunctionInfo &ARM64FI, - const MachineDominatorTree *MDT) { - DEBUG(dbgs() << "*** Compute LOH for ADRP\n"); - for (const auto &Entry : UseToDefs) { - unsigned Size = Entry.second.size(); - if (Size == 0) - continue; - if (Size == 1) { - const MachineInstr *L2 = *Entry.second.begin(); - const MachineInstr *L1 = Entry.first; - if (!MDT->dominates(L2, L1)) { - DEBUG(dbgs() << "Dominance check failed:\n" << *L2 << '\n' << *L1 - << '\n'); - continue; - } - DEBUG(dbgs() << "Record AdrpAdrp:\n" << *L2 << '\n' << *L1 << '\n'); - SmallVector Args; - Args.push_back(L2); - Args.push_back(L1); - ARM64FI.addLOHDirective(MCLOH_AdrpAdrp, Args); - ++NumADRPSimpleCandidate; - } -#ifdef DEBUG - else if (Size == 2) - ++NumADRPComplexCandidate2; - else if (Size == 3) - ++NumADRPComplexCandidate3; - else - ++NumADRPComplexCandidateOther; -#endif - // if Size < 1, the use should have been removed from the candidates - assert(Size >= 1 && "No reaching defs for that use!"); - } -} - -/// Check whether the given instruction can be the end of a LOH chain -/// involving a load. -static bool isCandidateLoad(const MachineInstr *Instr) { - switch (Instr->getOpcode()) { - default: - return false; - case ARM64::LDRSBWui: - case ARM64::LDRSBXui: - case ARM64::LDRSHWui: - case ARM64::LDRSHXui: - case ARM64::LDRSWui: - case ARM64::LDRBui: - case ARM64::LDRHui: - case ARM64::LDRWui: - case ARM64::LDRXui: - case ARM64::LDRSui: - case ARM64::LDRDui: - case ARM64::LDRQui: - if (Instr->getOperand(2).getTargetFlags() & ARM64II::MO_GOT) - return false; - return true; - } - // Unreachable. 
- return false; -} - -/// Check whether the given instruction can load a litteral. -static bool supportLoadFromLiteral(const MachineInstr *Instr) { - switch (Instr->getOpcode()) { - default: - return false; - case ARM64::LDRSWui: - case ARM64::LDRWui: - case ARM64::LDRXui: - case ARM64::LDRSui: - case ARM64::LDRDui: - case ARM64::LDRQui: - return true; - } - // Unreachable. - return false; -} - -/// Check whether the given instruction is a LOH candidate. -/// \param UseToDefs is used to check that Instr is at the end of LOH supported -/// chain. -/// \pre UseToDefs contains only on def per use, i.e., obvious non candidate are -/// already been filtered out. -static bool isCandidate(const MachineInstr *Instr, - const InstrToInstrs &UseToDefs, - const MachineDominatorTree *MDT) { - if (!isCandidateLoad(Instr) && !isCandidateStore(Instr)) - return false; - - const MachineInstr *Def = *UseToDefs.find(Instr)->second.begin(); - if (Def->getOpcode() != ARM64::ADRP) { - // At this point, Def is ADDXri or LDRXui of the right type of - // symbol, because we filtered out the uses that were not defined - // by these kind of instructions (+ ADRP). - - // Check if this forms a simple chain: each intermediate node must - // dominates the next one. - if (!MDT->dominates(Def, Instr)) - return false; - // Move one node up in the simple chain. - if (UseToDefs.find(Def) == - UseToDefs.end() - // The map may contain garbage we have to ignore. - || - UseToDefs.find(Def)->second.empty()) - return false; - Instr = Def; - Def = *UseToDefs.find(Def)->second.begin(); - } - // Check if we reached the top of the simple chain: - // - top is ADRP. - // - check the simple chain property: each intermediate node must - // dominates the next one. - if (Def->getOpcode() == ARM64::ADRP) - return MDT->dominates(Def, Instr); - return false; -} - -static bool registerADRCandidate(const MachineInstr &Use, - const InstrToInstrs &UseToDefs, - const InstrToInstrs *DefsPerColorToUses, - ARM64FunctionInfo &ARM64FI, - SetOfMachineInstr *InvolvedInLOHs, - const MapRegToId &RegToId) { - // Look for opportunities to turn ADRP -> ADD or - // ADRP -> LDR GOTPAGEOFF into ADR. - // If ADRP has more than one use. Give up. - if (Use.getOpcode() != ARM64::ADDXri && - (Use.getOpcode() != ARM64::LDRXui || - !(Use.getOperand(2).getTargetFlags() & ARM64II::MO_GOT))) - return false; - InstrToInstrs::const_iterator It = UseToDefs.find(&Use); - // The map may contain garbage that we need to ignore. - if (It == UseToDefs.end() || It->second.empty()) - return false; - const MachineInstr &Def = **It->second.begin(); - if (Def.getOpcode() != ARM64::ADRP) - return false; - // Check the number of users of ADRP. - const SetOfMachineInstr *Users = - getUses(DefsPerColorToUses, - RegToId.find(Def.getOperand(0).getReg())->second, Def); - if (Users->size() > 1) { - ++NumADRComplexCandidate; - return false; - } - ++NumADRSimpleCandidate; - assert((!InvolvedInLOHs || InvolvedInLOHs->insert(&Def)) && - "ADRP already involved in LOH."); - assert((!InvolvedInLOHs || InvolvedInLOHs->insert(&Use)) && - "ADD already involved in LOH."); - DEBUG(dbgs() << "Record AdrpAdd\n" << Def << '\n' << Use << '\n'); - - SmallVector Args; - Args.push_back(&Def); - Args.push_back(&Use); - - ARM64FI.addLOHDirective(Use.getOpcode() == ARM64::ADDXri ? 
MCLOH_AdrpAdd - : MCLOH_AdrpLdrGot, - Args); - return true; -} - -/// Based on the use to defs information (in non-ADRPMode), compute the -/// opportunities of LOH non-ADRP-related -static void computeOthers(const InstrToInstrs &UseToDefs, - const InstrToInstrs *DefsPerColorToUses, - ARM64FunctionInfo &ARM64FI, const MapRegToId &RegToId, - const MachineDominatorTree *MDT) { - SetOfMachineInstr *InvolvedInLOHs = nullptr; -#ifdef DEBUG - SetOfMachineInstr InvolvedInLOHsStorage; - InvolvedInLOHs = &InvolvedInLOHsStorage; -#endif // DEBUG - DEBUG(dbgs() << "*** Compute LOH for Others\n"); - // ADRP -> ADD/LDR -> LDR/STR pattern. - // Fall back to ADRP -> ADD pattern if we fail to catch the bigger pattern. - - // FIXME: When the statistics are not important, - // This initial filtering loop can be merged into the next loop. - // Currently, we didn't do it to have the same code for both DEBUG and - // NDEBUG builds. Indeed, the iterator of the second loop would need - // to be changed. - SetOfMachineInstr PotentialCandidates; - SetOfMachineInstr PotentialADROpportunities; - for (auto &Use : UseToDefs) { - // If no definition is available, this is a non candidate. - if (Use.second.empty()) - continue; - // Keep only instructions that are load or store and at the end of - // a ADRP -> ADD/LDR/Nothing chain. - // We already filtered out the no-chain cases. - if (!isCandidate(Use.first, UseToDefs, MDT)) { - PotentialADROpportunities.insert(Use.first); - continue; - } - PotentialCandidates.insert(Use.first); - } - - // Make the following distinctions for statistics as the linker does - // know how to decode instructions: - // - ADD/LDR/Nothing make there different patterns. - // - LDR/STR make two different patterns. - // Hence, 6 - 1 base patterns. - // (because ADRP-> Nothing -> STR is not simplifiable) - - // The linker is only able to have a simple semantic, i.e., if pattern A - // do B. - // However, we want to see the opportunity we may miss if we were able to - // catch more complex cases. - - // PotentialCandidates are result of a chain ADRP -> ADD/LDR -> - // A potential candidate becomes a candidate, if its current immediate - // operand is zero and all nodes of the chain have respectively only one user -#ifdef DEBUG - SetOfMachineInstr DefsOfPotentialCandidates; -#endif - for (const MachineInstr *Candidate : PotentialCandidates) { - // Get the definition of the candidate i.e., ADD or LDR. - const MachineInstr *Def = *UseToDefs.find(Candidate)->second.begin(); - // Record the elements of the chain. - const MachineInstr *L1 = Def; - const MachineInstr *L2 = nullptr; - unsigned ImmediateDefOpc = Def->getOpcode(); - if (Def->getOpcode() != ARM64::ADRP) { - // Check the number of users of this node. - const SetOfMachineInstr *Users = - getUses(DefsPerColorToUses, - RegToId.find(Def->getOperand(0).getReg())->second, *Def); - if (Users->size() > 1) { -#ifdef DEBUG - // if all the uses of this def are in potential candidate, this is - // a complex candidate of level 2. - bool IsLevel2 = true; - for (const MachineInstr *MI : *Users) { - if (!PotentialCandidates.count(MI)) { - ++NumTooCplxLvl2; - IsLevel2 = false; - break; - } - } - if (IsLevel2) - ++NumCplxLvl2; -#endif // DEBUG - PotentialADROpportunities.insert(Def); - continue; - } - L2 = Def; - Def = *UseToDefs.find(Def)->second.begin(); - L1 = Def; - } // else the element in the middle of the chain is nothing, thus - // Def already contains the first element of the chain. 
- - // Check the number of users of the first node in the chain, i.e., ADRP - const SetOfMachineInstr *Users = - getUses(DefsPerColorToUses, - RegToId.find(Def->getOperand(0).getReg())->second, *Def); - if (Users->size() > 1) { -#ifdef DEBUG - // if all the uses of this def are in the defs of the potential candidate, - // this is a complex candidate of level 1 - if (DefsOfPotentialCandidates.empty()) { - // lazy init - DefsOfPotentialCandidates = PotentialCandidates; - for (const MachineInstr *Candidate : PotentialCandidates) { - if (!UseToDefs.find(Candidate)->second.empty()) - DefsOfPotentialCandidates.insert( - *UseToDefs.find(Candidate)->second.begin()); - } - } - bool Found = false; - for (auto &Use : *Users) { - if (!DefsOfPotentialCandidates.count(Use)) { - ++NumTooCplxLvl1; - Found = true; - break; - } - } - if (!Found) - ++NumCplxLvl1; -#endif // DEBUG - continue; - } - - bool IsL2Add = (ImmediateDefOpc == ARM64::ADDXri); - // If the chain is three instructions long and ldr is the second element, - // then this ldr must load form GOT, otherwise this is not a correct chain. - if (L2 && !IsL2Add && L2->getOperand(2).getTargetFlags() != ARM64II::MO_GOT) - continue; - SmallVector Args; - MCLOHType Kind; - if (isCandidateLoad(Candidate)) { - if (!L2) { - // At this point, the candidate LOH indicates that the ldr instruction - // may use a direct access to the symbol. There is not such encoding - // for loads of byte and half. - if (!supportLoadFromLiteral(Candidate)) - continue; - - DEBUG(dbgs() << "Record AdrpLdr:\n" << *L1 << '\n' << *Candidate - << '\n'); - Kind = MCLOH_AdrpLdr; - Args.push_back(L1); - Args.push_back(Candidate); - assert((!InvolvedInLOHs || InvolvedInLOHs->insert(L1)) && - "L1 already involved in LOH."); - assert((!InvolvedInLOHs || InvolvedInLOHs->insert(Candidate)) && - "Candidate already involved in LOH."); - ++NumADRPToLDR; - } else { - DEBUG(dbgs() << "Record Adrp" << (IsL2Add ? "Add" : "LdrGot") - << "Ldr:\n" << *L1 << '\n' << *L2 << '\n' << *Candidate - << '\n'); - - Kind = IsL2Add ? MCLOH_AdrpAddLdr : MCLOH_AdrpLdrGotLdr; - Args.push_back(L1); - Args.push_back(L2); - Args.push_back(Candidate); - - PotentialADROpportunities.remove(L2); - assert((!InvolvedInLOHs || InvolvedInLOHs->insert(L1)) && - "L1 already involved in LOH."); - assert((!InvolvedInLOHs || InvolvedInLOHs->insert(L2)) && - "L2 already involved in LOH."); - assert((!InvolvedInLOHs || InvolvedInLOHs->insert(Candidate)) && - "Candidate already involved in LOH."); -#ifdef DEBUG - // get the immediate of the load - if (Candidate->getOperand(2).getImm() == 0) - if (ImmediateDefOpc == ARM64::ADDXri) - ++NumADDToLDR; - else - ++NumLDRToLDR; - else if (ImmediateDefOpc == ARM64::ADDXri) - ++NumADDToLDRWithImm; - else - ++NumLDRToLDRWithImm; -#endif // DEBUG - } - } else { - if (ImmediateDefOpc == ARM64::ADRP) - continue; - else { - - DEBUG(dbgs() << "Record Adrp" << (IsL2Add ? "Add" : "LdrGot") - << "Str:\n" << *L1 << '\n' << *L2 << '\n' << *Candidate - << '\n'); - - Kind = IsL2Add ? 
MCLOH_AdrpAddStr : MCLOH_AdrpLdrGotStr; - Args.push_back(L1); - Args.push_back(L2); - Args.push_back(Candidate); - - PotentialADROpportunities.remove(L2); - assert((!InvolvedInLOHs || InvolvedInLOHs->insert(L1)) && - "L1 already involved in LOH."); - assert((!InvolvedInLOHs || InvolvedInLOHs->insert(L2)) && - "L2 already involved in LOH."); - assert((!InvolvedInLOHs || InvolvedInLOHs->insert(Candidate)) && - "Candidate already involved in LOH."); -#ifdef DEBUG - // get the immediate of the store - if (Candidate->getOperand(2).getImm() == 0) - if (ImmediateDefOpc == ARM64::ADDXri) - ++NumADDToSTR; - else - ++NumLDRToSTR; - else if (ImmediateDefOpc == ARM64::ADDXri) - ++NumADDToSTRWithImm; - else - ++NumLDRToSTRWithImm; -#endif // DEBUG - } - } - ARM64FI.addLOHDirective(Kind, Args); - } - - // Now, we grabbed all the big patterns, check ADR opportunities. - for (const MachineInstr *Candidate : PotentialADROpportunities) - registerADRCandidate(*Candidate, UseToDefs, DefsPerColorToUses, ARM64FI, - InvolvedInLOHs, RegToId); -} - -/// Look for every register defined by potential LOHs candidates. -/// Map these registers with dense id in @p RegToId and vice-versa in -/// @p IdToReg. @p IdToReg is populated only in DEBUG mode. -static void collectInvolvedReg(MachineFunction &MF, MapRegToId &RegToId, - MapIdToReg &IdToReg, - const TargetRegisterInfo *TRI) { - unsigned CurRegId = 0; - if (!PreCollectRegister) { - unsigned NbReg = TRI->getNumRegs(); - for (; CurRegId < NbReg; ++CurRegId) { - RegToId[CurRegId] = CurRegId; - DEBUG(IdToReg.push_back(CurRegId)); - DEBUG(assert(IdToReg[CurRegId] == CurRegId && "Reg index mismatches")); - } - return; - } - - DEBUG(dbgs() << "** Collect Involved Register\n"); - for (const auto &MBB : MF) { - for (const MachineInstr &MI : MBB) { - if (!canDefBePartOfLOH(&MI)) - continue; - - // Process defs - for (MachineInstr::const_mop_iterator IO = MI.operands_begin(), - IOEnd = MI.operands_end(); - IO != IOEnd; ++IO) { - if (!IO->isReg() || !IO->isDef()) - continue; - unsigned CurReg = IO->getReg(); - for (MCRegAliasIterator AI(CurReg, TRI, true); AI.isValid(); ++AI) - if (RegToId.find(*AI) == RegToId.end()) { - DEBUG(IdToReg.push_back(*AI); - assert(IdToReg[CurRegId] == *AI && - "Reg index mismatches insertion index.")); - RegToId[*AI] = CurRegId++; - DEBUG(dbgs() << "Register: " << PrintReg(*AI, TRI) << '\n'); - } - } - } - } -} - -bool ARM64CollectLOH::runOnMachineFunction(MachineFunction &MF) { - const TargetMachine &TM = MF.getTarget(); - const TargetRegisterInfo *TRI = TM.getRegisterInfo(); - const MachineDominatorTree *MDT = &getAnalysis(); - - MapRegToId RegToId; - MapIdToReg IdToReg; - ARM64FunctionInfo *ARM64FI = MF.getInfo(); - assert(ARM64FI && "No MachineFunctionInfo for this function!"); - - DEBUG(dbgs() << "Looking for LOH in " << MF.getName() << '\n'); - - collectInvolvedReg(MF, RegToId, IdToReg, TRI); - if (RegToId.empty()) - return false; - - MachineInstr *DummyOp = nullptr; - if (BasicBlockScopeOnly) { - const ARM64InstrInfo *TII = - static_cast(TM.getInstrInfo()); - // For local analysis, create a dummy operation to record uses that are not - // local. - DummyOp = MF.CreateMachineInstr(TII->get(ARM64::COPY), DebugLoc()); - } - - unsigned NbReg = RegToId.size(); - bool Modified = false; - - // Start with ADRP. - InstrToInstrs *ColorOpToReachedUses = new InstrToInstrs[NbReg]; - - // Compute the reaching def in ADRP mode, meaning ADRP definitions - // are first considered as uses. 
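// (Treating each ADRP definition as a "use" is what lets this first analysis
// chain ADRP instructions together: an ADRP whose only reaching definition is
// another, dominating ADRP becomes an MCLOH_AdrpAdrp pair in computeADRP(),
// and the linker may then fold the second page computation into the first
// when both resolve to the same page.)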
- reachingDef(MF, ColorOpToReachedUses, RegToId, true, DummyOp); - DEBUG(dbgs() << "ADRP reaching defs\n"); - DEBUG(printReachingDef(ColorOpToReachedUses, NbReg, TRI, IdToReg)); - - // Translate the definition to uses map into a use to definitions map to ease - // statistic computation. - InstrToInstrs ADRPToReachingDefs; - reachedUsesToDefs(ADRPToReachingDefs, ColorOpToReachedUses, RegToId, true); - - // Compute LOH for ADRP. - computeADRP(ADRPToReachingDefs, *ARM64FI, MDT); - delete[] ColorOpToReachedUses; - - // Continue with general ADRP -> ADD/LDR -> LDR/STR pattern. - ColorOpToReachedUses = new InstrToInstrs[NbReg]; - - // first perform a regular reaching def analysis. - reachingDef(MF, ColorOpToReachedUses, RegToId, false, DummyOp); - DEBUG(dbgs() << "All reaching defs\n"); - DEBUG(printReachingDef(ColorOpToReachedUses, NbReg, TRI, IdToReg)); - - // Turn that into a use to defs to ease statistic computation. - InstrToInstrs UsesToReachingDefs; - reachedUsesToDefs(UsesToReachingDefs, ColorOpToReachedUses, RegToId, false); - - // Compute other than AdrpAdrp LOH. - computeOthers(UsesToReachingDefs, ColorOpToReachedUses, *ARM64FI, RegToId, - MDT); - delete[] ColorOpToReachedUses; - - if (BasicBlockScopeOnly) - MF.DeleteMachineInstr(DummyOp); - - return Modified; -} - -/// createARM64CollectLOHPass - returns an instance of the Statistic for -/// linker optimization pass. -FunctionPass *llvm::createARM64CollectLOHPass() { - return new ARM64CollectLOH(); -} diff --git a/lib/Target/ARM64/ARM64ConditionalCompares.cpp b/lib/Target/ARM64/ARM64ConditionalCompares.cpp deleted file mode 100644 index 2243cce51a1..00000000000 --- a/lib/Target/ARM64/ARM64ConditionalCompares.cpp +++ /dev/null @@ -1,919 +0,0 @@ -//===-- ARM64ConditionalCompares.cpp --- CCMP formation for ARM64 ---------===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -// This file implements the ARM64ConditionalCompares pass which reduces -// branching and code size by using the conditional compare instructions CCMP, -// CCMN, and FCMP. -// -// The CFG transformations for forming conditional compares are very similar to -// if-conversion, and this pass should run immediately before the early -// if-conversion pass. -// -//===----------------------------------------------------------------------===// - -#include "ARM64.h" -#include "llvm/ADT/BitVector.h" -#include "llvm/ADT/DepthFirstIterator.h" -#include "llvm/ADT/SetVector.h" -#include "llvm/ADT/SmallPtrSet.h" -#include "llvm/ADT/SparseSet.h" -#include "llvm/ADT/Statistic.h" -#include "llvm/CodeGen/MachineBranchProbabilityInfo.h" -#include "llvm/CodeGen/MachineDominators.h" -#include "llvm/CodeGen/MachineFunction.h" -#include "llvm/CodeGen/MachineFunctionPass.h" -#include "llvm/CodeGen/MachineInstrBuilder.h" -#include "llvm/CodeGen/MachineLoopInfo.h" -#include "llvm/CodeGen/MachineRegisterInfo.h" -#include "llvm/CodeGen/MachineTraceMetrics.h" -#include "llvm/CodeGen/Passes.h" -#include "llvm/Support/CommandLine.h" -#include "llvm/Support/Debug.h" -#include "llvm/Support/raw_ostream.h" -#include "llvm/Target/TargetInstrInfo.h" -#include "llvm/Target/TargetRegisterInfo.h" -#include "llvm/Target/TargetSubtargetInfo.h" - -using namespace llvm; - -#define DEBUG_TYPE "arm64-ccmp" - -// Absolute maximum number of instructions allowed per speculated block. 
-// This bypasses all other heuristics, so it should be set fairly high. -static cl::opt BlockInstrLimit( - "arm64-ccmp-limit", cl::init(30), cl::Hidden, - cl::desc("Maximum number of instructions per speculated block.")); - -// Stress testing mode - disable heuristics. -static cl::opt Stress("arm64-stress-ccmp", cl::Hidden, - cl::desc("Turn all knobs to 11")); - -STATISTIC(NumConsidered, "Number of ccmps considered"); -STATISTIC(NumPhiRejs, "Number of ccmps rejected (PHI)"); -STATISTIC(NumPhysRejs, "Number of ccmps rejected (Physregs)"); -STATISTIC(NumPhi2Rejs, "Number of ccmps rejected (PHI2)"); -STATISTIC(NumHeadBranchRejs, "Number of ccmps rejected (Head branch)"); -STATISTIC(NumCmpBranchRejs, "Number of ccmps rejected (CmpBB branch)"); -STATISTIC(NumCmpTermRejs, "Number of ccmps rejected (CmpBB is cbz...)"); -STATISTIC(NumImmRangeRejs, "Number of ccmps rejected (Imm out of range)"); -STATISTIC(NumLiveDstRejs, "Number of ccmps rejected (Cmp dest live)"); -STATISTIC(NumMultNZCVUses, "Number of ccmps rejected (NZCV used)"); -STATISTIC(NumUnknNZCVDefs, "Number of ccmps rejected (NZCV def unknown)"); - -STATISTIC(NumSpeculateRejs, "Number of ccmps rejected (Can't speculate)"); - -STATISTIC(NumConverted, "Number of ccmp instructions created"); -STATISTIC(NumCompBranches, "Number of cbz/cbnz branches converted"); - -//===----------------------------------------------------------------------===// -// SSACCmpConv -//===----------------------------------------------------------------------===// -// -// The SSACCmpConv class performs ccmp-conversion on SSA form machine code -// after determining if it is possible. The class contains no heuristics; -// external code should be used to determine when ccmp-conversion is a good -// idea. -// -// CCmp-formation works on a CFG representing chained conditions, typically -// from C's short-circuit || and && operators: -// -// From: Head To: Head -// / | CmpBB -// / | / | -// | CmpBB / | -// | / | Tail | -// | / | | | -// Tail | | | -// | | | | -// ... ... ... ... -// -// The Head block is terminated by a br.cond instruction, and the CmpBB block -// contains compare + br.cond. Tail must be a successor of both. -// -// The cmp-conversion turns the compare instruction in CmpBB into a conditional -// compare, and merges CmpBB into Head, speculatively executing its -// instructions. The ARM64 conditional compare instructions have an immediate -// operand that specifies the NZCV flag values when the condition is false and -// the compare isn't executed. This makes it possible to chain compares with -// different condition codes. -// -// Example: -// -// if (a == 5 || b == 17) -// foo(); -// -// Head: -// cmp w0, #5 -// b.eq Tail -// CmpBB: -// cmp w1, #17 -// b.eq Tail -// ... -// Tail: -// bl _foo -// -// Becomes: -// -// Head: -// cmp w0, #5 -// ccmp w1, #17, 4, ne ; 4 = nZcv -// b.eq Tail -// ... -// Tail: -// bl _foo -// -// The ccmp condition code is the one that would cause the Head terminator to -// branch to CmpBB. -// -// FIXME: It should also be possible to speculate a block on the critical edge -// between Head and Tail, just like if-converting a diamond. -// -// FIXME: Handle PHIs in Tail by turning them into selects (if-conversion). - -namespace { -class SSACCmpConv { - MachineFunction *MF; - const TargetInstrInfo *TII; - const TargetRegisterInfo *TRI; - MachineRegisterInfo *MRI; - -public: - /// The first block containing a conditional branch, dominating everything - /// else. 
- MachineBasicBlock *Head; - - /// The block containing cmp+br.cond with a successor shared with Head. - MachineBasicBlock *CmpBB; - - /// The common successor for Head and CmpBB. - MachineBasicBlock *Tail; - - /// The compare instruction in CmpBB that can be converted to a ccmp. - MachineInstr *CmpMI; - -private: - /// The branch condition in Head as determined by AnalyzeBranch. - SmallVector HeadCond; - - /// The condition code that makes Head branch to CmpBB. - ARM64CC::CondCode HeadCmpBBCC; - - /// The branch condition in CmpBB. - SmallVector CmpBBCond; - - /// The condition code that makes CmpBB branch to Tail. - ARM64CC::CondCode CmpBBTailCC; - - /// Check if the Tail PHIs are trivially convertible. - bool trivialTailPHIs(); - - /// Remove CmpBB from the Tail PHIs. - void updateTailPHIs(); - - /// Check if an operand defining DstReg is dead. - bool isDeadDef(unsigned DstReg); - - /// Find the compare instruction in MBB that controls the conditional branch. - /// Return NULL if a convertible instruction can't be found. - MachineInstr *findConvertibleCompare(MachineBasicBlock *MBB); - - /// Return true if all non-terminator instructions in MBB can be safely - /// speculated. - bool canSpeculateInstrs(MachineBasicBlock *MBB, const MachineInstr *CmpMI); - -public: - /// runOnMachineFunction - Initialize per-function data structures. - void runOnMachineFunction(MachineFunction &MF) { - this->MF = &MF; - TII = MF.getTarget().getInstrInfo(); - TRI = MF.getTarget().getRegisterInfo(); - MRI = &MF.getRegInfo(); - } - - /// If the sub-CFG headed by MBB can be cmp-converted, initialize the - /// internal state, and return true. - bool canConvert(MachineBasicBlock *MBB); - - /// Cmo-convert the last block passed to canConvertCmp(), assuming - /// it is possible. Add any erased blocks to RemovedBlocks. - void convert(SmallVectorImpl &RemovedBlocks); - - /// Return the expected code size delta if the conversion into a - /// conditional compare is performed. - int expectedCodeSizeDelta() const; -}; -} // end anonymous namespace - -// Check that all PHIs in Tail are selecting the same value from Head and CmpBB. -// This means that no if-conversion is required when merging CmpBB into Head. -bool SSACCmpConv::trivialTailPHIs() { - for (auto &I : *Tail) { - if (!I.isPHI()) - break; - unsigned HeadReg = 0, CmpBBReg = 0; - // PHI operands come in (VReg, MBB) pairs. - for (unsigned oi = 1, oe = I.getNumOperands(); oi != oe; oi += 2) { - MachineBasicBlock *MBB = I.getOperand(oi + 1).getMBB(); - unsigned Reg = I.getOperand(oi).getReg(); - if (MBB == Head) { - assert((!HeadReg || HeadReg == Reg) && "Inconsistent PHI operands"); - HeadReg = Reg; - } - if (MBB == CmpBB) { - assert((!CmpBBReg || CmpBBReg == Reg) && "Inconsistent PHI operands"); - CmpBBReg = Reg; - } - } - if (HeadReg != CmpBBReg) - return false; - } - return true; -} - -// Assuming that trivialTailPHIs() is true, update the Tail PHIs by simply -// removing the CmpBB operands. The Head operands will be identical. -void SSACCmpConv::updateTailPHIs() { - for (auto &I : *Tail) { - if (!I.isPHI()) - break; - // I is a PHI. It can have multiple entries for CmpBB. - for (unsigned oi = I.getNumOperands(); oi > 2; oi -= 2) { - // PHI operands are (Reg, MBB) at (oi-2, oi-1). - if (I.getOperand(oi - 1).getMBB() == CmpBB) { - I.RemoveOperand(oi - 1); - I.RemoveOperand(oi - 2); - } - } - } -} - -// This pass runs before the ARM64DeadRegisterDefinitions pass, so compares are -// still writing virtual registers without any uses. 
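// A note on the branch-condition encoding consumed by parseCond() below (the
// layout is inferred from that code and from how AnalyzeBranch is used here,
// so treat the exact format as an assumption): a plain b.cond is described by
// a single operand carrying the condition code, e.g. Cond = { NE }, while the
// compare-and-branch forms arrive as three operands, e.g.
// Cond = { -1, CBNZW, Rn }; parseCond() maps CBZ*/CBNZ* to EQ/NE so that the
// branch can later be rewritten as a ccmp against #0 plus a b.cond.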
-bool SSACCmpConv::isDeadDef(unsigned DstReg) { - // Writes to the zero register are dead. - if (DstReg == ARM64::WZR || DstReg == ARM64::XZR) - return true; - if (!TargetRegisterInfo::isVirtualRegister(DstReg)) - return false; - // A virtual register def without any uses will be marked dead later, and - // eventually replaced by the zero register. - return MRI->use_nodbg_empty(DstReg); -} - -// Parse a condition code returned by AnalyzeBranch, and compute the CondCode -// corresponding to TBB. -// Return -static bool parseCond(ArrayRef Cond, ARM64CC::CondCode &CC) { - // A normal br.cond simply has the condition code. - if (Cond[0].getImm() != -1) { - assert(Cond.size() == 1 && "Unknown Cond array format"); - CC = (ARM64CC::CondCode)(int)Cond[0].getImm(); - return true; - } - // For tbz and cbz instruction, the opcode is next. - switch (Cond[1].getImm()) { - default: - // This includes tbz / tbnz branches which can't be converted to - // ccmp + br.cond. - return false; - case ARM64::CBZW: - case ARM64::CBZX: - assert(Cond.size() == 3 && "Unknown Cond array format"); - CC = ARM64CC::EQ; - return true; - case ARM64::CBNZW: - case ARM64::CBNZX: - assert(Cond.size() == 3 && "Unknown Cond array format"); - CC = ARM64CC::NE; - return true; - } -} - -MachineInstr *SSACCmpConv::findConvertibleCompare(MachineBasicBlock *MBB) { - MachineBasicBlock::iterator I = MBB->getFirstTerminator(); - if (I == MBB->end()) - return nullptr; - // The terminator must be controlled by the flags. - if (!I->readsRegister(ARM64::NZCV)) { - switch (I->getOpcode()) { - case ARM64::CBZW: - case ARM64::CBZX: - case ARM64::CBNZW: - case ARM64::CBNZX: - // These can be converted into a ccmp against #0. - return I; - } - ++NumCmpTermRejs; - DEBUG(dbgs() << "Flags not used by terminator: " << *I); - return nullptr; - } - - // Now find the instruction controlling the terminator. - for (MachineBasicBlock::iterator B = MBB->begin(); I != B;) { - --I; - assert(!I->isTerminator() && "Spurious terminator"); - switch (I->getOpcode()) { - // cmp is an alias for subs with a dead destination register. - case ARM64::SUBSWri: - case ARM64::SUBSXri: - // cmn is an alias for adds with a dead destination register. - case ARM64::ADDSWri: - case ARM64::ADDSXri: - // Check that the immediate operand is within range, ccmp wants a uimm5. - // Rd = SUBSri Rn, imm, shift - if (I->getOperand(3).getImm() || !isUInt<5>(I->getOperand(2).getImm())) { - DEBUG(dbgs() << "Immediate out of range for ccmp: " << *I); - ++NumImmRangeRejs; - return nullptr; - } - // Fall through. - case ARM64::SUBSWrr: - case ARM64::SUBSXrr: - case ARM64::ADDSWrr: - case ARM64::ADDSXrr: - if (isDeadDef(I->getOperand(0).getReg())) - return I; - DEBUG(dbgs() << "Can't convert compare with live destination: " << *I); - ++NumLiveDstRejs; - return nullptr; - case ARM64::FCMPSrr: - case ARM64::FCMPDrr: - case ARM64::FCMPESrr: - case ARM64::FCMPEDrr: - return I; - } - - // Check for flag reads and clobbers. - MIOperands::PhysRegInfo PRI = - MIOperands(I).analyzePhysReg(ARM64::NZCV, TRI); - - if (PRI.Reads) { - // The ccmp doesn't produce exactly the same flags as the original - // compare, so reject the transform if there are uses of the flags - // besides the terminators. 
- DEBUG(dbgs() << "Can't create ccmp with multiple uses: " << *I); - ++NumMultNZCVUses; - return nullptr; - } - - if (PRI.Clobbers) { - DEBUG(dbgs() << "Not convertible compare: " << *I); - ++NumUnknNZCVDefs; - return nullptr; - } - } - DEBUG(dbgs() << "Flags not defined in BB#" << MBB->getNumber() << '\n'); - return nullptr; -} - -/// Determine if all the instructions in MBB can safely -/// be speculated. The terminators are not considered. -/// -/// Only CmpMI is allowed to clobber the flags. -/// -bool SSACCmpConv::canSpeculateInstrs(MachineBasicBlock *MBB, - const MachineInstr *CmpMI) { - // Reject any live-in physregs. It's probably NZCV/EFLAGS, and very hard to - // get right. - if (!MBB->livein_empty()) { - DEBUG(dbgs() << "BB#" << MBB->getNumber() << " has live-ins.\n"); - return false; - } - - unsigned InstrCount = 0; - - // Check all instructions, except the terminators. It is assumed that - // terminators never have side effects or define any used register values. - for (auto &I : make_range(MBB->begin(), MBB->getFirstTerminator())) { - if (I.isDebugValue()) - continue; - - if (++InstrCount > BlockInstrLimit && !Stress) { - DEBUG(dbgs() << "BB#" << MBB->getNumber() << " has more than " - << BlockInstrLimit << " instructions.\n"); - return false; - } - - // There shouldn't normally be any phis in a single-predecessor block. - if (I.isPHI()) { - DEBUG(dbgs() << "Can't hoist: " << I); - return false; - } - - // Don't speculate loads. Note that it may be possible and desirable to - // speculate GOT or constant pool loads that are guaranteed not to trap, - // but we don't support that for now. - if (I.mayLoad()) { - DEBUG(dbgs() << "Won't speculate load: " << I); - return false; - } - - // We never speculate stores, so an AA pointer isn't necessary. - bool DontMoveAcrossStore = true; - if (!I.isSafeToMove(TII, nullptr, DontMoveAcrossStore)) { - DEBUG(dbgs() << "Can't speculate: " << I); - return false; - } - - // Only CmpMI is allowed to clobber the flags. - if (&I != CmpMI && I.modifiesRegister(ARM64::NZCV, TRI)) { - DEBUG(dbgs() << "Clobbers flags: " << I); - return false; - } - } - return true; -} - -/// Analyze the sub-cfg rooted in MBB, and return true if it is a potential -/// candidate for cmp-conversion. Fill out the internal state. -/// -bool SSACCmpConv::canConvert(MachineBasicBlock *MBB) { - Head = MBB; - Tail = CmpBB = nullptr; - - if (Head->succ_size() != 2) - return false; - MachineBasicBlock *Succ0 = Head->succ_begin()[0]; - MachineBasicBlock *Succ1 = Head->succ_begin()[1]; - - // CmpBB can only have a single predecessor. Tail is allowed many. - if (Succ0->pred_size() != 1) - std::swap(Succ0, Succ1); - - // Succ0 is our candidate for CmpBB. - if (Succ0->pred_size() != 1 || Succ0->succ_size() != 2) - return false; - - CmpBB = Succ0; - Tail = Succ1; - - if (!CmpBB->isSuccessor(Tail)) - return false; - - // The CFG topology checks out. - DEBUG(dbgs() << "\nTriangle: BB#" << Head->getNumber() << " -> BB#" - << CmpBB->getNumber() << " -> BB#" << Tail->getNumber() << '\n'); - ++NumConsidered; - - // Tail is allowed to have many predecessors, but we can't handle PHIs yet. - // - // FIXME: Real PHIs could be if-converted as long as the CmpBB values are - // defined before The CmpBB cmp clobbers the flags. Alternatively, it should - // always be safe to sink the ccmp down to immediately before the CmpBB - // terminators. 
- if (!trivialTailPHIs()) { - DEBUG(dbgs() << "Can't handle phis in Tail.\n"); - ++NumPhiRejs; - return false; - } - - if (!Tail->livein_empty()) { - DEBUG(dbgs() << "Can't handle live-in physregs in Tail.\n"); - ++NumPhysRejs; - return false; - } - - // CmpBB should never have PHIs since Head is its only predecessor. - // FIXME: Clean them up if it happens. - if (!CmpBB->empty() && CmpBB->front().isPHI()) { - DEBUG(dbgs() << "Can't handle phis in CmpBB.\n"); - ++NumPhi2Rejs; - return false; - } - - if (!CmpBB->livein_empty()) { - DEBUG(dbgs() << "Can't handle live-in physregs in CmpBB.\n"); - ++NumPhysRejs; - return false; - } - - // The branch we're looking to eliminate must be analyzable. - HeadCond.clear(); - MachineBasicBlock *TBB = nullptr, *FBB = nullptr; - if (TII->AnalyzeBranch(*Head, TBB, FBB, HeadCond)) { - DEBUG(dbgs() << "Head branch not analyzable.\n"); - ++NumHeadBranchRejs; - return false; - } - - // This is weird, probably some sort of degenerate CFG, or an edge to a - // landing pad. - if (!TBB || HeadCond.empty()) { - DEBUG(dbgs() << "AnalyzeBranch didn't find conditional branch in Head.\n"); - ++NumHeadBranchRejs; - return false; - } - - if (!parseCond(HeadCond, HeadCmpBBCC)) { - DEBUG(dbgs() << "Unsupported branch type on Head\n"); - ++NumHeadBranchRejs; - return false; - } - - // Make sure the branch direction is right. - if (TBB != CmpBB) { - assert(TBB == Tail && "Unexpected TBB"); - HeadCmpBBCC = ARM64CC::getInvertedCondCode(HeadCmpBBCC); - } - - CmpBBCond.clear(); - TBB = FBB = nullptr; - if (TII->AnalyzeBranch(*CmpBB, TBB, FBB, CmpBBCond)) { - DEBUG(dbgs() << "CmpBB branch not analyzable.\n"); - ++NumCmpBranchRejs; - return false; - } - - if (!TBB || CmpBBCond.empty()) { - DEBUG(dbgs() << "AnalyzeBranch didn't find conditional branch in CmpBB.\n"); - ++NumCmpBranchRejs; - return false; - } - - if (!parseCond(CmpBBCond, CmpBBTailCC)) { - DEBUG(dbgs() << "Unsupported branch type on CmpBB\n"); - ++NumCmpBranchRejs; - return false; - } - - if (TBB != Tail) - CmpBBTailCC = ARM64CC::getInvertedCondCode(CmpBBTailCC); - - DEBUG(dbgs() << "Head->CmpBB on " << ARM64CC::getCondCodeName(HeadCmpBBCC) - << ", CmpBB->Tail on " << ARM64CC::getCondCodeName(CmpBBTailCC) - << '\n'); - - CmpMI = findConvertibleCompare(CmpBB); - if (!CmpMI) - return false; - - if (!canSpeculateInstrs(CmpBB, CmpMI)) { - ++NumSpeculateRejs; - return false; - } - return true; -} - -void SSACCmpConv::convert(SmallVectorImpl &RemovedBlocks) { - DEBUG(dbgs() << "Merging BB#" << CmpBB->getNumber() << " into BB#" - << Head->getNumber() << ":\n" << *CmpBB); - - // All CmpBB instructions are moved into Head, and CmpBB is deleted. - // Update the CFG first. - updateTailPHIs(); - Head->removeSuccessor(CmpBB); - CmpBB->removeSuccessor(Tail); - Head->transferSuccessorsAndUpdatePHIs(CmpBB); - DebugLoc TermDL = Head->getFirstTerminator()->getDebugLoc(); - TII->RemoveBranch(*Head); - - // If the Head terminator was one of the cbz / tbz branches with built-in - // compare, we need to insert an explicit compare instruction in its place. - if (HeadCond[0].getImm() == -1) { - ++NumCompBranches; - unsigned Opc = 0; - switch (HeadCond[1].getImm()) { - case ARM64::CBZW: - case ARM64::CBNZW: - Opc = ARM64::SUBSWri; - break; - case ARM64::CBZX: - case ARM64::CBNZX: - Opc = ARM64::SUBSXri; - break; - default: - llvm_unreachable("Cannot convert Head branch"); - } - const MCInstrDesc &MCID = TII->get(Opc); - // Create a dummy virtual register for the SUBS def. 
- unsigned DestReg = - MRI->createVirtualRegister(TII->getRegClass(MCID, 0, TRI, *MF)); - // Insert a SUBS Rn, #0 instruction instead of the cbz / cbnz. - BuildMI(*Head, Head->end(), TermDL, MCID) - .addReg(DestReg, RegState::Define | RegState::Dead) - .addOperand(HeadCond[2]) - .addImm(0) - .addImm(0); - // SUBS uses the GPR*sp register classes. - MRI->constrainRegClass(HeadCond[2].getReg(), - TII->getRegClass(MCID, 1, TRI, *MF)); - } - - Head->splice(Head->end(), CmpBB, CmpBB->begin(), CmpBB->end()); - - // Now replace CmpMI with a ccmp instruction that also considers the incoming - // flags. - unsigned Opc = 0; - unsigned FirstOp = 1; // First CmpMI operand to copy. - bool isZBranch = false; // CmpMI is a cbz/cbnz instruction. - switch (CmpMI->getOpcode()) { - default: - llvm_unreachable("Unknown compare opcode"); - case ARM64::SUBSWri: Opc = ARM64::CCMPWi; break; - case ARM64::SUBSWrr: Opc = ARM64::CCMPWr; break; - case ARM64::SUBSXri: Opc = ARM64::CCMPXi; break; - case ARM64::SUBSXrr: Opc = ARM64::CCMPXr; break; - case ARM64::ADDSWri: Opc = ARM64::CCMNWi; break; - case ARM64::ADDSWrr: Opc = ARM64::CCMNWr; break; - case ARM64::ADDSXri: Opc = ARM64::CCMNXi; break; - case ARM64::ADDSXrr: Opc = ARM64::CCMNXr; break; - case ARM64::FCMPSrr: Opc = ARM64::FCCMPSrr; FirstOp = 0; break; - case ARM64::FCMPDrr: Opc = ARM64::FCCMPDrr; FirstOp = 0; break; - case ARM64::FCMPESrr: Opc = ARM64::FCCMPESrr; FirstOp = 0; break; - case ARM64::FCMPEDrr: Opc = ARM64::FCCMPEDrr; FirstOp = 0; break; - case ARM64::CBZW: - case ARM64::CBNZW: - Opc = ARM64::CCMPWi; - FirstOp = 0; - isZBranch = true; - break; - case ARM64::CBZX: - case ARM64::CBNZX: - Opc = ARM64::CCMPXi; - FirstOp = 0; - isZBranch = true; - break; - } - - // The ccmp instruction should set the flags according to the comparison when - // Head would have branched to CmpBB. - // The NZCV immediate operand should provide flags for the case where Head - // would have branched to Tail. These flags should cause the new Head - // terminator to branch to tail. - unsigned NZCV = ARM64CC::getNZCVToSatisfyCondCode(CmpBBTailCC); - const MCInstrDesc &MCID = TII->get(Opc); - MRI->constrainRegClass(CmpMI->getOperand(FirstOp).getReg(), - TII->getRegClass(MCID, 0, TRI, *MF)); - if (CmpMI->getOperand(FirstOp + 1).isReg()) - MRI->constrainRegClass(CmpMI->getOperand(FirstOp + 1).getReg(), - TII->getRegClass(MCID, 1, TRI, *MF)); - MachineInstrBuilder MIB = - BuildMI(*Head, CmpMI, CmpMI->getDebugLoc(), MCID) - .addOperand(CmpMI->getOperand(FirstOp)); // Register Rn - if (isZBranch) - MIB.addImm(0); // cbz/cbnz Rn -> ccmp Rn, #0 - else - MIB.addOperand(CmpMI->getOperand(FirstOp + 1)); // Register Rm / Immediate - MIB.addImm(NZCV).addImm(HeadCmpBBCC); - - // If CmpMI was a terminator, we need a new conditional branch to replace it. - // This now becomes a Head terminator. - if (isZBranch) { - bool isNZ = CmpMI->getOpcode() == ARM64::CBNZW || - CmpMI->getOpcode() == ARM64::CBNZX; - BuildMI(*Head, CmpMI, CmpMI->getDebugLoc(), TII->get(ARM64::Bcc)) - .addImm(isNZ ? ARM64CC::NE : ARM64CC::EQ) - .addOperand(CmpMI->getOperand(1)); // Branch target. 
- } - CmpMI->eraseFromParent(); - Head->updateTerminator(); - - RemovedBlocks.push_back(CmpBB); - CmpBB->eraseFromParent(); - DEBUG(dbgs() << "Result:\n" << *Head); - ++NumConverted; -} - -int SSACCmpConv::expectedCodeSizeDelta() const { - int delta = 0; - // If the Head terminator was one of the cbz / tbz branches with built-in - // compare, we need to insert an explicit compare instruction in its place - // plus a branch instruction. - if (HeadCond[0].getImm() == -1) { - switch (HeadCond[1].getImm()) { - case ARM64::CBZW: - case ARM64::CBNZW: - case ARM64::CBZX: - case ARM64::CBNZX: - // Therefore delta += 1 - delta = 1; - break; - default: - llvm_unreachable("Cannot convert Head branch"); - } - } - // If the Cmp terminator was one of the cbz / tbz branches with - // built-in compare, it will be turned into a compare instruction - // into Head, but we do not save any instruction. - // Otherwise, we save the branch instruction. - switch (CmpMI->getOpcode()) { - default: - --delta; - break; - case ARM64::CBZW: - case ARM64::CBNZW: - case ARM64::CBZX: - case ARM64::CBNZX: - break; - } - return delta; -} - -//===----------------------------------------------------------------------===// -// ARM64ConditionalCompares Pass -//===----------------------------------------------------------------------===// - -namespace { -class ARM64ConditionalCompares : public MachineFunctionPass { - const TargetInstrInfo *TII; - const TargetRegisterInfo *TRI; - const MCSchedModel *SchedModel; - // Does the proceeded function has Oz attribute. - bool MinSize; - MachineRegisterInfo *MRI; - MachineDominatorTree *DomTree; - MachineLoopInfo *Loops; - MachineTraceMetrics *Traces; - MachineTraceMetrics::Ensemble *MinInstr; - SSACCmpConv CmpConv; - -public: - static char ID; - ARM64ConditionalCompares() : MachineFunctionPass(ID) {} - void getAnalysisUsage(AnalysisUsage &AU) const override; - bool runOnMachineFunction(MachineFunction &MF) override; - const char *getPassName() const override { - return "ARM64 Conditional Compares"; - } - -private: - bool tryConvert(MachineBasicBlock *); - void updateDomTree(ArrayRef Removed); - void updateLoops(ArrayRef Removed); - void invalidateTraces(); - bool shouldConvert(); -}; -} // end anonymous namespace - -char ARM64ConditionalCompares::ID = 0; - -namespace llvm { -void initializeARM64ConditionalComparesPass(PassRegistry &); -} - -INITIALIZE_PASS_BEGIN(ARM64ConditionalCompares, "arm64-ccmp", "ARM64 CCMP Pass", - false, false) -INITIALIZE_PASS_DEPENDENCY(MachineBranchProbabilityInfo) -INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree) -INITIALIZE_PASS_DEPENDENCY(MachineTraceMetrics) -INITIALIZE_PASS_END(ARM64ConditionalCompares, "arm64-ccmp", "ARM64 CCMP Pass", - false, false) - -FunctionPass *llvm::createARM64ConditionalCompares() { - return new ARM64ConditionalCompares(); -} - -void ARM64ConditionalCompares::getAnalysisUsage(AnalysisUsage &AU) const { - AU.addRequired(); - AU.addRequired(); - AU.addPreserved(); - AU.addRequired(); - AU.addPreserved(); - AU.addRequired(); - AU.addPreserved(); - MachineFunctionPass::getAnalysisUsage(AU); -} - -/// Update the dominator tree after if-conversion erased some blocks. -void -ARM64ConditionalCompares::updateDomTree(ArrayRef Removed) { - // convert() removes CmpBB which was previously dominated by Head. - // CmpBB children should be transferred to Head. 
- MachineDomTreeNode *HeadNode = DomTree->getNode(CmpConv.Head); - for (unsigned i = 0, e = Removed.size(); i != e; ++i) { - MachineDomTreeNode *Node = DomTree->getNode(Removed[i]); - assert(Node != HeadNode && "Cannot erase the head node"); - assert(Node->getIDom() == HeadNode && "CmpBB should be dominated by Head"); - while (Node->getNumChildren()) - DomTree->changeImmediateDominator(Node->getChildren().back(), HeadNode); - DomTree->eraseNode(Removed[i]); - } -} - -/// Update LoopInfo after if-conversion. -void -ARM64ConditionalCompares::updateLoops(ArrayRef Removed) { - if (!Loops) - return; - for (unsigned i = 0, e = Removed.size(); i != e; ++i) - Loops->removeBlock(Removed[i]); -} - -/// Invalidate MachineTraceMetrics before if-conversion. -void ARM64ConditionalCompares::invalidateTraces() { - Traces->invalidate(CmpConv.Head); - Traces->invalidate(CmpConv.CmpBB); -} - -/// Apply cost model and heuristics to the if-conversion in IfConv. -/// Return true if the conversion is a good idea. -/// -bool ARM64ConditionalCompares::shouldConvert() { - // Stress testing mode disables all cost considerations. - if (Stress) - return true; - if (!MinInstr) - MinInstr = Traces->getEnsemble(MachineTraceMetrics::TS_MinInstrCount); - - // Head dominates CmpBB, so it is always included in its trace. - MachineTraceMetrics::Trace Trace = MinInstr->getTrace(CmpConv.CmpBB); - - // If code size is the main concern - if (MinSize) { - int CodeSizeDelta = CmpConv.expectedCodeSizeDelta(); - DEBUG(dbgs() << "Code size delta: " << CodeSizeDelta << '\n'); - // If we are minimizing the code size, do the conversion whatever - // the cost is. - if (CodeSizeDelta < 0) - return true; - if (CodeSizeDelta > 0) { - DEBUG(dbgs() << "Code size is increasing, give up on this one.\n"); - return false; - } - // CodeSizeDelta == 0, continue with the regular heuristics - } - - // Heuristic: The compare conversion delays the execution of the branch - // instruction because we must wait for the inputs to the second compare as - // well. The branch has no dependent instructions, but delaying it increases - // the cost of a misprediction. - // - // Set a limit on the delay we will accept. - unsigned DelayLimit = SchedModel->MispredictPenalty * 3 / 4; - - // Instruction depths can be computed for all trace instructions above CmpBB. - unsigned HeadDepth = - Trace.getInstrCycles(CmpConv.Head->getFirstTerminator()).Depth; - unsigned CmpBBDepth = - Trace.getInstrCycles(CmpConv.CmpBB->getFirstTerminator()).Depth; - DEBUG(dbgs() << "Head depth: " << HeadDepth - << "\nCmpBB depth: " << CmpBBDepth << '\n'); - if (CmpBBDepth > HeadDepth + DelayLimit) { - DEBUG(dbgs() << "Branch delay would be larger than " << DelayLimit - << " cycles.\n"); - return false; - } - - // Check the resource depth at the bottom of CmpBB - these instructions will - // be speculated. - unsigned ResDepth = Trace.getResourceDepth(true); - DEBUG(dbgs() << "Resources: " << ResDepth << '\n'); - - // Heuristic: The speculatively executed instructions must all be able to - // merge into the Head block. The Head critical path should dominate the - // resource cost of the speculated instructions. 
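// Worked example with made-up numbers: a MispredictPenalty of 16 cycles gives
// DelayLimit = 16 * 3 / 4 = 12, so a CmpBB terminator depth of 30 against a
// Head terminator depth of 15 (a 15-cycle extra delay) was already rejected
// above; here, a speculated resource depth of 20 against a Head depth of 15
// fails the ResDepth > HeadDepth check below and also gives up.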
- if (ResDepth > HeadDepth) { - DEBUG(dbgs() << "Too many instructions to speculate.\n"); - return false; - } - return true; -} - -bool ARM64ConditionalCompares::tryConvert(MachineBasicBlock *MBB) { - bool Changed = false; - while (CmpConv.canConvert(MBB) && shouldConvert()) { - invalidateTraces(); - SmallVector RemovedBlocks; - CmpConv.convert(RemovedBlocks); - Changed = true; - updateDomTree(RemovedBlocks); - updateLoops(RemovedBlocks); - } - return Changed; -} - -bool ARM64ConditionalCompares::runOnMachineFunction(MachineFunction &MF) { - DEBUG(dbgs() << "********** ARM64 Conditional Compares **********\n" - << "********** Function: " << MF.getName() << '\n'); - TII = MF.getTarget().getInstrInfo(); - TRI = MF.getTarget().getRegisterInfo(); - SchedModel = - MF.getTarget().getSubtarget().getSchedModel(); - MRI = &MF.getRegInfo(); - DomTree = &getAnalysis(); - Loops = getAnalysisIfAvailable(); - Traces = &getAnalysis(); - MinInstr = nullptr; - MinSize = MF.getFunction()->getAttributes().hasAttribute( - AttributeSet::FunctionIndex, Attribute::MinSize); - - bool Changed = false; - CmpConv.runOnMachineFunction(MF); - - // Visit blocks in dominator tree pre-order. The pre-order enables multiple - // cmp-conversions from the same head block. - // Note that updateDomTree() modifies the children of the DomTree node - // currently being visited. The df_iterator supports that; it doesn't look at - // child_begin() / child_end() until after a node has been visited. - for (auto *I : depth_first(DomTree)) - if (tryConvert(I->getBlock())) - Changed = true; - - return Changed; -} diff --git a/lib/Target/ARM64/ARM64DeadRegisterDefinitionsPass.cpp b/lib/Target/ARM64/ARM64DeadRegisterDefinitionsPass.cpp deleted file mode 100644 index e8f03ec833f..00000000000 --- a/lib/Target/ARM64/ARM64DeadRegisterDefinitionsPass.cpp +++ /dev/null @@ -1,134 +0,0 @@ -//===-- ARM64DeadRegisterDefinitions.cpp - Replace dead defs w/ zero reg --===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// When allowed by the instruction, replace a dead definition of a GPR with -// the zero register. This makes the code a bit friendlier towards the -// hardware's register renamer. -//===----------------------------------------------------------------------===// - -#include "ARM64.h" -#include "ARM64RegisterInfo.h" -#include "llvm/ADT/Statistic.h" -#include "llvm/CodeGen/MachineFunctionPass.h" -#include "llvm/CodeGen/MachineFunction.h" -#include "llvm/CodeGen/MachineInstr.h" -#include "llvm/Support/Debug.h" -#include "llvm/Support/raw_ostream.h" -using namespace llvm; - -#define DEBUG_TYPE "arm64-dead-defs" - -STATISTIC(NumDeadDefsReplaced, "Number of dead definitions replaced"); - -namespace { -class ARM64DeadRegisterDefinitions : public MachineFunctionPass { -private: - const TargetRegisterInfo *TRI; - bool implicitlyDefinesOverlappingReg(unsigned Reg, const MachineInstr &MI); - bool processMachineBasicBlock(MachineBasicBlock &MBB); - bool usesFrameIndex(const MachineInstr &MI); -public: - static char ID; // Pass identification, replacement for typeid. 
- explicit ARM64DeadRegisterDefinitions() : MachineFunctionPass(ID) {} - - virtual bool runOnMachineFunction(MachineFunction &F) override; - - const char *getPassName() const override { return "Dead register definitions"; } - - virtual void getAnalysisUsage(AnalysisUsage &AU) const override { - AU.setPreservesCFG(); - MachineFunctionPass::getAnalysisUsage(AU); - } -}; -char ARM64DeadRegisterDefinitions::ID = 0; -} // end anonymous namespace - -bool ARM64DeadRegisterDefinitions::implicitlyDefinesOverlappingReg( - unsigned Reg, const MachineInstr &MI) { - for (const MachineOperand &MO : MI.implicit_operands()) - if (MO.isReg() && MO.isDef()) - if (TRI->regsOverlap(Reg, MO.getReg())) - return true; - return false; -} - -bool ARM64DeadRegisterDefinitions::usesFrameIndex(const MachineInstr &MI) { - for (const MachineOperand &Op : MI.uses()) - if (Op.isFI()) - return true; - return false; -} - -bool -ARM64DeadRegisterDefinitions::processMachineBasicBlock(MachineBasicBlock &MBB) { - bool Changed = false; - for (MachineInstr &MI : MBB) { - if (usesFrameIndex(MI)) { - // We need to skip this instruction because while it appears to have a - // dead def it uses a frame index which might expand into a multi - // instruction sequence during EPI. - DEBUG(dbgs() << " Ignoring, operand is frame index\n"); - continue; - } - for (int i = 0, e = MI.getDesc().getNumDefs(); i != e; ++i) { - MachineOperand &MO = MI.getOperand(i); - if (MO.isReg() && MO.isDead() && MO.isDef()) { - assert(!MO.isImplicit() && "Unexpected implicit def!"); - DEBUG(dbgs() << " Dead def operand #" << i << " in:\n "; - MI.print(dbgs())); - // Be careful not to change the register if it's a tied operand. - if (MI.isRegTiedToUseOperand(i)) { - DEBUG(dbgs() << " Ignoring, def is tied operand.\n"); - continue; - } - // Don't change the register if there's an implicit def of a subreg or - // supperreg. - if (implicitlyDefinesOverlappingReg(MO.getReg(), MI)) { - DEBUG(dbgs() << " Ignoring, implicitly defines overlap reg.\n"); - continue; - } - // Make sure the instruction take a register class that contains - // the zero register and replace it if so. - unsigned NewReg; - switch (MI.getDesc().OpInfo[i].RegClass) { - default: - DEBUG(dbgs() << " Ignoring, register is not a GPR.\n"); - continue; - case ARM64::GPR32RegClassID: - NewReg = ARM64::WZR; - break; - case ARM64::GPR64RegClassID: - NewReg = ARM64::XZR; - break; - } - DEBUG(dbgs() << " Replacing with zero register. New:\n "); - MO.setReg(NewReg); - DEBUG(MI.print(dbgs())); - ++NumDeadDefsReplaced; - } - } - } - return Changed; -} - -// Scan the function for instructions that have a dead definition of a -// register. Replace that register with the zero register when possible. 
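Reduced to its core decision, the switch in processMachineBasicBlock() above replaces a dead definition with WZR or XZR only when the operand's register class is one of the two GPR classes. The enum and helper below are invented for illustration and stand in for the real ARM64::GPR32RegClassID / ARM64::GPR64RegClassID cases.

// Illustrative sketch, not part of ARM64DeadRegisterDefinitionsPass.cpp.
enum class GPRClass { GPR32, GPR64, Other };
// Only the 32- and 64-bit GPR classes have a zero register; any other dead
// definition is left untouched, as in the pass above.
static const char *zeroRegisterFor(GPRClass RC) {
  switch (RC) {
  case GPRClass::GPR32: return "WZR";
  case GPRClass::GPR64: return "XZR";
  default:              return nullptr;
  }
}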
-bool ARM64DeadRegisterDefinitions::runOnMachineFunction(MachineFunction &MF) { - TRI = MF.getTarget().getRegisterInfo(); - bool Changed = false; - DEBUG(dbgs() << "***** ARM64DeadRegisterDefinitions *****\n"); - - for (auto &MBB : MF) - if (processMachineBasicBlock(MBB)) - Changed = true; - return Changed; -} - -FunctionPass *llvm::createARM64DeadRegisterDefinitions() { - return new ARM64DeadRegisterDefinitions(); -} diff --git a/lib/Target/ARM64/ARM64ExpandPseudoInsts.cpp b/lib/Target/ARM64/ARM64ExpandPseudoInsts.cpp deleted file mode 100644 index a4b5d31314e..00000000000 --- a/lib/Target/ARM64/ARM64ExpandPseudoInsts.cpp +++ /dev/null @@ -1,745 +0,0 @@ -//===-- ARM64ExpandPseudoInsts.cpp - Expand pseudo instructions ---*- C++ -*-=// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -// This file contains a pass that expands pseudo instructions into target -// instructions to allow proper scheduling and other late optimizations. This -// pass should be run after register allocation but before the post-regalloc -// scheduling pass. -// -//===----------------------------------------------------------------------===// - -#include "MCTargetDesc/ARM64AddressingModes.h" -#include "ARM64InstrInfo.h" -#include "llvm/CodeGen/MachineFunctionPass.h" -#include "llvm/CodeGen/MachineInstrBuilder.h" -#include "llvm/Support/MathExtras.h" -using namespace llvm; - -namespace { -class ARM64ExpandPseudo : public MachineFunctionPass { -public: - static char ID; - ARM64ExpandPseudo() : MachineFunctionPass(ID) {} - - const ARM64InstrInfo *TII; - - bool runOnMachineFunction(MachineFunction &Fn) override; - - const char *getPassName() const override { - return "ARM64 pseudo instruction expansion pass"; - } - -private: - bool expandMBB(MachineBasicBlock &MBB); - bool expandMI(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI); - bool expandMOVImm(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, - unsigned BitSize); -}; -char ARM64ExpandPseudo::ID = 0; -} - -/// \brief Transfer implicit operands on the pseudo instruction to the -/// instructions created from the expansion. -static void transferImpOps(MachineInstr &OldMI, MachineInstrBuilder &UseMI, - MachineInstrBuilder &DefMI) { - const MCInstrDesc &Desc = OldMI.getDesc(); - for (unsigned i = Desc.getNumOperands(), e = OldMI.getNumOperands(); i != e; - ++i) { - const MachineOperand &MO = OldMI.getOperand(i); - assert(MO.isReg() && MO.getReg()); - if (MO.isUse()) - UseMI.addOperand(MO); - else - DefMI.addOperand(MO); - } -} - -/// \brief Helper function which extracts the specified 16-bit chunk from a -/// 64-bit value. -static uint64_t getChunk(uint64_t Imm, unsigned ChunkIdx) { - assert(ChunkIdx < 4 && "Out of range chunk index specified!"); - - return (Imm >> (ChunkIdx * 16)) & 0xFFFF; -} - -/// \brief Helper function which replicates a 16-bit chunk within a 64-bit -/// value. Indices correspond to element numbers in a v4i16. -static uint64_t replicateChunk(uint64_t Imm, unsigned FromIdx, unsigned ToIdx) { - assert((FromIdx < 4) && (ToIdx < 4) && "Out of range chunk index specified!"); - const unsigned ShiftAmt = ToIdx * 16; - - // Replicate the source chunk to the destination position. - const uint64_t Chunk = getChunk(Imm, FromIdx) << ShiftAmt; - // Clear the destination chunk. - Imm &= ~(0xFFFFLL << ShiftAmt); - // Insert the replicated chunk. 
- return Imm | Chunk; -} - -/// \brief Helper function which tries to materialize a 64-bit value with an -/// ORR + MOVK instruction sequence. -static bool tryOrrMovk(uint64_t UImm, uint64_t OrrImm, MachineInstr &MI, - MachineBasicBlock &MBB, - MachineBasicBlock::iterator &MBBI, - const ARM64InstrInfo *TII, unsigned ChunkIdx) { - assert(ChunkIdx < 4 && "Out of range chunk index specified!"); - const unsigned ShiftAmt = ChunkIdx * 16; - - uint64_t Encoding; - if (ARM64_AM::processLogicalImmediate(OrrImm, 64, Encoding)) { - // Create the ORR-immediate instruction. - MachineInstrBuilder MIB = - BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(ARM64::ORRXri)) - .addOperand(MI.getOperand(0)) - .addReg(ARM64::XZR) - .addImm(Encoding); - - // Create the MOVK instruction. - const unsigned Imm16 = getChunk(UImm, ChunkIdx); - const unsigned DstReg = MI.getOperand(0).getReg(); - const bool DstIsDead = MI.getOperand(0).isDead(); - MachineInstrBuilder MIB1 = - BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(ARM64::MOVKXi)) - .addReg(DstReg, RegState::Define | getDeadRegState(DstIsDead)) - .addReg(DstReg) - .addImm(Imm16) - .addImm(ARM64_AM::getShifterImm(ARM64_AM::LSL, ShiftAmt)); - - transferImpOps(MI, MIB, MIB1); - MI.eraseFromParent(); - return true; - } - - return false; -} - -/// \brief Check whether the given 16-bit chunk replicated to full 64-bit width -/// can be materialized with an ORR instruction. -static bool canUseOrr(uint64_t Chunk, uint64_t &Encoding) { - Chunk = (Chunk << 48) | (Chunk << 32) | (Chunk << 16) | Chunk; - - return ARM64_AM::processLogicalImmediate(Chunk, 64, Encoding); -} - -/// \brief Check for identical 16-bit chunks within the constant and if so -/// materialize them with a single ORR instruction. The remaining one or two -/// 16-bit chunks will be materialized with MOVK instructions. -/// -/// This allows us to materialize constants like |A|B|A|A| or |A|B|C|A| (order -/// of the chunks doesn't matter), assuming |A|A|A|A| can be materialized with -/// an ORR instruction. -/// -static bool tryToreplicateChunks(uint64_t UImm, MachineInstr &MI, - MachineBasicBlock &MBB, - MachineBasicBlock::iterator &MBBI, - const ARM64InstrInfo *TII) { - typedef DenseMap CountMap; - CountMap Counts; - - // Scan the constant and count how often every chunk occurs. - for (unsigned Idx = 0; Idx < 4; ++Idx) - ++Counts[getChunk(UImm, Idx)]; - - // Traverse the chunks to find one which occurs more than once. - for (CountMap::const_iterator Chunk = Counts.begin(), End = Counts.end(); - Chunk != End; ++Chunk) { - const uint64_t ChunkVal = Chunk->first; - const unsigned Count = Chunk->second; - - uint64_t Encoding = 0; - - // We are looking for chunks which have two or three instances and can be - // materialized with an ORR instruction. - if ((Count != 2 && Count != 3) || !canUseOrr(ChunkVal, Encoding)) - continue; - - const bool CountThree = Count == 3; - // Create the ORR-immediate instruction. - MachineInstrBuilder MIB = - BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(ARM64::ORRXri)) - .addOperand(MI.getOperand(0)) - .addReg(ARM64::XZR) - .addImm(Encoding); - - const unsigned DstReg = MI.getOperand(0).getReg(); - const bool DstIsDead = MI.getOperand(0).isDead(); - - unsigned ShiftAmt = 0; - uint64_t Imm16 = 0; - // Find the first chunk not materialized with the ORR instruction. - for (; ShiftAmt < 64; ShiftAmt += 16) { - Imm16 = (UImm >> ShiftAmt) & 0xFFFF; - - if (Imm16 != ChunkVal) - break; - } - - // Create the first MOVK instruction. 
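canUseOrr() above first broadcasts the candidate 16-bit chunk to all four positions and only then asks whether the result is a valid logical (ORR) immediate. A self-contained restatement of that broadcast, with an invented helper name:

// Illustrative sketch, not part of ARM64ExpandPseudoInsts.cpp.
#include <cstdint>
// Replicate one 16-bit chunk into all four chunk positions of a 64-bit value,
// mirroring the shift/or expression in canUseOrr().
static uint64_t splat16(uint64_t Chunk) {
  Chunk &= 0xFFFF;
  return (Chunk << 48) | (Chunk << 32) | (Chunk << 16) | Chunk;
}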
- MachineInstrBuilder MIB1 = - BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(ARM64::MOVKXi)) - .addReg(DstReg, - RegState::Define | getDeadRegState(DstIsDead && CountThree)) - .addReg(DstReg) - .addImm(Imm16) - .addImm(ARM64_AM::getShifterImm(ARM64_AM::LSL, ShiftAmt)); - - // In case we have three instances the whole constant is now materialized - // and we can exit. - if (CountThree) { - transferImpOps(MI, MIB, MIB1); - MI.eraseFromParent(); - return true; - } - - // Find the remaining chunk which needs to be materialized. - for (ShiftAmt += 16; ShiftAmt < 64; ShiftAmt += 16) { - Imm16 = (UImm >> ShiftAmt) & 0xFFFF; - - if (Imm16 != ChunkVal) - break; - } - - // Create the second MOVK instruction. - MachineInstrBuilder MIB2 = - BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(ARM64::MOVKXi)) - .addReg(DstReg, RegState::Define | getDeadRegState(DstIsDead)) - .addReg(DstReg) - .addImm(Imm16) - .addImm(ARM64_AM::getShifterImm(ARM64_AM::LSL, ShiftAmt)); - - transferImpOps(MI, MIB, MIB2); - MI.eraseFromParent(); - return true; - } - - return false; -} - -/// \brief Check whether this chunk matches the pattern '1...0...'. This pattern -/// starts a contiguous sequence of ones if we look at the bits from the LSB -/// towards the MSB. -static bool isStartChunk(uint64_t Chunk) { - if (Chunk == 0 || Chunk == UINT64_MAX) - return false; - - return (CountLeadingOnes_64(Chunk) + countTrailingZeros(Chunk)) == 64; -} - -/// \brief Check whether this chunk matches the pattern '0...1...' This pattern -/// ends a contiguous sequence of ones if we look at the bits from the LSB -/// towards the MSB. -static bool isEndChunk(uint64_t Chunk) { - if (Chunk == 0 || Chunk == UINT64_MAX) - return false; - - return (countLeadingZeros(Chunk) + CountTrailingOnes_64(Chunk)) == 64; -} - -/// \brief Clear or set all bits in the chunk at the given index. -static uint64_t updateImm(uint64_t Imm, unsigned Idx, bool Clear) { - const uint64_t Mask = 0xFFFF; - - if (Clear) - // Clear chunk in the immediate. - Imm &= ~(Mask << (Idx * 16)); - else - // Set all bits in the immediate for the particular chunk. - Imm |= Mask << (Idx * 16); - - return Imm; -} - -/// \brief Check whether the constant contains a sequence of contiguous ones, -/// which might be interrupted by one or two chunks. If so, materialize the -/// sequence of contiguous ones with an ORR instruction. -/// Materialize the chunks which are either interrupting the sequence or outside -/// of the sequence with a MOVK instruction. -/// -/// Assuming S is a chunk which starts the sequence (1...0...), E is a chunk -/// which ends the sequence (0...1...). Then we are looking for constants which -/// contain at least one S and E chunk. -/// E.g. |E|A|B|S|, |A|E|B|S| or |A|B|E|S|. -/// -/// We are also looking for constants like |S|A|B|E| where the contiguous -/// sequence of ones wraps around the MSB into the LSB. -/// -static bool trySequenceOfOnes(uint64_t UImm, MachineInstr &MI, - MachineBasicBlock &MBB, - MachineBasicBlock::iterator &MBBI, - const ARM64InstrInfo *TII) { - const int NotSet = -1; - const uint64_t Mask = 0xFFFF; - - int StartIdx = NotSet; - int EndIdx = NotSet; - // Try to find the chunks which start/end a contiguous sequence of ones. - for (int Idx = 0; Idx < 4; ++Idx) { - int64_t Chunk = getChunk(UImm, Idx); - // Sign extend the 16-bit chunk to 64-bit. - Chunk = (Chunk << 48) >> 48; - - if (isStartChunk(Chunk)) - StartIdx = Idx; - else if (isEndChunk(Chunk)) - EndIdx = Idx; - } - - // Early exit in case we can't find a start/end chunk. 
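The start/end classification used by trySequenceOfOnes() above can be restated with the C++20 <bit> helpers; the names below are invented, and the original code uses LLVM's CountLeadingOnes_64, countLeadingZeros and friends instead.

// Illustrative sketch, not part of ARM64ExpandPseudoInsts.cpp.
#include <bit>
#include <cstdint>
// '1...0...': ones growing down from the MSB, i.e. the chunk that starts a
// run of ones when scanning from the LSB towards the MSB.
static bool isStartChunkDemo(uint64_t Chunk) {
  if (Chunk == 0 || Chunk == UINT64_MAX)
    return false;
  return std::countl_one(Chunk) + std::countr_zero(Chunk) == 64;
}
// '0...1...': ones growing up from the LSB, i.e. the chunk that ends the run.
static bool isEndChunkDemo(uint64_t Chunk) {
  if (Chunk == 0 || Chunk == UINT64_MAX)
    return false;
  return std::countl_zero(Chunk) + std::countr_one(Chunk) == 64;
}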
- if (StartIdx == NotSet || EndIdx == NotSet) - return false; - - // Outside of the contiguous sequence of ones everything needs to be zero. - uint64_t Outside = 0; - // Chunks between the start and end chunk need to have all their bits set. - uint64_t Inside = Mask; - - // If our contiguous sequence of ones wraps around from the MSB into the LSB, - // just swap indices and pretend we are materializing a contiguous sequence - // of zeros surrounded by a contiguous sequence of ones. - if (StartIdx > EndIdx) { - std::swap(StartIdx, EndIdx); - std::swap(Outside, Inside); - } - - uint64_t OrrImm = UImm; - int FirstMovkIdx = NotSet; - int SecondMovkIdx = NotSet; - - // Find out which chunks we need to patch up to obtain a contiguous sequence - // of ones. - for (int Idx = 0; Idx < 4; ++Idx) { - const uint64_t Chunk = getChunk(UImm, Idx); - - // Check whether we are looking at a chunk which is not part of the - // contiguous sequence of ones. - if ((Idx < StartIdx || EndIdx < Idx) && Chunk != Outside) { - OrrImm = updateImm(OrrImm, Idx, Outside == 0); - - // Remember the index we need to patch. - if (FirstMovkIdx == NotSet) - FirstMovkIdx = Idx; - else - SecondMovkIdx = Idx; - - // Check whether we are looking a chunk which is part of the contiguous - // sequence of ones. - } else if (Idx > StartIdx && Idx < EndIdx && Chunk != Inside) { - OrrImm = updateImm(OrrImm, Idx, Inside != Mask); - - // Remember the index we need to patch. - if (FirstMovkIdx == NotSet) - FirstMovkIdx = Idx; - else - SecondMovkIdx = Idx; - } - } - assert(FirstMovkIdx != NotSet && "Constant materializable with single ORR!"); - - // Create the ORR-immediate instruction. - uint64_t Encoding = 0; - ARM64_AM::processLogicalImmediate(OrrImm, 64, Encoding); - MachineInstrBuilder MIB = - BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(ARM64::ORRXri)) - .addOperand(MI.getOperand(0)) - .addReg(ARM64::XZR) - .addImm(Encoding); - - const unsigned DstReg = MI.getOperand(0).getReg(); - const bool DstIsDead = MI.getOperand(0).isDead(); - - const bool SingleMovk = SecondMovkIdx == NotSet; - // Create the first MOVK instruction. - MachineInstrBuilder MIB1 = - BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(ARM64::MOVKXi)) - .addReg(DstReg, - RegState::Define | getDeadRegState(DstIsDead && SingleMovk)) - .addReg(DstReg) - .addImm(getChunk(UImm, FirstMovkIdx)) - .addImm(ARM64_AM::getShifterImm(ARM64_AM::LSL, FirstMovkIdx * 16)); - - // Early exit in case we only need to emit a single MOVK instruction. - if (SingleMovk) { - transferImpOps(MI, MIB, MIB1); - MI.eraseFromParent(); - return true; - } - - // Create the second MOVK instruction. - MachineInstrBuilder MIB2 = - BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(ARM64::MOVKXi)) - .addReg(DstReg, RegState::Define | getDeadRegState(DstIsDead)) - .addReg(DstReg) - .addImm(getChunk(UImm, SecondMovkIdx)) - .addImm(ARM64_AM::getShifterImm(ARM64_AM::LSL, SecondMovkIdx * 16)); - - transferImpOps(MI, MIB, MIB2); - MI.eraseFromParent(); - return true; -} - -/// \brief Expand a MOVi32imm or MOVi64imm pseudo instruction to one or more -/// real move-immediate instructions to synthesize the immediate. -bool ARM64ExpandPseudo::expandMOVImm(MachineBasicBlock &MBB, - MachineBasicBlock::iterator MBBI, - unsigned BitSize) { - MachineInstr &MI = *MBBI; - uint64_t Imm = MI.getOperand(1).getImm(); - const unsigned Mask = 0xFFFF; - - // Try a MOVI instruction (aka ORR-immediate with the zero register). 
- uint64_t UImm = Imm << (64 - BitSize) >> (64 - BitSize); - uint64_t Encoding; - if (ARM64_AM::processLogicalImmediate(UImm, BitSize, Encoding)) { - unsigned Opc = (BitSize == 32 ? ARM64::ORRWri : ARM64::ORRXri); - MachineInstrBuilder MIB = - BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(Opc)) - .addOperand(MI.getOperand(0)) - .addReg(BitSize == 32 ? ARM64::WZR : ARM64::XZR) - .addImm(Encoding); - transferImpOps(MI, MIB, MIB); - MI.eraseFromParent(); - return true; - } - - // Scan the immediate and count the number of 16-bit chunks which are either - // all ones or all zeros. - unsigned OneChunks = 0; - unsigned ZeroChunks = 0; - for (unsigned Shift = 0; Shift < BitSize; Shift += 16) { - const unsigned Chunk = (Imm >> Shift) & Mask; - if (Chunk == Mask) - OneChunks++; - else if (Chunk == 0) - ZeroChunks++; - } - - // Since we can't materialize the constant with a single ORR instruction, - // let's see whether we can materialize 3/4 of the constant with an ORR - // instruction and use an additional MOVK instruction to materialize the - // remaining 1/4. - // - // We are looking for constants with a pattern like: |A|X|B|X| or |X|A|X|B|. - // - // E.g. assuming |A|X|A|X| is a pattern which can be materialized with ORR, - // we would create the following instruction sequence: - // - // ORR x0, xzr, |A|X|A|X| - // MOVK x0, |B|, LSL #16 - // - // Only look at 64-bit constants which can't be materialized with a single - // instruction e.g. which have less than either three all zero or all one - // chunks. - // - // Ignore 32-bit constants here, they always can be materialized with a - // MOVZ/MOVN + MOVK pair. Since the 32-bit constant can't be materialized - // with a single ORR, the best sequence we can achieve is a ORR + MOVK pair. - // Thus we fall back to the default code below which in the best case creates - // a single MOVZ/MOVN instruction (in case one chunk is all zero or all one). - // - if (BitSize == 64 && OneChunks < 3 && ZeroChunks < 3) { - // If we interpret the 64-bit constant as a v4i16, are elements 0 and 2 - // identical? - if (getChunk(UImm, 0) == getChunk(UImm, 2)) { - // See if we can come up with a constant which can be materialized with - // ORR-immediate by replicating element 3 into element 1. - uint64_t OrrImm = replicateChunk(UImm, 3, 1); - if (tryOrrMovk(UImm, OrrImm, MI, MBB, MBBI, TII, 1)) - return true; - - // See if we can come up with a constant which can be materialized with - // ORR-immediate by replicating element 1 into element 3. - OrrImm = replicateChunk(UImm, 1, 3); - if (tryOrrMovk(UImm, OrrImm, MI, MBB, MBBI, TII, 3)) - return true; - - // If we interpret the 64-bit constant as a v4i16, are elements 1 and 3 - // identical? - } else if (getChunk(UImm, 1) == getChunk(UImm, 3)) { - // See if we can come up with a constant which can be materialized with - // ORR-immediate by replicating element 2 into element 0. - uint64_t OrrImm = replicateChunk(UImm, 2, 0); - if (tryOrrMovk(UImm, OrrImm, MI, MBB, MBBI, TII, 0)) - return true; - - // See if we can come up with a constant which can be materialized with - // ORR-immediate by replicating element 1 into element 3. - OrrImm = replicateChunk(UImm, 0, 2); - if (tryOrrMovk(UImm, OrrImm, MI, MBB, MBBI, TII, 2)) - return true; - } - } - - // Check for identical 16-bit chunks within the constant and if so materialize - // them with a single ORR instruction. The remaining one or two 16-bit chunks - // will be materialized with MOVK instructions. 
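Earlier in expandMOVImm() the immediate is scanned to count how many of its 16-bit chunks are all ones or all zeros; those counts steer the choice between the ORR+MOVK strategies and the MOVZ/MOVN+MOVK fallback. A standalone restatement with an invented function name:

// Illustrative sketch, not part of ARM64ExpandPseudoInsts.cpp.
#include <cstdint>
// Count the all-ones and all-zero 16-bit chunks of a 32- or 64-bit immediate.
static void countChunks(uint64_t Imm, unsigned BitSize,
                        unsigned &OneChunks, unsigned &ZeroChunks) {
  OneChunks = ZeroChunks = 0;
  for (unsigned Shift = 0; Shift < BitSize; Shift += 16) {
    const unsigned Chunk = (Imm >> Shift) & 0xFFFF;
    if (Chunk == 0xFFFF)
      ++OneChunks;
    else if (Chunk == 0)
      ++ZeroChunks;
  }
}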
- if (BitSize == 64 && tryToreplicateChunks(UImm, MI, MBB, MBBI, TII)) - return true; - - // Check whether the constant contains a sequence of contiguous ones, which - // might be interrupted by one or two chunks. If so, materialize the sequence - // of contiguous ones with an ORR instruction. Materialize the chunks which - // are either interrupting the sequence or outside of the sequence with a - // MOVK instruction. - if (BitSize == 64 && trySequenceOfOnes(UImm, MI, MBB, MBBI, TII)) - return true; - - // Use a MOVZ or MOVN instruction to set the high bits, followed by one or - // more MOVK instructions to insert additional 16-bit portions into the - // lower bits. - bool isNeg = false; - - // Use MOVN to materialize the high bits if we have more all one chunks - // than all zero chunks. - if (OneChunks > ZeroChunks) { - isNeg = true; - Imm = ~Imm; - } - - unsigned FirstOpc; - if (BitSize == 32) { - Imm &= (1LL << 32) - 1; - FirstOpc = (isNeg ? ARM64::MOVNWi : ARM64::MOVZWi); - } else { - FirstOpc = (isNeg ? ARM64::MOVNXi : ARM64::MOVZXi); - } - unsigned Shift = 0; // LSL amount for high bits with MOVZ/MOVN - unsigned LastShift = 0; // LSL amount for last MOVK - if (Imm != 0) { - unsigned LZ = countLeadingZeros(Imm); - unsigned TZ = countTrailingZeros(Imm); - Shift = ((63 - LZ) / 16) * 16; - LastShift = (TZ / 16) * 16; - } - unsigned Imm16 = (Imm >> Shift) & Mask; - unsigned DstReg = MI.getOperand(0).getReg(); - bool DstIsDead = MI.getOperand(0).isDead(); - MachineInstrBuilder MIB1 = - BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(FirstOpc)) - .addReg(DstReg, RegState::Define | - getDeadRegState(DstIsDead && Shift == LastShift)) - .addImm(Imm16) - .addImm(ARM64_AM::getShifterImm(ARM64_AM::LSL, Shift)); - - // If a MOVN was used for the high bits of a negative value, flip the rest - // of the bits back for use with MOVK. - if (isNeg) - Imm = ~Imm; - - if (Shift == LastShift) { - transferImpOps(MI, MIB1, MIB1); - MI.eraseFromParent(); - return true; - } - - MachineInstrBuilder MIB2; - unsigned Opc = (BitSize == 32 ? ARM64::MOVKWi : ARM64::MOVKXi); - while (Shift != LastShift) { - Shift -= 16; - Imm16 = (Imm >> Shift) & Mask; - if (Imm16 == (isNeg ? Mask : 0)) - continue; // This 16-bit portion is already set correctly. - MIB2 = BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(Opc)) - .addReg(DstReg, - RegState::Define | - getDeadRegState(DstIsDead && Shift == LastShift)) - .addReg(DstReg) - .addImm(Imm16) - .addImm(ARM64_AM::getShifterImm(ARM64_AM::LSL, Shift)); - } - - transferImpOps(MI, MIB1, MIB2); - MI.eraseFromParent(); - return true; -} - -/// \brief If MBBI references a pseudo instruction that should be expanded here, -/// do the expansion and return true. Otherwise return false. 
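The MOVZ/MOVN + MOVK tail above derives the first and last LSL amounts from the highest and lowest non-zero 16-bit chunks; when the two coincide a single instruction suffices. A hedged restatement using C++20 <bit> with invented names (the original calls LLVM's countLeadingZeros/countTrailingZeros):

// Illustrative sketch, not part of ARM64ExpandPseudoInsts.cpp.
#include <bit>
#include <cstdint>
// Shift is the LSL amount of the MOVZ/MOVN for the highest interesting chunk,
// LastShift that of the final MOVK for the lowest one.
static void movShiftRange(uint64_t Imm, unsigned &Shift, unsigned &LastShift) {
  Shift = LastShift = 0;
  if (Imm == 0)
    return;
  Shift = ((63 - std::countl_zero(Imm)) / 16) * 16;
  LastShift = (std::countr_zero(Imm) / 16) * 16;
}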
-bool ARM64ExpandPseudo::expandMI(MachineBasicBlock &MBB, - MachineBasicBlock::iterator MBBI) { - MachineInstr &MI = *MBBI; - unsigned Opcode = MI.getOpcode(); - switch (Opcode) { - default: - break; - - case ARM64::ADDWrr: - case ARM64::SUBWrr: - case ARM64::ADDXrr: - case ARM64::SUBXrr: - case ARM64::ADDSWrr: - case ARM64::SUBSWrr: - case ARM64::ADDSXrr: - case ARM64::SUBSXrr: - case ARM64::ANDWrr: - case ARM64::ANDXrr: - case ARM64::BICWrr: - case ARM64::BICXrr: - case ARM64::ANDSWrr: - case ARM64::ANDSXrr: - case ARM64::BICSWrr: - case ARM64::BICSXrr: - case ARM64::EONWrr: - case ARM64::EONXrr: - case ARM64::EORWrr: - case ARM64::EORXrr: - case ARM64::ORNWrr: - case ARM64::ORNXrr: - case ARM64::ORRWrr: - case ARM64::ORRXrr: { - unsigned Opcode; - switch (MI.getOpcode()) { - default: - return false; - case ARM64::ADDWrr: Opcode = ARM64::ADDWrs; break; - case ARM64::SUBWrr: Opcode = ARM64::SUBWrs; break; - case ARM64::ADDXrr: Opcode = ARM64::ADDXrs; break; - case ARM64::SUBXrr: Opcode = ARM64::SUBXrs; break; - case ARM64::ADDSWrr: Opcode = ARM64::ADDSWrs; break; - case ARM64::SUBSWrr: Opcode = ARM64::SUBSWrs; break; - case ARM64::ADDSXrr: Opcode = ARM64::ADDSXrs; break; - case ARM64::SUBSXrr: Opcode = ARM64::SUBSXrs; break; - case ARM64::ANDWrr: Opcode = ARM64::ANDWrs; break; - case ARM64::ANDXrr: Opcode = ARM64::ANDXrs; break; - case ARM64::BICWrr: Opcode = ARM64::BICWrs; break; - case ARM64::BICXrr: Opcode = ARM64::BICXrs; break; - case ARM64::ANDSWrr: Opcode = ARM64::ANDSWrs; break; - case ARM64::ANDSXrr: Opcode = ARM64::ANDSXrs; break; - case ARM64::BICSWrr: Opcode = ARM64::BICSWrs; break; - case ARM64::BICSXrr: Opcode = ARM64::BICSXrs; break; - case ARM64::EONWrr: Opcode = ARM64::EONWrs; break; - case ARM64::EONXrr: Opcode = ARM64::EONXrs; break; - case ARM64::EORWrr: Opcode = ARM64::EORWrs; break; - case ARM64::EORXrr: Opcode = ARM64::EORXrs; break; - case ARM64::ORNWrr: Opcode = ARM64::ORNWrs; break; - case ARM64::ORNXrr: Opcode = ARM64::ORNXrs; break; - case ARM64::ORRWrr: Opcode = ARM64::ORRWrs; break; - case ARM64::ORRXrr: Opcode = ARM64::ORRXrs; break; - } - MachineInstrBuilder MIB1 = - BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(Opcode), - MI.getOperand(0).getReg()) - .addOperand(MI.getOperand(1)) - .addOperand(MI.getOperand(2)) - .addImm(ARM64_AM::getShifterImm(ARM64_AM::LSL, 0)); - transferImpOps(MI, MIB1, MIB1); - MI.eraseFromParent(); - return true; - } - - case ARM64::FCVTSHpseudo: { - MachineOperand Src = MI.getOperand(1); - Src.setImplicit(); - unsigned SrcH = TII->getRegisterInfo().getSubReg(Src.getReg(), ARM64::hsub); - auto MIB = BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(ARM64::FCVTSHr)) - .addOperand(MI.getOperand(0)) - .addReg(SrcH, RegState::Undef) - .addOperand(Src); - transferImpOps(MI, MIB, MIB); - MI.eraseFromParent(); - return true; - } - case ARM64::LOADgot: { - // Expand into ADRP + LDR. 
- unsigned DstReg = MI.getOperand(0).getReg(); - const MachineOperand &MO1 = MI.getOperand(1); - unsigned Flags = MO1.getTargetFlags(); - MachineInstrBuilder MIB1 = - BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(ARM64::ADRP), DstReg); - MachineInstrBuilder MIB2 = - BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(ARM64::LDRXui)) - .addOperand(MI.getOperand(0)) - .addReg(DstReg); - - if (MO1.isGlobal()) { - MIB1.addGlobalAddress(MO1.getGlobal(), 0, Flags | ARM64II::MO_PAGE); - MIB2.addGlobalAddress(MO1.getGlobal(), 0, - Flags | ARM64II::MO_PAGEOFF | ARM64II::MO_NC); - } else if (MO1.isSymbol()) { - MIB1.addExternalSymbol(MO1.getSymbolName(), Flags | ARM64II::MO_PAGE); - MIB2.addExternalSymbol(MO1.getSymbolName(), - Flags | ARM64II::MO_PAGEOFF | ARM64II::MO_NC); - } else { - assert(MO1.isCPI() && - "Only expect globals, externalsymbols, or constant pools"); - MIB1.addConstantPoolIndex(MO1.getIndex(), MO1.getOffset(), - Flags | ARM64II::MO_PAGE); - MIB2.addConstantPoolIndex(MO1.getIndex(), MO1.getOffset(), - Flags | ARM64II::MO_PAGEOFF | ARM64II::MO_NC); - } - - transferImpOps(MI, MIB1, MIB2); - MI.eraseFromParent(); - return true; - } - - case ARM64::MOVaddr: - case ARM64::MOVaddrJT: - case ARM64::MOVaddrCP: - case ARM64::MOVaddrBA: - case ARM64::MOVaddrTLS: - case ARM64::MOVaddrEXT: { - // Expand into ADRP + ADD. - unsigned DstReg = MI.getOperand(0).getReg(); - MachineInstrBuilder MIB1 = - BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(ARM64::ADRP), DstReg) - .addOperand(MI.getOperand(1)); - - MachineInstrBuilder MIB2 = - BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(ARM64::ADDXri)) - .addOperand(MI.getOperand(0)) - .addReg(DstReg) - .addOperand(MI.getOperand(2)) - .addImm(0); - - transferImpOps(MI, MIB1, MIB2); - MI.eraseFromParent(); - return true; - } - - case ARM64::MOVi32imm: - return expandMOVImm(MBB, MBBI, 32); - case ARM64::MOVi64imm: - return expandMOVImm(MBB, MBBI, 64); - case ARM64::RET_ReallyLR: - BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(ARM64::RET)) - .addReg(ARM64::LR); - MI.eraseFromParent(); - return true; - } - return false; -} - -/// \brief Iterate over the instructions in basic block MBB and expand any -/// pseudo instructions. Return true if anything was modified. -bool ARM64ExpandPseudo::expandMBB(MachineBasicBlock &MBB) { - bool Modified = false; - - MachineBasicBlock::iterator MBBI = MBB.begin(), E = MBB.end(); - while (MBBI != E) { - MachineBasicBlock::iterator NMBBI = std::next(MBBI); - Modified |= expandMI(MBB, MBBI); - MBBI = NMBBI; - } - - return Modified; -} - -bool ARM64ExpandPseudo::runOnMachineFunction(MachineFunction &MF) { - TII = static_cast(MF.getTarget().getInstrInfo()); - - bool Modified = false; - for (auto &MBB : MF) - Modified |= expandMBB(MBB); - return Modified; -} - -/// \brief Returns an instance of the pseudo instruction expansion pass. -FunctionPass *llvm::createARM64ExpandPseudoPass() { - return new ARM64ExpandPseudo(); -} diff --git a/lib/Target/ARM64/ARM64FastISel.cpp b/lib/Target/ARM64/ARM64FastISel.cpp deleted file mode 100644 index f4bf616559a..00000000000 --- a/lib/Target/ARM64/ARM64FastISel.cpp +++ /dev/null @@ -1,1977 +0,0 @@ -//===-- ARM6464FastISel.cpp - ARM64 FastISel implementation ---------------===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -// This file defines the ARM64-specific support for the FastISel class. 
Some -// of the target-specific code is generated by tablegen in the file -// ARM64GenFastISel.inc, which is #included here. -// -//===----------------------------------------------------------------------===// - -#include "ARM64.h" -#include "ARM64TargetMachine.h" -#include "ARM64Subtarget.h" -#include "ARM64CallingConv.h" -#include "MCTargetDesc/ARM64AddressingModes.h" -#include "llvm/CodeGen/CallingConvLower.h" -#include "llvm/CodeGen/FastISel.h" -#include "llvm/CodeGen/FunctionLoweringInfo.h" -#include "llvm/CodeGen/MachineConstantPool.h" -#include "llvm/CodeGen/MachineFrameInfo.h" -#include "llvm/CodeGen/MachineInstrBuilder.h" -#include "llvm/CodeGen/MachineRegisterInfo.h" -#include "llvm/IR/CallingConv.h" -#include "llvm/IR/DataLayout.h" -#include "llvm/IR/DerivedTypes.h" -#include "llvm/IR/Function.h" -#include "llvm/IR/GetElementPtrTypeIterator.h" -#include "llvm/IR/GlobalAlias.h" -#include "llvm/IR/GlobalVariable.h" -#include "llvm/IR/Instructions.h" -#include "llvm/IR/IntrinsicInst.h" -#include "llvm/IR/Operator.h" -#include "llvm/Support/CommandLine.h" -using namespace llvm; - -namespace { - -class ARM64FastISel : public FastISel { - - class Address { - public: - typedef enum { - RegBase, - FrameIndexBase - } BaseKind; - - private: - BaseKind Kind; - union { - unsigned Reg; - int FI; - } Base; - int64_t Offset; - - public: - Address() : Kind(RegBase), Offset(0) { Base.Reg = 0; } - void setKind(BaseKind K) { Kind = K; } - BaseKind getKind() const { return Kind; } - bool isRegBase() const { return Kind == RegBase; } - bool isFIBase() const { return Kind == FrameIndexBase; } - void setReg(unsigned Reg) { - assert(isRegBase() && "Invalid base register access!"); - Base.Reg = Reg; - } - unsigned getReg() const { - assert(isRegBase() && "Invalid base register access!"); - return Base.Reg; - } - void setFI(unsigned FI) { - assert(isFIBase() && "Invalid base frame index access!"); - Base.FI = FI; - } - unsigned getFI() const { - assert(isFIBase() && "Invalid base frame index access!"); - return Base.FI; - } - void setOffset(int64_t O) { Offset = O; } - int64_t getOffset() { return Offset; } - - bool isValid() { return isFIBase() || (isRegBase() && getReg() != 0); } - }; - - /// Subtarget - Keep a pointer to the ARM64Subtarget around so that we can - /// make the right decision when generating code for different targets. - const ARM64Subtarget *Subtarget; - LLVMContext *Context; - -private: - // Selection routines. - bool SelectLoad(const Instruction *I); - bool SelectStore(const Instruction *I); - bool SelectBranch(const Instruction *I); - bool SelectIndirectBr(const Instruction *I); - bool SelectCmp(const Instruction *I); - bool SelectSelect(const Instruction *I); - bool SelectFPExt(const Instruction *I); - bool SelectFPTrunc(const Instruction *I); - bool SelectFPToInt(const Instruction *I, bool Signed); - bool SelectIntToFP(const Instruction *I, bool Signed); - bool SelectRem(const Instruction *I, unsigned ISDOpcode); - bool SelectCall(const Instruction *I, const char *IntrMemName); - bool SelectIntrinsicCall(const IntrinsicInst &I); - bool SelectRet(const Instruction *I); - bool SelectTrunc(const Instruction *I); - bool SelectIntExt(const Instruction *I); - bool SelectMul(const Instruction *I); - - // Utility helper routines. 
- bool isTypeLegal(Type *Ty, MVT &VT); - bool isLoadStoreTypeLegal(Type *Ty, MVT &VT); - bool ComputeAddress(const Value *Obj, Address &Addr); - bool SimplifyAddress(Address &Addr, MVT VT, int64_t ScaleFactor, - bool UseUnscaled); - void AddLoadStoreOperands(Address &Addr, const MachineInstrBuilder &MIB, - unsigned Flags, bool UseUnscaled); - bool IsMemCpySmall(uint64_t Len, unsigned Alignment); - bool TryEmitSmallMemCpy(Address Dest, Address Src, uint64_t Len, - unsigned Alignment); - // Emit functions. - bool EmitCmp(Value *Src1Value, Value *Src2Value, bool isZExt); - bool EmitLoad(MVT VT, unsigned &ResultReg, Address Addr, - bool UseUnscaled = false); - bool EmitStore(MVT VT, unsigned SrcReg, Address Addr, - bool UseUnscaled = false); - unsigned EmitIntExt(MVT SrcVT, unsigned SrcReg, MVT DestVT, bool isZExt); - unsigned Emiti1Ext(unsigned SrcReg, MVT DestVT, bool isZExt); - - unsigned ARM64MaterializeFP(const ConstantFP *CFP, MVT VT); - unsigned ARM64MaterializeGV(const GlobalValue *GV); - - // Call handling routines. -private: - CCAssignFn *CCAssignFnForCall(CallingConv::ID CC) const; - bool ProcessCallArgs(SmallVectorImpl &Args, - SmallVectorImpl &ArgRegs, - SmallVectorImpl &ArgVTs, - SmallVectorImpl &ArgFlags, - SmallVectorImpl &RegArgs, CallingConv::ID CC, - unsigned &NumBytes); - bool FinishCall(MVT RetVT, SmallVectorImpl &UsedRegs, - const Instruction *I, CallingConv::ID CC, unsigned &NumBytes); - -public: - // Backend specific FastISel code. - unsigned TargetMaterializeAlloca(const AllocaInst *AI) override; - unsigned TargetMaterializeConstant(const Constant *C) override; - - explicit ARM64FastISel(FunctionLoweringInfo &funcInfo, - const TargetLibraryInfo *libInfo) - : FastISel(funcInfo, libInfo) { - Subtarget = &TM.getSubtarget(); - Context = &funcInfo.Fn->getContext(); - } - - bool TargetSelectInstruction(const Instruction *I) override; - -#include "ARM64GenFastISel.inc" -}; - -} // end anonymous namespace - -#include "ARM64GenCallingConv.inc" - -CCAssignFn *ARM64FastISel::CCAssignFnForCall(CallingConv::ID CC) const { - if (CC == CallingConv::WebKit_JS) - return CC_ARM64_WebKit_JS; - return Subtarget->isTargetDarwin() ? CC_ARM64_DarwinPCS : CC_ARM64_AAPCS; -} - -unsigned ARM64FastISel::TargetMaterializeAlloca(const AllocaInst *AI) { - assert(TLI.getValueType(AI->getType(), true) == MVT::i64 && - "Alloca should always return a pointer."); - - // Don't handle dynamic allocas. - if (!FuncInfo.StaticAllocaMap.count(AI)) - return 0; - - DenseMap::iterator SI = - FuncInfo.StaticAllocaMap.find(AI); - - if (SI != FuncInfo.StaticAllocaMap.end()) { - unsigned ResultReg = createResultReg(&ARM64::GPR64RegClass); - BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(ARM64::ADDXri), - ResultReg) - .addFrameIndex(SI->second) - .addImm(0) - .addImm(0); - return ResultReg; - } - - return 0; -} - -unsigned ARM64FastISel::ARM64MaterializeFP(const ConstantFP *CFP, MVT VT) { - if (VT != MVT::f32 && VT != MVT::f64) - return 0; - - const APFloat Val = CFP->getValueAPF(); - bool is64bit = (VT == MVT::f64); - - // This checks to see if we can use FMOV instructions to materialize - // a constant, otherwise we have to materialize via the constant pool. 
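ARM64MaterializeFP(), which continues below, boils down to a two-way choice: encodable constants become a single FMOV-immediate, everything else is loaded from the constant pool via ADRP plus a page-offset load. The enum and helper below are invented for illustration; isFPImmLegal() is the real TargetLowering query that drives the decision.

// Illustrative sketch, not part of ARM64FastISel.cpp.
enum class FPMatStrategy { FMovImm, ConstantPool };
// FMOV #imm only covers the small set of encodable FP constants; the rest go
// through the constant pool (ADRP + LDRDui/LDRSui), as in the code below.
static FPMatStrategy chooseFPMaterialization(bool IsEncodableFPImm) {
  return IsEncodableFPImm ? FPMatStrategy::FMovImm
                          : FPMatStrategy::ConstantPool;
}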
- if (TLI.isFPImmLegal(Val, VT)) { - int Imm; - unsigned Opc; - if (is64bit) { - Imm = ARM64_AM::getFP64Imm(Val); - Opc = ARM64::FMOVDi; - } else { - Imm = ARM64_AM::getFP32Imm(Val); - Opc = ARM64::FMOVSi; - } - unsigned ResultReg = createResultReg(TLI.getRegClassFor(VT)); - BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc), ResultReg) - .addImm(Imm); - return ResultReg; - } - - // Materialize via constant pool. MachineConstantPool wants an explicit - // alignment. - unsigned Align = DL.getPrefTypeAlignment(CFP->getType()); - if (Align == 0) - Align = DL.getTypeAllocSize(CFP->getType()); - - unsigned Idx = MCP.getConstantPoolIndex(cast(CFP), Align); - unsigned ADRPReg = createResultReg(&ARM64::GPR64commonRegClass); - BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(ARM64::ADRP), - ADRPReg).addConstantPoolIndex(Idx, 0, ARM64II::MO_PAGE); - - unsigned Opc = is64bit ? ARM64::LDRDui : ARM64::LDRSui; - unsigned ResultReg = createResultReg(TLI.getRegClassFor(VT)); - BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc), ResultReg) - .addReg(ADRPReg) - .addConstantPoolIndex(Idx, 0, ARM64II::MO_PAGEOFF | ARM64II::MO_NC); - return ResultReg; -} - -unsigned ARM64FastISel::ARM64MaterializeGV(const GlobalValue *GV) { - // We can't handle thread-local variables quickly yet. Unfortunately we have - // to peer through any aliases to find out if that rule applies. - const GlobalValue *TLSGV = GV; - if (const GlobalAlias *GA = dyn_cast(GV)) - TLSGV = GA->getAliasee(); - - if (const GlobalVariable *GVar = dyn_cast(TLSGV)) - if (GVar->isThreadLocal()) - return 0; - - unsigned char OpFlags = Subtarget->ClassifyGlobalReference(GV, TM); - - EVT DestEVT = TLI.getValueType(GV->getType(), true); - if (!DestEVT.isSimple()) - return 0; - - unsigned ADRPReg = createResultReg(&ARM64::GPR64commonRegClass); - unsigned ResultReg; - - if (OpFlags & ARM64II::MO_GOT) { - // ADRP + LDRX - BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(ARM64::ADRP), - ADRPReg) - .addGlobalAddress(GV, 0, ARM64II::MO_GOT | ARM64II::MO_PAGE); - - ResultReg = createResultReg(&ARM64::GPR64RegClass); - BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(ARM64::LDRXui), - ResultReg) - .addReg(ADRPReg) - .addGlobalAddress(GV, 0, ARM64II::MO_GOT | ARM64II::MO_PAGEOFF | - ARM64II::MO_NC); - } else { - // ADRP + ADDX - BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(ARM64::ADRP), - ADRPReg).addGlobalAddress(GV, 0, ARM64II::MO_PAGE); - - ResultReg = createResultReg(&ARM64::GPR64spRegClass); - BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(ARM64::ADDXri), - ResultReg) - .addReg(ADRPReg) - .addGlobalAddress(GV, 0, ARM64II::MO_PAGEOFF | ARM64II::MO_NC) - .addImm(0); - } - return ResultReg; -} - -unsigned ARM64FastISel::TargetMaterializeConstant(const Constant *C) { - EVT CEVT = TLI.getValueType(C->getType(), true); - - // Only handle simple types. - if (!CEVT.isSimple()) - return 0; - MVT VT = CEVT.getSimpleVT(); - - // FIXME: Handle ConstantInt. - if (const ConstantFP *CFP = dyn_cast(C)) - return ARM64MaterializeFP(CFP, VT); - else if (const GlobalValue *GV = dyn_cast(C)) - return ARM64MaterializeGV(GV); - - return 0; -} - -// Computes the address to get to an object. 
-bool ARM64FastISel::ComputeAddress(const Value *Obj, Address &Addr) { - const User *U = nullptr; - unsigned Opcode = Instruction::UserOp1; - if (const Instruction *I = dyn_cast(Obj)) { - // Don't walk into other basic blocks unless the object is an alloca from - // another block, otherwise it may not have a virtual register assigned. - if (FuncInfo.StaticAllocaMap.count(static_cast(Obj)) || - FuncInfo.MBBMap[I->getParent()] == FuncInfo.MBB) { - Opcode = I->getOpcode(); - U = I; - } - } else if (const ConstantExpr *C = dyn_cast(Obj)) { - Opcode = C->getOpcode(); - U = C; - } - - if (const PointerType *Ty = dyn_cast(Obj->getType())) - if (Ty->getAddressSpace() > 255) - // Fast instruction selection doesn't support the special - // address spaces. - return false; - - switch (Opcode) { - default: - break; - case Instruction::BitCast: { - // Look through bitcasts. - return ComputeAddress(U->getOperand(0), Addr); - } - case Instruction::IntToPtr: { - // Look past no-op inttoptrs. - if (TLI.getValueType(U->getOperand(0)->getType()) == TLI.getPointerTy()) - return ComputeAddress(U->getOperand(0), Addr); - break; - } - case Instruction::PtrToInt: { - // Look past no-op ptrtoints. - if (TLI.getValueType(U->getType()) == TLI.getPointerTy()) - return ComputeAddress(U->getOperand(0), Addr); - break; - } - case Instruction::GetElementPtr: { - Address SavedAddr = Addr; - uint64_t TmpOffset = Addr.getOffset(); - - // Iterate through the GEP folding the constants into offsets where - // we can. - gep_type_iterator GTI = gep_type_begin(U); - for (User::const_op_iterator i = U->op_begin() + 1, e = U->op_end(); i != e; - ++i, ++GTI) { - const Value *Op = *i; - if (StructType *STy = dyn_cast(*GTI)) { - const StructLayout *SL = DL.getStructLayout(STy); - unsigned Idx = cast(Op)->getZExtValue(); - TmpOffset += SL->getElementOffset(Idx); - } else { - uint64_t S = DL.getTypeAllocSize(GTI.getIndexedType()); - for (;;) { - if (const ConstantInt *CI = dyn_cast(Op)) { - // Constant-offset addressing. - TmpOffset += CI->getSExtValue() * S; - break; - } - if (canFoldAddIntoGEP(U, Op)) { - // A compatible add with a constant operand. Fold the constant. - ConstantInt *CI = - cast(cast(Op)->getOperand(1)); - TmpOffset += CI->getSExtValue() * S; - // Iterate on the other operand. - Op = cast(Op)->getOperand(0); - continue; - } - // Unsupported - goto unsupported_gep; - } - } - } - - // Try to grab the base operand now. - Addr.setOffset(TmpOffset); - if (ComputeAddress(U->getOperand(0), Addr)) - return true; - - // We failed, restore everything and try the other options. - Addr = SavedAddr; - - unsupported_gep: - break; - } - case Instruction::Alloca: { - const AllocaInst *AI = cast(Obj); - DenseMap::iterator SI = - FuncInfo.StaticAllocaMap.find(AI); - if (SI != FuncInfo.StaticAllocaMap.end()) { - Addr.setKind(Address::FrameIndexBase); - Addr.setFI(SI->second); - return true; - } - break; - } - } - - // Try to get this in a register if nothing else has worked. - if (!Addr.isValid()) - Addr.setReg(getRegForValue(Obj)); - return Addr.isValid(); -} - -bool ARM64FastISel::isTypeLegal(Type *Ty, MVT &VT) { - EVT evt = TLI.getValueType(Ty, true); - - // Only handle simple types. - if (evt == MVT::Other || !evt.isSimple()) - return false; - VT = evt.getSimpleVT(); - - // This is a legal type, but it's not something we handle in fast-isel. - if (VT == MVT::f128) - return false; - - // Handle all other legal types, i.e. a register that will directly hold this - // value. 
- return TLI.isTypeLegal(VT); -} - -bool ARM64FastISel::isLoadStoreTypeLegal(Type *Ty, MVT &VT) { - if (isTypeLegal(Ty, VT)) - return true; - - // If this is a type than can be sign or zero-extended to a basic operation - // go ahead and accept it now. For stores, this reflects truncation. - if (VT == MVT::i1 || VT == MVT::i8 || VT == MVT::i16) - return true; - - return false; -} - -bool ARM64FastISel::SimplifyAddress(Address &Addr, MVT VT, int64_t ScaleFactor, - bool UseUnscaled) { - bool needsLowering = false; - int64_t Offset = Addr.getOffset(); - switch (VT.SimpleTy) { - default: - return false; - case MVT::i1: - case MVT::i8: - case MVT::i16: - case MVT::i32: - case MVT::i64: - case MVT::f32: - case MVT::f64: - if (!UseUnscaled) - // Using scaled, 12-bit, unsigned immediate offsets. - needsLowering = ((Offset & 0xfff) != Offset); - else - // Using unscaled, 9-bit, signed immediate offsets. - needsLowering = (Offset > 256 || Offset < -256); - break; - } - - // FIXME: If this is a stack pointer and the offset needs to be simplified - // then put the alloca address into a register, set the base type back to - // register and continue. This should almost never happen. - if (needsLowering && Addr.getKind() == Address::FrameIndexBase) { - return false; - } - - // Since the offset is too large for the load/store instruction get the - // reg+offset into a register. - if (needsLowering) { - uint64_t UnscaledOffset = Addr.getOffset() * ScaleFactor; - unsigned ResultReg = FastEmit_ri_(MVT::i64, ISD::ADD, Addr.getReg(), false, - UnscaledOffset, MVT::i64); - if (ResultReg == 0) - return false; - Addr.setReg(ResultReg); - Addr.setOffset(0); - } - return true; -} - -void ARM64FastISel::AddLoadStoreOperands(Address &Addr, - const MachineInstrBuilder &MIB, - unsigned Flags, bool UseUnscaled) { - int64_t Offset = Addr.getOffset(); - // Frame base works a bit differently. Handle it separately. - if (Addr.getKind() == Address::FrameIndexBase) { - int FI = Addr.getFI(); - // FIXME: We shouldn't be using getObjectSize/getObjectAlignment. The size - // and alignment should be based on the VT. - MachineMemOperand *MMO = FuncInfo.MF->getMachineMemOperand( - MachinePointerInfo::getFixedStack(FI, Offset), Flags, - MFI.getObjectSize(FI), MFI.getObjectAlignment(FI)); - // Now add the rest of the operands. - MIB.addFrameIndex(FI).addImm(Offset).addMemOperand(MMO); - } else { - // Now add the rest of the operands. - MIB.addReg(Addr.getReg()); - MIB.addImm(Offset); - } -} - -bool ARM64FastISel::EmitLoad(MVT VT, unsigned &ResultReg, Address Addr, - bool UseUnscaled) { - // Negative offsets require unscaled, 9-bit, signed immediate offsets. - // Otherwise, we try using scaled, 12-bit, unsigned immediate offsets. - if (!UseUnscaled && Addr.getOffset() < 0) - UseUnscaled = true; - - unsigned Opc; - const TargetRegisterClass *RC; - bool VTIsi1 = false; - int64_t ScaleFactor = 0; - switch (VT.SimpleTy) { - default: - return false; - case MVT::i1: - VTIsi1 = true; - // Intentional fall-through. - case MVT::i8: - Opc = UseUnscaled ? ARM64::LDURBBi : ARM64::LDRBBui; - RC = &ARM64::GPR32RegClass; - ScaleFactor = 1; - break; - case MVT::i16: - Opc = UseUnscaled ? ARM64::LDURHHi : ARM64::LDRHHui; - RC = &ARM64::GPR32RegClass; - ScaleFactor = 2; - break; - case MVT::i32: - Opc = UseUnscaled ? ARM64::LDURWi : ARM64::LDRWui; - RC = &ARM64::GPR32RegClass; - ScaleFactor = 4; - break; - case MVT::i64: - Opc = UseUnscaled ? 
ARM64::LDURXi : ARM64::LDRXui; - RC = &ARM64::GPR64RegClass; - ScaleFactor = 8; - break; - case MVT::f32: - Opc = UseUnscaled ? ARM64::LDURSi : ARM64::LDRSui; - RC = TLI.getRegClassFor(VT); - ScaleFactor = 4; - break; - case MVT::f64: - Opc = UseUnscaled ? ARM64::LDURDi : ARM64::LDRDui; - RC = TLI.getRegClassFor(VT); - ScaleFactor = 8; - break; - } - // Scale the offset. - if (!UseUnscaled) { - int64_t Offset = Addr.getOffset(); - if (Offset & (ScaleFactor - 1)) - // Retry using an unscaled, 9-bit, signed immediate offset. - return EmitLoad(VT, ResultReg, Addr, /*UseUnscaled*/ true); - - Addr.setOffset(Offset / ScaleFactor); - } - - // Simplify this down to something we can handle. - if (!SimplifyAddress(Addr, VT, UseUnscaled ? 1 : ScaleFactor, UseUnscaled)) - return false; - - // Create the base instruction, then add the operands. - ResultReg = createResultReg(RC); - MachineInstrBuilder MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, - TII.get(Opc), ResultReg); - AddLoadStoreOperands(Addr, MIB, MachineMemOperand::MOLoad, UseUnscaled); - - // Loading an i1 requires special handling. - if (VTIsi1) { - MRI.constrainRegClass(ResultReg, &ARM64::GPR32RegClass); - unsigned ANDReg = createResultReg(&ARM64::GPR32spRegClass); - BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(ARM64::ANDWri), - ANDReg) - .addReg(ResultReg) - .addImm(ARM64_AM::encodeLogicalImmediate(1, 32)); - ResultReg = ANDReg; - } - return true; -} - -bool ARM64FastISel::SelectLoad(const Instruction *I) { - MVT VT; - // Verify we have a legal type before going any further. Currently, we handle - // simple types that will directly fit in a register (i32/f32/i64/f64) or - // those that can be sign or zero-extended to a basic operation (i1/i8/i16). - if (!isLoadStoreTypeLegal(I->getType(), VT) || cast(I)->isAtomic()) - return false; - - // See if we can handle this address. - Address Addr; - if (!ComputeAddress(I->getOperand(0), Addr)) - return false; - - unsigned ResultReg; - if (!EmitLoad(VT, ResultReg, Addr)) - return false; - - UpdateValueMap(I, ResultReg); - return true; -} - -bool ARM64FastISel::EmitStore(MVT VT, unsigned SrcReg, Address Addr, - bool UseUnscaled) { - // Negative offsets require unscaled, 9-bit, signed immediate offsets. - // Otherwise, we try using scaled, 12-bit, unsigned immediate offsets. - if (!UseUnscaled && Addr.getOffset() < 0) - UseUnscaled = true; - - unsigned StrOpc; - bool VTIsi1 = false; - int64_t ScaleFactor = 0; - // Using scaled, 12-bit, unsigned immediate offsets. - switch (VT.SimpleTy) { - default: - return false; - case MVT::i1: - VTIsi1 = true; - case MVT::i8: - StrOpc = UseUnscaled ? ARM64::STURBBi : ARM64::STRBBui; - ScaleFactor = 1; - break; - case MVT::i16: - StrOpc = UseUnscaled ? ARM64::STURHHi : ARM64::STRHHui; - ScaleFactor = 2; - break; - case MVT::i32: - StrOpc = UseUnscaled ? ARM64::STURWi : ARM64::STRWui; - ScaleFactor = 4; - break; - case MVT::i64: - StrOpc = UseUnscaled ? ARM64::STURXi : ARM64::STRXui; - ScaleFactor = 8; - break; - case MVT::f32: - StrOpc = UseUnscaled ? ARM64::STURSi : ARM64::STRSui; - ScaleFactor = 4; - break; - case MVT::f64: - StrOpc = UseUnscaled ? ARM64::STURDi : ARM64::STRDui; - ScaleFactor = 8; - break; - } - // Scale the offset. - if (!UseUnscaled) { - int64_t Offset = Addr.getOffset(); - if (Offset & (ScaleFactor - 1)) - // Retry using an unscaled, 9-bit, signed immediate offset. 
- return EmitStore(VT, SrcReg, Addr, /*UseUnscaled*/ true); - - Addr.setOffset(Offset / ScaleFactor); - } - - // Simplify this down to something we can handle. - if (!SimplifyAddress(Addr, VT, UseUnscaled ? 1 : ScaleFactor, UseUnscaled)) - return false; - - // Storing an i1 requires special handling. - if (VTIsi1) { - MRI.constrainRegClass(SrcReg, &ARM64::GPR32RegClass); - unsigned ANDReg = createResultReg(&ARM64::GPR32spRegClass); - BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(ARM64::ANDWri), - ANDReg) - .addReg(SrcReg) - .addImm(ARM64_AM::encodeLogicalImmediate(1, 32)); - SrcReg = ANDReg; - } - // Create the base instruction, then add the operands. - MachineInstrBuilder MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, - TII.get(StrOpc)).addReg(SrcReg); - AddLoadStoreOperands(Addr, MIB, MachineMemOperand::MOStore, UseUnscaled); - return true; -} - -bool ARM64FastISel::SelectStore(const Instruction *I) { - MVT VT; - Value *Op0 = I->getOperand(0); - // Verify we have a legal type before going any further. Currently, we handle - // simple types that will directly fit in a register (i32/f32/i64/f64) or - // those that can be sign or zero-extended to a basic operation (i1/i8/i16). - if (!isLoadStoreTypeLegal(Op0->getType(), VT) || - cast(I)->isAtomic()) - return false; - - // Get the value to be stored into a register. - unsigned SrcReg = getRegForValue(Op0); - if (SrcReg == 0) - return false; - - // See if we can handle this address. - Address Addr; - if (!ComputeAddress(I->getOperand(1), Addr)) - return false; - - if (!EmitStore(VT, SrcReg, Addr)) - return false; - return true; -} - -static ARM64CC::CondCode getCompareCC(CmpInst::Predicate Pred) { - switch (Pred) { - case CmpInst::FCMP_ONE: - case CmpInst::FCMP_UEQ: - default: - // AL is our "false" for now. The other two need more compares. - return ARM64CC::AL; - case CmpInst::ICMP_EQ: - case CmpInst::FCMP_OEQ: - return ARM64CC::EQ; - case CmpInst::ICMP_SGT: - case CmpInst::FCMP_OGT: - return ARM64CC::GT; - case CmpInst::ICMP_SGE: - case CmpInst::FCMP_OGE: - return ARM64CC::GE; - case CmpInst::ICMP_UGT: - case CmpInst::FCMP_UGT: - return ARM64CC::HI; - case CmpInst::FCMP_OLT: - return ARM64CC::MI; - case CmpInst::ICMP_ULE: - case CmpInst::FCMP_OLE: - return ARM64CC::LS; - case CmpInst::FCMP_ORD: - return ARM64CC::VC; - case CmpInst::FCMP_UNO: - return ARM64CC::VS; - case CmpInst::FCMP_UGE: - return ARM64CC::PL; - case CmpInst::ICMP_SLT: - case CmpInst::FCMP_ULT: - return ARM64CC::LT; - case CmpInst::ICMP_SLE: - case CmpInst::FCMP_ULE: - return ARM64CC::LE; - case CmpInst::FCMP_UNE: - case CmpInst::ICMP_NE: - return ARM64CC::NE; - case CmpInst::ICMP_UGE: - return ARM64CC::HS; - case CmpInst::ICMP_ULT: - return ARM64CC::LO; - } -} - -bool ARM64FastISel::SelectBranch(const Instruction *I) { - const BranchInst *BI = cast(I); - MachineBasicBlock *TBB = FuncInfo.MBBMap[BI->getSuccessor(0)]; - MachineBasicBlock *FBB = FuncInfo.MBBMap[BI->getSuccessor(1)]; - - if (const CmpInst *CI = dyn_cast(BI->getCondition())) { - if (CI->hasOneUse() && (CI->getParent() == I->getParent())) { - // We may not handle every CC for now. - ARM64CC::CondCode CC = getCompareCC(CI->getPredicate()); - if (CC == ARM64CC::AL) - return false; - - // Emit the cmp. - if (!EmitCmp(CI->getOperand(0), CI->getOperand(1), CI->isUnsigned())) - return false; - - // Emit the branch. 
- BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(ARM64::Bcc)) - .addImm(CC) - .addMBB(TBB); - FuncInfo.MBB->addSuccessor(TBB); - - FastEmitBranch(FBB, DbgLoc); - return true; - } - } else if (TruncInst *TI = dyn_cast(BI->getCondition())) { - MVT SrcVT; - if (TI->hasOneUse() && TI->getParent() == I->getParent() && - (isLoadStoreTypeLegal(TI->getOperand(0)->getType(), SrcVT))) { - unsigned CondReg = getRegForValue(TI->getOperand(0)); - if (CondReg == 0) - return false; - - // Issue an extract_subreg to get the lower 32-bits. - if (SrcVT == MVT::i64) - CondReg = FastEmitInst_extractsubreg(MVT::i32, CondReg, /*Kill=*/true, - ARM64::sub_32); - - MRI.constrainRegClass(CondReg, &ARM64::GPR32RegClass); - unsigned ANDReg = createResultReg(&ARM64::GPR32spRegClass); - BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(ARM64::ANDWri), - ANDReg) - .addReg(CondReg) - .addImm(ARM64_AM::encodeLogicalImmediate(1, 32)); - BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(ARM64::SUBSWri)) - .addReg(ANDReg) - .addReg(ANDReg) - .addImm(0) - .addImm(0); - - unsigned CC = ARM64CC::NE; - if (FuncInfo.MBB->isLayoutSuccessor(TBB)) { - std::swap(TBB, FBB); - CC = ARM64CC::EQ; - } - BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(ARM64::Bcc)) - .addImm(CC) - .addMBB(TBB); - FuncInfo.MBB->addSuccessor(TBB); - FastEmitBranch(FBB, DbgLoc); - return true; - } - } else if (const ConstantInt *CI = - dyn_cast(BI->getCondition())) { - uint64_t Imm = CI->getZExtValue(); - MachineBasicBlock *Target = (Imm == 0) ? FBB : TBB; - BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(ARM64::B)) - .addMBB(Target); - FuncInfo.MBB->addSuccessor(Target); - return true; - } - - unsigned CondReg = getRegForValue(BI->getCondition()); - if (CondReg == 0) - return false; - - // We've been divorced from our compare! Our block was split, and - // now our compare lives in a predecessor block. We musn't - // re-compare here, as the children of the compare aren't guaranteed - // live across the block boundary (we *could* check for this). - // Regardless, the compare has been done in the predecessor block, - // and it left a value for us in a virtual register. Ergo, we test - // the one-bit value left in the virtual register. - BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(ARM64::SUBSWri), - ARM64::WZR) - .addReg(CondReg) - .addImm(0) - .addImm(0); - - unsigned CC = ARM64CC::NE; - if (FuncInfo.MBB->isLayoutSuccessor(TBB)) { - std::swap(TBB, FBB); - CC = ARM64CC::EQ; - } - - BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(ARM64::Bcc)) - .addImm(CC) - .addMBB(TBB); - FuncInfo.MBB->addSuccessor(TBB); - FastEmitBranch(FBB, DbgLoc); - return true; -} - -bool ARM64FastISel::SelectIndirectBr(const Instruction *I) { - const IndirectBrInst *BI = cast(I); - unsigned AddrReg = getRegForValue(BI->getOperand(0)); - if (AddrReg == 0) - return false; - - // Emit the indirect branch. - BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(ARM64::BR)) - .addReg(AddrReg); - - // Make sure the CFG is up-to-date. - for (unsigned i = 0, e = BI->getNumSuccessors(); i != e; ++i) - FuncInfo.MBB->addSuccessor(FuncInfo.MBBMap[BI->getSuccessor(i)]); - - return true; -} - -bool ARM64FastISel::EmitCmp(Value *Src1Value, Value *Src2Value, bool isZExt) { - Type *Ty = Src1Value->getType(); - EVT SrcEVT = TLI.getValueType(Ty, true); - if (!SrcEVT.isSimple()) - return false; - MVT SrcVT = SrcEVT.getSimpleVT(); - - // Check to see if the 2nd operand is a constant that we can encode directly - // in the compare. 
- uint64_t Imm; - bool UseImm = false; - bool isNegativeImm = false; - if (const ConstantInt *ConstInt = dyn_cast(Src2Value)) { - if (SrcVT == MVT::i64 || SrcVT == MVT::i32 || SrcVT == MVT::i16 || - SrcVT == MVT::i8 || SrcVT == MVT::i1) { - const APInt &CIVal = ConstInt->getValue(); - - Imm = (isZExt) ? CIVal.getZExtValue() : CIVal.getSExtValue(); - if (CIVal.isNegative()) { - isNegativeImm = true; - Imm = -Imm; - } - // FIXME: We can handle more immediates using shifts. - UseImm = ((Imm & 0xfff) == Imm); - } - } else if (const ConstantFP *ConstFP = dyn_cast(Src2Value)) { - if (SrcVT == MVT::f32 || SrcVT == MVT::f64) - if (ConstFP->isZero() && !ConstFP->isNegative()) - UseImm = true; - } - - unsigned ZReg; - unsigned CmpOpc; - bool isICmp = true; - bool needsExt = false; - switch (SrcVT.SimpleTy) { - default: - return false; - case MVT::i1: - case MVT::i8: - case MVT::i16: - needsExt = true; - // Intentional fall-through. - case MVT::i32: - ZReg = ARM64::WZR; - if (UseImm) - CmpOpc = isNegativeImm ? ARM64::ADDSWri : ARM64::SUBSWri; - else - CmpOpc = ARM64::SUBSWrr; - break; - case MVT::i64: - ZReg = ARM64::XZR; - if (UseImm) - CmpOpc = isNegativeImm ? ARM64::ADDSXri : ARM64::SUBSXri; - else - CmpOpc = ARM64::SUBSXrr; - break; - case MVT::f32: - isICmp = false; - CmpOpc = UseImm ? ARM64::FCMPSri : ARM64::FCMPSrr; - break; - case MVT::f64: - isICmp = false; - CmpOpc = UseImm ? ARM64::FCMPDri : ARM64::FCMPDrr; - break; - } - - unsigned SrcReg1 = getRegForValue(Src1Value); - if (SrcReg1 == 0) - return false; - - unsigned SrcReg2; - if (!UseImm) { - SrcReg2 = getRegForValue(Src2Value); - if (SrcReg2 == 0) - return false; - } - - // We have i1, i8, or i16, we need to either zero extend or sign extend. - if (needsExt) { - SrcReg1 = EmitIntExt(SrcVT, SrcReg1, MVT::i32, isZExt); - if (SrcReg1 == 0) - return false; - if (!UseImm) { - SrcReg2 = EmitIntExt(SrcVT, SrcReg2, MVT::i32, isZExt); - if (SrcReg2 == 0) - return false; - } - } - - if (isICmp) { - if (UseImm) - BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(CmpOpc)) - .addReg(ZReg) - .addReg(SrcReg1) - .addImm(Imm) - .addImm(0); - else - BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(CmpOpc)) - .addReg(ZReg) - .addReg(SrcReg1) - .addReg(SrcReg2); - } else { - if (UseImm) - BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(CmpOpc)) - .addReg(SrcReg1); - else - BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(CmpOpc)) - .addReg(SrcReg1) - .addReg(SrcReg2); - } - return true; -} - -bool ARM64FastISel::SelectCmp(const Instruction *I) { - const CmpInst *CI = cast(I); - - // We may not handle every CC for now. - ARM64CC::CondCode CC = getCompareCC(CI->getPredicate()); - if (CC == ARM64CC::AL) - return false; - - // Emit the cmp. - if (!EmitCmp(CI->getOperand(0), CI->getOperand(1), CI->isUnsigned())) - return false; - - // Now set a register based on the comparison. 
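EmitCmp() above keeps a constant second operand as an immediate only when it fits the unshifted 12-bit field of SUBS/ADDS, negating negative values and flipping SUBS to ADDS. The helper name below is invented; the test itself mirrors the code.

// Illustrative sketch, not part of ARM64FastISel.cpp.
#include <cstdint>
// True when Imm can be used directly as the unshifted 12-bit immediate of a
// SUBS/ADDS compare; the pass notes that shifted forms could extend this.
static bool fitsUnshiftedImm12(uint64_t Imm) {
  return (Imm & 0xfff) == Imm;
}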
- ARM64CC::CondCode invertedCC = getInvertedCondCode(CC); - unsigned ResultReg = createResultReg(&ARM64::GPR32RegClass); - BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(ARM64::CSINCWr), - ResultReg) - .addReg(ARM64::WZR) - .addReg(ARM64::WZR) - .addImm(invertedCC); - - UpdateValueMap(I, ResultReg); - return true; -} - -bool ARM64FastISel::SelectSelect(const Instruction *I) { - const SelectInst *SI = cast(I); - - EVT DestEVT = TLI.getValueType(SI->getType(), true); - if (!DestEVT.isSimple()) - return false; - - MVT DestVT = DestEVT.getSimpleVT(); - if (DestVT != MVT::i32 && DestVT != MVT::i64 && DestVT != MVT::f32 && - DestVT != MVT::f64) - return false; - - unsigned CondReg = getRegForValue(SI->getCondition()); - if (CondReg == 0) - return false; - unsigned TrueReg = getRegForValue(SI->getTrueValue()); - if (TrueReg == 0) - return false; - unsigned FalseReg = getRegForValue(SI->getFalseValue()); - if (FalseReg == 0) - return false; - - - MRI.constrainRegClass(CondReg, &ARM64::GPR32RegClass); - unsigned ANDReg = createResultReg(&ARM64::GPR32spRegClass); - BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(ARM64::ANDWri), - ANDReg) - .addReg(CondReg) - .addImm(ARM64_AM::encodeLogicalImmediate(1, 32)); - - BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(ARM64::SUBSWri)) - .addReg(ANDReg) - .addReg(ANDReg) - .addImm(0) - .addImm(0); - - unsigned SelectOpc; - switch (DestVT.SimpleTy) { - default: - return false; - case MVT::i32: - SelectOpc = ARM64::CSELWr; - break; - case MVT::i64: - SelectOpc = ARM64::CSELXr; - break; - case MVT::f32: - SelectOpc = ARM64::FCSELSrrr; - break; - case MVT::f64: - SelectOpc = ARM64::FCSELDrrr; - break; - } - - unsigned ResultReg = createResultReg(TLI.getRegClassFor(DestVT)); - BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(SelectOpc), - ResultReg) - .addReg(TrueReg) - .addReg(FalseReg) - .addImm(ARM64CC::NE); - - UpdateValueMap(I, ResultReg); - return true; -} - -bool ARM64FastISel::SelectFPExt(const Instruction *I) { - Value *V = I->getOperand(0); - if (!I->getType()->isDoubleTy() || !V->getType()->isFloatTy()) - return false; - - unsigned Op = getRegForValue(V); - if (Op == 0) - return false; - - unsigned ResultReg = createResultReg(&ARM64::FPR64RegClass); - BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(ARM64::FCVTDSr), - ResultReg).addReg(Op); - UpdateValueMap(I, ResultReg); - return true; -} - -bool ARM64FastISel::SelectFPTrunc(const Instruction *I) { - Value *V = I->getOperand(0); - if (!I->getType()->isFloatTy() || !V->getType()->isDoubleTy()) - return false; - - unsigned Op = getRegForValue(V); - if (Op == 0) - return false; - - unsigned ResultReg = createResultReg(&ARM64::FPR32RegClass); - BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(ARM64::FCVTSDr), - ResultReg).addReg(Op); - UpdateValueMap(I, ResultReg); - return true; -} - -// FPToUI and FPToSI -bool ARM64FastISel::SelectFPToInt(const Instruction *I, bool Signed) { - MVT DestVT; - if (!isTypeLegal(I->getType(), DestVT) || DestVT.isVector()) - return false; - - unsigned SrcReg = getRegForValue(I->getOperand(0)); - if (SrcReg == 0) - return false; - - EVT SrcVT = TLI.getValueType(I->getOperand(0)->getType(), true); - if (SrcVT == MVT::f128) - return false; - - unsigned Opc; - if (SrcVT == MVT::f64) { - if (Signed) - Opc = (DestVT == MVT::i32) ? ARM64::FCVTZSUWDr : ARM64::FCVTZSUXDr; - else - Opc = (DestVT == MVT::i32) ? ARM64::FCVTZUUWDr : ARM64::FCVTZUUXDr; - } else { - if (Signed) - Opc = (DestVT == MVT::i32) ? 
ARM64::FCVTZSUWSr : ARM64::FCVTZSUXSr; - else - Opc = (DestVT == MVT::i32) ? ARM64::FCVTZUUWSr : ARM64::FCVTZUUXSr; - } - unsigned ResultReg = createResultReg( - DestVT == MVT::i32 ? &ARM64::GPR32RegClass : &ARM64::GPR64RegClass); - BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc), ResultReg) - .addReg(SrcReg); - UpdateValueMap(I, ResultReg); - return true; -} - -bool ARM64FastISel::SelectIntToFP(const Instruction *I, bool Signed) { - MVT DestVT; - if (!isTypeLegal(I->getType(), DestVT) || DestVT.isVector()) - return false; - assert ((DestVT == MVT::f32 || DestVT == MVT::f64) && - "Unexpected value type."); - - unsigned SrcReg = getRegForValue(I->getOperand(0)); - if (SrcReg == 0) - return false; - - EVT SrcVT = TLI.getValueType(I->getOperand(0)->getType(), true); - - // Handle sign-extension. - if (SrcVT == MVT::i16 || SrcVT == MVT::i8 || SrcVT == MVT::i1) { - SrcReg = - EmitIntExt(SrcVT.getSimpleVT(), SrcReg, MVT::i32, /*isZExt*/ !Signed); - if (SrcReg == 0) - return false; - } - - MRI.constrainRegClass(SrcReg, SrcVT == MVT::i64 ? &ARM64::GPR64RegClass - : &ARM64::GPR32RegClass); - - unsigned Opc; - if (SrcVT == MVT::i64) { - if (Signed) - Opc = (DestVT == MVT::f32) ? ARM64::SCVTFUXSri : ARM64::SCVTFUXDri; - else - Opc = (DestVT == MVT::f32) ? ARM64::UCVTFUXSri : ARM64::UCVTFUXDri; - } else { - if (Signed) - Opc = (DestVT == MVT::f32) ? ARM64::SCVTFUWSri : ARM64::SCVTFUWDri; - else - Opc = (DestVT == MVT::f32) ? ARM64::UCVTFUWSri : ARM64::UCVTFUWDri; - } - - unsigned ResultReg = createResultReg(TLI.getRegClassFor(DestVT)); - BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc), ResultReg) - .addReg(SrcReg); - UpdateValueMap(I, ResultReg); - return true; -} - -bool ARM64FastISel::ProcessCallArgs(SmallVectorImpl &Args, - SmallVectorImpl &ArgRegs, - SmallVectorImpl &ArgVTs, - SmallVectorImpl &ArgFlags, - SmallVectorImpl &RegArgs, - CallingConv::ID CC, unsigned &NumBytes) { - SmallVector ArgLocs; - CCState CCInfo(CC, false, *FuncInfo.MF, TM, ArgLocs, *Context); - CCInfo.AnalyzeCallOperands(ArgVTs, ArgFlags, CCAssignFnForCall(CC)); - - // Get a count of how many bytes are to be pushed on the stack. - NumBytes = CCInfo.getNextStackOffset(); - - // Issue CALLSEQ_START - unsigned AdjStackDown = TII.getCallFrameSetupOpcode(); - BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AdjStackDown)) - .addImm(NumBytes); - - // Process the args. - for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) { - CCValAssign &VA = ArgLocs[i]; - unsigned Arg = ArgRegs[VA.getValNo()]; - MVT ArgVT = ArgVTs[VA.getValNo()]; - - // Handle arg promotion: SExt, ZExt, AExt. - switch (VA.getLocInfo()) { - case CCValAssign::Full: - break; - case CCValAssign::SExt: { - MVT DestVT = VA.getLocVT(); - MVT SrcVT = ArgVT; - Arg = EmitIntExt(SrcVT, Arg, DestVT, /*isZExt*/ false); - if (Arg == 0) - return false; - ArgVT = DestVT; - break; - } - case CCValAssign::AExt: - // Intentional fall-through. - case CCValAssign::ZExt: { - MVT DestVT = VA.getLocVT(); - MVT SrcVT = ArgVT; - Arg = EmitIntExt(SrcVT, Arg, DestVT, /*isZExt*/ true); - if (Arg == 0) - return false; - ArgVT = DestVT; - break; - } - default: - llvm_unreachable("Unknown arg promotion!"); - } - - // Now copy/store arg to correct locations. - if (VA.isRegLoc() && !VA.needsCustom()) { - BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, - TII.get(TargetOpcode::COPY), VA.getLocReg()).addReg(Arg); - RegArgs.push_back(VA.getLocReg()); - } else if (VA.needsCustom()) { - // FIXME: Handle custom args. 
- return false; - } else { - assert(VA.isMemLoc() && "Assuming store on stack."); - - // Need to store on the stack. - unsigned ArgSize = VA.getLocVT().getSizeInBits() / 8; - - unsigned BEAlign = 0; - if (ArgSize < 8 && !Subtarget->isLittleEndian()) - BEAlign = 8 - ArgSize; - - Address Addr; - Addr.setKind(Address::RegBase); - Addr.setReg(ARM64::SP); - Addr.setOffset(VA.getLocMemOffset() + BEAlign); - - if (!EmitStore(ArgVT, Arg, Addr)) - return false; - } - } - return true; -} - -bool ARM64FastISel::FinishCall(MVT RetVT, SmallVectorImpl &UsedRegs, - const Instruction *I, CallingConv::ID CC, - unsigned &NumBytes) { - // Issue CALLSEQ_END - unsigned AdjStackUp = TII.getCallFrameDestroyOpcode(); - BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AdjStackUp)) - .addImm(NumBytes) - .addImm(0); - - // Now the return value. - if (RetVT != MVT::isVoid) { - SmallVector RVLocs; - CCState CCInfo(CC, false, *FuncInfo.MF, TM, RVLocs, *Context); - CCInfo.AnalyzeCallResult(RetVT, CCAssignFnForCall(CC)); - - // Only handle a single return value. - if (RVLocs.size() != 1) - return false; - - // Copy all of the result registers out of their specified physreg. - MVT CopyVT = RVLocs[0].getValVT(); - unsigned ResultReg = createResultReg(TLI.getRegClassFor(CopyVT)); - BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, - TII.get(TargetOpcode::COPY), - ResultReg).addReg(RVLocs[0].getLocReg()); - UsedRegs.push_back(RVLocs[0].getLocReg()); - - // Finally update the result. - UpdateValueMap(I, ResultReg); - } - - return true; -} - -bool ARM64FastISel::SelectCall(const Instruction *I, - const char *IntrMemName = nullptr) { - const CallInst *CI = cast(I); - const Value *Callee = CI->getCalledValue(); - - // Don't handle inline asm or intrinsics. - if (isa(Callee)) - return false; - - // Only handle global variable Callees. - const GlobalValue *GV = dyn_cast(Callee); - if (!GV) - return false; - - // Check the calling convention. - ImmutableCallSite CS(CI); - CallingConv::ID CC = CS.getCallingConv(); - - // Let SDISel handle vararg functions. - PointerType *PT = cast(CS.getCalledValue()->getType()); - FunctionType *FTy = cast(PT->getElementType()); - if (FTy->isVarArg()) - return false; - - // Handle *simple* calls for now. - MVT RetVT; - Type *RetTy = I->getType(); - if (RetTy->isVoidTy()) - RetVT = MVT::isVoid; - else if (!isTypeLegal(RetTy, RetVT)) - return false; - - // Set up the argument vectors. - SmallVector Args; - SmallVector ArgRegs; - SmallVector ArgVTs; - SmallVector ArgFlags; - Args.reserve(CS.arg_size()); - ArgRegs.reserve(CS.arg_size()); - ArgVTs.reserve(CS.arg_size()); - ArgFlags.reserve(CS.arg_size()); - - for (ImmutableCallSite::arg_iterator i = CS.arg_begin(), e = CS.arg_end(); - i != e; ++i) { - // If we're lowering a memory intrinsic instead of a regular call, skip the - // last two arguments, which shouldn't be passed to the underlying function. - if (IntrMemName && e - i <= 2) - break; - - unsigned Arg = getRegForValue(*i); - if (Arg == 0) - return false; - - ISD::ArgFlagsTy Flags; - unsigned AttrInd = i - CS.arg_begin() + 1; - if (CS.paramHasAttr(AttrInd, Attribute::SExt)) - Flags.setSExt(); - if (CS.paramHasAttr(AttrInd, Attribute::ZExt)) - Flags.setZExt(); - - // FIXME: Only handle *easy* calls for now. 
- if (CS.paramHasAttr(AttrInd, Attribute::InReg) || - CS.paramHasAttr(AttrInd, Attribute::StructRet) || - CS.paramHasAttr(AttrInd, Attribute::Nest) || - CS.paramHasAttr(AttrInd, Attribute::ByVal)) - return false; - - MVT ArgVT; - Type *ArgTy = (*i)->getType(); - if (!isTypeLegal(ArgTy, ArgVT) && - !(ArgVT == MVT::i1 || ArgVT == MVT::i8 || ArgVT == MVT::i16)) - return false; - - // We don't handle vector parameters yet. - if (ArgVT.isVector() || ArgVT.getSizeInBits() > 64) - return false; - - unsigned OriginalAlignment = DL.getABITypeAlignment(ArgTy); - Flags.setOrigAlign(OriginalAlignment); - - Args.push_back(*i); - ArgRegs.push_back(Arg); - ArgVTs.push_back(ArgVT); - ArgFlags.push_back(Flags); - } - - // Handle the arguments now that we've gotten them. - SmallVector RegArgs; - unsigned NumBytes; - if (!ProcessCallArgs(Args, ArgRegs, ArgVTs, ArgFlags, RegArgs, CC, NumBytes)) - return false; - - // Issue the call. - MachineInstrBuilder MIB; - MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(ARM64::BL)); - if (!IntrMemName) - MIB.addGlobalAddress(GV, 0, 0); - else - MIB.addExternalSymbol(IntrMemName, 0); - - // Add implicit physical register uses to the call. - for (unsigned i = 0, e = RegArgs.size(); i != e; ++i) - MIB.addReg(RegArgs[i], RegState::Implicit); - - // Add a register mask with the call-preserved registers. - // Proper defs for return values will be added by setPhysRegsDeadExcept(). - MIB.addRegMask(TRI.getCallPreservedMask(CS.getCallingConv())); - - // Finish off the call including any return values. - SmallVector UsedRegs; - if (!FinishCall(RetVT, UsedRegs, I, CC, NumBytes)) - return false; - - // Set all unused physreg defs as dead. - static_cast(MIB)->setPhysRegsDeadExcept(UsedRegs, TRI); - - return true; -} - -bool ARM64FastISel::IsMemCpySmall(uint64_t Len, unsigned Alignment) { - if (Alignment) - return Len / Alignment <= 4; - else - return Len < 32; -} - -bool ARM64FastISel::TryEmitSmallMemCpy(Address Dest, Address Src, uint64_t Len, - unsigned Alignment) { - // Make sure we don't bloat code by inlining very large memcpy's. - if (!IsMemCpySmall(Len, Alignment)) - return false; - - int64_t UnscaledOffset = 0; - Address OrigDest = Dest; - Address OrigSrc = Src; - - while (Len) { - MVT VT; - if (!Alignment || Alignment >= 8) { - if (Len >= 8) - VT = MVT::i64; - else if (Len >= 4) - VT = MVT::i32; - else if (Len >= 2) - VT = MVT::i16; - else { - VT = MVT::i8; - } - } else { - // Bound based on alignment. - if (Len >= 4 && Alignment == 4) - VT = MVT::i32; - else if (Len >= 2 && Alignment == 2) - VT = MVT::i16; - else { - VT = MVT::i8; - } - } - - bool RV; - unsigned ResultReg; - RV = EmitLoad(VT, ResultReg, Src); - assert(RV == true && "Should be able to handle this load."); - RV = EmitStore(VT, ResultReg, Dest); - assert(RV == true && "Should be able to handle this store."); - (void)RV; - - int64_t Size = VT.getSizeInBits() / 8; - Len -= Size; - UnscaledOffset += Size; - - // We need to recompute the unscaled offset for each iteration. - Dest.setOffset(OrigDest.getOffset() + UnscaledOffset); - Src.setOffset(OrigSrc.getOffset() + UnscaledOffset); - } - - return true; -} - -bool ARM64FastISel::SelectIntrinsicCall(const IntrinsicInst &I) { - // FIXME: Handle more intrinsics. - switch (I.getIntrinsicID()) { - default: - return false; - case Intrinsic::memcpy: - case Intrinsic::memmove: { - const MemTransferInst &MTI = cast(I); - // Don't handle volatile. - if (MTI.isVolatile()) - return false; - - // Disable inlining for memmove before calls to ComputeAddress. 
Otherwise, - // we would emit dead code because we don't currently handle memmoves. - bool isMemCpy = (I.getIntrinsicID() == Intrinsic::memcpy); - if (isa(MTI.getLength()) && isMemCpy) { - // Small memcpy's are common enough that we want to do them without a call - // if possible. - uint64_t Len = cast(MTI.getLength())->getZExtValue(); - unsigned Alignment = MTI.getAlignment(); - if (IsMemCpySmall(Len, Alignment)) { - Address Dest, Src; - if (!ComputeAddress(MTI.getRawDest(), Dest) || - !ComputeAddress(MTI.getRawSource(), Src)) - return false; - if (TryEmitSmallMemCpy(Dest, Src, Len, Alignment)) - return true; - } - } - - if (!MTI.getLength()->getType()->isIntegerTy(64)) - return false; - - if (MTI.getSourceAddressSpace() > 255 || MTI.getDestAddressSpace() > 255) - // Fast instruction selection doesn't support the special - // address spaces. - return false; - - const char *IntrMemName = isa(I) ? "memcpy" : "memmove"; - return SelectCall(&I, IntrMemName); - } - case Intrinsic::memset: { - const MemSetInst &MSI = cast(I); - // Don't handle volatile. - if (MSI.isVolatile()) - return false; - - if (!MSI.getLength()->getType()->isIntegerTy(64)) - return false; - - if (MSI.getDestAddressSpace() > 255) - // Fast instruction selection doesn't support the special - // address spaces. - return false; - - return SelectCall(&I, "memset"); - } - case Intrinsic::trap: { - BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(ARM64::BRK)) - .addImm(1); - return true; - } - } - return false; -} - -bool ARM64FastISel::SelectRet(const Instruction *I) { - const ReturnInst *Ret = cast(I); - const Function &F = *I->getParent()->getParent(); - - if (!FuncInfo.CanLowerReturn) - return false; - - if (F.isVarArg()) - return false; - - // Build a list of return value registers. - SmallVector RetRegs; - - if (Ret->getNumOperands() > 0) { - CallingConv::ID CC = F.getCallingConv(); - SmallVector Outs; - GetReturnInfo(F.getReturnType(), F.getAttributes(), Outs, TLI); - - // Analyze operands of the call, assigning locations to each operand. - SmallVector ValLocs; - CCState CCInfo(CC, F.isVarArg(), *FuncInfo.MF, TM, ValLocs, - I->getContext()); - CCAssignFn *RetCC = CC == CallingConv::WebKit_JS ? RetCC_ARM64_WebKit_JS - : RetCC_ARM64_AAPCS; - CCInfo.AnalyzeReturn(Outs, RetCC); - - // Only handle a single return value for now. - if (ValLocs.size() != 1) - return false; - - CCValAssign &VA = ValLocs[0]; - const Value *RV = Ret->getOperand(0); - - // Don't bother handling odd stuff for now. - if (VA.getLocInfo() != CCValAssign::Full) - return false; - // Only handle register returns for now. - if (!VA.isRegLoc()) - return false; - unsigned Reg = getRegForValue(RV); - if (Reg == 0) - return false; - - unsigned SrcReg = Reg + VA.getValNo(); - unsigned DestReg = VA.getLocReg(); - // Avoid a cross-class copy. This is very unlikely. - if (!MRI.getRegClass(SrcReg)->contains(DestReg)) - return false; - - EVT RVEVT = TLI.getValueType(RV->getType()); - if (!RVEVT.isSimple()) - return false; - - // Vectors (of > 1 lane) in big endian need tricky handling. - if (RVEVT.isVector() && RVEVT.getVectorNumElements() > 1) - return false; - - MVT RVVT = RVEVT.getSimpleVT(); - if (RVVT == MVT::f128) - return false; - MVT DestVT = VA.getValVT(); - // Special handling for extended integers. 
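The branch that follows widens an i1/i8/i16 return value into its 32-bit ABI slot according to the zeroext/signext attribute. A self-contained sketch of that widening, assuming two's-complement uint32_t arithmetic (the helper below is illustrative and not part of the backend):

#include <cstdint>
#include <optional>

// Widths other than 1, 8 and 16 bits are rejected, matching the bail-out below.
static std::optional<uint32_t> widenReturnValue(uint32_t Raw, unsigned Bits,
                                                bool IsZExt) {
  if (Bits != 1 && Bits != 8 && Bits != 16)
    return std::nullopt;
  uint32_t Mask = (1u << Bits) - 1u;
  uint32_t Value = Raw & Mask;
  if (IsZExt)
    return Value;                    // UBFM-style zero extension
  uint32_t Sign = 1u << (Bits - 1);
  return (Value ^ Sign) - Sign;      // SBFM-style sign extension
}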
- if (RVVT != DestVT) { - if (RVVT != MVT::i1 && RVVT != MVT::i8 && RVVT != MVT::i16) - return false; - - if (!Outs[0].Flags.isZExt() && !Outs[0].Flags.isSExt()) - return false; - - bool isZExt = Outs[0].Flags.isZExt(); - SrcReg = EmitIntExt(RVVT, SrcReg, DestVT, isZExt); - if (SrcReg == 0) - return false; - } - - // Make the copy. - BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, - TII.get(TargetOpcode::COPY), DestReg).addReg(SrcReg); - - // Add register to return instruction. - RetRegs.push_back(VA.getLocReg()); - } - - MachineInstrBuilder MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, - TII.get(ARM64::RET_ReallyLR)); - for (unsigned i = 0, e = RetRegs.size(); i != e; ++i) - MIB.addReg(RetRegs[i], RegState::Implicit); - return true; -} - -bool ARM64FastISel::SelectTrunc(const Instruction *I) { - Type *DestTy = I->getType(); - Value *Op = I->getOperand(0); - Type *SrcTy = Op->getType(); - - EVT SrcEVT = TLI.getValueType(SrcTy, true); - EVT DestEVT = TLI.getValueType(DestTy, true); - if (!SrcEVT.isSimple()) - return false; - if (!DestEVT.isSimple()) - return false; - - MVT SrcVT = SrcEVT.getSimpleVT(); - MVT DestVT = DestEVT.getSimpleVT(); - - if (SrcVT != MVT::i64 && SrcVT != MVT::i32 && SrcVT != MVT::i16 && - SrcVT != MVT::i8) - return false; - if (DestVT != MVT::i32 && DestVT != MVT::i16 && DestVT != MVT::i8 && - DestVT != MVT::i1) - return false; - - unsigned SrcReg = getRegForValue(Op); - if (!SrcReg) - return false; - - // If we're truncating from i64 to a smaller non-legal type then generate an - // AND. Otherwise, we know the high bits are undefined and a truncate doesn't - // generate any code. - if (SrcVT == MVT::i64) { - uint64_t Mask = 0; - switch (DestVT.SimpleTy) { - default: - // Trunc i64 to i32 is handled by the target-independent fast-isel. - return false; - case MVT::i1: - Mask = 0x1; - break; - case MVT::i8: - Mask = 0xff; - break; - case MVT::i16: - Mask = 0xffff; - break; - } - // Issue an extract_subreg to get the lower 32-bits. - unsigned Reg32 = FastEmitInst_extractsubreg(MVT::i32, SrcReg, /*Kill=*/true, - ARM64::sub_32); - MRI.constrainRegClass(Reg32, &ARM64::GPR32RegClass); - // Create the AND instruction which performs the actual truncation. - unsigned ANDReg = createResultReg(&ARM64::GPR32spRegClass); - BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(ARM64::ANDWri), - ANDReg) - .addReg(Reg32) - .addImm(ARM64_AM::encodeLogicalImmediate(Mask, 32)); - SrcReg = ANDReg; - } - - UpdateValueMap(I, SrcReg); - return true; -} - -unsigned ARM64FastISel::Emiti1Ext(unsigned SrcReg, MVT DestVT, bool isZExt) { - assert((DestVT == MVT::i8 || DestVT == MVT::i16 || DestVT == MVT::i32 || - DestVT == MVT::i64) && - "Unexpected value type."); - // Handle i8 and i16 as i32. - if (DestVT == MVT::i8 || DestVT == MVT::i16) - DestVT = MVT::i32; - - if (isZExt) { - MRI.constrainRegClass(SrcReg, &ARM64::GPR32RegClass); - unsigned ResultReg = createResultReg(&ARM64::GPR32spRegClass); - BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(ARM64::ANDWri), - ResultReg) - .addReg(SrcReg) - .addImm(ARM64_AM::encodeLogicalImmediate(1, 32)); - - if (DestVT == MVT::i64) { - // We're ZExt i1 to i64. The ANDWri Wd, Ws, #1 implicitly clears the - // upper 32 bits. Emit a SUBREG_TO_REG to extend from Wd to Xd. 
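As the comment above says, the AND on the W register already produces the zero-extended 64-bit value, because any write to a W register clears bits 63:32 of the corresponding X register; SUBREG_TO_REG only changes the register class. A small standalone illustration of that architectural rule (plain C++, illustrative names):

#include <cstdint>

// AArch64 rule: writing Wn zeroes the upper 32 bits of Xn.
static uint64_t writeWRegister(uint32_t WValue) {
  return static_cast<uint64_t>(WValue);
}

static uint64_t zeroExtendI1ToI64(uint32_t Source) {
  uint32_t Wd = Source & 0x1u;    // AND Wd, Ws, #1
  return writeWRegister(Wd);      // SUBREG_TO_REG: same bits, wider class
}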
- unsigned Reg64 = MRI.createVirtualRegister(&ARM64::GPR64RegClass); - BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, - TII.get(ARM64::SUBREG_TO_REG), Reg64) - .addImm(0) - .addReg(ResultReg) - .addImm(ARM64::sub_32); - ResultReg = Reg64; - } - return ResultReg; - } else { - if (DestVT == MVT::i64) { - // FIXME: We're SExt i1 to i64. - return 0; - } - unsigned ResultReg = createResultReg(&ARM64::GPR32RegClass); - BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(ARM64::SBFMWri), - ResultReg) - .addReg(SrcReg) - .addImm(0) - .addImm(0); - return ResultReg; - } -} - -unsigned ARM64FastISel::EmitIntExt(MVT SrcVT, unsigned SrcReg, MVT DestVT, - bool isZExt) { - assert(DestVT != MVT::i1 && "ZeroExt/SignExt an i1?"); - unsigned Opc; - unsigned Imm = 0; - - switch (SrcVT.SimpleTy) { - default: - return 0; - case MVT::i1: - return Emiti1Ext(SrcReg, DestVT, isZExt); - case MVT::i8: - if (DestVT == MVT::i64) - Opc = isZExt ? ARM64::UBFMXri : ARM64::SBFMXri; - else - Opc = isZExt ? ARM64::UBFMWri : ARM64::SBFMWri; - Imm = 7; - break; - case MVT::i16: - if (DestVT == MVT::i64) - Opc = isZExt ? ARM64::UBFMXri : ARM64::SBFMXri; - else - Opc = isZExt ? ARM64::UBFMWri : ARM64::SBFMWri; - Imm = 15; - break; - case MVT::i32: - assert(DestVT == MVT::i64 && "IntExt i32 to i32?!?"); - Opc = isZExt ? ARM64::UBFMXri : ARM64::SBFMXri; - Imm = 31; - break; - } - - // Handle i8 and i16 as i32. - if (DestVT == MVT::i8 || DestVT == MVT::i16) - DestVT = MVT::i32; - else if (DestVT == MVT::i64) { - unsigned Src64 = MRI.createVirtualRegister(&ARM64::GPR64RegClass); - BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, - TII.get(ARM64::SUBREG_TO_REG), Src64) - .addImm(0) - .addReg(SrcReg) - .addImm(ARM64::sub_32); - SrcReg = Src64; - } - - unsigned ResultReg = createResultReg(TLI.getRegClassFor(DestVT)); - BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc), ResultReg) - .addReg(SrcReg) - .addImm(0) - .addImm(Imm); - - return ResultReg; -} - -bool ARM64FastISel::SelectIntExt(const Instruction *I) { - // On ARM, in general, integer casts don't involve legal types; this code - // handles promotable integers. The high bits for a type smaller than - // the register size are assumed to be undefined. - Type *DestTy = I->getType(); - Value *Src = I->getOperand(0); - Type *SrcTy = Src->getType(); - - bool isZExt = isa(I); - unsigned SrcReg = getRegForValue(Src); - if (!SrcReg) - return false; - - EVT SrcEVT = TLI.getValueType(SrcTy, true); - EVT DestEVT = TLI.getValueType(DestTy, true); - if (!SrcEVT.isSimple()) - return false; - if (!DestEVT.isSimple()) - return false; - - MVT SrcVT = SrcEVT.getSimpleVT(); - MVT DestVT = DestEVT.getSimpleVT(); - unsigned ResultReg = EmitIntExt(SrcVT, SrcReg, DestVT, isZExt); - if (ResultReg == 0) - return false; - UpdateValueMap(I, ResultReg); - return true; -} - -bool ARM64FastISel::SelectRem(const Instruction *I, unsigned ISDOpcode) { - EVT DestEVT = TLI.getValueType(I->getType(), true); - if (!DestEVT.isSimple()) - return false; - - MVT DestVT = DestEVT.getSimpleVT(); - if (DestVT != MVT::i64 && DestVT != MVT::i32) - return false; - - unsigned DivOpc; - bool is64bit = (DestVT == MVT::i64); - switch (ISDOpcode) { - default: - return false; - case ISD::SREM: - DivOpc = is64bit ? ARM64::SDIVXr : ARM64::SDIVWr; - break; - case ISD::UREM: - DivOpc = is64bit ? ARM64::UDIVXr : ARM64::UDIVWr; - break; - } - unsigned MSubOpc = is64bit ? 
ARM64::MSUBXrrr : ARM64::MSUBWrrr; - unsigned Src0Reg = getRegForValue(I->getOperand(0)); - if (!Src0Reg) - return false; - - unsigned Src1Reg = getRegForValue(I->getOperand(1)); - if (!Src1Reg) - return false; - - unsigned QuotReg = createResultReg(TLI.getRegClassFor(DestVT)); - BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(DivOpc), QuotReg) - .addReg(Src0Reg) - .addReg(Src1Reg); - // The remainder is computed as numerator - (quotient * denominator) using the - // MSUB instruction. - unsigned ResultReg = createResultReg(TLI.getRegClassFor(DestVT)); - BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(MSubOpc), ResultReg) - .addReg(QuotReg) - .addReg(Src1Reg) - .addReg(Src0Reg); - UpdateValueMap(I, ResultReg); - return true; -} - -bool ARM64FastISel::SelectMul(const Instruction *I) { - EVT SrcEVT = TLI.getValueType(I->getOperand(0)->getType(), true); - if (!SrcEVT.isSimple()) - return false; - MVT SrcVT = SrcEVT.getSimpleVT(); - - // Must be simple value type. Don't handle vectors. - if (SrcVT != MVT::i64 && SrcVT != MVT::i32 && SrcVT != MVT::i16 && - SrcVT != MVT::i8) - return false; - - unsigned Opc; - unsigned ZReg; - switch (SrcVT.SimpleTy) { - default: - return false; - case MVT::i8: - case MVT::i16: - case MVT::i32: - ZReg = ARM64::WZR; - Opc = ARM64::MADDWrrr; - break; - case MVT::i64: - ZReg = ARM64::XZR; - Opc = ARM64::MADDXrrr; - break; - } - - unsigned Src0Reg = getRegForValue(I->getOperand(0)); - if (!Src0Reg) - return false; - - unsigned Src1Reg = getRegForValue(I->getOperand(1)); - if (!Src1Reg) - return false; - - // Create the base instruction, then add the operands. - unsigned ResultReg = createResultReg(TLI.getRegClassFor(SrcVT)); - BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc), ResultReg) - .addReg(Src0Reg) - .addReg(Src1Reg) - .addReg(ZReg); - UpdateValueMap(I, ResultReg); - return true; -} - -bool ARM64FastISel::TargetSelectInstruction(const Instruction *I) { - switch (I->getOpcode()) { - default: - break; - case Instruction::Load: - return SelectLoad(I); - case Instruction::Store: - return SelectStore(I); - case Instruction::Br: - return SelectBranch(I); - case Instruction::IndirectBr: - return SelectIndirectBr(I); - case Instruction::FCmp: - case Instruction::ICmp: - return SelectCmp(I); - case Instruction::Select: - return SelectSelect(I); - case Instruction::FPExt: - return SelectFPExt(I); - case Instruction::FPTrunc: - return SelectFPTrunc(I); - case Instruction::FPToSI: - return SelectFPToInt(I, /*Signed=*/true); - case Instruction::FPToUI: - return SelectFPToInt(I, /*Signed=*/false); - case Instruction::SIToFP: - return SelectIntToFP(I, /*Signed=*/true); - case Instruction::UIToFP: - return SelectIntToFP(I, /*Signed=*/false); - case Instruction::SRem: - return SelectRem(I, ISD::SREM); - case Instruction::URem: - return SelectRem(I, ISD::UREM); - case Instruction::Call: - if (const IntrinsicInst *II = dyn_cast(I)) - return SelectIntrinsicCall(*II); - return SelectCall(I); - case Instruction::Ret: - return SelectRet(I); - case Instruction::Trunc: - return SelectTrunc(I); - case Instruction::ZExt: - case Instruction::SExt: - return SelectIntExt(I); - case Instruction::Mul: - // FIXME: This really should be handled by the target-independent selector. - return SelectMul(I); - } - return false; - // Silence warnings. 
- (void)&CC_ARM64_DarwinPCS_VarArg; -} - -namespace llvm { -llvm::FastISel *ARM64::createFastISel(FunctionLoweringInfo &funcInfo, - const TargetLibraryInfo *libInfo) { - return new ARM64FastISel(funcInfo, libInfo); -} -} diff --git a/lib/Target/ARM64/ARM64FrameLowering.cpp b/lib/Target/ARM64/ARM64FrameLowering.cpp deleted file mode 100644 index 9c17488ec58..00000000000 --- a/lib/Target/ARM64/ARM64FrameLowering.cpp +++ /dev/null @@ -1,888 +0,0 @@ -//===- ARM64FrameLowering.cpp - ARM64 Frame Lowering -----------*- C++ -*-====// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -// This file contains the ARM64 implementation of TargetFrameLowering class. -// -//===----------------------------------------------------------------------===// - -#include "ARM64FrameLowering.h" -#include "ARM64InstrInfo.h" -#include "ARM64MachineFunctionInfo.h" -#include "ARM64Subtarget.h" -#include "ARM64TargetMachine.h" -#include "llvm/ADT/Statistic.h" -#include "llvm/IR/DataLayout.h" -#include "llvm/IR/Function.h" -#include "llvm/CodeGen/MachineFrameInfo.h" -#include "llvm/CodeGen/MachineFunction.h" -#include "llvm/CodeGen/MachineInstrBuilder.h" -#include "llvm/CodeGen/MachineModuleInfo.h" -#include "llvm/CodeGen/MachineRegisterInfo.h" -#include "llvm/CodeGen/RegisterScavenging.h" -#include "llvm/Support/Debug.h" -#include "llvm/Support/CommandLine.h" -#include "llvm/Support/raw_ostream.h" - -using namespace llvm; - -#define DEBUG_TYPE "frame-info" - -static cl::opt EnableRedZone("arm64-redzone", - cl::desc("enable use of redzone on ARM64"), - cl::init(false), cl::Hidden); - -STATISTIC(NumRedZoneFunctions, "Number of functions using red zone"); - -static unsigned estimateStackSize(MachineFunction &MF) { - const MachineFrameInfo *FFI = MF.getFrameInfo(); - int Offset = 0; - for (int i = FFI->getObjectIndexBegin(); i != 0; ++i) { - int FixedOff = -FFI->getObjectOffset(i); - if (FixedOff > Offset) - Offset = FixedOff; - } - for (unsigned i = 0, e = FFI->getObjectIndexEnd(); i != e; ++i) { - if (FFI->isDeadObjectIndex(i)) - continue; - Offset += FFI->getObjectSize(i); - unsigned Align = FFI->getObjectAlignment(i); - // Adjust to alignment boundary - Offset = (Offset + Align - 1) / Align * Align; - } - // This does not include the 16 bytes used for fp and lr. - return (unsigned)Offset; -} - -bool ARM64FrameLowering::canUseRedZone(const MachineFunction &MF) const { - if (!EnableRedZone) - return false; - // Don't use the red zone if the function explicitly asks us not to. - // This is typically used for kernel code. - if (MF.getFunction()->getAttributes().hasAttribute( - AttributeSet::FunctionIndex, Attribute::NoRedZone)) - return false; - - const MachineFrameInfo *MFI = MF.getFrameInfo(); - const ARM64FunctionInfo *AFI = MF.getInfo(); - unsigned NumBytes = AFI->getLocalStackSize(); - - // Note: currently hasFP() is always true for hasCalls(), but that's an - // implementation detail of the current code, not a strict requirement, - // so stay safe here and check both. - if (MFI->hasCalls() || hasFP(MF) || NumBytes > 128) - return false; - return true; -} - -/// hasFP - Return true if the specified function should have a dedicated frame -/// pointer register. 
-bool ARM64FrameLowering::hasFP(const MachineFunction &MF) const { - const MachineFrameInfo *MFI = MF.getFrameInfo(); - -#ifndef NDEBUG - const TargetRegisterInfo *RegInfo = MF.getTarget().getRegisterInfo(); - assert(!RegInfo->needsStackRealignment(MF) && - "No stack realignment on ARM64!"); -#endif - - return (MFI->hasCalls() || MFI->hasVarSizedObjects() || - MFI->isFrameAddressTaken()); -} - -/// hasReservedCallFrame - Under normal circumstances, when a frame pointer is -/// not required, we reserve argument space for call sites in the function -/// immediately on entry to the current function. This eliminates the need for -/// add/sub sp brackets around call sites. Returns true if the call frame is -/// included as part of the stack frame. -bool ARM64FrameLowering::hasReservedCallFrame(const MachineFunction &MF) const { - return !MF.getFrameInfo()->hasVarSizedObjects(); -} - -void ARM64FrameLowering::eliminateCallFramePseudoInstr( - MachineFunction &MF, MachineBasicBlock &MBB, - MachineBasicBlock::iterator I) const { - const ARM64InstrInfo *TII = - static_cast(MF.getTarget().getInstrInfo()); - DebugLoc DL = I->getDebugLoc(); - int Opc = I->getOpcode(); - bool IsDestroy = Opc == TII->getCallFrameDestroyOpcode(); - uint64_t CalleePopAmount = IsDestroy ? I->getOperand(1).getImm() : 0; - - const TargetFrameLowering *TFI = MF.getTarget().getFrameLowering(); - if (!TFI->hasReservedCallFrame(MF)) { - unsigned Align = getStackAlignment(); - - int64_t Amount = I->getOperand(0).getImm(); - Amount = RoundUpToAlignment(Amount, Align); - if (!IsDestroy) - Amount = -Amount; - - // N.b. if CalleePopAmount is valid but zero (i.e. callee would pop, but it - // doesn't have to pop anything), then the first operand will be zero too so - // this adjustment is a no-op. - if (CalleePopAmount == 0) { - // FIXME: in-function stack adjustment for calls is limited to 24-bits - // because there's no guaranteed temporary register available. - // - // ADD/SUB (immediate) has only LSL #0 and LSL #12 avaiable. - // 1) For offset <= 12-bit, we use LSL #0 - // 2) For 12-bit <= offset <= 24-bit, we use two instructions. One uses - // LSL #0, and the other uses LSL #12. - // - // Mostly call frames will be allocated at the start of a function so - // this is OK, but it is a limitation that needs dealing with. - assert(Amount > -0xffffff && Amount < 0xffffff && "call frame too large"); - emitFrameOffset(MBB, I, DL, ARM64::SP, ARM64::SP, Amount, TII); - } - } else if (CalleePopAmount != 0) { - // If the calling convention demands that the callee pops arguments from the - // stack, we want to add it back if we have a reserved call frame. - assert(CalleePopAmount < 0xffffff && "call frame too large"); - emitFrameOffset(MBB, I, DL, ARM64::SP, ARM64::SP, -CalleePopAmount, TII); - } - MBB.erase(I); -} - -void -ARM64FrameLowering::emitCalleeSavedFrameMoves(MachineBasicBlock &MBB, - MachineBasicBlock::iterator MBBI, - unsigned FramePtr) const { - MachineFunction &MF = *MBB.getParent(); - MachineFrameInfo *MFI = MF.getFrameInfo(); - MachineModuleInfo &MMI = MF.getMMI(); - const MCRegisterInfo *MRI = MMI.getContext().getRegisterInfo(); - const ARM64InstrInfo *TII = TM.getInstrInfo(); - DebugLoc DL = MBB.findDebugLoc(MBBI); - - // Add callee saved registers to move list. - const std::vector &CSI = MFI->getCalleeSavedInfo(); - if (CSI.empty()) - return; - - const DataLayout *TD = MF.getTarget().getDataLayout(); - bool HasFP = hasFP(MF); - - // Calculate amount of bytes used for return address storing. 
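The 24-bit limit discussed in eliminateCallFramePseudoInstr above comes from splitting the adjustment into at most two ADD/SUB (immediate) instructions, one with LSL #0 and one with LSL #12. A rough standalone sketch of that split (illustrative helper, not the emitFrameOffset implementation):

#include <cstdint>
#include <cassert>

struct SpAdjust {
  uint64_t Lsl12Chunk;  // emitted as ADD/SUB ..., #Lsl12Chunk, lsl #12
  uint64_t Lsl0Chunk;   // emitted as ADD/SUB ..., #Lsl0Chunk
};

// Offsets up to 12 bits need one instruction; up to 24 bits need two.
static SpAdjust splitFrameOffset(uint64_t Bytes) {
  assert(Bytes < (1ULL << 24) && "no scratch register for larger offsets");
  return {Bytes >> 12, Bytes & 0xfffULL};
}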
- int stackGrowth = -TD->getPointerSize(0); - - // Calculate offsets. - int64_t saveAreaOffset = (HasFP ? 2 : 1) * stackGrowth; - unsigned TotalSkipped = 0; - for (const auto &Info : CSI) { - unsigned Reg = Info.getReg(); - int64_t Offset = MFI->getObjectOffset(Info.getFrameIdx()) - - getOffsetOfLocalArea() + saveAreaOffset; - - // Don't output a new CFI directive if we're re-saving the frame pointer or - // link register. This happens when the PrologEpilogInserter has inserted an - // extra "STP" of the frame pointer and link register -- the "emitPrologue" - // method automatically generates the directives when frame pointers are - // used. If we generate CFI directives for the extra "STP"s, the linker will - // lose track of the correct values for the frame pointer and link register. - if (HasFP && (FramePtr == Reg || Reg == ARM64::LR)) { - TotalSkipped += stackGrowth; - continue; - } - - unsigned DwarfReg = MRI->getDwarfRegNum(Reg, true); - unsigned CFIIndex = MMI.addFrameInst(MCCFIInstruction::createOffset( - nullptr, DwarfReg, Offset - TotalSkipped)); - BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::CFI_INSTRUCTION)) - .addCFIIndex(CFIIndex); - } -} - -void ARM64FrameLowering::emitPrologue(MachineFunction &MF) const { - MachineBasicBlock &MBB = MF.front(); // Prologue goes in entry BB. - MachineBasicBlock::iterator MBBI = MBB.begin(); - const MachineFrameInfo *MFI = MF.getFrameInfo(); - const Function *Fn = MF.getFunction(); - const ARM64RegisterInfo *RegInfo = TM.getRegisterInfo(); - const ARM64InstrInfo *TII = TM.getInstrInfo(); - MachineModuleInfo &MMI = MF.getMMI(); - ARM64FunctionInfo *AFI = MF.getInfo(); - bool needsFrameMoves = MMI.hasDebugInfo() || Fn->needsUnwindTableEntry(); - bool HasFP = hasFP(MF); - DebugLoc DL = MBB.findDebugLoc(MBBI); - - int NumBytes = (int)MFI->getStackSize(); - if (!AFI->hasStackFrame()) { - assert(!HasFP && "unexpected function without stack frame but with FP"); - - // All of the stack allocation is for locals. - AFI->setLocalStackSize(NumBytes); - - // Label used to tie together the PROLOG_LABEL and the MachineMoves. - MCSymbol *FrameLabel = MMI.getContext().CreateTempSymbol(); - - // REDZONE: If the stack size is less than 128 bytes, we don't need - // to actually allocate. - if (NumBytes && !canUseRedZone(MF)) { - emitFrameOffset(MBB, MBBI, DL, ARM64::SP, ARM64::SP, -NumBytes, TII, - MachineInstr::FrameSetup); - - // Encode the stack size of the leaf function. - unsigned CFIIndex = MMI.addFrameInst( - MCCFIInstruction::createDefCfaOffset(FrameLabel, -NumBytes)); - BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::CFI_INSTRUCTION)) - .addCFIIndex(CFIIndex); - } else if (NumBytes) { - ++NumRedZoneFunctions; - } - - return; - } - - // Only set up FP if we actually need to. - int FPOffset = 0; - if (HasFP) { - // First instruction must a) allocate the stack and b) have an immediate - // that is a multiple of -2. - assert((MBBI->getOpcode() == ARM64::STPXpre || - MBBI->getOpcode() == ARM64::STPDpre) && - MBBI->getOperand(3).getReg() == ARM64::SP && - MBBI->getOperand(4).getImm() < 0 && - (MBBI->getOperand(4).getImm() & 1) == 0); - - // Frame pointer is fp = sp - 16. Since the STPXpre subtracts the space - // required for the callee saved register area we get the frame pointer - // by addding that offset - 16 = -getImm()*8 - 2*8 = -(getImm() + 2) * 8. - FPOffset = -(MBBI->getOperand(4).getImm() + 2) * 8; - assert(FPOffset >= 0 && "Bad Framepointer Offset"); - } - - // Move past the saves of the callee-saved registers. 
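The FPOffset formula in emitPrologue above is easiest to see with concrete numbers; the helper below is purely a worked example under the prologue's own invariants (pre-increment STP with a scaled immediate, fp/lr stored at the top of the callee-save area):

#include <cassert>

// "stp x22, x21, [sp, #-48]!" carries getImm() == -6 (scaled by 8 bytes).
// FPOffset = -(getImm() + 2) * 8 skips every callee save except the fp/lr
// pair, so fp ends up addressing the saved frame record.
static int framePointerOffset(int StpImm) {
  assert(StpImm < 0 && (StpImm & 1) == 0 && "prologue invariant");
  return -(StpImm + 2) * 8;
}

// framePointerOffset(-6) == 32: 48 bytes of callee saves, fp = sp + 32.
// framePointerOffset(-2) == 0:  only fp/lr are saved, so fp = sp.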
- while (MBBI->getOpcode() == ARM64::STPXi || - MBBI->getOpcode() == ARM64::STPDi || - MBBI->getOpcode() == ARM64::STPXpre || - MBBI->getOpcode() == ARM64::STPDpre) { - ++MBBI; - NumBytes -= 16; - } - assert(NumBytes >= 0 && "Negative stack allocation size!?"); - if (HasFP) { - // Issue sub fp, sp, FPOffset or - // mov fp,sp when FPOffset is zero. - // Note: All stores of callee-saved registers are marked as "FrameSetup". - // This code marks the instruction(s) that set the FP also. - emitFrameOffset(MBB, MBBI, DL, ARM64::FP, ARM64::SP, FPOffset, TII, - MachineInstr::FrameSetup); - } - - // All of the remaining stack allocations are for locals. - AFI->setLocalStackSize(NumBytes); - - // Allocate space for the rest of the frame. - if (NumBytes) { - // If we're a leaf function, try using the red zone. - if (!canUseRedZone(MF)) - emitFrameOffset(MBB, MBBI, DL, ARM64::SP, ARM64::SP, -NumBytes, TII, - MachineInstr::FrameSetup); - } - - // If we need a base pointer, set it up here. It's whatever the value of the - // stack pointer is at this point. Any variable size objects will be allocated - // after this, so we can still use the base pointer to reference locals. - // - // FIXME: Clarify FrameSetup flags here. - // Note: Use emitFrameOffset() like above for FP if the FrameSetup flag is - // needed. - // - if (RegInfo->hasBasePointer(MF)) - TII->copyPhysReg(MBB, MBBI, DL, ARM64::X19, ARM64::SP, false); - - if (needsFrameMoves) { - const DataLayout *TD = MF.getTarget().getDataLayout(); - const int StackGrowth = -TD->getPointerSize(0); - unsigned FramePtr = RegInfo->getFrameRegister(MF); - - // An example of the prologue: - // - // .globl __foo - // .align 2 - // __foo: - // Ltmp0: - // .cfi_startproc - // .cfi_personality 155, ___gxx_personality_v0 - // Leh_func_begin: - // .cfi_lsda 16, Lexception33 - // - // stp xa,bx, [sp, -#offset]! - // ... - // stp x28, x27, [sp, #offset-32] - // stp fp, lr, [sp, #offset-16] - // add fp, sp, #offset - 16 - // sub sp, sp, #1360 - // - // The Stack: - // +-------------------------------------------+ - // 10000 | ........ | ........ | ........ | ........ | - // 10004 | ........ | ........ | ........ | ........ | - // +-------------------------------------------+ - // 10008 | ........ | ........ | ........ | ........ | - // 1000c | ........ | ........ | ........ | ........ | - // +===========================================+ - // 10010 | X28 Register | - // 10014 | X28 Register | - // +-------------------------------------------+ - // 10018 | X27 Register | - // 1001c | X27 Register | - // +===========================================+ - // 10020 | Frame Pointer | - // 10024 | Frame Pointer | - // +-------------------------------------------+ - // 10028 | Link Register | - // 1002c | Link Register | - // +===========================================+ - // 10030 | ........ | ........ | ........ | ........ | - // 10034 | ........ | ........ | ........ | ........ | - // +-------------------------------------------+ - // 10038 | ........ | ........ | ........ | ........ | - // 1003c | ........ | ........ | ........ | ........ | - // +-------------------------------------------+ - // - // [sp] = 10030 :: >>initial value<< - // sp = 10020 :: stp fp, lr, [sp, #-16]! - // fp = sp == 10020 :: mov fp, sp - // [sp] == 10020 :: stp x28, x27, [sp, #-16]! - // sp == 10010 :: >>final value<< - // - // The frame pointer (w29) points to address 10020. 
If we use an offset of - // '16' from 'w29', we get the CFI offsets of -8 for w30, -16 for w29, -24 - // for w27, and -32 for w28: - // - // Ltmp1: - // .cfi_def_cfa w29, 16 - // Ltmp2: - // .cfi_offset w30, -8 - // Ltmp3: - // .cfi_offset w29, -16 - // Ltmp4: - // .cfi_offset w27, -24 - // Ltmp5: - // .cfi_offset w28, -32 - - if (HasFP) { - // Define the current CFA rule to use the provided FP. - unsigned Reg = RegInfo->getDwarfRegNum(FramePtr, true); - unsigned CFIIndex = MMI.addFrameInst( - MCCFIInstruction::createDefCfa(nullptr, Reg, 2 * StackGrowth)); - BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::CFI_INSTRUCTION)) - .addCFIIndex(CFIIndex); - - // Record the location of the stored LR - unsigned LR = RegInfo->getDwarfRegNum(ARM64::LR, true); - CFIIndex = MMI.addFrameInst( - MCCFIInstruction::createOffset(nullptr, LR, StackGrowth)); - BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::CFI_INSTRUCTION)) - .addCFIIndex(CFIIndex); - - // Record the location of the stored FP - CFIIndex = MMI.addFrameInst( - MCCFIInstruction::createOffset(nullptr, Reg, 2 * StackGrowth)); - BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::CFI_INSTRUCTION)) - .addCFIIndex(CFIIndex); - } else { - // Encode the stack size of the leaf function. - unsigned CFIIndex = MMI.addFrameInst( - MCCFIInstruction::createDefCfaOffset(nullptr, -MFI->getStackSize())); - BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::CFI_INSTRUCTION)) - .addCFIIndex(CFIIndex); - } - - // Now emit the moves for whatever callee saved regs we have. - emitCalleeSavedFrameMoves(MBB, MBBI, FramePtr); - } -} - -static bool isCalleeSavedRegister(unsigned Reg, const MCPhysReg *CSRegs) { - for (unsigned i = 0; CSRegs[i]; ++i) - if (Reg == CSRegs[i]) - return true; - return false; -} - -static bool isCSRestore(MachineInstr *MI, const MCPhysReg *CSRegs) { - unsigned RtIdx = 0; - if (MI->getOpcode() == ARM64::LDPXpost || MI->getOpcode() == ARM64::LDPDpost) - RtIdx = 1; - - if (MI->getOpcode() == ARM64::LDPXpost || - MI->getOpcode() == ARM64::LDPDpost || MI->getOpcode() == ARM64::LDPXi || - MI->getOpcode() == ARM64::LDPDi) { - if (!isCalleeSavedRegister(MI->getOperand(RtIdx).getReg(), CSRegs) || - !isCalleeSavedRegister(MI->getOperand(RtIdx + 1).getReg(), CSRegs) || - MI->getOperand(RtIdx + 2).getReg() != ARM64::SP) - return false; - return true; - } - - return false; -} - -void ARM64FrameLowering::emitEpilogue(MachineFunction &MF, - MachineBasicBlock &MBB) const { - MachineBasicBlock::iterator MBBI = MBB.getLastNonDebugInstr(); - assert(MBBI->isReturn() && "Can only insert epilog into returning blocks"); - MachineFrameInfo *MFI = MF.getFrameInfo(); - const ARM64InstrInfo *TII = - static_cast(MF.getTarget().getInstrInfo()); - const ARM64RegisterInfo *RegInfo = - static_cast(MF.getTarget().getRegisterInfo()); - DebugLoc DL = MBBI->getDebugLoc(); - unsigned RetOpcode = MBBI->getOpcode(); - - int NumBytes = MFI->getStackSize(); - const ARM64FunctionInfo *AFI = MF.getInfo(); - - // Initial and residual are named for consitency with the prologue. Note that - // in the epilogue, the residual adjustment is executed first. - uint64_t ArgumentPopSize = 0; - if (RetOpcode == ARM64::TCRETURNdi || RetOpcode == ARM64::TCRETURNri) { - MachineOperand &StackAdjust = MBBI->getOperand(1); - - // For a tail-call in a callee-pops-arguments environment, some or all of - // the stack may actually be in use for the call's arguments, this is - // calculated during LowerCall and consumed here... - ArgumentPopSize = StackAdjust.getImm(); - } else { - // ... 
otherwise the amount to pop is *all* of the argument space, - // conveniently stored in the MachineFunctionInfo by - // LowerFormalArguments. This will, of course, be zero for the C calling - // convention. - ArgumentPopSize = AFI->getArgumentStackToRestore(); - } - - // The stack frame should be like below, - // - // ---------------------- --- - // | | | - // | BytesInStackArgArea| CalleeArgStackSize - // | (NumReusableBytes) | (of tail call) - // | | --- - // | | | - // ---------------------| --- | - // | | | | - // | CalleeSavedReg | | | - // | (NumRestores * 16) | | | - // | | | | - // ---------------------| | NumBytes - // | | StackSize (StackAdjustUp) - // | LocalStackSize | | | - // | (covering callee | | | - // | args) | | | - // | | | | - // ---------------------- --- --- - // - // So NumBytes = StackSize + BytesInStackArgArea - CalleeArgStackSize - // = StackSize + ArgumentPopSize - // - // ARM64TargetLowering::LowerCall figures out ArgumentPopSize and keeps - // it as the 2nd argument of ARM64ISD::TC_RETURN. - NumBytes += ArgumentPopSize; - - unsigned NumRestores = 0; - // Move past the restores of the callee-saved registers. - MachineBasicBlock::iterator LastPopI = MBBI; - const MCPhysReg *CSRegs = RegInfo->getCalleeSavedRegs(&MF); - if (LastPopI != MBB.begin()) { - do { - ++NumRestores; - --LastPopI; - } while (LastPopI != MBB.begin() && isCSRestore(LastPopI, CSRegs)); - if (!isCSRestore(LastPopI, CSRegs)) { - ++LastPopI; - --NumRestores; - } - } - NumBytes -= NumRestores * 16; - assert(NumBytes >= 0 && "Negative stack allocation size!?"); - - if (!hasFP(MF)) { - // If this was a redzone leaf function, we don't need to restore the - // stack pointer. - if (!canUseRedZone(MF)) - emitFrameOffset(MBB, LastPopI, DL, ARM64::SP, ARM64::SP, NumBytes, TII); - return; - } - - // Restore the original stack pointer. - // FIXME: Rather than doing the math here, we should instead just use - // non-post-indexed loads for the restores if we aren't actually going to - // be able to save any instructions. - if (NumBytes || MFI->hasVarSizedObjects()) - emitFrameOffset(MBB, LastPopI, DL, ARM64::SP, ARM64::FP, - -(NumRestores - 1) * 16, TII, MachineInstr::NoFlags); -} - -/// getFrameIndexOffset - Returns the displacement from the frame register to -/// the stack frame of the specified index. -int ARM64FrameLowering::getFrameIndexOffset(const MachineFunction &MF, - int FI) const { - unsigned FrameReg; - return getFrameIndexReference(MF, FI, FrameReg); -} - -/// getFrameIndexReference - Provide a base+offset reference to an FI slot for -/// debug info. It's the same as what we use for resolving the code-gen -/// references for now. FIXME: This can go wrong when references are -/// SP-relative and simple call frames aren't used. -int ARM64FrameLowering::getFrameIndexReference(const MachineFunction &MF, - int FI, - unsigned &FrameReg) const { - return resolveFrameIndexReference(MF, FI, FrameReg); -} - -int ARM64FrameLowering::resolveFrameIndexReference(const MachineFunction &MF, - int FI, unsigned &FrameReg, - bool PreferFP) const { - const MachineFrameInfo *MFI = MF.getFrameInfo(); - const ARM64RegisterInfo *RegInfo = - static_cast(MF.getTarget().getRegisterInfo()); - const ARM64FunctionInfo *AFI = MF.getInfo(); - int FPOffset = MFI->getObjectOffset(FI) + 16; - int Offset = MFI->getObjectOffset(FI) + MFI->getStackSize(); - bool isFixed = MFI->isFixedObjectIndex(FI); - - // Use frame pointer to reference fixed objects. 
Use it for locals if - // there are VLAs (and thus the SP isn't reliable as a base). - // Make sure useFPForScavengingIndex() does the right thing for the emergency - // spill slot. - bool UseFP = false; - if (AFI->hasStackFrame()) { - // Note: Keeping the following as multiple 'if' statements rather than - // merging to a single expression for readability. - // - // Argument access should always use the FP. - if (isFixed) { - UseFP = hasFP(MF); - } else if (hasFP(MF) && !RegInfo->hasBasePointer(MF)) { - // Use SP or FP, whichever gives us the best chance of the offset - // being in range for direct access. If the FPOffset is positive, - // that'll always be best, as the SP will be even further away. - // If the FPOffset is negative, we have to keep in mind that the - // available offset range for negative offsets is smaller than for - // positive ones. If we have variable sized objects, we're stuck with - // using the FP regardless, though, as the SP offset is unknown - // and we don't have a base pointer available. If an offset is - // available via the FP and the SP, use whichever is closest. - if (PreferFP || MFI->hasVarSizedObjects() || FPOffset >= 0 || - (FPOffset >= -256 && Offset > -FPOffset)) - UseFP = true; - } - } - - if (UseFP) { - FrameReg = RegInfo->getFrameRegister(MF); - return FPOffset; - } - - // Use the base pointer if we have one. - if (RegInfo->hasBasePointer(MF)) - FrameReg = RegInfo->getBaseRegister(); - else { - FrameReg = ARM64::SP; - // If we're using the red zone for this function, the SP won't actually - // be adjusted, so the offsets will be negative. They're also all - // within range of the signed 9-bit immediate instructions. - if (canUseRedZone(MF)) - Offset -= AFI->getLocalStackSize(); - } - - return Offset; -} - -static unsigned getPrologueDeath(MachineFunction &MF, unsigned Reg) { - if (Reg != ARM64::LR) - return getKillRegState(true); - - // LR maybe referred to later by an @llvm.returnaddress intrinsic. - bool LRLiveIn = MF.getRegInfo().isLiveIn(ARM64::LR); - bool LRKill = !(LRLiveIn && MF.getFrameInfo()->isReturnAddressTaken()); - return getKillRegState(LRKill); -} - -bool ARM64FrameLowering::spillCalleeSavedRegisters( - MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, - const std::vector &CSI, - const TargetRegisterInfo *TRI) const { - MachineFunction &MF = *MBB.getParent(); - const TargetInstrInfo &TII = *MF.getTarget().getInstrInfo(); - unsigned Count = CSI.size(); - DebugLoc DL; - assert((Count & 1) == 0 && "Odd number of callee-saved regs to spill!"); - - if (MI != MBB.end()) - DL = MI->getDebugLoc(); - - for (unsigned i = 0; i < Count; i += 2) { - unsigned idx = Count - i - 2; - unsigned Reg1 = CSI[idx].getReg(); - unsigned Reg2 = CSI[idx + 1].getReg(); - // GPRs and FPRs are saved in pairs of 64-bit regs. We expect the CSI - // list to come in sorted by frame index so that we can issue the store - // pair instructions directly. Assert if we see anything otherwise. - // - // The order of the registers in the list is controlled by - // getCalleeSavedRegs(), so they will always be in-order, as well. - assert(CSI[idx].getFrameIdx() + 1 == CSI[idx + 1].getFrameIdx() && - "Out of order callee saved regs!"); - unsigned StrOpc; - assert((Count & 1) == 0 && "Odd number of callee-saved regs to spill!"); - assert((i & 1) == 0 && "Odd index for callee-saved reg spill!"); - // Issue sequence of non-sp increment and pi sp spills for cs regs. The - // first spill is a pre-increment that allocates the stack. 
- // For example: - // stp x22, x21, [sp, #-48]! // addImm(-6) - // stp x20, x19, [sp, #16] // addImm(+2) - // stp fp, lr, [sp, #32] // addImm(+4) - // Rationale: This sequence saves uop updates compared to a sequence of - // pre-increment spills like stp xi,xj,[sp,#-16]! - // Note: Similar rational and sequence for restores in epilog. - if (ARM64::GPR64RegClass.contains(Reg1)) { - assert(ARM64::GPR64RegClass.contains(Reg2) && - "Expected GPR64 callee-saved register pair!"); - // For first spill use pre-increment store. - if (i == 0) - StrOpc = ARM64::STPXpre; - else - StrOpc = ARM64::STPXi; - } else if (ARM64::FPR64RegClass.contains(Reg1)) { - assert(ARM64::FPR64RegClass.contains(Reg2) && - "Expected FPR64 callee-saved register pair!"); - // For first spill use pre-increment store. - if (i == 0) - StrOpc = ARM64::STPDpre; - else - StrOpc = ARM64::STPDi; - } else - llvm_unreachable("Unexpected callee saved register!"); - DEBUG(dbgs() << "CSR spill: (" << TRI->getName(Reg1) << ", " - << TRI->getName(Reg2) << ") -> fi#(" << CSI[idx].getFrameIdx() - << ", " << CSI[idx + 1].getFrameIdx() << ")\n"); - // Compute offset: i = 0 => offset = -Count; - // i = 2 => offset = -(Count - 2) + Count = 2 = i; etc. - const int Offset = (i == 0) ? -Count : i; - assert((Offset >= -64 && Offset <= 63) && - "Offset out of bounds for STP immediate"); - MachineInstrBuilder MIB = BuildMI(MBB, MI, DL, TII.get(StrOpc)); - if (StrOpc == ARM64::STPDpre || StrOpc == ARM64::STPXpre) - MIB.addReg(ARM64::SP, RegState::Define); - - MIB.addReg(Reg2, getPrologueDeath(MF, Reg2)) - .addReg(Reg1, getPrologueDeath(MF, Reg1)) - .addReg(ARM64::SP) - .addImm(Offset) // [sp, #offset * 8], where factor * 8 is implicit - .setMIFlag(MachineInstr::FrameSetup); - } - return true; -} - -bool ARM64FrameLowering::restoreCalleeSavedRegisters( - MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, - const std::vector &CSI, - const TargetRegisterInfo *TRI) const { - MachineFunction &MF = *MBB.getParent(); - const TargetInstrInfo &TII = *MF.getTarget().getInstrInfo(); - unsigned Count = CSI.size(); - DebugLoc DL; - assert((Count & 1) == 0 && "Odd number of callee-saved regs to spill!"); - - if (MI != MBB.end()) - DL = MI->getDebugLoc(); - - for (unsigned i = 0; i < Count; i += 2) { - unsigned Reg1 = CSI[i].getReg(); - unsigned Reg2 = CSI[i + 1].getReg(); - // GPRs and FPRs are saved in pairs of 64-bit regs. We expect the CSI - // list to come in sorted by frame index so that we can issue the store - // pair instructions directly. Assert if we see anything otherwise. - assert(CSI[i].getFrameIdx() + 1 == CSI[i + 1].getFrameIdx() && - "Out of order callee saved regs!"); - // Issue sequence of non-sp increment and sp-pi restores for cs regs. 
Only - // the last load is sp-pi post-increment and de-allocates the stack: - // For example: - // ldp fp, lr, [sp, #32] // addImm(+4) - // ldp x20, x19, [sp, #16] // addImm(+2) - // ldp x22, x21, [sp], #48 // addImm(+6) - // Note: see comment in spillCalleeSavedRegisters() - unsigned LdrOpc; - - assert((Count & 1) == 0 && "Odd number of callee-saved regs to spill!"); - assert((i & 1) == 0 && "Odd index for callee-saved reg spill!"); - if (ARM64::GPR64RegClass.contains(Reg1)) { - assert(ARM64::GPR64RegClass.contains(Reg2) && - "Expected GPR64 callee-saved register pair!"); - if (i == Count - 2) - LdrOpc = ARM64::LDPXpost; - else - LdrOpc = ARM64::LDPXi; - } else if (ARM64::FPR64RegClass.contains(Reg1)) { - assert(ARM64::FPR64RegClass.contains(Reg2) && - "Expected FPR64 callee-saved register pair!"); - if (i == Count - 2) - LdrOpc = ARM64::LDPDpost; - else - LdrOpc = ARM64::LDPDi; - } else - llvm_unreachable("Unexpected callee saved register!"); - DEBUG(dbgs() << "CSR restore: (" << TRI->getName(Reg1) << ", " - << TRI->getName(Reg2) << ") -> fi#(" << CSI[i].getFrameIdx() - << ", " << CSI[i + 1].getFrameIdx() << ")\n"); - - // Compute offset: i = 0 => offset = Count - 2; i = 2 => offset = Count - 4; - // etc. - const int Offset = (i == Count - 2) ? Count : Count - i - 2; - assert((Offset >= -64 && Offset <= 63) && - "Offset out of bounds for LDP immediate"); - MachineInstrBuilder MIB = BuildMI(MBB, MI, DL, TII.get(LdrOpc)); - if (LdrOpc == ARM64::LDPXpost || LdrOpc == ARM64::LDPDpost) - MIB.addReg(ARM64::SP, RegState::Define); - - MIB.addReg(Reg2, getDefRegState(true)) - .addReg(Reg1, getDefRegState(true)) - .addReg(ARM64::SP) - .addImm(Offset); // [sp], #offset * 8 or [sp, #offset * 8] - // where the factor * 8 is implicit - } - return true; -} - -void ARM64FrameLowering::processFunctionBeforeCalleeSavedScan( - MachineFunction &MF, RegScavenger *RS) const { - const ARM64RegisterInfo *RegInfo = - static_cast(MF.getTarget().getRegisterInfo()); - ARM64FunctionInfo *AFI = MF.getInfo(); - MachineRegisterInfo *MRI = &MF.getRegInfo(); - SmallVector UnspilledCSGPRs; - SmallVector UnspilledCSFPRs; - - // The frame record needs to be created by saving the appropriate registers - if (hasFP(MF)) { - MRI->setPhysRegUsed(ARM64::FP); - MRI->setPhysRegUsed(ARM64::LR); - } - - // Spill the BasePtr if it's used. Do this first thing so that the - // getCalleeSavedRegs() below will get the right answer. - if (RegInfo->hasBasePointer(MF)) - MRI->setPhysRegUsed(RegInfo->getBaseRegister()); - - // If any callee-saved registers are used, the frame cannot be eliminated. - unsigned NumGPRSpilled = 0; - unsigned NumFPRSpilled = 0; - bool ExtraCSSpill = false; - bool CanEliminateFrame = true; - DEBUG(dbgs() << "*** processFunctionBeforeCalleeSavedScan\nUsed CSRs:"); - const MCPhysReg *CSRegs = RegInfo->getCalleeSavedRegs(&MF); - - // Check pairs of consecutive callee-saved registers. - for (unsigned i = 0; CSRegs[i]; i += 2) { - assert(CSRegs[i + 1] && "Odd number of callee-saved registers!"); - - const unsigned OddReg = CSRegs[i]; - const unsigned EvenReg = CSRegs[i + 1]; - assert((ARM64::GPR64RegClass.contains(OddReg) && - ARM64::GPR64RegClass.contains(EvenReg)) ^ - (ARM64::FPR64RegClass.contains(OddReg) && - ARM64::FPR64RegClass.contains(EvenReg)) && - "Register class mismatch!"); - - const bool OddRegUsed = MRI->isPhysRegUsed(OddReg); - const bool EvenRegUsed = MRI->isPhysRegUsed(EvenReg); - - // Early exit if none of the registers in the register pair is actually - // used. 
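The offset arithmetic in spillCalleeSavedRegisters and restoreCalleeSavedRegisters above is compact enough to misread; the following standalone helpers only restate it so the worked numbers can be checked (illustrative, not backend code):

#include <cassert>

// Scaled (x8) STP/LDP immediates for callee-save index I out of Count regs.
static int spillOffset(unsigned I, unsigned Count) {
  assert((Count & 1) == 0 && (I & 1) == 0 && I < Count);
  return (I == 0) ? -static_cast<int>(Count) : static_cast<int>(I);
}

static int restoreOffset(unsigned I, unsigned Count) {
  assert((Count & 1) == 0 && (I & 1) == 0 && I < Count);
  return (I == Count - 2) ? static_cast<int>(Count)
                          : static_cast<int>(Count - I - 2);
}

// With Count == 6 (three pairs) this reproduces the sequences quoted above:
//   spills:   -6, 2, 4  ->  [sp, #-48]!, [sp, #16], [sp, #32]
//   restores:  4, 2, 6  ->  [sp, #32],   [sp, #16], [sp], #48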
- if (!OddRegUsed && !EvenRegUsed) { - if (ARM64::GPR64RegClass.contains(OddReg)) { - UnspilledCSGPRs.push_back(OddReg); - UnspilledCSGPRs.push_back(EvenReg); - } else { - UnspilledCSFPRs.push_back(OddReg); - UnspilledCSFPRs.push_back(EvenReg); - } - continue; - } - - unsigned Reg = ARM64::NoRegister; - // If only one of the registers of the register pair is used, make sure to - // mark the other one as used as well. - if (OddRegUsed ^ EvenRegUsed) { - // Find out which register is the additional spill. - Reg = OddRegUsed ? EvenReg : OddReg; - MRI->setPhysRegUsed(Reg); - } - - DEBUG(dbgs() << ' ' << PrintReg(OddReg, RegInfo)); - DEBUG(dbgs() << ' ' << PrintReg(EvenReg, RegInfo)); - - assert(((OddReg == ARM64::LR && EvenReg == ARM64::FP) || - (RegInfo->getEncodingValue(OddReg) + 1 == - RegInfo->getEncodingValue(EvenReg))) && - "Register pair of non-adjacent registers!"); - if (ARM64::GPR64RegClass.contains(OddReg)) { - NumGPRSpilled += 2; - // If it's not a reserved register, we can use it in lieu of an - // emergency spill slot for the register scavenger. - // FIXME: It would be better to instead keep looking and choose another - // unspilled register that isn't reserved, if there is one. - if (Reg != ARM64::NoRegister && !RegInfo->isReservedReg(MF, Reg)) - ExtraCSSpill = true; - } else - NumFPRSpilled += 2; - - CanEliminateFrame = false; - } - - // FIXME: Set BigStack if any stack slot references may be out of range. - // For now, just conservatively guestimate based on unscaled indexing - // range. We'll end up allocating an unnecessary spill slot a lot, but - // realistically that's not a big deal at this stage of the game. - // The CSR spill slots have not been allocated yet, so estimateStackSize - // won't include them. - MachineFrameInfo *MFI = MF.getFrameInfo(); - unsigned CFSize = estimateStackSize(MF) + 8 * (NumGPRSpilled + NumFPRSpilled); - DEBUG(dbgs() << "Estimated stack frame size: " << CFSize << " bytes.\n"); - bool BigStack = (CFSize >= 256); - if (BigStack || !CanEliminateFrame || RegInfo->cannotEliminateFrame(MF)) - AFI->setHasStackFrame(true); - - // Estimate if we might need to scavenge a register at some point in order - // to materialize a stack offset. If so, either spill one additional - // callee-saved register or reserve a special spill slot to facilitate - // register scavenging. If we already spilled an extra callee-saved register - // above to keep the number of spills even, we don't need to do anything else - // here. - if (BigStack && !ExtraCSSpill) { - - // If we're adding a register to spill here, we have to add two of them - // to keep the number of regs to spill even. - assert(((UnspilledCSGPRs.size() & 1) == 0) && "Odd number of registers!"); - unsigned Count = 0; - while (!UnspilledCSGPRs.empty() && Count < 2) { - unsigned Reg = UnspilledCSGPRs.back(); - UnspilledCSGPRs.pop_back(); - DEBUG(dbgs() << "Spilling " << PrintReg(Reg, RegInfo) - << " to get a scratch register.\n"); - MRI->setPhysRegUsed(Reg); - ExtraCSSpill = true; - ++Count; - } - - // If we didn't find an extra callee-saved register to spill, create - // an emergency spill slot. 
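The 256-byte threshold in the BigStack estimate above corresponds to the reach of the unscaled 9-bit signed addressing mode; a rough standalone sketch of the estimate, with hypothetical locals size and spill counts:

#include <cstdio>

int main() {
  // Hypothetical function: 240 bytes of locals, 4 GPR + 2 FPR callee saves.
  const unsigned EstimatedLocals = 240;
  const unsigned NumGPRSpilled = 4, NumFPRSpilled = 2;
  const unsigned CFSize = EstimatedLocals + 8 * (NumGPRSpilled + NumFPRSpilled);
  const bool BigStack = CFSize >= 256;   // beyond the +/-256 unscaled range
  std::printf("estimated frame = %u bytes, BigStack = %d\n", CFSize, BigStack);
  // Here CFSize is 288, so BigStack is set and the code above either spills an
  // extra callee-saved pair or reserves an emergency scavenging slot.
  return 0;
}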
- if (!ExtraCSSpill) { - const TargetRegisterClass *RC = &ARM64::GPR64RegClass; - int FI = MFI->CreateStackObject(RC->getSize(), RC->getAlignment(), false); - RS->addScavengingFrameIndex(FI); - DEBUG(dbgs() << "No available CS registers, allocated fi#" << FI - << " as the emergency spill slot.\n"); - } - } -} diff --git a/lib/Target/ARM64/ARM64FrameLowering.h b/lib/Target/ARM64/ARM64FrameLowering.h deleted file mode 100644 index 1991a0a18dd..00000000000 --- a/lib/Target/ARM64/ARM64FrameLowering.h +++ /dev/null @@ -1,75 +0,0 @@ -//===-- ARM64FrameLowering.h - TargetFrameLowering for ARM64 ----*- C++ -*-===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -// -// -//===----------------------------------------------------------------------===// - -#ifndef ARM64_FRAMELOWERING_H -#define ARM64_FRAMELOWERING_H - -#include "llvm/Target/TargetFrameLowering.h" - -namespace llvm { - -class ARM64Subtarget; -class ARM64TargetMachine; - -class ARM64FrameLowering : public TargetFrameLowering { - const ARM64TargetMachine &TM; - -public: - explicit ARM64FrameLowering(const ARM64TargetMachine &TM, - const ARM64Subtarget &STI) - : TargetFrameLowering(StackGrowsDown, 16, 0, 16, - false /*StackRealignable*/), - TM(TM) {} - - void emitCalleeSavedFrameMoves(MachineBasicBlock &MBB, - MachineBasicBlock::iterator MBBI, - unsigned FramePtr) const; - - void eliminateCallFramePseudoInstr(MachineFunction &MF, - MachineBasicBlock &MBB, - MachineBasicBlock::iterator I) const override; - - /// emitProlog/emitEpilog - These methods insert prolog and epilog code into - /// the function. - void emitPrologue(MachineFunction &MF) const override; - void emitEpilogue(MachineFunction &MF, MachineBasicBlock &MBB) const override; - - int getFrameIndexOffset(const MachineFunction &MF, int FI) const override; - int getFrameIndexReference(const MachineFunction &MF, int FI, - unsigned &FrameReg) const override; - int resolveFrameIndexReference(const MachineFunction &MF, int FI, - unsigned &FrameReg, - bool PreferFP = false) const; - bool spillCalleeSavedRegisters(MachineBasicBlock &MBB, - MachineBasicBlock::iterator MI, - const std::vector &CSI, - const TargetRegisterInfo *TRI) const override; - - bool restoreCalleeSavedRegisters(MachineBasicBlock &MBB, - MachineBasicBlock::iterator MI, - const std::vector &CSI, - const TargetRegisterInfo *TRI) const override; - - /// \brief Can this function use the red zone for local allocations. - bool canUseRedZone(const MachineFunction &MF) const; - - bool hasFP(const MachineFunction &MF) const override; - bool hasReservedCallFrame(const MachineFunction &MF) const override; - - void processFunctionBeforeCalleeSavedScan(MachineFunction &MF, - RegScavenger *RS) const override; -}; - -} // End llvm namespace - -#endif diff --git a/lib/Target/ARM64/ARM64ISelDAGToDAG.cpp b/lib/Target/ARM64/ARM64ISelDAGToDAG.cpp deleted file mode 100644 index 23c45d414e2..00000000000 --- a/lib/Target/ARM64/ARM64ISelDAGToDAG.cpp +++ /dev/null @@ -1,3030 +0,0 @@ -//===-- ARM64ISelDAGToDAG.cpp - A dag to dag inst selector for ARM64 ------===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. 
-// -//===----------------------------------------------------------------------===// -// -// This file defines an instruction selector for the ARM64 target. -// -//===----------------------------------------------------------------------===// - -#include "ARM64TargetMachine.h" -#include "MCTargetDesc/ARM64AddressingModes.h" -#include "llvm/ADT/APSInt.h" -#include "llvm/CodeGen/SelectionDAGISel.h" -#include "llvm/IR/Function.h" // To access function attributes. -#include "llvm/IR/GlobalValue.h" -#include "llvm/IR/Intrinsics.h" -#include "llvm/Support/Debug.h" -#include "llvm/Support/ErrorHandling.h" -#include "llvm/Support/MathExtras.h" -#include "llvm/Support/raw_ostream.h" - -using namespace llvm; - -#define DEBUG_TYPE "arm64-isel" - -//===--------------------------------------------------------------------===// -/// ARM64DAGToDAGISel - ARM64 specific code to select ARM64 machine -/// instructions for SelectionDAG operations. -/// -namespace { - -class ARM64DAGToDAGISel : public SelectionDAGISel { - ARM64TargetMachine &TM; - - /// Subtarget - Keep a pointer to the ARM64Subtarget around so that we can - /// make the right decision when generating code for different targets. - const ARM64Subtarget *Subtarget; - - bool ForCodeSize; - -public: - explicit ARM64DAGToDAGISel(ARM64TargetMachine &tm, CodeGenOpt::Level OptLevel) - : SelectionDAGISel(tm, OptLevel), TM(tm), - Subtarget(nullptr), ForCodeSize(false) {} - - const char *getPassName() const override { - return "ARM64 Instruction Selection"; - } - - bool runOnMachineFunction(MachineFunction &MF) override { - AttributeSet FnAttrs = MF.getFunction()->getAttributes(); - ForCodeSize = - FnAttrs.hasAttribute(AttributeSet::FunctionIndex, - Attribute::OptimizeForSize) || - FnAttrs.hasAttribute(AttributeSet::FunctionIndex, Attribute::MinSize); - Subtarget = &TM.getSubtarget(); - return SelectionDAGISel::runOnMachineFunction(MF); - } - - SDNode *Select(SDNode *Node) override; - - /// SelectInlineAsmMemoryOperand - Implement addressing mode selection for - /// inline asm expressions. 
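For reference, a minimal sketch of how the ForCodeSize flag above is derived from function attributes, written against the IR-level API (attribute interfaces vary between LLVM releases, so treat this as an approximation rather than the exact calls used in the selector):

#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Module.h"
#include <cstdio>

using namespace llvm;

int main() {
  LLVMContext Ctx;
  Module M("demo", Ctx);
  FunctionType *FTy = FunctionType::get(Type::getVoidTy(Ctx), false);
  Function *F = Function::Create(FTy, Function::ExternalLinkage, "f", &M);
  F->addFnAttr(Attribute::OptimizeForSize);

  // ForCodeSize is set when either optsize or minsize is present.
  bool ForCodeSize = F->hasFnAttribute(Attribute::OptimizeForSize) ||
                     F->hasFnAttribute(Attribute::MinSize);
  std::printf("ForCodeSize = %d\n", ForCodeSize);
  return 0;
}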
- bool SelectInlineAsmMemoryOperand(const SDValue &Op, - char ConstraintCode, - std::vector &OutOps) override; - - SDNode *SelectMLAV64LaneV128(SDNode *N); - SDNode *SelectMULLV64LaneV128(unsigned IntNo, SDNode *N); - bool SelectArithExtendedRegister(SDValue N, SDValue &Reg, SDValue &Shift); - bool SelectArithImmed(SDValue N, SDValue &Val, SDValue &Shift); - bool SelectNegArithImmed(SDValue N, SDValue &Val, SDValue &Shift); - bool SelectArithShiftedRegister(SDValue N, SDValue &Reg, SDValue &Shift) { - return SelectShiftedRegister(N, false, Reg, Shift); - } - bool SelectLogicalShiftedRegister(SDValue N, SDValue &Reg, SDValue &Shift) { - return SelectShiftedRegister(N, true, Reg, Shift); - } - bool SelectAddrModeIndexed8(SDValue N, SDValue &Base, SDValue &OffImm) { - return SelectAddrModeIndexed(N, 1, Base, OffImm); - } - bool SelectAddrModeIndexed16(SDValue N, SDValue &Base, SDValue &OffImm) { - return SelectAddrModeIndexed(N, 2, Base, OffImm); - } - bool SelectAddrModeIndexed32(SDValue N, SDValue &Base, SDValue &OffImm) { - return SelectAddrModeIndexed(N, 4, Base, OffImm); - } - bool SelectAddrModeIndexed64(SDValue N, SDValue &Base, SDValue &OffImm) { - return SelectAddrModeIndexed(N, 8, Base, OffImm); - } - bool SelectAddrModeIndexed128(SDValue N, SDValue &Base, SDValue &OffImm) { - return SelectAddrModeIndexed(N, 16, Base, OffImm); - } - bool SelectAddrModeUnscaled8(SDValue N, SDValue &Base, SDValue &OffImm) { - return SelectAddrModeUnscaled(N, 1, Base, OffImm); - } - bool SelectAddrModeUnscaled16(SDValue N, SDValue &Base, SDValue &OffImm) { - return SelectAddrModeUnscaled(N, 2, Base, OffImm); - } - bool SelectAddrModeUnscaled32(SDValue N, SDValue &Base, SDValue &OffImm) { - return SelectAddrModeUnscaled(N, 4, Base, OffImm); - } - bool SelectAddrModeUnscaled64(SDValue N, SDValue &Base, SDValue &OffImm) { - return SelectAddrModeUnscaled(N, 8, Base, OffImm); - } - bool SelectAddrModeUnscaled128(SDValue N, SDValue &Base, SDValue &OffImm) { - return SelectAddrModeUnscaled(N, 16, Base, OffImm); - } - - template - bool SelectAddrModeWRO(SDValue N, SDValue &Base, SDValue &Offset, - SDValue &SignExtend, SDValue &DoShift) { - return SelectAddrModeWRO(N, Width / 8, Base, Offset, SignExtend, DoShift); - } - - template - bool SelectAddrModeXRO(SDValue N, SDValue &Base, SDValue &Offset, - SDValue &SignExtend, SDValue &DoShift) { - return SelectAddrModeXRO(N, Width / 8, Base, Offset, SignExtend, DoShift); - } - - - /// Form sequences of consecutive 64/128-bit registers for use in NEON - /// instructions making use of a vector-list (e.g. ldN, tbl). Vecs must have - /// between 1 and 4 elements. If it contains a single element that is returned - /// unchanged; otherwise a REG_SEQUENCE value is returned. - SDValue createDTuple(ArrayRef Vecs); - SDValue createQTuple(ArrayRef Vecs); - - /// Generic helper for the createDTuple/createQTuple - /// functions. Those should almost always be called instead. 
- SDValue createTuple(ArrayRef Vecs, unsigned RegClassIDs[], - unsigned SubRegs[]); - - SDNode *SelectTable(SDNode *N, unsigned NumVecs, unsigned Opc, bool isExt); - - SDNode *SelectIndexedLoad(SDNode *N, bool &Done); - - SDNode *SelectLoad(SDNode *N, unsigned NumVecs, unsigned Opc, - unsigned SubRegIdx); - SDNode *SelectPostLoad(SDNode *N, unsigned NumVecs, unsigned Opc, - unsigned SubRegIdx); - SDNode *SelectLoadLane(SDNode *N, unsigned NumVecs, unsigned Opc); - SDNode *SelectPostLoadLane(SDNode *N, unsigned NumVecs, unsigned Opc); - - SDNode *SelectStore(SDNode *N, unsigned NumVecs, unsigned Opc); - SDNode *SelectPostStore(SDNode *N, unsigned NumVecs, unsigned Opc); - SDNode *SelectStoreLane(SDNode *N, unsigned NumVecs, unsigned Opc); - SDNode *SelectPostStoreLane(SDNode *N, unsigned NumVecs, unsigned Opc); - - SDNode *SelectSIMDAddSubNarrowing(unsigned IntNo, SDNode *Node); - SDNode *SelectSIMDXtnNarrowing(unsigned IntNo, SDNode *Node); - - SDNode *SelectBitfieldExtractOp(SDNode *N); - SDNode *SelectBitfieldInsertOp(SDNode *N); - - SDNode *SelectLIBM(SDNode *N); - -// Include the pieces autogenerated from the target description. -#include "ARM64GenDAGISel.inc" - -private: - bool SelectShiftedRegister(SDValue N, bool AllowROR, SDValue &Reg, - SDValue &Shift); - bool SelectAddrModeIndexed(SDValue N, unsigned Size, SDValue &Base, - SDValue &OffImm); - bool SelectAddrModeUnscaled(SDValue N, unsigned Size, SDValue &Base, - SDValue &OffImm); - bool SelectAddrModeWRO(SDValue N, unsigned Size, SDValue &Base, - SDValue &Offset, SDValue &SignExtend, - SDValue &DoShift); - bool SelectAddrModeXRO(SDValue N, unsigned Size, SDValue &Base, - SDValue &Offset, SDValue &SignExtend, - SDValue &DoShift); - bool isWorthFolding(SDValue V) const; - bool SelectExtendedSHL(SDValue N, unsigned Size, bool WantExtend, - SDValue &Offset, SDValue &SignExtend); - - template - bool SelectCVTFixedPosOperand(SDValue N, SDValue &FixedPos) { - return SelectCVTFixedPosOperand(N, FixedPos, RegWidth); - } - - bool SelectCVTFixedPosOperand(SDValue N, SDValue &FixedPos, unsigned Width); -}; -} // end anonymous namespace - -/// isIntImmediate - This method tests to see if the node is a constant -/// operand. If so Imm will receive the 32-bit value. -static bool isIntImmediate(const SDNode *N, uint64_t &Imm) { - if (const ConstantSDNode *C = dyn_cast(N)) { - Imm = C->getZExtValue(); - return true; - } - return false; -} - -// isIntImmediate - This method tests to see if a constant operand. -// If so Imm will receive the value. -static bool isIntImmediate(SDValue N, uint64_t &Imm) { - return isIntImmediate(N.getNode(), Imm); -} - -// isOpcWithIntImmediate - This method tests to see if the node is a specific -// opcode and that it has a immediate integer right operand. -// If so Imm will receive the 32 bit value. -static bool isOpcWithIntImmediate(const SDNode *N, unsigned Opc, - uint64_t &Imm) { - return N->getOpcode() == Opc && - isIntImmediate(N->getOperand(1).getNode(), Imm); -} - -bool ARM64DAGToDAGISel::SelectInlineAsmMemoryOperand( - const SDValue &Op, char ConstraintCode, std::vector &OutOps) { - assert(ConstraintCode == 'm' && "unexpected asm memory constraint"); - // Require the address to be in a register. That is safe for all ARM64 - // variants and it is hard to do anything much smarter without knowing - // how the operand is used. - OutOps.push_back(Op); - return false; -} - -/// SelectArithImmed - Select an immediate value that can be represented as -/// a 12-bit value shifted left by either 0 or 12. 
If so, return true with -/// Val set to the 12-bit value and Shift set to the shifter operand. -bool ARM64DAGToDAGISel::SelectArithImmed(SDValue N, SDValue &Val, - SDValue &Shift) { - // This function is called from the addsub_shifted_imm ComplexPattern, - // which lists [imm] as the list of opcode it's interested in, however - // we still need to check whether the operand is actually an immediate - // here because the ComplexPattern opcode list is only used in - // root-level opcode matching. - if (!isa(N.getNode())) - return false; - - uint64_t Immed = cast(N.getNode())->getZExtValue(); - unsigned ShiftAmt; - - if (Immed >> 12 == 0) { - ShiftAmt = 0; - } else if ((Immed & 0xfff) == 0 && Immed >> 24 == 0) { - ShiftAmt = 12; - Immed = Immed >> 12; - } else - return false; - - unsigned ShVal = ARM64_AM::getShifterImm(ARM64_AM::LSL, ShiftAmt); - Val = CurDAG->getTargetConstant(Immed, MVT::i32); - Shift = CurDAG->getTargetConstant(ShVal, MVT::i32); - return true; -} - -/// SelectNegArithImmed - As above, but negates the value before trying to -/// select it. -bool ARM64DAGToDAGISel::SelectNegArithImmed(SDValue N, SDValue &Val, - SDValue &Shift) { - // This function is called from the addsub_shifted_imm ComplexPattern, - // which lists [imm] as the list of opcode it's interested in, however - // we still need to check whether the operand is actually an immediate - // here because the ComplexPattern opcode list is only used in - // root-level opcode matching. - if (!isa(N.getNode())) - return false; - - // The immediate operand must be a 24-bit zero-extended immediate. - uint64_t Immed = cast(N.getNode())->getZExtValue(); - - // This negation is almost always valid, but "cmp wN, #0" and "cmn wN, #0" - // have the opposite effect on the C flag, so this pattern mustn't match under - // those circumstances. - if (Immed == 0) - return false; - - if (N.getValueType() == MVT::i32) - Immed = ~((uint32_t)Immed) + 1; - else - Immed = ~Immed + 1ULL; - if (Immed & 0xFFFFFFFFFF000000ULL) - return false; - - Immed &= 0xFFFFFFULL; - return SelectArithImmed(CurDAG->getConstant(Immed, MVT::i32), Val, Shift); -} - -/// getShiftTypeForNode - Translate a shift node to the corresponding -/// ShiftType value. -static ARM64_AM::ShiftExtendType getShiftTypeForNode(SDValue N) { - switch (N.getOpcode()) { - default: - return ARM64_AM::InvalidShiftExtend; - case ISD::SHL: - return ARM64_AM::LSL; - case ISD::SRL: - return ARM64_AM::LSR; - case ISD::SRA: - return ARM64_AM::ASR; - case ISD::ROTR: - return ARM64_AM::ROR; - } -} - -/// \brief Determine wether it is worth to fold V into an extended register. -bool ARM64DAGToDAGISel::isWorthFolding(SDValue V) const { - // it hurts if the a value is used at least twice, unless we are optimizing - // for code size. - if (ForCodeSize || V.hasOneUse()) - return true; - return false; -} - -/// SelectShiftedRegister - Select a "shifted register" operand. If the value -/// is not shifted, set the Shift operand to default of "LSL 0". The logical -/// instructions allow the shifted register to be rotated, but the arithmetic -/// instructions do not. The AllowROR parameter specifies whether ROR is -/// supported. 
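A standalone restatement of the SelectArithImmed logic above, with a few example immediates (the helper name here is just for illustration):

#include <cstdint>
#include <cstdio>

// A usable ADD/SUB immediate is a 12-bit value optionally shifted left by 12.
static bool selectArithImmed(uint64_t Imm, unsigned &Val, unsigned &Shift) {
  if (Imm >> 12 == 0) {
    Val = Imm; Shift = 0; return true;
  }
  if ((Imm & 0xfff) == 0 && Imm >> 24 == 0) {
    Val = Imm >> 12; Shift = 12; return true;
  }
  return false;
}

int main() {
  uint64_t Tests[] = { 0xfff, 0x567000, 0x1001 };
  for (uint64_t Imm : Tests) {
    unsigned Val, Shift;
    if (selectArithImmed(Imm, Val, Shift))
      std::printf("%#llx -> #%#x, lsl #%u\n", (unsigned long long)Imm, Val, Shift);
    else
      std::printf("%#llx -> not encodable as an add/sub immediate\n",
                  (unsigned long long)Imm);
  }
  // SelectNegArithImmed applies the same test to the negated value, which is
  // how a cmp with an awkward immediate can become a cmn, and vice versa.
  return 0;
}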
-bool ARM64DAGToDAGISel::SelectShiftedRegister(SDValue N, bool AllowROR, - SDValue &Reg, SDValue &Shift) { - ARM64_AM::ShiftExtendType ShType = getShiftTypeForNode(N); - if (ShType == ARM64_AM::InvalidShiftExtend) - return false; - if (!AllowROR && ShType == ARM64_AM::ROR) - return false; - - if (ConstantSDNode *RHS = dyn_cast(N.getOperand(1))) { - unsigned BitSize = N.getValueType().getSizeInBits(); - unsigned Val = RHS->getZExtValue() & (BitSize - 1); - unsigned ShVal = ARM64_AM::getShifterImm(ShType, Val); - - Reg = N.getOperand(0); - Shift = CurDAG->getTargetConstant(ShVal, MVT::i32); - return isWorthFolding(N); - } - - return false; -} - -/// getExtendTypeForNode - Translate an extend node to the corresponding -/// ExtendType value. -static ARM64_AM::ShiftExtendType -getExtendTypeForNode(SDValue N, bool IsLoadStore = false) { - if (N.getOpcode() == ISD::SIGN_EXTEND || - N.getOpcode() == ISD::SIGN_EXTEND_INREG) { - EVT SrcVT; - if (N.getOpcode() == ISD::SIGN_EXTEND_INREG) - SrcVT = cast(N.getOperand(1))->getVT(); - else - SrcVT = N.getOperand(0).getValueType(); - - if (!IsLoadStore && SrcVT == MVT::i8) - return ARM64_AM::SXTB; - else if (!IsLoadStore && SrcVT == MVT::i16) - return ARM64_AM::SXTH; - else if (SrcVT == MVT::i32) - return ARM64_AM::SXTW; - assert(SrcVT != MVT::i64 && "extend from 64-bits?"); - - return ARM64_AM::InvalidShiftExtend; - } else if (N.getOpcode() == ISD::ZERO_EXTEND || - N.getOpcode() == ISD::ANY_EXTEND) { - EVT SrcVT = N.getOperand(0).getValueType(); - if (!IsLoadStore && SrcVT == MVT::i8) - return ARM64_AM::UXTB; - else if (!IsLoadStore && SrcVT == MVT::i16) - return ARM64_AM::UXTH; - else if (SrcVT == MVT::i32) - return ARM64_AM::UXTW; - assert(SrcVT != MVT::i64 && "extend from 64-bits?"); - - return ARM64_AM::InvalidShiftExtend; - } else if (N.getOpcode() == ISD::AND) { - ConstantSDNode *CSD = dyn_cast(N.getOperand(1)); - if (!CSD) - return ARM64_AM::InvalidShiftExtend; - uint64_t AndMask = CSD->getZExtValue(); - - switch (AndMask) { - default: - return ARM64_AM::InvalidShiftExtend; - case 0xFF: - return !IsLoadStore ? ARM64_AM::UXTB : ARM64_AM::InvalidShiftExtend; - case 0xFFFF: - return !IsLoadStore ? ARM64_AM::UXTH : ARM64_AM::InvalidShiftExtend; - case 0xFFFFFFFF: - return ARM64_AM::UXTW; - } - } - - return ARM64_AM::InvalidShiftExtend; -} - -// Helper for SelectMLAV64LaneV128 - Recognize high lane extracts. -static bool checkHighLaneIndex(SDNode *DL, SDValue &LaneOp, int &LaneIdx) { - if (DL->getOpcode() != ARM64ISD::DUPLANE16 && - DL->getOpcode() != ARM64ISD::DUPLANE32) - return false; - - SDValue SV = DL->getOperand(0); - if (SV.getOpcode() != ISD::INSERT_SUBVECTOR) - return false; - - SDValue EV = SV.getOperand(1); - if (EV.getOpcode() != ISD::EXTRACT_SUBVECTOR) - return false; - - ConstantSDNode *DLidx = cast(DL->getOperand(1).getNode()); - ConstantSDNode *EVidx = cast(EV.getOperand(1).getNode()); - LaneIdx = DLidx->getSExtValue() + EVidx->getSExtValue(); - LaneOp = EV.getOperand(0); - - return true; -} - -// Helper for SelectOpcV64LaneV128 - Recogzine operatinos where one operand is a -// high lane extract. -static bool checkV64LaneV128(SDValue Op0, SDValue Op1, SDValue &StdOp, - SDValue &LaneOp, int &LaneIdx) { - - if (!checkHighLaneIndex(Op0.getNode(), LaneOp, LaneIdx)) { - std::swap(Op0, Op1); - if (!checkHighLaneIndex(Op0.getNode(), LaneOp, LaneIdx)) - return false; - } - StdOp = Op1; - return true; -} - -/// SelectMLAV64LaneV128 - ARM64 supports vector MLAs where one multiplicand is -/// a lane in the upper half of a 128-bit vector. 
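The AND-mask cases in getExtendTypeForNode above can be summarised very compactly; this sketch restates the arithmetic (non load/store) variant with illustrative values:

#include <cstdint>
#include <cstdio>

// A zero-extending AND can be folded into an extended-register operand when
// its mask is exactly one of these.
static const char *extendForMask(uint64_t Mask) {
  switch (Mask) {
  case 0xFF:        return "uxtb";
  case 0xFFFF:      return "uxth";
  case 0xFFFFFFFF:  return "uxtw";
  default:          return nullptr;   // not an extend pattern
  }
}

int main() {
  uint64_t Masks[] = { 0xFF, 0xFFFF, 0xFFFFFFFF, 0x7F };
  for (uint64_t M : Masks) {
    const char *Ext = extendForMask(M);
    std::printf("and with %#llx -> %s\n", (unsigned long long)M,
                Ext ? Ext : "no extend folding");
  }
  return 0;
}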
Recognize and select this so -/// that we don't emit unnecessary lane extracts. -SDNode *ARM64DAGToDAGISel::SelectMLAV64LaneV128(SDNode *N) { - SDValue Op0 = N->getOperand(0); - SDValue Op1 = N->getOperand(1); - SDValue MLAOp1; // Will hold ordinary multiplicand for MLA. - SDValue MLAOp2; // Will hold lane-accessed multiplicand for MLA. - int LaneIdx = -1; // Will hold the lane index. - - if (Op1.getOpcode() != ISD::MUL || - !checkV64LaneV128(Op1.getOperand(0), Op1.getOperand(1), MLAOp1, MLAOp2, - LaneIdx)) { - std::swap(Op0, Op1); - if (Op1.getOpcode() != ISD::MUL || - !checkV64LaneV128(Op1.getOperand(0), Op1.getOperand(1), MLAOp1, MLAOp2, - LaneIdx)) - return nullptr; - } - - SDValue LaneIdxVal = CurDAG->getTargetConstant(LaneIdx, MVT::i64); - - SDValue Ops[] = { Op0, MLAOp1, MLAOp2, LaneIdxVal }; - - unsigned MLAOpc = ~0U; - - switch (N->getSimpleValueType(0).SimpleTy) { - default: - llvm_unreachable("Unrecognized MLA."); - case MVT::v4i16: - MLAOpc = ARM64::MLAv4i16_indexed; - break; - case MVT::v8i16: - MLAOpc = ARM64::MLAv8i16_indexed; - break; - case MVT::v2i32: - MLAOpc = ARM64::MLAv2i32_indexed; - break; - case MVT::v4i32: - MLAOpc = ARM64::MLAv4i32_indexed; - break; - } - - return CurDAG->getMachineNode(MLAOpc, SDLoc(N), N->getValueType(0), Ops); -} - -SDNode *ARM64DAGToDAGISel::SelectMULLV64LaneV128(unsigned IntNo, SDNode *N) { - SDValue SMULLOp0; - SDValue SMULLOp1; - int LaneIdx; - - if (!checkV64LaneV128(N->getOperand(1), N->getOperand(2), SMULLOp0, SMULLOp1, - LaneIdx)) - return nullptr; - - SDValue LaneIdxVal = CurDAG->getTargetConstant(LaneIdx, MVT::i64); - - SDValue Ops[] = { SMULLOp0, SMULLOp1, LaneIdxVal }; - - unsigned SMULLOpc = ~0U; - - if (IntNo == Intrinsic::arm64_neon_smull) { - switch (N->getSimpleValueType(0).SimpleTy) { - default: - llvm_unreachable("Unrecognized SMULL."); - case MVT::v4i32: - SMULLOpc = ARM64::SMULLv4i16_indexed; - break; - case MVT::v2i64: - SMULLOpc = ARM64::SMULLv2i32_indexed; - break; - } - } else if (IntNo == Intrinsic::arm64_neon_umull) { - switch (N->getSimpleValueType(0).SimpleTy) { - default: - llvm_unreachable("Unrecognized SMULL."); - case MVT::v4i32: - SMULLOpc = ARM64::UMULLv4i16_indexed; - break; - case MVT::v2i64: - SMULLOpc = ARM64::UMULLv2i32_indexed; - break; - } - } else - llvm_unreachable("Unrecognized intrinsic."); - - return CurDAG->getMachineNode(SMULLOpc, SDLoc(N), N->getValueType(0), Ops); -} - -/// Instructions that accept extend modifiers like UXTW expect the register -/// being extended to be a GPR32, but the incoming DAG might be acting on a -/// GPR64 (either via SEXT_INREG or AND). Extract the appropriate low bits if -/// this is the case. -static SDValue narrowIfNeeded(SelectionDAG *CurDAG, SDValue N) { - if (N.getValueType() == MVT::i32) - return N; - - SDValue SubReg = CurDAG->getTargetConstant(ARM64::sub_32, MVT::i32); - MachineSDNode *Node = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG, - SDLoc(N), MVT::i32, N, SubReg); - return SDValue(Node, 0); -} - - -/// SelectArithExtendedRegister - Select a "extended register" operand. This -/// operand folds in an extend followed by an optional left shift. 
-bool ARM64DAGToDAGISel::SelectArithExtendedRegister(SDValue N, SDValue &Reg, - SDValue &Shift) { - unsigned ShiftVal = 0; - ARM64_AM::ShiftExtendType Ext; - - if (N.getOpcode() == ISD::SHL) { - ConstantSDNode *CSD = dyn_cast(N.getOperand(1)); - if (!CSD) - return false; - ShiftVal = CSD->getZExtValue(); - if (ShiftVal > 4) - return false; - - Ext = getExtendTypeForNode(N.getOperand(0)); - if (Ext == ARM64_AM::InvalidShiftExtend) - return false; - - Reg = N.getOperand(0).getOperand(0); - } else { - Ext = getExtendTypeForNode(N); - if (Ext == ARM64_AM::InvalidShiftExtend) - return false; - - Reg = N.getOperand(0); - } - - // ARM64 mandates that the RHS of the operation must use the smallest - // register classs that could contain the size being extended from. Thus, - // if we're folding a (sext i8), we need the RHS to be a GPR32, even though - // there might not be an actual 32-bit value in the program. We can - // (harmlessly) synthesize one by injected an EXTRACT_SUBREG here. - assert(Ext != ARM64_AM::UXTX && Ext != ARM64_AM::SXTX); - Reg = narrowIfNeeded(CurDAG, Reg); - Shift = CurDAG->getTargetConstant(getArithExtendImm(Ext, ShiftVal), MVT::i32); - return isWorthFolding(N); -} - -/// SelectAddrModeIndexed - Select a "register plus scaled unsigned 12-bit -/// immediate" address. The "Size" argument is the size in bytes of the memory -/// reference, which determines the scale. -bool ARM64DAGToDAGISel::SelectAddrModeIndexed(SDValue N, unsigned Size, - SDValue &Base, SDValue &OffImm) { - const TargetLowering *TLI = getTargetLowering(); - if (N.getOpcode() == ISD::FrameIndex) { - int FI = cast(N)->getIndex(); - Base = CurDAG->getTargetFrameIndex(FI, TLI->getPointerTy()); - OffImm = CurDAG->getTargetConstant(0, MVT::i64); - return true; - } - - if (N.getOpcode() == ARM64ISD::ADDlow) { - GlobalAddressSDNode *GAN = - dyn_cast(N.getOperand(1).getNode()); - Base = N.getOperand(0); - OffImm = N.getOperand(1); - if (!GAN) - return true; - - const GlobalValue *GV = GAN->getGlobal(); - unsigned Alignment = GV->getAlignment(); - const DataLayout *DL = TLI->getDataLayout(); - if (Alignment == 0 && !Subtarget->isTargetDarwin()) - Alignment = DL->getABITypeAlignment(GV->getType()->getElementType()); - - if (Alignment >= Size) - return true; - } - - if (CurDAG->isBaseWithConstantOffset(N)) { - if (ConstantSDNode *RHS = dyn_cast(N.getOperand(1))) { - int64_t RHSC = (int64_t)RHS->getZExtValue(); - unsigned Scale = Log2_32(Size); - if ((RHSC & (Size - 1)) == 0 && RHSC >= 0 && RHSC < (0x1000 << Scale)) { - Base = N.getOperand(0); - if (Base.getOpcode() == ISD::FrameIndex) { - int FI = cast(Base)->getIndex(); - Base = CurDAG->getTargetFrameIndex(FI, TLI->getPointerTy()); - } - OffImm = CurDAG->getTargetConstant(RHSC >> Scale, MVT::i64); - return true; - } - } - } - - // Before falling back to our general case, check if the unscaled - // instructions can handle this. If so, that's preferable. - if (SelectAddrModeUnscaled(N, Size, Base, OffImm)) - return false; - - // Base only. The address will be materialized into a register before - // the memory is accessed. - // add x0, Xbase, #offset - // ldr x0, [x0] - Base = N; - OffImm = CurDAG->getTargetConstant(0, MVT::i64); - return true; -} - -/// SelectAddrModeUnscaled - Select a "register plus unscaled signed 9-bit -/// immediate" address. This should only match when there is an offset that -/// is not valid for a scaled immediate addressing mode. 
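A standalone restatement of the scaled-immediate test in SelectAddrModeIndexed above, worked through for an 8-byte access (function and variable names here are illustrative):

#include <cstdint>
#include <cstdio>

// The offset must be a non-negative multiple of the access size, below
// 4096 * Size; the encoded immediate is the offset divided by the size.
static bool fitsScaledImmediate(int64_t Off, unsigned Size, int64_t &Imm) {
  const unsigned Scale = __builtin_ctz(Size);      // log2(Size), Size a power of two
  if ((Off & (Size - 1)) != 0 || Off < 0 || Off >= (0x1000LL << Scale))
    return false;
  Imm = Off >> Scale;                              // encoded, scaled immediate
  return true;
}

int main() {
  int64_t Offsets[] = { 0, 4088, 32760, 17, 32768 };
  for (int64_t Off : Offsets) {
    int64_t Imm;
    if (fitsScaledImmediate(Off, 8, Imm))
      std::printf("[x0, #%lld] -> scaled imm %lld\n", (long long)Off, (long long)Imm);
    else
      std::printf("[x0, #%lld] -> needs unscaled form or a materialized address\n",
                  (long long)Off);
  }
  return 0;
}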
The "Size" argument -/// is the size in bytes of the memory reference, which is needed here to know -/// what is valid for a scaled immediate. -bool ARM64DAGToDAGISel::SelectAddrModeUnscaled(SDValue N, unsigned Size, - SDValue &Base, SDValue &OffImm) { - if (!CurDAG->isBaseWithConstantOffset(N)) - return false; - if (ConstantSDNode *RHS = dyn_cast(N.getOperand(1))) { - int64_t RHSC = RHS->getSExtValue(); - // If the offset is valid as a scaled immediate, don't match here. - if ((RHSC & (Size - 1)) == 0 && RHSC >= 0 && - RHSC < (0x1000 << Log2_32(Size))) - return false; - if (RHSC >= -256 && RHSC < 256) { - Base = N.getOperand(0); - if (Base.getOpcode() == ISD::FrameIndex) { - int FI = cast(Base)->getIndex(); - const TargetLowering *TLI = getTargetLowering(); - Base = CurDAG->getTargetFrameIndex(FI, TLI->getPointerTy()); - } - OffImm = CurDAG->getTargetConstant(RHSC, MVT::i64); - return true; - } - } - return false; -} - -static SDValue Widen(SelectionDAG *CurDAG, SDValue N) { - SDValue SubReg = CurDAG->getTargetConstant(ARM64::sub_32, MVT::i32); - SDValue ImpDef = SDValue( - CurDAG->getMachineNode(TargetOpcode::IMPLICIT_DEF, SDLoc(N), MVT::i64), - 0); - MachineSDNode *Node = CurDAG->getMachineNode( - TargetOpcode::INSERT_SUBREG, SDLoc(N), MVT::i64, ImpDef, N, SubReg); - return SDValue(Node, 0); -} - -/// \brief Check if the given SHL node (\p N), can be used to form an -/// extended register for an addressing mode. -bool ARM64DAGToDAGISel::SelectExtendedSHL(SDValue N, unsigned Size, - bool WantExtend, SDValue &Offset, - SDValue &SignExtend) { - assert(N.getOpcode() == ISD::SHL && "Invalid opcode."); - ConstantSDNode *CSD = dyn_cast(N.getOperand(1)); - if (!CSD || (CSD->getZExtValue() & 0x7) != CSD->getZExtValue()) - return false; - - if (WantExtend) { - ARM64_AM::ShiftExtendType Ext = getExtendTypeForNode(N.getOperand(0), true); - if (Ext == ARM64_AM::InvalidShiftExtend) - return false; - - Offset = narrowIfNeeded(CurDAG, N.getOperand(0).getOperand(0)); - SignExtend = CurDAG->getTargetConstant(Ext == ARM64_AM::SXTW, MVT::i32); - } else { - Offset = N.getOperand(0); - SignExtend = CurDAG->getTargetConstant(0, MVT::i32); - } - - unsigned LegalShiftVal = Log2_32(Size); - unsigned ShiftVal = CSD->getZExtValue(); - - if (ShiftVal != 0 && ShiftVal != LegalShiftVal) - return false; - - if (isWorthFolding(N)) - return true; - - return false; -} - -bool ARM64DAGToDAGISel::SelectAddrModeWRO(SDValue N, unsigned Size, - SDValue &Base, SDValue &Offset, - SDValue &SignExtend, - SDValue &DoShift) { - if (N.getOpcode() != ISD::ADD) - return false; - SDValue LHS = N.getOperand(0); - SDValue RHS = N.getOperand(1); - - // We don't want to match immediate adds here, because they are better lowered - // to the register-immediate addressing modes. - if (isa(LHS) || isa(RHS)) - return false; - - // Check if this particular node is reused in any non-memory related - // operation. If yes, do not try to fold this node into the address - // computation, since the computation will be kept. - const SDNode *Node = N.getNode(); - for (SDNode *UI : Node->uses()) { - if (!isa(*UI)) - return false; - } - - // Remember if it is worth folding N when it produces extended register. - bool IsExtendedRegisterWorthFolding = isWorthFolding(N); - - // Try to match a shifted extend on the RHS. 
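And the unscaled fallback above in the same style: offsets that miss the scaled form but sit in [-256, 256) still get a single load or store (a rough sketch, 8-byte access assumed):

#include <cstdio>

int main() {
  // Byte-granular 9-bit signed offsets for an 8-byte access: either not a
  // multiple of 8 or negative, but within [-256, 256).
  int Offsets[] = { 17, -8, -256, 260 };
  for (int Off : Offsets) {
    bool Unscaled = Off >= -256 && Off < 256;
    std::printf("#%d -> %s\n", Off,
                Unscaled ? "ldur/stur-style unscaled offset"
                         : "materialize the address separately");
  }
  return 0;
}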
- if (IsExtendedRegisterWorthFolding && RHS.getOpcode() == ISD::SHL && - SelectExtendedSHL(RHS, Size, true, Offset, SignExtend)) { - Base = LHS; - DoShift = CurDAG->getTargetConstant(true, MVT::i32); - return true; - } - - // Try to match a shifted extend on the LHS. - if (IsExtendedRegisterWorthFolding && LHS.getOpcode() == ISD::SHL && - SelectExtendedSHL(LHS, Size, true, Offset, SignExtend)) { - Base = RHS; - DoShift = CurDAG->getTargetConstant(true, MVT::i32); - return true; - } - - // There was no shift, whatever else we find. - DoShift = CurDAG->getTargetConstant(false, MVT::i32); - - ARM64_AM::ShiftExtendType Ext = ARM64_AM::InvalidShiftExtend; - // Try to match an unshifted extend on the LHS. - if (IsExtendedRegisterWorthFolding && - (Ext = getExtendTypeForNode(LHS, true)) != ARM64_AM::InvalidShiftExtend) { - Base = RHS; - Offset = narrowIfNeeded(CurDAG, LHS.getOperand(0)); - SignExtend = CurDAG->getTargetConstant(Ext == ARM64_AM::SXTW, MVT::i32); - if (isWorthFolding(LHS)) - return true; - } - - // Try to match an unshifted extend on the RHS. - if (IsExtendedRegisterWorthFolding && - (Ext = getExtendTypeForNode(RHS, true)) != ARM64_AM::InvalidShiftExtend) { - Base = LHS; - Offset = narrowIfNeeded(CurDAG, RHS.getOperand(0)); - SignExtend = CurDAG->getTargetConstant(Ext == ARM64_AM::SXTW, MVT::i32); - if (isWorthFolding(RHS)) - return true; - } - - return false; -} - -bool ARM64DAGToDAGISel::SelectAddrModeXRO(SDValue N, unsigned Size, - SDValue &Base, SDValue &Offset, - SDValue &SignExtend, - SDValue &DoShift) { - if (N.getOpcode() != ISD::ADD) - return false; - SDValue LHS = N.getOperand(0); - SDValue RHS = N.getOperand(1); - - // We don't want to match immediate adds here, because they are better lowered - // to the register-immediate addressing modes. - if (isa(LHS) || isa(RHS)) - return false; - - // Check if this particular node is reused in any non-memory related - // operation. If yes, do not try to fold this node into the address - // computation, since the computation will be kept. - const SDNode *Node = N.getNode(); - for (SDNode *UI : Node->uses()) { - if (!isa(*UI)) - return false; - } - - // Remember if it is worth folding N when it produces extended register. - bool IsExtendedRegisterWorthFolding = isWorthFolding(N); - - // Try to match a shifted extend on the RHS. - if (IsExtendedRegisterWorthFolding && RHS.getOpcode() == ISD::SHL && - SelectExtendedSHL(RHS, Size, false, Offset, SignExtend)) { - Base = LHS; - DoShift = CurDAG->getTargetConstant(true, MVT::i32); - return true; - } - - // Try to match a shifted extend on the LHS. - if (IsExtendedRegisterWorthFolding && LHS.getOpcode() == ISD::SHL && - SelectExtendedSHL(LHS, Size, false, Offset, SignExtend)) { - Base = RHS; - DoShift = CurDAG->getTargetConstant(true, MVT::i32); - return true; - } - - // Match any non-shifted, non-extend, non-immediate add expression. - Base = LHS; - Offset = RHS; - SignExtend = CurDAG->getTargetConstant(false, MVT::i32); - DoShift = CurDAG->getTargetConstant(false, MVT::i32); - // Reg1 + Reg2 is free: no check needed. 
- return true; -} - -SDValue ARM64DAGToDAGISel::createDTuple(ArrayRef Regs) { - static unsigned RegClassIDs[] = { ARM64::DDRegClassID, ARM64::DDDRegClassID, - ARM64::DDDDRegClassID }; - static unsigned SubRegs[] = { ARM64::dsub0, ARM64::dsub1, - ARM64::dsub2, ARM64::dsub3 }; - - return createTuple(Regs, RegClassIDs, SubRegs); -} - -SDValue ARM64DAGToDAGISel::createQTuple(ArrayRef Regs) { - static unsigned RegClassIDs[] = { ARM64::QQRegClassID, ARM64::QQQRegClassID, - ARM64::QQQQRegClassID }; - static unsigned SubRegs[] = { ARM64::qsub0, ARM64::qsub1, - ARM64::qsub2, ARM64::qsub3 }; - - return createTuple(Regs, RegClassIDs, SubRegs); -} - -SDValue ARM64DAGToDAGISel::createTuple(ArrayRef Regs, - unsigned RegClassIDs[], - unsigned SubRegs[]) { - // There's no special register-class for a vector-list of 1 element: it's just - // a vector. - if (Regs.size() == 1) - return Regs[0]; - - assert(Regs.size() >= 2 && Regs.size() <= 4); - - SDLoc DL(Regs[0].getNode()); - - SmallVector Ops; - - // First operand of REG_SEQUENCE is the desired RegClass. - Ops.push_back( - CurDAG->getTargetConstant(RegClassIDs[Regs.size() - 2], MVT::i32)); - - // Then we get pairs of source & subregister-position for the components. - for (unsigned i = 0; i < Regs.size(); ++i) { - Ops.push_back(Regs[i]); - Ops.push_back(CurDAG->getTargetConstant(SubRegs[i], MVT::i32)); - } - - SDNode *N = - CurDAG->getMachineNode(TargetOpcode::REG_SEQUENCE, DL, MVT::Untyped, Ops); - return SDValue(N, 0); -} - -SDNode *ARM64DAGToDAGISel::SelectTable(SDNode *N, unsigned NumVecs, - unsigned Opc, bool isExt) { - SDLoc dl(N); - EVT VT = N->getValueType(0); - - unsigned ExtOff = isExt; - - // Form a REG_SEQUENCE to force register allocation. - unsigned Vec0Off = ExtOff + 1; - SmallVector Regs(N->op_begin() + Vec0Off, - N->op_begin() + Vec0Off + NumVecs); - SDValue RegSeq = createQTuple(Regs); - - SmallVector Ops; - if (isExt) - Ops.push_back(N->getOperand(1)); - Ops.push_back(RegSeq); - Ops.push_back(N->getOperand(NumVecs + ExtOff + 1)); - return CurDAG->getMachineNode(Opc, dl, VT, Ops); -} - -SDNode *ARM64DAGToDAGISel::SelectIndexedLoad(SDNode *N, bool &Done) { - LoadSDNode *LD = cast(N); - if (LD->isUnindexed()) - return nullptr; - EVT VT = LD->getMemoryVT(); - EVT DstVT = N->getValueType(0); - ISD::MemIndexedMode AM = LD->getAddressingMode(); - bool IsPre = AM == ISD::PRE_INC || AM == ISD::PRE_DEC; - - // We're not doing validity checking here. That was done when checking - // if we should mark the load as indexed or not. We're just selecting - // the right instruction. - unsigned Opcode = 0; - - ISD::LoadExtType ExtType = LD->getExtensionType(); - bool InsertTo64 = false; - if (VT == MVT::i64) - Opcode = IsPre ? ARM64::LDRXpre : ARM64::LDRXpost; - else if (VT == MVT::i32) { - if (ExtType == ISD::NON_EXTLOAD) - Opcode = IsPre ? ARM64::LDRWpre : ARM64::LDRWpost; - else if (ExtType == ISD::SEXTLOAD) - Opcode = IsPre ? ARM64::LDRSWpre : ARM64::LDRSWpost; - else { - Opcode = IsPre ? ARM64::LDRWpre : ARM64::LDRWpost; - InsertTo64 = true; - // The result of the load is only i32. It's the subreg_to_reg that makes - // it into an i64. - DstVT = MVT::i32; - } - } else if (VT == MVT::i16) { - if (ExtType == ISD::SEXTLOAD) { - if (DstVT == MVT::i64) - Opcode = IsPre ? ARM64::LDRSHXpre : ARM64::LDRSHXpost; - else - Opcode = IsPre ? ARM64::LDRSHWpre : ARM64::LDRSHWpost; - } else { - Opcode = IsPre ? ARM64::LDRHHpre : ARM64::LDRHHpost; - InsertTo64 = DstVT == MVT::i64; - // The result of the load is only i32. 
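The operand layout that createTuple above builds for a REG_SEQUENCE can be spelled out concretely; this sketch just prints the shape for a three-vector D-register list (register names are illustrative):

#include <cstdio>

int main() {
  const char *Regs[] = { "D0", "D1", "D2" };
  const unsigned NumRegs = 3;
  const char *RegClassIDs[] = { "DDRegClassID", "DDDRegClassID", "DDDDRegClassID" };
  const char *SubRegs[] = { "dsub0", "dsub1", "dsub2", "dsub3" };

  // Class ID first, then alternating (register, sub-register index) pairs.
  std::printf("REG_SEQUENCE %s", RegClassIDs[NumRegs - 2]);
  for (unsigned i = 0; i < NumRegs; ++i)
    std::printf(", %s, %s", Regs[i], SubRegs[i]);
  std::printf("\n");
  return 0;
}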
It's the subreg_to_reg that makes - // it into an i64. - DstVT = MVT::i32; - } - } else if (VT == MVT::i8) { - if (ExtType == ISD::SEXTLOAD) { - if (DstVT == MVT::i64) - Opcode = IsPre ? ARM64::LDRSBXpre : ARM64::LDRSBXpost; - else - Opcode = IsPre ? ARM64::LDRSBWpre : ARM64::LDRSBWpost; - } else { - Opcode = IsPre ? ARM64::LDRBBpre : ARM64::LDRBBpost; - InsertTo64 = DstVT == MVT::i64; - // The result of the load is only i32. It's the subreg_to_reg that makes - // it into an i64. - DstVT = MVT::i32; - } - } else if (VT == MVT::f32) { - Opcode = IsPre ? ARM64::LDRSpre : ARM64::LDRSpost; - } else if (VT == MVT::f64 || VT.is64BitVector()) { - Opcode = IsPre ? ARM64::LDRDpre : ARM64::LDRDpost; - } else if (VT.is128BitVector()) { - Opcode = IsPre ? ARM64::LDRQpre : ARM64::LDRQpost; - } else - return nullptr; - SDValue Chain = LD->getChain(); - SDValue Base = LD->getBasePtr(); - ConstantSDNode *OffsetOp = cast(LD->getOffset()); - int OffsetVal = (int)OffsetOp->getZExtValue(); - SDValue Offset = CurDAG->getTargetConstant(OffsetVal, MVT::i64); - SDValue Ops[] = { Base, Offset, Chain }; - SDNode *Res = CurDAG->getMachineNode(Opcode, SDLoc(N), MVT::i64, DstVT, - MVT::Other, Ops); - // Either way, we're replacing the node, so tell the caller that. - Done = true; - SDValue LoadedVal = SDValue(Res, 1); - if (InsertTo64) { - SDValue SubReg = CurDAG->getTargetConstant(ARM64::sub_32, MVT::i32); - LoadedVal = - SDValue(CurDAG->getMachineNode(ARM64::SUBREG_TO_REG, SDLoc(N), MVT::i64, - CurDAG->getTargetConstant(0, MVT::i64), - LoadedVal, SubReg), - 0); - } - - ReplaceUses(SDValue(N, 0), LoadedVal); - ReplaceUses(SDValue(N, 1), SDValue(Res, 0)); - ReplaceUses(SDValue(N, 2), SDValue(Res, 2)); - - return nullptr; -} - -SDNode *ARM64DAGToDAGISel::SelectLoad(SDNode *N, unsigned NumVecs, unsigned Opc, - unsigned SubRegIdx) { - SDLoc dl(N); - EVT VT = N->getValueType(0); - SDValue Chain = N->getOperand(0); - - SmallVector Ops; - Ops.push_back(N->getOperand(2)); // Mem operand; - Ops.push_back(Chain); - - std::vector ResTys; - ResTys.push_back(MVT::Untyped); - ResTys.push_back(MVT::Other); - - SDNode *Ld = CurDAG->getMachineNode(Opc, dl, ResTys, Ops); - SDValue SuperReg = SDValue(Ld, 0); - for (unsigned i = 0; i < NumVecs; ++i) - ReplaceUses(SDValue(N, i), - CurDAG->getTargetExtractSubreg(SubRegIdx + i, dl, VT, SuperReg)); - - ReplaceUses(SDValue(N, NumVecs), SDValue(Ld, 1)); - return nullptr; -} - -SDNode *ARM64DAGToDAGISel::SelectPostLoad(SDNode *N, unsigned NumVecs, - unsigned Opc, unsigned SubRegIdx) { - SDLoc dl(N); - EVT VT = N->getValueType(0); - SDValue Chain = N->getOperand(0); - - SmallVector Ops; - Ops.push_back(N->getOperand(1)); // Mem operand - Ops.push_back(N->getOperand(2)); // Incremental - Ops.push_back(Chain); - - std::vector ResTys; - ResTys.push_back(MVT::i64); // Type of the write back register - ResTys.push_back(MVT::Untyped); - ResTys.push_back(MVT::Other); - - SDNode *Ld = CurDAG->getMachineNode(Opc, dl, ResTys, Ops); - - // Update uses of write back register - ReplaceUses(SDValue(N, NumVecs), SDValue(Ld, 0)); - - // Update uses of vector list - SDValue SuperReg = SDValue(Ld, 1); - if (NumVecs == 1) - ReplaceUses(SDValue(N, 0), SuperReg); - else - for (unsigned i = 0; i < NumVecs; ++i) - ReplaceUses(SDValue(N, i), - CurDAG->getTargetExtractSubreg(SubRegIdx + i, dl, VT, SuperReg)); - - // Update the chain - ReplaceUses(SDValue(N, NumVecs + 1), SDValue(Ld, 2)); - return nullptr; -} - -SDNode *ARM64DAGToDAGISel::SelectStore(SDNode *N, unsigned NumVecs, - unsigned Opc) { - SDLoc dl(N); - EVT VT 
= N->getOperand(2)->getValueType(0); - - // Form a REG_SEQUENCE to force register allocation. - bool Is128Bit = VT.getSizeInBits() == 128; - SmallVector Regs(N->op_begin() + 2, N->op_begin() + 2 + NumVecs); - SDValue RegSeq = Is128Bit ? createQTuple(Regs) : createDTuple(Regs); - - SmallVector Ops; - Ops.push_back(RegSeq); - Ops.push_back(N->getOperand(NumVecs + 2)); - Ops.push_back(N->getOperand(0)); - SDNode *St = CurDAG->getMachineNode(Opc, dl, N->getValueType(0), Ops); - - return St; -} - -SDNode *ARM64DAGToDAGISel::SelectPostStore(SDNode *N, unsigned NumVecs, - unsigned Opc) { - SDLoc dl(N); - EVT VT = N->getOperand(2)->getValueType(0); - SmallVector ResTys; - ResTys.push_back(MVT::i64); // Type of the write back register - ResTys.push_back(MVT::Other); // Type for the Chain - - // Form a REG_SEQUENCE to force register allocation. - bool Is128Bit = VT.getSizeInBits() == 128; - SmallVector Regs(N->op_begin() + 1, N->op_begin() + 1 + NumVecs); - SDValue RegSeq = Is128Bit ? createQTuple(Regs) : createDTuple(Regs); - - SmallVector Ops; - Ops.push_back(RegSeq); - Ops.push_back(N->getOperand(NumVecs + 1)); // base register - Ops.push_back(N->getOperand(NumVecs + 2)); // Incremental - Ops.push_back(N->getOperand(0)); // Chain - SDNode *St = CurDAG->getMachineNode(Opc, dl, ResTys, Ops); - - return St; -} - -/// WidenVector - Given a value in the V64 register class, produce the -/// equivalent value in the V128 register class. -class WidenVector { - SelectionDAG &DAG; - -public: - WidenVector(SelectionDAG &DAG) : DAG(DAG) {} - - SDValue operator()(SDValue V64Reg) { - EVT VT = V64Reg.getValueType(); - unsigned NarrowSize = VT.getVectorNumElements(); - MVT EltTy = VT.getVectorElementType().getSimpleVT(); - MVT WideTy = MVT::getVectorVT(EltTy, 2 * NarrowSize); - SDLoc DL(V64Reg); - - SDValue Undef = - SDValue(DAG.getMachineNode(TargetOpcode::IMPLICIT_DEF, DL, WideTy), 0); - return DAG.getTargetInsertSubreg(ARM64::dsub, DL, WideTy, Undef, V64Reg); - } -}; - -/// NarrowVector - Given a value in the V128 register class, produce the -/// equivalent value in the V64 register class. -static SDValue NarrowVector(SDValue V128Reg, SelectionDAG &DAG) { - EVT VT = V128Reg.getValueType(); - unsigned WideSize = VT.getVectorNumElements(); - MVT EltTy = VT.getVectorElementType().getSimpleVT(); - MVT NarrowTy = MVT::getVectorVT(EltTy, WideSize / 2); - - return DAG.getTargetExtractSubreg(ARM64::dsub, SDLoc(V128Reg), NarrowTy, - V128Reg); -} - -SDNode *ARM64DAGToDAGISel::SelectLoadLane(SDNode *N, unsigned NumVecs, - unsigned Opc) { - SDLoc dl(N); - EVT VT = N->getValueType(0); - bool Narrow = VT.getSizeInBits() == 64; - - // Form a REG_SEQUENCE to force register allocation. 
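The WidenVector / NarrowVector helpers above only change the lane count, never the element type (the D register is simply the low half of the Q register, reached via dsub); a trivial sketch of that correspondence:

#include <cstdio>

int main() {
  struct { const char *Narrow, *Wide; unsigned NarrowLanes; } Pairs[] = {
    { "v8i8",  "v16i8", 8 },
    { "v4i16", "v8i16", 4 },
    { "v2i32", "v4i32", 2 },
  };
  for (auto &P : Pairs)
    std::printf("%s (%u lanes) <-> %s (%u lanes)\n",
                P.Narrow, P.NarrowLanes, P.Wide, 2 * P.NarrowLanes);
  return 0;
}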
- SmallVector Regs(N->op_begin() + 2, N->op_begin() + 2 + NumVecs); - - if (Narrow) - std::transform(Regs.begin(), Regs.end(), Regs.begin(), - WidenVector(*CurDAG)); - - SDValue RegSeq = createQTuple(Regs); - - std::vector ResTys; - ResTys.push_back(MVT::Untyped); - ResTys.push_back(MVT::Other); - - unsigned LaneNo = - cast(N->getOperand(NumVecs + 2))->getZExtValue(); - - SmallVector Ops; - Ops.push_back(RegSeq); - Ops.push_back(CurDAG->getTargetConstant(LaneNo, MVT::i64)); - Ops.push_back(N->getOperand(NumVecs + 3)); - Ops.push_back(N->getOperand(0)); - SDNode *Ld = CurDAG->getMachineNode(Opc, dl, ResTys, Ops); - SDValue SuperReg = SDValue(Ld, 0); - - EVT WideVT = RegSeq.getOperand(1)->getValueType(0); - static unsigned QSubs[] = { ARM64::qsub0, ARM64::qsub1, ARM64::qsub2, - ARM64::qsub3 }; - for (unsigned i = 0; i < NumVecs; ++i) { - SDValue NV = CurDAG->getTargetExtractSubreg(QSubs[i], dl, WideVT, SuperReg); - if (Narrow) - NV = NarrowVector(NV, *CurDAG); - ReplaceUses(SDValue(N, i), NV); - } - - ReplaceUses(SDValue(N, NumVecs), SDValue(Ld, 1)); - - return Ld; -} - -SDNode *ARM64DAGToDAGISel::SelectPostLoadLane(SDNode *N, unsigned NumVecs, - unsigned Opc) { - SDLoc dl(N); - EVT VT = N->getValueType(0); - bool Narrow = VT.getSizeInBits() == 64; - - // Form a REG_SEQUENCE to force register allocation. - SmallVector Regs(N->op_begin() + 1, N->op_begin() + 1 + NumVecs); - - if (Narrow) - std::transform(Regs.begin(), Regs.end(), Regs.begin(), - WidenVector(*CurDAG)); - - SDValue RegSeq = createQTuple(Regs); - - std::vector ResTys; - ResTys.push_back(MVT::i64); // Type of the write back register - ResTys.push_back(MVT::Untyped); - ResTys.push_back(MVT::Other); - - unsigned LaneNo = - cast(N->getOperand(NumVecs + 1))->getZExtValue(); - - SmallVector Ops; - Ops.push_back(RegSeq); - Ops.push_back(CurDAG->getTargetConstant(LaneNo, MVT::i64)); // Lane Number - Ops.push_back(N->getOperand(NumVecs + 2)); // Base register - Ops.push_back(N->getOperand(NumVecs + 3)); // Incremental - Ops.push_back(N->getOperand(0)); - SDNode *Ld = CurDAG->getMachineNode(Opc, dl, ResTys, Ops); - - // Update uses of the write back register - ReplaceUses(SDValue(N, NumVecs), SDValue(Ld, 0)); - - // Update uses of the vector list - SDValue SuperReg = SDValue(Ld, 1); - if (NumVecs == 1) { - ReplaceUses(SDValue(N, 0), - Narrow ? NarrowVector(SuperReg, *CurDAG) : SuperReg); - } else { - EVT WideVT = RegSeq.getOperand(1)->getValueType(0); - static unsigned QSubs[] = { ARM64::qsub0, ARM64::qsub1, ARM64::qsub2, - ARM64::qsub3 }; - for (unsigned i = 0; i < NumVecs; ++i) { - SDValue NV = CurDAG->getTargetExtractSubreg(QSubs[i], dl, WideVT, - SuperReg); - if (Narrow) - NV = NarrowVector(NV, *CurDAG); - ReplaceUses(SDValue(N, i), NV); - } - } - - // Update the Chain - ReplaceUses(SDValue(N, NumVecs + 1), SDValue(Ld, 2)); - - return Ld; -} - -SDNode *ARM64DAGToDAGISel::SelectStoreLane(SDNode *N, unsigned NumVecs, - unsigned Opc) { - SDLoc dl(N); - EVT VT = N->getOperand(2)->getValueType(0); - bool Narrow = VT.getSizeInBits() == 64; - - // Form a REG_SEQUENCE to force register allocation. 
- SmallVector Regs(N->op_begin() + 2, N->op_begin() + 2 + NumVecs); - - if (Narrow) - std::transform(Regs.begin(), Regs.end(), Regs.begin(), - WidenVector(*CurDAG)); - - SDValue RegSeq = createQTuple(Regs); - - unsigned LaneNo = - cast(N->getOperand(NumVecs + 2))->getZExtValue(); - - SmallVector Ops; - Ops.push_back(RegSeq); - Ops.push_back(CurDAG->getTargetConstant(LaneNo, MVT::i64)); - Ops.push_back(N->getOperand(NumVecs + 3)); - Ops.push_back(N->getOperand(0)); - SDNode *St = CurDAG->getMachineNode(Opc, dl, MVT::Other, Ops); - - // Transfer memoperands. - MachineSDNode::mmo_iterator MemOp = MF->allocateMemRefsArray(1); - MemOp[0] = cast(N)->getMemOperand(); - cast(St)->setMemRefs(MemOp, MemOp + 1); - - return St; -} - -SDNode *ARM64DAGToDAGISel::SelectPostStoreLane(SDNode *N, unsigned NumVecs, - unsigned Opc) { - SDLoc dl(N); - EVT VT = N->getOperand(2)->getValueType(0); - bool Narrow = VT.getSizeInBits() == 64; - - // Form a REG_SEQUENCE to force register allocation. - SmallVector Regs(N->op_begin() + 1, N->op_begin() + 1 + NumVecs); - - if (Narrow) - std::transform(Regs.begin(), Regs.end(), Regs.begin(), - WidenVector(*CurDAG)); - - SDValue RegSeq = createQTuple(Regs); - - SmallVector ResTys; - ResTys.push_back(MVT::i64); // Type of the write back register - ResTys.push_back(MVT::Other); - - unsigned LaneNo = - cast(N->getOperand(NumVecs + 1))->getZExtValue(); - - SmallVector Ops; - Ops.push_back(RegSeq); - Ops.push_back(CurDAG->getTargetConstant(LaneNo, MVT::i64)); - Ops.push_back(N->getOperand(NumVecs + 2)); // Base Register - Ops.push_back(N->getOperand(NumVecs + 3)); // Incremental - Ops.push_back(N->getOperand(0)); - SDNode *St = CurDAG->getMachineNode(Opc, dl, ResTys, Ops); - - // Transfer memoperands. - MachineSDNode::mmo_iterator MemOp = MF->allocateMemRefsArray(1); - MemOp[0] = cast(N)->getMemOperand(); - cast(St)->setMemRefs(MemOp, MemOp + 1); - - return St; -} - -static bool isBitfieldExtractOpFromAnd(SelectionDAG *CurDAG, SDNode *N, - unsigned &Opc, SDValue &Opd0, - unsigned &LSB, unsigned &MSB, - unsigned NumberOfIgnoredLowBits, - bool BiggerPattern) { - assert(N->getOpcode() == ISD::AND && - "N must be a AND operation to call this function"); - - EVT VT = N->getValueType(0); - - // Here we can test the type of VT and return false when the type does not - // match, but since it is done prior to that call in the current context - // we turned that into an assert to avoid redundant code. - assert((VT == MVT::i32 || VT == MVT::i64) && - "Type checking must have been done before calling this function"); - - // FIXME: simplify-demanded-bits in DAGCombine will probably have - // changed the AND node to a 32-bit mask operation. We'll have to - // undo that as part of the transform here if we want to catch all - // the opportunities. - // Currently the NumberOfIgnoredLowBits argument helps to recover - // form these situations when matching bigger pattern (bitfield insert). - - // For unsigned extracts, check for a shift right and mask - uint64_t And_imm = 0; - if (!isOpcWithIntImmediate(N, ISD::AND, And_imm)) - return false; - - const SDNode *Op0 = N->getOperand(0).getNode(); - - // Because of simplify-demanded-bits in DAGCombine, the mask may have been - // simplified. Try to undo that - And_imm |= (1 << NumberOfIgnoredLowBits) - 1; - - // The immediate is a mask of the low bits iff imm & (imm+1) == 0 - if (And_imm & (And_imm + 1)) - return false; - - bool ClampMSB = false; - uint64_t Srl_imm = 0; - // Handle the SRL + ANY_EXTEND case. 
- if (VT == MVT::i64 && Op0->getOpcode() == ISD::ANY_EXTEND && - isOpcWithIntImmediate(Op0->getOperand(0).getNode(), ISD::SRL, Srl_imm)) { - // Extend the incoming operand of the SRL to 64-bit. - Opd0 = Widen(CurDAG, Op0->getOperand(0).getOperand(0)); - // Make sure to clamp the MSB so that we preserve the semantics of the - // original operations. - ClampMSB = true; - } else if (VT == MVT::i32 && Op0->getOpcode() == ISD::TRUNCATE && - isOpcWithIntImmediate(Op0->getOperand(0).getNode(), ISD::SRL, - Srl_imm)) { - // If the shift result was truncated, we can still combine them. - Opd0 = Op0->getOperand(0).getOperand(0); - - // Use the type of SRL node. - VT = Opd0->getValueType(0); - } else if (isOpcWithIntImmediate(Op0, ISD::SRL, Srl_imm)) { - Opd0 = Op0->getOperand(0); - } else if (BiggerPattern) { - // Let's pretend a 0 shift right has been performed. - // The resulting code will be at least as good as the original one - // plus it may expose more opportunities for bitfield insert pattern. - // FIXME: Currently we limit this to the bigger pattern, because - // some optimizations expect AND and not UBFM - Opd0 = N->getOperand(0); - } else - return false; - - assert((BiggerPattern || (Srl_imm > 0 && Srl_imm < VT.getSizeInBits())) && - "bad amount in shift node!"); - - LSB = Srl_imm; - MSB = Srl_imm + (VT == MVT::i32 ? CountTrailingOnes_32(And_imm) - : CountTrailingOnes_64(And_imm)) - - 1; - if (ClampMSB) - // Since we're moving the extend before the right shift operation, we need - // to clamp the MSB to make sure we don't shift in undefined bits instead of - // the zeros which would get shifted in with the original right shift - // operation. - MSB = MSB > 31 ? 31 : MSB; - - Opc = VT == MVT::i32 ? ARM64::UBFMWri : ARM64::UBFMXri; - return true; -} - -static bool isOneBitExtractOpFromShr(SDNode *N, unsigned &Opc, SDValue &Opd0, - unsigned &LSB, unsigned &MSB) { - // We are looking for the following pattern which basically extracts a single - // bit from the source value and places it in the LSB of the destination - // value, all other bits of the destination value or set to zero: - // - // Value2 = AND Value, MaskImm - // SRL Value2, ShiftImm - // - // with MaskImm >> ShiftImm == 1. - // - // This gets selected into a single UBFM: - // - // UBFM Value, ShiftImm, ShiftImm - // - - if (N->getOpcode() != ISD::SRL) - return false; - - uint64_t And_mask = 0; - if (!isOpcWithIntImmediate(N->getOperand(0).getNode(), ISD::AND, And_mask)) - return false; - - Opd0 = N->getOperand(0).getOperand(0); - - uint64_t Srl_imm = 0; - if (!isIntImmediate(N->getOperand(1), Srl_imm)) - return false; - - // Check whether we really have a one bit extract here. - if (And_mask >> Srl_imm == 0x1) { - if (N->getValueType(0) == MVT::i32) - Opc = ARM64::UBFMWri; - else - Opc = ARM64::UBFMXri; - - LSB = MSB = Srl_imm; - - return true; - } - - return false; -} - -static bool isBitfieldExtractOpFromShr(SDNode *N, unsigned &Opc, SDValue &Opd0, - unsigned &LSB, unsigned &MSB, - bool BiggerPattern) { - assert((N->getOpcode() == ISD::SRA || N->getOpcode() == ISD::SRL) && - "N must be a SHR/SRA operation to call this function"); - - EVT VT = N->getValueType(0); - - // Here we can test the type of VT and return false when the type does not - // match, but since it is done prior to that call in the current context - // we turned that into an assert to avoid redundant code. 
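A standalone restatement of the AND+SRL extract check above, with the UBFM operands it would produce; the helper name is illustrative, and the one-bit pattern handled by isOneBitExtractOpFromShr lands on the same single-bit UBFM:

#include <cstdint>
#include <cstdio>

// (x >> ShiftImm) & MaskImm becomes UBFM x, LSB, MSB when MaskImm is a mask
// of contiguous low bits.
static bool matchUbfm(uint64_t Mask, unsigned Shift, unsigned &LSB, unsigned &MSB) {
  if (Mask & (Mask + 1))        // not a mask of the low bits
    return false;
  LSB = Shift;
  MSB = Shift + __builtin_popcountll(Mask) - 1;
  return true;
}

int main() {
  unsigned LSB, MSB;
  // (x >> 7) & 0x1f: a five-bit field starting at bit 7.
  if (matchUbfm(0x1f, 7, LSB, MSB))
    std::printf("ubfm w0, w1, #%u, #%u\n", LSB, MSB);   // #7, #11
  // (x >> 3) & 1: a single-bit extract, so LSB == MSB.
  if (matchUbfm(0x1, 3, LSB, MSB))
    std::printf("ubfm w0, w1, #%u, #%u\n", LSB, MSB);   // #3, #3
  return 0;
}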
- assert((VT == MVT::i32 || VT == MVT::i64) && - "Type checking must have been done before calling this function"); - - // Check for AND + SRL doing a one bit extract. - if (isOneBitExtractOpFromShr(N, Opc, Opd0, LSB, MSB)) - return true; - - // we're looking for a shift of a shift - uint64_t Shl_imm = 0; - uint64_t Trunc_bits = 0; - if (isOpcWithIntImmediate(N->getOperand(0).getNode(), ISD::SHL, Shl_imm)) { - Opd0 = N->getOperand(0).getOperand(0); - } else if (VT == MVT::i32 && N->getOpcode() == ISD::SRL && - N->getOperand(0).getNode()->getOpcode() == ISD::TRUNCATE) { - // We are looking for a shift of truncate. Truncate from i64 to i32 could - // be considered as setting high 32 bits as zero. Our strategy here is to - // always generate 64bit UBFM. This consistency will help the CSE pass - // later find more redundancy. - Opd0 = N->getOperand(0).getOperand(0); - Trunc_bits = Opd0->getValueType(0).getSizeInBits() - VT.getSizeInBits(); - VT = Opd0->getValueType(0); - assert(VT == MVT::i64 && "the promoted type should be i64"); - } else if (BiggerPattern) { - // Let's pretend a 0 shift left has been performed. - // FIXME: Currently we limit this to the bigger pattern case, - // because some optimizations expect AND and not UBFM - Opd0 = N->getOperand(0); - } else - return false; - - assert(Shl_imm < VT.getSizeInBits() && "bad amount in shift node!"); - uint64_t Srl_imm = 0; - if (!isIntImmediate(N->getOperand(1), Srl_imm)) - return false; - - assert(Srl_imm > 0 && Srl_imm < VT.getSizeInBits() && - "bad amount in shift node!"); - // Note: The width operand is encoded as width-1. - unsigned Width = VT.getSizeInBits() - Trunc_bits - Srl_imm - 1; - int sLSB = Srl_imm - Shl_imm; - if (sLSB < 0) - return false; - LSB = sLSB; - MSB = LSB + Width; - // SRA requires a signed extraction - if (VT == MVT::i32) - Opc = N->getOpcode() == ISD::SRA ? ARM64::SBFMWri : ARM64::UBFMWri; - else - Opc = N->getOpcode() == ISD::SRA ? ARM64::SBFMXri : ARM64::UBFMXri; - return true; -} - -static bool isBitfieldExtractOp(SelectionDAG *CurDAG, SDNode *N, unsigned &Opc, - SDValue &Opd0, unsigned &LSB, unsigned &MSB, - unsigned NumberOfIgnoredLowBits = 0, - bool BiggerPattern = false) { - if (N->getValueType(0) != MVT::i32 && N->getValueType(0) != MVT::i64) - return false; - - switch (N->getOpcode()) { - default: - if (!N->isMachineOpcode()) - return false; - break; - case ISD::AND: - return isBitfieldExtractOpFromAnd(CurDAG, N, Opc, Opd0, LSB, MSB, - NumberOfIgnoredLowBits, BiggerPattern); - case ISD::SRL: - case ISD::SRA: - return isBitfieldExtractOpFromShr(N, Opc, Opd0, LSB, MSB, BiggerPattern); - } - - unsigned NOpc = N->getMachineOpcode(); - switch (NOpc) { - default: - return false; - case ARM64::SBFMWri: - case ARM64::UBFMWri: - case ARM64::SBFMXri: - case ARM64::UBFMXri: - Opc = NOpc; - Opd0 = N->getOperand(0); - LSB = cast(N->getOperand(1).getNode())->getZExtValue(); - MSB = cast(N->getOperand(2).getNode())->getZExtValue(); - return true; - } - // Unreachable - return false; -} - -SDNode *ARM64DAGToDAGISel::SelectBitfieldExtractOp(SDNode *N) { - unsigned Opc, LSB, MSB; - SDValue Opd0; - if (!isBitfieldExtractOp(CurDAG, N, Opc, Opd0, LSB, MSB)) - return nullptr; - - EVT VT = N->getValueType(0); - - // If the bit extract operation is 64bit but the original type is 32bit, we - // need to add one EXTRACT_SUBREG. 
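A worked instance of the SHL+SRA case above (ignoring the i32-truncate variant), showing how LSB and MSB fall out for a 32-bit value:

#include <cstdio>

int main() {
  // y = (x << 8) >> 20, with an arithmetic shift right, extracts a
  // sign-extended 12-bit field starting at bit 12 of x.
  const unsigned BitSize = 32, Shl = 8, Srl = 20;
  const unsigned Width = BitSize - Srl - 1;     // width - 1, as encoded
  const unsigned LSB = Srl - Shl;
  const unsigned MSB = LSB + Width;
  std::printf("sbfm w0, w1, #%u, #%u   // field x[%u:%u], sign-extended\n",
              LSB, MSB, MSB, LSB);              // #12, #23
  return 0;
}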
- if ((Opc == ARM64::SBFMXri || Opc == ARM64::UBFMXri) && VT == MVT::i32) { - SDValue Ops64[] = {Opd0, CurDAG->getTargetConstant(LSB, MVT::i64), - CurDAG->getTargetConstant(MSB, MVT::i64)}; - - SDNode *BFM = CurDAG->getMachineNode(Opc, SDLoc(N), MVT::i64, Ops64); - SDValue SubReg = CurDAG->getTargetConstant(ARM64::sub_32, MVT::i32); - MachineSDNode *Node = - CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG, SDLoc(N), MVT::i32, - SDValue(BFM, 0), SubReg); - return Node; - } - - SDValue Ops[] = {Opd0, CurDAG->getTargetConstant(LSB, VT), - CurDAG->getTargetConstant(MSB, VT)}; - return CurDAG->SelectNodeTo(N, Opc, VT, Ops); -} - -/// Does DstMask form a complementary pair with the mask provided by -/// BitsToBeInserted, suitable for use in a BFI instruction. Roughly speaking, -/// this asks whether DstMask zeroes precisely those bits that will be set by -/// the other half. -static bool isBitfieldDstMask(uint64_t DstMask, APInt BitsToBeInserted, - unsigned NumberOfIgnoredHighBits, EVT VT) { - assert((VT == MVT::i32 || VT == MVT::i64) && - "i32 or i64 mask type expected!"); - unsigned BitWidth = VT.getSizeInBits() - NumberOfIgnoredHighBits; - - APInt SignificantDstMask = APInt(BitWidth, DstMask); - APInt SignificantBitsToBeInserted = BitsToBeInserted.zextOrTrunc(BitWidth); - - return (SignificantDstMask & SignificantBitsToBeInserted) == 0 && - (SignificantDstMask | SignificantBitsToBeInserted).isAllOnesValue(); -} - -// Look for bits that will be useful for later uses. -// A bit is consider useless as soon as it is dropped and never used -// before it as been dropped. -// E.g., looking for useful bit of x -// 1. y = x & 0x7 -// 2. z = y >> 2 -// After #1, x useful bits are 0x7, then the useful bits of x, live through -// y. -// After #2, the useful bits of x are 0x4. -// However, if x is used on an unpredicatable instruction, then all its bits -// are useful. -// E.g. -// 1. y = x & 0x7 -// 2. z = y >> 2 -// 3. 
str x, [@x] -static void getUsefulBits(SDValue Op, APInt &UsefulBits, unsigned Depth = 0); - -static void getUsefulBitsFromAndWithImmediate(SDValue Op, APInt &UsefulBits, - unsigned Depth) { - uint64_t Imm = - cast(Op.getOperand(1).getNode())->getZExtValue(); - Imm = ARM64_AM::decodeLogicalImmediate(Imm, UsefulBits.getBitWidth()); - UsefulBits &= APInt(UsefulBits.getBitWidth(), Imm); - getUsefulBits(Op, UsefulBits, Depth + 1); -} - -static void getUsefulBitsFromBitfieldMoveOpd(SDValue Op, APInt &UsefulBits, - uint64_t Imm, uint64_t MSB, - unsigned Depth) { - // inherit the bitwidth value - APInt OpUsefulBits(UsefulBits); - OpUsefulBits = 1; - - if (MSB >= Imm) { - OpUsefulBits = OpUsefulBits.shl(MSB - Imm + 1); - --OpUsefulBits; - // The interesting part will be in the lower part of the result - getUsefulBits(Op, OpUsefulBits, Depth + 1); - // The interesting part was starting at Imm in the argument - OpUsefulBits = OpUsefulBits.shl(Imm); - } else { - OpUsefulBits = OpUsefulBits.shl(MSB + 1); - --OpUsefulBits; - // The interesting part will be shifted in the result - OpUsefulBits = OpUsefulBits.shl(OpUsefulBits.getBitWidth() - Imm); - getUsefulBits(Op, OpUsefulBits, Depth + 1); - // The interesting part was at zero in the argument - OpUsefulBits = OpUsefulBits.lshr(OpUsefulBits.getBitWidth() - Imm); - } - - UsefulBits &= OpUsefulBits; -} - -static void getUsefulBitsFromUBFM(SDValue Op, APInt &UsefulBits, - unsigned Depth) { - uint64_t Imm = - cast(Op.getOperand(1).getNode())->getZExtValue(); - uint64_t MSB = - cast(Op.getOperand(2).getNode())->getZExtValue(); - - getUsefulBitsFromBitfieldMoveOpd(Op, UsefulBits, Imm, MSB, Depth); -} - -static void getUsefulBitsFromOrWithShiftedReg(SDValue Op, APInt &UsefulBits, - unsigned Depth) { - uint64_t ShiftTypeAndValue = - cast(Op.getOperand(2).getNode())->getZExtValue(); - APInt Mask(UsefulBits); - Mask.clearAllBits(); - Mask.flipAllBits(); - - if (ARM64_AM::getShiftType(ShiftTypeAndValue) == ARM64_AM::LSL) { - // Shift Left - uint64_t ShiftAmt = ARM64_AM::getShiftValue(ShiftTypeAndValue); - Mask = Mask.shl(ShiftAmt); - getUsefulBits(Op, Mask, Depth + 1); - Mask = Mask.lshr(ShiftAmt); - } else if (ARM64_AM::getShiftType(ShiftTypeAndValue) == ARM64_AM::LSR) { - // Shift Right - // We do not handle ARM64_AM::ASR, because the sign will change the - // number of useful bits - uint64_t ShiftAmt = ARM64_AM::getShiftValue(ShiftTypeAndValue); - Mask = Mask.lshr(ShiftAmt); - getUsefulBits(Op, Mask, Depth + 1); - Mask = Mask.shl(ShiftAmt); - } else - return; - - UsefulBits &= Mask; -} - -static void getUsefulBitsFromBFM(SDValue Op, SDValue Orig, APInt &UsefulBits, - unsigned Depth) { - uint64_t Imm = - cast(Op.getOperand(2).getNode())->getZExtValue(); - uint64_t MSB = - cast(Op.getOperand(3).getNode())->getZExtValue(); - - if (Op.getOperand(1) == Orig) - return getUsefulBitsFromBitfieldMoveOpd(Op, UsefulBits, Imm, MSB, Depth); - - APInt OpUsefulBits(UsefulBits); - OpUsefulBits = 1; - - if (MSB >= Imm) { - OpUsefulBits = OpUsefulBits.shl(MSB - Imm + 1); - --OpUsefulBits; - UsefulBits &= ~OpUsefulBits; - getUsefulBits(Op, UsefulBits, Depth + 1); - } else { - OpUsefulBits = OpUsefulBits.shl(MSB + 1); - --OpUsefulBits; - UsefulBits = ~(OpUsefulBits.shl(OpUsefulBits.getBitWidth() - Imm)); - getUsefulBits(Op, UsefulBits, Depth + 1); - } -} - -static void getUsefulBitsForUse(SDNode *UserNode, APInt &UsefulBits, - SDValue Orig, unsigned Depth) { - - // Users of this node should have already been instruction selected - // FIXME: Can we turn that into an assert? 
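  // Editor's note: each helper below narrows the per-use copy of UsefulBits
  // to the bits of Orig that the recognized user (AND, UBFM, shifted ORR or
  // BFM) actually reads; for any other user the copy is left untouched, so
  // that use is conservatively assumed to need every bit.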
- if (!UserNode->isMachineOpcode()) - return; - - switch (UserNode->getMachineOpcode()) { - default: - return; - case ARM64::ANDSWri: - case ARM64::ANDSXri: - case ARM64::ANDWri: - case ARM64::ANDXri: - // We increment Depth only when we call the getUsefulBits - return getUsefulBitsFromAndWithImmediate(SDValue(UserNode, 0), UsefulBits, - Depth); - case ARM64::UBFMWri: - case ARM64::UBFMXri: - return getUsefulBitsFromUBFM(SDValue(UserNode, 0), UsefulBits, Depth); - - case ARM64::ORRWrs: - case ARM64::ORRXrs: - if (UserNode->getOperand(1) != Orig) - return; - return getUsefulBitsFromOrWithShiftedReg(SDValue(UserNode, 0), UsefulBits, - Depth); - case ARM64::BFMWri: - case ARM64::BFMXri: - return getUsefulBitsFromBFM(SDValue(UserNode, 0), Orig, UsefulBits, Depth); - } -} - -static void getUsefulBits(SDValue Op, APInt &UsefulBits, unsigned Depth) { - if (Depth >= 6) - return; - // Initialize UsefulBits - if (!Depth) { - unsigned Bitwidth = Op.getValueType().getScalarType().getSizeInBits(); - // At the beginning, assume every produced bits is useful - UsefulBits = APInt(Bitwidth, 0); - UsefulBits.flipAllBits(); - } - APInt UsersUsefulBits(UsefulBits.getBitWidth(), 0); - - for (SDNode *Node : Op.getNode()->uses()) { - // A use cannot produce useful bits - APInt UsefulBitsForUse = APInt(UsefulBits); - getUsefulBitsForUse(Node, UsefulBitsForUse, Op, Depth); - UsersUsefulBits |= UsefulBitsForUse; - } - // UsefulBits contains the produced bits that are meaningful for the - // current definition, thus a user cannot make a bit meaningful at - // this point - UsefulBits &= UsersUsefulBits; -} - -/// Create a machine node performing a notional SHL of Op by ShlAmount. If -/// ShlAmount is negative, do a (logical) right-shift instead. If ShlAmount is -/// 0, return Op unchanged. -static SDValue getLeftShift(SelectionDAG *CurDAG, SDValue Op, int ShlAmount) { - if (ShlAmount == 0) - return Op; - - EVT VT = Op.getValueType(); - unsigned BitWidth = VT.getSizeInBits(); - unsigned UBFMOpc = BitWidth == 32 ? ARM64::UBFMWri : ARM64::UBFMXri; - - SDNode *ShiftNode; - if (ShlAmount > 0) { - // LSL wD, wN, #Amt == UBFM wD, wN, #32-Amt, #31-Amt - ShiftNode = CurDAG->getMachineNode( - UBFMOpc, SDLoc(Op), VT, Op, - CurDAG->getTargetConstant(BitWidth - ShlAmount, VT), - CurDAG->getTargetConstant(BitWidth - 1 - ShlAmount, VT)); - } else { - // LSR wD, wN, #Amt == UBFM wD, wN, #Amt, #32-1 - assert(ShlAmount < 0 && "expected right shift"); - int ShrAmount = -ShlAmount; - ShiftNode = CurDAG->getMachineNode( - UBFMOpc, SDLoc(Op), VT, Op, CurDAG->getTargetConstant(ShrAmount, VT), - CurDAG->getTargetConstant(BitWidth - 1, VT)); - } - - return SDValue(ShiftNode, 0); -} - -/// Does this tree qualify as an attempt to move a bitfield into position, -/// essentially "(and (shl VAL, N), Mask)". -static bool isBitfieldPositioningOp(SelectionDAG *CurDAG, SDValue Op, - SDValue &Src, int &ShiftAmount, - int &MaskWidth) { - EVT VT = Op.getValueType(); - unsigned BitWidth = VT.getSizeInBits(); - (void)BitWidth; - assert(BitWidth == 32 || BitWidth == 64); - - APInt KnownZero, KnownOne; - CurDAG->computeKnownBits(Op, KnownZero, KnownOne); - - // Non-zero in the sense that they're not provably zero, which is the key - // point if we want to use this value - uint64_t NonZeroBits = (~KnownZero).getZExtValue(); - - // Discard a constant AND mask if present. It's safe because the node will - // already have been factored into the computeKnownBits calculation above. 
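  // Editor's example (not from the original sources) of the overall pattern
  // handled here:
  //   %t = shl i32 %val, 4
  //   %r = and i32 %t, 0xFF0
  // gives NonZeroBits == 0xFF0, hence ShiftAmount == 4 and MaskWidth == 8,
  // and Src is %val itself: no extra LSL/LSR is needed because the SHL
  // amount already matches the position of the mask.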
- uint64_t AndImm; - if (isOpcWithIntImmediate(Op.getNode(), ISD::AND, AndImm)) { - assert((~APInt(BitWidth, AndImm) & ~KnownZero) == 0); - Op = Op.getOperand(0); - } - - uint64_t ShlImm; - if (!isOpcWithIntImmediate(Op.getNode(), ISD::SHL, ShlImm)) - return false; - Op = Op.getOperand(0); - - if (!isShiftedMask_64(NonZeroBits)) - return false; - - ShiftAmount = countTrailingZeros(NonZeroBits); - MaskWidth = CountTrailingOnes_64(NonZeroBits >> ShiftAmount); - - // BFI encompasses sufficiently many nodes that it's worth inserting an extra - // LSL/LSR if the mask in NonZeroBits doesn't quite match up with the ISD::SHL - // amount. - Src = getLeftShift(CurDAG, Op, ShlImm - ShiftAmount); - - return true; -} - -// Given a OR operation, check if we have the following pattern -// ubfm c, b, imm, imm2 (or something that does the same jobs, see -// isBitfieldExtractOp) -// d = e & mask2 ; where mask is a binary sequence of 1..10..0 and -// countTrailingZeros(mask2) == imm2 - imm + 1 -// f = d | c -// if yes, given reference arguments will be update so that one can replace -// the OR instruction with: -// f = Opc Opd0, Opd1, LSB, MSB ; where Opc is a BFM, LSB = imm, and MSB = imm2 -static bool isBitfieldInsertOpFromOr(SDNode *N, unsigned &Opc, SDValue &Dst, - SDValue &Src, unsigned &ImmR, - unsigned &ImmS, SelectionDAG *CurDAG) { - assert(N->getOpcode() == ISD::OR && "Expect a OR operation"); - - // Set Opc - EVT VT = N->getValueType(0); - if (VT == MVT::i32) - Opc = ARM64::BFMWri; - else if (VT == MVT::i64) - Opc = ARM64::BFMXri; - else - return false; - - // Because of simplify-demanded-bits in DAGCombine, involved masks may not - // have the expected shape. Try to undo that. - APInt UsefulBits; - getUsefulBits(SDValue(N, 0), UsefulBits); - - unsigned NumberOfIgnoredLowBits = UsefulBits.countTrailingZeros(); - unsigned NumberOfIgnoredHighBits = UsefulBits.countLeadingZeros(); - - // OR is commutative, check both possibilities (does llvm provide a - // way to do that directely, e.g., via code matcher?) - SDValue OrOpd1Val = N->getOperand(1); - SDNode *OrOpd0 = N->getOperand(0).getNode(); - SDNode *OrOpd1 = N->getOperand(1).getNode(); - for (int i = 0; i < 2; - ++i, std::swap(OrOpd0, OrOpd1), OrOpd1Val = N->getOperand(0)) { - unsigned BFXOpc; - int DstLSB, Width; - if (isBitfieldExtractOp(CurDAG, OrOpd0, BFXOpc, Src, ImmR, ImmS, - NumberOfIgnoredLowBits, true)) { - // Check that the returned opcode is compatible with the pattern, - // i.e., same type and zero extended (U and not S) - if ((BFXOpc != ARM64::UBFMXri && VT == MVT::i64) || - (BFXOpc != ARM64::UBFMWri && VT == MVT::i32)) - continue; - - // Compute the width of the bitfield insertion - DstLSB = 0; - Width = ImmS - ImmR + 1; - // FIXME: This constraint is to catch bitfield insertion we may - // want to widen the pattern if we want to grab general bitfied - // move case - if (Width <= 0) - continue; - - // If the mask on the insertee is correct, we have a BFXIL operation. We - // can share the ImmR and ImmS values from the already-computed UBFM. - } else if (isBitfieldPositioningOp(CurDAG, SDValue(OrOpd0, 0), Src, - DstLSB, Width)) { - ImmR = (VT.getSizeInBits() - DstLSB) % VT.getSizeInBits(); - ImmS = Width - 1; - } else - continue; - - // Check the second part of the pattern - EVT VT = OrOpd1->getValueType(0); - assert((VT == MVT::i32 || VT == MVT::i64) && "unexpected OR operand"); - - // Compute the Known Zero for the candidate of the first operand. - // This allows to catch more general case than just looking for - // AND with imm. 
Indeed, simplify-demanded-bits may have removed - // the AND instruction because it proves it was useless. - APInt KnownZero, KnownOne; - CurDAG->computeKnownBits(OrOpd1Val, KnownZero, KnownOne); - - // Check if there is enough room for the second operand to appear - // in the first one - APInt BitsToBeInserted = - APInt::getBitsSet(KnownZero.getBitWidth(), DstLSB, DstLSB + Width); - - if ((BitsToBeInserted & ~KnownZero) != 0) - continue; - - // Set the first operand - uint64_t Imm; - if (isOpcWithIntImmediate(OrOpd1, ISD::AND, Imm) && - isBitfieldDstMask(Imm, BitsToBeInserted, NumberOfIgnoredHighBits, VT)) - // In that case, we can eliminate the AND - Dst = OrOpd1->getOperand(0); - else - // Maybe the AND has been removed by simplify-demanded-bits - // or is useful because it discards more bits - Dst = OrOpd1Val; - - // both parts match - return true; - } - - return false; -} - -SDNode *ARM64DAGToDAGISel::SelectBitfieldInsertOp(SDNode *N) { - if (N->getOpcode() != ISD::OR) - return nullptr; - - unsigned Opc; - unsigned LSB, MSB; - SDValue Opd0, Opd1; - - if (!isBitfieldInsertOpFromOr(N, Opc, Opd0, Opd1, LSB, MSB, CurDAG)) - return nullptr; - - EVT VT = N->getValueType(0); - SDValue Ops[] = { Opd0, - Opd1, - CurDAG->getTargetConstant(LSB, VT), - CurDAG->getTargetConstant(MSB, VT) }; - return CurDAG->SelectNodeTo(N, Opc, VT, Ops); -} - -SDNode *ARM64DAGToDAGISel::SelectLIBM(SDNode *N) { - EVT VT = N->getValueType(0); - unsigned Variant; - unsigned Opc; - unsigned FRINTXOpcs[] = { ARM64::FRINTXSr, ARM64::FRINTXDr }; - - if (VT == MVT::f32) { - Variant = 0; - } else if (VT == MVT::f64) { - Variant = 1; - } else - return nullptr; // Unrecognized argument type. Fall back on default codegen. - - // Pick the FRINTX variant needed to set the flags. - unsigned FRINTXOpc = FRINTXOpcs[Variant]; - - switch (N->getOpcode()) { - default: - return nullptr; // Unrecognized libm ISD node. Fall back on default codegen. - case ISD::FCEIL: { - unsigned FRINTPOpcs[] = { ARM64::FRINTPSr, ARM64::FRINTPDr }; - Opc = FRINTPOpcs[Variant]; - break; - } - case ISD::FFLOOR: { - unsigned FRINTMOpcs[] = { ARM64::FRINTMSr, ARM64::FRINTMDr }; - Opc = FRINTMOpcs[Variant]; - break; - } - case ISD::FTRUNC: { - unsigned FRINTZOpcs[] = { ARM64::FRINTZSr, ARM64::FRINTZDr }; - Opc = FRINTZOpcs[Variant]; - break; - } - case ISD::FROUND: { - unsigned FRINTAOpcs[] = { ARM64::FRINTASr, ARM64::FRINTADr }; - Opc = FRINTAOpcs[Variant]; - break; - } - } - - SDLoc dl(N); - SDValue In = N->getOperand(0); - SmallVector Ops; - Ops.push_back(In); - - if (!TM.Options.UnsafeFPMath) { - SDNode *FRINTX = CurDAG->getMachineNode(FRINTXOpc, dl, VT, MVT::Glue, In); - Ops.push_back(SDValue(FRINTX, 1)); - } - - return CurDAG->getMachineNode(Opc, dl, VT, Ops); -} - -bool -ARM64DAGToDAGISel::SelectCVTFixedPosOperand(SDValue N, SDValue &FixedPos, - unsigned RegWidth) { - APFloat FVal(0.0); - if (ConstantFPSDNode *CN = dyn_cast(N)) - FVal = CN->getValueAPF(); - else if (LoadSDNode *LN = dyn_cast(N)) { - // Some otherwise illegal constants are allowed in this case. - if (LN->getOperand(1).getOpcode() != ARM64ISD::ADDlow || - !isa(LN->getOperand(1)->getOperand(1))) - return false; - - ConstantPoolSDNode *CN = - dyn_cast(LN->getOperand(1)->getOperand(1)); - FVal = cast(CN->getConstVal())->getValueAPF(); - } else - return false; - - // An FCVT[SU] instruction performs: convertToInt(Val * 2^fbits) where fbits - // is between 1 and 32 for a destination w-register, or 1 and 64 for an - // x-register. 
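  // Editor's example (values chosen for illustration): if the constant being
  // matched here is 256.0 == 2^8, the conversion below is exact, IntVal is a
  // power of two, and FBits == 8, so the multiply can be folded into the
  // #8-fractional-bits form of the conversion instruction.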
- // - // By this stage, we've detected (fp_to_[su]int (fmul Val, THIS_NODE)) so we - // want THIS_NODE to be 2^fbits. This is much easier to deal with using - // integers. - bool IsExact; - - // fbits is between 1 and 64 in the worst-case, which means the fmul - // could have 2^64 as an actual operand. Need 65 bits of precision. - APSInt IntVal(65, true); - FVal.convertToInteger(IntVal, APFloat::rmTowardZero, &IsExact); - - // N.b. isPowerOf2 also checks for > 0. - if (!IsExact || !IntVal.isPowerOf2()) return false; - unsigned FBits = IntVal.logBase2(); - - // Checks above should have guaranteed that we haven't lost information in - // finding FBits, but it must still be in range. - if (FBits == 0 || FBits > RegWidth) return false; - - FixedPos = CurDAG->getTargetConstant(FBits, MVT::i32); - return true; -} - -SDNode *ARM64DAGToDAGISel::Select(SDNode *Node) { - // Dump information about the Node being selected - DEBUG(errs() << "Selecting: "); - DEBUG(Node->dump(CurDAG)); - DEBUG(errs() << "\n"); - - // If we have a custom node, we already have selected! - if (Node->isMachineOpcode()) { - DEBUG(errs() << "== "; Node->dump(CurDAG); errs() << "\n"); - Node->setNodeId(-1); - return nullptr; - } - - // Few custom selection stuff. - SDNode *ResNode = nullptr; - EVT VT = Node->getValueType(0); - - switch (Node->getOpcode()) { - default: - break; - - case ISD::ADD: - if (SDNode *I = SelectMLAV64LaneV128(Node)) - return I; - break; - - case ISD::LOAD: { - // Try to select as an indexed load. Fall through to normal processing - // if we can't. - bool Done = false; - SDNode *I = SelectIndexedLoad(Node, Done); - if (Done) - return I; - break; - } - - case ISD::SRL: - case ISD::AND: - case ISD::SRA: - if (SDNode *I = SelectBitfieldExtractOp(Node)) - return I; - break; - - case ISD::OR: - if (SDNode *I = SelectBitfieldInsertOp(Node)) - return I; - break; - - case ISD::EXTRACT_VECTOR_ELT: { - // Extracting lane zero is a special case where we can just use a plain - // EXTRACT_SUBREG instruction, which will become FMOV. This is easier for - // the rest of the compiler, especially the register allocator and copyi - // propagation, to reason about, so is preferred when it's possible to - // use it. - ConstantSDNode *LaneNode = cast(Node->getOperand(1)); - // Bail and use the default Select() for non-zero lanes. - if (LaneNode->getZExtValue() != 0) - break; - // If the element type is not the same as the result type, likewise - // bail and use the default Select(), as there's more to do than just - // a cross-class COPY. This catches extracts of i8 and i16 elements - // since they will need an explicit zext. - if (VT != Node->getOperand(0).getValueType().getVectorElementType()) - break; - unsigned SubReg; - switch (Node->getOperand(0) - .getValueType() - .getVectorElementType() - .getSizeInBits()) { - default: - assert(0 && "Unexpected vector element type!"); - case 64: - SubReg = ARM64::dsub; - break; - case 32: - SubReg = ARM64::ssub; - break; - case 16: // FALLTHROUGH - case 8: - llvm_unreachable("unexpected zext-requiring extract element!"); - } - SDValue Extract = CurDAG->getTargetExtractSubreg(SubReg, SDLoc(Node), VT, - Node->getOperand(0)); - DEBUG(dbgs() << "ISEL: Custom selection!\n=> "); - DEBUG(Extract->dumpr(CurDAG)); - DEBUG(dbgs() << "\n"); - return Extract.getNode(); - } - case ISD::Constant: { - // Materialize zero constants as copies from WZR/XZR. This allows - // the coalescer to propagate these into other instructions. 
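    // Editor's note: concretely, an i32 zero becomes a CopyFromReg of WZR and
    // an i64 zero a CopyFromReg of XZR (see the code below), so later users
    // can read the zero register directly rather than a materialized constant.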
- ConstantSDNode *ConstNode = cast(Node); - if (ConstNode->isNullValue()) { - if (VT == MVT::i32) - return CurDAG->getCopyFromReg(CurDAG->getEntryNode(), SDLoc(Node), - ARM64::WZR, MVT::i32).getNode(); - else if (VT == MVT::i64) - return CurDAG->getCopyFromReg(CurDAG->getEntryNode(), SDLoc(Node), - ARM64::XZR, MVT::i64).getNode(); - } - break; - } - - case ISD::FrameIndex: { - // Selects to ADDXri FI, 0 which in turn will become ADDXri SP, imm. - int FI = cast(Node)->getIndex(); - unsigned Shifter = ARM64_AM::getShifterImm(ARM64_AM::LSL, 0); - const TargetLowering *TLI = getTargetLowering(); - SDValue TFI = CurDAG->getTargetFrameIndex(FI, TLI->getPointerTy()); - SDValue Ops[] = { TFI, CurDAG->getTargetConstant(0, MVT::i32), - CurDAG->getTargetConstant(Shifter, MVT::i32) }; - return CurDAG->SelectNodeTo(Node, ARM64::ADDXri, MVT::i64, Ops); - } - case ISD::INTRINSIC_W_CHAIN: { - unsigned IntNo = cast(Node->getOperand(1))->getZExtValue(); - switch (IntNo) { - default: - break; - case Intrinsic::arm64_ldaxp: - case Intrinsic::arm64_ldxp: { - unsigned Op = - IntNo == Intrinsic::arm64_ldaxp ? ARM64::LDAXPX : ARM64::LDXPX; - SDValue MemAddr = Node->getOperand(2); - SDLoc DL(Node); - SDValue Chain = Node->getOperand(0); - - SDNode *Ld = CurDAG->getMachineNode(Op, DL, MVT::i64, MVT::i64, - MVT::Other, MemAddr, Chain); - - // Transfer memoperands. - MachineSDNode::mmo_iterator MemOp = MF->allocateMemRefsArray(1); - MemOp[0] = cast(Node)->getMemOperand(); - cast(Ld)->setMemRefs(MemOp, MemOp + 1); - return Ld; - } - case Intrinsic::arm64_stlxp: - case Intrinsic::arm64_stxp: { - unsigned Op = - IntNo == Intrinsic::arm64_stlxp ? ARM64::STLXPX : ARM64::STXPX; - SDLoc DL(Node); - SDValue Chain = Node->getOperand(0); - SDValue ValLo = Node->getOperand(2); - SDValue ValHi = Node->getOperand(3); - SDValue MemAddr = Node->getOperand(4); - - // Place arguments in the right order. - SmallVector Ops; - Ops.push_back(ValLo); - Ops.push_back(ValHi); - Ops.push_back(MemAddr); - Ops.push_back(Chain); - - SDNode *St = CurDAG->getMachineNode(Op, DL, MVT::i32, MVT::Other, Ops); - // Transfer memoperands. 
- MachineSDNode::mmo_iterator MemOp = MF->allocateMemRefsArray(1); - MemOp[0] = cast(Node)->getMemOperand(); - cast(St)->setMemRefs(MemOp, MemOp + 1); - - return St; - } - case Intrinsic::arm64_neon_ld1x2: - if (VT == MVT::v8i8) - return SelectLoad(Node, 2, ARM64::LD1Twov8b, ARM64::dsub0); - else if (VT == MVT::v16i8) - return SelectLoad(Node, 2, ARM64::LD1Twov16b, ARM64::qsub0); - else if (VT == MVT::v4i16) - return SelectLoad(Node, 2, ARM64::LD1Twov4h, ARM64::dsub0); - else if (VT == MVT::v8i16) - return SelectLoad(Node, 2, ARM64::LD1Twov8h, ARM64::qsub0); - else if (VT == MVT::v2i32 || VT == MVT::v2f32) - return SelectLoad(Node, 2, ARM64::LD1Twov2s, ARM64::dsub0); - else if (VT == MVT::v4i32 || VT == MVT::v4f32) - return SelectLoad(Node, 2, ARM64::LD1Twov4s, ARM64::qsub0); - else if (VT == MVT::v1i64 || VT == MVT::v1f64) - return SelectLoad(Node, 2, ARM64::LD1Twov1d, ARM64::dsub0); - else if (VT == MVT::v2i64 || VT == MVT::v2f64) - return SelectLoad(Node, 2, ARM64::LD1Twov2d, ARM64::qsub0); - break; - case Intrinsic::arm64_neon_ld1x3: - if (VT == MVT::v8i8) - return SelectLoad(Node, 3, ARM64::LD1Threev8b, ARM64::dsub0); - else if (VT == MVT::v16i8) - return SelectLoad(Node, 3, ARM64::LD1Threev16b, ARM64::qsub0); - else if (VT == MVT::v4i16) - return SelectLoad(Node, 3, ARM64::LD1Threev4h, ARM64::dsub0); - else if (VT == MVT::v8i16) - return SelectLoad(Node, 3, ARM64::LD1Threev8h, ARM64::qsub0); - else if (VT == MVT::v2i32 || VT == MVT::v2f32) - return SelectLoad(Node, 3, ARM64::LD1Threev2s, ARM64::dsub0); - else if (VT == MVT::v4i32 || VT == MVT::v4f32) - return SelectLoad(Node, 3, ARM64::LD1Threev4s, ARM64::qsub0); - else if (VT == MVT::v1i64 || VT == MVT::v1f64) - return SelectLoad(Node, 3, ARM64::LD1Threev1d, ARM64::dsub0); - else if (VT == MVT::v2i64 || VT == MVT::v2f64) - return SelectLoad(Node, 3, ARM64::LD1Threev2d, ARM64::qsub0); - break; - case Intrinsic::arm64_neon_ld1x4: - if (VT == MVT::v8i8) - return SelectLoad(Node, 4, ARM64::LD1Fourv8b, ARM64::dsub0); - else if (VT == MVT::v16i8) - return SelectLoad(Node, 4, ARM64::LD1Fourv16b, ARM64::qsub0); - else if (VT == MVT::v4i16) - return SelectLoad(Node, 4, ARM64::LD1Fourv4h, ARM64::dsub0); - else if (VT == MVT::v8i16) - return SelectLoad(Node, 4, ARM64::LD1Fourv8h, ARM64::qsub0); - else if (VT == MVT::v2i32 || VT == MVT::v2f32) - return SelectLoad(Node, 4, ARM64::LD1Fourv2s, ARM64::dsub0); - else if (VT == MVT::v4i32 || VT == MVT::v4f32) - return SelectLoad(Node, 4, ARM64::LD1Fourv4s, ARM64::qsub0); - else if (VT == MVT::v1i64 || VT == MVT::v1f64) - return SelectLoad(Node, 4, ARM64::LD1Fourv1d, ARM64::dsub0); - else if (VT == MVT::v2i64 || VT == MVT::v2f64) - return SelectLoad(Node, 4, ARM64::LD1Fourv2d, ARM64::qsub0); - break; - case Intrinsic::arm64_neon_ld2: - if (VT == MVT::v8i8) - return SelectLoad(Node, 2, ARM64::LD2Twov8b, ARM64::dsub0); - else if (VT == MVT::v16i8) - return SelectLoad(Node, 2, ARM64::LD2Twov16b, ARM64::qsub0); - else if (VT == MVT::v4i16) - return SelectLoad(Node, 2, ARM64::LD2Twov4h, ARM64::dsub0); - else if (VT == MVT::v8i16) - return SelectLoad(Node, 2, ARM64::LD2Twov8h, ARM64::qsub0); - else if (VT == MVT::v2i32 || VT == MVT::v2f32) - return SelectLoad(Node, 2, ARM64::LD2Twov2s, ARM64::dsub0); - else if (VT == MVT::v4i32 || VT == MVT::v4f32) - return SelectLoad(Node, 2, ARM64::LD2Twov4s, ARM64::qsub0); - else if (VT == MVT::v1i64 || VT == MVT::v1f64) - return SelectLoad(Node, 2, ARM64::LD1Twov1d, ARM64::dsub0); - else if (VT == MVT::v2i64 || VT == MVT::v2f64) - return SelectLoad(Node, 2, 
ARM64::LD2Twov2d, ARM64::qsub0); - break; - case Intrinsic::arm64_neon_ld3: - if (VT == MVT::v8i8) - return SelectLoad(Node, 3, ARM64::LD3Threev8b, ARM64::dsub0); - else if (VT == MVT::v16i8) - return SelectLoad(Node, 3, ARM64::LD3Threev16b, ARM64::qsub0); - else if (VT == MVT::v4i16) - return SelectLoad(Node, 3, ARM64::LD3Threev4h, ARM64::dsub0); - else if (VT == MVT::v8i16) - return SelectLoad(Node, 3, ARM64::LD3Threev8h, ARM64::qsub0); - else if (VT == MVT::v2i32 || VT == MVT::v2f32) - return SelectLoad(Node, 3, ARM64::LD3Threev2s, ARM64::dsub0); - else if (VT == MVT::v4i32 || VT == MVT::v4f32) - return SelectLoad(Node, 3, ARM64::LD3Threev4s, ARM64::qsub0); - else if (VT == MVT::v1i64 || VT == MVT::v1f64) - return SelectLoad(Node, 3, ARM64::LD1Threev1d, ARM64::dsub0); - else if (VT == MVT::v2i64 || VT == MVT::v2f64) - return SelectLoad(Node, 3, ARM64::LD3Threev2d, ARM64::qsub0); - break; - case Intrinsic::arm64_neon_ld4: - if (VT == MVT::v8i8) - return SelectLoad(Node, 4, ARM64::LD4Fourv8b, ARM64::dsub0); - else if (VT == MVT::v16i8) - return SelectLoad(Node, 4, ARM64::LD4Fourv16b, ARM64::qsub0); - else if (VT == MVT::v4i16) - return SelectLoad(Node, 4, ARM64::LD4Fourv4h, ARM64::dsub0); - else if (VT == MVT::v8i16) - return SelectLoad(Node, 4, ARM64::LD4Fourv8h, ARM64::qsub0); - else if (VT == MVT::v2i32 || VT == MVT::v2f32) - return SelectLoad(Node, 4, ARM64::LD4Fourv2s, ARM64::dsub0); - else if (VT == MVT::v4i32 || VT == MVT::v4f32) - return SelectLoad(Node, 4, ARM64::LD4Fourv4s, ARM64::qsub0); - else if (VT == MVT::v1i64 || VT == MVT::v1f64) - return SelectLoad(Node, 4, ARM64::LD1Fourv1d, ARM64::dsub0); - else if (VT == MVT::v2i64 || VT == MVT::v2f64) - return SelectLoad(Node, 4, ARM64::LD4Fourv2d, ARM64::qsub0); - break; - case Intrinsic::arm64_neon_ld2r: - if (VT == MVT::v8i8) - return SelectLoad(Node, 2, ARM64::LD2Rv8b, ARM64::dsub0); - else if (VT == MVT::v16i8) - return SelectLoad(Node, 2, ARM64::LD2Rv16b, ARM64::qsub0); - else if (VT == MVT::v4i16) - return SelectLoad(Node, 2, ARM64::LD2Rv4h, ARM64::dsub0); - else if (VT == MVT::v8i16) - return SelectLoad(Node, 2, ARM64::LD2Rv8h, ARM64::qsub0); - else if (VT == MVT::v2i32 || VT == MVT::v2f32) - return SelectLoad(Node, 2, ARM64::LD2Rv2s, ARM64::dsub0); - else if (VT == MVT::v4i32 || VT == MVT::v4f32) - return SelectLoad(Node, 2, ARM64::LD2Rv4s, ARM64::qsub0); - else if (VT == MVT::v1i64 || VT == MVT::v1f64) - return SelectLoad(Node, 2, ARM64::LD2Rv1d, ARM64::dsub0); - else if (VT == MVT::v2i64 || VT == MVT::v2f64) - return SelectLoad(Node, 2, ARM64::LD2Rv2d, ARM64::qsub0); - break; - case Intrinsic::arm64_neon_ld3r: - if (VT == MVT::v8i8) - return SelectLoad(Node, 3, ARM64::LD3Rv8b, ARM64::dsub0); - else if (VT == MVT::v16i8) - return SelectLoad(Node, 3, ARM64::LD3Rv16b, ARM64::qsub0); - else if (VT == MVT::v4i16) - return SelectLoad(Node, 3, ARM64::LD3Rv4h, ARM64::dsub0); - else if (VT == MVT::v8i16) - return SelectLoad(Node, 3, ARM64::LD3Rv8h, ARM64::qsub0); - else if (VT == MVT::v2i32 || VT == MVT::v2f32) - return SelectLoad(Node, 3, ARM64::LD3Rv2s, ARM64::dsub0); - else if (VT == MVT::v4i32 || VT == MVT::v4f32) - return SelectLoad(Node, 3, ARM64::LD3Rv4s, ARM64::qsub0); - else if (VT == MVT::v1i64 || VT == MVT::v1f64) - return SelectLoad(Node, 3, ARM64::LD3Rv1d, ARM64::dsub0); - else if (VT == MVT::v2i64 || VT == MVT::v2f64) - return SelectLoad(Node, 3, ARM64::LD3Rv2d, ARM64::qsub0); - break; - case Intrinsic::arm64_neon_ld4r: - if (VT == MVT::v8i8) - return SelectLoad(Node, 4, ARM64::LD4Rv8b, ARM64::dsub0); - else if (VT == 
MVT::v16i8) - return SelectLoad(Node, 4, ARM64::LD4Rv16b, ARM64::qsub0); - else if (VT == MVT::v4i16) - return SelectLoad(Node, 4, ARM64::LD4Rv4h, ARM64::dsub0); - else if (VT == MVT::v8i16) - return SelectLoad(Node, 4, ARM64::LD4Rv8h, ARM64::qsub0); - else if (VT == MVT::v2i32 || VT == MVT::v2f32) - return SelectLoad(Node, 4, ARM64::LD4Rv2s, ARM64::dsub0); - else if (VT == MVT::v4i32 || VT == MVT::v4f32) - return SelectLoad(Node, 4, ARM64::LD4Rv4s, ARM64::qsub0); - else if (VT == MVT::v1i64 || VT == MVT::v1f64) - return SelectLoad(Node, 4, ARM64::LD4Rv1d, ARM64::dsub0); - else if (VT == MVT::v2i64 || VT == MVT::v2f64) - return SelectLoad(Node, 4, ARM64::LD4Rv2d, ARM64::qsub0); - break; - case Intrinsic::arm64_neon_ld2lane: - if (VT == MVT::v16i8 || VT == MVT::v8i8) - return SelectLoadLane(Node, 2, ARM64::LD2i8); - else if (VT == MVT::v8i16 || VT == MVT::v4i16) - return SelectLoadLane(Node, 2, ARM64::LD2i16); - else if (VT == MVT::v4i32 || VT == MVT::v2i32 || VT == MVT::v4f32 || - VT == MVT::v2f32) - return SelectLoadLane(Node, 2, ARM64::LD2i32); - else if (VT == MVT::v2i64 || VT == MVT::v1i64 || VT == MVT::v2f64 || - VT == MVT::v1f64) - return SelectLoadLane(Node, 2, ARM64::LD2i64); - break; - case Intrinsic::arm64_neon_ld3lane: - if (VT == MVT::v16i8 || VT == MVT::v8i8) - return SelectLoadLane(Node, 3, ARM64::LD3i8); - else if (VT == MVT::v8i16 || VT == MVT::v4i16) - return SelectLoadLane(Node, 3, ARM64::LD3i16); - else if (VT == MVT::v4i32 || VT == MVT::v2i32 || VT == MVT::v4f32 || - VT == MVT::v2f32) - return SelectLoadLane(Node, 3, ARM64::LD3i32); - else if (VT == MVT::v2i64 || VT == MVT::v1i64 || VT == MVT::v2f64 || - VT == MVT::v1f64) - return SelectLoadLane(Node, 3, ARM64::LD3i64); - break; - case Intrinsic::arm64_neon_ld4lane: - if (VT == MVT::v16i8 || VT == MVT::v8i8) - return SelectLoadLane(Node, 4, ARM64::LD4i8); - else if (VT == MVT::v8i16 || VT == MVT::v4i16) - return SelectLoadLane(Node, 4, ARM64::LD4i16); - else if (VT == MVT::v4i32 || VT == MVT::v2i32 || VT == MVT::v4f32 || - VT == MVT::v2f32) - return SelectLoadLane(Node, 4, ARM64::LD4i32); - else if (VT == MVT::v2i64 || VT == MVT::v1i64 || VT == MVT::v2f64 || - VT == MVT::v1f64) - return SelectLoadLane(Node, 4, ARM64::LD4i64); - break; - } - } break; - case ISD::INTRINSIC_WO_CHAIN: { - unsigned IntNo = cast(Node->getOperand(0))->getZExtValue(); - switch (IntNo) { - default: - break; - case Intrinsic::arm64_neon_tbl2: - return SelectTable(Node, 2, VT == MVT::v8i8 ? ARM64::TBLv8i8Two - : ARM64::TBLv16i8Two, - false); - case Intrinsic::arm64_neon_tbl3: - return SelectTable(Node, 3, VT == MVT::v8i8 ? ARM64::TBLv8i8Three - : ARM64::TBLv16i8Three, - false); - case Intrinsic::arm64_neon_tbl4: - return SelectTable(Node, 4, VT == MVT::v8i8 ? ARM64::TBLv8i8Four - : ARM64::TBLv16i8Four, - false); - case Intrinsic::arm64_neon_tbx2: - return SelectTable(Node, 2, VT == MVT::v8i8 ? ARM64::TBXv8i8Two - : ARM64::TBXv16i8Two, - true); - case Intrinsic::arm64_neon_tbx3: - return SelectTable(Node, 3, VT == MVT::v8i8 ? ARM64::TBXv8i8Three - : ARM64::TBXv16i8Three, - true); - case Intrinsic::arm64_neon_tbx4: - return SelectTable(Node, 4, VT == MVT::v8i8 ? 
ARM64::TBXv8i8Four - : ARM64::TBXv16i8Four, - true); - case Intrinsic::arm64_neon_smull: - case Intrinsic::arm64_neon_umull: - if (SDNode *N = SelectMULLV64LaneV128(IntNo, Node)) - return N; - break; - } - break; - } - case ISD::INTRINSIC_VOID: { - unsigned IntNo = cast(Node->getOperand(1))->getZExtValue(); - if (Node->getNumOperands() >= 3) - VT = Node->getOperand(2)->getValueType(0); - switch (IntNo) { - default: - break; - case Intrinsic::arm64_neon_st1x2: { - if (VT == MVT::v8i8) - return SelectStore(Node, 2, ARM64::ST1Twov8b); - else if (VT == MVT::v16i8) - return SelectStore(Node, 2, ARM64::ST1Twov16b); - else if (VT == MVT::v4i16) - return SelectStore(Node, 2, ARM64::ST1Twov4h); - else if (VT == MVT::v8i16) - return SelectStore(Node, 2, ARM64::ST1Twov8h); - else if (VT == MVT::v2i32 || VT == MVT::v2f32) - return SelectStore(Node, 2, ARM64::ST1Twov2s); - else if (VT == MVT::v4i32 || VT == MVT::v4f32) - return SelectStore(Node, 2, ARM64::ST1Twov4s); - else if (VT == MVT::v2i64 || VT == MVT::v2f64) - return SelectStore(Node, 2, ARM64::ST1Twov2d); - else if (VT == MVT::v1i64 || VT == MVT::v1f64) - return SelectStore(Node, 2, ARM64::ST1Twov1d); - break; - } - case Intrinsic::arm64_neon_st1x3: { - if (VT == MVT::v8i8) - return SelectStore(Node, 3, ARM64::ST1Threev8b); - else if (VT == MVT::v16i8) - return SelectStore(Node, 3, ARM64::ST1Threev16b); - else if (VT == MVT::v4i16) - return SelectStore(Node, 3, ARM64::ST1Threev4h); - else if (VT == MVT::v8i16) - return SelectStore(Node, 3, ARM64::ST1Threev8h); - else if (VT == MVT::v2i32 || VT == MVT::v2f32) - return SelectStore(Node, 3, ARM64::ST1Threev2s); - else if (VT == MVT::v4i32 || VT == MVT::v4f32) - return SelectStore(Node, 3, ARM64::ST1Threev4s); - else if (VT == MVT::v2i64 || VT == MVT::v2f64) - return SelectStore(Node, 3, ARM64::ST1Threev2d); - else if (VT == MVT::v1i64 || VT == MVT::v1f64) - return SelectStore(Node, 3, ARM64::ST1Threev1d); - break; - } - case Intrinsic::arm64_neon_st1x4: { - if (VT == MVT::v8i8) - return SelectStore(Node, 4, ARM64::ST1Fourv8b); - else if (VT == MVT::v16i8) - return SelectStore(Node, 4, ARM64::ST1Fourv16b); - else if (VT == MVT::v4i16) - return SelectStore(Node, 4, ARM64::ST1Fourv4h); - else if (VT == MVT::v8i16) - return SelectStore(Node, 4, ARM64::ST1Fourv8h); - else if (VT == MVT::v2i32 || VT == MVT::v2f32) - return SelectStore(Node, 4, ARM64::ST1Fourv2s); - else if (VT == MVT::v4i32 || VT == MVT::v4f32) - return SelectStore(Node, 4, ARM64::ST1Fourv4s); - else if (VT == MVT::v2i64 || VT == MVT::v2f64) - return SelectStore(Node, 4, ARM64::ST1Fourv2d); - else if (VT == MVT::v1i64 || VT == MVT::v1f64) - return SelectStore(Node, 4, ARM64::ST1Fourv1d); - break; - } - case Intrinsic::arm64_neon_st2: { - if (VT == MVT::v8i8) - return SelectStore(Node, 2, ARM64::ST2Twov8b); - else if (VT == MVT::v16i8) - return SelectStore(Node, 2, ARM64::ST2Twov16b); - else if (VT == MVT::v4i16) - return SelectStore(Node, 2, ARM64::ST2Twov4h); - else if (VT == MVT::v8i16) - return SelectStore(Node, 2, ARM64::ST2Twov8h); - else if (VT == MVT::v2i32 || VT == MVT::v2f32) - return SelectStore(Node, 2, ARM64::ST2Twov2s); - else if (VT == MVT::v4i32 || VT == MVT::v4f32) - return SelectStore(Node, 2, ARM64::ST2Twov4s); - else if (VT == MVT::v2i64 || VT == MVT::v2f64) - return SelectStore(Node, 2, ARM64::ST2Twov2d); - else if (VT == MVT::v1i64 || VT == MVT::v1f64) - return SelectStore(Node, 2, ARM64::ST1Twov1d); - break; - } - case Intrinsic::arm64_neon_st3: { - if (VT == MVT::v8i8) - return SelectStore(Node, 3, 
ARM64::ST3Threev8b); - else if (VT == MVT::v16i8) - return SelectStore(Node, 3, ARM64::ST3Threev16b); - else if (VT == MVT::v4i16) - return SelectStore(Node, 3, ARM64::ST3Threev4h); - else if (VT == MVT::v8i16) - return SelectStore(Node, 3, ARM64::ST3Threev8h); - else if (VT == MVT::v2i32 || VT == MVT::v2f32) - return SelectStore(Node, 3, ARM64::ST3Threev2s); - else if (VT == MVT::v4i32 || VT == MVT::v4f32) - return SelectStore(Node, 3, ARM64::ST3Threev4s); - else if (VT == MVT::v2i64 || VT == MVT::v2f64) - return SelectStore(Node, 3, ARM64::ST3Threev2d); - else if (VT == MVT::v1i64 || VT == MVT::v1f64) - return SelectStore(Node, 3, ARM64::ST1Threev1d); - break; - } - case Intrinsic::arm64_neon_st4: { - if (VT == MVT::v8i8) - return SelectStore(Node, 4, ARM64::ST4Fourv8b); - else if (VT == MVT::v16i8) - return SelectStore(Node, 4, ARM64::ST4Fourv16b); - else if (VT == MVT::v4i16) - return SelectStore(Node, 4, ARM64::ST4Fourv4h); - else if (VT == MVT::v8i16) - return SelectStore(Node, 4, ARM64::ST4Fourv8h); - else if (VT == MVT::v2i32 || VT == MVT::v2f32) - return SelectStore(Node, 4, ARM64::ST4Fourv2s); - else if (VT == MVT::v4i32 || VT == MVT::v4f32) - return SelectStore(Node, 4, ARM64::ST4Fourv4s); - else if (VT == MVT::v2i64 || VT == MVT::v2f64) - return SelectStore(Node, 4, ARM64::ST4Fourv2d); - else if (VT == MVT::v1i64 || VT == MVT::v1f64) - return SelectStore(Node, 4, ARM64::ST1Fourv1d); - break; - } - case Intrinsic::arm64_neon_st2lane: { - if (VT == MVT::v16i8 || VT == MVT::v8i8) - return SelectStoreLane(Node, 2, ARM64::ST2i8); - else if (VT == MVT::v8i16 || VT == MVT::v4i16) - return SelectStoreLane(Node, 2, ARM64::ST2i16); - else if (VT == MVT::v4i32 || VT == MVT::v2i32 || VT == MVT::v4f32 || - VT == MVT::v2f32) - return SelectStoreLane(Node, 2, ARM64::ST2i32); - else if (VT == MVT::v2i64 || VT == MVT::v1i64 || VT == MVT::v2f64 || - VT == MVT::v1f64) - return SelectStoreLane(Node, 2, ARM64::ST2i64); - break; - } - case Intrinsic::arm64_neon_st3lane: { - if (VT == MVT::v16i8 || VT == MVT::v8i8) - return SelectStoreLane(Node, 3, ARM64::ST3i8); - else if (VT == MVT::v8i16 || VT == MVT::v4i16) - return SelectStoreLane(Node, 3, ARM64::ST3i16); - else if (VT == MVT::v4i32 || VT == MVT::v2i32 || VT == MVT::v4f32 || - VT == MVT::v2f32) - return SelectStoreLane(Node, 3, ARM64::ST3i32); - else if (VT == MVT::v2i64 || VT == MVT::v1i64 || VT == MVT::v2f64 || - VT == MVT::v1f64) - return SelectStoreLane(Node, 3, ARM64::ST3i64); - break; - } - case Intrinsic::arm64_neon_st4lane: { - if (VT == MVT::v16i8 || VT == MVT::v8i8) - return SelectStoreLane(Node, 4, ARM64::ST4i8); - else if (VT == MVT::v8i16 || VT == MVT::v4i16) - return SelectStoreLane(Node, 4, ARM64::ST4i16); - else if (VT == MVT::v4i32 || VT == MVT::v2i32 || VT == MVT::v4f32 || - VT == MVT::v2f32) - return SelectStoreLane(Node, 4, ARM64::ST4i32); - else if (VT == MVT::v2i64 || VT == MVT::v1i64 || VT == MVT::v2f64 || - VT == MVT::v1f64) - return SelectStoreLane(Node, 4, ARM64::ST4i64); - break; - } - } - } - case ARM64ISD::LD2post: { - if (VT == MVT::v8i8) - return SelectPostLoad(Node, 2, ARM64::LD2Twov8b_POST, ARM64::dsub0); - else if (VT == MVT::v16i8) - return SelectPostLoad(Node, 2, ARM64::LD2Twov16b_POST, ARM64::qsub0); - else if (VT == MVT::v4i16) - return SelectPostLoad(Node, 2, ARM64::LD2Twov4h_POST, ARM64::dsub0); - else if (VT == MVT::v8i16) - return SelectPostLoad(Node, 2, ARM64::LD2Twov8h_POST, ARM64::qsub0); - else if (VT == MVT::v2i32 || VT == MVT::v2f32) - return SelectPostLoad(Node, 2, ARM64::LD2Twov2s_POST, 
ARM64::dsub0); - else if (VT == MVT::v4i32 || VT == MVT::v4f32) - return SelectPostLoad(Node, 2, ARM64::LD2Twov4s_POST, ARM64::qsub0); - else if (VT == MVT::v1i64 || VT == MVT::v1f64) - return SelectPostLoad(Node, 2, ARM64::LD1Twov1d_POST, ARM64::dsub0); - else if (VT == MVT::v2i64 || VT == MVT::v2f64) - return SelectPostLoad(Node, 2, ARM64::LD2Twov2d_POST, ARM64::qsub0); - break; - } - case ARM64ISD::LD3post: { - if (VT == MVT::v8i8) - return SelectPostLoad(Node, 3, ARM64::LD3Threev8b_POST, ARM64::dsub0); - else if (VT == MVT::v16i8) - return SelectPostLoad(Node, 3, ARM64::LD3Threev16b_POST, ARM64::qsub0); - else if (VT == MVT::v4i16) - return SelectPostLoad(Node, 3, ARM64::LD3Threev4h_POST, ARM64::dsub0); - else if (VT == MVT::v8i16) - return SelectPostLoad(Node, 3, ARM64::LD3Threev8h_POST, ARM64::qsub0); - else if (VT == MVT::v2i32 || VT == MVT::v2f32) - return SelectPostLoad(Node, 3, ARM64::LD3Threev2s_POST, ARM64::dsub0); - else if (VT == MVT::v4i32 || VT == MVT::v4f32) - return SelectPostLoad(Node, 3, ARM64::LD3Threev4s_POST, ARM64::qsub0); - else if (VT == MVT::v1i64 || VT == MVT::v1f64) - return SelectPostLoad(Node, 3, ARM64::LD1Threev1d_POST, ARM64::dsub0); - else if (VT == MVT::v2i64 || VT == MVT::v2f64) - return SelectPostLoad(Node, 3, ARM64::LD3Threev2d_POST, ARM64::qsub0); - break; - } - case ARM64ISD::LD4post: { - if (VT == MVT::v8i8) - return SelectPostLoad(Node, 4, ARM64::LD4Fourv8b_POST, ARM64::dsub0); - else if (VT == MVT::v16i8) - return SelectPostLoad(Node, 4, ARM64::LD4Fourv16b_POST, ARM64::qsub0); - else if (VT == MVT::v4i16) - return SelectPostLoad(Node, 4, ARM64::LD4Fourv4h_POST, ARM64::dsub0); - else if (VT == MVT::v8i16) - return SelectPostLoad(Node, 4, ARM64::LD4Fourv8h_POST, ARM64::qsub0); - else if (VT == MVT::v2i32 || VT == MVT::v2f32) - return SelectPostLoad(Node, 4, ARM64::LD4Fourv2s_POST, ARM64::dsub0); - else if (VT == MVT::v4i32 || VT == MVT::v4f32) - return SelectPostLoad(Node, 4, ARM64::LD4Fourv4s_POST, ARM64::qsub0); - else if (VT == MVT::v1i64 || VT == MVT::v1f64) - return SelectPostLoad(Node, 4, ARM64::LD1Fourv1d_POST, ARM64::dsub0); - else if (VT == MVT::v2i64 || VT == MVT::v2f64) - return SelectPostLoad(Node, 4, ARM64::LD4Fourv2d_POST, ARM64::qsub0); - break; - } - case ARM64ISD::LD1x2post: { - if (VT == MVT::v8i8) - return SelectPostLoad(Node, 2, ARM64::LD1Twov8b_POST, ARM64::dsub0); - else if (VT == MVT::v16i8) - return SelectPostLoad(Node, 2, ARM64::LD1Twov16b_POST, ARM64::qsub0); - else if (VT == MVT::v4i16) - return SelectPostLoad(Node, 2, ARM64::LD1Twov4h_POST, ARM64::dsub0); - else if (VT == MVT::v8i16) - return SelectPostLoad(Node, 2, ARM64::LD1Twov8h_POST, ARM64::qsub0); - else if (VT == MVT::v2i32 || VT == MVT::v2f32) - return SelectPostLoad(Node, 2, ARM64::LD1Twov2s_POST, ARM64::dsub0); - else if (VT == MVT::v4i32 || VT == MVT::v4f32) - return SelectPostLoad(Node, 2, ARM64::LD1Twov4s_POST, ARM64::qsub0); - else if (VT == MVT::v1i64 || VT == MVT::v1f64) - return SelectPostLoad(Node, 2, ARM64::LD1Twov1d_POST, ARM64::dsub0); - else if (VT == MVT::v2i64 || VT == MVT::v2f64) - return SelectPostLoad(Node, 2, ARM64::LD1Twov2d_POST, ARM64::qsub0); - break; - } - case ARM64ISD::LD1x3post: { - if (VT == MVT::v8i8) - return SelectPostLoad(Node, 3, ARM64::LD1Threev8b_POST, ARM64::dsub0); - else if (VT == MVT::v16i8) - return SelectPostLoad(Node, 3, ARM64::LD1Threev16b_POST, ARM64::qsub0); - else if (VT == MVT::v4i16) - return SelectPostLoad(Node, 3, ARM64::LD1Threev4h_POST, ARM64::dsub0); - else if (VT == MVT::v8i16) - return SelectPostLoad(Node, 3, 
ARM64::LD1Threev8h_POST, ARM64::qsub0); - else if (VT == MVT::v2i32 || VT == MVT::v2f32) - return SelectPostLoad(Node, 3, ARM64::LD1Threev2s_POST, ARM64::dsub0); - else if (VT == MVT::v4i32 || VT == MVT::v4f32) - return SelectPostLoad(Node, 3, ARM64::LD1Threev4s_POST, ARM64::qsub0); - else if (VT == MVT::v1i64 || VT == MVT::v1f64) - return SelectPostLoad(Node, 3, ARM64::LD1Threev1d_POST, ARM64::dsub0); - else if (VT == MVT::v2i64 || VT == MVT::v2f64) - return SelectPostLoad(Node, 3, ARM64::LD1Threev2d_POST, ARM64::qsub0); - break; - } - case ARM64ISD::LD1x4post: { - if (VT == MVT::v8i8) - return SelectPostLoad(Node, 4, ARM64::LD1Fourv8b_POST, ARM64::dsub0); - else if (VT == MVT::v16i8) - return SelectPostLoad(Node, 4, ARM64::LD1Fourv16b_POST, ARM64::qsub0); - else if (VT == MVT::v4i16) - return SelectPostLoad(Node, 4, ARM64::LD1Fourv4h_POST, ARM64::dsub0); - else if (VT == MVT::v8i16) - return SelectPostLoad(Node, 4, ARM64::LD1Fourv8h_POST, ARM64::qsub0); - else if (VT == MVT::v2i32 || VT == MVT::v2f32) - return SelectPostLoad(Node, 4, ARM64::LD1Fourv2s_POST, ARM64::dsub0); - else if (VT == MVT::v4i32 || VT == MVT::v4f32) - return SelectPostLoad(Node, 4, ARM64::LD1Fourv4s_POST, ARM64::qsub0); - else if (VT == MVT::v1i64 || VT == MVT::v1f64) - return SelectPostLoad(Node, 4, ARM64::LD1Fourv1d_POST, ARM64::dsub0); - else if (VT == MVT::v2i64 || VT == MVT::v2f64) - return SelectPostLoad(Node, 4, ARM64::LD1Fourv2d_POST, ARM64::qsub0); - break; - } - case ARM64ISD::LD1DUPpost: { - if (VT == MVT::v8i8) - return SelectPostLoad(Node, 1, ARM64::LD1Rv8b_POST, ARM64::dsub0); - else if (VT == MVT::v16i8) - return SelectPostLoad(Node, 1, ARM64::LD1Rv16b_POST, ARM64::qsub0); - else if (VT == MVT::v4i16) - return SelectPostLoad(Node, 1, ARM64::LD1Rv4h_POST, ARM64::dsub0); - else if (VT == MVT::v8i16) - return SelectPostLoad(Node, 1, ARM64::LD1Rv8h_POST, ARM64::qsub0); - else if (VT == MVT::v2i32 || VT == MVT::v2f32) - return SelectPostLoad(Node, 1, ARM64::LD1Rv2s_POST, ARM64::dsub0); - else if (VT == MVT::v4i32 || VT == MVT::v4f32) - return SelectPostLoad(Node, 1, ARM64::LD1Rv4s_POST, ARM64::qsub0); - else if (VT == MVT::v1i64 || VT == MVT::v1f64) - return SelectPostLoad(Node, 1, ARM64::LD1Rv1d_POST, ARM64::dsub0); - else if (VT == MVT::v2i64 || VT == MVT::v2f64) - return SelectPostLoad(Node, 1, ARM64::LD1Rv2d_POST, ARM64::qsub0); - break; - } - case ARM64ISD::LD2DUPpost: { - if (VT == MVT::v8i8) - return SelectPostLoad(Node, 2, ARM64::LD2Rv8b_POST, ARM64::dsub0); - else if (VT == MVT::v16i8) - return SelectPostLoad(Node, 2, ARM64::LD2Rv16b_POST, ARM64::qsub0); - else if (VT == MVT::v4i16) - return SelectPostLoad(Node, 2, ARM64::LD2Rv4h_POST, ARM64::dsub0); - else if (VT == MVT::v8i16) - return SelectPostLoad(Node, 2, ARM64::LD2Rv8h_POST, ARM64::qsub0); - else if (VT == MVT::v2i32 || VT == MVT::v2f32) - return SelectPostLoad(Node, 2, ARM64::LD2Rv2s_POST, ARM64::dsub0); - else if (VT == MVT::v4i32 || VT == MVT::v4f32) - return SelectPostLoad(Node, 2, ARM64::LD2Rv4s_POST, ARM64::qsub0); - else if (VT == MVT::v1i64 || VT == MVT::v1f64) - return SelectPostLoad(Node, 2, ARM64::LD2Rv1d_POST, ARM64::dsub0); - else if (VT == MVT::v2i64 || VT == MVT::v2f64) - return SelectPostLoad(Node, 2, ARM64::LD2Rv2d_POST, ARM64::qsub0); - break; - } - case ARM64ISD::LD3DUPpost: { - if (VT == MVT::v8i8) - return SelectPostLoad(Node, 3, ARM64::LD3Rv8b_POST, ARM64::dsub0); - else if (VT == MVT::v16i8) - return SelectPostLoad(Node, 3, ARM64::LD3Rv16b_POST, ARM64::qsub0); - else if (VT == MVT::v4i16) - return SelectPostLoad(Node, 
3, ARM64::LD3Rv4h_POST, ARM64::dsub0); - else if (VT == MVT::v8i16) - return SelectPostLoad(Node, 3, ARM64::LD3Rv8h_POST, ARM64::qsub0); - else if (VT == MVT::v2i32 || VT == MVT::v2f32) - return SelectPostLoad(Node, 3, ARM64::LD3Rv2s_POST, ARM64::dsub0); - else if (VT == MVT::v4i32 || VT == MVT::v4f32) - return SelectPostLoad(Node, 3, ARM64::LD3Rv4s_POST, ARM64::qsub0); - else if (VT == MVT::v1i64 || VT == MVT::v1f64) - return SelectPostLoad(Node, 3, ARM64::LD3Rv1d_POST, ARM64::dsub0); - else if (VT == MVT::v2i64 || VT == MVT::v2f64) - return SelectPostLoad(Node, 3, ARM64::LD3Rv2d_POST, ARM64::qsub0); - break; - } - case ARM64ISD::LD4DUPpost: { - if (VT == MVT::v8i8) - return SelectPostLoad(Node, 4, ARM64::LD4Rv8b_POST, ARM64::dsub0); - else if (VT == MVT::v16i8) - return SelectPostLoad(Node, 4, ARM64::LD4Rv16b_POST, ARM64::qsub0); - else if (VT == MVT::v4i16) - return SelectPostLoad(Node, 4, ARM64::LD4Rv4h_POST, ARM64::dsub0); - else if (VT == MVT::v8i16) - return SelectPostLoad(Node, 4, ARM64::LD4Rv8h_POST, ARM64::qsub0); - else if (VT == MVT::v2i32 || VT == MVT::v2f32) - return SelectPostLoad(Node, 4, ARM64::LD4Rv2s_POST, ARM64::dsub0); - else if (VT == MVT::v4i32 || VT == MVT::v4f32) - return SelectPostLoad(Node, 4, ARM64::LD4Rv4s_POST, ARM64::qsub0); - else if (VT == MVT::v1i64 || VT == MVT::v1f64) - return SelectPostLoad(Node, 4, ARM64::LD4Rv1d_POST, ARM64::dsub0); - else if (VT == MVT::v2i64 || VT == MVT::v2f64) - return SelectPostLoad(Node, 4, ARM64::LD4Rv2d_POST, ARM64::qsub0); - break; - } - case ARM64ISD::LD1LANEpost: { - if (VT == MVT::v16i8 || VT == MVT::v8i8) - return SelectPostLoadLane(Node, 1, ARM64::LD1i8_POST); - else if (VT == MVT::v8i16 || VT == MVT::v4i16) - return SelectPostLoadLane(Node, 1, ARM64::LD1i16_POST); - else if (VT == MVT::v4i32 || VT == MVT::v2i32 || VT == MVT::v4f32 || - VT == MVT::v2f32) - return SelectPostLoadLane(Node, 1, ARM64::LD1i32_POST); - else if (VT == MVT::v2i64 || VT == MVT::v1i64 || VT == MVT::v2f64 || - VT == MVT::v1f64) - return SelectPostLoadLane(Node, 1, ARM64::LD1i64_POST); - break; - } - case ARM64ISD::LD2LANEpost: { - if (VT == MVT::v16i8 || VT == MVT::v8i8) - return SelectPostLoadLane(Node, 2, ARM64::LD2i8_POST); - else if (VT == MVT::v8i16 || VT == MVT::v4i16) - return SelectPostLoadLane(Node, 2, ARM64::LD2i16_POST); - else if (VT == MVT::v4i32 || VT == MVT::v2i32 || VT == MVT::v4f32 || - VT == MVT::v2f32) - return SelectPostLoadLane(Node, 2, ARM64::LD2i32_POST); - else if (VT == MVT::v2i64 || VT == MVT::v1i64 || VT == MVT::v2f64 || - VT == MVT::v1f64) - return SelectPostLoadLane(Node, 2, ARM64::LD2i64_POST); - break; - } - case ARM64ISD::LD3LANEpost: { - if (VT == MVT::v16i8 || VT == MVT::v8i8) - return SelectPostLoadLane(Node, 3, ARM64::LD3i8_POST); - else if (VT == MVT::v8i16 || VT == MVT::v4i16) - return SelectPostLoadLane(Node, 3, ARM64::LD3i16_POST); - else if (VT == MVT::v4i32 || VT == MVT::v2i32 || VT == MVT::v4f32 || - VT == MVT::v2f32) - return SelectPostLoadLane(Node, 3, ARM64::LD3i32_POST); - else if (VT == MVT::v2i64 || VT == MVT::v1i64 || VT == MVT::v2f64 || - VT == MVT::v1f64) - return SelectPostLoadLane(Node, 3, ARM64::LD3i64_POST); - break; - } - case ARM64ISD::LD4LANEpost: { - if (VT == MVT::v16i8 || VT == MVT::v8i8) - return SelectPostLoadLane(Node, 4, ARM64::LD4i8_POST); - else if (VT == MVT::v8i16 || VT == MVT::v4i16) - return SelectPostLoadLane(Node, 4, ARM64::LD4i16_POST); - else if (VT == MVT::v4i32 || VT == MVT::v2i32 || VT == MVT::v4f32 || - VT == MVT::v2f32) - return SelectPostLoadLane(Node, 4, 
ARM64::LD4i32_POST); - else if (VT == MVT::v2i64 || VT == MVT::v1i64 || VT == MVT::v2f64 || - VT == MVT::v1f64) - return SelectPostLoadLane(Node, 4, ARM64::LD4i64_POST); - break; - } - case ARM64ISD::ST2post: { - VT = Node->getOperand(1).getValueType(); - if (VT == MVT::v8i8) - return SelectPostStore(Node, 2, ARM64::ST2Twov8b_POST); - else if (VT == MVT::v16i8) - return SelectPostStore(Node, 2, ARM64::ST2Twov16b_POST); - else if (VT == MVT::v4i16) - return SelectPostStore(Node, 2, ARM64::ST2Twov4h_POST); - else if (VT == MVT::v8i16) - return SelectPostStore(Node, 2, ARM64::ST2Twov8h_POST); - else if (VT == MVT::v2i32 || VT == MVT::v2f32) - return SelectPostStore(Node, 2, ARM64::ST2Twov2s_POST); - else if (VT == MVT::v4i32 || VT == MVT::v4f32) - return SelectPostStore(Node, 2, ARM64::ST2Twov4s_POST); - else if (VT == MVT::v2i64 || VT == MVT::v2f64) - return SelectPostStore(Node, 2, ARM64::ST2Twov2d_POST); - else if (VT == MVT::v1i64 || VT == MVT::v1f64) - return SelectPostStore(Node, 2, ARM64::ST1Twov1d_POST); - break; - } - case ARM64ISD::ST3post: { - VT = Node->getOperand(1).getValueType(); - if (VT == MVT::v8i8) - return SelectPostStore(Node, 3, ARM64::ST3Threev8b_POST); - else if (VT == MVT::v16i8) - return SelectPostStore(Node, 3, ARM64::ST3Threev16b_POST); - else if (VT == MVT::v4i16) - return SelectPostStore(Node, 3, ARM64::ST3Threev4h_POST); - else if (VT == MVT::v8i16) - return SelectPostStore(Node, 3, ARM64::ST3Threev8h_POST); - else if (VT == MVT::v2i32 || VT == MVT::v2f32) - return SelectPostStore(Node, 3, ARM64::ST3Threev2s_POST); - else if (VT == MVT::v4i32 || VT == MVT::v4f32) - return SelectPostStore(Node, 3, ARM64::ST3Threev4s_POST); - else if (VT == MVT::v2i64 || VT == MVT::v2f64) - return SelectPostStore(Node, 3, ARM64::ST3Threev2d_POST); - else if (VT == MVT::v1i64 || VT == MVT::v1f64) - return SelectPostStore(Node, 3, ARM64::ST1Threev1d_POST); - break; - } - case ARM64ISD::ST4post: { - VT = Node->getOperand(1).getValueType(); - if (VT == MVT::v8i8) - return SelectPostStore(Node, 4, ARM64::ST4Fourv8b_POST); - else if (VT == MVT::v16i8) - return SelectPostStore(Node, 4, ARM64::ST4Fourv16b_POST); - else if (VT == MVT::v4i16) - return SelectPostStore(Node, 4, ARM64::ST4Fourv4h_POST); - else if (VT == MVT::v8i16) - return SelectPostStore(Node, 4, ARM64::ST4Fourv8h_POST); - else if (VT == MVT::v2i32 || VT == MVT::v2f32) - return SelectPostStore(Node, 4, ARM64::ST4Fourv2s_POST); - else if (VT == MVT::v4i32 || VT == MVT::v4f32) - return SelectPostStore(Node, 4, ARM64::ST4Fourv4s_POST); - else if (VT == MVT::v2i64 || VT == MVT::v2f64) - return SelectPostStore(Node, 4, ARM64::ST4Fourv2d_POST); - else if (VT == MVT::v1i64 || VT == MVT::v1f64) - return SelectPostStore(Node, 4, ARM64::ST1Fourv1d_POST); - break; - } - case ARM64ISD::ST1x2post: { - VT = Node->getOperand(1).getValueType(); - if (VT == MVT::v8i8) - return SelectPostStore(Node, 2, ARM64::ST1Twov8b_POST); - else if (VT == MVT::v16i8) - return SelectPostStore(Node, 2, ARM64::ST1Twov16b_POST); - else if (VT == MVT::v4i16) - return SelectPostStore(Node, 2, ARM64::ST1Twov4h_POST); - else if (VT == MVT::v8i16) - return SelectPostStore(Node, 2, ARM64::ST1Twov8h_POST); - else if (VT == MVT::v2i32 || VT == MVT::v2f32) - return SelectPostStore(Node, 2, ARM64::ST1Twov2s_POST); - else if (VT == MVT::v4i32 || VT == MVT::v4f32) - return SelectPostStore(Node, 2, ARM64::ST1Twov4s_POST); - else if (VT == MVT::v1i64 || VT == MVT::v1f64) - return SelectPostStore(Node, 2, ARM64::ST1Twov1d_POST); - else if (VT == MVT::v2i64 || VT == 
MVT::v2f64) - return SelectPostStore(Node, 2, ARM64::ST1Twov2d_POST); - break; - } - case ARM64ISD::ST1x3post: { - VT = Node->getOperand(1).getValueType(); - if (VT == MVT::v8i8) - return SelectPostStore(Node, 3, ARM64::ST1Threev8b_POST); - else if (VT == MVT::v16i8) - return SelectPostStore(Node, 3, ARM64::ST1Threev16b_POST); - else if (VT == MVT::v4i16) - return SelectPostStore(Node, 3, ARM64::ST1Threev4h_POST); - else if (VT == MVT::v8i16) - return SelectPostStore(Node, 3, ARM64::ST1Threev8h_POST); - else if (VT == MVT::v2i32 || VT == MVT::v2f32) - return SelectPostStore(Node, 3, ARM64::ST1Threev2s_POST); - else if (VT == MVT::v4i32 || VT == MVT::v4f32) - return SelectPostStore(Node, 3, ARM64::ST1Threev4s_POST); - else if (VT == MVT::v1i64 || VT == MVT::v1f64) - return SelectPostStore(Node, 3, ARM64::ST1Threev1d_POST); - else if (VT == MVT::v2i64 || VT == MVT::v2f64) - return SelectPostStore(Node, 3, ARM64::ST1Threev2d_POST); - break; - } - case ARM64ISD::ST1x4post: { - VT = Node->getOperand(1).getValueType(); - if (VT == MVT::v8i8) - return SelectPostStore(Node, 4, ARM64::ST1Fourv8b_POST); - else if (VT == MVT::v16i8) - return SelectPostStore(Node, 4, ARM64::ST1Fourv16b_POST); - else if (VT == MVT::v4i16) - return SelectPostStore(Node, 4, ARM64::ST1Fourv4h_POST); - else if (VT == MVT::v8i16) - return SelectPostStore(Node, 4, ARM64::ST1Fourv8h_POST); - else if (VT == MVT::v2i32 || VT == MVT::v2f32) - return SelectPostStore(Node, 4, ARM64::ST1Fourv2s_POST); - else if (VT == MVT::v4i32 || VT == MVT::v4f32) - return SelectPostStore(Node, 4, ARM64::ST1Fourv4s_POST); - else if (VT == MVT::v1i64 || VT == MVT::v1f64) - return SelectPostStore(Node, 4, ARM64::ST1Fourv1d_POST); - else if (VT == MVT::v2i64 || VT == MVT::v2f64) - return SelectPostStore(Node, 4, ARM64::ST1Fourv2d_POST); - break; - } - case ARM64ISD::ST2LANEpost: { - VT = Node->getOperand(1).getValueType(); - if (VT == MVT::v16i8 || VT == MVT::v8i8) - return SelectPostStoreLane(Node, 2, ARM64::ST2i8_POST); - else if (VT == MVT::v8i16 || VT == MVT::v4i16) - return SelectPostStoreLane(Node, 2, ARM64::ST2i16_POST); - else if (VT == MVT::v4i32 || VT == MVT::v2i32 || VT == MVT::v4f32 || - VT == MVT::v2f32) - return SelectPostStoreLane(Node, 2, ARM64::ST2i32_POST); - else if (VT == MVT::v2i64 || VT == MVT::v1i64 || VT == MVT::v2f64 || - VT == MVT::v1f64) - return SelectPostStoreLane(Node, 2, ARM64::ST2i64_POST); - break; - } - case ARM64ISD::ST3LANEpost: { - VT = Node->getOperand(1).getValueType(); - if (VT == MVT::v16i8 || VT == MVT::v8i8) - return SelectPostStoreLane(Node, 3, ARM64::ST3i8_POST); - else if (VT == MVT::v8i16 || VT == MVT::v4i16) - return SelectPostStoreLane(Node, 3, ARM64::ST3i16_POST); - else if (VT == MVT::v4i32 || VT == MVT::v2i32 || VT == MVT::v4f32 || - VT == MVT::v2f32) - return SelectPostStoreLane(Node, 3, ARM64::ST3i32_POST); - else if (VT == MVT::v2i64 || VT == MVT::v1i64 || VT == MVT::v2f64 || - VT == MVT::v1f64) - return SelectPostStoreLane(Node, 3, ARM64::ST3i64_POST); - break; - } - case ARM64ISD::ST4LANEpost: { - VT = Node->getOperand(1).getValueType(); - if (VT == MVT::v16i8 || VT == MVT::v8i8) - return SelectPostStoreLane(Node, 4, ARM64::ST4i8_POST); - else if (VT == MVT::v8i16 || VT == MVT::v4i16) - return SelectPostStoreLane(Node, 4, ARM64::ST4i16_POST); - else if (VT == MVT::v4i32 || VT == MVT::v2i32 || VT == MVT::v4f32 || - VT == MVT::v2f32) - return SelectPostStoreLane(Node, 4, ARM64::ST4i32_POST); - else if (VT == MVT::v2i64 || VT == MVT::v1i64 || VT == MVT::v2f64 || - VT == MVT::v1f64) - return 
SelectPostStoreLane(Node, 4, ARM64::ST4i64_POST); - break; - } - - case ISD::FCEIL: - case ISD::FFLOOR: - case ISD::FTRUNC: - case ISD::FROUND: - if (SDNode *I = SelectLIBM(Node)) - return I; - break; - } - - // Select the default instruction - ResNode = SelectCode(Node); - - DEBUG(errs() << "=> "); - if (ResNode == nullptr || ResNode == Node) - DEBUG(Node->dump(CurDAG)); - else - DEBUG(ResNode->dump(CurDAG)); - DEBUG(errs() << "\n"); - - return ResNode; -} - -/// createARM64ISelDag - This pass converts a legalized DAG into a -/// ARM64-specific DAG, ready for instruction scheduling. -FunctionPass *llvm::createARM64ISelDag(ARM64TargetMachine &TM, - CodeGenOpt::Level OptLevel) { - return new ARM64DAGToDAGISel(TM, OptLevel); -} diff --git a/lib/Target/ARM64/ARM64ISelLowering.cpp b/lib/Target/ARM64/ARM64ISelLowering.cpp deleted file mode 100644 index c24b7deea94..00000000000 --- a/lib/Target/ARM64/ARM64ISelLowering.cpp +++ /dev/null @@ -1,7895 +0,0 @@ -//===-- ARM64ISelLowering.cpp - ARM64 DAG Lowering Implementation --------===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -// This file implements the ARM64TargetLowering class. -// -//===----------------------------------------------------------------------===// - -#include "ARM64ISelLowering.h" -#include "ARM64PerfectShuffle.h" -#include "ARM64Subtarget.h" -#include "ARM64CallingConv.h" -#include "ARM64MachineFunctionInfo.h" -#include "ARM64TargetMachine.h" -#include "ARM64TargetObjectFile.h" -#include "MCTargetDesc/ARM64AddressingModes.h" -#include "llvm/ADT/Statistic.h" -#include "llvm/CodeGen/CallingConvLower.h" -#include "llvm/CodeGen/MachineFrameInfo.h" -#include "llvm/CodeGen/MachineInstrBuilder.h" -#include "llvm/CodeGen/MachineRegisterInfo.h" -#include "llvm/IR/Function.h" -#include "llvm/IR/Intrinsics.h" -#include "llvm/IR/Type.h" -#include "llvm/Support/CommandLine.h" -#include "llvm/Support/Debug.h" -#include "llvm/Support/ErrorHandling.h" -#include "llvm/Support/raw_ostream.h" -#include "llvm/Target/TargetOptions.h" -using namespace llvm; - -#define DEBUG_TYPE "arm64-lower" - -STATISTIC(NumTailCalls, "Number of tail calls"); -STATISTIC(NumShiftInserts, "Number of vector shift inserts"); - -enum AlignMode { - StrictAlign, - NoStrictAlign -}; - -static cl::opt -Align(cl::desc("Load/store alignment support"), - cl::Hidden, cl::init(NoStrictAlign), - cl::values( - clEnumValN(StrictAlign, "arm64-strict-align", - "Disallow all unaligned memory accesses"), - clEnumValN(NoStrictAlign, "arm64-no-strict-align", - "Allow unaligned memory accesses"), - clEnumValEnd)); - -// Place holder until extr generation is tested fully. -static cl::opt -EnableARM64ExtrGeneration("arm64-extr-generation", cl::Hidden, - cl::desc("Allow ARM64 (or (shift)(shift))->extract"), - cl::init(true)); - -static cl::opt -EnableARM64SlrGeneration("arm64-shift-insert-generation", cl::Hidden, - cl::desc("Allow ARM64 SLI/SRI formation"), - cl::init(false)); - -//===----------------------------------------------------------------------===// -// ARM64 Lowering public interface. 
-//===----------------------------------------------------------------------===// -static TargetLoweringObjectFile *createTLOF(TargetMachine &TM) { - if (TM.getSubtarget().isTargetDarwin()) - return new ARM64_MachoTargetObjectFile(); - - return new ARM64_ELFTargetObjectFile(); -} - -ARM64TargetLowering::ARM64TargetLowering(ARM64TargetMachine &TM) - : TargetLowering(TM, createTLOF(TM)) { - Subtarget = &TM.getSubtarget(); - - // ARM64 doesn't have comparisons which set GPRs or setcc instructions, so - // we have to make something up. Arbitrarily, choose ZeroOrOne. - setBooleanContents(ZeroOrOneBooleanContent); - // When comparing vectors the result sets the different elements in the - // vector to all-one or all-zero. - setBooleanVectorContents(ZeroOrNegativeOneBooleanContent); - - // Set up the register classes. - addRegisterClass(MVT::i32, &ARM64::GPR32allRegClass); - addRegisterClass(MVT::i64, &ARM64::GPR64allRegClass); - - if (Subtarget->hasFPARMv8()) { - addRegisterClass(MVT::f16, &ARM64::FPR16RegClass); - addRegisterClass(MVT::f32, &ARM64::FPR32RegClass); - addRegisterClass(MVT::f64, &ARM64::FPR64RegClass); - addRegisterClass(MVT::f128, &ARM64::FPR128RegClass); - } - - if (Subtarget->hasNEON()) { - addRegisterClass(MVT::v16i8, &ARM64::FPR8RegClass); - addRegisterClass(MVT::v8i16, &ARM64::FPR16RegClass); - // Someone set us up the NEON. - addDRTypeForNEON(MVT::v2f32); - addDRTypeForNEON(MVT::v8i8); - addDRTypeForNEON(MVT::v4i16); - addDRTypeForNEON(MVT::v2i32); - addDRTypeForNEON(MVT::v1i64); - addDRTypeForNEON(MVT::v1f64); - - addQRTypeForNEON(MVT::v4f32); - addQRTypeForNEON(MVT::v2f64); - addQRTypeForNEON(MVT::v16i8); - addQRTypeForNEON(MVT::v8i16); - addQRTypeForNEON(MVT::v4i32); - addQRTypeForNEON(MVT::v2i64); - } - - // Compute derived properties from the register classes - computeRegisterProperties(); - - // Provide all sorts of operation actions - setOperationAction(ISD::GlobalAddress, MVT::i64, Custom); - setOperationAction(ISD::GlobalTLSAddress, MVT::i64, Custom); - setOperationAction(ISD::SETCC, MVT::i32, Custom); - setOperationAction(ISD::SETCC, MVT::i64, Custom); - setOperationAction(ISD::SETCC, MVT::f32, Custom); - setOperationAction(ISD::SETCC, MVT::f64, Custom); - setOperationAction(ISD::BRCOND, MVT::Other, Expand); - setOperationAction(ISD::BR_CC, MVT::i32, Custom); - setOperationAction(ISD::BR_CC, MVT::i64, Custom); - setOperationAction(ISD::BR_CC, MVT::f32, Custom); - setOperationAction(ISD::BR_CC, MVT::f64, Custom); - setOperationAction(ISD::SELECT, MVT::i32, Custom); - setOperationAction(ISD::SELECT, MVT::i64, Custom); - setOperationAction(ISD::SELECT, MVT::f32, Custom); - setOperationAction(ISD::SELECT, MVT::f64, Custom); - setOperationAction(ISD::SELECT_CC, MVT::i32, Custom); - setOperationAction(ISD::SELECT_CC, MVT::i64, Custom); - setOperationAction(ISD::SELECT_CC, MVT::f32, Custom); - setOperationAction(ISD::SELECT_CC, MVT::f64, Custom); - setOperationAction(ISD::BR_JT, MVT::Other, Expand); - setOperationAction(ISD::JumpTable, MVT::i64, Custom); - - setOperationAction(ISD::SHL_PARTS, MVT::i64, Custom); - setOperationAction(ISD::SRA_PARTS, MVT::i64, Custom); - setOperationAction(ISD::SRL_PARTS, MVT::i64, Custom); - - setOperationAction(ISD::FREM, MVT::f32, Expand); - setOperationAction(ISD::FREM, MVT::f64, Expand); - setOperationAction(ISD::FREM, MVT::f80, Expand); - - // Custom lowering hooks are needed for XOR - // to fold it into CSINC/CSINV. 
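To make the intent of that custom hook concrete: XOR with a 0/-1 select is the same as choosing between a value and its bitwise NOT, which is what a single CSINV computes. A minimal scalar sketch (hypothetical helpers, not backend code) of the equivalence the LowerXOR hook later in this file relies on:

#include <cstdint>

// Pattern before the fold: (xor x, (select_cc a, b, cc, 0, -1))
static uint64_t xorOfSelect(bool Cond, uint64_t X) {
  uint64_t Mask = Cond ? 0 : ~0ULL;
  return X ^ Mask;
}

// Folded form: (csel x, (xor x, -1), cc), i.e. a single CSINV
static uint64_t foldedCSINV(bool Cond, uint64_t X) {
  return Cond ? X : ~X;
}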
- setOperationAction(ISD::XOR, MVT::i32, Custom); - setOperationAction(ISD::XOR, MVT::i64, Custom); - - // Virtually no operation on f128 is legal, but LLVM can't expand them when - // there's a valid register class, so we need custom operations in most cases. - setOperationAction(ISD::FABS, MVT::f128, Expand); - setOperationAction(ISD::FADD, MVT::f128, Custom); - setOperationAction(ISD::FCOPYSIGN, MVT::f128, Expand); - setOperationAction(ISD::FCOS, MVT::f128, Expand); - setOperationAction(ISD::FDIV, MVT::f128, Custom); - setOperationAction(ISD::FMA, MVT::f128, Expand); - setOperationAction(ISD::FMUL, MVT::f128, Custom); - setOperationAction(ISD::FNEG, MVT::f128, Expand); - setOperationAction(ISD::FPOW, MVT::f128, Expand); - setOperationAction(ISD::FREM, MVT::f128, Expand); - setOperationAction(ISD::FRINT, MVT::f128, Expand); - setOperationAction(ISD::FSIN, MVT::f128, Expand); - setOperationAction(ISD::FSINCOS, MVT::f128, Expand); - setOperationAction(ISD::FSQRT, MVT::f128, Expand); - setOperationAction(ISD::FSUB, MVT::f128, Custom); - setOperationAction(ISD::FTRUNC, MVT::f128, Expand); - setOperationAction(ISD::SETCC, MVT::f128, Custom); - setOperationAction(ISD::BR_CC, MVT::f128, Custom); - setOperationAction(ISD::SELECT, MVT::f128, Custom); - setOperationAction(ISD::SELECT_CC, MVT::f128, Custom); - setOperationAction(ISD::FP_EXTEND, MVT::f128, Custom); - - // Lowering for many of the conversions is actually specified by the non-f128 - // type. The LowerXXX function will be trivial when f128 isn't involved. - setOperationAction(ISD::FP_TO_SINT, MVT::i32, Custom); - setOperationAction(ISD::FP_TO_SINT, MVT::i64, Custom); - setOperationAction(ISD::FP_TO_SINT, MVT::i128, Custom); - setOperationAction(ISD::FP_TO_UINT, MVT::i32, Custom); - setOperationAction(ISD::FP_TO_UINT, MVT::i64, Custom); - setOperationAction(ISD::FP_TO_UINT, MVT::i128, Custom); - setOperationAction(ISD::SINT_TO_FP, MVT::i32, Custom); - setOperationAction(ISD::SINT_TO_FP, MVT::i64, Custom); - setOperationAction(ISD::SINT_TO_FP, MVT::i128, Custom); - setOperationAction(ISD::UINT_TO_FP, MVT::i32, Custom); - setOperationAction(ISD::UINT_TO_FP, MVT::i64, Custom); - setOperationAction(ISD::UINT_TO_FP, MVT::i128, Custom); - setOperationAction(ISD::FP_ROUND, MVT::f32, Custom); - setOperationAction(ISD::FP_ROUND, MVT::f64, Custom); - - // Variable arguments. - setOperationAction(ISD::VASTART, MVT::Other, Custom); - setOperationAction(ISD::VAARG, MVT::Other, Custom); - setOperationAction(ISD::VACOPY, MVT::Other, Custom); - setOperationAction(ISD::VAEND, MVT::Other, Expand); - - // Variable-sized objects. - setOperationAction(ISD::STACKSAVE, MVT::Other, Expand); - setOperationAction(ISD::STACKRESTORE, MVT::Other, Expand); - setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i64, Expand); - - // Exception handling. - // FIXME: These are guesses. Has this been defined yet? - setExceptionPointerRegister(ARM64::X0); - setExceptionSelectorRegister(ARM64::X1); - - // Constant pool entries - setOperationAction(ISD::ConstantPool, MVT::i64, Custom); - - // BlockAddress - setOperationAction(ISD::BlockAddress, MVT::i64, Custom); - - // Add/Sub overflow ops with MVT::Glues are lowered to NZCV dependences. 
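As a rough scalar model of the carry chain those custom ADDC/ADDE/SUBC/SUBE lowerings feed (hypothetical helper, illustration only; in the real lowering the carry lives in NZCV rather than in a register), a 128-bit add splits into a low half that produces the carry and a high half that consumes it:

#include <cstdint>

static void add128(uint64_t AL, uint64_t AH, uint64_t BL, uint64_t BH,
                   uint64_t &ResLo, uint64_t &ResHi) {
  ResLo = AL + BL;                        // ADDC, lowered to ADDS: sets the carry
  uint64_t Carry = ResLo < AL ? 1 : 0;    // modelled as a value here
  ResHi = AH + BH + Carry;                // ADDE, lowered to ADCS: consumes it
}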
- setOperationAction(ISD::ADDC, MVT::i32, Custom); - setOperationAction(ISD::ADDE, MVT::i32, Custom); - setOperationAction(ISD::SUBC, MVT::i32, Custom); - setOperationAction(ISD::SUBE, MVT::i32, Custom); - setOperationAction(ISD::ADDC, MVT::i64, Custom); - setOperationAction(ISD::ADDE, MVT::i64, Custom); - setOperationAction(ISD::SUBC, MVT::i64, Custom); - setOperationAction(ISD::SUBE, MVT::i64, Custom); - - // ARM64 lacks both left-rotate and popcount instructions. - setOperationAction(ISD::ROTL, MVT::i32, Expand); - setOperationAction(ISD::ROTL, MVT::i64, Expand); - - // ARM64 doesn't have {U|S}MUL_LOHI. - setOperationAction(ISD::UMUL_LOHI, MVT::i64, Expand); - setOperationAction(ISD::SMUL_LOHI, MVT::i64, Expand); - - - // Expand the undefined-at-zero variants to cttz/ctlz to their defined-at-zero - // counterparts, which ARM64 supports directly. - setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i32, Expand); - setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i32, Expand); - setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i64, Expand); - setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i64, Expand); - - setOperationAction(ISD::CTPOP, MVT::i32, Custom); - setOperationAction(ISD::CTPOP, MVT::i64, Custom); - - setOperationAction(ISD::SDIVREM, MVT::i32, Expand); - setOperationAction(ISD::SDIVREM, MVT::i64, Expand); - setOperationAction(ISD::SREM, MVT::i32, Expand); - setOperationAction(ISD::SREM, MVT::i64, Expand); - setOperationAction(ISD::UDIVREM, MVT::i32, Expand); - setOperationAction(ISD::UDIVREM, MVT::i64, Expand); - setOperationAction(ISD::UREM, MVT::i32, Expand); - setOperationAction(ISD::UREM, MVT::i64, Expand); - - // Custom lower Add/Sub/Mul with overflow. - setOperationAction(ISD::SADDO, MVT::i32, Custom); - setOperationAction(ISD::SADDO, MVT::i64, Custom); - setOperationAction(ISD::UADDO, MVT::i32, Custom); - setOperationAction(ISD::UADDO, MVT::i64, Custom); - setOperationAction(ISD::SSUBO, MVT::i32, Custom); - setOperationAction(ISD::SSUBO, MVT::i64, Custom); - setOperationAction(ISD::USUBO, MVT::i32, Custom); - setOperationAction(ISD::USUBO, MVT::i64, Custom); - setOperationAction(ISD::SMULO, MVT::i32, Custom); - setOperationAction(ISD::SMULO, MVT::i64, Custom); - setOperationAction(ISD::UMULO, MVT::i32, Custom); - setOperationAction(ISD::UMULO, MVT::i64, Custom); - - setOperationAction(ISD::FSIN, MVT::f32, Expand); - setOperationAction(ISD::FSIN, MVT::f64, Expand); - setOperationAction(ISD::FCOS, MVT::f32, Expand); - setOperationAction(ISD::FCOS, MVT::f64, Expand); - setOperationAction(ISD::FPOW, MVT::f32, Expand); - setOperationAction(ISD::FPOW, MVT::f64, Expand); - setOperationAction(ISD::FCOPYSIGN, MVT::f64, Custom); - setOperationAction(ISD::FCOPYSIGN, MVT::f32, Custom); - - // ARM64 has implementations of a lot of rounding-like FP operations. - static MVT RoundingTypes[] = { MVT::f32, MVT::f64}; - for (unsigned I = 0; I < array_lengthof(RoundingTypes); ++I) { - MVT Ty = RoundingTypes[I]; - setOperationAction(ISD::FFLOOR, Ty, Legal); - setOperationAction(ISD::FNEARBYINT, Ty, Legal); - setOperationAction(ISD::FCEIL, Ty, Legal); - setOperationAction(ISD::FRINT, Ty, Legal); - setOperationAction(ISD::FTRUNC, Ty, Legal); - setOperationAction(ISD::FROUND, Ty, Legal); - } - - setOperationAction(ISD::PREFETCH, MVT::Other, Custom); - - if (Subtarget->isTargetMachO()) { - // For iOS, we don't want to the normal expansion of a libcall to - // sincos. We want to issue a libcall to __sincos_stret to avoid memory - // traffic. 
- setOperationAction(ISD::FSINCOS, MVT::f64, Custom); - setOperationAction(ISD::FSINCOS, MVT::f32, Custom); - } else { - setOperationAction(ISD::FSINCOS, MVT::f64, Expand); - setOperationAction(ISD::FSINCOS, MVT::f32, Expand); - } - - // ARM64 does not have floating-point extending loads, i1 sign-extending load, - // floating-point truncating stores, or v2i32->v2i16 truncating store. - setLoadExtAction(ISD::EXTLOAD, MVT::f32, Expand); - setLoadExtAction(ISD::EXTLOAD, MVT::f64, Expand); - setLoadExtAction(ISD::EXTLOAD, MVT::f80, Expand); - setLoadExtAction(ISD::SEXTLOAD, MVT::i1, Expand); - setTruncStoreAction(MVT::f32, MVT::f16, Expand); - setTruncStoreAction(MVT::f64, MVT::f32, Expand); - setTruncStoreAction(MVT::f64, MVT::f16, Expand); - setTruncStoreAction(MVT::f128, MVT::f80, Expand); - setTruncStoreAction(MVT::f128, MVT::f64, Expand); - setTruncStoreAction(MVT::f128, MVT::f32, Expand); - setTruncStoreAction(MVT::f128, MVT::f16, Expand); - // Indexed loads and stores are supported. - for (unsigned im = (unsigned)ISD::PRE_INC; - im != (unsigned)ISD::LAST_INDEXED_MODE; ++im) { - setIndexedLoadAction(im, MVT::i8, Legal); - setIndexedLoadAction(im, MVT::i16, Legal); - setIndexedLoadAction(im, MVT::i32, Legal); - setIndexedLoadAction(im, MVT::i64, Legal); - setIndexedLoadAction(im, MVT::f64, Legal); - setIndexedLoadAction(im, MVT::f32, Legal); - setIndexedStoreAction(im, MVT::i8, Legal); - setIndexedStoreAction(im, MVT::i16, Legal); - setIndexedStoreAction(im, MVT::i32, Legal); - setIndexedStoreAction(im, MVT::i64, Legal); - setIndexedStoreAction(im, MVT::f64, Legal); - setIndexedStoreAction(im, MVT::f32, Legal); - } - - // Trap. - setOperationAction(ISD::TRAP, MVT::Other, Legal); - - // We combine OR nodes for bitfield operations. - setTargetDAGCombine(ISD::OR); - - // Vector add and sub nodes may conceal a high-half opportunity. - // Also, try to fold ADD into CSINC/CSINV.. 
- setTargetDAGCombine(ISD::ADD); - setTargetDAGCombine(ISD::SUB); - - setTargetDAGCombine(ISD::XOR); - setTargetDAGCombine(ISD::SINT_TO_FP); - setTargetDAGCombine(ISD::UINT_TO_FP); - - setTargetDAGCombine(ISD::INTRINSIC_WO_CHAIN); - - setTargetDAGCombine(ISD::ANY_EXTEND); - setTargetDAGCombine(ISD::ZERO_EXTEND); - setTargetDAGCombine(ISD::SIGN_EXTEND); - setTargetDAGCombine(ISD::BITCAST); - setTargetDAGCombine(ISD::CONCAT_VECTORS); - setTargetDAGCombine(ISD::STORE); - - setTargetDAGCombine(ISD::MUL); - - setTargetDAGCombine(ISD::SELECT); - setTargetDAGCombine(ISD::VSELECT); - - setTargetDAGCombine(ISD::INTRINSIC_VOID); - setTargetDAGCombine(ISD::INTRINSIC_W_CHAIN); - setTargetDAGCombine(ISD::INSERT_VECTOR_ELT); - - MaxStoresPerMemset = MaxStoresPerMemsetOptSize = 8; - MaxStoresPerMemcpy = MaxStoresPerMemcpyOptSize = 4; - MaxStoresPerMemmove = MaxStoresPerMemmoveOptSize = 4; - - setStackPointerRegisterToSaveRestore(ARM64::SP); - - setSchedulingPreference(Sched::Hybrid); - - // Enable TBZ/TBNZ - MaskAndBranchFoldingIsLegal = true; - - setMinFunctionAlignment(2); - - RequireStrictAlign = (Align == StrictAlign); - - setHasExtractBitsInsn(true); - - if (Subtarget->hasNEON()) { - // FIXME: v1f64 shouldn't be legal if we can avoid it, because it leads to - // silliness like this: - setOperationAction(ISD::FABS, MVT::v1f64, Expand); - setOperationAction(ISD::FADD, MVT::v1f64, Expand); - setOperationAction(ISD::FCEIL, MVT::v1f64, Expand); - setOperationAction(ISD::FCOPYSIGN, MVT::v1f64, Expand); - setOperationAction(ISD::FCOS, MVT::v1f64, Expand); - setOperationAction(ISD::FDIV, MVT::v1f64, Expand); - setOperationAction(ISD::FFLOOR, MVT::v1f64, Expand); - setOperationAction(ISD::FMA, MVT::v1f64, Expand); - setOperationAction(ISD::FMUL, MVT::v1f64, Expand); - setOperationAction(ISD::FNEARBYINT, MVT::v1f64, Expand); - setOperationAction(ISD::FNEG, MVT::v1f64, Expand); - setOperationAction(ISD::FPOW, MVT::v1f64, Expand); - setOperationAction(ISD::FREM, MVT::v1f64, Expand); - setOperationAction(ISD::FROUND, MVT::v1f64, Expand); - setOperationAction(ISD::FRINT, MVT::v1f64, Expand); - setOperationAction(ISD::FSIN, MVT::v1f64, Expand); - setOperationAction(ISD::FSINCOS, MVT::v1f64, Expand); - setOperationAction(ISD::FSQRT, MVT::v1f64, Expand); - setOperationAction(ISD::FSUB, MVT::v1f64, Expand); - setOperationAction(ISD::FTRUNC, MVT::v1f64, Expand); - setOperationAction(ISD::SETCC, MVT::v1f64, Expand); - setOperationAction(ISD::BR_CC, MVT::v1f64, Expand); - setOperationAction(ISD::SELECT, MVT::v1f64, Expand); - setOperationAction(ISD::SELECT_CC, MVT::v1f64, Expand); - setOperationAction(ISD::FP_EXTEND, MVT::v1f64, Expand); - - setOperationAction(ISD::FP_TO_SINT, MVT::v1i64, Expand); - setOperationAction(ISD::FP_TO_UINT, MVT::v1i64, Expand); - setOperationAction(ISD::SINT_TO_FP, MVT::v1i64, Expand); - setOperationAction(ISD::UINT_TO_FP, MVT::v1i64, Expand); - setOperationAction(ISD::FP_ROUND, MVT::v1f64, Expand); - - setOperationAction(ISD::MUL, MVT::v1i64, Expand); - - // ARM64 doesn't have a direct vector ->f32 conversion instructions for - // elements smaller than i32, so promote the input to i32 first. - setOperationAction(ISD::UINT_TO_FP, MVT::v4i8, Promote); - setOperationAction(ISD::SINT_TO_FP, MVT::v4i8, Promote); - setOperationAction(ISD::UINT_TO_FP, MVT::v4i16, Promote); - setOperationAction(ISD::SINT_TO_FP, MVT::v4i16, Promote); - // Similarly, there is no direct i32 -> f64 vector conversion instruction. 
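A scalar model of why the Promote action above is lossless (hypothetical helper, illustration only): widening a small lane to i32 first and then converting produces the same float that a direct conversion would.

#include <cstdint>

static float convertU16Lane(uint16_t Lane) {
  uint32_t Widened = Lane;                // the Promote step: i16 lane -> i32
  return static_cast<float>(Widened);     // then the legal i32 -> f32 conversion
}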
- setOperationAction(ISD::SINT_TO_FP, MVT::v2i32, Custom); - setOperationAction(ISD::UINT_TO_FP, MVT::v2i32, Custom); - setOperationAction(ISD::SINT_TO_FP, MVT::v2i64, Custom); - setOperationAction(ISD::UINT_TO_FP, MVT::v2i64, Custom); - - // ARM64 doesn't have MUL.2d: - setOperationAction(ISD::MUL, MVT::v2i64, Expand); - setOperationAction(ISD::ANY_EXTEND, MVT::v4i32, Legal); - setTruncStoreAction(MVT::v2i32, MVT::v2i16, Expand); - // Likewise, narrowing and extending vector loads/stores aren't handled - // directly. - for (unsigned VT = (unsigned)MVT::FIRST_VECTOR_VALUETYPE; - VT <= (unsigned)MVT::LAST_VECTOR_VALUETYPE; ++VT) { - - setOperationAction(ISD::SIGN_EXTEND_INREG, (MVT::SimpleValueType)VT, - Expand); - - setOperationAction(ISD::MULHS, (MVT::SimpleValueType)VT, Expand); - setOperationAction(ISD::SMUL_LOHI, (MVT::SimpleValueType)VT, Expand); - setOperationAction(ISD::MULHU, (MVT::SimpleValueType)VT, Expand); - setOperationAction(ISD::UMUL_LOHI, (MVT::SimpleValueType)VT, Expand); - - setOperationAction(ISD::BSWAP, (MVT::SimpleValueType)VT, Expand); - - for (unsigned InnerVT = (unsigned)MVT::FIRST_VECTOR_VALUETYPE; - InnerVT <= (unsigned)MVT::LAST_VECTOR_VALUETYPE; ++InnerVT) - setTruncStoreAction((MVT::SimpleValueType)VT, - (MVT::SimpleValueType)InnerVT, Expand); - setLoadExtAction(ISD::SEXTLOAD, (MVT::SimpleValueType)VT, Expand); - setLoadExtAction(ISD::ZEXTLOAD, (MVT::SimpleValueType)VT, Expand); - setLoadExtAction(ISD::EXTLOAD, (MVT::SimpleValueType)VT, Expand); - } - - // ARM64 has implementations of a lot of rounding-like FP operations. - static MVT RoundingVecTypes[] = {MVT::v2f32, MVT::v4f32, MVT::v2f64 }; - for (unsigned I = 0; I < array_lengthof(RoundingVecTypes); ++I) { - MVT Ty = RoundingVecTypes[I]; - setOperationAction(ISD::FFLOOR, Ty, Legal); - setOperationAction(ISD::FNEARBYINT, Ty, Legal); - setOperationAction(ISD::FCEIL, Ty, Legal); - setOperationAction(ISD::FRINT, Ty, Legal); - setOperationAction(ISD::FTRUNC, Ty, Legal); - setOperationAction(ISD::FROUND, Ty, Legal); - } - } -} - -void ARM64TargetLowering::addTypeForNEON(EVT VT, EVT PromotedBitwiseVT) { - if (VT == MVT::v2f32) { - setOperationAction(ISD::LOAD, VT.getSimpleVT(), Promote); - AddPromotedToType(ISD::LOAD, VT.getSimpleVT(), MVT::v2i32); - - setOperationAction(ISD::STORE, VT.getSimpleVT(), Promote); - AddPromotedToType(ISD::STORE, VT.getSimpleVT(), MVT::v2i32); - } else if (VT == MVT::v2f64 || VT == MVT::v4f32) { - setOperationAction(ISD::LOAD, VT.getSimpleVT(), Promote); - AddPromotedToType(ISD::LOAD, VT.getSimpleVT(), MVT::v2i64); - - setOperationAction(ISD::STORE, VT.getSimpleVT(), Promote); - AddPromotedToType(ISD::STORE, VT.getSimpleVT(), MVT::v2i64); - } - - // Mark vector float intrinsics as expand. 
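Roughly speaking, Expand for the vector FP intrinsics below means the node is scalarized and each lane becomes an ordinary scalar operation or libcall; a minimal sketch (hypothetical helper, illustration only) for a two-lane sine:

#include <cmath>

static void sinV2F32(const float In[2], float Out[2]) {
  for (int I = 0; I != 2; ++I)
    Out[I] = std::sin(In[I]);   // one scalar sine per lane
}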
- if (VT == MVT::v2f32 || VT == MVT::v4f32 || VT == MVT::v2f64) { - setOperationAction(ISD::FSIN, VT.getSimpleVT(), Expand); - setOperationAction(ISD::FCOS, VT.getSimpleVT(), Expand); - setOperationAction(ISD::FPOWI, VT.getSimpleVT(), Expand); - setOperationAction(ISD::FPOW, VT.getSimpleVT(), Expand); - setOperationAction(ISD::FLOG, VT.getSimpleVT(), Expand); - setOperationAction(ISD::FLOG2, VT.getSimpleVT(), Expand); - setOperationAction(ISD::FLOG10, VT.getSimpleVT(), Expand); - setOperationAction(ISD::FEXP, VT.getSimpleVT(), Expand); - setOperationAction(ISD::FEXP2, VT.getSimpleVT(), Expand); - } - - setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT.getSimpleVT(), Custom); - setOperationAction(ISD::INSERT_VECTOR_ELT, VT.getSimpleVT(), Custom); - setOperationAction(ISD::BUILD_VECTOR, VT.getSimpleVT(), Custom); - setOperationAction(ISD::VECTOR_SHUFFLE, VT.getSimpleVT(), Custom); - setOperationAction(ISD::EXTRACT_SUBVECTOR, VT.getSimpleVT(), Custom); - setOperationAction(ISD::SRA, VT.getSimpleVT(), Custom); - setOperationAction(ISD::SRL, VT.getSimpleVT(), Custom); - setOperationAction(ISD::SHL, VT.getSimpleVT(), Custom); - setOperationAction(ISD::AND, VT.getSimpleVT(), Custom); - setOperationAction(ISD::OR, VT.getSimpleVT(), Custom); - setOperationAction(ISD::SETCC, VT.getSimpleVT(), Custom); - setOperationAction(ISD::CONCAT_VECTORS, VT.getSimpleVT(), Legal); - - setOperationAction(ISD::SELECT, VT.getSimpleVT(), Expand); - setOperationAction(ISD::SELECT_CC, VT.getSimpleVT(), Expand); - setOperationAction(ISD::VSELECT, VT.getSimpleVT(), Expand); - setLoadExtAction(ISD::EXTLOAD, VT.getSimpleVT(), Expand); - - // CNT supports only B element sizes. - if (VT != MVT::v8i8 && VT != MVT::v16i8) - setOperationAction(ISD::CTPOP, VT.getSimpleVT(), Expand); - - setOperationAction(ISD::UDIV, VT.getSimpleVT(), Expand); - setOperationAction(ISD::SDIV, VT.getSimpleVT(), Expand); - setOperationAction(ISD::UREM, VT.getSimpleVT(), Expand); - setOperationAction(ISD::SREM, VT.getSimpleVT(), Expand); - setOperationAction(ISD::FREM, VT.getSimpleVT(), Expand); - - setOperationAction(ISD::FP_TO_SINT, VT.getSimpleVT(), Custom); - setOperationAction(ISD::FP_TO_UINT, VT.getSimpleVT(), Custom); - - if (Subtarget->isLittleEndian()) { - for (unsigned im = (unsigned)ISD::PRE_INC; - im != (unsigned)ISD::LAST_INDEXED_MODE; ++im) { - setIndexedLoadAction(im, VT.getSimpleVT(), Legal); - setIndexedStoreAction(im, VT.getSimpleVT(), Legal); - } - } -} - -void ARM64TargetLowering::addDRTypeForNEON(MVT VT) { - addRegisterClass(VT, &ARM64::FPR64RegClass); - addTypeForNEON(VT, MVT::v2i32); -} - -void ARM64TargetLowering::addQRTypeForNEON(MVT VT) { - addRegisterClass(VT, &ARM64::FPR128RegClass); - addTypeForNEON(VT, MVT::v4i32); -} - -EVT ARM64TargetLowering::getSetCCResultType(LLVMContext &, EVT VT) const { - if (!VT.isVector()) - return MVT::i32; - return VT.changeVectorElementTypeToInteger(); -} - -/// computeKnownBitsForTargetNode - Determine which of the bits specified in -/// Mask are known to be either zero or one and return them in the -/// KnownZero/KnownOne bitsets. 
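A worked example for the UMAXV/UMINV case handled below (hypothetical helper, illustration only): the across-lanes unsigned maximum of a v8i8 vector always fits in 8 bits, so bits [31:8] of the i32 result can safely be reported as known zero.

#include <cstdint>

static uint32_t umaxvV8i8(const uint8_t Lanes[8]) {
  uint32_t Max = 0;
  for (int I = 0; I != 8; ++I)
    if (Lanes[I] > Max)
      Max = Lanes[I];
  return Max;                   // always <= 0xFF: the upper 24 bits are zero
}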
-void ARM64TargetLowering::computeKnownBitsForTargetNode( - const SDValue Op, APInt &KnownZero, APInt &KnownOne, - const SelectionDAG &DAG, unsigned Depth) const { - switch (Op.getOpcode()) { - default: - break; - case ARM64ISD::CSEL: { - APInt KnownZero2, KnownOne2; - DAG.computeKnownBits(Op->getOperand(0), KnownZero, KnownOne, Depth + 1); - DAG.computeKnownBits(Op->getOperand(1), KnownZero2, KnownOne2, Depth + 1); - KnownZero &= KnownZero2; - KnownOne &= KnownOne2; - break; - } - case ISD::INTRINSIC_W_CHAIN: { - ConstantSDNode *CN = cast(Op->getOperand(1)); - Intrinsic::ID IntID = static_cast(CN->getZExtValue()); - switch (IntID) { - default: return; - case Intrinsic::arm64_ldaxr: - case Intrinsic::arm64_ldxr: { - unsigned BitWidth = KnownOne.getBitWidth(); - EVT VT = cast(Op)->getMemoryVT(); - unsigned MemBits = VT.getScalarType().getSizeInBits(); - KnownZero |= APInt::getHighBitsSet(BitWidth, BitWidth - MemBits); - return; - } - } - break; - } - case ISD::INTRINSIC_WO_CHAIN: - case ISD::INTRINSIC_VOID: { - unsigned IntNo = cast(Op.getOperand(0))->getZExtValue(); - switch (IntNo) { - default: - break; - case Intrinsic::arm64_neon_umaxv: - case Intrinsic::arm64_neon_uminv: { - // Figure out the datatype of the vector operand. The UMINV instruction - // will zero extend the result, so we can mark as known zero all the - // bits larger than the element datatype. 32-bit or larget doesn't need - // this as those are legal types and will be handled by isel directly. - MVT VT = Op.getOperand(1).getValueType().getSimpleVT(); - unsigned BitWidth = KnownZero.getBitWidth(); - if (VT == MVT::v8i8 || VT == MVT::v16i8) { - assert(BitWidth >= 8 && "Unexpected width!"); - APInt Mask = APInt::getHighBitsSet(BitWidth, BitWidth - 8); - KnownZero |= Mask; - } else if (VT == MVT::v4i16 || VT == MVT::v8i16) { - assert(BitWidth >= 16 && "Unexpected width!"); - APInt Mask = APInt::getHighBitsSet(BitWidth, BitWidth - 16); - KnownZero |= Mask; - } - break; - } break; - } - } - } -} - -MVT ARM64TargetLowering::getScalarShiftAmountTy(EVT LHSTy) const { - return MVT::i64; -} - -unsigned ARM64TargetLowering::getMaximalGlobalOffset() const { - // FIXME: On ARM64, this depends on the type. - // Basically, the addressable offsets are o to 4095 * Ty.getSizeInBytes(). - // and the offset has to be a multiple of the related size in bytes. 
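A small model of the addressing rule that comment describes (hypothetical helper, illustration only): the scaled unsigned 12-bit form reaches up to 4095 times the access size, provided the offset is a multiple of that size.

#include <cstdint>

static bool fitsScaledUImm12(uint64_t Offset, uint64_t AccessBytes) {
  if (AccessBytes == 0 || Offset % AccessBytes != 0)
    return false;                          // must be a multiple of the access size
  return Offset / AccessBytes <= 4095;     // e.g. up to 4095 * 8 = 32760 for i64
}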
- return 4095; -} - -FastISel * -ARM64TargetLowering::createFastISel(FunctionLoweringInfo &funcInfo, - const TargetLibraryInfo *libInfo) const { - return ARM64::createFastISel(funcInfo, libInfo); -} - -const char *ARM64TargetLowering::getTargetNodeName(unsigned Opcode) const { - switch (Opcode) { - default: - return nullptr; - case ARM64ISD::CALL: return "ARM64ISD::CALL"; - case ARM64ISD::ADRP: return "ARM64ISD::ADRP"; - case ARM64ISD::ADDlow: return "ARM64ISD::ADDlow"; - case ARM64ISD::LOADgot: return "ARM64ISD::LOADgot"; - case ARM64ISD::RET_FLAG: return "ARM64ISD::RET_FLAG"; - case ARM64ISD::BRCOND: return "ARM64ISD::BRCOND"; - case ARM64ISD::CSEL: return "ARM64ISD::CSEL"; - case ARM64ISD::FCSEL: return "ARM64ISD::FCSEL"; - case ARM64ISD::CSINV: return "ARM64ISD::CSINV"; - case ARM64ISD::CSNEG: return "ARM64ISD::CSNEG"; - case ARM64ISD::CSINC: return "ARM64ISD::CSINC"; - case ARM64ISD::THREAD_POINTER: return "ARM64ISD::THREAD_POINTER"; - case ARM64ISD::TLSDESC_CALL: return "ARM64ISD::TLSDESC_CALL"; - case ARM64ISD::ADC: return "ARM64ISD::ADC"; - case ARM64ISD::SBC: return "ARM64ISD::SBC"; - case ARM64ISD::ADDS: return "ARM64ISD::ADDS"; - case ARM64ISD::SUBS: return "ARM64ISD::SUBS"; - case ARM64ISD::ADCS: return "ARM64ISD::ADCS"; - case ARM64ISD::SBCS: return "ARM64ISD::SBCS"; - case ARM64ISD::ANDS: return "ARM64ISD::ANDS"; - case ARM64ISD::FCMP: return "ARM64ISD::FCMP"; - case ARM64ISD::FMIN: return "ARM64ISD::FMIN"; - case ARM64ISD::FMAX: return "ARM64ISD::FMAX"; - case ARM64ISD::DUP: return "ARM64ISD::DUP"; - case ARM64ISD::DUPLANE8: return "ARM64ISD::DUPLANE8"; - case ARM64ISD::DUPLANE16: return "ARM64ISD::DUPLANE16"; - case ARM64ISD::DUPLANE32: return "ARM64ISD::DUPLANE32"; - case ARM64ISD::DUPLANE64: return "ARM64ISD::DUPLANE64"; - case ARM64ISD::MOVI: return "ARM64ISD::MOVI"; - case ARM64ISD::MOVIshift: return "ARM64ISD::MOVIshift"; - case ARM64ISD::MOVIedit: return "ARM64ISD::MOVIedit"; - case ARM64ISD::MOVImsl: return "ARM64ISD::MOVImsl"; - case ARM64ISD::FMOV: return "ARM64ISD::FMOV"; - case ARM64ISD::MVNIshift: return "ARM64ISD::MVNIshift"; - case ARM64ISD::MVNImsl: return "ARM64ISD::MVNImsl"; - case ARM64ISD::BICi: return "ARM64ISD::BICi"; - case ARM64ISD::ORRi: return "ARM64ISD::ORRi"; - case ARM64ISD::BSL: return "ARM64ISD::BSL"; - case ARM64ISD::NEG: return "ARM64ISD::NEG"; - case ARM64ISD::EXTR: return "ARM64ISD::EXTR"; - case ARM64ISD::ZIP1: return "ARM64ISD::ZIP1"; - case ARM64ISD::ZIP2: return "ARM64ISD::ZIP2"; - case ARM64ISD::UZP1: return "ARM64ISD::UZP1"; - case ARM64ISD::UZP2: return "ARM64ISD::UZP2"; - case ARM64ISD::TRN1: return "ARM64ISD::TRN1"; - case ARM64ISD::TRN2: return "ARM64ISD::TRN2"; - case ARM64ISD::REV16: return "ARM64ISD::REV16"; - case ARM64ISD::REV32: return "ARM64ISD::REV32"; - case ARM64ISD::REV64: return "ARM64ISD::REV64"; - case ARM64ISD::EXT: return "ARM64ISD::EXT"; - case ARM64ISD::VSHL: return "ARM64ISD::VSHL"; - case ARM64ISD::VLSHR: return "ARM64ISD::VLSHR"; - case ARM64ISD::VASHR: return "ARM64ISD::VASHR"; - case ARM64ISD::CMEQ: return "ARM64ISD::CMEQ"; - case ARM64ISD::CMGE: return "ARM64ISD::CMGE"; - case ARM64ISD::CMGT: return "ARM64ISD::CMGT"; - case ARM64ISD::CMHI: return "ARM64ISD::CMHI"; - case ARM64ISD::CMHS: return "ARM64ISD::CMHS"; - case ARM64ISD::FCMEQ: return "ARM64ISD::FCMEQ"; - case ARM64ISD::FCMGE: return "ARM64ISD::FCMGE"; - case ARM64ISD::FCMGT: return "ARM64ISD::FCMGT"; - case ARM64ISD::CMEQz: return "ARM64ISD::CMEQz"; - case ARM64ISD::CMGEz: return "ARM64ISD::CMGEz"; - case ARM64ISD::CMGTz: return "ARM64ISD::CMGTz"; - 
case ARM64ISD::CMLEz: return "ARM64ISD::CMLEz"; - case ARM64ISD::CMLTz: return "ARM64ISD::CMLTz"; - case ARM64ISD::FCMEQz: return "ARM64ISD::FCMEQz"; - case ARM64ISD::FCMGEz: return "ARM64ISD::FCMGEz"; - case ARM64ISD::FCMGTz: return "ARM64ISD::FCMGTz"; - case ARM64ISD::FCMLEz: return "ARM64ISD::FCMLEz"; - case ARM64ISD::FCMLTz: return "ARM64ISD::FCMLTz"; - case ARM64ISD::NOT: return "ARM64ISD::NOT"; - case ARM64ISD::BIT: return "ARM64ISD::BIT"; - case ARM64ISD::CBZ: return "ARM64ISD::CBZ"; - case ARM64ISD::CBNZ: return "ARM64ISD::CBNZ"; - case ARM64ISD::TBZ: return "ARM64ISD::TBZ"; - case ARM64ISD::TBNZ: return "ARM64ISD::TBNZ"; - case ARM64ISD::TC_RETURN: return "ARM64ISD::TC_RETURN"; - case ARM64ISD::SITOF: return "ARM64ISD::SITOF"; - case ARM64ISD::UITOF: return "ARM64ISD::UITOF"; - case ARM64ISD::SQSHL_I: return "ARM64ISD::SQSHL_I"; - case ARM64ISD::UQSHL_I: return "ARM64ISD::UQSHL_I"; - case ARM64ISD::SRSHR_I: return "ARM64ISD::SRSHR_I"; - case ARM64ISD::URSHR_I: return "ARM64ISD::URSHR_I"; - case ARM64ISD::SQSHLU_I: return "ARM64ISD::SQSHLU_I"; - case ARM64ISD::WrapperLarge: return "ARM64ISD::WrapperLarge"; - case ARM64ISD::LD2post: return "ARM64ISD::LD2post"; - case ARM64ISD::LD3post: return "ARM64ISD::LD3post"; - case ARM64ISD::LD4post: return "ARM64ISD::LD4post"; - case ARM64ISD::ST2post: return "ARM64ISD::ST2post"; - case ARM64ISD::ST3post: return "ARM64ISD::ST3post"; - case ARM64ISD::ST4post: return "ARM64ISD::ST4post"; - case ARM64ISD::LD1x2post: return "ARM64ISD::LD1x2post"; - case ARM64ISD::LD1x3post: return "ARM64ISD::LD1x3post"; - case ARM64ISD::LD1x4post: return "ARM64ISD::LD1x4post"; - case ARM64ISD::ST1x2post: return "ARM64ISD::ST1x2post"; - case ARM64ISD::ST1x3post: return "ARM64ISD::ST1x3post"; - case ARM64ISD::ST1x4post: return "ARM64ISD::ST1x4post"; - case ARM64ISD::LD1DUPpost: return "ARM64ISD::LD1DUPpost"; - case ARM64ISD::LD2DUPpost: return "ARM64ISD::LD2DUPpost"; - case ARM64ISD::LD3DUPpost: return "ARM64ISD::LD3DUPpost"; - case ARM64ISD::LD4DUPpost: return "ARM64ISD::LD4DUPpost"; - case ARM64ISD::LD1LANEpost: return "ARM64ISD::LD1LANEpost"; - case ARM64ISD::LD2LANEpost: return "ARM64ISD::LD2LANEpost"; - case ARM64ISD::LD3LANEpost: return "ARM64ISD::LD3LANEpost"; - case ARM64ISD::LD4LANEpost: return "ARM64ISD::LD4LANEpost"; - case ARM64ISD::ST2LANEpost: return "ARM64ISD::ST2LANEpost"; - case ARM64ISD::ST3LANEpost: return "ARM64ISD::ST3LANEpost"; - case ARM64ISD::ST4LANEpost: return "ARM64ISD::ST4LANEpost"; - } -} - -MachineBasicBlock * -ARM64TargetLowering::EmitF128CSEL(MachineInstr *MI, - MachineBasicBlock *MBB) const { - // We materialise the F128CSEL pseudo-instruction as some control flow and a - // phi node: - - // OrigBB: - // [... previous instrs leading to comparison ...] 
- // b.ne TrueBB - // b EndBB - // TrueBB: - // ; Fallthrough - // EndBB: - // Dest = PHI [IfTrue, TrueBB], [IfFalse, OrigBB] - - const TargetInstrInfo *TII = getTargetMachine().getInstrInfo(); - MachineFunction *MF = MBB->getParent(); - const BasicBlock *LLVM_BB = MBB->getBasicBlock(); - DebugLoc DL = MI->getDebugLoc(); - MachineFunction::iterator It = MBB; - ++It; - - unsigned DestReg = MI->getOperand(0).getReg(); - unsigned IfTrueReg = MI->getOperand(1).getReg(); - unsigned IfFalseReg = MI->getOperand(2).getReg(); - unsigned CondCode = MI->getOperand(3).getImm(); - bool NZCVKilled = MI->getOperand(4).isKill(); - - MachineBasicBlock *TrueBB = MF->CreateMachineBasicBlock(LLVM_BB); - MachineBasicBlock *EndBB = MF->CreateMachineBasicBlock(LLVM_BB); - MF->insert(It, TrueBB); - MF->insert(It, EndBB); - - // Transfer rest of current basic-block to EndBB - EndBB->splice(EndBB->begin(), MBB, std::next(MachineBasicBlock::iterator(MI)), - MBB->end()); - EndBB->transferSuccessorsAndUpdatePHIs(MBB); - - BuildMI(MBB, DL, TII->get(ARM64::Bcc)).addImm(CondCode).addMBB(TrueBB); - BuildMI(MBB, DL, TII->get(ARM64::B)).addMBB(EndBB); - MBB->addSuccessor(TrueBB); - MBB->addSuccessor(EndBB); - - // TrueBB falls through to the end. - TrueBB->addSuccessor(EndBB); - - if (!NZCVKilled) { - TrueBB->addLiveIn(ARM64::NZCV); - EndBB->addLiveIn(ARM64::NZCV); - } - - BuildMI(*EndBB, EndBB->begin(), DL, TII->get(ARM64::PHI), DestReg) - .addReg(IfTrueReg) - .addMBB(TrueBB) - .addReg(IfFalseReg) - .addMBB(MBB); - - MI->eraseFromParent(); - return EndBB; -} - -MachineBasicBlock * -ARM64TargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI, - MachineBasicBlock *BB) const { - switch (MI->getOpcode()) { - default: -#ifndef NDEBUG - MI->dump(); -#endif - assert(0 && "Unexpected instruction for custom inserter!"); - break; - - case ARM64::F128CSEL: - return EmitF128CSEL(MI, BB); - - case TargetOpcode::STACKMAP: - case TargetOpcode::PATCHPOINT: - return emitPatchPoint(MI, BB); - } - llvm_unreachable("Unexpected instruction for custom inserter!"); -} - -//===----------------------------------------------------------------------===// -// ARM64 Lowering private implementation. -//===----------------------------------------------------------------------===// - -//===----------------------------------------------------------------------===// -// Lowering Code -//===----------------------------------------------------------------------===// - -/// changeIntCCToARM64CC - Convert a DAG integer condition code to an ARM64 CC -static ARM64CC::CondCode changeIntCCToARM64CC(ISD::CondCode CC) { - switch (CC) { - default: - llvm_unreachable("Unknown condition code!"); - case ISD::SETNE: - return ARM64CC::NE; - case ISD::SETEQ: - return ARM64CC::EQ; - case ISD::SETGT: - return ARM64CC::GT; - case ISD::SETGE: - return ARM64CC::GE; - case ISD::SETLT: - return ARM64CC::LT; - case ISD::SETLE: - return ARM64CC::LE; - case ISD::SETUGT: - return ARM64CC::HI; - case ISD::SETUGE: - return ARM64CC::HS; - case ISD::SETULT: - return ARM64CC::LO; - case ISD::SETULE: - return ARM64CC::LS; - } -} - -/// changeFPCCToARM64CC - Convert a DAG fp condition code to an ARM64 CC. 
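One mapping in the function below deserves a worked example (hypothetical helper, illustration only): SETONE, ordered and not equal, has no single AArch64 condition, so it is split into MI and GT and the caller treats the result as their OR.

static bool setONEHolds(double LHS, double RHS) {
  bool MI = LHS < RHS;   // first condition after FCMP (false when unordered)
  bool GT = LHS > RHS;   // second condition (also false when unordered)
  return MI || GT;       // equivalent to "ordered && LHS != RHS"
}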
-static void changeFPCCToARM64CC(ISD::CondCode CC, ARM64CC::CondCode &CondCode, - ARM64CC::CondCode &CondCode2) { - CondCode2 = ARM64CC::AL; - switch (CC) { - default: - llvm_unreachable("Unknown FP condition!"); - case ISD::SETEQ: - case ISD::SETOEQ: - CondCode = ARM64CC::EQ; - break; - case ISD::SETGT: - case ISD::SETOGT: - CondCode = ARM64CC::GT; - break; - case ISD::SETGE: - case ISD::SETOGE: - CondCode = ARM64CC::GE; - break; - case ISD::SETOLT: - CondCode = ARM64CC::MI; - break; - case ISD::SETOLE: - CondCode = ARM64CC::LS; - break; - case ISD::SETONE: - CondCode = ARM64CC::MI; - CondCode2 = ARM64CC::GT; - break; - case ISD::SETO: - CondCode = ARM64CC::VC; - break; - case ISD::SETUO: - CondCode = ARM64CC::VS; - break; - case ISD::SETUEQ: - CondCode = ARM64CC::EQ; - CondCode2 = ARM64CC::VS; - break; - case ISD::SETUGT: - CondCode = ARM64CC::HI; - break; - case ISD::SETUGE: - CondCode = ARM64CC::PL; - break; - case ISD::SETLT: - case ISD::SETULT: - CondCode = ARM64CC::LT; - break; - case ISD::SETLE: - case ISD::SETULE: - CondCode = ARM64CC::LE; - break; - case ISD::SETNE: - case ISD::SETUNE: - CondCode = ARM64CC::NE; - break; - } -} - -/// changeVectorFPCCToARM64CC - Convert a DAG fp condition code to an ARM64 CC -/// usable with the vector instructions. Fewer operations are available without -/// a real NZCV register, so we have to use less efficient combinations to get -/// the same effect. -static void changeVectorFPCCToARM64CC(ISD::CondCode CC, - ARM64CC::CondCode &CondCode, - ARM64CC::CondCode &CondCode2, - bool &Invert) { - Invert = false; - switch (CC) { - default: - // Mostly the scalar mappings work fine. - changeFPCCToARM64CC(CC, CondCode, CondCode2); - break; - case ISD::SETUO: - Invert = true; // Fallthrough - case ISD::SETO: - CondCode = ARM64CC::MI; - CondCode2 = ARM64CC::GE; - break; - case ISD::SETUEQ: - case ISD::SETULT: - case ISD::SETULE: - case ISD::SETUGT: - case ISD::SETUGE: - // All of the compare-mask comparisons are ordered, but we can switch - // between the two by a double inversion. E.g. ULE == !OGT. - Invert = true; - changeFPCCToARM64CC(getSetCCInverse(CC, false), CondCode, CondCode2); - break; - } -} - -static bool isLegalArithImmed(uint64_t C) { - // Matches ARM64DAGToDAGISel::SelectArithImmed(). - return (C >> 12 == 0) || ((C & 0xFFFULL) == 0 && C >> 24 == 0); -} - -static SDValue emitComparison(SDValue LHS, SDValue RHS, ISD::CondCode CC, - SDLoc dl, SelectionDAG &DAG) { - EVT VT = LHS.getValueType(); - - if (VT.isFloatingPoint()) - return DAG.getNode(ARM64ISD::FCMP, dl, VT, LHS, RHS); - - // The CMP instruction is just an alias for SUBS, and representing it as - // SUBS means that it's possible to get CSE with subtract operations. - // A later phase can perform the optimization of setting the destination - // register to WZR/XZR if it ends up being unused. - unsigned Opcode = ARM64ISD::SUBS; - - if (RHS.getOpcode() == ISD::SUB && isa(RHS.getOperand(0)) && - cast(RHS.getOperand(0))->getZExtValue() == 0 && - (CC == ISD::SETEQ || CC == ISD::SETNE)) { - // We'd like to combine a (CMP op1, (sub 0, op2) into a CMN instruction on - // the grounds that "op1 - (-op2) == op1 + op2". However, the C and V flags - // can be set differently by this operation. It comes down to whether - // "SInt(~op2)+1 == SInt(~op2+1)" (and the same for UInt). If they are then - // everything is fine. If not then the optimization is wrong. Thus general - // comparisons are only valid if op2 != 0. 
- - // So, finally, the only LLVM-native comparisons that don't mention C and V - // are SETEQ and SETNE. They're the only ones we can safely use CMN for in - // the absence of information about op2. - Opcode = ARM64ISD::ADDS; - RHS = RHS.getOperand(1); - } else if (LHS.getOpcode() == ISD::AND && isa(RHS) && - cast(RHS)->getZExtValue() == 0 && - !isUnsignedIntSetCC(CC)) { - // Similarly, (CMP (and X, Y), 0) can be implemented with a TST - // (a.k.a. ANDS) except that the flags are only guaranteed to work for one - // of the signed comparisons. - Opcode = ARM64ISD::ANDS; - RHS = LHS.getOperand(1); - LHS = LHS.getOperand(0); - } - - return DAG.getNode(Opcode, dl, DAG.getVTList(VT, MVT::i32), LHS, RHS) - .getValue(1); -} - -static SDValue getARM64Cmp(SDValue LHS, SDValue RHS, ISD::CondCode CC, - SDValue &ARM64cc, SelectionDAG &DAG, SDLoc dl) { - if (ConstantSDNode *RHSC = dyn_cast(RHS.getNode())) { - EVT VT = RHS.getValueType(); - uint64_t C = RHSC->getZExtValue(); - if (!isLegalArithImmed(C)) { - // Constant does not fit, try adjusting it by one? - switch (CC) { - default: - break; - case ISD::SETLT: - case ISD::SETGE: - if ((VT == MVT::i32 && C != 0x80000000 && - isLegalArithImmed((uint32_t)(C - 1))) || - (VT == MVT::i64 && C != 0x80000000ULL && - isLegalArithImmed(C - 1ULL))) { - CC = (CC == ISD::SETLT) ? ISD::SETLE : ISD::SETGT; - C = (VT == MVT::i32) ? (uint32_t)(C - 1) : C - 1; - RHS = DAG.getConstant(C, VT); - } - break; - case ISD::SETULT: - case ISD::SETUGE: - if ((VT == MVT::i32 && C != 0 && - isLegalArithImmed((uint32_t)(C - 1))) || - (VT == MVT::i64 && C != 0ULL && isLegalArithImmed(C - 1ULL))) { - CC = (CC == ISD::SETULT) ? ISD::SETULE : ISD::SETUGT; - C = (VT == MVT::i32) ? (uint32_t)(C - 1) : C - 1; - RHS = DAG.getConstant(C, VT); - } - break; - case ISD::SETLE: - case ISD::SETGT: - if ((VT == MVT::i32 && C != 0x7fffffff && - isLegalArithImmed((uint32_t)(C + 1))) || - (VT == MVT::i64 && C != 0x7ffffffffffffffULL && - isLegalArithImmed(C + 1ULL))) { - CC = (CC == ISD::SETLE) ? ISD::SETLT : ISD::SETGE; - C = (VT == MVT::i32) ? (uint32_t)(C + 1) : C + 1; - RHS = DAG.getConstant(C, VT); - } - break; - case ISD::SETULE: - case ISD::SETUGT: - if ((VT == MVT::i32 && C != 0xffffffff && - isLegalArithImmed((uint32_t)(C + 1))) || - (VT == MVT::i64 && C != 0xfffffffffffffffULL && - isLegalArithImmed(C + 1ULL))) { - CC = (CC == ISD::SETULE) ? ISD::SETULT : ISD::SETUGE; - C = (VT == MVT::i32) ? (uint32_t)(C + 1) : C + 1; - RHS = DAG.getConstant(C, VT); - } - break; - } - } - } - - SDValue Cmp = emitComparison(LHS, RHS, CC, dl, DAG); - ARM64CC::CondCode ARM64CC = changeIntCCToARM64CC(CC); - ARM64cc = DAG.getConstant(ARM64CC, MVT::i32); - return Cmp; -} - -static std::pair -getARM64XALUOOp(ARM64CC::CondCode &CC, SDValue Op, SelectionDAG &DAG) { - assert((Op.getValueType() == MVT::i32 || Op.getValueType() == MVT::i64) && - "Unsupported value type"); - SDValue Value, Overflow; - SDLoc DL(Op); - SDValue LHS = Op.getOperand(0); - SDValue RHS = Op.getOperand(1); - unsigned Opc = 0; - switch (Op.getOpcode()) { - default: - llvm_unreachable("Unknown overflow instruction!"); - case ISD::SADDO: - Opc = ARM64ISD::ADDS; - CC = ARM64CC::VS; - break; - case ISD::UADDO: - Opc = ARM64ISD::ADDS; - CC = ARM64CC::HS; - break; - case ISD::SSUBO: - Opc = ARM64ISD::SUBS; - CC = ARM64CC::VS; - break; - case ISD::USUBO: - Opc = ARM64ISD::SUBS; - CC = ARM64CC::LO; - break; - // Multiply needs a little bit extra work. 
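Before the DAG form below, a plain scalar model of that extra work for the i32 case (hypothetical helpers, illustration only): do the multiply at 64 bits, then decide overflow from the upper half.

#include <cstdint>

static bool smulo32(int32_t A, int32_t B, int32_t &Res) {
  int64_t Wide = int64_t(A) * int64_t(B);   // widening multiply (SMULL/SMADDL)
  Res = int32_t(Wide);                      // low 32 bits are the result
  return Wide != int64_t(Res);              // high half != sign bits => overflow
}

static bool umulo32(uint32_t A, uint32_t B, uint32_t &Res) {
  uint64_t Wide = uint64_t(A) * uint64_t(B);
  Res = uint32_t(Wide);
  return (Wide >> 32) != 0;                 // any high bit set => overflow
}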
- case ISD::SMULO: - case ISD::UMULO: { - CC = ARM64CC::NE; - bool IsSigned = (Op.getOpcode() == ISD::SMULO) ? true : false; - if (Op.getValueType() == MVT::i32) { - unsigned ExtendOpc = IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND; - // For a 32 bit multiply with overflow check we want the instruction - // selector to generate a widening multiply (SMADDL/UMADDL). For that we - // need to generate the following pattern: - // (i64 add 0, (i64 mul (i64 sext|zext i32 %a), (i64 sext|zext i32 %b)) - LHS = DAG.getNode(ExtendOpc, DL, MVT::i64, LHS); - RHS = DAG.getNode(ExtendOpc, DL, MVT::i64, RHS); - SDValue Mul = DAG.getNode(ISD::MUL, DL, MVT::i64, LHS, RHS); - SDValue Add = DAG.getNode(ISD::ADD, DL, MVT::i64, Mul, - DAG.getConstant(0, MVT::i64)); - // On ARM64 the upper 32 bits are always zero extended for a 32 bit - // operation. We need to clear out the upper 32 bits, because we used a - // widening multiply that wrote all 64 bits. In the end this should be a - // noop. - Value = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, Add); - if (IsSigned) { - // The signed overflow check requires more than just a simple check for - // any bit set in the upper 32 bits of the result. These bits could be - // just the sign bits of a negative number. To perform the overflow - // check we have to arithmetic shift right the 32nd bit of the result by - // 31 bits. Then we compare the result to the upper 32 bits. - SDValue UpperBits = DAG.getNode(ISD::SRL, DL, MVT::i64, Add, - DAG.getConstant(32, MVT::i64)); - UpperBits = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, UpperBits); - SDValue LowerBits = DAG.getNode(ISD::SRA, DL, MVT::i32, Value, - DAG.getConstant(31, MVT::i64)); - // It is important that LowerBits is last, otherwise the arithmetic - // shift will not be folded into the compare (SUBS). - SDVTList VTs = DAG.getVTList(MVT::i32, MVT::i32); - Overflow = DAG.getNode(ARM64ISD::SUBS, DL, VTs, UpperBits, LowerBits) - .getValue(1); - } else { - // The overflow check for unsigned multiply is easy. We only need to - // check if any of the upper 32 bits are set. This can be done with a - // CMP (shifted register). For that we need to generate the following - // pattern: - // (i64 ARM64ISD::SUBS i64 0, (i64 srl i64 %Mul, i64 32) - SDValue UpperBits = DAG.getNode(ISD::SRL, DL, MVT::i64, Mul, - DAG.getConstant(32, MVT::i64)); - SDVTList VTs = DAG.getVTList(MVT::i64, MVT::i32); - Overflow = - DAG.getNode(ARM64ISD::SUBS, DL, VTs, DAG.getConstant(0, MVT::i64), - UpperBits).getValue(1); - } - break; - } - assert(Op.getValueType() == MVT::i64 && "Expected an i64 value type"); - // For the 64 bit multiply - Value = DAG.getNode(ISD::MUL, DL, MVT::i64, LHS, RHS); - if (IsSigned) { - SDValue UpperBits = DAG.getNode(ISD::MULHS, DL, MVT::i64, LHS, RHS); - SDValue LowerBits = DAG.getNode(ISD::SRA, DL, MVT::i64, Value, - DAG.getConstant(63, MVT::i64)); - // It is important that LowerBits is last, otherwise the arithmetic - // shift will not be folded into the compare (SUBS). - SDVTList VTs = DAG.getVTList(MVT::i64, MVT::i32); - Overflow = DAG.getNode(ARM64ISD::SUBS, DL, VTs, UpperBits, LowerBits) - .getValue(1); - } else { - SDValue UpperBits = DAG.getNode(ISD::MULHU, DL, MVT::i64, LHS, RHS); - SDVTList VTs = DAG.getVTList(MVT::i64, MVT::i32); - Overflow = - DAG.getNode(ARM64ISD::SUBS, DL, VTs, DAG.getConstant(0, MVT::i64), - UpperBits).getValue(1); - } - break; - } - } // switch (...) - - if (Opc) { - SDVTList VTs = DAG.getVTList(Op->getValueType(0), MVT::i32); - - // Emit the ARM64 operation with overflow check. 
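The flag choices above for the plain add/sub cases (VS for signed overflow, HS for carry, LO for borrow) correspond to the usual scalar tests; a minimal model (hypothetical helpers, illustration only):

#include <cstdint>

static bool saddOverflows(int32_t A, int32_t B) {   // SADDO: ADDS then VS
  int32_t R = int32_t(uint32_t(A) + uint32_t(B));
  return ((A ^ R) & (B ^ R)) < 0;                   // operands agree, result differs
}

static bool uaddCarries(uint32_t A, uint32_t B) {   // UADDO: ADDS then HS
  return A + B < A;                                 // carry out of the top bit
}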
- Value = DAG.getNode(Opc, DL, VTs, LHS, RHS); - Overflow = Value.getValue(1); - } - return std::make_pair(Value, Overflow); -} - -SDValue ARM64TargetLowering::LowerF128Call(SDValue Op, SelectionDAG &DAG, - RTLIB::Libcall Call) const { - SmallVector Ops; - for (unsigned i = 0, e = Op->getNumOperands(); i != e; ++i) - Ops.push_back(Op.getOperand(i)); - - return makeLibCall(DAG, Call, MVT::f128, &Ops[0], Ops.size(), false, - SDLoc(Op)).first; -} - -static SDValue LowerXOR(SDValue Op, SelectionDAG &DAG) { - SDValue Sel = Op.getOperand(0); - SDValue Other = Op.getOperand(1); - - // If neither operand is a SELECT_CC, give up. - if (Sel.getOpcode() != ISD::SELECT_CC) - std::swap(Sel, Other); - if (Sel.getOpcode() != ISD::SELECT_CC) - return Op; - - // The folding we want to perform is: - // (xor x, (select_cc a, b, cc, 0, -1) ) - // --> - // (csel x, (xor x, -1), cc ...) - // - // The latter will get matched to a CSINV instruction. - - ISD::CondCode CC = cast(Sel.getOperand(4))->get(); - SDValue LHS = Sel.getOperand(0); - SDValue RHS = Sel.getOperand(1); - SDValue TVal = Sel.getOperand(2); - SDValue FVal = Sel.getOperand(3); - SDLoc dl(Sel); - - // FIXME: This could be generalized to non-integer comparisons. - if (LHS.getValueType() != MVT::i32 && LHS.getValueType() != MVT::i64) - return Op; - - ConstantSDNode *CFVal = dyn_cast(FVal); - ConstantSDNode *CTVal = dyn_cast(TVal); - - // The the values aren't constants, this isn't the pattern we're looking for. - if (!CFVal || !CTVal) - return Op; - - // We can commute the SELECT_CC by inverting the condition. This - // might be needed to make this fit into a CSINV pattern. - if (CTVal->isAllOnesValue() && CFVal->isNullValue()) { - std::swap(TVal, FVal); - std::swap(CTVal, CFVal); - CC = ISD::getSetCCInverse(CC, true); - } - - // If the constants line up, perform the transform! - if (CTVal->isNullValue() && CFVal->isAllOnesValue()) { - SDValue CCVal; - SDValue Cmp = getARM64Cmp(LHS, RHS, CC, CCVal, DAG, dl); - - FVal = Other; - TVal = DAG.getNode(ISD::XOR, dl, Other.getValueType(), Other, - DAG.getConstant(-1ULL, Other.getValueType())); - - return DAG.getNode(ARM64ISD::CSEL, dl, Sel.getValueType(), FVal, TVal, - CCVal, Cmp); - } - - return Op; -} - -static SDValue LowerADDC_ADDE_SUBC_SUBE(SDValue Op, SelectionDAG &DAG) { - EVT VT = Op.getValueType(); - - // Let legalize expand this if it isn't a legal type yet. - if (!DAG.getTargetLoweringInfo().isTypeLegal(VT)) - return SDValue(); - - SDVTList VTs = DAG.getVTList(VT, MVT::i32); - - unsigned Opc; - bool ExtraOp = false; - switch (Op.getOpcode()) { - default: - assert(0 && "Invalid code"); - case ISD::ADDC: - Opc = ARM64ISD::ADDS; - break; - case ISD::SUBC: - Opc = ARM64ISD::SUBS; - break; - case ISD::ADDE: - Opc = ARM64ISD::ADCS; - ExtraOp = true; - break; - case ISD::SUBE: - Opc = ARM64ISD::SBCS; - ExtraOp = true; - break; - } - - if (!ExtraOp) - return DAG.getNode(Opc, SDLoc(Op), VTs, Op.getOperand(0), Op.getOperand(1)); - return DAG.getNode(Opc, SDLoc(Op), VTs, Op.getOperand(0), Op.getOperand(1), - Op.getOperand(2)); -} - -static SDValue LowerXALUO(SDValue Op, SelectionDAG &DAG) { - // Let legalize expand this if it isn't a legal type yet. - if (!DAG.getTargetLoweringInfo().isTypeLegal(Op.getValueType())) - return SDValue(); - - ARM64CC::CondCode CC; - // The actual operation that sets the overflow or carry flag. - SDValue Value, Overflow; - std::tie(Value, Overflow) = getARM64XALUOOp(CC, Op, DAG); - - // We use 0 and 1 as false and true values. 
- SDValue TVal = DAG.getConstant(1, MVT::i32); - SDValue FVal = DAG.getConstant(0, MVT::i32); - - // We use an inverted condition, because the conditional select is inverted - // too. This will allow it to be selected to a single instruction: - // CSINC Wd, WZR, WZR, invert(cond). - SDValue CCVal = DAG.getConstant(getInvertedCondCode(CC), MVT::i32); - Overflow = DAG.getNode(ARM64ISD::CSEL, SDLoc(Op), MVT::i32, FVal, TVal, CCVal, - Overflow); - - SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32); - return DAG.getNode(ISD::MERGE_VALUES, SDLoc(Op), VTs, Value, Overflow); -} - -// Prefetch operands are: -// 1: Address to prefetch -// 2: bool isWrite -// 3: int locality (0 = no locality ... 3 = extreme locality) -// 4: bool isDataCache -static SDValue LowerPREFETCH(SDValue Op, SelectionDAG &DAG) { - SDLoc DL(Op); - unsigned IsWrite = cast(Op.getOperand(2))->getZExtValue(); - unsigned Locality = cast(Op.getOperand(3))->getZExtValue(); - // The data thing is not used. - // unsigned isData = cast(Op.getOperand(4))->getZExtValue(); - - bool IsStream = !Locality; - // When the locality number is set - if (Locality) { - // The front-end should have filtered out the out-of-range values - assert(Locality <= 3 && "Prefetch locality out-of-range"); - // The locality degree is the opposite of the cache speed. - // Put the number the other way around. - // The encoding starts at 0 for level 1 - Locality = 3 - Locality; - } - - // built the mask value encoding the expected behavior. - unsigned PrfOp = (IsWrite << 4) | // Load/Store bit - (Locality << 1) | // Cache level bits - (unsigned)IsStream; // Stream bit - return DAG.getNode(ARM64ISD::PREFETCH, DL, MVT::Other, Op.getOperand(0), - DAG.getConstant(PrfOp, MVT::i32), Op.getOperand(1)); -} - -SDValue ARM64TargetLowering::LowerFP_EXTEND(SDValue Op, - SelectionDAG &DAG) const { - assert(Op.getValueType() == MVT::f128 && "Unexpected lowering"); - - RTLIB::Libcall LC; - LC = RTLIB::getFPEXT(Op.getOperand(0).getValueType(), Op.getValueType()); - - return LowerF128Call(Op, DAG, LC); -} - -SDValue ARM64TargetLowering::LowerFP_ROUND(SDValue Op, - SelectionDAG &DAG) const { - if (Op.getOperand(0).getValueType() != MVT::f128) { - // It's legal except when f128 is involved - return Op; - } - - RTLIB::Libcall LC; - LC = RTLIB::getFPROUND(Op.getOperand(0).getValueType(), Op.getValueType()); - - // FP_ROUND node has a second operand indicating whether it is known to be - // precise. That doesn't take part in the LibCall so we can't directly use - // LowerF128Call. - SDValue SrcVal = Op.getOperand(0); - return makeLibCall(DAG, LC, Op.getValueType(), &SrcVal, 1, - /*isSigned*/ false, SDLoc(Op)).first; -} - -static SDValue LowerVectorFP_TO_INT(SDValue Op, SelectionDAG &DAG) { - // Warning: We maintain cost tables in ARM64TargetTransformInfo.cpp. - // Any additional optimization in this function should be recorded - // in the cost tables. - EVT InVT = Op.getOperand(0).getValueType(); - EVT VT = Op.getValueType(); - - // FP_TO_XINT conversion from the same type are legal. 
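When the source and destination widths differ, the code below converts at the source width first; a scalar model of that narrowing path (hypothetical helper, illustration only, for values that fit in the destination):

#include <cstdint>

static int32_t fpToS32ViaS64(double D) {
  int64_t Wide = int64_t(D);   // convert at the source width (v2f64 -> v2i64)
  return int32_t(Wide);        // then truncate each lane (v2i64 -> v2i32)
}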
- if (VT.getSizeInBits() == InVT.getSizeInBits()) - return Op; - - if (InVT == MVT::v2f64 || InVT == MVT::v4f32) { - SDLoc dl(Op); - SDValue Cv = - DAG.getNode(Op.getOpcode(), dl, InVT.changeVectorElementTypeToInteger(), - Op.getOperand(0)); - return DAG.getNode(ISD::TRUNCATE, dl, VT, Cv); - } else if (InVT == MVT::v2f32) { - SDLoc dl(Op); - SDValue Ext = DAG.getNode(ISD::FP_EXTEND, dl, MVT::v2f64, Op.getOperand(0)); - return DAG.getNode(Op.getOpcode(), dl, VT, Ext); - } - - // Type changing conversions are illegal. - return SDValue(); -} - -SDValue ARM64TargetLowering::LowerFP_TO_INT(SDValue Op, - SelectionDAG &DAG) const { - if (Op.getOperand(0).getValueType().isVector()) - return LowerVectorFP_TO_INT(Op, DAG); - - if (Op.getOperand(0).getValueType() != MVT::f128) { - // It's legal except when f128 is involved - return Op; - } - - RTLIB::Libcall LC; - if (Op.getOpcode() == ISD::FP_TO_SINT) - LC = RTLIB::getFPTOSINT(Op.getOperand(0).getValueType(), Op.getValueType()); - else - LC = RTLIB::getFPTOUINT(Op.getOperand(0).getValueType(), Op.getValueType()); - - SmallVector Ops; - for (unsigned i = 0, e = Op->getNumOperands(); i != e; ++i) - Ops.push_back(Op.getOperand(i)); - - return makeLibCall(DAG, LC, Op.getValueType(), &Ops[0], Ops.size(), false, - SDLoc(Op)).first; -} - -static SDValue LowerVectorINT_TO_FP(SDValue Op, SelectionDAG &DAG) { - // Warning: We maintain cost tables in ARM64TargetTransformInfo.cpp. - // Any additional optimization in this function should be recorded - // in the cost tables. - EVT VT = Op.getValueType(); - SDLoc dl(Op); - SDValue In = Op.getOperand(0); - EVT InVT = In.getValueType(); - - // v2i32 to v2f32 is legal. - if (VT == MVT::v2f32 && InVT == MVT::v2i32) - return Op; - - // This function only handles v2f64 outputs. - if (VT == MVT::v2f64) { - // Extend the input argument to a v2i64 that we can feed into the - // floating point conversion. Zero or sign extend based on whether - // we're doing a signed or unsigned float conversion. - unsigned Opc = - Op.getOpcode() == ISD::UINT_TO_FP ? ISD::ZERO_EXTEND : ISD::SIGN_EXTEND; - assert(Op.getNumOperands() == 1 && "FP conversions take one argument"); - SDValue Promoted = DAG.getNode(Opc, dl, MVT::v2i64, Op.getOperand(0)); - return DAG.getNode(Op.getOpcode(), dl, Op.getValueType(), Promoted); - } - - // Scalarize v2i64 to v2f32 conversions. - std::vector BuildVectorOps; - for (unsigned i = 0; i < VT.getVectorNumElements(); ++i) { - SDValue Sclr = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i64, In, - DAG.getConstant(i, MVT::i64)); - Sclr = DAG.getNode(Op->getOpcode(), dl, MVT::f32, Sclr); - BuildVectorOps.push_back(Sclr); - } - - return DAG.getNode(ISD::BUILD_VECTOR, dl, VT, BuildVectorOps); -} - -SDValue ARM64TargetLowering::LowerINT_TO_FP(SDValue Op, - SelectionDAG &DAG) const { - if (Op.getValueType().isVector()) - return LowerVectorINT_TO_FP(Op, DAG); - - // i128 conversions are libcalls. - if (Op.getOperand(0).getValueType() == MVT::i128) - return SDValue(); - - // Other conversions are legal, unless it's to the completely software-based - // fp128. 
- if (Op.getValueType() != MVT::f128) - return Op; - - RTLIB::Libcall LC; - if (Op.getOpcode() == ISD::SINT_TO_FP) - LC = RTLIB::getSINTTOFP(Op.getOperand(0).getValueType(), Op.getValueType()); - else - LC = RTLIB::getUINTTOFP(Op.getOperand(0).getValueType(), Op.getValueType()); - - return LowerF128Call(Op, DAG, LC); -} - -SDValue ARM64TargetLowering::LowerFSINCOS(SDValue Op, SelectionDAG &DAG) const { - // For iOS, we want to call an alternative entry point: __sincos_stret, - // which returns the values in two S / D registers. - SDLoc dl(Op); - SDValue Arg = Op.getOperand(0); - EVT ArgVT = Arg.getValueType(); - Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext()); - - ArgListTy Args; - ArgListEntry Entry; - - Entry.Node = Arg; - Entry.Ty = ArgTy; - Entry.isSExt = false; - Entry.isZExt = false; - Args.push_back(Entry); - - const char *LibcallName = - (ArgVT == MVT::f64) ? "__sincos_stret" : "__sincosf_stret"; - SDValue Callee = DAG.getExternalSymbol(LibcallName, getPointerTy()); - - StructType *RetTy = StructType::get(ArgTy, ArgTy, NULL); - TargetLowering::CallLoweringInfo CLI(DAG); - CLI.setDebugLoc(dl).setChain(DAG.getEntryNode()) - .setCallee(CallingConv::Fast, RetTy, Callee, &Args, 0); - - std::pair CallResult = LowerCallTo(CLI); - return CallResult.first; -} - -SDValue ARM64TargetLowering::LowerOperation(SDValue Op, - SelectionDAG &DAG) const { - switch (Op.getOpcode()) { - default: - llvm_unreachable("unimplemented operand"); - return SDValue(); - case ISD::GlobalAddress: - return LowerGlobalAddress(Op, DAG); - case ISD::GlobalTLSAddress: - return LowerGlobalTLSAddress(Op, DAG); - case ISD::SETCC: - return LowerSETCC(Op, DAG); - case ISD::BR_CC: - return LowerBR_CC(Op, DAG); - case ISD::SELECT: - return LowerSELECT(Op, DAG); - case ISD::SELECT_CC: - return LowerSELECT_CC(Op, DAG); - case ISD::JumpTable: - return LowerJumpTable(Op, DAG); - case ISD::ConstantPool: - return LowerConstantPool(Op, DAG); - case ISD::BlockAddress: - return LowerBlockAddress(Op, DAG); - case ISD::VASTART: - return LowerVASTART(Op, DAG); - case ISD::VACOPY: - return LowerVACOPY(Op, DAG); - case ISD::VAARG: - return LowerVAARG(Op, DAG); - case ISD::ADDC: - case ISD::ADDE: - case ISD::SUBC: - case ISD::SUBE: - return LowerADDC_ADDE_SUBC_SUBE(Op, DAG); - case ISD::SADDO: - case ISD::UADDO: - case ISD::SSUBO: - case ISD::USUBO: - case ISD::SMULO: - case ISD::UMULO: - return LowerXALUO(Op, DAG); - case ISD::FADD: - return LowerF128Call(Op, DAG, RTLIB::ADD_F128); - case ISD::FSUB: - return LowerF128Call(Op, DAG, RTLIB::SUB_F128); - case ISD::FMUL: - return LowerF128Call(Op, DAG, RTLIB::MUL_F128); - case ISD::FDIV: - return LowerF128Call(Op, DAG, RTLIB::DIV_F128); - case ISD::FP_ROUND: - return LowerFP_ROUND(Op, DAG); - case ISD::FP_EXTEND: - return LowerFP_EXTEND(Op, DAG); - case ISD::FRAMEADDR: - return LowerFRAMEADDR(Op, DAG); - case ISD::RETURNADDR: - return LowerRETURNADDR(Op, DAG); - case ISD::INSERT_VECTOR_ELT: - return LowerINSERT_VECTOR_ELT(Op, DAG); - case ISD::EXTRACT_VECTOR_ELT: - return LowerEXTRACT_VECTOR_ELT(Op, DAG); - case ISD::BUILD_VECTOR: - return LowerBUILD_VECTOR(Op, DAG); - case ISD::VECTOR_SHUFFLE: - return LowerVECTOR_SHUFFLE(Op, DAG); - case ISD::EXTRACT_SUBVECTOR: - return LowerEXTRACT_SUBVECTOR(Op, DAG); - case ISD::SRA: - case ISD::SRL: - case ISD::SHL: - return LowerVectorSRA_SRL_SHL(Op, DAG); - case ISD::SHL_PARTS: - return LowerShiftLeftParts(Op, DAG); - case ISD::SRL_PARTS: - case ISD::SRA_PARTS: - return LowerShiftRightParts(Op, DAG); - case ISD::CTPOP: - return LowerCTPOP(Op, DAG); 
- case ISD::FCOPYSIGN: - return LowerFCOPYSIGN(Op, DAG); - case ISD::AND: - return LowerVectorAND(Op, DAG); - case ISD::OR: - return LowerVectorOR(Op, DAG); - case ISD::XOR: - return LowerXOR(Op, DAG); - case ISD::PREFETCH: - return LowerPREFETCH(Op, DAG); - case ISD::SINT_TO_FP: - case ISD::UINT_TO_FP: - return LowerINT_TO_FP(Op, DAG); - case ISD::FP_TO_SINT: - case ISD::FP_TO_UINT: - return LowerFP_TO_INT(Op, DAG); - case ISD::FSINCOS: - return LowerFSINCOS(Op, DAG); - } -} - -/// getFunctionAlignment - Return the Log2 alignment of this function. -unsigned ARM64TargetLowering::getFunctionAlignment(const Function *F) const { - return 2; -} - -//===----------------------------------------------------------------------===// -// Calling Convention Implementation -//===----------------------------------------------------------------------===// - -#include "ARM64GenCallingConv.inc" - -/// Selects the correct CCAssignFn for a the given CallingConvention -/// value. -CCAssignFn *ARM64TargetLowering::CCAssignFnForCall(CallingConv::ID CC, - bool IsVarArg) const { - switch (CC) { - default: - llvm_unreachable("Unsupported calling convention."); - case CallingConv::WebKit_JS: - return CC_ARM64_WebKit_JS; - case CallingConv::C: - case CallingConv::Fast: - if (!Subtarget->isTargetDarwin()) - return CC_ARM64_AAPCS; - return IsVarArg ? CC_ARM64_DarwinPCS_VarArg : CC_ARM64_DarwinPCS; - } -} - -SDValue ARM64TargetLowering::LowerFormalArguments( - SDValue Chain, CallingConv::ID CallConv, bool isVarArg, - const SmallVectorImpl &Ins, SDLoc DL, SelectionDAG &DAG, - SmallVectorImpl &InVals) const { - MachineFunction &MF = DAG.getMachineFunction(); - MachineFrameInfo *MFI = MF.getFrameInfo(); - - // Assign locations to all of the incoming arguments. - SmallVector ArgLocs; - CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), - getTargetMachine(), ArgLocs, *DAG.getContext()); - - // At this point, Ins[].VT may already be promoted to i32. To correctly - // handle passing i8 as i8 instead of i32 on stack, we pass in both i32 and - // i8 to CC_ARM64_AAPCS with i32 being ValVT and i8 being LocVT. - // Since AnalyzeFormalArguments uses Ins[].VT for both ValVT and LocVT, here - // we use a special version of AnalyzeFormalArguments to pass in ValVT and - // LocVT. - unsigned NumArgs = Ins.size(); - Function::const_arg_iterator CurOrigArg = MF.getFunction()->arg_begin(); - unsigned CurArgIdx = 0; - for (unsigned i = 0; i != NumArgs; ++i) { - MVT ValVT = Ins[i].VT; - std::advance(CurOrigArg, Ins[i].OrigArgIndex - CurArgIdx); - CurArgIdx = Ins[i].OrigArgIndex; - - // Get type of the original argument. - EVT ActualVT = getValueType(CurOrigArg->getType(), /*AllowUnknown*/ true); - MVT ActualMVT = ActualVT.isSimple() ? ActualVT.getSimpleVT() : MVT::Other; - // If ActualMVT is i1/i8/i16, we should set LocVT to i8/i8/i16. - MVT LocVT = ValVT; - if (ActualMVT == MVT::i1 || ActualMVT == MVT::i8) - LocVT = MVT::i8; - else if (ActualMVT == MVT::i16) - LocVT = MVT::i16; - - CCAssignFn *AssignFn = CCAssignFnForCall(CallConv, /*IsVarArg=*/false); - bool Res = - AssignFn(i, ValVT, LocVT, CCValAssign::Full, Ins[i].Flags, CCInfo); - assert(!Res && "Call operand has unhandled type"); - (void)Res; - } - assert(ArgLocs.size() == Ins.size()); - SmallVector ArgValues; - for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) { - CCValAssign &VA = ArgLocs[i]; - - if (Ins[i].Flags.isByVal()) { - // Byval is used for HFAs in the PCS, but the system should work in a - // non-compliant manner for larger structs. 
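The byval path below sizes its frame object in whole 8-byte slots; a worked example (hypothetical helper, illustration only): a 20-byte struct rounds up to three slots, i.e. 24 bytes.

static unsigned byvalFrameBytes(unsigned SizeInBytes) {
  unsigned NumRegs = (SizeInBytes + 7) / 8;   // round up to 8-byte slots
  return 8 * NumRegs;                         // 20 bytes -> 3 slots -> 24 bytes
}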
- EVT PtrTy = getPointerTy(); - int Size = Ins[i].Flags.getByValSize(); - unsigned NumRegs = (Size + 7) / 8; - - // FIXME: This works on big-endian for composite byvals, which are the common - // case. It should also work for fundamental types too. - unsigned FrameIdx = - MFI->CreateFixedObject(8 * NumRegs, VA.getLocMemOffset(), false); - SDValue FrameIdxN = DAG.getFrameIndex(FrameIdx, PtrTy); - InVals.push_back(FrameIdxN); - - continue; - } if (VA.isRegLoc()) { - // Arguments stored in registers. - EVT RegVT = VA.getLocVT(); - - SDValue ArgValue; - const TargetRegisterClass *RC; - - if (RegVT == MVT::i32) - RC = &ARM64::GPR32RegClass; - else if (RegVT == MVT::i64) - RC = &ARM64::GPR64RegClass; - else if (RegVT == MVT::f32) - RC = &ARM64::FPR32RegClass; - else if (RegVT == MVT::f64 || RegVT.is64BitVector()) - RC = &ARM64::FPR64RegClass; - else if (RegVT == MVT::f128 || RegVT.is128BitVector()) - RC = &ARM64::FPR128RegClass; - else - llvm_unreachable("RegVT not supported by FORMAL_ARGUMENTS Lowering"); - - // Transform the arguments in physical registers into virtual ones. - unsigned Reg = MF.addLiveIn(VA.getLocReg(), RC); - ArgValue = DAG.getCopyFromReg(Chain, DL, Reg, RegVT); - - // If this is an 8, 16 or 32-bit value, it is really passed promoted - // to 64 bits. Insert an assert[sz]ext to capture this, then - // truncate to the right size. - switch (VA.getLocInfo()) { - default: - llvm_unreachable("Unknown loc info!"); - case CCValAssign::Full: - break; - case CCValAssign::BCvt: - ArgValue = DAG.getNode(ISD::BITCAST, DL, VA.getValVT(), ArgValue); - break; - case CCValAssign::SExt: - ArgValue = DAG.getNode(ISD::AssertSext, DL, RegVT, ArgValue, - DAG.getValueType(VA.getValVT())); - ArgValue = DAG.getNode(ISD::TRUNCATE, DL, VA.getValVT(), ArgValue); - break; - case CCValAssign::ZExt: - ArgValue = DAG.getNode(ISD::AssertZext, DL, RegVT, ArgValue, - DAG.getValueType(VA.getValVT())); - ArgValue = DAG.getNode(ISD::TRUNCATE, DL, VA.getValVT(), ArgValue); - break; - } - - InVals.push_back(ArgValue); - - } else { // VA.isRegLoc() - assert(VA.isMemLoc() && "CCValAssign is neither reg nor mem"); - unsigned ArgOffset = VA.getLocMemOffset(); - unsigned ArgSize = VA.getLocVT().getSizeInBits() / 8; - - uint32_t BEAlign = 0; - if (ArgSize < 8 && !Subtarget->isLittleEndian()) - BEAlign = 8 - ArgSize; - - int FI = MFI->CreateFixedObject(ArgSize, ArgOffset + BEAlign, true); - - // Create load nodes to retrieve arguments from the stack. - SDValue FIN = DAG.getFrameIndex(FI, getPointerTy()); - SDValue ArgValue; - - // If the loc type and val type are not the same, create an anyext load. - if (VA.getLocVT().getSizeInBits() != VA.getValVT().getSizeInBits()) { - // We should only get here if this is a pure integer. - assert(!VA.getValVT().isVector() && VA.getValVT().isInteger() && - "Only integer extension supported!"); - ArgValue = DAG.getExtLoad(ISD::EXTLOAD, DL, VA.getValVT(), Chain, FIN, - MachinePointerInfo::getFixedStack(FI), - VA.getLocVT(), - false, false, false, 0); - } else { - ArgValue = DAG.getLoad(VA.getValVT(), DL, Chain, FIN, - MachinePointerInfo::getFixedStack(FI), false, - false, false, 0); - } - - InVals.push_back(ArgValue); - } - } - - // varargs - if (isVarArg) { - if (!Subtarget->isTargetDarwin()) { - // The AAPCS variadic function ABI is identical to the non-variadic - // one. As a result there may be more arguments in registers and we should - // save them for future reference. 
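Two pieces of arithmetic from the argument lowering above, shown in isolation: a byval aggregate is given whole 8-byte stack slots, and on big-endian targets a stack argument smaller than 8 bytes is offset so it sits at the high end of its slot (the BEAlign adjustment). A sketch under those assumptions, with hypothetical helper names:

    #include <cassert>
    #include <cstdio>

    // Round a byval aggregate up to whole 8-byte slots, as the byval branch does.
    unsigned byvalFrameBytes(unsigned sizeInBytes) {
      unsigned numSlots = (sizeInBytes + 7) / 8;
      return 8 * numSlots;
    }

    // Big-endian adjustment for a small stack argument: pad so the value sits at
    // the most-significant end of its 8-byte slot (BEAlign in the code above).
    unsigned beAlign(unsigned argSizeInBytes, bool isLittleEndian) {
      if (argSizeInBytes < 8 && !isLittleEndian)
        return 8 - argSizeInBytes;
      return 0;
    }

    int main() {
      assert(byvalFrameBytes(12) == 16);                   // 12-byte struct spans two slots
      assert(beAlign(2, /*isLittleEndian=*/false) == 6);   // i16 is offset 6 bytes into its slot
      std::printf("ok\n");
    }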
- saveVarArgRegisters(CCInfo, DAG, DL, Chain); - } - - ARM64FunctionInfo *AFI = MF.getInfo(); - // This will point to the next argument passed via stack. - unsigned StackOffset = CCInfo.getNextStackOffset(); - // We currently pass all varargs at 8-byte alignment. - StackOffset = ((StackOffset + 7) & ~7); - AFI->setVarArgsStackIndex(MFI->CreateFixedObject(4, StackOffset, true)); - } - - ARM64FunctionInfo *FuncInfo = MF.getInfo(); - unsigned StackArgSize = CCInfo.getNextStackOffset(); - bool TailCallOpt = MF.getTarget().Options.GuaranteedTailCallOpt; - if (DoesCalleeRestoreStack(CallConv, TailCallOpt)) { - // This is a non-standard ABI so by fiat I say we're allowed to make full - // use of the stack area to be popped, which must be aligned to 16 bytes in - // any case: - StackArgSize = RoundUpToAlignment(StackArgSize, 16); - - // If we're expected to restore the stack (e.g. fastcc) then we'll be adding - // a multiple of 16. - FuncInfo->setArgumentStackToRestore(StackArgSize); - - // This realignment carries over to the available bytes below. Our own - // callers will guarantee the space is free by giving an aligned value to - // CALLSEQ_START. - } - // Even if we're not expected to free up the space, it's useful to know how - // much is there while considering tail calls (because we can reuse it). - FuncInfo->setBytesInStackArgArea(StackArgSize); - - return Chain; -} - -void ARM64TargetLowering::saveVarArgRegisters(CCState &CCInfo, - SelectionDAG &DAG, SDLoc DL, - SDValue &Chain) const { - MachineFunction &MF = DAG.getMachineFunction(); - MachineFrameInfo *MFI = MF.getFrameInfo(); - ARM64FunctionInfo *FuncInfo = MF.getInfo(); - - SmallVector MemOps; - - static const MCPhysReg GPRArgRegs[] = { ARM64::X0, ARM64::X1, ARM64::X2, - ARM64::X3, ARM64::X4, ARM64::X5, - ARM64::X6, ARM64::X7 }; - static const unsigned NumGPRArgRegs = array_lengthof(GPRArgRegs); - unsigned FirstVariadicGPR = - CCInfo.getFirstUnallocated(GPRArgRegs, NumGPRArgRegs); - - unsigned GPRSaveSize = 8 * (NumGPRArgRegs - FirstVariadicGPR); - int GPRIdx = 0; - if (GPRSaveSize != 0) { - GPRIdx = MFI->CreateStackObject(GPRSaveSize, 8, false); - - SDValue FIN = DAG.getFrameIndex(GPRIdx, getPointerTy()); - - for (unsigned i = FirstVariadicGPR; i < NumGPRArgRegs; ++i) { - unsigned VReg = MF.addLiveIn(GPRArgRegs[i], &ARM64::GPR64RegClass); - SDValue Val = DAG.getCopyFromReg(Chain, DL, VReg, MVT::i64); - SDValue Store = - DAG.getStore(Val.getValue(1), DL, Val, FIN, - MachinePointerInfo::getStack(i * 8), false, false, 0); - MemOps.push_back(Store); - FIN = DAG.getNode(ISD::ADD, DL, getPointerTy(), FIN, - DAG.getConstant(8, getPointerTy())); - } - } - FuncInfo->setVarArgsGPRIndex(GPRIdx); - FuncInfo->setVarArgsGPRSize(GPRSaveSize); - - if (Subtarget->hasFPARMv8()) { - static const MCPhysReg FPRArgRegs[] = { ARM64::Q0, ARM64::Q1, ARM64::Q2, - ARM64::Q3, ARM64::Q4, ARM64::Q5, - ARM64::Q6, ARM64::Q7 }; - static const unsigned NumFPRArgRegs = array_lengthof(FPRArgRegs); - unsigned FirstVariadicFPR = - CCInfo.getFirstUnallocated(FPRArgRegs, NumFPRArgRegs); - - unsigned FPRSaveSize = 16 * (NumFPRArgRegs - FirstVariadicFPR); - int FPRIdx = 0; - if (FPRSaveSize != 0) { - FPRIdx = MFI->CreateStackObject(FPRSaveSize, 16, false); - - SDValue FIN = DAG.getFrameIndex(FPRIdx, getPointerTy()); - - for (unsigned i = FirstVariadicFPR; i < NumFPRArgRegs; ++i) { - unsigned VReg = MF.addLiveIn(FPRArgRegs[i], &ARM64::FPR128RegClass); - SDValue Val = DAG.getCopyFromReg(Chain, DL, VReg, MVT::f128); - - SDValue Store = - DAG.getStore(Val.getValue(1), DL, Val, 
FIN, - MachinePointerInfo::getStack(i * 16), false, false, 0); - MemOps.push_back(Store); - FIN = DAG.getNode(ISD::ADD, DL, getPointerTy(), FIN, - DAG.getConstant(16, getPointerTy())); - } - } - FuncInfo->setVarArgsFPRIndex(FPRIdx); - FuncInfo->setVarArgsFPRSize(FPRSaveSize); - } - - if (!MemOps.empty()) { - Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOps); - } -} - -/// LowerCallResult - Lower the result values of a call into the -/// appropriate copies out of appropriate physical registers. -SDValue ARM64TargetLowering::LowerCallResult( - SDValue Chain, SDValue InFlag, CallingConv::ID CallConv, bool isVarArg, - const SmallVectorImpl &Ins, SDLoc DL, SelectionDAG &DAG, - SmallVectorImpl &InVals, bool isThisReturn, - SDValue ThisVal) const { - CCAssignFn *RetCC = CallConv == CallingConv::WebKit_JS ? RetCC_ARM64_WebKit_JS - : RetCC_ARM64_AAPCS; - // Assign locations to each value returned by this call. - SmallVector RVLocs; - CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), - getTargetMachine(), RVLocs, *DAG.getContext()); - CCInfo.AnalyzeCallResult(Ins, RetCC); - - // Copy all of the result registers out of their specified physreg. - for (unsigned i = 0; i != RVLocs.size(); ++i) { - CCValAssign VA = RVLocs[i]; - - // Pass 'this' value directly from the argument to return value, to avoid - // reg unit interference - if (i == 0 && isThisReturn) { - assert(!VA.needsCustom() && VA.getLocVT() == MVT::i64 && - "unexpected return calling convention register assignment"); - InVals.push_back(ThisVal); - continue; - } - - SDValue Val = - DAG.getCopyFromReg(Chain, DL, VA.getLocReg(), VA.getLocVT(), InFlag); - Chain = Val.getValue(1); - InFlag = Val.getValue(2); - - switch (VA.getLocInfo()) { - default: - llvm_unreachable("Unknown loc info!"); - case CCValAssign::Full: - break; - case CCValAssign::BCvt: - Val = DAG.getNode(ISD::BITCAST, DL, VA.getValVT(), Val); - break; - } - - InVals.push_back(Val); - } - - return Chain; -} - -bool ARM64TargetLowering::isEligibleForTailCallOptimization( - SDValue Callee, CallingConv::ID CalleeCC, bool isVarArg, - bool isCalleeStructRet, bool isCallerStructRet, - const SmallVectorImpl &Outs, - const SmallVectorImpl &OutVals, - const SmallVectorImpl &Ins, SelectionDAG &DAG) const { - // For CallingConv::C this function knows whether the ABI needs - // changing. That's not true for other conventions so they will have to opt in - // manually. - if (!IsTailCallConvention(CalleeCC) && CalleeCC != CallingConv::C) - return false; - - const MachineFunction &MF = DAG.getMachineFunction(); - const Function *CallerF = MF.getFunction(); - CallingConv::ID CallerCC = CallerF->getCallingConv(); - bool CCMatch = CallerCC == CalleeCC; - - // Byval parameters hand the function a pointer directly into the stack area - // we want to reuse during a tail call. Working around this *is* possible (see - // X86) but less efficient and uglier in LowerCall. - for (Function::const_arg_iterator i = CallerF->arg_begin(), - e = CallerF->arg_end(); - i != e; ++i) - if (i->hasByValAttr()) - return false; - - if (getTargetMachine().Options.GuaranteedTailCallOpt) { - if (IsTailCallConvention(CalleeCC) && CCMatch) - return true; - return false; - } - - // Now we search for cases where we can use a tail call without changing the - // ABI. Sibcall is used in some places (particularly gcc) to refer to this - // concept. - - // I want anyone implementing a new calling convention to think long and hard - // about this assert. 
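For reference on the saveVarArgRegisters logic above: only the argument registers not consumed by fixed arguments are spilled, at 8 bytes per remaining X register and 16 bytes per remaining Q register. A standalone sketch of that sizing (hypothetical names; assumes the X0-X7 and Q0-Q7 argument registers used above):

    #include <cassert>
    #include <cstdio>

    struct VarArgSaveAreas {
      unsigned gprSaveBytes;
      unsigned fprSaveBytes;
    };

    // 8 X registers and 8 Q registers carry arguments; only the ones left
    // unallocated after the fixed arguments need to be saved for va_arg.
    VarArgSaveAreas varargSaveAreas(unsigned firstVariadicGPR, unsigned firstVariadicFPR) {
      const unsigned NumGPRArgRegs = 8; // X0..X7
      const unsigned NumFPRArgRegs = 8; // Q0..Q7
      return {8 * (NumGPRArgRegs - firstVariadicGPR),
              16 * (NumFPRArgRegs - firstVariadicFPR)};
    }

    int main() {
      // e.g. printf(const char *fmt, ...): X0 holds fmt, so X1..X7 and Q0..Q7 are saved.
      VarArgSaveAreas a = varargSaveAreas(1, 0);
      assert(a.gprSaveBytes == 56 && a.fprSaveBytes == 128);
      std::printf("ok\n");
    }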
- assert((!isVarArg || CalleeCC == CallingConv::C) && - "Unexpected variadic calling convention"); - - if (isVarArg && !Outs.empty()) { - // At least two cases here: if caller is fastcc then we can't have any - // memory arguments (we'd be expected to clean up the stack afterwards). If - // caller is C then we could potentially use its argument area. - - // FIXME: for now we take the most conservative of these in both cases: - // disallow all variadic memory operands. - SmallVector ArgLocs; - CCState CCInfo(CalleeCC, isVarArg, DAG.getMachineFunction(), - getTargetMachine(), ArgLocs, *DAG.getContext()); - - CCInfo.AnalyzeCallOperands(Outs, CCAssignFnForCall(CalleeCC, true)); - for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) - if (!ArgLocs[i].isRegLoc()) - return false; - } - - // If the calling conventions do not match, then we'd better make sure the - // results are returned in the same way as what the caller expects. - if (!CCMatch) { - SmallVector RVLocs1; - CCState CCInfo1(CalleeCC, false, DAG.getMachineFunction(), - getTargetMachine(), RVLocs1, *DAG.getContext()); - CCInfo1.AnalyzeCallResult(Ins, CCAssignFnForCall(CalleeCC, isVarArg)); - - SmallVector RVLocs2; - CCState CCInfo2(CallerCC, false, DAG.getMachineFunction(), - getTargetMachine(), RVLocs2, *DAG.getContext()); - CCInfo2.AnalyzeCallResult(Ins, CCAssignFnForCall(CallerCC, isVarArg)); - - if (RVLocs1.size() != RVLocs2.size()) - return false; - for (unsigned i = 0, e = RVLocs1.size(); i != e; ++i) { - if (RVLocs1[i].isRegLoc() != RVLocs2[i].isRegLoc()) - return false; - if (RVLocs1[i].getLocInfo() != RVLocs2[i].getLocInfo()) - return false; - if (RVLocs1[i].isRegLoc()) { - if (RVLocs1[i].getLocReg() != RVLocs2[i].getLocReg()) - return false; - } else { - if (RVLocs1[i].getLocMemOffset() != RVLocs2[i].getLocMemOffset()) - return false; - } - } - } - - // Nothing more to check if the callee is taking no arguments - if (Outs.empty()) - return true; - - SmallVector ArgLocs; - CCState CCInfo(CalleeCC, isVarArg, DAG.getMachineFunction(), - getTargetMachine(), ArgLocs, *DAG.getContext()); - - CCInfo.AnalyzeCallOperands(Outs, CCAssignFnForCall(CalleeCC, isVarArg)); - - const ARM64FunctionInfo *FuncInfo = MF.getInfo(); - - // If the stack arguments for this call would fit into our own save area then - // the call can be made tail. - return CCInfo.getNextStackOffset() <= FuncInfo->getBytesInStackArgArea(); -} - -SDValue ARM64TargetLowering::addTokenForArgument(SDValue Chain, - SelectionDAG &DAG, - MachineFrameInfo *MFI, - int ClobberedFI) const { - SmallVector ArgChains; - int64_t FirstByte = MFI->getObjectOffset(ClobberedFI); - int64_t LastByte = FirstByte + MFI->getObjectSize(ClobberedFI) - 1; - - // Include the original chain at the beginning of the list. When this is - // used by target LowerCall hooks, this helps legalize find the - // CALLSEQ_BEGIN node. 
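The eligibility test above boils down to two regimes; ignoring the byval, variadic and result-location checks it also performs, a condensed sketch looks like this (hypothetical names):

    #include <cassert>
    #include <cstdio>

    // Under GuaranteedTailCallOpt a matching tail-call convention is enough;
    // otherwise ("sibcall" style) the callee's stack-passed arguments must fit in
    // the caller's own incoming stack argument area so nothing new is pushed.
    bool eligibleForTailCall(bool guaranteedTailCallOpt, bool isTailCallConvention,
                             bool conventionsMatch, unsigned calleeStackBytes,
                             unsigned bytesInStackArgArea) {
      if (guaranteedTailCallOpt)
        return isTailCallConvention && conventionsMatch;
      return calleeStackBytes <= bytesInStackArgArea;
    }

    int main() {
      assert(eligibleForTailCall(false, false, true, 0, 0));    // register-only call
      assert(!eligibleForTailCall(false, false, true, 16, 8));  // needs new stack space
      std::printf("ok\n");
    }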
- ArgChains.push_back(Chain); - - // Add a chain value for each stack argument corresponding - for (SDNode::use_iterator U = DAG.getEntryNode().getNode()->use_begin(), - UE = DAG.getEntryNode().getNode()->use_end(); - U != UE; ++U) - if (LoadSDNode *L = dyn_cast(*U)) - if (FrameIndexSDNode *FI = dyn_cast(L->getBasePtr())) - if (FI->getIndex() < 0) { - int64_t InFirstByte = MFI->getObjectOffset(FI->getIndex()); - int64_t InLastByte = InFirstByte; - InLastByte += MFI->getObjectSize(FI->getIndex()) - 1; - - if ((InFirstByte <= FirstByte && FirstByte <= InLastByte) || - (FirstByte <= InFirstByte && InFirstByte <= LastByte)) - ArgChains.push_back(SDValue(L, 1)); - } - - // Build a tokenfactor for all the chains. - return DAG.getNode(ISD::TokenFactor, SDLoc(Chain), MVT::Other, ArgChains); -} - -bool ARM64TargetLowering::DoesCalleeRestoreStack(CallingConv::ID CallCC, - bool TailCallOpt) const { - return CallCC == CallingConv::Fast && TailCallOpt; -} - -bool ARM64TargetLowering::IsTailCallConvention(CallingConv::ID CallCC) const { - return CallCC == CallingConv::Fast; -} - -/// LowerCall - Lower a call to a callseq_start + CALL + callseq_end chain, -/// and add input and output parameter nodes. -SDValue ARM64TargetLowering::LowerCall(CallLoweringInfo &CLI, - SmallVectorImpl &InVals) const { - SelectionDAG &DAG = CLI.DAG; - SDLoc &DL = CLI.DL; - SmallVector &Outs = CLI.Outs; - SmallVector &OutVals = CLI.OutVals; - SmallVector &Ins = CLI.Ins; - SDValue Chain = CLI.Chain; - SDValue Callee = CLI.Callee; - bool &IsTailCall = CLI.IsTailCall; - CallingConv::ID CallConv = CLI.CallConv; - bool IsVarArg = CLI.IsVarArg; - - MachineFunction &MF = DAG.getMachineFunction(); - bool IsStructRet = (Outs.empty()) ? false : Outs[0].Flags.isSRet(); - bool IsThisReturn = false; - - ARM64FunctionInfo *FuncInfo = MF.getInfo(); - bool TailCallOpt = MF.getTarget().Options.GuaranteedTailCallOpt; - bool IsSibCall = false; - - if (IsTailCall) { - // Check if it's really possible to do a tail call. - IsTailCall = isEligibleForTailCallOptimization( - Callee, CallConv, IsVarArg, IsStructRet, - MF.getFunction()->hasStructRetAttr(), Outs, OutVals, Ins, DAG); - if (!IsTailCall && CLI.CS && CLI.CS->isMustTailCall()) - report_fatal_error("failed to perform tail call elimination on a call " - "site marked musttail"); - - // A sibling call is one where we're under the usual C ABI and not planning - // to change that but can still do a tail call: - if (!TailCallOpt && IsTailCall) - IsSibCall = true; - - if (IsTailCall) - ++NumTailCalls; - } - - // Analyze operands of the call, assigning locations to each operand. - SmallVector ArgLocs; - CCState CCInfo(CallConv, IsVarArg, DAG.getMachineFunction(), - getTargetMachine(), ArgLocs, *DAG.getContext()); - - if (IsVarArg) { - // Handle fixed and variable vector arguments differently. - // Variable vector arguments always go into memory. - unsigned NumArgs = Outs.size(); - - for (unsigned i = 0; i != NumArgs; ++i) { - MVT ArgVT = Outs[i].VT; - ISD::ArgFlagsTy ArgFlags = Outs[i].Flags; - CCAssignFn *AssignFn = CCAssignFnForCall(CallConv, - /*IsVarArg=*/ !Outs[i].IsFixed); - bool Res = AssignFn(i, ArgVT, ArgVT, CCValAssign::Full, ArgFlags, CCInfo); - assert(!Res && "Call operand has unhandled type"); - (void)Res; - } - } else { - // At this point, Outs[].VT may already be promoted to i32. To correctly - // handle passing i8 as i8 instead of i32 on stack, we pass in both i32 and - // i8 to CC_ARM64_AAPCS with i32 being ValVT and i8 being LocVT. 
- // Since AnalyzeCallOperands uses Ins[].VT for both ValVT and LocVT, here - // we use a special version of AnalyzeCallOperands to pass in ValVT and - // LocVT. - unsigned NumArgs = Outs.size(); - for (unsigned i = 0; i != NumArgs; ++i) { - MVT ValVT = Outs[i].VT; - // Get type of the original argument. - EVT ActualVT = getValueType(CLI.getArgs()[Outs[i].OrigArgIndex].Ty, - /*AllowUnknown*/ true); - MVT ActualMVT = ActualVT.isSimple() ? ActualVT.getSimpleVT() : ValVT; - ISD::ArgFlagsTy ArgFlags = Outs[i].Flags; - // If ActualMVT is i1/i8/i16, we should set LocVT to i8/i8/i16. - MVT LocVT = ValVT; - if (ActualMVT == MVT::i1 || ActualMVT == MVT::i8) - LocVT = MVT::i8; - else if (ActualMVT == MVT::i16) - LocVT = MVT::i16; - - CCAssignFn *AssignFn = CCAssignFnForCall(CallConv, /*IsVarArg=*/false); - bool Res = AssignFn(i, ValVT, LocVT, CCValAssign::Full, ArgFlags, CCInfo); - assert(!Res && "Call operand has unhandled type"); - (void)Res; - } - } - - // Get a count of how many bytes are to be pushed on the stack. - unsigned NumBytes = CCInfo.getNextStackOffset(); - - if (IsSibCall) { - // Since we're not changing the ABI to make this a tail call, the memory - // operands are already available in the caller's incoming argument space. - NumBytes = 0; - } - - // FPDiff is the byte offset of the call's argument area from the callee's. - // Stores to callee stack arguments will be placed in FixedStackSlots offset - // by this amount for a tail call. In a sibling call it must be 0 because the - // caller will deallocate the entire stack and the callee still expects its - // arguments to begin at SP+0. Completely unused for non-tail calls. - int FPDiff = 0; - - if (IsTailCall && !IsSibCall) { - unsigned NumReusableBytes = FuncInfo->getBytesInStackArgArea(); - - // Since callee will pop argument stack as a tail call, we must keep the - // popped size 16-byte aligned. - NumBytes = RoundUpToAlignment(NumBytes, 16); - - // FPDiff will be negative if this tail call requires more space than we - // would automatically have in our incoming argument space. Positive if we - // can actually shrink the stack. - FPDiff = NumReusableBytes - NumBytes; - - // The stack pointer must be 16-byte aligned at all times it's used for a - // memory operation, which in practice means at *all* times and in - // particular across call boundaries. Therefore our own arguments started at - // a 16-byte aligned SP and the delta applied for the tail call should - // satisfy the same constraint. - assert(FPDiff % 16 == 0 && "unaligned stack on tail call"); - } - - // Adjust the stack pointer for the new arguments... - // These operations are automatically eliminated by the prolog/epilog pass - if (!IsSibCall) - Chain = - DAG.getCALLSEQ_START(Chain, DAG.getIntPtrConstant(NumBytes, true), DL); - - SDValue StackPtr = DAG.getCopyFromReg(Chain, DL, ARM64::SP, getPointerTy()); - - SmallVector, 8> RegsToPass; - SmallVector MemOpChains; - - // Walk the register/memloc assignments, inserting copies/loads. - for (unsigned i = 0, realArgIdx = 0, e = ArgLocs.size(); i != e; - ++i, ++realArgIdx) { - CCValAssign &VA = ArgLocs[i]; - SDValue Arg = OutVals[realArgIdx]; - ISD::ArgFlagsTy Flags = Outs[realArgIdx].Flags; - - // Promote the value if needed. 
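The FPDiff computation above, as a standalone sketch: the popped argument area is kept 16-byte aligned, and FPDiff measures how far the tail call's argument area sits from the one this function received, negative when the tail call needs more space than is being reused (hypothetical names; assumes the 16-byte SP alignment described above):

    #include <cassert>
    #include <cstdio>

    unsigned roundUpTo16(unsigned bytes) { return (bytes + 15) & ~15u; }

    int tailCallFPDiff(unsigned numReusableBytes, unsigned numArgBytes) {
      unsigned numBytes = roundUpTo16(numArgBytes); // callee pops a 16-byte-aligned area
      int fpDiff = static_cast<int>(numReusableBytes) - static_cast<int>(numBytes);
      assert(fpDiff % 16 == 0 && "unaligned stack on tail call");
      return fpDiff;
    }

    int main() {
      assert(tailCallFPDiff(32, 20) == 0);   // 20 rounds up to 32, exactly reusable
      assert(tailCallFPDiff(16, 33) == -32); // needs 48 bytes, only 16 available
      std::printf("ok\n");
    }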
- switch (VA.getLocInfo()) { - default: - llvm_unreachable("Unknown loc info!"); - case CCValAssign::Full: - break; - case CCValAssign::SExt: - Arg = DAG.getNode(ISD::SIGN_EXTEND, DL, VA.getLocVT(), Arg); - break; - case CCValAssign::ZExt: - Arg = DAG.getNode(ISD::ZERO_EXTEND, DL, VA.getLocVT(), Arg); - break; - case CCValAssign::AExt: - Arg = DAG.getNode(ISD::ANY_EXTEND, DL, VA.getLocVT(), Arg); - break; - case CCValAssign::BCvt: - Arg = DAG.getNode(ISD::BITCAST, DL, VA.getLocVT(), Arg); - break; - case CCValAssign::FPExt: - Arg = DAG.getNode(ISD::FP_EXTEND, DL, VA.getLocVT(), Arg); - break; - } - - if (VA.isRegLoc()) { - if (realArgIdx == 0 && Flags.isReturned() && Outs[0].VT == MVT::i64) { - assert(VA.getLocVT() == MVT::i64 && - "unexpected calling convention register assignment"); - assert(!Ins.empty() && Ins[0].VT == MVT::i64 && - "unexpected use of 'returned'"); - IsThisReturn = true; - } - RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg)); - } else { - assert(VA.isMemLoc()); - - SDValue DstAddr; - MachinePointerInfo DstInfo; - - // FIXME: This works on big-endian for composite byvals, which are the - // common case. It should also work for fundamental types too. - uint32_t BEAlign = 0; - unsigned OpSize = Flags.isByVal() ? Flags.getByValSize() * 8 - : VA.getLocVT().getSizeInBits(); - OpSize = (OpSize + 7) / 8; - if (!Subtarget->isLittleEndian() && !Flags.isByVal()) { - if (OpSize < 8) - BEAlign = 8 - OpSize; - } - unsigned LocMemOffset = VA.getLocMemOffset(); - int32_t Offset = LocMemOffset + BEAlign; - SDValue PtrOff = DAG.getIntPtrConstant(Offset); - PtrOff = DAG.getNode(ISD::ADD, DL, getPointerTy(), StackPtr, PtrOff); - - if (IsTailCall) { - Offset = Offset + FPDiff; - int FI = MF.getFrameInfo()->CreateFixedObject(OpSize, Offset, true); - - DstAddr = DAG.getFrameIndex(FI, getPointerTy()); - DstInfo = MachinePointerInfo::getFixedStack(FI); - - // Make sure any stack arguments overlapping with where we're storing - // are loaded before this eventual operation. Otherwise they'll be - // clobbered. - Chain = addTokenForArgument(Chain, DAG, MF.getFrameInfo(), FI); - } else { - SDValue PtrOff = DAG.getIntPtrConstant(Offset); - - DstAddr = DAG.getNode(ISD::ADD, DL, getPointerTy(), StackPtr, PtrOff); - DstInfo = MachinePointerInfo::getStack(LocMemOffset); - } - - if (Outs[i].Flags.isByVal()) { - SDValue SizeNode = - DAG.getConstant(Outs[i].Flags.getByValSize(), MVT::i64); - SDValue Cpy = DAG.getMemcpy( - Chain, DL, DstAddr, Arg, SizeNode, Outs[i].Flags.getByValAlign(), - /*isVolatile = */ false, - /*alwaysInline = */ false, DstInfo, MachinePointerInfo()); - - MemOpChains.push_back(Cpy); - } else { - // Since we pass i1/i8/i16 as i1/i8/i16 on stack and Arg is already - // promoted to a legal register type i32, we should truncate Arg back to - // i1/i8/i16. - if (Arg.getValueType().isSimple() && - Arg.getValueType().getSimpleVT() == MVT::i32 && - (VA.getLocVT() == MVT::i1 || VA.getLocVT() == MVT::i8 || - VA.getLocVT() == MVT::i16)) - Arg = DAG.getNode(ISD::TRUNCATE, DL, VA.getLocVT(), Arg); - - SDValue Store = - DAG.getStore(Chain, DL, Arg, DstAddr, DstInfo, false, false, 0); - MemOpChains.push_back(Store); - } - } - } - - if (!MemOpChains.empty()) - Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOpChains); - - // Build a sequence of copy-to-reg nodes chained together with token chain - // and flag operands which copy the outgoing args into the appropriate regs. 
- SDValue InFlag; - for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) { - Chain = DAG.getCopyToReg(Chain, DL, RegsToPass[i].first, - RegsToPass[i].second, InFlag); - InFlag = Chain.getValue(1); - } - - // If the callee is a GlobalAddress/ExternalSymbol node (quite common, every - // direct call is) turn it into a TargetGlobalAddress/TargetExternalSymbol - // node so that legalize doesn't hack it. - if (getTargetMachine().getCodeModel() == CodeModel::Large && - Subtarget->isTargetMachO()) { - if (GlobalAddressSDNode *G = dyn_cast(Callee)) { - const GlobalValue *GV = G->getGlobal(); - bool InternalLinkage = GV->hasInternalLinkage(); - if (InternalLinkage) - Callee = DAG.getTargetGlobalAddress(GV, DL, getPointerTy(), 0, 0); - else { - Callee = DAG.getTargetGlobalAddress(GV, DL, getPointerTy(), 0, - ARM64II::MO_GOT); - Callee = DAG.getNode(ARM64ISD::LOADgot, DL, getPointerTy(), Callee); - } - } else if (ExternalSymbolSDNode *S = - dyn_cast(Callee)) { - const char *Sym = S->getSymbol(); - Callee = - DAG.getTargetExternalSymbol(Sym, getPointerTy(), ARM64II::MO_GOT); - Callee = DAG.getNode(ARM64ISD::LOADgot, DL, getPointerTy(), Callee); - } - } else if (GlobalAddressSDNode *G = dyn_cast(Callee)) { - const GlobalValue *GV = G->getGlobal(); - Callee = DAG.getTargetGlobalAddress(GV, DL, getPointerTy(), 0, 0); - } else if (ExternalSymbolSDNode *S = dyn_cast(Callee)) { - const char *Sym = S->getSymbol(); - Callee = DAG.getTargetExternalSymbol(Sym, getPointerTy(), 0); - } - - // We don't usually want to end the call-sequence here because we would tidy - // the frame up *after* the call, however in the ABI-changing tail-call case - // we've carefully laid out the parameters so that when sp is reset they'll be - // in the correct location. - if (IsTailCall && !IsSibCall) { - Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(NumBytes, true), - DAG.getIntPtrConstant(0, true), InFlag, DL); - InFlag = Chain.getValue(1); - } - - std::vector Ops; - Ops.push_back(Chain); - Ops.push_back(Callee); - - if (IsTailCall) { - // Each tail call may have to adjust the stack by a different amount, so - // this information must travel along with the operation for eventual - // consumption by emitEpilogue. - Ops.push_back(DAG.getTargetConstant(FPDiff, MVT::i32)); - } - - // Add argument registers to the end of the list so that they are known live - // into the call. - for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) - Ops.push_back(DAG.getRegister(RegsToPass[i].first, - RegsToPass[i].second.getValueType())); - - // Add a register mask operand representing the call-preserved registers. - const uint32_t *Mask; - const TargetRegisterInfo *TRI = getTargetMachine().getRegisterInfo(); - const ARM64RegisterInfo *ARI = static_cast(TRI); - if (IsThisReturn) { - // For 'this' returns, use the X0-preserving mask if applicable - Mask = ARI->getThisReturnPreservedMask(CallConv); - if (!Mask) { - IsThisReturn = false; - Mask = ARI->getCallPreservedMask(CallConv); - } - } else - Mask = ARI->getCallPreservedMask(CallConv); - - assert(Mask && "Missing call preserved mask for calling convention"); - Ops.push_back(DAG.getRegisterMask(Mask)); - - if (InFlag.getNode()) - Ops.push_back(InFlag); - - SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue); - - // If we're doing a tall call, use a TC_RETURN here rather than an - // actual call instruction. - if (IsTailCall) - return DAG.getNode(ARM64ISD::TC_RETURN, DL, NodeTys, Ops); - - // Returns a chain and a flag for retval copy to use. 
- Chain = DAG.getNode(ARM64ISD::CALL, DL, NodeTys, Ops); - InFlag = Chain.getValue(1); - - uint64_t CalleePopBytes = DoesCalleeRestoreStack(CallConv, TailCallOpt) - ? RoundUpToAlignment(NumBytes, 16) - : 0; - - Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(NumBytes, true), - DAG.getIntPtrConstant(CalleePopBytes, true), - InFlag, DL); - if (!Ins.empty()) - InFlag = Chain.getValue(1); - - // Handle result values, copying them out of physregs into vregs that we - // return. - return LowerCallResult(Chain, InFlag, CallConv, IsVarArg, Ins, DL, DAG, - InVals, IsThisReturn, - IsThisReturn ? OutVals[0] : SDValue()); -} - -bool ARM64TargetLowering::CanLowerReturn( - CallingConv::ID CallConv, MachineFunction &MF, bool isVarArg, - const SmallVectorImpl &Outs, LLVMContext &Context) const { - CCAssignFn *RetCC = CallConv == CallingConv::WebKit_JS ? RetCC_ARM64_WebKit_JS - : RetCC_ARM64_AAPCS; - SmallVector RVLocs; - CCState CCInfo(CallConv, isVarArg, MF, getTargetMachine(), RVLocs, Context); - return CCInfo.CheckReturn(Outs, RetCC); -} - -SDValue -ARM64TargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv, - bool isVarArg, - const SmallVectorImpl &Outs, - const SmallVectorImpl &OutVals, - SDLoc DL, SelectionDAG &DAG) const { - CCAssignFn *RetCC = CallConv == CallingConv::WebKit_JS ? RetCC_ARM64_WebKit_JS - : RetCC_ARM64_AAPCS; - SmallVector RVLocs; - CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), - getTargetMachine(), RVLocs, *DAG.getContext()); - CCInfo.AnalyzeReturn(Outs, RetCC); - - // Copy the result values into the output registers. - SDValue Flag; - SmallVector RetOps(1, Chain); - for (unsigned i = 0, realRVLocIdx = 0; i != RVLocs.size(); - ++i, ++realRVLocIdx) { - CCValAssign &VA = RVLocs[i]; - assert(VA.isRegLoc() && "Can only return in registers!"); - SDValue Arg = OutVals[realRVLocIdx]; - - switch (VA.getLocInfo()) { - default: - llvm_unreachable("Unknown loc info!"); - case CCValAssign::Full: - break; - case CCValAssign::BCvt: - Arg = DAG.getNode(ISD::BITCAST, DL, VA.getLocVT(), Arg); - break; - } - - Chain = DAG.getCopyToReg(Chain, DL, VA.getLocReg(), Arg, Flag); - Flag = Chain.getValue(1); - RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT())); - } - - RetOps[0] = Chain; // Update chain. - - // Add the flag if we have it. - if (Flag.getNode()) - RetOps.push_back(Flag); - - return DAG.getNode(ARM64ISD::RET_FLAG, DL, MVT::Other, RetOps); -} - -//===----------------------------------------------------------------------===// -// Other Lowering Code -//===----------------------------------------------------------------------===// - -SDValue ARM64TargetLowering::LowerGlobalAddress(SDValue Op, - SelectionDAG &DAG) const { - EVT PtrVT = getPointerTy(); - SDLoc DL(Op); - const GlobalValue *GV = cast(Op)->getGlobal(); - unsigned char OpFlags = - Subtarget->ClassifyGlobalReference(GV, getTargetMachine()); - - assert(cast(Op)->getOffset() == 0 && - "unexpected offset in global node"); - - // This also catched the large code model case for Darwin. - if ((OpFlags & ARM64II::MO_GOT) != 0) { - SDValue GotAddr = DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, OpFlags); - // FIXME: Once remat is capable of dealing with instructions with register - // operands, expand this into two nodes instead of using a wrapper node. 
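The CalleePopBytes value above only becomes non-zero for the callee-restores-stack case (fastcc under GuaranteedTailCallOpt), and is then rounded to the 16-byte stack alignment. A standalone sketch of that computation (hypothetical names):

    #include <cassert>
    #include <cstdio>

    // Round up to a 16-byte boundary (RoundUpToAlignment in the code above).
    unsigned roundUpTo16(unsigned bytes) { return (bytes + 15) & ~15u; }

    // Bytes the callee pops on return: only fastcc with GuaranteedTailCallOpt uses
    // callee-pop, and the popped area stays 16-byte aligned.
    unsigned calleePopBytes(bool isFastCC, bool guaranteedTailCallOpt, unsigned numBytes) {
      bool calleeRestoresStack = isFastCC && guaranteedTailCallOpt;
      return calleeRestoresStack ? roundUpTo16(numBytes) : 0;
    }

    int main() {
      assert(calleePopBytes(true, true, 20) == 32);
      assert(calleePopBytes(false, true, 20) == 0);
      std::printf("ok\n");
    }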
- return DAG.getNode(ARM64ISD::LOADgot, DL, PtrVT, GotAddr); - } - - if (getTargetMachine().getCodeModel() == CodeModel::Large) { - const unsigned char MO_NC = ARM64II::MO_NC; - return DAG.getNode( - ARM64ISD::WrapperLarge, DL, PtrVT, - DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, ARM64II::MO_G3), - DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, ARM64II::MO_G2 | MO_NC), - DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, ARM64II::MO_G1 | MO_NC), - DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, ARM64II::MO_G0 | MO_NC)); - } else { - // Use ADRP/ADD or ADRP/LDR for everything else: the small model on ELF and - // the only correct model on Darwin. - SDValue Hi = DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, - OpFlags | ARM64II::MO_PAGE); - unsigned char LoFlags = OpFlags | ARM64II::MO_PAGEOFF | ARM64II::MO_NC; - SDValue Lo = DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, LoFlags); - - SDValue ADRP = DAG.getNode(ARM64ISD::ADRP, DL, PtrVT, Hi); - return DAG.getNode(ARM64ISD::ADDlow, DL, PtrVT, ADRP, Lo); - } -} - -/// \brief Convert a TLS address reference into the correct sequence of loads -/// and calls to compute the variable's address (for Darwin, currently) and -/// return an SDValue containing the final node. - -/// Darwin only has one TLS scheme which must be capable of dealing with the -/// fully general situation, in the worst case. This means: -/// + "extern __thread" declaration. -/// + Defined in a possibly unknown dynamic library. -/// -/// The general system is that each __thread variable has a [3 x i64] descriptor -/// which contains information used by the runtime to calculate the address. The -/// only part of this the compiler needs to know about is the first xword, which -/// contains a function pointer that must be called with the address of the -/// entire descriptor in "x0". -/// -/// Since this descriptor may be in a different unit, in general even the -/// descriptor must be accessed via an indirect load. The "ideal" code sequence -/// is: -/// adrp x0, _var@TLVPPAGE -/// ldr x0, [x0, _var@TLVPPAGEOFF] ; x0 now contains address of descriptor -/// ldr x1, [x0] ; x1 contains 1st entry of descriptor, -/// ; the function pointer -/// blr x1 ; Uses descriptor address in x0 -/// ; Address of _var is now in x0. -/// -/// If the address of _var's descriptor *is* known to the linker, then it can -/// change the first "ldr" instruction to an appropriate "add x0, x0, #imm" for -/// a slight efficiency gain. -SDValue -ARM64TargetLowering::LowerDarwinGlobalTLSAddress(SDValue Op, - SelectionDAG &DAG) const { - assert(Subtarget->isTargetDarwin() && "TLS only supported on Darwin"); - - SDLoc DL(Op); - MVT PtrVT = getPointerTy(); - const GlobalValue *GV = cast(Op)->getGlobal(); - - SDValue TLVPAddr = - DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, ARM64II::MO_TLS); - SDValue DescAddr = DAG.getNode(ARM64ISD::LOADgot, DL, PtrVT, TLVPAddr); - - // The first entry in the descriptor is a function pointer that we must call - // to obtain the address of the variable. - SDValue Chain = DAG.getEntryNode(); - SDValue FuncTLVGet = - DAG.getLoad(MVT::i64, DL, Chain, DescAddr, MachinePointerInfo::getGOT(), - false, true, true, 8); - Chain = FuncTLVGet.getValue(1); - - MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo(); - MFI->setAdjustsStack(true); - - // TLS calls preserve all registers except those that absolutely must be - // trashed: X0 (it takes an argument), LR (it's a call) and NZCV (let's not be - // silly). 
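The ADRP/ADDlow pattern used above splits a symbol's address into a 4 KiB page (MO_PAGE) and a 12-bit offset within it (MO_PAGEOFF). A standalone sketch of that split (hypothetical names; the example address is arbitrary):

    #include <cassert>
    #include <cstdint>
    #include <cstdio>

    // ADRP materializes the page of the symbol, the following ADD supplies the
    // low 12 bits.
    struct PageAddress {
      uint64_t page; // symbol address with the low 12 bits cleared
      uint64_t lo12; // offset within the 4 KiB page
    };

    PageAddress splitPageAddress(uint64_t symbolAddr) {
      return {symbolAddr & ~UINT64_C(0xfff), symbolAddr & UINT64_C(0xfff)};
    }

    int main() {
      PageAddress pa = splitPageAddress(0x100003f84);
      assert(pa.page == 0x100003000 && pa.lo12 == 0xf84);
      assert(pa.page + pa.lo12 == 0x100003f84);
      std::printf("ok\n");
    }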
- const TargetRegisterInfo *TRI = getTargetMachine().getRegisterInfo(); - const ARM64RegisterInfo *ARI = static_cast(TRI); - const uint32_t *Mask = ARI->getTLSCallPreservedMask(); - - // Finally, we can make the call. This is just a degenerate version of a - // normal ARM64 call node: x0 takes the address of the descriptor, and returns - // the address of the variable in this thread. - Chain = DAG.getCopyToReg(Chain, DL, ARM64::X0, DescAddr, SDValue()); - Chain = DAG.getNode(ARM64ISD::CALL, DL, DAG.getVTList(MVT::Other, MVT::Glue), - Chain, FuncTLVGet, DAG.getRegister(ARM64::X0, MVT::i64), - DAG.getRegisterMask(Mask), Chain.getValue(1)); - return DAG.getCopyFromReg(Chain, DL, ARM64::X0, PtrVT, Chain.getValue(1)); -} - -/// When accessing thread-local variables under either the general-dynamic or -/// local-dynamic system, we make a "TLS-descriptor" call. The variable will -/// have a descriptor, accessible via a PC-relative ADRP, and whose first entry -/// is a function pointer to carry out the resolution. This function takes the -/// address of the descriptor in X0 and returns the TPIDR_EL0 offset in X0. All -/// other registers (except LR, NZCV) are preserved. -/// -/// Thus, the ideal call sequence on AArch64 is: -/// -/// adrp x0, :tlsdesc:thread_var -/// ldr x8, [x0, :tlsdesc_lo12:thread_var] -/// add x0, x0, :tlsdesc_lo12:thread_var -/// .tlsdesccall thread_var -/// blr x8 -/// (TPIDR_EL0 offset now in x0). -/// -/// The ".tlsdesccall" directive instructs the assembler to insert a particular -/// relocation to help the linker relax this sequence if it turns out to be too -/// conservative. -/// -/// FIXME: we currently produce an extra, duplicated, ADRP instruction, but this -/// is harmless. -SDValue ARM64TargetLowering::LowerELFTLSDescCall(SDValue SymAddr, - SDValue DescAddr, SDLoc DL, - SelectionDAG &DAG) const { - EVT PtrVT = getPointerTy(); - - // The function we need to call is simply the first entry in the GOT for this - // descriptor, load it in preparation. - SDValue Func = DAG.getNode(ARM64ISD::LOADgot, DL, PtrVT, SymAddr); - - // TLS calls preserve all registers except those that absolutely must be - // trashed: X0 (it takes an argument), LR (it's a call) and NZCV (let's not be - // silly). - const TargetRegisterInfo *TRI = getTargetMachine().getRegisterInfo(); - const ARM64RegisterInfo *ARI = static_cast(TRI); - const uint32_t *Mask = ARI->getTLSCallPreservedMask(); - - // The function takes only one argument: the address of the descriptor itself - // in X0. 
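The descriptor contract described above, modelled as plain data: the first of the three xwords is a resolver that is handed the descriptor's own address and returns what the runtime computed for the variable. This is an illustrative layout only, not the runtime's actual definition (all names hypothetical):

    #include <cstdint>
    #include <cstdio>

    // Shape of a TLS descriptor as described in the comments above: the first
    // entry is a resolver called with the descriptor's address (in x0).
    struct TLSDescriptor {
      uint64_t (*resolver)(const TLSDescriptor *self);
      uint64_t payload[2];
    };

    uint64_t resolveTLS(const TLSDescriptor *desc) {
      // Corresponds to the "blr" through the first entry in the sequences above.
      return desc->resolver(desc);
    }

    static uint64_t staticResolver(const TLSDescriptor *self) { return self->payload[0]; }

    int main() {
      TLSDescriptor d{staticResolver, {0x30, 0}};
      std::printf("resolved = %llu\n", (unsigned long long)resolveTLS(&d));
    }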
- SDValue Glue, Chain; - Chain = DAG.getCopyToReg(DAG.getEntryNode(), DL, ARM64::X0, DescAddr, Glue); - Glue = Chain.getValue(1); - - // We're now ready to populate the argument list, as with a normal call: - SmallVector Ops; - Ops.push_back(Chain); - Ops.push_back(Func); - Ops.push_back(SymAddr); - Ops.push_back(DAG.getRegister(ARM64::X0, PtrVT)); - Ops.push_back(DAG.getRegisterMask(Mask)); - Ops.push_back(Glue); - - SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue); - Chain = DAG.getNode(ARM64ISD::TLSDESC_CALL, DL, NodeTys, Ops); - Glue = Chain.getValue(1); - - return DAG.getCopyFromReg(Chain, DL, ARM64::X0, PtrVT, Glue); -} - -SDValue ARM64TargetLowering::LowerELFGlobalTLSAddress(SDValue Op, - SelectionDAG &DAG) const { - assert(Subtarget->isTargetELF() && "This function expects an ELF target"); - assert(getTargetMachine().getCodeModel() == CodeModel::Small && - "ELF TLS only supported in small memory model"); - const GlobalAddressSDNode *GA = cast(Op); - - TLSModel::Model Model = getTargetMachine().getTLSModel(GA->getGlobal()); - - SDValue TPOff; - EVT PtrVT = getPointerTy(); - SDLoc DL(Op); - const GlobalValue *GV = GA->getGlobal(); - - SDValue ThreadBase = DAG.getNode(ARM64ISD::THREAD_POINTER, DL, PtrVT); - - if (Model == TLSModel::LocalExec) { - SDValue HiVar = DAG.getTargetGlobalAddress( - GV, DL, PtrVT, 0, ARM64II::MO_TLS | ARM64II::MO_G1); - SDValue LoVar = DAG.getTargetGlobalAddress( - GV, DL, PtrVT, 0, ARM64II::MO_TLS | ARM64II::MO_G0 | ARM64II::MO_NC); - - TPOff = SDValue(DAG.getMachineNode(ARM64::MOVZXi, DL, PtrVT, HiVar, - DAG.getTargetConstant(16, MVT::i32)), - 0); - TPOff = SDValue(DAG.getMachineNode(ARM64::MOVKXi, DL, PtrVT, TPOff, LoVar, - DAG.getTargetConstant(0, MVT::i32)), - 0); - } else if (Model == TLSModel::InitialExec) { - TPOff = DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, ARM64II::MO_TLS); - TPOff = DAG.getNode(ARM64ISD::LOADgot, DL, PtrVT, TPOff); - } else if (Model == TLSModel::LocalDynamic) { - // Local-dynamic accesses proceed in two phases. A general-dynamic TLS - // descriptor call against the special symbol _TLS_MODULE_BASE_ to calculate - // the beginning of the module's TLS region, followed by a DTPREL offset - // calculation. - - // These accesses will need deduplicating if there's more than one. - ARM64FunctionInfo *MFI = - DAG.getMachineFunction().getInfo(); - MFI->incNumLocalDynamicTLSAccesses(); - - // Accesses used in this sequence go via the TLS descriptor which lives in - // the GOT. Prepare an address we can use to handle this. - SDValue HiDesc = DAG.getTargetExternalSymbol( - "_TLS_MODULE_BASE_", PtrVT, ARM64II::MO_TLS | ARM64II::MO_PAGE); - SDValue LoDesc = DAG.getTargetExternalSymbol( - "_TLS_MODULE_BASE_", PtrVT, - ARM64II::MO_TLS | ARM64II::MO_PAGEOFF | ARM64II::MO_NC); - - // First argument to the descriptor call is the address of the descriptor - // itself. - SDValue DescAddr = DAG.getNode(ARM64ISD::ADRP, DL, PtrVT, HiDesc); - DescAddr = DAG.getNode(ARM64ISD::ADDlow, DL, PtrVT, DescAddr, LoDesc); - - // The call needs a relocation too for linker relaxation. It doesn't make - // sense to call it MO_PAGE or MO_PAGEOFF though so we need another copy of - // the address. - SDValue SymAddr = DAG.getTargetExternalSymbol("_TLS_MODULE_BASE_", PtrVT, - ARM64II::MO_TLS); - - // Now we can calculate the offset from TPIDR_EL0 to this module's - // thread-local area. - TPOff = LowerELFTLSDescCall(SymAddr, DescAddr, DL, DAG); - - // Now use :dtprel_whatever: operations to calculate this variable's offset - // in its thread-storage area. 
- SDValue HiVar = DAG.getTargetGlobalAddress( - GV, DL, MVT::i64, 0, ARM64II::MO_TLS | ARM64II::MO_G1); - SDValue LoVar = DAG.getTargetGlobalAddress( - GV, DL, MVT::i64, 0, ARM64II::MO_TLS | ARM64II::MO_G0 | ARM64II::MO_NC); - - SDValue DTPOff = - SDValue(DAG.getMachineNode(ARM64::MOVZXi, DL, PtrVT, HiVar, - DAG.getTargetConstant(16, MVT::i32)), - 0); - DTPOff = SDValue(DAG.getMachineNode(ARM64::MOVKXi, DL, PtrVT, DTPOff, LoVar, - DAG.getTargetConstant(0, MVT::i32)), - 0); - - TPOff = DAG.getNode(ISD::ADD, DL, PtrVT, TPOff, DTPOff); - } else if (Model == TLSModel::GeneralDynamic) { - // Accesses used in this sequence go via the TLS descriptor which lives in - // the GOT. Prepare an address we can use to handle this. - SDValue HiDesc = DAG.getTargetGlobalAddress( - GV, DL, PtrVT, 0, ARM64II::MO_TLS | ARM64II::MO_PAGE); - SDValue LoDesc = DAG.getTargetGlobalAddress( - GV, DL, PtrVT, 0, - ARM64II::MO_TLS | ARM64II::MO_PAGEOFF | ARM64II::MO_NC); - - // First argument to the descriptor call is the address of the descriptor - // itself. - SDValue DescAddr = DAG.getNode(ARM64ISD::ADRP, DL, PtrVT, HiDesc); - DescAddr = DAG.getNode(ARM64ISD::ADDlow, DL, PtrVT, DescAddr, LoDesc); - - // The call needs a relocation too for linker relaxation. It doesn't make - // sense to call it MO_PAGE or MO_PAGEOFF though so we need another copy of - // the address. - SDValue SymAddr = - DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, ARM64II::MO_TLS); - - // Finally we can make a call to calculate the offset from tpidr_el0. - TPOff = LowerELFTLSDescCall(SymAddr, DescAddr, DL, DAG); - } else - llvm_unreachable("Unsupported ELF TLS access model"); - - return DAG.getNode(ISD::ADD, DL, PtrVT, ThreadBase, TPOff); -} - -SDValue ARM64TargetLowering::LowerGlobalTLSAddress(SDValue Op, - SelectionDAG &DAG) const { - if (Subtarget->isTargetDarwin()) - return LowerDarwinGlobalTLSAddress(Op, DAG); - else if (Subtarget->isTargetELF()) - return LowerELFGlobalTLSAddress(Op, DAG); - - llvm_unreachable("Unexpected platform trying to use TLS"); -} -SDValue ARM64TargetLowering::LowerBR_CC(SDValue Op, SelectionDAG &DAG) const { - SDValue Chain = Op.getOperand(0); - ISD::CondCode CC = cast(Op.getOperand(1))->get(); - SDValue LHS = Op.getOperand(2); - SDValue RHS = Op.getOperand(3); - SDValue Dest = Op.getOperand(4); - SDLoc dl(Op); - - // Handle f128 first, since lowering it will result in comparing the return - // value of a libcall against zero, which is just what the rest of LowerBR_CC - // is expecting to deal with. - if (LHS.getValueType() == MVT::f128) { - softenSetCCOperands(DAG, MVT::f128, LHS, RHS, CC, dl); - - // If softenSetCCOperands returned a scalar, we need to compare the result - // against zero to select between true and false values. - if (!RHS.getNode()) { - RHS = DAG.getConstant(0, LHS.getValueType()); - CC = ISD::SETNE; - } - } - - // Optimize {s|u}{add|sub|mul}.with.overflow feeding into a branch - // instruction. - unsigned Opc = LHS.getOpcode(); - if (LHS.getResNo() == 1 && isa(RHS) && - cast(RHS)->isOne() && - (Opc == ISD::SADDO || Opc == ISD::UADDO || Opc == ISD::SSUBO || - Opc == ISD::USUBO || Opc == ISD::SMULO || Opc == ISD::UMULO)) { - assert((CC == ISD::SETEQ || CC == ISD::SETNE) && - "Unexpected condition code."); - // Only lower legal XALUO ops. - if (!DAG.getTargetLoweringInfo().isTypeLegal(LHS->getValueType(0))) - return SDValue(); - - // The actual operation with overflow check. 
- ARM64CC::CondCode OFCC; - SDValue Value, Overflow; - std::tie(Value, Overflow) = getARM64XALUOOp(OFCC, LHS.getValue(0), DAG); - - if (CC == ISD::SETNE) - OFCC = getInvertedCondCode(OFCC); - SDValue CCVal = DAG.getConstant(OFCC, MVT::i32); - - return DAG.getNode(ARM64ISD::BRCOND, SDLoc(LHS), MVT::Other, Chain, Dest, - CCVal, Overflow); - } - - if (LHS.getValueType().isInteger()) { - assert((LHS.getValueType() == RHS.getValueType()) && - (LHS.getValueType() == MVT::i32 || LHS.getValueType() == MVT::i64)); - - // If the RHS of the comparison is zero, we can potentially fold this - // to a specialized branch. - const ConstantSDNode *RHSC = dyn_cast(RHS); - if (RHSC && RHSC->getZExtValue() == 0) { - if (CC == ISD::SETEQ) { - // See if we can use a TBZ to fold in an AND as well. - // TBZ has a smaller branch displacement than CBZ. If the offset is - // out of bounds, a late MI-layer pass rewrites branches. - // 403.gcc is an example that hits this case. - if (LHS.getOpcode() == ISD::AND && - isa(LHS.getOperand(1)) && - isPowerOf2_64(LHS.getConstantOperandVal(1))) { - SDValue Test = LHS.getOperand(0); - uint64_t Mask = LHS.getConstantOperandVal(1); - - // TBZ only operates on i64's, but the ext should be free. - if (Test.getValueType() == MVT::i32) - Test = DAG.getAnyExtOrTrunc(Test, dl, MVT::i64); - - return DAG.getNode(ARM64ISD::TBZ, dl, MVT::Other, Chain, Test, - DAG.getConstant(Log2_64(Mask), MVT::i64), Dest); - } - - return DAG.getNode(ARM64ISD::CBZ, dl, MVT::Other, Chain, LHS, Dest); - } else if (CC == ISD::SETNE) { - // See if we can use a TBZ to fold in an AND as well. - // TBZ has a smaller branch displacement than CBZ. If the offset is - // out of bounds, a late MI-layer pass rewrites branches. - // 403.gcc is an example that hits this case. - if (LHS.getOpcode() == ISD::AND && - isa(LHS.getOperand(1)) && - isPowerOf2_64(LHS.getConstantOperandVal(1))) { - SDValue Test = LHS.getOperand(0); - uint64_t Mask = LHS.getConstantOperandVal(1); - - // TBNZ only operates on i64's, but the ext should be free. - if (Test.getValueType() == MVT::i32) - Test = DAG.getAnyExtOrTrunc(Test, dl, MVT::i64); - - return DAG.getNode(ARM64ISD::TBNZ, dl, MVT::Other, Chain, Test, - DAG.getConstant(Log2_64(Mask), MVT::i64), Dest); - } - - return DAG.getNode(ARM64ISD::CBNZ, dl, MVT::Other, Chain, LHS, Dest); - } - } - - SDValue CCVal; - SDValue Cmp = getARM64Cmp(LHS, RHS, CC, CCVal, DAG, dl); - return DAG.getNode(ARM64ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal, - Cmp); - } - - assert(LHS.getValueType() == MVT::f32 || LHS.getValueType() == MVT::f64); - - // Unfortunately, the mapping of LLVM FP CC's onto ARM64 CC's isn't totally - // clean. Some of them require two branches to implement. 
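The TBZ/TBNZ fold above fires when the branch condition is "(x & mask) == 0" (or != 0) with a single-bit mask, in which case the branch can test bit log2(mask) directly. A standalone sketch of that eligibility check (hypothetical names; isPowerOf2_64 and log2_64 are reimplemented here rather than taken from LLVM's MathExtras):

    #include <cassert>
    #include <cstdint>
    #include <cstdio>

    bool isPowerOf2_64(uint64_t x) { return x && (x & (x - 1)) == 0; }
    unsigned log2_64(uint64_t x) { unsigned n = 0; while (x >>= 1) ++n; return n; }

    // "(x & mask) == 0" with a single-bit mask branches on bit log2(mask)
    // directly instead of materializing the AND and using CBZ/CBNZ.
    bool canUseTBZ(uint64_t mask, unsigned *bitOut) {
      if (!isPowerOf2_64(mask))
        return false;
      *bitOut = log2_64(mask);
      return true;
    }

    int main() {
      unsigned bit;
      assert(canUseTBZ(0x40, &bit) && bit == 6);
      assert(!canUseTBZ(0x41, &bit));
      std::printf("ok\n");
    }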
- SDValue Cmp = emitComparison(LHS, RHS, CC, dl, DAG); - ARM64CC::CondCode CC1, CC2; - changeFPCCToARM64CC(CC, CC1, CC2); - SDValue CC1Val = DAG.getConstant(CC1, MVT::i32); - SDValue BR1 = - DAG.getNode(ARM64ISD::BRCOND, dl, MVT::Other, Chain, Dest, CC1Val, Cmp); - if (CC2 != ARM64CC::AL) { - SDValue CC2Val = DAG.getConstant(CC2, MVT::i32); - return DAG.getNode(ARM64ISD::BRCOND, dl, MVT::Other, BR1, Dest, CC2Val, - Cmp); - } - - return BR1; -} - -SDValue ARM64TargetLowering::LowerFCOPYSIGN(SDValue Op, - SelectionDAG &DAG) const { - EVT VT = Op.getValueType(); - SDLoc DL(Op); - - SDValue In1 = Op.getOperand(0); - SDValue In2 = Op.getOperand(1); - EVT SrcVT = In2.getValueType(); - if (SrcVT != VT) { - if (SrcVT == MVT::f32 && VT == MVT::f64) - In2 = DAG.getNode(ISD::FP_EXTEND, DL, VT, In2); - else if (SrcVT == MVT::f64 && VT == MVT::f32) - In2 = DAG.getNode(ISD::FP_ROUND, DL, VT, In2, DAG.getIntPtrConstant(0)); - else - // FIXME: Src type is different, bail out for now. Can VT really be a - // vector type? - return SDValue(); - } - - EVT VecVT; - EVT EltVT; - SDValue EltMask, VecVal1, VecVal2; - if (VT == MVT::f32 || VT == MVT::v2f32 || VT == MVT::v4f32) { - EltVT = MVT::i32; - VecVT = MVT::v4i32; - EltMask = DAG.getConstant(0x80000000ULL, EltVT); - - if (!VT.isVector()) { - VecVal1 = DAG.getTargetInsertSubreg(ARM64::ssub, DL, VecVT, - DAG.getUNDEF(VecVT), In1); - VecVal2 = DAG.getTargetInsertSubreg(ARM64::ssub, DL, VecVT, - DAG.getUNDEF(VecVT), In2); - } else { - VecVal1 = DAG.getNode(ISD::BITCAST, DL, VecVT, In1); - VecVal2 = DAG.getNode(ISD::BITCAST, DL, VecVT, In2); - } - } else if (VT == MVT::f64 || VT == MVT::v2f64) { - EltVT = MVT::i64; - VecVT = MVT::v2i64; - - // We want to materialize a mask with the the high bit set, but the AdvSIMD - // immediate moves cannot materialize that in a single instruction for - // 64-bit elements. Instead, materialize zero and then negate it. - EltMask = DAG.getConstant(0, EltVT); - - if (!VT.isVector()) { - VecVal1 = DAG.getTargetInsertSubreg(ARM64::dsub, DL, VecVT, - DAG.getUNDEF(VecVT), In1); - VecVal2 = DAG.getTargetInsertSubreg(ARM64::dsub, DL, VecVT, - DAG.getUNDEF(VecVT), In2); - } else { - VecVal1 = DAG.getNode(ISD::BITCAST, DL, VecVT, In1); - VecVal2 = DAG.getNode(ISD::BITCAST, DL, VecVT, In2); - } - } else { - llvm_unreachable("Invalid type for copysign!"); - } - - std::vector BuildVectorOps; - for (unsigned i = 0; i < VecVT.getVectorNumElements(); ++i) - BuildVectorOps.push_back(EltMask); - - SDValue BuildVec = DAG.getNode(ISD::BUILD_VECTOR, DL, VecVT, BuildVectorOps); - - // If we couldn't materialize the mask above, then the mask vector will be - // the zero vector, and we need to negate it here. 
- if (VT == MVT::f64 || VT == MVT::v2f64) { - BuildVec = DAG.getNode(ISD::BITCAST, DL, MVT::v2f64, BuildVec); - BuildVec = DAG.getNode(ISD::FNEG, DL, MVT::v2f64, BuildVec); - BuildVec = DAG.getNode(ISD::BITCAST, DL, MVT::v2i64, BuildVec); - } - - SDValue Sel = - DAG.getNode(ARM64ISD::BIT, DL, VecVT, VecVal1, VecVal2, BuildVec); - - if (VT == MVT::f32) - return DAG.getTargetExtractSubreg(ARM64::ssub, DL, VT, Sel); - else if (VT == MVT::f64) - return DAG.getTargetExtractSubreg(ARM64::dsub, DL, VT, Sel); - else - return DAG.getNode(ISD::BITCAST, DL, VT, Sel); -} - -SDValue ARM64TargetLowering::LowerCTPOP(SDValue Op, SelectionDAG &DAG) const { - if (DAG.getMachineFunction().getFunction()->getAttributes().hasAttribute( - AttributeSet::FunctionIndex, Attribute::NoImplicitFloat)) - return SDValue(); - - // While there is no integer popcount instruction, it can - // be more efficiently lowered to the following sequence that uses - // AdvSIMD registers/instructions as long as the copies to/from - // the AdvSIMD registers are cheap. - // FMOV D0, X0 // copy 64-bit int to vector, high bits zero'd - // CNT V0.8B, V0.8B // 8xbyte pop-counts - // ADDV B0, V0.8B // sum 8xbyte pop-counts - // UMOV X0, V0.B[0] // copy byte result back to integer reg - SDValue Val = Op.getOperand(0); - SDLoc DL(Op); - EVT VT = Op.getValueType(); - SDValue ZeroVec = DAG.getUNDEF(MVT::v8i8); - - SDValue VecVal; - if (VT == MVT::i32) { - VecVal = DAG.getNode(ISD::BITCAST, DL, MVT::f32, Val); - VecVal = - DAG.getTargetInsertSubreg(ARM64::ssub, DL, MVT::v8i8, ZeroVec, VecVal); - } else { - VecVal = DAG.getNode(ISD::BITCAST, DL, MVT::v8i8, Val); - } - - SDValue CtPop = DAG.getNode(ISD::CTPOP, DL, MVT::v8i8, VecVal); - SDValue UaddLV = DAG.getNode( - ISD::INTRINSIC_WO_CHAIN, DL, MVT::i32, - DAG.getConstant(Intrinsic::arm64_neon_uaddlv, MVT::i32), CtPop); - - if (VT == MVT::i64) - UaddLV = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, UaddLV); - return UaddLV; -} - -SDValue ARM64TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const { - - if (Op.getValueType().isVector()) - return LowerVSETCC(Op, DAG); - - SDValue LHS = Op.getOperand(0); - SDValue RHS = Op.getOperand(1); - ISD::CondCode CC = cast(Op.getOperand(2))->get(); - SDLoc dl(Op); - - // We chose ZeroOrOneBooleanContents, so use zero and one. - EVT VT = Op.getValueType(); - SDValue TVal = DAG.getConstant(1, VT); - SDValue FVal = DAG.getConstant(0, VT); - - // Handle f128 first, since one possible outcome is a normal integer - // comparison which gets picked up by the next if statement. - if (LHS.getValueType() == MVT::f128) { - softenSetCCOperands(DAG, MVT::f128, LHS, RHS, CC, dl); - - // If softenSetCCOperands returned a scalar, use it. - if (!RHS.getNode()) { - assert(LHS.getValueType() == Op.getValueType() && - "Unexpected setcc expansion!"); - return LHS; - } - } - - if (LHS.getValueType().isInteger()) { - SDValue CCVal; - SDValue Cmp = - getARM64Cmp(LHS, RHS, ISD::getSetCCInverse(CC, true), CCVal, DAG, dl); - - // Note that we inverted the condition above, so we reverse the order of - // the true and false operands here. This will allow the setcc to be - // matched to a single CSINC instruction. - return DAG.getNode(ARM64ISD::CSEL, dl, VT, FVal, TVal, CCVal, Cmp); - } - - // Now we know we're dealing with FP values. - assert(LHS.getValueType() == MVT::f32 || LHS.getValueType() == MVT::f64); - - // If that fails, we'll need to perform an FCMP + CSEL sequence. Go ahead - // and do the comparison. 
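A scalar model of the CTPOP lowering above: CNT produces a per-byte population count and UADDLV sums the eight byte counts; the sketch below does the same with plain loops (hypothetical name):

    #include <cassert>
    #include <cstdint>
    #include <cstdio>

    unsigned popcountViaBytes(uint64_t v) {
      unsigned total = 0;
      for (int i = 0; i < 8; ++i) {
        uint8_t byte = static_cast<uint8_t>(v >> (8 * i));
        unsigned perByte = 0;                       // what CNT leaves in each byte lane
        while (byte) { perByte += byte & 1; byte >>= 1; }
        total += perByte;                           // what the horizontal add produces
      }
      return total;
    }

    int main() {
      assert(popcountViaBytes(0xFF00FF00FF00FF00ULL) == 32);
      assert(popcountViaBytes(0) == 0);
      std::printf("ok\n");
    }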
- SDValue Cmp = emitComparison(LHS, RHS, CC, dl, DAG); - - ARM64CC::CondCode CC1, CC2; - changeFPCCToARM64CC(CC, CC1, CC2); - if (CC2 == ARM64CC::AL) { - changeFPCCToARM64CC(ISD::getSetCCInverse(CC, false), CC1, CC2); - SDValue CC1Val = DAG.getConstant(CC1, MVT::i32); - - // Note that we inverted the condition above, so we reverse the order of - // the true and false operands here. This will allow the setcc to be - // matched to a single CSINC instruction. - return DAG.getNode(ARM64ISD::CSEL, dl, VT, FVal, TVal, CC1Val, Cmp); - } else { - // Unfortunately, the mapping of LLVM FP CC's onto ARM64 CC's isn't totally - // clean. Some of them require two CSELs to implement. As is in this case, - // we emit the first CSEL and then emit a second using the output of the - // first as the RHS. We're effectively OR'ing the two CC's together. - - // FIXME: It would be nice if we could match the two CSELs to two CSINCs. - SDValue CC1Val = DAG.getConstant(CC1, MVT::i32); - SDValue CS1 = DAG.getNode(ARM64ISD::CSEL, dl, VT, TVal, FVal, CC1Val, Cmp); - - SDValue CC2Val = DAG.getConstant(CC2, MVT::i32); - return DAG.getNode(ARM64ISD::CSEL, dl, VT, TVal, CS1, CC2Val, Cmp); - } -} - -/// A SELECT_CC operation is really some kind of max or min if both values being -/// compared are, in some sense, equal to the results in either case. However, -/// it is permissible to compare f32 values and produce directly extended f64 -/// values. -/// -/// Extending the comparison operands would also be allowed, but is less likely -/// to happen in practice since their use is right here. Note that truncate -/// operations would *not* be semantically equivalent. -static bool selectCCOpsAreFMaxCompatible(SDValue Cmp, SDValue Result) { - if (Cmp == Result) - return true; - - ConstantFPSDNode *CCmp = dyn_cast(Cmp); - ConstantFPSDNode *CResult = dyn_cast(Result); - if (CCmp && CResult && Cmp.getValueType() == MVT::f32 && - Result.getValueType() == MVT::f64) { - bool Lossy; - APFloat CmpVal = CCmp->getValueAPF(); - CmpVal.convert(APFloat::IEEEdouble, APFloat::rmNearestTiesToEven, &Lossy); - return CResult->getValueAPF().bitwiseIsEqual(CmpVal); - } - - return Result->getOpcode() == ISD::FP_EXTEND && Result->getOperand(0) == Cmp; -} - -SDValue ARM64TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const { - SDValue CC = Op->getOperand(0); - SDValue TVal = Op->getOperand(1); - SDValue FVal = Op->getOperand(2); - SDLoc DL(Op); - - unsigned Opc = CC.getOpcode(); - // Optimize {s|u}{add|sub|mul}.with.overflow feeding into a select - // instruction. - if (CC.getResNo() == 1 && - (Opc == ISD::SADDO || Opc == ISD::UADDO || Opc == ISD::SSUBO || - Opc == ISD::USUBO || Opc == ISD::SMULO || Opc == ISD::UMULO)) { - // Only lower legal XALUO ops. 
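The "single CSINC" remark above relies on CSINC's semantics (cond ? rn : rm + 1): with the condition inverted and the true/false operands swapped, "cc ? 1 : 0" needs no materialized constant. A standalone sketch of that equivalence (hypothetical names):

    #include <cassert>
    #include <cstdio>

    // Hardware semantics of CSINC: cond ? rn : rm + 1.
    unsigned csinc(unsigned rn, unsigned rm, bool cond) { return cond ? rn : rm + 1; }

    // Build "inverted-cc ? 0 : 1", which the selector can match as
    // "CSINC wd, wzr, wzr, inverted-cc" instead of CSEL plus a constant 1.
    unsigned setccAsCSINC(bool cc) {
      bool inverted = !cc;
      return csinc(/*rn=wzr*/ 0, /*rm=wzr*/ 0, inverted);
    }

    int main() {
      assert(setccAsCSINC(true) == 1 && setccAsCSINC(false) == 0);
      std::printf("ok\n");
    }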
- if (!DAG.getTargetLoweringInfo().isTypeLegal(CC->getValueType(0))) - return SDValue(); - - ARM64CC::CondCode OFCC; - SDValue Value, Overflow; - std::tie(Value, Overflow) = getARM64XALUOOp(OFCC, CC.getValue(0), DAG); - SDValue CCVal = DAG.getConstant(OFCC, MVT::i32); - - return DAG.getNode(ARM64ISD::CSEL, DL, Op.getValueType(), TVal, FVal, CCVal, - Overflow); - } - - if (CC.getOpcode() == ISD::SETCC) - return DAG.getSelectCC(DL, CC.getOperand(0), CC.getOperand(1), TVal, FVal, - cast(CC.getOperand(2))->get()); - else - return DAG.getSelectCC(DL, CC, DAG.getConstant(0, CC.getValueType()), TVal, - FVal, ISD::SETNE); -} - -SDValue ARM64TargetLowering::LowerSELECT_CC(SDValue Op, - SelectionDAG &DAG) const { - ISD::CondCode CC = cast(Op.getOperand(4))->get(); - SDValue LHS = Op.getOperand(0); - SDValue RHS = Op.getOperand(1); - SDValue TVal = Op.getOperand(2); - SDValue FVal = Op.getOperand(3); - SDLoc dl(Op); - - // Handle f128 first, because it will result in a comparison of some RTLIB - // call result against zero. - if (LHS.getValueType() == MVT::f128) { - softenSetCCOperands(DAG, MVT::f128, LHS, RHS, CC, dl); - - // If softenSetCCOperands returned a scalar, we need to compare the result - // against zero to select between true and false values. - if (!RHS.getNode()) { - RHS = DAG.getConstant(0, LHS.getValueType()); - CC = ISD::SETNE; - } - } - - // Handle integers first. - if (LHS.getValueType().isInteger()) { - assert((LHS.getValueType() == RHS.getValueType()) && - (LHS.getValueType() == MVT::i32 || LHS.getValueType() == MVT::i64)); - - unsigned Opcode = ARM64ISD::CSEL; - - // If both the TVal and the FVal are constants, see if we can swap them in - // order to for a CSINV or CSINC out of them. - ConstantSDNode *CFVal = dyn_cast(FVal); - ConstantSDNode *CTVal = dyn_cast(TVal); - - if (CTVal && CFVal && CTVal->isAllOnesValue() && CFVal->isNullValue()) { - std::swap(TVal, FVal); - std::swap(CTVal, CFVal); - CC = ISD::getSetCCInverse(CC, true); - } else if (CTVal && CFVal && CTVal->isOne() && CFVal->isNullValue()) { - std::swap(TVal, FVal); - std::swap(CTVal, CFVal); - CC = ISD::getSetCCInverse(CC, true); - } else if (TVal.getOpcode() == ISD::XOR) { - // If TVal is a NOT we want to swap TVal and FVal so that we can match - // with a CSINV rather than a CSEL. - ConstantSDNode *CVal = dyn_cast(TVal.getOperand(1)); - - if (CVal && CVal->isAllOnesValue()) { - std::swap(TVal, FVal); - std::swap(CTVal, CFVal); - CC = ISD::getSetCCInverse(CC, true); - } - } else if (TVal.getOpcode() == ISD::SUB) { - // If TVal is a negation (SUB from 0) we want to swap TVal and FVal so - // that we can match with a CSNEG rather than a CSEL. - ConstantSDNode *CVal = dyn_cast(TVal.getOperand(0)); - - if (CVal && CVal->isNullValue()) { - std::swap(TVal, FVal); - std::swap(CTVal, CFVal); - CC = ISD::getSetCCInverse(CC, true); - } - } else if (CTVal && CFVal) { - const int64_t TrueVal = CTVal->getSExtValue(); - const int64_t FalseVal = CFVal->getSExtValue(); - bool Swap = false; - - // If both TVal and FVal are constants, see if FVal is the - // inverse/negation/increment of TVal and generate a CSINV/CSNEG/CSINC - // instead of a CSEL in that case. - if (TrueVal == ~FalseVal) { - Opcode = ARM64ISD::CSINV; - } else if (TrueVal == -FalseVal) { - Opcode = ARM64ISD::CSNEG; - } else if (TVal.getValueType() == MVT::i32) { - // If our operands are only 32-bit wide, make sure we use 32-bit - // arithmetic for the check whether we can use CSINC. 
This ensures that - // the addition in the check will wrap around properly in case there is - // an overflow (which would not be the case if we do the check with - // 64-bit arithmetic). - const uint32_t TrueVal32 = CTVal->getZExtValue(); - const uint32_t FalseVal32 = CFVal->getZExtValue(); - - if ((TrueVal32 == FalseVal32 + 1) || (TrueVal32 + 1 == FalseVal32)) { - Opcode = ARM64ISD::CSINC; - - if (TrueVal32 > FalseVal32) { - Swap = true; - } - } - // 64-bit check whether we can use CSINC. - } else if ((TrueVal == FalseVal + 1) || (TrueVal + 1 == FalseVal)) { - Opcode = ARM64ISD::CSINC; - - if (TrueVal > FalseVal) { - Swap = true; - } - } - - // Swap TVal and FVal if necessary. - if (Swap) { - std::swap(TVal, FVal); - std::swap(CTVal, CFVal); - CC = ISD::getSetCCInverse(CC, true); - } - - if (Opcode != ARM64ISD::CSEL) { - // Drop FVal since we can get its value by simply inverting/negating - // TVal. - FVal = TVal; - } - } - - SDValue CCVal; - SDValue Cmp = getARM64Cmp(LHS, RHS, CC, CCVal, DAG, dl); - - EVT VT = Op.getValueType(); - return DAG.getNode(Opcode, dl, VT, TVal, FVal, CCVal, Cmp); - } - - // Now we know we're dealing with FP values. - assert(LHS.getValueType() == MVT::f32 || LHS.getValueType() == MVT::f64); - assert(LHS.getValueType() == RHS.getValueType()); - EVT VT = Op.getValueType(); - - // Try to match this select into a max/min operation, which have dedicated - // opcode in the instruction set. - // FIXME: This is not correct in the presence of NaNs, so we only enable this - // in no-NaNs mode. - if (getTargetMachine().Options.NoNaNsFPMath) { - SDValue MinMaxLHS = TVal, MinMaxRHS = FVal; - if (selectCCOpsAreFMaxCompatible(LHS, MinMaxRHS) && - selectCCOpsAreFMaxCompatible(RHS, MinMaxLHS)) { - CC = ISD::getSetCCSwappedOperands(CC); - std::swap(MinMaxLHS, MinMaxRHS); - } - - if (selectCCOpsAreFMaxCompatible(LHS, MinMaxLHS) && - selectCCOpsAreFMaxCompatible(RHS, MinMaxRHS)) { - switch (CC) { - default: - break; - case ISD::SETGT: - case ISD::SETGE: - case ISD::SETUGT: - case ISD::SETUGE: - case ISD::SETOGT: - case ISD::SETOGE: - return DAG.getNode(ARM64ISD::FMAX, dl, VT, MinMaxLHS, MinMaxRHS); - break; - case ISD::SETLT: - case ISD::SETLE: - case ISD::SETULT: - case ISD::SETULE: - case ISD::SETOLT: - case ISD::SETOLE: - return DAG.getNode(ARM64ISD::FMIN, dl, VT, MinMaxLHS, MinMaxRHS); - break; - } - } - } - - // If that fails, we'll need to perform an FCMP + CSEL sequence. Go ahead - // and do the comparison. - SDValue Cmp = emitComparison(LHS, RHS, CC, dl, DAG); - - // Unfortunately, the mapping of LLVM FP CC's onto ARM64 CC's isn't totally - // clean. Some of them require two CSELs to implement. - ARM64CC::CondCode CC1, CC2; - changeFPCCToARM64CC(CC, CC1, CC2); - SDValue CC1Val = DAG.getConstant(CC1, MVT::i32); - SDValue CS1 = DAG.getNode(ARM64ISD::CSEL, dl, VT, TVal, FVal, CC1Val, Cmp); - - // If we need a second CSEL, emit it, using the output of the first as the - // RHS. We're effectively OR'ing the two CC's together. - if (CC2 != ARM64CC::AL) { - SDValue CC2Val = DAG.getConstant(CC2, MVT::i32); - return DAG.getNode(ARM64ISD::CSEL, dl, VT, TVal, CS1, CC2Val, Cmp); - } - - // Otherwise, return the output of the first CSEL. - return CS1; -} - -SDValue ARM64TargetLowering::LowerJumpTable(SDValue Op, - SelectionDAG &DAG) const { - // Jump table entries as PC relative offsets. No additional tweaking - // is necessary here. Just get the address of the jump table. 
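The integer SELECT_CC path above picks CSINV, CSNEG or CSINC when the false constant is the inverse, negation or increment of the true constant, and deliberately performs the increment check in 32-bit arithmetic for i32 so the +1 wraps instead of being evaluated in 64 bits. A standalone sketch of just that classification (hypothetical names, no LLVM types):

#include <cassert>
#include <cstdint>

enum class CondSelKind { CSEL, CSINV, CSNEG, CSINC };

// Classify a (TrueVal, FalseVal) constant pair the way the code above does.
// Is32Bit selects 32-bit wrap-around semantics for the increment check.
static CondSelKind classifyCondSelect(int64_t TrueVal, int64_t FalseVal,
                                      bool Is32Bit) {
  if (TrueVal == ~FalseVal)
    return CondSelKind::CSINV;
  if (TrueVal == -FalseVal)
    return CondSelKind::CSNEG;
  if (Is32Bit) {
    uint32_t T = static_cast<uint32_t>(TrueVal);
    uint32_t F = static_cast<uint32_t>(FalseVal);
    if (T == F + 1 || T + 1 == F)
      return CondSelKind::CSINC;
  } else if (TrueVal == FalseVal + 1 || TrueVal + 1 == FalseVal) {
    return CondSelKind::CSINC;
  }
  return CondSelKind::CSEL;
}

int main() {
  assert(classifyCondSelect(5, ~5, false) == CondSelKind::CSINV);
  assert(classifyCondSelect(5, -5, false) == CondSelKind::CSNEG);
  assert(classifyCondSelect(7, 6, false) == CondSelKind::CSINC);
  assert(classifyCondSelect(7, 3, false) == CondSelKind::CSEL);
}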
-  JumpTableSDNode *JT = cast<JumpTableSDNode>(Op);
-  EVT PtrVT = getPointerTy();
-  SDLoc DL(Op);
-
-  if (getTargetMachine().getCodeModel() == CodeModel::Large &&
-      !Subtarget->isTargetMachO()) {
-    const unsigned char MO_NC = ARM64II::MO_NC;
-    return DAG.getNode(
-        ARM64ISD::WrapperLarge, DL, PtrVT,
-        DAG.getTargetJumpTable(JT->getIndex(), PtrVT, ARM64II::MO_G3),
-        DAG.getTargetJumpTable(JT->getIndex(), PtrVT, ARM64II::MO_G2 | MO_NC),
-        DAG.getTargetJumpTable(JT->getIndex(), PtrVT, ARM64II::MO_G1 | MO_NC),
-        DAG.getTargetJumpTable(JT->getIndex(), PtrVT, ARM64II::MO_G0 | MO_NC));
-  }
-
-  SDValue Hi = DAG.getTargetJumpTable(JT->getIndex(), PtrVT, ARM64II::MO_PAGE);
-  SDValue Lo = DAG.getTargetJumpTable(JT->getIndex(), PtrVT,
-                                      ARM64II::MO_PAGEOFF | ARM64II::MO_NC);
-  SDValue ADRP = DAG.getNode(ARM64ISD::ADRP, DL, PtrVT, Hi);
-  return DAG.getNode(ARM64ISD::ADDlow, DL, PtrVT, ADRP, Lo);
-}
-
-SDValue ARM64TargetLowering::LowerConstantPool(SDValue Op,
-                                               SelectionDAG &DAG) const {
-  ConstantPoolSDNode *CP = cast<ConstantPoolSDNode>(Op);
-  EVT PtrVT = getPointerTy();
-  SDLoc DL(Op);
-
-  if (getTargetMachine().getCodeModel() == CodeModel::Large) {
-    // Use the GOT for the large code model on iOS.
-    if (Subtarget->isTargetMachO()) {
-      SDValue GotAddr = DAG.getTargetConstantPool(
-          CP->getConstVal(), PtrVT, CP->getAlignment(), CP->getOffset(),
-          ARM64II::MO_GOT);
-      return DAG.getNode(ARM64ISD::LOADgot, DL, PtrVT, GotAddr);
-    }
-
-    const unsigned char MO_NC = ARM64II::MO_NC;
-    return DAG.getNode(
-        ARM64ISD::WrapperLarge, DL, PtrVT,
-        DAG.getTargetConstantPool(CP->getConstVal(), PtrVT, CP->getAlignment(),
-                                  CP->getOffset(), ARM64II::MO_G3),
-        DAG.getTargetConstantPool(CP->getConstVal(), PtrVT, CP->getAlignment(),
-                                  CP->getOffset(), ARM64II::MO_G2 | MO_NC),
-        DAG.getTargetConstantPool(CP->getConstVal(), PtrVT, CP->getAlignment(),
-                                  CP->getOffset(), ARM64II::MO_G1 | MO_NC),
-        DAG.getTargetConstantPool(CP->getConstVal(), PtrVT, CP->getAlignment(),
-                                  CP->getOffset(), ARM64II::MO_G0 | MO_NC));
-  } else {
-    // Use ADRP/ADD or ADRP/LDR for everything else: the small memory model on
-    // ELF, the only valid one on Darwin.
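Both addressing paths above split a symbol's address: the large-code-model path into four 16-bit chunks (MO_G3..MO_G0, consumed by a MOVZ/MOVK sequence), the default path into a 4KB page plus a 12-bit page offset (ADRP + ADD). A standalone sketch of the two splits on a plain 64-bit value (illustrative only; in reality the pieces are relocations resolved by the linker, and ADRP is PC-relative):

#include <cstdint>
#include <cstdio>

// Split an address into the 16-bit chunks a MOVZ/MOVK (G0..G3) sequence uses.
static void splitMovzMovk(uint64_t Addr, uint16_t Chunks[4]) {
  for (int i = 0; i < 4; ++i)
    Chunks[i] = static_cast<uint16_t>(Addr >> (16 * i)); // G0 is bits 0-15, ...
}

// Split an address into the ADRP page and the :lo12: page offset.
static void splitAdrpAdd(uint64_t Addr, uint64_t &Page, uint64_t &PageOff) {
  Page = Addr & ~UINT64_C(0xFFF);
  PageOff = Addr & 0xFFF;
}

int main() {
  uint64_t Addr = 0x0000123456789ABCULL;
  uint16_t C[4];
  splitMovzMovk(Addr, C);
  std::printf("G0=%04x G1=%04x G2=%04x G3=%04x\n", C[0], C[1], C[2], C[3]);
  uint64_t Page, Off;
  splitAdrpAdd(Addr, Page, Off);
  std::printf("page=%#llx pageoff=%#llx\n",
              (unsigned long long)Page, (unsigned long long)Off);
}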
- SDValue Hi = - DAG.getTargetConstantPool(CP->getConstVal(), PtrVT, CP->getAlignment(), - CP->getOffset(), ARM64II::MO_PAGE); - SDValue Lo = DAG.getTargetConstantPool( - CP->getConstVal(), PtrVT, CP->getAlignment(), CP->getOffset(), - ARM64II::MO_PAGEOFF | ARM64II::MO_NC); - - SDValue ADRP = DAG.getNode(ARM64ISD::ADRP, DL, PtrVT, Hi); - return DAG.getNode(ARM64ISD::ADDlow, DL, PtrVT, ADRP, Lo); - } -} - -SDValue ARM64TargetLowering::LowerBlockAddress(SDValue Op, - SelectionDAG &DAG) const { - const BlockAddress *BA = cast(Op)->getBlockAddress(); - EVT PtrVT = getPointerTy(); - SDLoc DL(Op); - if (getTargetMachine().getCodeModel() == CodeModel::Large && - !Subtarget->isTargetMachO()) { - const unsigned char MO_NC = ARM64II::MO_NC; - return DAG.getNode( - ARM64ISD::WrapperLarge, DL, PtrVT, - DAG.getTargetBlockAddress(BA, PtrVT, 0, ARM64II::MO_G3), - DAG.getTargetBlockAddress(BA, PtrVT, 0, ARM64II::MO_G2 | MO_NC), - DAG.getTargetBlockAddress(BA, PtrVT, 0, ARM64II::MO_G1 | MO_NC), - DAG.getTargetBlockAddress(BA, PtrVT, 0, ARM64II::MO_G0 | MO_NC)); - } else { - SDValue Hi = DAG.getTargetBlockAddress(BA, PtrVT, 0, ARM64II::MO_PAGE); - SDValue Lo = DAG.getTargetBlockAddress(BA, PtrVT, 0, ARM64II::MO_PAGEOFF | - ARM64II::MO_NC); - SDValue ADRP = DAG.getNode(ARM64ISD::ADRP, DL, PtrVT, Hi); - return DAG.getNode(ARM64ISD::ADDlow, DL, PtrVT, ADRP, Lo); - } -} - -SDValue ARM64TargetLowering::LowerDarwin_VASTART(SDValue Op, - SelectionDAG &DAG) const { - ARM64FunctionInfo *FuncInfo = - DAG.getMachineFunction().getInfo(); - - SDLoc DL(Op); - SDValue FR = - DAG.getFrameIndex(FuncInfo->getVarArgsStackIndex(), getPointerTy()); - const Value *SV = cast(Op.getOperand(2))->getValue(); - return DAG.getStore(Op.getOperand(0), DL, FR, Op.getOperand(1), - MachinePointerInfo(SV), false, false, 0); -} - -SDValue ARM64TargetLowering::LowerAAPCS_VASTART(SDValue Op, - SelectionDAG &DAG) const { - // The layout of the va_list struct is specified in the AArch64 Procedure Call - // Standard, section B.3. 
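For reference, the structure that LowerAAPCS_VASTART fills in just below is the AAPCS64 va_list; the field names follow the ABI document and the offsets match the stores emitted below (0, 8, 16, 24, 28), with the two offset fields initialised to the negated sizes of the saved register areas. A C-level sketch:

#include <cstddef>

// AAPCS64 (section B.3) va_list layout targeted by LowerAAPCS_VASTART below.
struct AAPCS64VaList {
  void *__stack;   // offset 0:  next stacked argument
  void *__gr_top;  // offset 8:  byte past the saved general-purpose registers
  void *__vr_top;  // offset 16: byte past the saved FP/SIMD registers
  int __gr_offs;   // offset 24: starts at -(size of saved GPR area)
  int __vr_offs;   // offset 28: starts at -(size of saved FP/SIMD area)
};

static_assert(sizeof(AAPCS64VaList) == 32, "matches the AAPCS VaListSize used later");
static_assert(offsetof(AAPCS64VaList, __vr_offs) == 28, "matches the last store offset");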
- MachineFunction &MF = DAG.getMachineFunction(); - ARM64FunctionInfo *FuncInfo = MF.getInfo(); - SDLoc DL(Op); - - SDValue Chain = Op.getOperand(0); - SDValue VAList = Op.getOperand(1); - const Value *SV = cast(Op.getOperand(2))->getValue(); - SmallVector MemOps; - - // void *__stack at offset 0 - SDValue Stack = - DAG.getFrameIndex(FuncInfo->getVarArgsStackIndex(), getPointerTy()); - MemOps.push_back(DAG.getStore(Chain, DL, Stack, VAList, - MachinePointerInfo(SV), false, false, 8)); - - // void *__gr_top at offset 8 - int GPRSize = FuncInfo->getVarArgsGPRSize(); - if (GPRSize > 0) { - SDValue GRTop, GRTopAddr; - - GRTopAddr = DAG.getNode(ISD::ADD, DL, getPointerTy(), VAList, - DAG.getConstant(8, getPointerTy())); - - GRTop = DAG.getFrameIndex(FuncInfo->getVarArgsGPRIndex(), getPointerTy()); - GRTop = DAG.getNode(ISD::ADD, DL, getPointerTy(), GRTop, - DAG.getConstant(GPRSize, getPointerTy())); - - MemOps.push_back(DAG.getStore(Chain, DL, GRTop, GRTopAddr, - MachinePointerInfo(SV, 8), false, false, 8)); - } - - // void *__vr_top at offset 16 - int FPRSize = FuncInfo->getVarArgsFPRSize(); - if (FPRSize > 0) { - SDValue VRTop, VRTopAddr; - VRTopAddr = DAG.getNode(ISD::ADD, DL, getPointerTy(), VAList, - DAG.getConstant(16, getPointerTy())); - - VRTop = DAG.getFrameIndex(FuncInfo->getVarArgsFPRIndex(), getPointerTy()); - VRTop = DAG.getNode(ISD::ADD, DL, getPointerTy(), VRTop, - DAG.getConstant(FPRSize, getPointerTy())); - - MemOps.push_back(DAG.getStore(Chain, DL, VRTop, VRTopAddr, - MachinePointerInfo(SV, 16), false, false, 8)); - } - - // int __gr_offs at offset 24 - SDValue GROffsAddr = DAG.getNode(ISD::ADD, DL, getPointerTy(), VAList, - DAG.getConstant(24, getPointerTy())); - MemOps.push_back(DAG.getStore(Chain, DL, DAG.getConstant(-GPRSize, MVT::i32), - GROffsAddr, MachinePointerInfo(SV, 24), false, - false, 4)); - - // int __vr_offs at offset 28 - SDValue VROffsAddr = DAG.getNode(ISD::ADD, DL, getPointerTy(), VAList, - DAG.getConstant(28, getPointerTy())); - MemOps.push_back(DAG.getStore(Chain, DL, DAG.getConstant(-FPRSize, MVT::i32), - VROffsAddr, MachinePointerInfo(SV, 28), false, - false, 4)); - - return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOps); -} - -SDValue ARM64TargetLowering::LowerVASTART(SDValue Op, SelectionDAG &DAG) const { - return Subtarget->isTargetDarwin() ? LowerDarwin_VASTART(Op, DAG) - : LowerAAPCS_VASTART(Op, DAG); -} - -SDValue ARM64TargetLowering::LowerVACOPY(SDValue Op, SelectionDAG &DAG) const { - // AAPCS has three pointers and two ints (= 32 bytes), Darwin has single - // pointer. - unsigned VaListSize = Subtarget->isTargetDarwin() ? 
8 : 32; - const Value *DestSV = cast(Op.getOperand(3))->getValue(); - const Value *SrcSV = cast(Op.getOperand(4))->getValue(); - - return DAG.getMemcpy(Op.getOperand(0), SDLoc(Op), Op.getOperand(1), - Op.getOperand(2), DAG.getConstant(VaListSize, MVT::i32), - 8, false, false, MachinePointerInfo(DestSV), - MachinePointerInfo(SrcSV)); -} - -SDValue ARM64TargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG) const { - assert(Subtarget->isTargetDarwin() && - "automatic va_arg instruction only works on Darwin"); - - const Value *V = cast(Op.getOperand(2))->getValue(); - EVT VT = Op.getValueType(); - SDLoc DL(Op); - SDValue Chain = Op.getOperand(0); - SDValue Addr = Op.getOperand(1); - unsigned Align = Op.getConstantOperandVal(3); - - SDValue VAList = DAG.getLoad(getPointerTy(), DL, Chain, Addr, - MachinePointerInfo(V), false, false, false, 0); - Chain = VAList.getValue(1); - - if (Align > 8) { - assert(((Align & (Align - 1)) == 0) && "Expected Align to be a power of 2"); - VAList = DAG.getNode(ISD::ADD, DL, getPointerTy(), VAList, - DAG.getConstant(Align - 1, getPointerTy())); - VAList = DAG.getNode(ISD::AND, DL, getPointerTy(), VAList, - DAG.getConstant(-(int64_t)Align, getPointerTy())); - } - - Type *ArgTy = VT.getTypeForEVT(*DAG.getContext()); - uint64_t ArgSize = getDataLayout()->getTypeAllocSize(ArgTy); - - // Scalar integer and FP values smaller than 64 bits are implicitly extended - // up to 64 bits. At the very least, we have to increase the striding of the - // vaargs list to match this, and for FP values we need to introduce - // FP_ROUND nodes as well. - if (VT.isInteger() && !VT.isVector()) - ArgSize = 8; - bool NeedFPTrunc = false; - if (VT.isFloatingPoint() && !VT.isVector() && VT != MVT::f64) { - ArgSize = 8; - NeedFPTrunc = true; - } - - // Increment the pointer, VAList, to the next vaarg - SDValue VANext = DAG.getNode(ISD::ADD, DL, getPointerTy(), VAList, - DAG.getConstant(ArgSize, getPointerTy())); - // Store the incremented VAList to the legalized pointer - SDValue APStore = DAG.getStore(Chain, DL, VANext, Addr, MachinePointerInfo(V), - false, false, 0); - - // Load the actual argument out of the pointer VAList - if (NeedFPTrunc) { - // Load the value as an f64. - SDValue WideFP = DAG.getLoad(MVT::f64, DL, APStore, VAList, - MachinePointerInfo(), false, false, false, 0); - // Round the value down to an f32. - SDValue NarrowFP = DAG.getNode(ISD::FP_ROUND, DL, VT, WideFP.getValue(0), - DAG.getIntPtrConstant(1)); - SDValue Ops[] = { NarrowFP, WideFP.getValue(1) }; - // Merge the rounded value with the chain output of the load. - return DAG.getMergeValues(Ops, DL); - } - - return DAG.getLoad(VT, DL, APStore, VAList, MachinePointerInfo(), false, - false, false, 0); -} - -SDValue ARM64TargetLowering::LowerFRAMEADDR(SDValue Op, - SelectionDAG &DAG) const { - MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo(); - MFI->setFrameAddressIsTaken(true); - - EVT VT = Op.getValueType(); - SDLoc DL(Op); - unsigned Depth = cast(Op.getOperand(0))->getZExtValue(); - SDValue FrameAddr = DAG.getCopyFromReg(DAG.getEntryNode(), DL, ARM64::FP, VT); - while (Depth--) - FrameAddr = DAG.getLoad(VT, DL, DAG.getEntryNode(), FrameAddr, - MachinePointerInfo(), false, false, false, 0); - return FrameAddr; -} - -// FIXME? Maybe this could be a TableGen attribute on some registers and -// this table could be generated automatically from RegInfo. 
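The Darwin va_arg lowering above rounds the va_list pointer up when the requested alignment exceeds 8, using the usual add-then-mask idiom, which is only valid for power-of-two alignments (hence the assert). A standalone sketch:

#include <cassert>
#include <cstdint>

// Round Ptr up to Align the same way the va_arg lowering above does:
// add Align-1, then clear the low bits. Align must be a power of two.
static uint64_t alignVaArgPtr(uint64_t Ptr, uint64_t Align) {
  assert((Align & (Align - 1)) == 0 && "Expected Align to be a power of 2");
  return (Ptr + Align - 1) & ~(Align - 1);
}

int main() {
  assert(alignVaArgPtr(0x1004, 16) == 0x1010);
  assert(alignVaArgPtr(0x1010, 16) == 0x1010); // already aligned: unchanged
}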
-unsigned ARM64TargetLowering::getRegisterByName(const char* RegName, - EVT VT) const { - unsigned Reg = StringSwitch(RegName) - .Case("sp", ARM64::SP) - .Default(0); - if (Reg) - return Reg; - report_fatal_error("Invalid register name global variable"); -} - -SDValue ARM64TargetLowering::LowerRETURNADDR(SDValue Op, - SelectionDAG &DAG) const { - MachineFunction &MF = DAG.getMachineFunction(); - MachineFrameInfo *MFI = MF.getFrameInfo(); - MFI->setReturnAddressIsTaken(true); - - EVT VT = Op.getValueType(); - SDLoc DL(Op); - unsigned Depth = cast(Op.getOperand(0))->getZExtValue(); - if (Depth) { - SDValue FrameAddr = LowerFRAMEADDR(Op, DAG); - SDValue Offset = DAG.getConstant(8, getPointerTy()); - return DAG.getLoad(VT, DL, DAG.getEntryNode(), - DAG.getNode(ISD::ADD, DL, VT, FrameAddr, Offset), - MachinePointerInfo(), false, false, false, 0); - } - - // Return LR, which contains the return address. Mark it an implicit live-in. - unsigned Reg = MF.addLiveIn(ARM64::LR, &ARM64::GPR64RegClass); - return DAG.getCopyFromReg(DAG.getEntryNode(), DL, Reg, VT); -} - -/// LowerShiftRightParts - Lower SRA_PARTS, which returns two -/// i64 values and take a 2 x i64 value to shift plus a shift amount. -SDValue ARM64TargetLowering::LowerShiftRightParts(SDValue Op, - SelectionDAG &DAG) const { - assert(Op.getNumOperands() == 3 && "Not a double-shift!"); - EVT VT = Op.getValueType(); - unsigned VTBits = VT.getSizeInBits(); - SDLoc dl(Op); - SDValue ShOpLo = Op.getOperand(0); - SDValue ShOpHi = Op.getOperand(1); - SDValue ShAmt = Op.getOperand(2); - SDValue ARMcc; - unsigned Opc = (Op.getOpcode() == ISD::SRA_PARTS) ? ISD::SRA : ISD::SRL; - - assert(Op.getOpcode() == ISD::SRA_PARTS || Op.getOpcode() == ISD::SRL_PARTS); - - SDValue RevShAmt = DAG.getNode(ISD::SUB, dl, MVT::i64, - DAG.getConstant(VTBits, MVT::i64), ShAmt); - SDValue Tmp1 = DAG.getNode(ISD::SRL, dl, VT, ShOpLo, ShAmt); - SDValue ExtraShAmt = DAG.getNode(ISD::SUB, dl, MVT::i64, ShAmt, - DAG.getConstant(VTBits, MVT::i64)); - SDValue Tmp2 = DAG.getNode(ISD::SHL, dl, VT, ShOpHi, RevShAmt); - - SDValue Cmp = emitComparison(ExtraShAmt, DAG.getConstant(0, MVT::i64), - ISD::SETGE, dl, DAG); - SDValue CCVal = DAG.getConstant(ARM64CC::GE, MVT::i32); - - SDValue FalseValLo = DAG.getNode(ISD::OR, dl, VT, Tmp1, Tmp2); - SDValue TrueValLo = DAG.getNode(Opc, dl, VT, ShOpHi, ExtraShAmt); - SDValue Lo = - DAG.getNode(ARM64ISD::CSEL, dl, VT, TrueValLo, FalseValLo, CCVal, Cmp); - - // ARM64 shifts larger than the register width are wrapped rather than - // clamped, so we can't just emit "hi >> x". - SDValue FalseValHi = DAG.getNode(Opc, dl, VT, ShOpHi, ShAmt); - SDValue TrueValHi = Opc == ISD::SRA - ? DAG.getNode(Opc, dl, VT, ShOpHi, - DAG.getConstant(VTBits - 1, MVT::i64)) - : DAG.getConstant(0, VT); - SDValue Hi = - DAG.getNode(ARM64ISD::CSEL, dl, VT, TrueValHi, FalseValHi, CCVal, Cmp); - - SDValue Ops[2] = { Lo, Hi }; - return DAG.getMergeValues(Ops, dl); -} - -/// LowerShiftLeftParts - Lower SHL_PARTS, which returns two -/// i64 values and take a 2 x i64 value to shift plus a shift amount. 
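As the shift-right-parts lowering above notes, AArch64 shifts by amounts at or beyond the register width wrap rather than clamp, so the lowering selects with a CSEL between "hi >> (amt - 64)" and "(lo >> amt) | (hi << (64 - amt))" depending on whether amt >= 64. A standalone sketch of the same selection for a logical 128-bit right shift (plain C++, hypothetical helper, shift amount restricted to 1..127 just as the DAG expansion assumes a sensible amount):

#include <cassert>
#include <cstdint>
#include <utility>

// 128-bit logical shift right built from 64-bit halves; returns {lo, hi}.
static std::pair<uint64_t, uint64_t> lshr128(uint64_t Lo, uint64_t Hi,
                                             unsigned Amt) {
  assert(Amt > 0 && Amt < 128);
  if (Amt >= 64)                                   // the "ExtraShAmt >= 0" arm
    return {Hi >> (Amt - 64), 0};
  return {(Lo >> Amt) | (Hi << (64 - Amt)),        // combine both halves
          Hi >> Amt};
}

int main() {
  auto R = lshr128(0x0, 0x1, 4); // 2^64 >> 4 == 2^60
  assert(R.first == (UINT64_C(1) << 60) && R.second == 0);
}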
-SDValue ARM64TargetLowering::LowerShiftLeftParts(SDValue Op, - SelectionDAG &DAG) const { - assert(Op.getNumOperands() == 3 && "Not a double-shift!"); - EVT VT = Op.getValueType(); - unsigned VTBits = VT.getSizeInBits(); - SDLoc dl(Op); - SDValue ShOpLo = Op.getOperand(0); - SDValue ShOpHi = Op.getOperand(1); - SDValue ShAmt = Op.getOperand(2); - SDValue ARMcc; - - assert(Op.getOpcode() == ISD::SHL_PARTS); - SDValue RevShAmt = DAG.getNode(ISD::SUB, dl, MVT::i64, - DAG.getConstant(VTBits, MVT::i64), ShAmt); - SDValue Tmp1 = DAG.getNode(ISD::SRL, dl, VT, ShOpLo, RevShAmt); - SDValue ExtraShAmt = DAG.getNode(ISD::SUB, dl, MVT::i64, ShAmt, - DAG.getConstant(VTBits, MVT::i64)); - SDValue Tmp2 = DAG.getNode(ISD::SHL, dl, VT, ShOpHi, ShAmt); - SDValue Tmp3 = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, ExtraShAmt); - - SDValue FalseVal = DAG.getNode(ISD::OR, dl, VT, Tmp1, Tmp2); - - SDValue Cmp = emitComparison(ExtraShAmt, DAG.getConstant(0, MVT::i64), - ISD::SETGE, dl, DAG); - SDValue CCVal = DAG.getConstant(ARM64CC::GE, MVT::i32); - SDValue Hi = DAG.getNode(ARM64ISD::CSEL, dl, VT, Tmp3, FalseVal, CCVal, Cmp); - - // ARM64 shifts of larger than register sizes are wrapped rather than clamped, - // so we can't just emit "lo << a" if a is too big. - SDValue TrueValLo = DAG.getConstant(0, VT); - SDValue FalseValLo = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, ShAmt); - SDValue Lo = - DAG.getNode(ARM64ISD::CSEL, dl, VT, TrueValLo, FalseValLo, CCVal, Cmp); - - SDValue Ops[2] = { Lo, Hi }; - return DAG.getMergeValues(Ops, dl); -} - -bool -ARM64TargetLowering::isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const { - // The ARM64 target doesn't support folding offsets into global addresses. - return false; -} - -bool ARM64TargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT) const { - // We can materialize #0.0 as fmov $Rd, XZR for 64-bit and 32-bit cases. - // FIXME: We should be able to handle f128 as well with a clever lowering. - if (Imm.isPosZero() && (VT == MVT::f64 || VT == MVT::f32)) - return true; - - if (VT == MVT::f64) - return ARM64_AM::getFP64Imm(Imm) != -1; - else if (VT == MVT::f32) - return ARM64_AM::getFP32Imm(Imm) != -1; - return false; -} - -//===----------------------------------------------------------------------===// -// ARM64 Optimization Hooks -//===----------------------------------------------------------------------===// - -//===----------------------------------------------------------------------===// -// ARM64 Inline Assembly Support -//===----------------------------------------------------------------------===// - -// Table of Constraints -// TODO: This is the current set of constraints supported by ARM for the -// compiler, not all of them may make sense, e.g. S may be difficult to support. 
-// -// r - A general register -// w - An FP/SIMD register of some size in the range v0-v31 -// x - An FP/SIMD register of some size in the range v0-v15 -// I - Constant that can be used with an ADD instruction -// J - Constant that can be used with a SUB instruction -// K - Constant that can be used with a 32-bit logical instruction -// L - Constant that can be used with a 64-bit logical instruction -// M - Constant that can be used as a 32-bit MOV immediate -// N - Constant that can be used as a 64-bit MOV immediate -// Q - A memory reference with base register and no offset -// S - A symbolic address -// Y - Floating point constant zero -// Z - Integer constant zero -// -// Note that general register operands will be output using their 64-bit x -// register name, whatever the size of the variable, unless the asm operand -// is prefixed by the %w modifier. Floating-point and SIMD register operands -// will be output with the v prefix unless prefixed by the %b, %h, %s, %d or -// %q modifier. - -/// getConstraintType - Given a constraint letter, return the type of -/// constraint it is for this target. -ARM64TargetLowering::ConstraintType -ARM64TargetLowering::getConstraintType(const std::string &Constraint) const { - if (Constraint.size() == 1) { - switch (Constraint[0]) { - default: - break; - case 'z': - return C_Other; - case 'x': - case 'w': - return C_RegisterClass; - // An address with a single base register. Due to the way we - // currently handle addresses it is the same as 'r'. - case 'Q': - return C_Memory; - } - } - return TargetLowering::getConstraintType(Constraint); -} - -/// Examine constraint type and operand type and determine a weight value. -/// This object must already have been set up with the operand type -/// and the current alternative constraint selected. -TargetLowering::ConstraintWeight -ARM64TargetLowering::getSingleConstraintMatchWeight( - AsmOperandInfo &info, const char *constraint) const { - ConstraintWeight weight = CW_Invalid; - Value *CallOperandVal = info.CallOperandVal; - // If we don't have a value, we can't do a match, - // but allow it at the lowest weight. - if (!CallOperandVal) - return CW_Default; - Type *type = CallOperandVal->getType(); - // Look at the constraint type. - switch (*constraint) { - default: - weight = TargetLowering::getSingleConstraintMatchWeight(info, constraint); - break; - case 'x': - case 'w': - if (type->isFloatingPointTy() || type->isVectorTy()) - weight = CW_Register; - break; - case 'z': - weight = CW_Constant; - break; - } - return weight; -} - -std::pair -ARM64TargetLowering::getRegForInlineAsmConstraint(const std::string &Constraint, - MVT VT) const { - if (Constraint.size() == 1) { - switch (Constraint[0]) { - case 'r': - if (VT.getSizeInBits() == 64) - return std::make_pair(0U, &ARM64::GPR64commonRegClass); - return std::make_pair(0U, &ARM64::GPR32commonRegClass); - case 'w': - if (VT == MVT::f32) - return std::make_pair(0U, &ARM64::FPR32RegClass); - if (VT.getSizeInBits() == 64) - return std::make_pair(0U, &ARM64::FPR64RegClass); - if (VT.getSizeInBits() == 128) - return std::make_pair(0U, &ARM64::FPR128RegClass); - break; - // The instructions that this constraint is designed for can - // only take 128-bit registers so just use that regclass. 
- case 'x': - if (VT.getSizeInBits() == 128) - return std::make_pair(0U, &ARM64::FPR128_loRegClass); - break; - } - } - if (StringRef("{cc}").equals_lower(Constraint)) - return std::make_pair(unsigned(ARM64::NZCV), &ARM64::CCRRegClass); - - // Use the default implementation in TargetLowering to convert the register - // constraint into a member of a register class. - std::pair Res; - Res = TargetLowering::getRegForInlineAsmConstraint(Constraint, VT); - - // Not found as a standard register? - if (!Res.second) { - unsigned Size = Constraint.size(); - if ((Size == 4 || Size == 5) && Constraint[0] == '{' && - tolower(Constraint[1]) == 'v' && Constraint[Size - 1] == '}') { - const std::string Reg = - std::string(&Constraint[2], &Constraint[Size - 1]); - int RegNo = atoi(Reg.c_str()); - if (RegNo >= 0 && RegNo <= 31) { - // v0 - v31 are aliases of q0 - q31. - // By default we'll emit v0-v31 for this unless there's a modifier where - // we'll emit the correct register as well. - Res.first = ARM64::FPR128RegClass.getRegister(RegNo); - Res.second = &ARM64::FPR128RegClass; - } - } - } - - return Res; -} - -/// LowerAsmOperandForConstraint - Lower the specified operand into the Ops -/// vector. If it is invalid, don't add anything to Ops. -void ARM64TargetLowering::LowerAsmOperandForConstraint( - SDValue Op, std::string &Constraint, std::vector &Ops, - SelectionDAG &DAG) const { - SDValue Result; - - // Currently only support length 1 constraints. - if (Constraint.length() != 1) - return; - - char ConstraintLetter = Constraint[0]; - switch (ConstraintLetter) { - default: - break; - - // This set of constraints deal with valid constants for various instructions. - // Validate and return a target constant for them if we can. - case 'z': { - // 'z' maps to xzr or wzr so it needs an input of 0. - ConstantSDNode *C = dyn_cast(Op); - if (!C || C->getZExtValue() != 0) - return; - - if (Op.getValueType() == MVT::i64) - Result = DAG.getRegister(ARM64::XZR, MVT::i64); - else - Result = DAG.getRegister(ARM64::WZR, MVT::i32); - break; - } - - case 'I': - case 'J': - case 'K': - case 'L': - case 'M': - case 'N': - ConstantSDNode *C = dyn_cast(Op); - if (!C) - return; - - // Grab the value and do some validation. - uint64_t CVal = C->getZExtValue(); - switch (ConstraintLetter) { - // The I constraint applies only to simple ADD or SUB immediate operands: - // i.e. 0 to 4095 with optional shift by 12 - // The J constraint applies only to ADD or SUB immediates that would be - // valid when negated, i.e. if [an add pattern] were to be output as a SUB - // instruction [or vice versa], in other words -1 to -4095 with optional - // left shift by 12. - case 'I': - if (isUInt<12>(CVal) || isShiftedUInt<12, 12>(CVal)) - break; - return; - case 'J': { - uint64_t NVal = -C->getSExtValue(); - if (isUInt<12>(NVal) || isShiftedUInt<12, 12>(NVal)) - break; - return; - } - // The K and L constraints apply *only* to logical immediates, including - // what used to be the MOVI alias for ORR (though the MOVI alias has now - // been removed and MOV should be used). So these constraints have to - // distinguish between bit patterns that are valid 32-bit or 64-bit - // "bitmask immediates": for example 0xaaaaaaaa is a valid bimm32 (K), but - // not a valid bimm64 (L) where 0xaaaaaaaaaaaaaaaa would be valid, and vice - // versa. 
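The 'I' and 'J' constraint cases above accept exactly the AArch64 arithmetic immediates: a 12-bit value optionally shifted left by 12 (for 'J', tested after negating the constant). A standalone sketch of that predicate:

#include <cassert>
#include <cstdint>

// An AArch64 ADD/SUB immediate: 12 bits, optionally shifted left by 12
// (the same shape isUInt<12> / isShiftedUInt<12, 12> test above).
static bool isAddSubImm(uint64_t V) {
  return (V & ~UINT64_C(0xFFF)) == 0 ||     // 0..4095
         (V & ~UINT64_C(0xFFF000)) == 0;    // 0..4095, shifted left by 12
}

int main() {
  assert(isAddSubImm(4095));
  assert(isAddSubImm(0x123000));
  assert(!isAddSubImm(0x1001000)); // more than 12 significant bits
  assert(!isAddSubImm(0x123456));  // low and shifted bits set together
}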
- case 'K': - if (ARM64_AM::isLogicalImmediate(CVal, 32)) - break; - return; - case 'L': - if (ARM64_AM::isLogicalImmediate(CVal, 64)) - break; - return; - // The M and N constraints are a superset of K and L respectively, for use - // with the MOV (immediate) alias. As well as the logical immediates they - // also match 32 or 64-bit immediates that can be loaded either using a - // *single* MOVZ or MOVN , such as 32-bit 0x12340000, 0x00001234, 0xffffedca - // (M) or 64-bit 0x1234000000000000 (N) etc. - // As a note some of this code is liberally stolen from the asm parser. - case 'M': { - if (!isUInt<32>(CVal)) - return; - if (ARM64_AM::isLogicalImmediate(CVal, 32)) - break; - if ((CVal & 0xFFFF) == CVal) - break; - if ((CVal & 0xFFFF0000ULL) == CVal) - break; - uint64_t NCVal = ~(uint32_t)CVal; - if ((NCVal & 0xFFFFULL) == NCVal) - break; - if ((NCVal & 0xFFFF0000ULL) == NCVal) - break; - return; - } - case 'N': { - if (ARM64_AM::isLogicalImmediate(CVal, 64)) - break; - if ((CVal & 0xFFFFULL) == CVal) - break; - if ((CVal & 0xFFFF0000ULL) == CVal) - break; - if ((CVal & 0xFFFF00000000ULL) == CVal) - break; - if ((CVal & 0xFFFF000000000000ULL) == CVal) - break; - uint64_t NCVal = ~CVal; - if ((NCVal & 0xFFFFULL) == NCVal) - break; - if ((NCVal & 0xFFFF0000ULL) == NCVal) - break; - if ((NCVal & 0xFFFF00000000ULL) == NCVal) - break; - if ((NCVal & 0xFFFF000000000000ULL) == NCVal) - break; - return; - } - default: - return; - } - - // All assembler immediates are 64-bit integers. - Result = DAG.getTargetConstant(CVal, MVT::i64); - break; - } - - if (Result.getNode()) { - Ops.push_back(Result); - return; - } - - return TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG); -} - -//===----------------------------------------------------------------------===// -// ARM64 Advanced SIMD Support -//===----------------------------------------------------------------------===// - -/// WidenVector - Given a value in the V64 register class, produce the -/// equivalent value in the V128 register class. -static SDValue WidenVector(SDValue V64Reg, SelectionDAG &DAG) { - EVT VT = V64Reg.getValueType(); - unsigned NarrowSize = VT.getVectorNumElements(); - MVT EltTy = VT.getVectorElementType().getSimpleVT(); - MVT WideTy = MVT::getVectorVT(EltTy, 2 * NarrowSize); - SDLoc DL(V64Reg); - - return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, WideTy, DAG.getUNDEF(WideTy), - V64Reg, DAG.getConstant(0, MVT::i32)); -} - -/// getExtFactor - Determine the adjustment factor for the position when -/// generating an "extract from vector registers" instruction. -static unsigned getExtFactor(SDValue &V) { - EVT EltType = V.getValueType().getVectorElementType(); - return EltType.getSizeInBits() / 8; -} - -/// NarrowVector - Given a value in the V128 register class, produce the -/// equivalent value in the V64 register class. -static SDValue NarrowVector(SDValue V128Reg, SelectionDAG &DAG) { - EVT VT = V128Reg.getValueType(); - unsigned WideSize = VT.getVectorNumElements(); - MVT EltTy = VT.getVectorElementType().getSimpleVT(); - MVT NarrowTy = MVT::getVectorVT(EltTy, WideSize / 2); - SDLoc DL(V128Reg); - - return DAG.getTargetExtractSubreg(ARM64::dsub, DL, NarrowTy, V128Reg); -} - -// Gather data to see if the operation can be modelled as a -// shuffle in combination with VEXTs. 
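Beyond the logical-immediate case, the 'M' constraint above accepts a 32-bit value that a single MOVZ or MOVN can produce: one 16-bit chunk in either halfword position, or the complement of such a value. A standalone sketch of just that movable-chunk test (the 0xffffedca example comes from the comment above):

#include <cassert>
#include <cstdint>

// True if a single MOVZ (one 16-bit chunk at bit 0 or 16) or a single MOVN
// (the complement of such a value) can materialise the 32-bit value V.
static bool isSingleMovzMovn32(uint32_t V) {
  auto OneChunk = [](uint32_t X) {
    return (X & 0xFFFFu) == X || (X & 0xFFFF0000u) == X;
  };
  return OneChunk(V) || OneChunk(~V);
}

int main() {
  assert(isSingleMovzMovn32(0x00001234));  // MOVZ w0, #0x1234
  assert(isSingleMovzMovn32(0x12340000));  // MOVZ w0, #0x1234, lsl #16
  assert(isSingleMovzMovn32(0xFFFFEDCA));  // MOVN of 0x1235
  assert(!isSingleMovzMovn32(0x12345678)); // would need MOVZ + MOVK
}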
-SDValue ARM64TargetLowering::ReconstructShuffle(SDValue Op, - SelectionDAG &DAG) const { - SDLoc dl(Op); - EVT VT = Op.getValueType(); - unsigned NumElts = VT.getVectorNumElements(); - - SmallVector SourceVecs; - SmallVector MinElts; - SmallVector MaxElts; - - for (unsigned i = 0; i < NumElts; ++i) { - SDValue V = Op.getOperand(i); - if (V.getOpcode() == ISD::UNDEF) - continue; - else if (V.getOpcode() != ISD::EXTRACT_VECTOR_ELT) { - // A shuffle can only come from building a vector from various - // elements of other vectors. - return SDValue(); - } - - // Record this extraction against the appropriate vector if possible... - SDValue SourceVec = V.getOperand(0); - unsigned EltNo = cast(V.getOperand(1))->getZExtValue(); - bool FoundSource = false; - for (unsigned j = 0; j < SourceVecs.size(); ++j) { - if (SourceVecs[j] == SourceVec) { - if (MinElts[j] > EltNo) - MinElts[j] = EltNo; - if (MaxElts[j] < EltNo) - MaxElts[j] = EltNo; - FoundSource = true; - break; - } - } - - // Or record a new source if not... - if (!FoundSource) { - SourceVecs.push_back(SourceVec); - MinElts.push_back(EltNo); - MaxElts.push_back(EltNo); - } - } - - // Currently only do something sane when at most two source vectors - // involved. - if (SourceVecs.size() > 2) - return SDValue(); - - SDValue ShuffleSrcs[2] = { DAG.getUNDEF(VT), DAG.getUNDEF(VT) }; - int VEXTOffsets[2] = { 0, 0 }; - - // This loop extracts the usage patterns of the source vectors - // and prepares appropriate SDValues for a shuffle if possible. - for (unsigned i = 0; i < SourceVecs.size(); ++i) { - if (SourceVecs[i].getValueType() == VT) { - // No VEXT necessary - ShuffleSrcs[i] = SourceVecs[i]; - VEXTOffsets[i] = 0; - continue; - } else if (SourceVecs[i].getValueType().getVectorNumElements() < NumElts) { - // We can pad out the smaller vector for free, so if it's part of a - // shuffle... - ShuffleSrcs[i] = DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, SourceVecs[i], - DAG.getUNDEF(SourceVecs[i].getValueType())); - continue; - } - - // Don't attempt to extract subvectors from BUILD_VECTOR sources - // that expand or trunc the original value. - // TODO: We can try to bitcast and ANY_EXTEND the result but - // we need to consider the cost of vector ANY_EXTEND, and the - // legality of all the types. - if (SourceVecs[i].getValueType().getVectorElementType() != - VT.getVectorElementType()) - return SDValue(); - - // Since only 64-bit and 128-bit vectors are legal on ARM and - // we've eliminated the other cases... 
- assert(SourceVecs[i].getValueType().getVectorNumElements() == 2 * NumElts && - "unexpected vector sizes in ReconstructShuffle"); - - if (MaxElts[i] - MinElts[i] >= NumElts) { - // Span too large for a VEXT to cope - return SDValue(); - } - - if (MinElts[i] >= NumElts) { - // The extraction can just take the second half - VEXTOffsets[i] = NumElts; - ShuffleSrcs[i] = - DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, SourceVecs[i], - DAG.getIntPtrConstant(NumElts)); - } else if (MaxElts[i] < NumElts) { - // The extraction can just take the first half - VEXTOffsets[i] = 0; - ShuffleSrcs[i] = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, - SourceVecs[i], DAG.getIntPtrConstant(0)); - } else { - // An actual VEXT is needed - VEXTOffsets[i] = MinElts[i]; - SDValue VEXTSrc1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, - SourceVecs[i], DAG.getIntPtrConstant(0)); - SDValue VEXTSrc2 = - DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, SourceVecs[i], - DAG.getIntPtrConstant(NumElts)); - unsigned Imm = VEXTOffsets[i] * getExtFactor(VEXTSrc1); - ShuffleSrcs[i] = DAG.getNode(ARM64ISD::EXT, dl, VT, VEXTSrc1, VEXTSrc2, - DAG.getConstant(Imm, MVT::i32)); - } - } - - SmallVector Mask; - - for (unsigned i = 0; i < NumElts; ++i) { - SDValue Entry = Op.getOperand(i); - if (Entry.getOpcode() == ISD::UNDEF) { - Mask.push_back(-1); - continue; - } - - SDValue ExtractVec = Entry.getOperand(0); - int ExtractElt = - cast(Op.getOperand(i).getOperand(1))->getSExtValue(); - if (ExtractVec == SourceVecs[0]) { - Mask.push_back(ExtractElt - VEXTOffsets[0]); - } else { - Mask.push_back(ExtractElt + NumElts - VEXTOffsets[1]); - } - } - - // Final check before we try to produce nonsense... - if (isShuffleMaskLegal(Mask, VT)) - return DAG.getVectorShuffle(VT, dl, ShuffleSrcs[0], ShuffleSrcs[1], - &Mask[0]); - - return SDValue(); -} - -// check if an EXT instruction can handle the shuffle mask when the -// vector sources of the shuffle are the same. -static bool isSingletonEXTMask(ArrayRef M, EVT VT, unsigned &Imm) { - unsigned NumElts = VT.getVectorNumElements(); - - // Assume that the first shuffle index is not UNDEF. Fail if it is. - if (M[0] < 0) - return false; - - Imm = M[0]; - - // If this is a VEXT shuffle, the immediate value is the index of the first - // element. The other shuffle indices must be the successive elements after - // the first one. - unsigned ExpectedElt = Imm; - for (unsigned i = 1; i < NumElts; ++i) { - // Increment the expected index. If it wraps around, just follow it - // back to index zero and keep going. - ++ExpectedElt; - if (ExpectedElt == NumElts) - ExpectedElt = 0; - - if (M[i] < 0) - continue; // ignore UNDEF indices - if (ExpectedElt != static_cast(M[i])) - return false; - } - - return true; -} - -// check if an EXT instruction can handle the shuffle mask when the -// vector sources of the shuffle are different. -static bool isEXTMask(ArrayRef M, EVT VT, bool &ReverseEXT, - unsigned &Imm) { - // Look for the first non-undef element. - const int *FirstRealElt = std::find_if(M.begin(), M.end(), - [](int Elt) {return Elt >= 0;}); - - // Benefit form APInt to handle overflow when calculating expected element. - unsigned NumElts = VT.getVectorNumElements(); - unsigned MaskBits = APInt(32, NumElts * 2).logBase2(); - APInt ExpectedElt = APInt(MaskBits, *FirstRealElt + 1); - // The following shuffle indices must be the successive elements after the - // first real element. 
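isSingletonEXTMask above accepts any mask that is a rotation of <0, 1, ..., N-1>, with the rotation amount becoming the EXT element index (it is scaled to a byte immediate later). A standalone sketch of the same check on a plain index vector (hypothetical helper):

#include <cassert>
#include <vector>

// True if M is <Imm, Imm+1, ...> wrapping modulo M.size(), ignoring -1
// (undef) entries; reports the rotation amount in Imm.
static bool isRotatedIdentityMask(const std::vector<int> &M, unsigned &Imm) {
  if (M.empty() || M[0] < 0)
    return false; // be conservative if the first index is undef
  Imm = static_cast<unsigned>(M[0]);
  unsigned Expected = Imm;
  for (size_t i = 1; i < M.size(); ++i) {
    Expected = (Expected + 1) % M.size();
    if (M[i] >= 0 && static_cast<unsigned>(M[i]) != Expected)
      return false;
  }
  return true;
}

int main() {
  unsigned Imm;
  assert(isRotatedIdentityMask({2, 3, 0, 1}, Imm) && Imm == 2); // rotate by 2
  assert(isRotatedIdentityMask({3, -1, 1, 2}, Imm) && Imm == 3); // undef tolerated
  assert(!isRotatedIdentityMask({0, 2, 1, 3}, Imm));
}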
- const int *FirstWrongElt = std::find_if(FirstRealElt + 1, M.end(), - [&](int Elt) {return Elt != ExpectedElt++ && Elt != -1;}); - if (FirstWrongElt != M.end()) - return false; - - // The index of an EXT is the first element if it is not UNDEF. - // Watch out for the beginning UNDEFs. The EXT index should be the expected - // value of the first element. E.g. - // <-1, -1, 3, ...> is treated as <1, 2, 3, ...>. - // <-1, -1, 0, 1, ...> is treated as <2*NumElts-2, 2*NumElts-1, 0, 1, ...>. - // ExpectedElt is the last mask index plus 1. - Imm = ExpectedElt.getZExtValue(); - - // There are two difference cases requiring to reverse input vectors. - // For example, for vector <4 x i32> we have the following cases, - // Case 1: shufflevector(<4 x i32>,<4 x i32>,<-1, -1, -1, 0>) - // Case 2: shufflevector(<4 x i32>,<4 x i32>,<-1, -1, 7, 0>) - // For both cases, we finally use mask <5, 6, 7, 0>, which requires - // to reverse two input vectors. - if (Imm < NumElts) - ReverseEXT = true; - else - Imm -= NumElts; - - return true; -} - -/// isREVMask - Check if a vector shuffle corresponds to a REV -/// instruction with the specified blocksize. (The order of the elements -/// within each block of the vector is reversed.) -static bool isREVMask(ArrayRef M, EVT VT, unsigned BlockSize) { - assert((BlockSize == 16 || BlockSize == 32 || BlockSize == 64) && - "Only possible block sizes for REV are: 16, 32, 64"); - - unsigned EltSz = VT.getVectorElementType().getSizeInBits(); - if (EltSz == 64) - return false; - - unsigned NumElts = VT.getVectorNumElements(); - unsigned BlockElts = M[0] + 1; - // If the first shuffle index is UNDEF, be optimistic. - if (M[0] < 0) - BlockElts = BlockSize / EltSz; - - if (BlockSize <= EltSz || BlockSize != BlockElts * EltSz) - return false; - - for (unsigned i = 0; i < NumElts; ++i) { - if (M[i] < 0) - continue; // ignore UNDEF indices - if ((unsigned)M[i] != (i - i % BlockElts) + (BlockElts - 1 - i % BlockElts)) - return false; - } - - return true; -} - -static bool isZIPMask(ArrayRef M, EVT VT, unsigned &WhichResult) { - unsigned NumElts = VT.getVectorNumElements(); - WhichResult = (M[0] == 0 ? 0 : 1); - unsigned Idx = WhichResult * NumElts / 2; - for (unsigned i = 0; i != NumElts; i += 2) { - if ((M[i] >= 0 && (unsigned)M[i] != Idx) || - (M[i + 1] >= 0 && (unsigned)M[i + 1] != Idx + NumElts)) - return false; - Idx += 1; - } - - return true; -} - -static bool isUZPMask(ArrayRef M, EVT VT, unsigned &WhichResult) { - unsigned NumElts = VT.getVectorNumElements(); - WhichResult = (M[0] == 0 ? 0 : 1); - for (unsigned i = 0; i != NumElts; ++i) { - if (M[i] < 0) - continue; // ignore UNDEF indices - if ((unsigned)M[i] != 2 * i + WhichResult) - return false; - } - - return true; -} - -static bool isTRNMask(ArrayRef M, EVT VT, unsigned &WhichResult) { - unsigned NumElts = VT.getVectorNumElements(); - WhichResult = (M[0] == 0 ? 0 : 1); - for (unsigned i = 0; i < NumElts; i += 2) { - if ((M[i] >= 0 && (unsigned)M[i] != i + WhichResult) || - (M[i + 1] >= 0 && (unsigned)M[i + 1] != i + NumElts + WhichResult)) - return false; - } - return true; -} - -/// isZIP_v_undef_Mask - Special case of isZIPMask for canonical form of -/// "vector_shuffle v, v", i.e., "vector_shuffle v, undef". -/// Mask is e.g., <0, 0, 1, 1> instead of <0, 4, 1, 5>. -static bool isZIP_v_undef_Mask(ArrayRef M, EVT VT, unsigned &WhichResult) { - unsigned NumElts = VT.getVectorNumElements(); - WhichResult = (M[0] == 0 ? 
0 : 1); - unsigned Idx = WhichResult * NumElts / 2; - for (unsigned i = 0; i != NumElts; i += 2) { - if ((M[i] >= 0 && (unsigned)M[i] != Idx) || - (M[i + 1] >= 0 && (unsigned)M[i + 1] != Idx)) - return false; - Idx += 1; - } - - return true; -} - -/// isUZP_v_undef_Mask - Special case of isUZPMask for canonical form of -/// "vector_shuffle v, v", i.e., "vector_shuffle v, undef". -/// Mask is e.g., <0, 2, 0, 2> instead of <0, 2, 4, 6>, -static bool isUZP_v_undef_Mask(ArrayRef M, EVT VT, unsigned &WhichResult) { - unsigned Half = VT.getVectorNumElements() / 2; - WhichResult = (M[0] == 0 ? 0 : 1); - for (unsigned j = 0; j != 2; ++j) { - unsigned Idx = WhichResult; - for (unsigned i = 0; i != Half; ++i) { - int MIdx = M[i + j * Half]; - if (MIdx >= 0 && (unsigned)MIdx != Idx) - return false; - Idx += 2; - } - } - - return true; -} - -/// isTRN_v_undef_Mask - Special case of isTRNMask for canonical form of -/// "vector_shuffle v, v", i.e., "vector_shuffle v, undef". -/// Mask is e.g., <0, 0, 2, 2> instead of <0, 4, 2, 6>. -static bool isTRN_v_undef_Mask(ArrayRef M, EVT VT, unsigned &WhichResult) { - unsigned NumElts = VT.getVectorNumElements(); - WhichResult = (M[0] == 0 ? 0 : 1); - for (unsigned i = 0; i < NumElts; i += 2) { - if ((M[i] >= 0 && (unsigned)M[i] != i + WhichResult) || - (M[i + 1] >= 0 && (unsigned)M[i + 1] != i + WhichResult)) - return false; - } - return true; -} - -static bool isINSMask(ArrayRef M, int NumInputElements, - bool &DstIsLeft, int &Anomaly) { - if (M.size() != static_cast(NumInputElements)) - return false; - - int NumLHSMatch = 0, NumRHSMatch = 0; - int LastLHSMismatch = -1, LastRHSMismatch = -1; - - for (int i = 0; i < NumInputElements; ++i) { - if (M[i] == -1) { - ++NumLHSMatch; - ++NumRHSMatch; - continue; - } - - if (M[i] == i) - ++NumLHSMatch; - else - LastLHSMismatch = i; - - if (M[i] == i + NumInputElements) - ++NumRHSMatch; - else - LastRHSMismatch = i; - } - - if (NumLHSMatch == NumInputElements - 1) { - DstIsLeft = true; - Anomaly = LastLHSMismatch; - return true; - } else if (NumRHSMatch == NumInputElements - 1) { - DstIsLeft = false; - Anomaly = LastRHSMismatch; - return true; - } - - return false; -} - -static bool isConcatMask(ArrayRef Mask, EVT VT, bool SplitLHS) { - if (VT.getSizeInBits() != 128) - return false; - - unsigned NumElts = VT.getVectorNumElements(); - - for (int I = 0, E = NumElts / 2; I != E; I++) { - if (Mask[I] != I) - return false; - } - - int Offset = NumElts / 2; - for (int I = NumElts / 2, E = NumElts; I != E; I++) { - if (Mask[I] != I + SplitLHS * Offset) - return false; - } - - return true; -} - -static SDValue tryFormConcatFromShuffle(SDValue Op, SelectionDAG &DAG) { - SDLoc DL(Op); - EVT VT = Op.getValueType(); - SDValue V0 = Op.getOperand(0); - SDValue V1 = Op.getOperand(1); - ArrayRef Mask = cast(Op)->getMask(); - - if (VT.getVectorElementType() != V0.getValueType().getVectorElementType() || - VT.getVectorElementType() != V1.getValueType().getVectorElementType()) - return SDValue(); - - bool SplitV0 = V0.getValueType().getSizeInBits() == 128; - - if (!isConcatMask(Mask, VT, SplitV0)) - return SDValue(); - - EVT CastVT = EVT::getVectorVT(*DAG.getContext(), VT.getVectorElementType(), - VT.getVectorNumElements() / 2); - if (SplitV0) { - V0 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, CastVT, V0, - DAG.getConstant(0, MVT::i64)); - } - if (V1.getValueType().getSizeInBits() == 128) { - V1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, CastVT, V1, - DAG.getConstant(0, MVT::i64)); - } - return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, V0, 
V1); -} - -/// GeneratePerfectShuffle - Given an entry in the perfect-shuffle table, emit -/// the specified operations to build the shuffle. -static SDValue GeneratePerfectShuffle(unsigned PFEntry, SDValue LHS, - SDValue RHS, SelectionDAG &DAG, - SDLoc dl) { - unsigned OpNum = (PFEntry >> 26) & 0x0F; - unsigned LHSID = (PFEntry >> 13) & ((1 << 13) - 1); - unsigned RHSID = (PFEntry >> 0) & ((1 << 13) - 1); - - enum { - OP_COPY = 0, // Copy, used for things like to say it is <0,1,2,3> - OP_VREV, - OP_VDUP0, - OP_VDUP1, - OP_VDUP2, - OP_VDUP3, - OP_VEXT1, - OP_VEXT2, - OP_VEXT3, - OP_VUZPL, // VUZP, left result - OP_VUZPR, // VUZP, right result - OP_VZIPL, // VZIP, left result - OP_VZIPR, // VZIP, right result - OP_VTRNL, // VTRN, left result - OP_VTRNR // VTRN, right result - }; - - if (OpNum == OP_COPY) { - if (LHSID == (1 * 9 + 2) * 9 + 3) - return LHS; - assert(LHSID == ((4 * 9 + 5) * 9 + 6) * 9 + 7 && "Illegal OP_COPY!"); - return RHS; - } - - SDValue OpLHS, OpRHS; - OpLHS = GeneratePerfectShuffle(PerfectShuffleTable[LHSID], LHS, RHS, DAG, dl); - OpRHS = GeneratePerfectShuffle(PerfectShuffleTable[RHSID], LHS, RHS, DAG, dl); - EVT VT = OpLHS.getValueType(); - - switch (OpNum) { - default: - llvm_unreachable("Unknown shuffle opcode!"); - case OP_VREV: - // VREV divides the vector in half and swaps within the half. - if (VT.getVectorElementType() == MVT::i32 || - VT.getVectorElementType() == MVT::f32) - return DAG.getNode(ARM64ISD::REV64, dl, VT, OpLHS); - // vrev <4 x i16> -> REV32 - if (VT.getVectorElementType() == MVT::i16) - return DAG.getNode(ARM64ISD::REV32, dl, VT, OpLHS); - // vrev <4 x i8> -> REV16 - assert(VT.getVectorElementType() == MVT::i8); - return DAG.getNode(ARM64ISD::REV16, dl, VT, OpLHS); - case OP_VDUP0: - case OP_VDUP1: - case OP_VDUP2: - case OP_VDUP3: { - EVT EltTy = VT.getVectorElementType(); - unsigned Opcode; - if (EltTy == MVT::i8) - Opcode = ARM64ISD::DUPLANE8; - else if (EltTy == MVT::i16) - Opcode = ARM64ISD::DUPLANE16; - else if (EltTy == MVT::i32 || EltTy == MVT::f32) - Opcode = ARM64ISD::DUPLANE32; - else if (EltTy == MVT::i64 || EltTy == MVT::f64) - Opcode = ARM64ISD::DUPLANE64; - else - llvm_unreachable("Invalid vector element type?"); - - if (VT.getSizeInBits() == 64) - OpLHS = WidenVector(OpLHS, DAG); - SDValue Lane = DAG.getConstant(OpNum - OP_VDUP0, MVT::i64); - return DAG.getNode(Opcode, dl, VT, OpLHS, Lane); - } - case OP_VEXT1: - case OP_VEXT2: - case OP_VEXT3: { - unsigned Imm = (OpNum - OP_VEXT1 + 1) * getExtFactor(OpLHS); - return DAG.getNode(ARM64ISD::EXT, dl, VT, OpLHS, OpRHS, - DAG.getConstant(Imm, MVT::i32)); - } - case OP_VUZPL: - return DAG.getNode(ARM64ISD::UZP1, dl, DAG.getVTList(VT, VT), OpLHS, OpRHS); - case OP_VUZPR: - return DAG.getNode(ARM64ISD::UZP2, dl, DAG.getVTList(VT, VT), OpLHS, OpRHS); - case OP_VZIPL: - return DAG.getNode(ARM64ISD::ZIP1, dl, DAG.getVTList(VT, VT), OpLHS, OpRHS); - case OP_VZIPR: - return DAG.getNode(ARM64ISD::ZIP2, dl, DAG.getVTList(VT, VT), OpLHS, OpRHS); - case OP_VTRNL: - return DAG.getNode(ARM64ISD::TRN1, dl, DAG.getVTList(VT, VT), OpLHS, OpRHS); - case OP_VTRNR: - return DAG.getNode(ARM64ISD::TRN2, dl, DAG.getVTList(VT, VT), OpLHS, OpRHS); - } -} - -static SDValue GenerateTBL(SDValue Op, ArrayRef ShuffleMask, - SelectionDAG &DAG) { - // Check to see if we can use the TBL instruction. 
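GeneratePerfectShuffle above unpacks each 32-bit PerfectShuffleTable entry into a cost, an operation number and two 13-bit operand IDs. A small sketch of that decoding, with the field layout read off the shifts in the code above (the table itself lives in a generated header and is not reproduced here):

#include <cstdint>
#include <cstdio>

// Field layout used above:
//   bits 31-30: cost, bits 29-26: opcode, bits 25-13: LHS id, bits 12-0: RHS id.
struct PerfectShuffleEntry {
  unsigned Cost, OpNum, LHSID, RHSID;
};

static PerfectShuffleEntry decodePFEntry(uint32_t PFEntry) {
  PerfectShuffleEntry E;
  E.Cost = (PFEntry >> 30) & 0x3;
  E.OpNum = (PFEntry >> 26) & 0x0F;
  E.LHSID = (PFEntry >> 13) & ((1u << 13) - 1);
  E.RHSID = PFEntry & ((1u << 13) - 1);
  return E;
}

int main() {
  // Illustrative value only, not a real table entry.
  PerfectShuffleEntry E = decodePFEntry(0x8ACF1234u);
  std::printf("cost=%u op=%u lhs=%u rhs=%u\n", E.Cost, E.OpNum, E.LHSID, E.RHSID);
}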
- SDValue V1 = Op.getOperand(0); - SDValue V2 = Op.getOperand(1); - SDLoc DL(Op); - - EVT EltVT = Op.getValueType().getVectorElementType(); - unsigned BytesPerElt = EltVT.getSizeInBits() / 8; - - SmallVector TBLMask; - for (int Val : ShuffleMask) { - for (unsigned Byte = 0; Byte < BytesPerElt; ++Byte) { - unsigned Offset = Byte + Val * BytesPerElt; - TBLMask.push_back(DAG.getConstant(Offset, MVT::i32)); - } - } - - MVT IndexVT = MVT::v8i8; - unsigned IndexLen = 8; - if (Op.getValueType().getSizeInBits() == 128) { - IndexVT = MVT::v16i8; - IndexLen = 16; - } - - SDValue V1Cst = DAG.getNode(ISD::BITCAST, DL, IndexVT, V1); - SDValue V2Cst = DAG.getNode(ISD::BITCAST, DL, IndexVT, V2); - - SDValue Shuffle; - if (V2.getNode()->getOpcode() == ISD::UNDEF) { - if (IndexLen == 8) - V1Cst = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v16i8, V1Cst, V1Cst); - Shuffle = DAG.getNode( - ISD::INTRINSIC_WO_CHAIN, DL, IndexVT, - DAG.getConstant(Intrinsic::arm64_neon_tbl1, MVT::i32), V1Cst, - DAG.getNode(ISD::BUILD_VECTOR, DL, IndexVT, - makeArrayRef(TBLMask.data(), IndexLen))); - } else { - if (IndexLen == 8) { - V1Cst = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v16i8, V1Cst, V2Cst); - Shuffle = DAG.getNode( - ISD::INTRINSIC_WO_CHAIN, DL, IndexVT, - DAG.getConstant(Intrinsic::arm64_neon_tbl1, MVT::i32), V1Cst, - DAG.getNode(ISD::BUILD_VECTOR, DL, IndexVT, - makeArrayRef(TBLMask.data(), IndexLen))); - } else { - // FIXME: We cannot, for the moment, emit a TBL2 instruction because we - // cannot currently represent the register constraints on the input - // table registers. - // Shuffle = DAG.getNode(ARM64ISD::TBL2, DL, IndexVT, V1Cst, V2Cst, - // DAG.getNode(ISD::BUILD_VECTOR, DL, IndexVT, - // &TBLMask[0], IndexLen)); - Shuffle = DAG.getNode( - ISD::INTRINSIC_WO_CHAIN, DL, IndexVT, - DAG.getConstant(Intrinsic::arm64_neon_tbl2, MVT::i32), V1Cst, V2Cst, - DAG.getNode(ISD::BUILD_VECTOR, DL, IndexVT, - makeArrayRef(TBLMask.data(), IndexLen))); - } - } - return DAG.getNode(ISD::BITCAST, DL, Op.getValueType(), Shuffle); -} - -static unsigned getDUPLANEOp(EVT EltType) { - if (EltType == MVT::i8) - return ARM64ISD::DUPLANE8; - if (EltType == MVT::i16) - return ARM64ISD::DUPLANE16; - if (EltType == MVT::i32 || EltType == MVT::f32) - return ARM64ISD::DUPLANE32; - if (EltType == MVT::i64 || EltType == MVT::f64) - return ARM64ISD::DUPLANE64; - - llvm_unreachable("Invalid vector element type?"); -} - -SDValue ARM64TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, - SelectionDAG &DAG) const { - SDLoc dl(Op); - EVT VT = Op.getValueType(); - - ShuffleVectorSDNode *SVN = cast(Op.getNode()); - - // Convert shuffles that are directly supported on NEON to target-specific - // DAG nodes, instead of keeping them as shuffles and matching them again - // during code selection. This is more efficient and avoids the possibility - // of inconsistencies between legalization and selection. - ArrayRef ShuffleMask = SVN->getMask(); - - SDValue V1 = Op.getOperand(0); - SDValue V2 = Op.getOperand(1); - - if (ShuffleVectorSDNode::isSplatMask(&ShuffleMask[0], - V1.getValueType().getSimpleVT())) { - int Lane = SVN->getSplatIndex(); - // If this is undef splat, generate it via "just" vdup, if possible. - if (Lane == -1) - Lane = 0; - - if (Lane == 0 && V1.getOpcode() == ISD::SCALAR_TO_VECTOR) - return DAG.getNode(ARM64ISD::DUP, dl, V1.getValueType(), - V1.getOperand(0)); - // Test if V1 is a BUILD_VECTOR and the lane being referenced is a non- - // constant. If so, we can just reference the lane's definition directly. 
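GenerateTBL above rewrites an element-level shuffle mask into the byte-level indices TBL consumes: each element index expands to BytesPerElt consecutive byte indices. A standalone sketch of that expansion (undef entries are not modelled; in the real lowering they simply become out-of-range byte indices, which TBL zeroes):

#include <cassert>
#include <vector>

// Expand an element shuffle mask into TBL byte indices: element index Val
// becomes BytesPerElt consecutive byte indices starting at Val * BytesPerElt.
static std::vector<int> expandToTBLMask(const std::vector<int> &Mask,
                                        int BytesPerElt) {
  std::vector<int> TBLMask;
  for (int Val : Mask)
    for (int Byte = 0; Byte < BytesPerElt; ++Byte)
      TBLMask.push_back(Byte + Val * BytesPerElt);
  return TBLMask;
}

int main() {
  // A <4 x i16> mask <1, 0, 3, 2> becomes byte indices 2,3, 0,1, 6,7, 4,5.
  std::vector<int> T = expandToTBLMask({1, 0, 3, 2}, 2);
  std::vector<int> Expected = {2, 3, 0, 1, 6, 7, 4, 5};
  assert(T == Expected);
}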
- if (V1.getOpcode() == ISD::BUILD_VECTOR && - !isa(V1.getOperand(Lane))) - return DAG.getNode(ARM64ISD::DUP, dl, VT, V1.getOperand(Lane)); - - // Otherwise, duplicate from the lane of the input vector. - unsigned Opcode = getDUPLANEOp(V1.getValueType().getVectorElementType()); - - // SelectionDAGBuilder may have "helpfully" already extracted or conatenated - // to make a vector of the same size as this SHUFFLE. We can ignore the - // extract entirely, and canonicalise the concat using WidenVector. - if (V1.getOpcode() == ISD::EXTRACT_SUBVECTOR) { - Lane += cast(V1.getOperand(1))->getZExtValue(); - V1 = V1.getOperand(0); - } else if (V1.getOpcode() == ISD::CONCAT_VECTORS) { - unsigned Idx = Lane >= (int)VT.getVectorNumElements() / 2; - Lane -= Idx * VT.getVectorNumElements() / 2; - V1 = WidenVector(V1.getOperand(Idx), DAG); - } else if (VT.getSizeInBits() == 64) - V1 = WidenVector(V1, DAG); - - return DAG.getNode(Opcode, dl, VT, V1, DAG.getConstant(Lane, MVT::i64)); - } - - if (isREVMask(ShuffleMask, VT, 64)) - return DAG.getNode(ARM64ISD::REV64, dl, V1.getValueType(), V1, V2); - if (isREVMask(ShuffleMask, VT, 32)) - return DAG.getNode(ARM64ISD::REV32, dl, V1.getValueType(), V1, V2); - if (isREVMask(ShuffleMask, VT, 16)) - return DAG.getNode(ARM64ISD::REV16, dl, V1.getValueType(), V1, V2); - - bool ReverseEXT = false; - unsigned Imm; - if (isEXTMask(ShuffleMask, VT, ReverseEXT, Imm)) { - if (ReverseEXT) - std::swap(V1, V2); - Imm *= getExtFactor(V1); - return DAG.getNode(ARM64ISD::EXT, dl, V1.getValueType(), V1, V2, - DAG.getConstant(Imm, MVT::i32)); - } else if (V2->getOpcode() == ISD::UNDEF && - isSingletonEXTMask(ShuffleMask, VT, Imm)) { - Imm *= getExtFactor(V1); - return DAG.getNode(ARM64ISD::EXT, dl, V1.getValueType(), V1, V1, - DAG.getConstant(Imm, MVT::i32)); - } - - unsigned WhichResult; - if (isZIPMask(ShuffleMask, VT, WhichResult)) { - unsigned Opc = (WhichResult == 0) ? ARM64ISD::ZIP1 : ARM64ISD::ZIP2; - return DAG.getNode(Opc, dl, V1.getValueType(), V1, V2); - } - if (isUZPMask(ShuffleMask, VT, WhichResult)) { - unsigned Opc = (WhichResult == 0) ? ARM64ISD::UZP1 : ARM64ISD::UZP2; - return DAG.getNode(Opc, dl, V1.getValueType(), V1, V2); - } - if (isTRNMask(ShuffleMask, VT, WhichResult)) { - unsigned Opc = (WhichResult == 0) ? ARM64ISD::TRN1 : ARM64ISD::TRN2; - return DAG.getNode(Opc, dl, V1.getValueType(), V1, V2); - } - - if (isZIP_v_undef_Mask(ShuffleMask, VT, WhichResult)) { - unsigned Opc = (WhichResult == 0) ? ARM64ISD::ZIP1 : ARM64ISD::ZIP2; - return DAG.getNode(Opc, dl, V1.getValueType(), V1, V1); - } - if (isUZP_v_undef_Mask(ShuffleMask, VT, WhichResult)) { - unsigned Opc = (WhichResult == 0) ? ARM64ISD::UZP1 : ARM64ISD::UZP2; - return DAG.getNode(Opc, dl, V1.getValueType(), V1, V1); - } - if (isTRN_v_undef_Mask(ShuffleMask, VT, WhichResult)) { - unsigned Opc = (WhichResult == 0) ? ARM64ISD::TRN1 : ARM64ISD::TRN2; - return DAG.getNode(Opc, dl, V1.getValueType(), V1, V1); - } - - SDValue Concat = tryFormConcatFromShuffle(Op, DAG); - if (Concat.getNode()) - return Concat; - - bool DstIsLeft; - int Anomaly; - int NumInputElements = V1.getValueType().getVectorNumElements(); - if (isINSMask(ShuffleMask, NumInputElements, DstIsLeft, Anomaly)) { - SDValue DstVec = DstIsLeft ? 
V1 : V2; - SDValue DstLaneV = DAG.getConstant(Anomaly, MVT::i64); - - SDValue SrcVec = V1; - int SrcLane = ShuffleMask[Anomaly]; - if (SrcLane >= NumInputElements) { - SrcVec = V2; - SrcLane -= VT.getVectorNumElements(); - } - SDValue SrcLaneV = DAG.getConstant(SrcLane, MVT::i64); - - EVT ScalarVT = VT.getVectorElementType(); - if (ScalarVT.getSizeInBits() < 32) - ScalarVT = MVT::i32; - - return DAG.getNode( - ISD::INSERT_VECTOR_ELT, dl, VT, DstVec, - DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, ScalarVT, SrcVec, SrcLaneV), - DstLaneV); - } - - // If the shuffle is not directly supported and it has 4 elements, use - // the PerfectShuffle-generated table to synthesize it from other shuffles. - unsigned NumElts = VT.getVectorNumElements(); - if (NumElts == 4) { - unsigned PFIndexes[4]; - for (unsigned i = 0; i != 4; ++i) { - if (ShuffleMask[i] < 0) - PFIndexes[i] = 8; - else - PFIndexes[i] = ShuffleMask[i]; - } - - // Compute the index in the perfect shuffle table. - unsigned PFTableIndex = PFIndexes[0] * 9 * 9 * 9 + PFIndexes[1] * 9 * 9 + - PFIndexes[2] * 9 + PFIndexes[3]; - unsigned PFEntry = PerfectShuffleTable[PFTableIndex]; - unsigned Cost = (PFEntry >> 30); - - if (Cost <= 4) - return GeneratePerfectShuffle(PFEntry, V1, V2, DAG, dl); - } - - return GenerateTBL(Op, ShuffleMask, DAG); -} - -static bool resolveBuildVector(BuildVectorSDNode *BVN, APInt &CnstBits, - APInt &UndefBits) { - EVT VT = BVN->getValueType(0); - APInt SplatBits, SplatUndef; - unsigned SplatBitSize; - bool HasAnyUndefs; - if (BVN->isConstantSplat(SplatBits, SplatUndef, SplatBitSize, HasAnyUndefs)) { - unsigned NumSplats = VT.getSizeInBits() / SplatBitSize; - - for (unsigned i = 0; i < NumSplats; ++i) { - CnstBits <<= SplatBitSize; - UndefBits <<= SplatBitSize; - CnstBits |= SplatBits.zextOrTrunc(VT.getSizeInBits()); - UndefBits |= (SplatBits ^ SplatUndef).zextOrTrunc(VT.getSizeInBits()); - } - - return true; - } - - return false; -} - -SDValue ARM64TargetLowering::LowerVectorAND(SDValue Op, - SelectionDAG &DAG) const { - BuildVectorSDNode *BVN = - dyn_cast(Op.getOperand(1).getNode()); - SDValue LHS = Op.getOperand(0); - SDLoc dl(Op); - EVT VT = Op.getValueType(); - - if (!BVN) - return Op; - - APInt CnstBits(VT.getSizeInBits(), 0); - APInt UndefBits(VT.getSizeInBits(), 0); - if (resolveBuildVector(BVN, CnstBits, UndefBits)) { - // We only have BIC vector immediate instruction, which is and-not. - CnstBits = ~CnstBits; - - // We make use of a little bit of goto ickiness in order to avoid having to - // duplicate the immediate matching logic for the undef toggled case. - bool SecondTry = false; - AttemptModImm: - - if (CnstBits.getHiBits(64) == CnstBits.getLoBits(64)) { - CnstBits = CnstBits.zextOrTrunc(64); - uint64_t CnstVal = CnstBits.getZExtValue(); - - if (ARM64_AM::isAdvSIMDModImmType1(CnstVal)) { - CnstVal = ARM64_AM::encodeAdvSIMDModImmType1(CnstVal); - MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32; - SDValue Mov = DAG.getNode(ARM64ISD::BICi, dl, MovTy, LHS, - DAG.getConstant(CnstVal, MVT::i32), - DAG.getConstant(0, MVT::i32)); - return DAG.getNode(ISD::BITCAST, dl, VT, Mov); - } - - if (ARM64_AM::isAdvSIMDModImmType2(CnstVal)) { - CnstVal = ARM64_AM::encodeAdvSIMDModImmType2(CnstVal); - MVT MovTy = (VT.getSizeInBits() == 128) ? 
MVT::v4i32 : MVT::v2i32; - SDValue Mov = DAG.getNode(ARM64ISD::BICi, dl, MovTy, LHS, - DAG.getConstant(CnstVal, MVT::i32), - DAG.getConstant(8, MVT::i32)); - return DAG.getNode(ISD::BITCAST, dl, VT, Mov); - } - - if (ARM64_AM::isAdvSIMDModImmType3(CnstVal)) { - CnstVal = ARM64_AM::encodeAdvSIMDModImmType3(CnstVal); - MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32; - SDValue Mov = DAG.getNode(ARM64ISD::BICi, dl, MovTy, LHS, - DAG.getConstant(CnstVal, MVT::i32), - DAG.getConstant(16, MVT::i32)); - return DAG.getNode(ISD::BITCAST, dl, VT, Mov); - } - - if (ARM64_AM::isAdvSIMDModImmType4(CnstVal)) { - CnstVal = ARM64_AM::encodeAdvSIMDModImmType4(CnstVal); - MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32; - SDValue Mov = DAG.getNode(ARM64ISD::BICi, dl, MovTy, LHS, - DAG.getConstant(CnstVal, MVT::i32), - DAG.getConstant(24, MVT::i32)); - return DAG.getNode(ISD::BITCAST, dl, VT, Mov); - } - - if (ARM64_AM::isAdvSIMDModImmType5(CnstVal)) { - CnstVal = ARM64_AM::encodeAdvSIMDModImmType5(CnstVal); - MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v8i16 : MVT::v4i16; - SDValue Mov = DAG.getNode(ARM64ISD::BICi, dl, MovTy, LHS, - DAG.getConstant(CnstVal, MVT::i32), - DAG.getConstant(0, MVT::i32)); - return DAG.getNode(ISD::BITCAST, dl, VT, Mov); - } - - if (ARM64_AM::isAdvSIMDModImmType6(CnstVal)) { - CnstVal = ARM64_AM::encodeAdvSIMDModImmType6(CnstVal); - MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v8i16 : MVT::v4i16; - SDValue Mov = DAG.getNode(ARM64ISD::BICi, dl, MovTy, LHS, - DAG.getConstant(CnstVal, MVT::i32), - DAG.getConstant(8, MVT::i32)); - return DAG.getNode(ISD::BITCAST, dl, VT, Mov); - } - } - - if (SecondTry) - goto FailedModImm; - SecondTry = true; - CnstBits = ~UndefBits; - goto AttemptModImm; - } - -// We can always fall back to a non-immediate AND. -FailedModImm: - return Op; -} - -// Specialized code to quickly find if PotentialBVec is a BuildVector that -// consists of only the same constant int value, returned in reference arg -// ConstVal -static bool isAllConstantBuildVector(const SDValue &PotentialBVec, - uint64_t &ConstVal) { - BuildVectorSDNode *Bvec = dyn_cast(PotentialBVec); - if (!Bvec) - return false; - ConstantSDNode *FirstElt = dyn_cast(Bvec->getOperand(0)); - if (!FirstElt) - return false; - EVT VT = Bvec->getValueType(0); - unsigned NumElts = VT.getVectorNumElements(); - for (unsigned i = 1; i < NumElts; ++i) - if (dyn_cast(Bvec->getOperand(i)) != FirstElt) - return false; - ConstVal = FirstElt->getZExtValue(); - return true; -} - -static unsigned getIntrinsicID(const SDNode *N) { - unsigned Opcode = N->getOpcode(); - switch (Opcode) { - default: - return Intrinsic::not_intrinsic; - case ISD::INTRINSIC_WO_CHAIN: { - unsigned IID = cast(N->getOperand(0))->getZExtValue(); - if (IID < Intrinsic::num_intrinsics) - return IID; - return Intrinsic::not_intrinsic; - } - } -} - -// Attempt to form a vector S[LR]I from (or (and X, BvecC1), (lsl Y, C2)), -// to (SLI X, Y, C2), where X and Y have matching vector types, BvecC1 is a -// BUILD_VECTORs with constant element C1, C2 is a constant, and C1 == ~C2. -// Also, logical shift right -> sri, with the same structure. -static SDValue tryLowerToSLI(SDNode *N, SelectionDAG &DAG) { - EVT VT = N->getValueType(0); - - if (!VT.isVector()) - return SDValue(); - - SDLoc DL(N); - - // Is the first op an AND? - const SDValue And = N->getOperand(0); - if (And.getOpcode() != ISD::AND) - return SDValue(); - - // Is the second op an shl or lshr? 
- SDValue Shift = N->getOperand(1); - // This will have been turned into: ARM64ISD::VSHL vector, #shift - // or ARM64ISD::VLSHR vector, #shift - unsigned ShiftOpc = Shift.getOpcode(); - if ((ShiftOpc != ARM64ISD::VSHL && ShiftOpc != ARM64ISD::VLSHR)) - return SDValue(); - bool IsShiftRight = ShiftOpc == ARM64ISD::VLSHR; - - // Is the shift amount constant? - ConstantSDNode *C2node = dyn_cast(Shift.getOperand(1)); - if (!C2node) - return SDValue(); - - // Is the and mask vector all constant? - uint64_t C1; - if (!isAllConstantBuildVector(And.getOperand(1), C1)) - return SDValue(); - - // Is C1 == ~C2, taking into account how much one can shift elements of a - // particular size? - uint64_t C2 = C2node->getZExtValue(); - unsigned ElemSizeInBits = VT.getVectorElementType().getSizeInBits(); - if (C2 > ElemSizeInBits) - return SDValue(); - unsigned ElemMask = (1 << ElemSizeInBits) - 1; - if ((C1 & ElemMask) != (~C2 & ElemMask)) - return SDValue(); - - SDValue X = And.getOperand(0); - SDValue Y = Shift.getOperand(0); - - unsigned Intrin = - IsShiftRight ? Intrinsic::arm64_neon_vsri : Intrinsic::arm64_neon_vsli; - SDValue ResultSLI = - DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT, - DAG.getConstant(Intrin, MVT::i32), X, Y, Shift.getOperand(1)); - - DEBUG(dbgs() << "arm64-lower: transformed: \n"); - DEBUG(N->dump(&DAG)); - DEBUG(dbgs() << "into: \n"); - DEBUG(ResultSLI->dump(&DAG)); - - ++NumShiftInserts; - return ResultSLI; -} - -SDValue ARM64TargetLowering::LowerVectorOR(SDValue Op, - SelectionDAG &DAG) const { - // Attempt to form a vector S[LR]I from (or (and X, C1), (lsl Y, C2)) - if (EnableARM64SlrGeneration) { - SDValue Res = tryLowerToSLI(Op.getNode(), DAG); - if (Res.getNode()) - return Res; - } - - BuildVectorSDNode *BVN = - dyn_cast(Op.getOperand(0).getNode()); - SDValue LHS = Op.getOperand(1); - SDLoc dl(Op); - EVT VT = Op.getValueType(); - - // OR commutes, so try swapping the operands. - if (!BVN) { - LHS = Op.getOperand(0); - BVN = dyn_cast(Op.getOperand(1).getNode()); - } - if (!BVN) - return Op; - - APInt CnstBits(VT.getSizeInBits(), 0); - APInt UndefBits(VT.getSizeInBits(), 0); - if (resolveBuildVector(BVN, CnstBits, UndefBits)) { - // We make use of a little bit of goto ickiness in order to avoid having to - // duplicate the immediate matching logic for the undef toggled case. - bool SecondTry = false; - AttemptModImm: - - if (CnstBits.getHiBits(64) == CnstBits.getLoBits(64)) { - CnstBits = CnstBits.zextOrTrunc(64); - uint64_t CnstVal = CnstBits.getZExtValue(); - - if (ARM64_AM::isAdvSIMDModImmType1(CnstVal)) { - CnstVal = ARM64_AM::encodeAdvSIMDModImmType1(CnstVal); - MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32; - SDValue Mov = DAG.getNode(ARM64ISD::ORRi, dl, MovTy, LHS, - DAG.getConstant(CnstVal, MVT::i32), - DAG.getConstant(0, MVT::i32)); - return DAG.getNode(ISD::BITCAST, dl, VT, Mov); - } - - if (ARM64_AM::isAdvSIMDModImmType2(CnstVal)) { - CnstVal = ARM64_AM::encodeAdvSIMDModImmType2(CnstVal); - MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32; - SDValue Mov = DAG.getNode(ARM64ISD::ORRi, dl, MovTy, LHS, - DAG.getConstant(CnstVal, MVT::i32), - DAG.getConstant(8, MVT::i32)); - return DAG.getNode(ISD::BITCAST, dl, VT, Mov); - } - - if (ARM64_AM::isAdvSIMDModImmType3(CnstVal)) { - CnstVal = ARM64_AM::encodeAdvSIMDModImmType3(CnstVal); - MVT MovTy = (VT.getSizeInBits() == 128) ? 
MVT::v4i32 : MVT::v2i32; - SDValue Mov = DAG.getNode(ARM64ISD::ORRi, dl, MovTy, LHS, - DAG.getConstant(CnstVal, MVT::i32), - DAG.getConstant(16, MVT::i32)); - return DAG.getNode(ISD::BITCAST, dl, VT, Mov); - } - - if (ARM64_AM::isAdvSIMDModImmType4(CnstVal)) { - CnstVal = ARM64_AM::encodeAdvSIMDModImmType4(CnstVal); - MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32; - SDValue Mov = DAG.getNode(ARM64ISD::ORRi, dl, MovTy, LHS, - DAG.getConstant(CnstVal, MVT::i32), - DAG.getConstant(24, MVT::i32)); - return DAG.getNode(ISD::BITCAST, dl, VT, Mov); - } - - if (ARM64_AM::isAdvSIMDModImmType5(CnstVal)) { - CnstVal = ARM64_AM::encodeAdvSIMDModImmType5(CnstVal); - MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v8i16 : MVT::v4i16; - SDValue Mov = DAG.getNode(ARM64ISD::ORRi, dl, MovTy, LHS, - DAG.getConstant(CnstVal, MVT::i32), - DAG.getConstant(0, MVT::i32)); - return DAG.getNode(ISD::BITCAST, dl, VT, Mov); - } - - if (ARM64_AM::isAdvSIMDModImmType6(CnstVal)) { - CnstVal = ARM64_AM::encodeAdvSIMDModImmType6(CnstVal); - MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v8i16 : MVT::v4i16; - SDValue Mov = DAG.getNode(ARM64ISD::ORRi, dl, MovTy, LHS, - DAG.getConstant(CnstVal, MVT::i32), - DAG.getConstant(8, MVT::i32)); - return DAG.getNode(ISD::BITCAST, dl, VT, Mov); - } - } - - if (SecondTry) - goto FailedModImm; - SecondTry = true; - CnstBits = UndefBits; - goto AttemptModImm; - } - -// We can always fall back to a non-immediate OR. -FailedModImm: - return Op; -} - -SDValue ARM64TargetLowering::LowerBUILD_VECTOR(SDValue Op, - SelectionDAG &DAG) const { - BuildVectorSDNode *BVN = cast(Op.getNode()); - SDLoc dl(Op); - EVT VT = Op.getValueType(); - - APInt CnstBits(VT.getSizeInBits(), 0); - APInt UndefBits(VT.getSizeInBits(), 0); - if (resolveBuildVector(BVN, CnstBits, UndefBits)) { - // We make use of a little bit of goto ickiness in order to avoid having to - // duplicate the immediate matching logic for the undef toggled case. - bool SecondTry = false; - AttemptModImm: - - if (CnstBits.getHiBits(64) == CnstBits.getLoBits(64)) { - CnstBits = CnstBits.zextOrTrunc(64); - uint64_t CnstVal = CnstBits.getZExtValue(); - - // Certain magic vector constants (used to express things like NOT - // and NEG) are passed through unmodified. This allows codegen patterns - // for these operations to match. Special-purpose patterns will lower - // these immediates to MOVIs if it proves necessary. - if (VT.isInteger() && (CnstVal == 0 || CnstVal == ~0ULL)) - return Op; - - // The many faces of MOVI... - if (ARM64_AM::isAdvSIMDModImmType10(CnstVal)) { - CnstVal = ARM64_AM::encodeAdvSIMDModImmType10(CnstVal); - if (VT.getSizeInBits() == 128) { - SDValue Mov = DAG.getNode(ARM64ISD::MOVIedit, dl, MVT::v2i64, - DAG.getConstant(CnstVal, MVT::i32)); - return DAG.getNode(ISD::BITCAST, dl, VT, Mov); - } - - // Support the V64 version via subregister insertion. - SDValue Mov = DAG.getNode(ARM64ISD::MOVIedit, dl, MVT::f64, - DAG.getConstant(CnstVal, MVT::i32)); - return DAG.getNode(ISD::BITCAST, dl, VT, Mov); - } - - if (ARM64_AM::isAdvSIMDModImmType1(CnstVal)) { - CnstVal = ARM64_AM::encodeAdvSIMDModImmType1(CnstVal); - MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32; - SDValue Mov = DAG.getNode(ARM64ISD::MOVIshift, dl, MovTy, - DAG.getConstant(CnstVal, MVT::i32), - DAG.getConstant(0, MVT::i32)); - return DAG.getNode(ISD::BITCAST, dl, VT, Mov); - } - - if (ARM64_AM::isAdvSIMDModImmType2(CnstVal)) { - CnstVal = ARM64_AM::encodeAdvSIMDModImmType2(CnstVal); - MVT MovTy = (VT.getSizeInBits() == 128) ? 
MVT::v4i32 : MVT::v2i32; - SDValue Mov = DAG.getNode(ARM64ISD::MOVIshift, dl, MovTy, - DAG.getConstant(CnstVal, MVT::i32), - DAG.getConstant(8, MVT::i32)); - return DAG.getNode(ISD::BITCAST, dl, VT, Mov); - } - - if (ARM64_AM::isAdvSIMDModImmType3(CnstVal)) { - CnstVal = ARM64_AM::encodeAdvSIMDModImmType3(CnstVal); - MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32; - SDValue Mov = DAG.getNode(ARM64ISD::MOVIshift, dl, MovTy, - DAG.getConstant(CnstVal, MVT::i32), - DAG.getConstant(16, MVT::i32)); - return DAG.getNode(ISD::BITCAST, dl, VT, Mov); - } - - if (ARM64_AM::isAdvSIMDModImmType4(CnstVal)) { - CnstVal = ARM64_AM::encodeAdvSIMDModImmType4(CnstVal); - MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32; - SDValue Mov = DAG.getNode(ARM64ISD::MOVIshift, dl, MovTy, - DAG.getConstant(CnstVal, MVT::i32), - DAG.getConstant(24, MVT::i32)); - return DAG.getNode(ISD::BITCAST, dl, VT, Mov); - } - - if (ARM64_AM::isAdvSIMDModImmType5(CnstVal)) { - CnstVal = ARM64_AM::encodeAdvSIMDModImmType5(CnstVal); - MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v8i16 : MVT::v4i16; - SDValue Mov = DAG.getNode(ARM64ISD::MOVIshift, dl, MovTy, - DAG.getConstant(CnstVal, MVT::i32), - DAG.getConstant(0, MVT::i32)); - return DAG.getNode(ISD::BITCAST, dl, VT, Mov); - } - - if (ARM64_AM::isAdvSIMDModImmType6(CnstVal)) { - CnstVal = ARM64_AM::encodeAdvSIMDModImmType6(CnstVal); - MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v8i16 : MVT::v4i16; - SDValue Mov = DAG.getNode(ARM64ISD::MOVIshift, dl, MovTy, - DAG.getConstant(CnstVal, MVT::i32), - DAG.getConstant(8, MVT::i32)); - return DAG.getNode(ISD::BITCAST, dl, VT, Mov); - } - - if (ARM64_AM::isAdvSIMDModImmType7(CnstVal)) { - CnstVal = ARM64_AM::encodeAdvSIMDModImmType7(CnstVal); - MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32; - SDValue Mov = DAG.getNode(ARM64ISD::MOVImsl, dl, MovTy, - DAG.getConstant(CnstVal, MVT::i32), - DAG.getConstant(264, MVT::i32)); - return DAG.getNode(ISD::BITCAST, dl, VT, Mov); - } - - if (ARM64_AM::isAdvSIMDModImmType8(CnstVal)) { - CnstVal = ARM64_AM::encodeAdvSIMDModImmType8(CnstVal); - MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32; - SDValue Mov = DAG.getNode(ARM64ISD::MOVImsl, dl, MovTy, - DAG.getConstant(CnstVal, MVT::i32), - DAG.getConstant(272, MVT::i32)); - return DAG.getNode(ISD::BITCAST, dl, VT, Mov); - } - - if (ARM64_AM::isAdvSIMDModImmType9(CnstVal)) { - CnstVal = ARM64_AM::encodeAdvSIMDModImmType9(CnstVal); - MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v16i8 : MVT::v8i8; - SDValue Mov = DAG.getNode(ARM64ISD::MOVI, dl, MovTy, - DAG.getConstant(CnstVal, MVT::i32)); - return DAG.getNode(ISD::BITCAST, dl, VT, Mov); - } - - // The few faces of FMOV... - if (ARM64_AM::isAdvSIMDModImmType11(CnstVal)) { - CnstVal = ARM64_AM::encodeAdvSIMDModImmType11(CnstVal); - MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4f32 : MVT::v2f32; - SDValue Mov = DAG.getNode(ARM64ISD::FMOV, dl, MovTy, - DAG.getConstant(CnstVal, MVT::i32)); - return DAG.getNode(ISD::BITCAST, dl, VT, Mov); - } - - if (ARM64_AM::isAdvSIMDModImmType12(CnstVal) && - VT.getSizeInBits() == 128) { - CnstVal = ARM64_AM::encodeAdvSIMDModImmType12(CnstVal); - SDValue Mov = DAG.getNode(ARM64ISD::FMOV, dl, MVT::v2f64, - DAG.getConstant(CnstVal, MVT::i32)); - return DAG.getNode(ISD::BITCAST, dl, VT, Mov); - } - - // The many faces of MVNI... - CnstVal = ~CnstVal; - if (ARM64_AM::isAdvSIMDModImmType1(CnstVal)) { - CnstVal = ARM64_AM::encodeAdvSIMDModImmType1(CnstVal); - MVT MovTy = (VT.getSizeInBits() == 128) ? 
MVT::v4i32 : MVT::v2i32; - SDValue Mov = DAG.getNode(ARM64ISD::MVNIshift, dl, MovTy, - DAG.getConstant(CnstVal, MVT::i32), - DAG.getConstant(0, MVT::i32)); - return DAG.getNode(ISD::BITCAST, dl, VT, Mov); - } - - if (ARM64_AM::isAdvSIMDModImmType2(CnstVal)) { - CnstVal = ARM64_AM::encodeAdvSIMDModImmType2(CnstVal); - MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32; - SDValue Mov = DAG.getNode(ARM64ISD::MVNIshift, dl, MovTy, - DAG.getConstant(CnstVal, MVT::i32), - DAG.getConstant(8, MVT::i32)); - return DAG.getNode(ISD::BITCAST, dl, VT, Mov); - } - - if (ARM64_AM::isAdvSIMDModImmType3(CnstVal)) { - CnstVal = ARM64_AM::encodeAdvSIMDModImmType3(CnstVal); - MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32; - SDValue Mov = DAG.getNode(ARM64ISD::MVNIshift, dl, MovTy, - DAG.getConstant(CnstVal, MVT::i32), - DAG.getConstant(16, MVT::i32)); - return DAG.getNode(ISD::BITCAST, dl, VT, Mov); - } - - if (ARM64_AM::isAdvSIMDModImmType4(CnstVal)) { - CnstVal = ARM64_AM::encodeAdvSIMDModImmType4(CnstVal); - MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32; - SDValue Mov = DAG.getNode(ARM64ISD::MVNIshift, dl, MovTy, - DAG.getConstant(CnstVal, MVT::i32), - DAG.getConstant(24, MVT::i32)); - return DAG.getNode(ISD::BITCAST, dl, VT, Mov); - } - - if (ARM64_AM::isAdvSIMDModImmType5(CnstVal)) { - CnstVal = ARM64_AM::encodeAdvSIMDModImmType5(CnstVal); - MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v8i16 : MVT::v4i16; - SDValue Mov = DAG.getNode(ARM64ISD::MVNIshift, dl, MovTy, - DAG.getConstant(CnstVal, MVT::i32), - DAG.getConstant(0, MVT::i32)); - return DAG.getNode(ISD::BITCAST, dl, VT, Mov); - } - - if (ARM64_AM::isAdvSIMDModImmType6(CnstVal)) { - CnstVal = ARM64_AM::encodeAdvSIMDModImmType6(CnstVal); - MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v8i16 : MVT::v4i16; - SDValue Mov = DAG.getNode(ARM64ISD::MVNIshift, dl, MovTy, - DAG.getConstant(CnstVal, MVT::i32), - DAG.getConstant(8, MVT::i32)); - return DAG.getNode(ISD::BITCAST, dl, VT, Mov); - } - - if (ARM64_AM::isAdvSIMDModImmType7(CnstVal)) { - CnstVal = ARM64_AM::encodeAdvSIMDModImmType7(CnstVal); - MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32; - SDValue Mov = DAG.getNode(ARM64ISD::MVNImsl, dl, MovTy, - DAG.getConstant(CnstVal, MVT::i32), - DAG.getConstant(264, MVT::i32)); - return DAG.getNode(ISD::BITCAST, dl, VT, Mov); - } - - if (ARM64_AM::isAdvSIMDModImmType8(CnstVal)) { - CnstVal = ARM64_AM::encodeAdvSIMDModImmType8(CnstVal); - MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32; - SDValue Mov = DAG.getNode(ARM64ISD::MVNImsl, dl, MovTy, - DAG.getConstant(CnstVal, MVT::i32), - DAG.getConstant(272, MVT::i32)); - return DAG.getNode(ISD::BITCAST, dl, VT, Mov); - } - } - - if (SecondTry) - goto FailedModImm; - SecondTry = true; - CnstBits = UndefBits; - goto AttemptModImm; - } -FailedModImm: - - // Scan through the operands to find some interesting properties we can - // exploit: - // 1) If only one value is used, we can use a DUP, or - // 2) if only the low element is not undef, we can just insert that, or - // 3) if only one constant value is used (w/ some non-constant lanes), - // we can splat the constant value into the whole vector then fill - // in the non-constant lanes. - // 4) FIXME: If different constant values are used, but we can intelligently - // select the values we'll be overwriting for the non-constant - // lanes such that we can directly materialize the vector - // some other way (MOVI, e.g.), we can be sneaky. 
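// The AttemptModImm/FailedModImm dance above is a two-pass retry: first try
// the splat's constant bits as-is, then retry with the undef lanes toggled,
// and fall back to the generic lowering if neither pass encodes. A minimal
// standalone sketch of that control flow; canEncodeAsModImm is an assumed
// stand-in for the ARM64_AM::isAdvSIMDModImmType* predicates, and the helper
// name is illustrative only.
#include <cstdint>
#include <functional>

static bool tryModImmSketch(uint64_t CnstBits, uint64_t UndefToggledBits,
                            const std::function<bool(uint64_t)> &canEncodeAsModImm,
                            uint64_t &Encoded) {
  // Both candidates describe the same splat; the second merely chooses
  // different values for the undef lanes in the hope that they encode.
  const uint64_t Candidates[2] = {CnstBits, UndefToggledBits};
  for (uint64_t Candidate : Candidates) {
    if (canEncodeAsModImm(Candidate)) {
      Encoded = Candidate;
      return true;
    }
  }
  return false; // Caller falls back to the non-immediate form.
}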
- unsigned NumElts = VT.getVectorNumElements(); - bool isOnlyLowElement = true; - bool usesOnlyOneValue = true; - bool usesOnlyOneConstantValue = true; - bool isConstant = true; - unsigned NumConstantLanes = 0; - SDValue Value; - SDValue ConstantValue; - for (unsigned i = 0; i < NumElts; ++i) { - SDValue V = Op.getOperand(i); - if (V.getOpcode() == ISD::UNDEF) - continue; - if (i > 0) - isOnlyLowElement = false; - if (!isa(V) && !isa(V)) - isConstant = false; - - if (isa(V) || isa(V)) { - ++NumConstantLanes; - if (!ConstantValue.getNode()) - ConstantValue = V; - else if (ConstantValue != V) - usesOnlyOneConstantValue = false; - } - - if (!Value.getNode()) - Value = V; - else if (V != Value) - usesOnlyOneValue = false; - } - - if (!Value.getNode()) - return DAG.getUNDEF(VT); - - if (isOnlyLowElement) - return DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Value); - - // Use DUP for non-constant splats. For f32 constant splats, reduce to - // i32 and try again. - if (usesOnlyOneValue) { - if (!isConstant) { - if (Value.getOpcode() != ISD::EXTRACT_VECTOR_ELT || - Value.getValueType() != VT) - return DAG.getNode(ARM64ISD::DUP, dl, VT, Value); - - // This is actually a DUPLANExx operation, which keeps everything vectory. - - // DUPLANE works on 128-bit vectors, widen it if necessary. - SDValue Lane = Value.getOperand(1); - Value = Value.getOperand(0); - if (Value.getValueType().getSizeInBits() == 64) - Value = WidenVector(Value, DAG); - - unsigned Opcode = getDUPLANEOp(VT.getVectorElementType()); - return DAG.getNode(Opcode, dl, VT, Value, Lane); - } - - if (VT.getVectorElementType().isFloatingPoint()) { - SmallVector Ops; - MVT NewType = - (VT.getVectorElementType() == MVT::f32) ? MVT::i32 : MVT::i64; - for (unsigned i = 0; i < NumElts; ++i) - Ops.push_back(DAG.getNode(ISD::BITCAST, dl, NewType, Op.getOperand(i))); - EVT VecVT = EVT::getVectorVT(*DAG.getContext(), NewType, NumElts); - SDValue Val = DAG.getNode(ISD::BUILD_VECTOR, dl, VecVT, Ops); - Val = LowerBUILD_VECTOR(Val, DAG); - if (Val.getNode()) - return DAG.getNode(ISD::BITCAST, dl, VT, Val); - } - } - - // If there was only one constant value used and for more than one lane, - // start by splatting that value, then replace the non-constant lanes. This - // is better than the default, which will perform a separate initialization - // for each lane. - if (NumConstantLanes > 0 && usesOnlyOneConstantValue) { - SDValue Val = DAG.getNode(ARM64ISD::DUP, dl, VT, ConstantValue); - // Now insert the non-constant lanes. - for (unsigned i = 0; i < NumElts; ++i) { - SDValue V = Op.getOperand(i); - SDValue LaneIdx = DAG.getConstant(i, MVT::i64); - if (!isa(V) && !isa(V)) { - // Note that type legalization likely mucked about with the VT of the - // source operand, so we may have to convert it here before inserting. - Val = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Val, V, LaneIdx); - } - } - return Val; - } - - // If all elements are constants and the case above didn't get hit, fall back - // to the default expansion, which will generate a load from the constant - // pool. - if (isConstant) - return SDValue(); - - // Empirical tests suggest this is rarely worth it for vectors of length <= 2. - if (NumElts >= 4) { - SDValue shuffle = ReconstructShuffle(Op, DAG); - if (shuffle != SDValue()) - return shuffle; - } - - // If all else fails, just use a sequence of INSERT_VECTOR_ELT when we - // know the default expansion would otherwise fall back on something even - // worse. 
For a vector with one or two non-undef values, that's - // scalar_to_vector for the elements followed by a shuffle (provided the - // shuffle is valid for the target) and materialization element by element - // on the stack followed by a load for everything else. - if (!isConstant && !usesOnlyOneValue) { - SDValue Vec = DAG.getUNDEF(VT); - SDValue Op0 = Op.getOperand(0); - unsigned ElemSize = VT.getVectorElementType().getSizeInBits(); - unsigned i = 0; - // For 32 and 64 bit types, use INSERT_SUBREG for lane zero to - // a) Avoid a RMW dependency on the full vector register, and - // b) Allow the register coalescer to fold away the copy if the - // value is already in an S or D register. - if (Op0.getOpcode() != ISD::UNDEF && (ElemSize == 32 || ElemSize == 64)) { - unsigned SubIdx = ElemSize == 32 ? ARM64::ssub : ARM64::dsub; - MachineSDNode *N = - DAG.getMachineNode(TargetOpcode::INSERT_SUBREG, dl, VT, Vec, Op0, - DAG.getTargetConstant(SubIdx, MVT::i32)); - Vec = SDValue(N, 0); - ++i; - } - for (; i < NumElts; ++i) { - SDValue V = Op.getOperand(i); - if (V.getOpcode() == ISD::UNDEF) - continue; - SDValue LaneIdx = DAG.getConstant(i, MVT::i64); - Vec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Vec, V, LaneIdx); - } - return Vec; - } - - // Just use the default expansion. We failed to find a better alternative. - return SDValue(); -} - -SDValue ARM64TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op, - SelectionDAG &DAG) const { - assert(Op.getOpcode() == ISD::INSERT_VECTOR_ELT && "Unknown opcode!"); - - // Check for non-constant lane. - if (!isa(Op.getOperand(2))) - return SDValue(); - - EVT VT = Op.getOperand(0).getValueType(); - - // Insertion/extraction are legal for V128 types. - if (VT == MVT::v16i8 || VT == MVT::v8i16 || VT == MVT::v4i32 || - VT == MVT::v2i64 || VT == MVT::v4f32 || VT == MVT::v2f64) - return Op; - - if (VT != MVT::v8i8 && VT != MVT::v4i16 && VT != MVT::v2i32 && - VT != MVT::v1i64 && VT != MVT::v2f32) - return SDValue(); - - // For V64 types, we perform insertion by expanding the value - // to a V128 type and perform the insertion on that. - SDLoc DL(Op); - SDValue WideVec = WidenVector(Op.getOperand(0), DAG); - EVT WideTy = WideVec.getValueType(); - - SDValue Node = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, WideTy, WideVec, - Op.getOperand(1), Op.getOperand(2)); - // Re-narrow the resultant vector. - return NarrowVector(Node, DAG); -} - -SDValue ARM64TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op, - SelectionDAG &DAG) const { - assert(Op.getOpcode() == ISD::EXTRACT_VECTOR_ELT && "Unknown opcode!"); - - // Check for non-constant lane. - if (!isa(Op.getOperand(1))) - return SDValue(); - - EVT VT = Op.getOperand(0).getValueType(); - - // Insertion/extraction are legal for V128 types. - if (VT == MVT::v16i8 || VT == MVT::v8i16 || VT == MVT::v4i32 || - VT == MVT::v2i64 || VT == MVT::v4f32 || VT == MVT::v2f64) - return Op; - - if (VT != MVT::v8i8 && VT != MVT::v4i16 && VT != MVT::v2i32 && - VT != MVT::v1i64 && VT != MVT::v2f32) - return SDValue(); - - // For V64 types, we perform extraction by expanding the value - // to a V128 type and perform the extraction on that. - SDLoc DL(Op); - SDValue WideVec = WidenVector(Op.getOperand(0), DAG); - EVT WideTy = WideVec.getValueType(); - - EVT ExtrTy = WideTy.getVectorElementType(); - if (ExtrTy == MVT::i16 || ExtrTy == MVT::i8) - ExtrTy = MVT::i32; - - // For extractions, we just return the result directly. 
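// Both V64 paths above widen the operand to a V128, do the insert/extract
// there, and (for inserts) narrow back down. A plain-array sketch of why that
// round trip is lossless for the low half; std::array stands in for the NEON
// registers and the lane numbering is purely illustrative.
#include <algorithm>
#include <array>
#include <cassert>
#include <cstdint>

int main() {
  std::array<uint16_t, 4> Narrow = {1, 2, 3, 4};          // v4i16 value
  std::array<uint16_t, 8> Wide{};                         // widen: high half undef
  std::copy(Narrow.begin(), Narrow.end(), Wide.begin());
  Wide[2] = 42;                                           // insert into lane 2
  std::array<uint16_t, 4> Result;                         // narrow back to v4i16
  std::copy(Wide.begin(), Wide.begin() + 4, Result.begin());
  assert((Result == std::array<uint16_t, 4>{1, 2, 42, 4}));
}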
- return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ExtrTy, WideVec, - Op.getOperand(1)); -} - -SDValue ARM64TargetLowering::LowerEXTRACT_SUBVECTOR(SDValue Op, - SelectionDAG &DAG) const { - EVT VT = Op.getOperand(0).getValueType(); - SDLoc dl(Op); - // Just in case... - if (!VT.isVector()) - return SDValue(); - - ConstantSDNode *Cst = dyn_cast(Op.getOperand(1)); - if (!Cst) - return SDValue(); - unsigned Val = Cst->getZExtValue(); - - unsigned Size = Op.getValueType().getSizeInBits(); - if (Val == 0) { - switch (Size) { - case 8: - return DAG.getTargetExtractSubreg(ARM64::bsub, dl, Op.getValueType(), - Op.getOperand(0)); - case 16: - return DAG.getTargetExtractSubreg(ARM64::hsub, dl, Op.getValueType(), - Op.getOperand(0)); - case 32: - return DAG.getTargetExtractSubreg(ARM64::ssub, dl, Op.getValueType(), - Op.getOperand(0)); - case 64: - return DAG.getTargetExtractSubreg(ARM64::dsub, dl, Op.getValueType(), - Op.getOperand(0)); - default: - llvm_unreachable("Unexpected vector type in extract_subvector!"); - } - } - // If this is extracting the upper 64-bits of a 128-bit vector, we match - // that directly. - if (Size == 64 && Val * VT.getVectorElementType().getSizeInBits() == 64) - return Op; - - return SDValue(); -} - -bool ARM64TargetLowering::isShuffleMaskLegal(const SmallVectorImpl &M, - EVT VT) const { - if (VT.getVectorNumElements() == 4 && - (VT.is128BitVector() || VT.is64BitVector())) { - unsigned PFIndexes[4]; - for (unsigned i = 0; i != 4; ++i) { - if (M[i] < 0) - PFIndexes[i] = 8; - else - PFIndexes[i] = M[i]; - } - - // Compute the index in the perfect shuffle table. - unsigned PFTableIndex = PFIndexes[0] * 9 * 9 * 9 + PFIndexes[1] * 9 * 9 + - PFIndexes[2] * 9 + PFIndexes[3]; - unsigned PFEntry = PerfectShuffleTable[PFTableIndex]; - unsigned Cost = (PFEntry >> 30); - - if (Cost <= 4) - return true; - } - - bool DummyBool; - int DummyInt; - unsigned DummyUnsigned; - - return (ShuffleVectorSDNode::isSplatMask(&M[0], VT) || isREVMask(M, VT, 64) || - isREVMask(M, VT, 32) || isREVMask(M, VT, 16) || - isEXTMask(M, VT, DummyBool, DummyUnsigned) || - // isTBLMask(M, VT) || // FIXME: Port TBL support from ARM. - isTRNMask(M, VT, DummyUnsigned) || isUZPMask(M, VT, DummyUnsigned) || - isZIPMask(M, VT, DummyUnsigned) || - isTRN_v_undef_Mask(M, VT, DummyUnsigned) || - isUZP_v_undef_Mask(M, VT, DummyUnsigned) || - isZIP_v_undef_Mask(M, VT, DummyUnsigned) || - isINSMask(M, VT.getVectorNumElements(), DummyBool, DummyInt) || - isConcatMask(M, VT, VT.getSizeInBits() == 128)); -} - -/// getVShiftImm - Check if this is a valid build_vector for the immediate -/// operand of a vector shift operation, where all the elements of the -/// build_vector must have the same constant integer value. -static bool getVShiftImm(SDValue Op, unsigned ElementBits, int64_t &Cnt) { - // Ignore bit_converts. - while (Op.getOpcode() == ISD::BITCAST) - Op = Op.getOperand(0); - BuildVectorSDNode *BVN = dyn_cast(Op.getNode()); - APInt SplatBits, SplatUndef; - unsigned SplatBitSize; - bool HasAnyUndefs; - if (!BVN || !BVN->isConstantSplat(SplatBits, SplatUndef, SplatBitSize, - HasAnyUndefs, ElementBits) || - SplatBitSize > ElementBits) - return false; - Cnt = SplatBits.getSExtValue(); - return true; -} - -/// isVShiftLImm - Check if this is a valid build_vector for the immediate -/// operand of a vector shift left operation. That value must be in the range: -/// 0 <= Value < ElementBits for a left shift; or -/// 0 <= Value <= ElementBits for a long left shift. 
-static bool isVShiftLImm(SDValue Op, EVT VT, bool isLong, int64_t &Cnt) { - assert(VT.isVector() && "vector shift count is not a vector type"); - unsigned ElementBits = VT.getVectorElementType().getSizeInBits(); - if (!getVShiftImm(Op, ElementBits, Cnt)) - return false; - return (Cnt >= 0 && (isLong ? Cnt - 1 : Cnt) < ElementBits); -} - -/// isVShiftRImm - Check if this is a valid build_vector for the immediate -/// operand of a vector shift right operation. For a shift opcode, the value -/// is positive, but for an intrinsic the value count must be negative. The -/// absolute value must be in the range: -/// 1 <= |Value| <= ElementBits for a right shift; or -/// 1 <= |Value| <= ElementBits/2 for a narrow right shift. -static bool isVShiftRImm(SDValue Op, EVT VT, bool isNarrow, bool isIntrinsic, - int64_t &Cnt) { - assert(VT.isVector() && "vector shift count is not a vector type"); - unsigned ElementBits = VT.getVectorElementType().getSizeInBits(); - if (!getVShiftImm(Op, ElementBits, Cnt)) - return false; - if (isIntrinsic) - Cnt = -Cnt; - return (Cnt >= 1 && Cnt <= (isNarrow ? ElementBits / 2 : ElementBits)); -} - -SDValue ARM64TargetLowering::LowerVectorSRA_SRL_SHL(SDValue Op, - SelectionDAG &DAG) const { - EVT VT = Op.getValueType(); - SDLoc DL(Op); - int64_t Cnt; - - if (!Op.getOperand(1).getValueType().isVector()) - return Op; - unsigned EltSize = VT.getVectorElementType().getSizeInBits(); - - switch (Op.getOpcode()) { - default: - llvm_unreachable("unexpected shift opcode"); - - case ISD::SHL: - if (isVShiftLImm(Op.getOperand(1), VT, false, Cnt) && Cnt < EltSize) - return DAG.getNode(ARM64ISD::VSHL, SDLoc(Op), VT, Op.getOperand(0), - DAG.getConstant(Cnt, MVT::i32)); - return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT, - DAG.getConstant(Intrinsic::arm64_neon_ushl, MVT::i32), - Op.getOperand(0), Op.getOperand(1)); - case ISD::SRA: - case ISD::SRL: - // Right shift immediate - if (isVShiftRImm(Op.getOperand(1), VT, false, false, Cnt) && - Cnt < EltSize) { - unsigned Opc = - (Op.getOpcode() == ISD::SRA) ? ARM64ISD::VASHR : ARM64ISD::VLSHR; - return DAG.getNode(Opc, SDLoc(Op), VT, Op.getOperand(0), - DAG.getConstant(Cnt, MVT::i32)); - } - - // Right shift register. Note, there is not a shift right register - // instruction, but the shift left register instruction takes a signed - // value, where negative numbers specify a right shift. - unsigned Opc = (Op.getOpcode() == ISD::SRA) ? 
Intrinsic::arm64_neon_sshl - : Intrinsic::arm64_neon_ushl; - // negate the shift amount - SDValue NegShift = DAG.getNode(ARM64ISD::NEG, DL, VT, Op.getOperand(1)); - SDValue NegShiftLeft = - DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT, - DAG.getConstant(Opc, MVT::i32), Op.getOperand(0), NegShift); - return NegShiftLeft; - } - - return SDValue(); -} - -static SDValue EmitVectorComparison(SDValue LHS, SDValue RHS, - ARM64CC::CondCode CC, bool NoNans, EVT VT, - SDLoc dl, SelectionDAG &DAG) { - EVT SrcVT = LHS.getValueType(); - - BuildVectorSDNode *BVN = dyn_cast<BuildVectorSDNode>(RHS.getNode()); - APInt CnstBits(VT.getSizeInBits(), 0); - APInt UndefBits(VT.getSizeInBits(), 0); - bool IsCnst = BVN && resolveBuildVector(BVN, CnstBits, UndefBits); - bool IsZero = IsCnst && (CnstBits == 0); - - if (SrcVT.getVectorElementType().isFloatingPoint()) { - switch (CC) { - default: - return SDValue(); - case ARM64CC::NE: { - SDValue Fcmeq; - if (IsZero) - Fcmeq = DAG.getNode(ARM64ISD::FCMEQz, dl, VT, LHS); - else - Fcmeq = DAG.getNode(ARM64ISD::FCMEQ, dl, VT, LHS, RHS); - return DAG.getNode(ARM64ISD::NOT, dl, VT, Fcmeq); - } - case ARM64CC::EQ: - if (IsZero) - return DAG.getNode(ARM64ISD::FCMEQz, dl, VT, LHS); - return DAG.getNode(ARM64ISD::FCMEQ, dl, VT, LHS, RHS); - case ARM64CC::GE: - if (IsZero) - return DAG.getNode(ARM64ISD::FCMGEz, dl, VT, LHS); - return DAG.getNode(ARM64ISD::FCMGE, dl, VT, LHS, RHS); - case ARM64CC::GT: - if (IsZero) - return DAG.getNode(ARM64ISD::FCMGTz, dl, VT, LHS); - return DAG.getNode(ARM64ISD::FCMGT, dl, VT, LHS, RHS); - case ARM64CC::LS: - if (IsZero) - return DAG.getNode(ARM64ISD::FCMLEz, dl, VT, LHS); - return DAG.getNode(ARM64ISD::FCMGE, dl, VT, RHS, LHS); - case ARM64CC::LT: - if (!NoNans) - return SDValue(); - // If we ignore NaNs then we can use the MI implementation. - // Fallthrough.
- case ARM64CC::MI: - if (IsZero) - return DAG.getNode(ARM64ISD::FCMLTz, dl, VT, LHS); - return DAG.getNode(ARM64ISD::FCMGT, dl, VT, RHS, LHS); - } - } - - switch (CC) { - default: - return SDValue(); - case ARM64CC::NE: { - SDValue Cmeq; - if (IsZero) - Cmeq = DAG.getNode(ARM64ISD::CMEQz, dl, VT, LHS); - else - Cmeq = DAG.getNode(ARM64ISD::CMEQ, dl, VT, LHS, RHS); - return DAG.getNode(ARM64ISD::NOT, dl, VT, Cmeq); - } - case ARM64CC::EQ: - if (IsZero) - return DAG.getNode(ARM64ISD::CMEQz, dl, VT, LHS); - return DAG.getNode(ARM64ISD::CMEQ, dl, VT, LHS, RHS); - case ARM64CC::GE: - if (IsZero) - return DAG.getNode(ARM64ISD::CMGEz, dl, VT, LHS); - return DAG.getNode(ARM64ISD::CMGE, dl, VT, LHS, RHS); - case ARM64CC::GT: - if (IsZero) - return DAG.getNode(ARM64ISD::CMGTz, dl, VT, LHS); - return DAG.getNode(ARM64ISD::CMGT, dl, VT, LHS, RHS); - case ARM64CC::LE: - if (IsZero) - return DAG.getNode(ARM64ISD::CMLEz, dl, VT, LHS); - return DAG.getNode(ARM64ISD::CMGE, dl, VT, RHS, LHS); - case ARM64CC::LS: - return DAG.getNode(ARM64ISD::CMHS, dl, VT, RHS, LHS); - case ARM64CC::LO: - return DAG.getNode(ARM64ISD::CMHI, dl, VT, RHS, LHS); - case ARM64CC::LT: - if (IsZero) - return DAG.getNode(ARM64ISD::CMLTz, dl, VT, LHS); - return DAG.getNode(ARM64ISD::CMGT, dl, VT, RHS, LHS); - case ARM64CC::HI: - return DAG.getNode(ARM64ISD::CMHI, dl, VT, LHS, RHS); - case ARM64CC::HS: - return DAG.getNode(ARM64ISD::CMHS, dl, VT, LHS, RHS); - } -} - -SDValue ARM64TargetLowering::LowerVSETCC(SDValue Op, SelectionDAG &DAG) const { - ISD::CondCode CC = cast(Op.getOperand(2))->get(); - SDValue LHS = Op.getOperand(0); - SDValue RHS = Op.getOperand(1); - SDLoc dl(Op); - - if (LHS.getValueType().getVectorElementType().isInteger()) { - assert(LHS.getValueType() == RHS.getValueType()); - ARM64CC::CondCode ARM64CC = changeIntCCToARM64CC(CC); - return EmitVectorComparison(LHS, RHS, ARM64CC, false, Op.getValueType(), dl, - DAG); - } - - assert(LHS.getValueType().getVectorElementType() == MVT::f32 || - LHS.getValueType().getVectorElementType() == MVT::f64); - - // Unfortunately, the mapping of LLVM FP CC's onto ARM64 CC's isn't totally - // clean. Some of them require two branches to implement. - ARM64CC::CondCode CC1, CC2; - bool ShouldInvert; - changeVectorFPCCToARM64CC(CC, CC1, CC2, ShouldInvert); - - bool NoNaNs = getTargetMachine().Options.NoNaNsFPMath; - SDValue Cmp = - EmitVectorComparison(LHS, RHS, CC1, NoNaNs, Op.getValueType(), dl, DAG); - if (!Cmp.getNode()) - return SDValue(); - - if (CC2 != ARM64CC::AL) { - SDValue Cmp2 = - EmitVectorComparison(LHS, RHS, CC2, NoNaNs, Op.getValueType(), dl, DAG); - if (!Cmp2.getNode()) - return SDValue(); - - Cmp = DAG.getNode(ISD::OR, dl, Cmp.getValueType(), Cmp, Cmp2); - } - - if (ShouldInvert) - return Cmp = DAG.getNOT(dl, Cmp, Cmp.getValueType()); - - return Cmp; -} - -/// getTgtMemIntrinsic - Represent NEON load and store intrinsics as -/// MemIntrinsicNodes. The associated MachineMemOperands record the alignment -/// specified in the intrinsic calls. 
-bool ARM64TargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info, - const CallInst &I, - unsigned Intrinsic) const { - switch (Intrinsic) { - case Intrinsic::arm64_neon_ld2: - case Intrinsic::arm64_neon_ld3: - case Intrinsic::arm64_neon_ld4: - case Intrinsic::arm64_neon_ld1x2: - case Intrinsic::arm64_neon_ld1x3: - case Intrinsic::arm64_neon_ld1x4: - case Intrinsic::arm64_neon_ld2lane: - case Intrinsic::arm64_neon_ld3lane: - case Intrinsic::arm64_neon_ld4lane: - case Intrinsic::arm64_neon_ld2r: - case Intrinsic::arm64_neon_ld3r: - case Intrinsic::arm64_neon_ld4r: { - Info.opc = ISD::INTRINSIC_W_CHAIN; - // Conservatively set memVT to the entire set of vectors loaded. - uint64_t NumElts = getDataLayout()->getTypeAllocSize(I.getType()) / 8; - Info.memVT = EVT::getVectorVT(I.getType()->getContext(), MVT::i64, NumElts); - Info.ptrVal = I.getArgOperand(I.getNumArgOperands() - 1); - Info.offset = 0; - Info.align = 0; - Info.vol = false; // volatile loads with NEON intrinsics not supported - Info.readMem = true; - Info.writeMem = false; - return true; - } - case Intrinsic::arm64_neon_st2: - case Intrinsic::arm64_neon_st3: - case Intrinsic::arm64_neon_st4: - case Intrinsic::arm64_neon_st1x2: - case Intrinsic::arm64_neon_st1x3: - case Intrinsic::arm64_neon_st1x4: - case Intrinsic::arm64_neon_st2lane: - case Intrinsic::arm64_neon_st3lane: - case Intrinsic::arm64_neon_st4lane: { - Info.opc = ISD::INTRINSIC_VOID; - // Conservatively set memVT to the entire set of vectors stored. - unsigned NumElts = 0; - for (unsigned ArgI = 1, ArgE = I.getNumArgOperands(); ArgI < ArgE; ++ArgI) { - Type *ArgTy = I.getArgOperand(ArgI)->getType(); - if (!ArgTy->isVectorTy()) - break; - NumElts += getDataLayout()->getTypeAllocSize(ArgTy) / 8; - } - Info.memVT = EVT::getVectorVT(I.getType()->getContext(), MVT::i64, NumElts); - Info.ptrVal = I.getArgOperand(I.getNumArgOperands() - 1); - Info.offset = 0; - Info.align = 0; - Info.vol = false; // volatile stores with NEON intrinsics not supported - Info.readMem = false; - Info.writeMem = true; - return true; - } - case Intrinsic::arm64_ldaxr: - case Intrinsic::arm64_ldxr: { - PointerType *PtrTy = cast(I.getArgOperand(0)->getType()); - Info.opc = ISD::INTRINSIC_W_CHAIN; - Info.memVT = MVT::getVT(PtrTy->getElementType()); - Info.ptrVal = I.getArgOperand(0); - Info.offset = 0; - Info.align = getDataLayout()->getABITypeAlignment(PtrTy->getElementType()); - Info.vol = true; - Info.readMem = true; - Info.writeMem = false; - return true; - } - case Intrinsic::arm64_stlxr: - case Intrinsic::arm64_stxr: { - PointerType *PtrTy = cast(I.getArgOperand(1)->getType()); - Info.opc = ISD::INTRINSIC_W_CHAIN; - Info.memVT = MVT::getVT(PtrTy->getElementType()); - Info.ptrVal = I.getArgOperand(1); - Info.offset = 0; - Info.align = getDataLayout()->getABITypeAlignment(PtrTy->getElementType()); - Info.vol = true; - Info.readMem = false; - Info.writeMem = true; - return true; - } - case Intrinsic::arm64_ldaxp: - case Intrinsic::arm64_ldxp: { - Info.opc = ISD::INTRINSIC_W_CHAIN; - Info.memVT = MVT::i128; - Info.ptrVal = I.getArgOperand(0); - Info.offset = 0; - Info.align = 16; - Info.vol = true; - Info.readMem = true; - Info.writeMem = false; - return true; - } - case Intrinsic::arm64_stlxp: - case Intrinsic::arm64_stxp: { - Info.opc = ISD::INTRINSIC_W_CHAIN; - Info.memVT = MVT::i128; - Info.ptrVal = I.getArgOperand(2); - Info.offset = 0; - Info.align = 16; - Info.vol = true; - Info.readMem = false; - Info.writeMem = true; - return true; - } - default: - break; - } - - return false; -} - -// 
Truncations from 64-bit GPR to 32-bit GPR is free. -bool ARM64TargetLowering::isTruncateFree(Type *Ty1, Type *Ty2) const { - if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy()) - return false; - unsigned NumBits1 = Ty1->getPrimitiveSizeInBits(); - unsigned NumBits2 = Ty2->getPrimitiveSizeInBits(); - if (NumBits1 <= NumBits2) - return false; - return true; -} -bool ARM64TargetLowering::isTruncateFree(EVT VT1, EVT VT2) const { - if (!VT1.isInteger() || !VT2.isInteger()) - return false; - unsigned NumBits1 = VT1.getSizeInBits(); - unsigned NumBits2 = VT2.getSizeInBits(); - if (NumBits1 <= NumBits2) - return false; - return true; -} - -// All 32-bit GPR operations implicitly zero the high-half of the corresponding -// 64-bit GPR. -bool ARM64TargetLowering::isZExtFree(Type *Ty1, Type *Ty2) const { - if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy()) - return false; - unsigned NumBits1 = Ty1->getPrimitiveSizeInBits(); - unsigned NumBits2 = Ty2->getPrimitiveSizeInBits(); - if (NumBits1 == 32 && NumBits2 == 64) - return true; - return false; -} -bool ARM64TargetLowering::isZExtFree(EVT VT1, EVT VT2) const { - if (!VT1.isInteger() || !VT2.isInteger()) - return false; - unsigned NumBits1 = VT1.getSizeInBits(); - unsigned NumBits2 = VT2.getSizeInBits(); - if (NumBits1 == 32 && NumBits2 == 64) - return true; - return false; -} - -bool ARM64TargetLowering::isZExtFree(SDValue Val, EVT VT2) const { - EVT VT1 = Val.getValueType(); - if (isZExtFree(VT1, VT2)) { - return true; - } - - if (Val.getOpcode() != ISD::LOAD) - return false; - - // 8-, 16-, and 32-bit integer loads all implicitly zero-extend. - return (VT1.isSimple() && VT1.isInteger() && VT2.isSimple() && - VT2.isInteger() && VT1.getSizeInBits() <= 32); -} - -bool ARM64TargetLowering::hasPairedLoad(Type *LoadedType, - unsigned &RequiredAligment) const { - if (!LoadedType->isIntegerTy() && !LoadedType->isFloatTy()) - return false; - // Cyclone supports unaligned accesses. - RequiredAligment = 0; - unsigned NumBits = LoadedType->getPrimitiveSizeInBits(); - return NumBits == 32 || NumBits == 64; -} - -bool ARM64TargetLowering::hasPairedLoad(EVT LoadedType, - unsigned &RequiredAligment) const { - if (!LoadedType.isSimple() || - (!LoadedType.isInteger() && !LoadedType.isFloatingPoint())) - return false; - // Cyclone supports unaligned accesses. - RequiredAligment = 0; - unsigned NumBits = LoadedType.getSizeInBits(); - return NumBits == 32 || NumBits == 64; -} - -static bool memOpAlign(unsigned DstAlign, unsigned SrcAlign, - unsigned AlignCheck) { - return ((SrcAlign == 0 || SrcAlign % AlignCheck == 0) && - (DstAlign == 0 || DstAlign % AlignCheck == 0)); -} - -EVT ARM64TargetLowering::getOptimalMemOpType(uint64_t Size, unsigned DstAlign, - unsigned SrcAlign, bool IsMemset, - bool ZeroMemset, bool MemcpyStrSrc, - MachineFunction &MF) const { - // Don't use AdvSIMD to implement 16-byte memset. It would have taken one - // instruction to materialize the v2i64 zero and one store (with restrictive - // addressing mode). Just do two i64 store of zero-registers. - bool Fast; - const Function *F = MF.getFunction(); - if (Subtarget->hasFPARMv8() && !IsMemset && Size >= 16 && - !F->getAttributes().hasAttribute(AttributeSet::FunctionIndex, - Attribute::NoImplicitFloat) && - (memOpAlign(SrcAlign, DstAlign, 16) || - (allowsUnalignedMemoryAccesses(MVT::f128, 0, &Fast) && Fast))) - return MVT::f128; - - return Size >= 8 ? MVT::i64 : MVT::i32; -} - -// 12-bit optionally shifted immediates are legal for adds. 
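// Worked examples of the rule stated above. The sketch below mirrors the
// check implemented in isLegalAddImmediate just after it; the helper name and
// sample values are illustrative only.
#include <cassert>
#include <cstdint>

static bool isAddImmLegalSketch(int64_t Imm) {
  // Either a plain 12-bit value, or a 12-bit value shifted left by 12.
  return (Imm >> 12) == 0 || ((Imm & 0xfff) == 0 && (Imm >> 24) == 0);
}

int main() {
  assert(isAddImmLegalSketch(4095));       // fits in 12 bits
  assert(isAddImmLegalSketch(0x123000));   // 0x123 << 12
  assert(!isAddImmLegalSketch(4097));      // 13 significant bits, unshifted
  assert(!isAddImmLegalSketch(0x1000001)); // too wide even for the shifted form
}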
-bool ARM64TargetLowering::isLegalAddImmediate(int64_t Immed) const { - if ((Immed >> 12) == 0 || ((Immed & 0xfff) == 0 && Immed >> 24 == 0)) - return true; - return false; -} - -// Integer comparisons are implemented with ADDS/SUBS, so the range of valid -// immediates is the same as for an add or a sub. -bool ARM64TargetLowering::isLegalICmpImmediate(int64_t Immed) const { - if (Immed < 0) - Immed *= -1; - return isLegalAddImmediate(Immed); -} - -/// isLegalAddressingMode - Return true if the addressing mode represented -/// by AM is legal for this target, for a load/store of the specified type. -bool ARM64TargetLowering::isLegalAddressingMode(const AddrMode &AM, - Type *Ty) const { - // ARM64 has five basic addressing modes: - // reg - // reg + 9-bit signed offset - // reg + SIZE_IN_BYTES * 12-bit unsigned offset - // reg1 + reg2 - // reg + SIZE_IN_BYTES * reg - - // No global is ever allowed as a base. - if (AM.BaseGV) - return false; - - // No reg+reg+imm addressing. - if (AM.HasBaseReg && AM.BaseOffs && AM.Scale) - return false; - - // check reg + imm case: - // i.e., reg + 0, reg + imm9, reg + SIZE_IN_BYTES * uimm12 - uint64_t NumBytes = 0; - if (Ty->isSized()) { - uint64_t NumBits = getDataLayout()->getTypeSizeInBits(Ty); - NumBytes = NumBits / 8; - if (!isPowerOf2_64(NumBits)) - NumBytes = 0; - } - - if (!AM.Scale) { - int64_t Offset = AM.BaseOffs; - - // 9-bit signed offset - if (Offset >= -(1LL << 9) && Offset <= (1LL << 9) - 1) - return true; - - // 12-bit unsigned offset - unsigned shift = Log2_64(NumBytes); - if (NumBytes && Offset > 0 && (Offset / NumBytes) <= (1LL << 12) - 1 && - // Must be a multiple of NumBytes (NumBytes is a power of 2) - (Offset >> shift) << shift == Offset) - return true; - return false; - } - - // Check reg1 + SIZE_IN_BYTES * reg2 and reg1 + reg2 - - if (!AM.Scale || AM.Scale == 1 || - (AM.Scale > 0 && (uint64_t)AM.Scale == NumBytes)) - return true; - return false; -} - -int ARM64TargetLowering::getScalingFactorCost(const AddrMode &AM, - Type *Ty) const { - // Scaling factors are not free at all. - // Operands | Rt Latency - // ------------------------------------------- - // Rt, [Xn, Xm] | 4 - // ------------------------------------------- - // Rt, [Xn, Xm, lsl #imm] | Rn: 4 Rm: 5 - // Rt, [Xn, Wm, #imm] | - if (isLegalAddressingMode(AM, Ty)) - // Scale represents reg2 * scale, thus account for 1 if - // it is not equal to 0 or 1. - return AM.Scale != 0 && AM.Scale != 1; - return -1; -} - -bool ARM64TargetLowering::isFMAFasterThanFMulAndFAdd(EVT VT) const { - VT = VT.getScalarType(); - - if (!VT.isSimple()) - return false; - - switch (VT.getSimpleVT().SimpleTy) { - case MVT::f32: - case MVT::f64: - return true; - default: - break; - } - - return false; -} - -const MCPhysReg * -ARM64TargetLowering::getScratchRegisters(CallingConv::ID) const { - // LR is a callee-save register, but we must treat it as clobbered by any call - // site. Hence we include LR in the scratch registers, which are in turn added - // as implicit-defs for stackmaps and patchpoints. - static const MCPhysReg ScratchRegs[] = { - ARM64::X16, ARM64::X17, ARM64::LR, 0 - }; - return ScratchRegs; -} - -bool ARM64TargetLowering::isDesirableToCommuteWithShift(const SDNode *N) const { - EVT VT = N->getValueType(0); - // If N is unsigned bit extraction: ((x >> C) & mask), then do not combine - // it with shift to let it be lowered to UBFX. 
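// The ((x >> C) & mask) shape mentioned above, with mask a contiguous run of
// low bits, is exactly a bitfield extract (UBFX Rd, Rn, #C, #Width). A scalar
// sketch of that equivalence; helper names are illustrative only.
#include <cassert>
#include <cstdint>

static bool isLowBitMaskSketch(uint64_t M) {
  return M != 0 && ((M + 1) & M) == 0; // same idea as isMask_64
}

static uint64_t ubfxSketch(uint64_t X, unsigned Lsb, unsigned Width) {
  uint64_t Mask = (Width == 64) ? ~0ULL : ((1ULL << Width) - 1);
  return (X >> Lsb) & Mask;
}

int main() {
  uint64_t X = 0x123456789ABCDEF0ULL;
  // (X >> 8) & 0xFFFF is the 16-bit field starting at bit 8.
  assert(((X >> 8) & 0xFFFF) == ubfxSketch(X, 8, 16));
  assert(isLowBitMaskSketch(0xFFFF) && !isLowBitMaskSketch(0xFF00));
}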
- if (N->getOpcode() == ISD::AND && (VT == MVT::i32 || VT == MVT::i64) && - isa(N->getOperand(1))) { - uint64_t TruncMask = N->getConstantOperandVal(1); - if (isMask_64(TruncMask) && - N->getOperand(0).getOpcode() == ISD::SRL && - isa(N->getOperand(0)->getOperand(1))) - return false; - } - return true; -} - -bool ARM64TargetLowering::shouldConvertConstantLoadToIntImm(const APInt &Imm, - Type *Ty) const { - assert(Ty->isIntegerTy()); - - unsigned BitSize = Ty->getPrimitiveSizeInBits(); - if (BitSize == 0) - return false; - - int64_t Val = Imm.getSExtValue(); - if (Val == 0 || ARM64_AM::isLogicalImmediate(Val, BitSize)) - return true; - - if ((int64_t)Val < 0) - Val = ~Val; - if (BitSize == 32) - Val &= (1LL << 32) - 1; - - unsigned LZ = countLeadingZeros((uint64_t)Val); - unsigned Shift = (63 - LZ) / 16; - // MOVZ is free so return true for one or fewer MOVK. - return (Shift < 3) ? true : false; -} - -// Generate SUBS and CSEL for integer abs. -static SDValue performIntegerAbsCombine(SDNode *N, SelectionDAG &DAG) { - EVT VT = N->getValueType(0); - - SDValue N0 = N->getOperand(0); - SDValue N1 = N->getOperand(1); - SDLoc DL(N); - - // Check pattern of XOR(ADD(X,Y), Y) where Y is SRA(X, size(X)-1) - // and change it to SUB and CSEL. - if (VT.isInteger() && N->getOpcode() == ISD::XOR && - N0.getOpcode() == ISD::ADD && N0.getOperand(1) == N1 && - N1.getOpcode() == ISD::SRA && N1.getOperand(0) == N0.getOperand(0)) - if (ConstantSDNode *Y1C = dyn_cast(N1.getOperand(1))) - if (Y1C->getAPIntValue() == VT.getSizeInBits() - 1) { - SDValue Neg = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, VT), - N0.getOperand(0)); - // Generate SUBS & CSEL. - SDValue Cmp = - DAG.getNode(ARM64ISD::SUBS, DL, DAG.getVTList(VT, MVT::i32), - N0.getOperand(0), DAG.getConstant(0, VT)); - return DAG.getNode(ARM64ISD::CSEL, DL, VT, N0.getOperand(0), Neg, - DAG.getConstant(ARM64CC::PL, MVT::i32), - SDValue(Cmp.getNode(), 1)); - } - return SDValue(); -} - -// performXorCombine - Attempts to handle integer ABS. -static SDValue performXorCombine(SDNode *N, SelectionDAG &DAG, - TargetLowering::DAGCombinerInfo &DCI, - const ARM64Subtarget *Subtarget) { - if (DCI.isBeforeLegalizeOps()) - return SDValue(); - - return performIntegerAbsCombine(N, DAG); -} - -static SDValue performMulCombine(SDNode *N, SelectionDAG &DAG, - TargetLowering::DAGCombinerInfo &DCI, - const ARM64Subtarget *Subtarget) { - if (DCI.isBeforeLegalizeOps()) - return SDValue(); - - // Multiplication of a power of two plus/minus one can be done more - // cheaply as as shift+add/sub. For now, this is true unilaterally. If - // future CPUs have a cheaper MADD instruction, this may need to be - // gated on a subtarget feature. For Cyclone, 32-bit MADD is 4 cycles and - // 64-bit is 5 cycles, so this is always a win. - if (ConstantSDNode *C = dyn_cast(N->getOperand(1))) { - APInt Value = C->getAPIntValue(); - EVT VT = N->getValueType(0); - APInt VP1 = Value + 1; - if (VP1.isPowerOf2()) { - // Multiplying by one less than a power of two, replace with a shift - // and a subtract. - SDValue ShiftedVal = - DAG.getNode(ISD::SHL, SDLoc(N), VT, N->getOperand(0), - DAG.getConstant(VP1.logBase2(), MVT::i64)); - return DAG.getNode(ISD::SUB, SDLoc(N), VT, ShiftedVal, N->getOperand(0)); - } - APInt VM1 = Value - 1; - if (VM1.isPowerOf2()) { - // Multiplying by one more than a power of two, replace with a shift - // and an add. 
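// A few concrete instances of the strength reduction performed here, covering
// both the 2^k - 1 (shift then subtract) and 2^k + 1 (shift then add) cases;
// plain integer identities, illustrative only.
#include <cassert>
#include <cstdint>

int main() {
  const uint64_t Samples[] = {1, 7, 123456789};
  for (uint64_t X : Samples) {
    assert(X * 7 == (X << 3) - X);   // 7  = 2^3 - 1
    assert(X * 9 == (X << 3) + X);   // 9  = 2^3 + 1
    assert(X * 31 == (X << 5) - X);  // 31 = 2^5 - 1
  }
}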
- SDValue ShiftedVal = - DAG.getNode(ISD::SHL, SDLoc(N), VT, N->getOperand(0), - DAG.getConstant(VM1.logBase2(), MVT::i64)); - return DAG.getNode(ISD::ADD, SDLoc(N), VT, ShiftedVal, N->getOperand(0)); - } - } - return SDValue(); -} - -static SDValue performIntToFpCombine(SDNode *N, SelectionDAG &DAG) { - EVT VT = N->getValueType(0); - if (VT != MVT::f32 && VT != MVT::f64) - return SDValue(); - // Only optimize when the source and destination types have the same width. - if (VT.getSizeInBits() != N->getOperand(0).getValueType().getSizeInBits()) - return SDValue(); - - // If the result of an integer load is only used by an integer-to-float - // conversion, use a fp load instead and a AdvSIMD scalar {S|U}CVTF instead. - // This eliminates an "integer-to-vector-move UOP and improve throughput. - SDValue N0 = N->getOperand(0); - if (ISD::isNormalLoad(N0.getNode()) && N0.hasOneUse() && - // Do not change the width of a volatile load. - !cast(N0)->isVolatile()) { - LoadSDNode *LN0 = cast(N0); - SDValue Load = DAG.getLoad(VT, SDLoc(N), LN0->getChain(), LN0->getBasePtr(), - LN0->getPointerInfo(), LN0->isVolatile(), - LN0->isNonTemporal(), LN0->isInvariant(), - LN0->getAlignment()); - - // Make sure successors of the original load stay after it by updating them - // to use the new Chain. - DAG.ReplaceAllUsesOfValueWith(SDValue(LN0, 1), Load.getValue(1)); - - unsigned Opcode = - (N->getOpcode() == ISD::SINT_TO_FP) ? ARM64ISD::SITOF : ARM64ISD::UITOF; - return DAG.getNode(Opcode, SDLoc(N), VT, Load); - } - - return SDValue(); -} - -/// An EXTR instruction is made up of two shifts, ORed together. This helper -/// searches for and classifies those shifts. -static bool findEXTRHalf(SDValue N, SDValue &Src, uint32_t &ShiftAmount, - bool &FromHi) { - if (N.getOpcode() == ISD::SHL) - FromHi = false; - else if (N.getOpcode() == ISD::SRL) - FromHi = true; - else - return false; - - if (!isa(N.getOperand(1))) - return false; - - ShiftAmount = N->getConstantOperandVal(1); - Src = N->getOperand(0); - return true; -} - -/// EXTR instruction extracts a contiguous chunk of bits from two existing -/// registers viewed as a high/low pair. This function looks for the pattern: -/// (or (shl VAL1, #N), (srl VAL2, #RegWidth-N)) and replaces it with an -/// EXTR. Can't quite be done in TableGen because the two immediates aren't -/// independent. -static SDValue tryCombineToEXTR(SDNode *N, - TargetLowering::DAGCombinerInfo &DCI) { - SelectionDAG &DAG = DCI.DAG; - SDLoc DL(N); - EVT VT = N->getValueType(0); - - assert(N->getOpcode() == ISD::OR && "Unexpected root"); - - if (VT != MVT::i32 && VT != MVT::i64) - return SDValue(); - - SDValue LHS; - uint32_t ShiftLHS = 0; - bool LHSFromHi = 0; - if (!findEXTRHalf(N->getOperand(0), LHS, ShiftLHS, LHSFromHi)) - return SDValue(); - - SDValue RHS; - uint32_t ShiftRHS = 0; - bool RHSFromHi = 0; - if (!findEXTRHalf(N->getOperand(1), RHS, ShiftRHS, RHSFromHi)) - return SDValue(); - - // If they're both trying to come from the high part of the register, they're - // not really an EXTR. 
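// A self-checking sketch of the identity the EXTR combine relies on:
// (or (shl Hi, N), (srl Lo, RegWidth - N)) selects a contiguous RegWidth-bit
// window out of the Hi:Lo pair, which is what EXTR computes. 32-bit registers
// and the Hi/Lo naming are assumptions of this sketch, not the combine's own.
#include <cassert>
#include <cstdint>

static uint32_t orOfShifts(uint32_t Hi, uint32_t Lo, unsigned N) {
  return (Hi << N) | (Lo >> (32 - N)); // valid for 0 < N < 32; shifts sum to 32
}

static uint32_t extrWindow(uint32_t Hi, uint32_t Lo, unsigned N) {
  uint64_t Pair = (uint64_t(Hi) << 32) | Lo;
  return uint32_t(Pair >> (32 - N));
}

int main() {
  for (unsigned N = 1; N < 32; ++N)
    assert(orOfShifts(0xDEADBEEF, 0x01234567, N) ==
           extrWindow(0xDEADBEEF, 0x01234567, N));
}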
- if (LHSFromHi == RHSFromHi) - return SDValue(); - - if (ShiftLHS + ShiftRHS != VT.getSizeInBits()) - return SDValue(); - - if (LHSFromHi) { - std::swap(LHS, RHS); - std::swap(ShiftLHS, ShiftRHS); - } - - return DAG.getNode(ARM64ISD::EXTR, DL, VT, LHS, RHS, - DAG.getConstant(ShiftRHS, MVT::i64)); -} - -static SDValue tryCombineToBSL(SDNode *N, - TargetLowering::DAGCombinerInfo &DCI) { - EVT VT = N->getValueType(0); - SelectionDAG &DAG = DCI.DAG; - SDLoc DL(N); - - if (!VT.isVector()) - return SDValue(); - - SDValue N0 = N->getOperand(0); - if (N0.getOpcode() != ISD::AND) - return SDValue(); - - SDValue N1 = N->getOperand(1); - if (N1.getOpcode() != ISD::AND) - return SDValue(); - - // We only have to look for constant vectors here since the general, variable - // case can be handled in TableGen. - unsigned Bits = VT.getVectorElementType().getSizeInBits(); - uint64_t BitMask = Bits == 64 ? -1ULL : ((1ULL << Bits) - 1); - for (int i = 1; i >= 0; --i) - for (int j = 1; j >= 0; --j) { - BuildVectorSDNode *BVN0 = dyn_cast(N0->getOperand(i)); - BuildVectorSDNode *BVN1 = dyn_cast(N1->getOperand(j)); - if (!BVN0 || !BVN1) - continue; - - bool FoundMatch = true; - for (unsigned k = 0; k < VT.getVectorNumElements(); ++k) { - ConstantSDNode *CN0 = dyn_cast(BVN0->getOperand(k)); - ConstantSDNode *CN1 = dyn_cast(BVN1->getOperand(k)); - if (!CN0 || !CN1 || - CN0->getZExtValue() != (BitMask & ~CN1->getZExtValue())) { - FoundMatch = false; - break; - } - } - - if (FoundMatch) - return DAG.getNode(ARM64ISD::BSL, DL, VT, SDValue(BVN0, 0), - N0->getOperand(1 - i), N1->getOperand(1 - j)); - } - - return SDValue(); -} - -static SDValue performORCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, - const ARM64Subtarget *Subtarget) { - // Attempt to form an EXTR from (or (shl VAL1, #N), (srl VAL2, #RegWidth-N)) - if (!EnableARM64ExtrGeneration) - return SDValue(); - SelectionDAG &DAG = DCI.DAG; - EVT VT = N->getValueType(0); - - if (!DAG.getTargetLoweringInfo().isTypeLegal(VT)) - return SDValue(); - - SDValue Res = tryCombineToEXTR(N, DCI); - if (Res.getNode()) - return Res; - - Res = tryCombineToBSL(N, DCI); - if (Res.getNode()) - return Res; - - return SDValue(); -} - -static SDValue performBitcastCombine(SDNode *N, - TargetLowering::DAGCombinerInfo &DCI, - SelectionDAG &DAG) { - // Wait 'til after everything is legalized to try this. That way we have - // legal vector types and such. - if (DCI.isBeforeLegalizeOps()) - return SDValue(); - - // Remove extraneous bitcasts around an extract_subvector. - // For example, - // (v4i16 (bitconvert - // (extract_subvector (v2i64 (bitconvert (v8i16 ...)), (i64 1))))) - // becomes - // (extract_subvector ((v8i16 ...), (i64 4))) - - // Only interested in 64-bit vectors as the ultimate result. - EVT VT = N->getValueType(0); - if (!VT.isVector()) - return SDValue(); - if (VT.getSimpleVT().getSizeInBits() != 64) - return SDValue(); - // Is the operand an extract_subvector starting at the beginning or halfway - // point of the vector? A low half may also come through as an - // EXTRACT_SUBREG, so look for that, too. 
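// A byte-level sketch of the simplification described above, matching the
// comment's own example: taking lane 1 of the v2i64 view of a v8i16 and
// re-viewing it as v4i16 is just elements 4..7 of the original vector, so the
// bitcasts can be dropped and the extract index rescaled (1 * 4 = 4). Plain
// memcpy type punning stands in for the bitcasts; memory-order lane numbering
// is assumed.
#include <array>
#include <cassert>
#include <cstdint>
#include <cstring>

int main() {
  std::array<uint16_t, 8> V = {0, 1, 2, 3, 4, 5, 6, 7}; // the v8i16 source
  uint64_t Lane1;                                       // v2i64 view, lane 1
  std::memcpy(&Lane1, V.data() + 4, sizeof(Lane1));
  std::array<uint16_t, 4> HiHalf;                       // v4i16 view of that lane
  std::memcpy(HiHalf.data(), &Lane1, sizeof(Lane1));
  assert((HiHalf == std::array<uint16_t, 4>{4, 5, 6, 7}));
}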
- SDValue Op0 = N->getOperand(0); - if (Op0->getOpcode() != ISD::EXTRACT_SUBVECTOR && - !(Op0->isMachineOpcode() && - Op0->getMachineOpcode() == ARM64::EXTRACT_SUBREG)) - return SDValue(); - uint64_t idx = cast(Op0->getOperand(1))->getZExtValue(); - if (Op0->getOpcode() == ISD::EXTRACT_SUBVECTOR) { - if (Op0->getValueType(0).getVectorNumElements() != idx && idx != 0) - return SDValue(); - } else if (Op0->getMachineOpcode() == ARM64::EXTRACT_SUBREG) { - if (idx != ARM64::dsub) - return SDValue(); - // The dsub reference is equivalent to a lane zero subvector reference. - idx = 0; - } - // Look through the bitcast of the input to the extract. - if (Op0->getOperand(0)->getOpcode() != ISD::BITCAST) - return SDValue(); - SDValue Source = Op0->getOperand(0)->getOperand(0); - // If the source type has twice the number of elements as our destination - // type, we know this is an extract of the high or low half of the vector. - EVT SVT = Source->getValueType(0); - if (SVT.getVectorNumElements() != VT.getVectorNumElements() * 2) - return SDValue(); - - DEBUG(dbgs() << "arm64-lower: bitcast extract_subvector simplification\n"); - - // Create the simplified form to just extract the low or high half of the - // vector directly rather than bothering with the bitcasts. - SDLoc dl(N); - unsigned NumElements = VT.getVectorNumElements(); - if (idx) { - SDValue HalfIdx = DAG.getConstant(NumElements, MVT::i64); - return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, Source, HalfIdx); - } else { - SDValue SubReg = DAG.getTargetConstant(ARM64::dsub, MVT::i32); - return SDValue(DAG.getMachineNode(TargetOpcode::EXTRACT_SUBREG, dl, VT, - Source, SubReg), - 0); - } -} - -static SDValue performConcatVectorsCombine(SDNode *N, - TargetLowering::DAGCombinerInfo &DCI, - SelectionDAG &DAG) { - // Wait 'til after everything is legalized to try this. That way we have - // legal vector types and such. - if (DCI.isBeforeLegalizeOps()) - return SDValue(); - - SDLoc dl(N); - EVT VT = N->getValueType(0); - - // If we see a (concat_vectors (v1x64 A), (v1x64 A)) it's really a vector - // splat. The indexed instructions are going to be expecting a DUPLANE64, so - // canonicalise to that. - if (N->getOperand(0) == N->getOperand(1) && VT.getVectorNumElements() == 2) { - assert(VT.getVectorElementType().getSizeInBits() == 64); - return DAG.getNode(ARM64ISD::DUPLANE64, dl, VT, - WidenVector(N->getOperand(0), DAG), - DAG.getConstant(0, MVT::i64)); - } - - // Canonicalise concat_vectors so that the right-hand vector has as few - // bit-casts as possible before its real operation. The primary matching - // destination for these operations will be the narrowing "2" instructions, - // which depend on the operation being performed on this right-hand vector. - // For example, - // (concat_vectors LHS, (v1i64 (bitconvert (v4i16 RHS)))) - // becomes - // (bitconvert (concat_vectors (v4i16 (bitconvert LHS)), RHS)) - - SDValue Op1 = N->getOperand(1); - if (Op1->getOpcode() != ISD::BITCAST) - return SDValue(); - SDValue RHS = Op1->getOperand(0); - MVT RHSTy = RHS.getValueType().getSimpleVT(); - // If the RHS is not a vector, this is not the pattern we're looking for. 
- if (!RHSTy.isVector()) - return SDValue(); - - DEBUG(dbgs() << "arm64-lower: concat_vectors bitcast simplification\n"); - - MVT ConcatTy = MVT::getVectorVT(RHSTy.getVectorElementType(), - RHSTy.getVectorNumElements() * 2); - return DAG.getNode( - ISD::BITCAST, dl, VT, - DAG.getNode(ISD::CONCAT_VECTORS, dl, ConcatTy, - DAG.getNode(ISD::BITCAST, dl, RHSTy, N->getOperand(0)), RHS)); -} - -static SDValue tryCombineFixedPointConvert(SDNode *N, - TargetLowering::DAGCombinerInfo &DCI, - SelectionDAG &DAG) { - // Wait 'til after everything is legalized to try this. That way we have - // legal vector types and such. - if (DCI.isBeforeLegalizeOps()) - return SDValue(); - // Transform a scalar conversion of a value from a lane extract into a - // lane extract of a vector conversion. E.g., from foo1 to foo2: - // double foo1(int64x2_t a) { return vcvtd_n_f64_s64(a[1], 9); } - // double foo2(int64x2_t a) { return vcvtq_n_f64_s64(a, 9)[1]; } - // - // The second form interacts better with instruction selection and the - // register allocator to avoid cross-class register copies that aren't - // coalescable due to a lane reference. - - // Check the operand and see if it originates from a lane extract. - SDValue Op1 = N->getOperand(1); - if (Op1.getOpcode() == ISD::EXTRACT_VECTOR_ELT) { - // Yep, no additional predication needed. Perform the transform. - SDValue IID = N->getOperand(0); - SDValue Shift = N->getOperand(2); - SDValue Vec = Op1.getOperand(0); - SDValue Lane = Op1.getOperand(1); - EVT ResTy = N->getValueType(0); - EVT VecResTy; - SDLoc DL(N); - - // The vector width should be 128 bits by the time we get here, even - // if it started as 64 bits (the extract_vector handling will have - // done so). - assert(Vec.getValueType().getSizeInBits() == 128 && - "unexpected vector size on extract_vector_elt!"); - if (Vec.getValueType() == MVT::v4i32) - VecResTy = MVT::v4f32; - else if (Vec.getValueType() == MVT::v2i64) - VecResTy = MVT::v2f64; - else - assert(0 && "unexpected vector type!"); - - SDValue Convert = - DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VecResTy, IID, Vec, Shift); - return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ResTy, Convert, Lane); - } - return SDValue(); -} - -// AArch64 high-vector "long" operations are formed by performing the non-high -// version on an extract_subvector of each operand which gets the high half: -// -// (longop2 LHS, RHS) == (longop (extract_high LHS), (extract_high RHS)) -// -// However, there are cases which don't have an extract_high explicitly, but -// have another operation that can be made compatible with one for free. For -// example: -// -// (dupv64 scalar) --> (extract_high (dup128 scalar)) -// -// This routine does the actual conversion of such DUPs, once outer routines -// have determined that everything else is in order. -static SDValue tryExtendDUPToExtractHigh(SDValue N, SelectionDAG &DAG) { - // We can handle most types of duplicate, but the lane ones have an extra - // operand saying *which* lane, so we need to know. 
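// The (dupv64 scalar) --> (extract_high (dup128 scalar)) rewrite described
// above is lossless because a splat looks the same in either half of the wide
// register. A plain-array illustration; std::array stands in for the NEON
// registers and the element types are illustrative only.
#include <algorithm>
#include <array>
#include <cassert>
#include <cstdint>

int main() {
  const uint16_t S = 42;
  std::array<uint16_t, 4> Dup64;
  Dup64.fill(S);                                  // dup into a v4i16
  std::array<uint16_t, 8> Dup128;
  Dup128.fill(S);                                 // dup into a v8i16 instead
  std::array<uint16_t, 4> High;                   // then take the high half
  std::copy(Dup128.begin() + 4, Dup128.end(), High.begin());
  assert(High == Dup64);                          // same lanes either way
}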
- bool IsDUPLANE; - switch (N.getOpcode()) { - case ARM64ISD::DUP: - IsDUPLANE = false; - break; - case ARM64ISD::DUPLANE8: - case ARM64ISD::DUPLANE16: - case ARM64ISD::DUPLANE32: - case ARM64ISD::DUPLANE64: - IsDUPLANE = true; - break; - default: - return SDValue(); - } - - MVT NarrowTy = N.getSimpleValueType(); - if (!NarrowTy.is64BitVector()) - return SDValue(); - - MVT ElementTy = NarrowTy.getVectorElementType(); - unsigned NumElems = NarrowTy.getVectorNumElements(); - MVT NewDUPVT = MVT::getVectorVT(ElementTy, NumElems * 2); - - SDValue NewDUP; - if (IsDUPLANE) - NewDUP = DAG.getNode(N.getOpcode(), SDLoc(N), NewDUPVT, N.getOperand(0), - N.getOperand(1)); - else - NewDUP = DAG.getNode(ARM64ISD::DUP, SDLoc(N), NewDUPVT, N.getOperand(0)); - - return DAG.getNode(ISD::EXTRACT_SUBVECTOR, SDLoc(N.getNode()), NarrowTy, - NewDUP, DAG.getConstant(NumElems, MVT::i64)); -} - -static bool isEssentiallyExtractSubvector(SDValue N) { - if (N.getOpcode() == ISD::EXTRACT_SUBVECTOR) - return true; - - return N.getOpcode() == ISD::BITCAST && - N.getOperand(0).getOpcode() == ISD::EXTRACT_SUBVECTOR; -} - -/// \brief Helper structure to keep track of ISD::SET_CC operands. -struct GenericSetCCInfo { - const SDValue *Opnd0; - const SDValue *Opnd1; - ISD::CondCode CC; -}; - -/// \brief Helper structure to keep track of a SET_CC lowered into ARM64 code. -struct ARM64SetCCInfo { - const SDValue *Cmp; - ARM64CC::CondCode CC; -}; - -/// \brief Helper structure to keep track of SetCC information. -union SetCCInfo { - GenericSetCCInfo Generic; - ARM64SetCCInfo ARM64; -}; - -/// \brief Helper structure to be able to read SetCC information. -/// If set to true, IsARM64 field, Info is a ARM64SetCCInfo, otherwise Info is -/// a GenericSetCCInfo. -struct SetCCInfoAndKind { - SetCCInfo Info; - bool IsARM64; -}; - -/// \brief Check whether or not \p Op is a SET_CC operation, either a generic or -/// an -/// ARM64 lowered one. -/// \p SetCCInfo is filled accordingly. -/// \post SetCCInfo is meanginfull only when this function returns true. -/// \return True when Op is a kind of SET_CC operation. -static bool isSetCC(SDValue Op, SetCCInfoAndKind &SetCCInfo) { - // If this is a setcc, this is straight forward. - if (Op.getOpcode() == ISD::SETCC) { - SetCCInfo.Info.Generic.Opnd0 = &Op.getOperand(0); - SetCCInfo.Info.Generic.Opnd1 = &Op.getOperand(1); - SetCCInfo.Info.Generic.CC = cast(Op.getOperand(2))->get(); - SetCCInfo.IsARM64 = false; - return true; - } - // Otherwise, check if this is a matching csel instruction. - // In other words: - // - csel 1, 0, cc - // - csel 0, 1, !cc - if (Op.getOpcode() != ARM64ISD::CSEL) - return false; - // Set the information about the operands. - // TODO: we want the operands of the Cmp not the csel - SetCCInfo.Info.ARM64.Cmp = &Op.getOperand(3); - SetCCInfo.IsARM64 = true; - SetCCInfo.Info.ARM64.CC = static_cast( - cast(Op.getOperand(2))->getZExtValue()); - - // Check that the operands matches the constraints: - // (1) Both operands must be constants. - // (2) One must be 1 and the other must be 0. - ConstantSDNode *TValue = dyn_cast(Op.getOperand(0)); - ConstantSDNode *FValue = dyn_cast(Op.getOperand(1)); - - // Check (1). - if (!TValue || !FValue) - return false; - - // Check (2). - if (!TValue->isOne()) { - // Update the comparison when we are interested in !cc. - std::swap(TValue, FValue); - SetCCInfo.Info.ARM64.CC = - ARM64CC::getInvertedCondCode(SetCCInfo.Info.ARM64.CC); - } - return TValue->isOne() && FValue->isNullValue(); -} - -// Returns true if Op is setcc or zext of setcc. 
-static bool isSetCCOrZExtSetCC(const SDValue& Op, SetCCInfoAndKind &Info) { - if (isSetCC(Op, Info)) - return true; - return ((Op.getOpcode() == ISD::ZERO_EXTEND) && - isSetCC(Op->getOperand(0), Info)); -} - -// The folding we want to perform is: -// (add x, [zext] (setcc cc ...) ) -// --> -// (csel x, (add x, 1), !cc ...) -// -// The latter will get matched to a CSINC instruction. -static SDValue performSetccAddFolding(SDNode *Op, SelectionDAG &DAG) { - assert(Op && Op->getOpcode() == ISD::ADD && "Unexpected operation!"); - SDValue LHS = Op->getOperand(0); - SDValue RHS = Op->getOperand(1); - SetCCInfoAndKind InfoAndKind; - - // If neither operand is a SET_CC, give up. - if (!isSetCCOrZExtSetCC(LHS, InfoAndKind)) { - std::swap(LHS, RHS); - if (!isSetCCOrZExtSetCC(LHS, InfoAndKind)) - return SDValue(); - } - - // FIXME: This could be generatized to work for FP comparisons. - EVT CmpVT = InfoAndKind.IsARM64 - ? InfoAndKind.Info.ARM64.Cmp->getOperand(0).getValueType() - : InfoAndKind.Info.Generic.Opnd0->getValueType(); - if (CmpVT != MVT::i32 && CmpVT != MVT::i64) - return SDValue(); - - SDValue CCVal; - SDValue Cmp; - SDLoc dl(Op); - if (InfoAndKind.IsARM64) { - CCVal = DAG.getConstant( - ARM64CC::getInvertedCondCode(InfoAndKind.Info.ARM64.CC), MVT::i32); - Cmp = *InfoAndKind.Info.ARM64.Cmp; - } else - Cmp = getARM64Cmp(*InfoAndKind.Info.Generic.Opnd0, - *InfoAndKind.Info.Generic.Opnd1, - ISD::getSetCCInverse(InfoAndKind.Info.Generic.CC, true), - CCVal, DAG, dl); - - EVT VT = Op->getValueType(0); - LHS = DAG.getNode(ISD::ADD, dl, VT, RHS, DAG.getConstant(1, VT)); - return DAG.getNode(ARM64ISD::CSEL, dl, VT, RHS, LHS, CCVal, Cmp); -} - -// The basic add/sub long vector instructions have variants with "2" on the end -// which act on the high-half of their inputs. They are normally matched by -// patterns like: -// -// (add (zeroext (extract_high LHS)), -// (zeroext (extract_high RHS))) -// -> uaddl2 vD, vN, vM -// -// However, if one of the extracts is something like a duplicate, this -// instruction can still be used profitably. This function puts the DAG into a -// more appropriate form for those patterns to trigger. -static SDValue performAddSubLongCombine(SDNode *N, - TargetLowering::DAGCombinerInfo &DCI, - SelectionDAG &DAG) { - if (DCI.isBeforeLegalizeOps()) - return SDValue(); - - MVT VT = N->getSimpleValueType(0); - if (!VT.is128BitVector()) { - if (N->getOpcode() == ISD::ADD) - return performSetccAddFolding(N, DAG); - return SDValue(); - } - - // Make sure both branches are extended in the same way. - SDValue LHS = N->getOperand(0); - SDValue RHS = N->getOperand(1); - if ((LHS.getOpcode() != ISD::ZERO_EXTEND && - LHS.getOpcode() != ISD::SIGN_EXTEND) || - LHS.getOpcode() != RHS.getOpcode()) - return SDValue(); - - unsigned ExtType = LHS.getOpcode(); - - // It's not worth doing if at least one of the inputs isn't already an - // extract, but we don't know which it'll be so we have to try both. 
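For context (illustrative only, not part of the patch), performSetccAddFolding above targets source along these lines; the function name is invented and the quoted assembly is the combine's intended outcome rather than a guarantee.

// Illustrative sketch: adding a comparison result to an integer. The folding
// aims to select this as
//   cmp  w1, w2
//   cinc w0, w0, eq      // a CSINC, instead of cset + add
int add_compare_result(int x, int a, int b) {
  return x + (a == b);
}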
- if (isEssentiallyExtractSubvector(LHS.getOperand(0))) { - RHS = tryExtendDUPToExtractHigh(RHS.getOperand(0), DAG); - if (!RHS.getNode()) - return SDValue(); - - RHS = DAG.getNode(ExtType, SDLoc(N), VT, RHS); - } else if (isEssentiallyExtractSubvector(RHS.getOperand(0))) { - LHS = tryExtendDUPToExtractHigh(LHS.getOperand(0), DAG); - if (!LHS.getNode()) - return SDValue(); - - LHS = DAG.getNode(ExtType, SDLoc(N), VT, LHS); - } - - return DAG.getNode(N->getOpcode(), SDLoc(N), VT, LHS, RHS); -} - -// Massage DAGs which we can use the high-half "long" operations on into -// something isel will recognize better. E.g. -// -// (arm64_neon_umull (extract_high vec) (dupv64 scalar)) --> -// (arm64_neon_umull (extract_high (v2i64 vec))) -// (extract_high (v2i64 (dup128 scalar))))) -// -static SDValue tryCombineLongOpWithDup(unsigned IID, SDNode *N, - TargetLowering::DAGCombinerInfo &DCI, - SelectionDAG &DAG) { - if (DCI.isBeforeLegalizeOps()) - return SDValue(); - - SDValue LHS = N->getOperand(1); - SDValue RHS = N->getOperand(2); - assert(LHS.getValueType().is64BitVector() && - RHS.getValueType().is64BitVector() && - "unexpected shape for long operation"); - - // Either node could be a DUP, but it's not worth doing both of them (you'd - // just as well use the non-high version) so look for a corresponding extract - // operation on the other "wing". - if (isEssentiallyExtractSubvector(LHS)) { - RHS = tryExtendDUPToExtractHigh(RHS, DAG); - if (!RHS.getNode()) - return SDValue(); - } else if (isEssentiallyExtractSubvector(RHS)) { - LHS = tryExtendDUPToExtractHigh(LHS, DAG); - if (!LHS.getNode()) - return SDValue(); - } - - return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SDLoc(N), N->getValueType(0), - N->getOperand(0), LHS, RHS); -} - -static SDValue tryCombineShiftImm(unsigned IID, SDNode *N, SelectionDAG &DAG) { - MVT ElemTy = N->getSimpleValueType(0).getScalarType(); - unsigned ElemBits = ElemTy.getSizeInBits(); - - int64_t ShiftAmount; - if (BuildVectorSDNode *BVN = dyn_cast(N->getOperand(2))) { - APInt SplatValue, SplatUndef; - unsigned SplatBitSize; - bool HasAnyUndefs; - if (!BVN->isConstantSplat(SplatValue, SplatUndef, SplatBitSize, - HasAnyUndefs, ElemBits) || - SplatBitSize != ElemBits) - return SDValue(); - - ShiftAmount = SplatValue.getSExtValue(); - } else if (ConstantSDNode *CVN = dyn_cast(N->getOperand(2))) { - ShiftAmount = CVN->getSExtValue(); - } else - return SDValue(); - - unsigned Opcode; - bool IsRightShift; - switch (IID) { - default: - llvm_unreachable("Unknown shift intrinsic"); - case Intrinsic::arm64_neon_sqshl: - Opcode = ARM64ISD::SQSHL_I; - IsRightShift = false; - break; - case Intrinsic::arm64_neon_uqshl: - Opcode = ARM64ISD::UQSHL_I; - IsRightShift = false; - break; - case Intrinsic::arm64_neon_srshl: - Opcode = ARM64ISD::SRSHR_I; - IsRightShift = true; - break; - case Intrinsic::arm64_neon_urshl: - Opcode = ARM64ISD::URSHR_I; - IsRightShift = true; - break; - case Intrinsic::arm64_neon_sqshlu: - Opcode = ARM64ISD::SQSHLU_I; - IsRightShift = false; - break; - } - - if (IsRightShift && ShiftAmount <= -1 && ShiftAmount >= -(int)ElemBits) - return DAG.getNode(Opcode, SDLoc(N), N->getValueType(0), N->getOperand(1), - DAG.getConstant(-ShiftAmount, MVT::i32)); - else if (!IsRightShift && ShiftAmount >= 0 && ShiftAmount <= ElemBits) - return DAG.getNode(Opcode, SDLoc(N), N->getValueType(0), N->getOperand(1), - DAG.getConstant(ShiftAmount, MVT::i32)); - - return SDValue(); -} - -// The CRC32[BH] instructions ignore the high bits of their data operand. 
Since -// the intrinsics must be legal and take an i32, this means there's almost -// certainly going to be a zext in the DAG which we can eliminate. -static SDValue tryCombineCRC32(unsigned Mask, SDNode *N, SelectionDAG &DAG) { - SDValue AndN = N->getOperand(2); - if (AndN.getOpcode() != ISD::AND) - return SDValue(); - - ConstantSDNode *CMask = dyn_cast(AndN.getOperand(1)); - if (!CMask || CMask->getZExtValue() != Mask) - return SDValue(); - - return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SDLoc(N), MVT::i32, - N->getOperand(0), N->getOperand(1), AndN.getOperand(0)); -} - -static SDValue performIntrinsicCombine(SDNode *N, - TargetLowering::DAGCombinerInfo &DCI, - const ARM64Subtarget *Subtarget) { - SelectionDAG &DAG = DCI.DAG; - unsigned IID = getIntrinsicID(N); - switch (IID) { - default: - break; - case Intrinsic::arm64_neon_vcvtfxs2fp: - case Intrinsic::arm64_neon_vcvtfxu2fp: - return tryCombineFixedPointConvert(N, DCI, DAG); - break; - case Intrinsic::arm64_neon_fmax: - return DAG.getNode(ARM64ISD::FMAX, SDLoc(N), N->getValueType(0), - N->getOperand(1), N->getOperand(2)); - case Intrinsic::arm64_neon_fmin: - return DAG.getNode(ARM64ISD::FMIN, SDLoc(N), N->getValueType(0), - N->getOperand(1), N->getOperand(2)); - case Intrinsic::arm64_neon_smull: - case Intrinsic::arm64_neon_umull: - case Intrinsic::arm64_neon_pmull: - case Intrinsic::arm64_neon_sqdmull: - return tryCombineLongOpWithDup(IID, N, DCI, DAG); - case Intrinsic::arm64_neon_sqshl: - case Intrinsic::arm64_neon_uqshl: - case Intrinsic::arm64_neon_sqshlu: - case Intrinsic::arm64_neon_srshl: - case Intrinsic::arm64_neon_urshl: - return tryCombineShiftImm(IID, N, DAG); - case Intrinsic::arm64_crc32b: - case Intrinsic::arm64_crc32cb: - return tryCombineCRC32(0xff, N, DAG); - case Intrinsic::arm64_crc32h: - case Intrinsic::arm64_crc32ch: - return tryCombineCRC32(0xffff, N, DAG); - } - return SDValue(); -} - -static SDValue performExtendCombine(SDNode *N, - TargetLowering::DAGCombinerInfo &DCI, - SelectionDAG &DAG) { - // If we see something like (zext (sabd (extract_high ...), (DUP ...))) then - // we can convert that DUP into another extract_high (of a bigger DUP), which - // helps the backend to decide that an sabdl2 would be useful, saving a real - // extract_high operation. - if (!DCI.isBeforeLegalizeOps() && N->getOpcode() == ISD::ZERO_EXTEND && - N->getOperand(0).getOpcode() == ISD::INTRINSIC_WO_CHAIN) { - SDNode *ABDNode = N->getOperand(0).getNode(); - unsigned IID = getIntrinsicID(ABDNode); - if (IID == Intrinsic::arm64_neon_sabd || - IID == Intrinsic::arm64_neon_uabd) { - SDValue NewABD = tryCombineLongOpWithDup(IID, ABDNode, DCI, DAG); - if (!NewABD.getNode()) - return SDValue(); - - return DAG.getNode(ISD::ZERO_EXTEND, SDLoc(N), N->getValueType(0), - NewABD); - } - } - - // This is effectively a custom type legalization for ARM64. - // - // Type legalization will split an extend of a small, legal, type to a larger - // illegal type by first splitting the destination type, often creating - // illegal source types, which then get legalized in isel-confusing ways, - // leading to really terrible codegen. E.g., - // %result = v8i32 sext v8i8 %value - // becomes - // %losrc = extract_subreg %value, ... - // %hisrc = extract_subreg %value, ... - // %lo = v4i32 sext v4i8 %losrc - // %hi = v4i32 sext v4i8 %hisrc - // Things go rapidly downhill from there. 
- // - // For ARM64, the [sz]ext vector instructions can only go up one element - // size, so we can, e.g., extend from i8 to i16, but to go from i8 to i32 - // take two instructions. - // - // This implies that the most efficient way to do the extend from v8i8 - // to two v4i32 values is to first extend the v8i8 to v8i16, then do - // the normal splitting to happen for the v8i16->v8i32. - - // This is pre-legalization to catch some cases where the default - // type legalization will create ill-tempered code. - if (!DCI.isBeforeLegalizeOps()) - return SDValue(); - - // We're only interested in cleaning things up for non-legal vector types - // here. If both the source and destination are legal, things will just - // work naturally without any fiddling. - const TargetLowering &TLI = DAG.getTargetLoweringInfo(); - EVT ResVT = N->getValueType(0); - if (!ResVT.isVector() || TLI.isTypeLegal(ResVT)) - return SDValue(); - // If the vector type isn't a simple VT, it's beyond the scope of what - // we're worried about here. Let legalization do its thing and hope for - // the best. - if (!ResVT.isSimple()) - return SDValue(); - - SDValue Src = N->getOperand(0); - MVT SrcVT = Src->getValueType(0).getSimpleVT(); - // If the source VT is a 64-bit vector, we can play games and get the - // better results we want. - if (SrcVT.getSizeInBits() != 64) - return SDValue(); - - unsigned SrcEltSize = SrcVT.getVectorElementType().getSizeInBits(); - unsigned ElementCount = SrcVT.getVectorNumElements(); - SrcVT = MVT::getVectorVT(MVT::getIntegerVT(SrcEltSize * 2), ElementCount); - SDLoc DL(N); - Src = DAG.getNode(N->getOpcode(), DL, SrcVT, Src); - - // Now split the rest of the operation into two halves, each with a 64 - // bit source. - EVT LoVT, HiVT; - SDValue Lo, Hi; - unsigned NumElements = ResVT.getVectorNumElements(); - assert(!(NumElements & 1) && "Splitting vector, but not in half!"); - LoVT = HiVT = EVT::getVectorVT(*DAG.getContext(), - ResVT.getVectorElementType(), NumElements / 2); - - EVT InNVT = EVT::getVectorVT(*DAG.getContext(), SrcVT.getVectorElementType(), - LoVT.getVectorNumElements()); - Lo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, InNVT, Src, - DAG.getIntPtrConstant(0)); - Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, InNVT, Src, - DAG.getIntPtrConstant(InNVT.getVectorNumElements())); - Lo = DAG.getNode(N->getOpcode(), DL, LoVT, Lo); - Hi = DAG.getNode(N->getOpcode(), DL, HiVT, Hi); - - // Now combine the parts back together so we still have a single result - // like the combiner expects. - return DAG.getNode(ISD::CONCAT_VECTORS, DL, ResVT, Lo, Hi); -} - -/// Replace a splat of a scalar to a vector store by scalar stores of the scalar -/// value. The load store optimizer pass will merge them to store pair stores. -/// This has better performance than a splat of the scalar followed by a split -/// vector store. Even if the stores are not merged it is four stores vs a dup, -/// followed by an ext.b and two stores. -static SDValue replaceSplatVectorStore(SelectionDAG &DAG, StoreSDNode *St) { - SDValue StVal = St->getValue(); - EVT VT = StVal.getValueType(); - - // Don't replace floating point stores, they possibly won't be transformed to - // stp because of the store pair suppress pass. - if (VT.isFloatingPoint()) - return SDValue(); - - // Check for insert vector elements. - if (StVal.getOpcode() != ISD::INSERT_VECTOR_ELT) - return SDValue(); - - // We can express a splat as store pair(s) for 2 or 4 elements. 
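As an illustrative aside (not from the patch), replaceSplatVectorStore above is aimed at stores like the following sketch; whether this exact snippet reaches the combine depends on how the splat is legalised into insert_vector_elt nodes, so treat it as an assumption.

#include <arm_neon.h>

// Illustrative sketch: storing a splat of a GPR value. The combine wants this
// to become scalar stores that the load/store optimizer can merge into stp,
// rather than dup + str of a vector register.
void store_splat(int32_t *p, int32_t v) {
  vst1q_s32(p, vdupq_n_s32(v));
}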
- unsigned NumVecElts = VT.getVectorNumElements(); - if (NumVecElts != 4 && NumVecElts != 2) - return SDValue(); - SDValue SplatVal = StVal.getOperand(1); - unsigned RemainInsertElts = NumVecElts - 1; - - // Check that this is a splat. - while (--RemainInsertElts) { - SDValue NextInsertElt = StVal.getOperand(0); - if (NextInsertElt.getOpcode() != ISD::INSERT_VECTOR_ELT) - return SDValue(); - if (NextInsertElt.getOperand(1) != SplatVal) - return SDValue(); - StVal = NextInsertElt; - } - unsigned OrigAlignment = St->getAlignment(); - unsigned EltOffset = NumVecElts == 4 ? 4 : 8; - unsigned Alignment = std::min(OrigAlignment, EltOffset); - - // Create scalar stores. This is at least as good as the code sequence for a - // split unaligned store wich is a dup.s, ext.b, and two stores. - // Most of the time the three stores should be replaced by store pair - // instructions (stp). - SDLoc DL(St); - SDValue BasePtr = St->getBasePtr(); - SDValue NewST1 = - DAG.getStore(St->getChain(), DL, SplatVal, BasePtr, St->getPointerInfo(), - St->isVolatile(), St->isNonTemporal(), St->getAlignment()); - - unsigned Offset = EltOffset; - while (--NumVecElts) { - SDValue OffsetPtr = DAG.getNode(ISD::ADD, DL, MVT::i64, BasePtr, - DAG.getConstant(Offset, MVT::i64)); - NewST1 = DAG.getStore(NewST1.getValue(0), DL, SplatVal, OffsetPtr, - St->getPointerInfo(), St->isVolatile(), - St->isNonTemporal(), Alignment); - Offset += EltOffset; - } - return NewST1; -} - -static SDValue performSTORECombine(SDNode *N, - TargetLowering::DAGCombinerInfo &DCI, - SelectionDAG &DAG, - const ARM64Subtarget *Subtarget) { - if (!DCI.isBeforeLegalize()) - return SDValue(); - - StoreSDNode *S = cast(N); - if (S->isVolatile()) - return SDValue(); - - // Cyclone has bad performance on unaligned 16B stores when crossing line and - // page boundries. We want to split such stores. - if (!Subtarget->isCyclone()) - return SDValue(); - - // Don't split at Oz. - MachineFunction &MF = DAG.getMachineFunction(); - bool IsMinSize = MF.getFunction()->getAttributes().hasAttribute( - AttributeSet::FunctionIndex, Attribute::MinSize); - if (IsMinSize) - return SDValue(); - - SDValue StVal = S->getValue(); - EVT VT = StVal.getValueType(); - - // Don't split v2i64 vectors. Memcpy lowering produces those and splitting - // those up regresses performance on micro-benchmarks and olden/bh. - if (!VT.isVector() || VT.getVectorNumElements() < 2 || VT == MVT::v2i64) - return SDValue(); - - // Split unaligned 16B stores. They are terrible for performance. - // Don't split stores with alignment of 1 or 2. Code that uses clang vector - // extensions can use this to mark that it does not want splitting to happen - // (by underspecifying alignment to be 1 or 2). Furthermore, the chance of - // eliminating alignment hazards is only 1 in 8 for alignment of 2. - if (VT.getSizeInBits() != 128 || S->getAlignment() >= 16 || - S->getAlignment() <= 2) - return SDValue(); - - // If we get a splat of a scalar convert this vector store to a store of - // scalars. They will be merged into store pairs thereby removing two - // instructions. - SDValue ReplacedSplat = replaceSplatVectorStore(DAG, S); - if (ReplacedSplat != SDValue()) - return ReplacedSplat; - - SDLoc DL(S); - unsigned NumElts = VT.getVectorNumElements() / 2; - // Split VT into two. 
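Illustrative sketch (not part of the patch): the 16-byte store split above targets accesses like this one, assuming the pointer is only element-aligned; the name is invented.

#include <arm_neon.h>

// Illustrative sketch: a 128-bit store through a pointer known to be only
// 4-byte aligned. On Cyclone the combine prefers two 64-bit stores so the
// access cannot straddle a cache-line or page boundary.
void store_16_bytes(uint32_t *p /* only 4-byte aligned */, uint32x4_t v) {
  vst1q_u32(p, v);
}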
- EVT HalfVT = - EVT::getVectorVT(*DAG.getContext(), VT.getVectorElementType(), NumElts); - SDValue SubVector0 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, StVal, - DAG.getIntPtrConstant(0)); - SDValue SubVector1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, StVal, - DAG.getIntPtrConstant(NumElts)); - SDValue BasePtr = S->getBasePtr(); - SDValue NewST1 = - DAG.getStore(S->getChain(), DL, SubVector0, BasePtr, S->getPointerInfo(), - S->isVolatile(), S->isNonTemporal(), S->getAlignment()); - SDValue OffsetPtr = DAG.getNode(ISD::ADD, DL, MVT::i64, BasePtr, - DAG.getConstant(8, MVT::i64)); - return DAG.getStore(NewST1.getValue(0), DL, SubVector1, OffsetPtr, - S->getPointerInfo(), S->isVolatile(), S->isNonTemporal(), - S->getAlignment()); -} - -/// Target-specific DAG combine function for post-increment LD1 (lane) and -/// post-increment LD1R. -static SDValue performPostLD1Combine(SDNode *N, - TargetLowering::DAGCombinerInfo &DCI, - bool IsLaneOp) { - if (DCI.isBeforeLegalizeOps()) - return SDValue(); - - SelectionDAG &DAG = DCI.DAG; - EVT VT = N->getValueType(0); - - unsigned LoadIdx = IsLaneOp ? 1 : 0; - SDNode *LD = N->getOperand(LoadIdx).getNode(); - // If it is not LOAD, can not do such combine. - if (LD->getOpcode() != ISD::LOAD) - return SDValue(); - - LoadSDNode *LoadSDN = cast(LD); - EVT MemVT = LoadSDN->getMemoryVT(); - // Check if memory operand is the same type as the vector element. - if (MemVT != VT.getVectorElementType()) - return SDValue(); - - // Check if there are other uses. If so, do not combine as it will introduce - // an extra load. - for (SDNode::use_iterator UI = LD->use_begin(), UE = LD->use_end(); UI != UE; - ++UI) { - if (UI.getUse().getResNo() == 1) // Ignore uses of the chain result. - continue; - if (*UI != N) - return SDValue(); - } - - SDValue Addr = LD->getOperand(1); - SDValue Vector = N->getOperand(0); - // Search for a use of the address operand that is an increment. - for (SDNode::use_iterator UI = Addr.getNode()->use_begin(), UE = - Addr.getNode()->use_end(); UI != UE; ++UI) { - SDNode *User = *UI; - if (User->getOpcode() != ISD::ADD - || UI.getUse().getResNo() != Addr.getResNo()) - continue; - - // Check that the add is independent of the load. Otherwise, folding it - // would create a cycle. - if (User->isPredecessorOf(LD) || LD->isPredecessorOf(User)) - continue; - // Also check that add is not used in the vector operand. This would also - // create a cycle. - if (User->isPredecessorOf(Vector.getNode())) - continue; - - // If the increment is a constant, it must match the memory ref size. - SDValue Inc = User->getOperand(User->getOperand(0) == Addr ? 1 : 0); - if (ConstantSDNode *CInc = dyn_cast(Inc.getNode())) { - uint32_t IncVal = CInc->getZExtValue(); - unsigned NumBytes = VT.getScalarSizeInBits() / 8; - if (IncVal != NumBytes) - continue; - Inc = DAG.getRegister(ARM64::XZR, MVT::i64); - } - - SmallVector Ops; - Ops.push_back(LD->getOperand(0)); // Chain - if (IsLaneOp) { - Ops.push_back(Vector); // The vector to be inserted - Ops.push_back(N->getOperand(2)); // The lane to be inserted in the vector - } - Ops.push_back(Addr); - Ops.push_back(Inc); - - EVT Tys[3] = { VT, MVT::i64, MVT::Other }; - SDVTList SDTys = DAG.getVTList(ArrayRef(Tys, 3)); - unsigned NewOp = IsLaneOp ? ARM64ISD::LD1LANEpost : ARM64ISD::LD1DUPpost; - SDValue UpdN = DAG.getMemIntrinsicNode(NewOp, SDLoc(N), SDTys, Ops, - MemVT, - LoadSDN->getMemOperand()); - - // Update the uses. 
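For context (illustrative only, not from the patch), performPostLD1Combine above corresponds roughly to this source shape; the intrinsic choice and the quoted assembly are assumptions.

#include <arm_neon.h>

// Illustrative sketch: load one lane, then advance the pointer by the element
// size. The combine folds the pointer add into a post-indexed load such as
//   ld1 { v0.s }[0], [x0], #4
float32x4_t load_lane_post_inc(const float *&p, float32x4_t v) {
  v = vld1q_lane_f32(p, v, 0);
  p += 1;
  return v;
}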
- std::vector NewResults; - NewResults.push_back(SDValue(LD, 0)); // The result of load - NewResults.push_back(SDValue(UpdN.getNode(), 2)); // Chain - DCI.CombineTo(LD, NewResults); - DCI.CombineTo(N, SDValue(UpdN.getNode(), 0)); // Dup/Inserted Result - DCI.CombineTo(User, SDValue(UpdN.getNode(), 1)); // Write back register - - break; - } - return SDValue(); -} - -/// Target-specific DAG combine function for NEON load/store intrinsics -/// to merge base address updates. -static SDValue performNEONPostLDSTCombine(SDNode *N, - TargetLowering::DAGCombinerInfo &DCI, - SelectionDAG &DAG) { - if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer()) - return SDValue(); - - unsigned AddrOpIdx = N->getNumOperands() - 1; - SDValue Addr = N->getOperand(AddrOpIdx); - - // Search for a use of the address operand that is an increment. - for (SDNode::use_iterator UI = Addr.getNode()->use_begin(), - UE = Addr.getNode()->use_end(); UI != UE; ++UI) { - SDNode *User = *UI; - if (User->getOpcode() != ISD::ADD || - UI.getUse().getResNo() != Addr.getResNo()) - continue; - - // Check that the add is independent of the load/store. Otherwise, folding - // it would create a cycle. - if (User->isPredecessorOf(N) || N->isPredecessorOf(User)) - continue; - - // Find the new opcode for the updating load/store. - bool IsStore = false; - bool IsLaneOp = false; - bool IsDupOp = false; - unsigned NewOpc = 0; - unsigned NumVecs = 0; - unsigned IntNo = cast(N->getOperand(1))->getZExtValue(); - switch (IntNo) { - default: llvm_unreachable("unexpected intrinsic for Neon base update"); - case Intrinsic::arm64_neon_ld2: NewOpc = ARM64ISD::LD2post; - NumVecs = 2; break; - case Intrinsic::arm64_neon_ld3: NewOpc = ARM64ISD::LD3post; - NumVecs = 3; break; - case Intrinsic::arm64_neon_ld4: NewOpc = ARM64ISD::LD4post; - NumVecs = 4; break; - case Intrinsic::arm64_neon_st2: NewOpc = ARM64ISD::ST2post; - NumVecs = 2; IsStore = true; break; - case Intrinsic::arm64_neon_st3: NewOpc = ARM64ISD::ST3post; - NumVecs = 3; IsStore = true; break; - case Intrinsic::arm64_neon_st4: NewOpc = ARM64ISD::ST4post; - NumVecs = 4; IsStore = true; break; - case Intrinsic::arm64_neon_ld1x2: NewOpc = ARM64ISD::LD1x2post; - NumVecs = 2; break; - case Intrinsic::arm64_neon_ld1x3: NewOpc = ARM64ISD::LD1x3post; - NumVecs = 3; break; - case Intrinsic::arm64_neon_ld1x4: NewOpc = ARM64ISD::LD1x4post; - NumVecs = 4; break; - case Intrinsic::arm64_neon_st1x2: NewOpc = ARM64ISD::ST1x2post; - NumVecs = 2; IsStore = true; break; - case Intrinsic::arm64_neon_st1x3: NewOpc = ARM64ISD::ST1x3post; - NumVecs = 3; IsStore = true; break; - case Intrinsic::arm64_neon_st1x4: NewOpc = ARM64ISD::ST1x4post; - NumVecs = 4; IsStore = true; break; - case Intrinsic::arm64_neon_ld2r: NewOpc = ARM64ISD::LD2DUPpost; - NumVecs = 2; IsDupOp = true; break; - case Intrinsic::arm64_neon_ld3r: NewOpc = ARM64ISD::LD3DUPpost; - NumVecs = 3; IsDupOp = true; break; - case Intrinsic::arm64_neon_ld4r: NewOpc = ARM64ISD::LD4DUPpost; - NumVecs = 4; IsDupOp = true; break; - case Intrinsic::arm64_neon_ld2lane: NewOpc = ARM64ISD::LD2LANEpost; - NumVecs = 2; IsLaneOp = true; break; - case Intrinsic::arm64_neon_ld3lane: NewOpc = ARM64ISD::LD3LANEpost; - NumVecs = 3; IsLaneOp = true; break; - case Intrinsic::arm64_neon_ld4lane: NewOpc = ARM64ISD::LD4LANEpost; - NumVecs = 4; IsLaneOp = true; break; - case Intrinsic::arm64_neon_st2lane: NewOpc = ARM64ISD::ST2LANEpost; - NumVecs = 2; IsStore = true; IsLaneOp = true; break; - case Intrinsic::arm64_neon_st3lane: NewOpc = ARM64ISD::ST3LANEpost; - NumVecs = 3; 
IsStore = true; IsLaneOp = true; break; - case Intrinsic::arm64_neon_st4lane: NewOpc = ARM64ISD::ST4LANEpost; - NumVecs = 4; IsStore = true; IsLaneOp = true; break; - } - - EVT VecTy; - if (IsStore) - VecTy = N->getOperand(2).getValueType(); - else - VecTy = N->getValueType(0); - - // If the increment is a constant, it must match the memory ref size. - SDValue Inc = User->getOperand(User->getOperand(0) == Addr ? 1 : 0); - if (ConstantSDNode *CInc = dyn_cast(Inc.getNode())) { - uint32_t IncVal = CInc->getZExtValue(); - unsigned NumBytes = NumVecs * VecTy.getSizeInBits() / 8; - if (IsLaneOp || IsDupOp) - NumBytes /= VecTy.getVectorNumElements(); - if (IncVal != NumBytes) - continue; - Inc = DAG.getRegister(ARM64::XZR, MVT::i64); - } - SmallVector Ops; - Ops.push_back(N->getOperand(0)); // Incoming chain - // Load lane and store have vector list as input. - if (IsLaneOp || IsStore) - for (unsigned i = 2; i < AddrOpIdx; ++i) - Ops.push_back(N->getOperand(i)); - Ops.push_back(Addr); // Base register - Ops.push_back(Inc); - - // Return Types. - EVT Tys[6]; - unsigned NumResultVecs = (IsStore ? 0 : NumVecs); - unsigned n; - for (n = 0; n < NumResultVecs; ++n) - Tys[n] = VecTy; - Tys[n++] = MVT::i64; // Type of write back register - Tys[n] = MVT::Other; // Type of the chain - SDVTList SDTys = DAG.getVTList(ArrayRef(Tys, NumResultVecs + 2)); - - MemIntrinsicSDNode *MemInt = cast(N); - SDValue UpdN = DAG.getMemIntrinsicNode(NewOpc, SDLoc(N), SDTys, Ops, - MemInt->getMemoryVT(), - MemInt->getMemOperand()); - - // Update the uses. - std::vector NewResults; - for (unsigned i = 0; i < NumResultVecs; ++i) { - NewResults.push_back(SDValue(UpdN.getNode(), i)); - } - NewResults.push_back(SDValue(UpdN.getNode(), NumResultVecs + 1)); - DCI.CombineTo(N, NewResults); - DCI.CombineTo(User, SDValue(UpdN.getNode(), NumResultVecs)); - - break; - } - return SDValue(); -} - -// Optimize compare with zero and branch. -static SDValue performBRCONDCombine(SDNode *N, - TargetLowering::DAGCombinerInfo &DCI, - SelectionDAG &DAG) { - SDValue Chain = N->getOperand(0); - SDValue Dest = N->getOperand(1); - SDValue CCVal = N->getOperand(2); - SDValue Cmp = N->getOperand(3); - - assert(isa(CCVal) && "Expected a ConstantSDNode here!"); - unsigned CC = cast(CCVal)->getZExtValue(); - if (CC != ARM64CC::EQ && CC != ARM64CC::NE) - return SDValue(); - - unsigned CmpOpc = Cmp.getOpcode(); - if (CmpOpc != ARM64ISD::ADDS && CmpOpc != ARM64ISD::SUBS) - return SDValue(); - - // Only attempt folding if there is only one use of the flag and no use of the - // value. - if (!Cmp->hasNUsesOfValue(0, 0) || !Cmp->hasNUsesOfValue(1, 1)) - return SDValue(); - - SDValue LHS = Cmp.getOperand(0); - SDValue RHS = Cmp.getOperand(1); - - assert(LHS.getValueType() == RHS.getValueType() && - "Expected the value type to be the same for both operands!"); - if (LHS.getValueType() != MVT::i32 && LHS.getValueType() != MVT::i64) - return SDValue(); - - if (isa(LHS) && cast(LHS)->isNullValue()) - std::swap(LHS, RHS); - - if (!isa(RHS) || !cast(RHS)->isNullValue()) - return SDValue(); - - if (LHS.getOpcode() == ISD::SHL || LHS.getOpcode() == ISD::SRA || - LHS.getOpcode() == ISD::SRL) - return SDValue(); - - // Fold the compare into the branch instruction. - SDValue BR; - if (CC == ARM64CC::EQ) - BR = DAG.getNode(ARM64ISD::CBZ, SDLoc(N), MVT::Other, Chain, LHS, Dest); - else - BR = DAG.getNode(ARM64ISD::CBNZ, SDLoc(N), MVT::Other, Chain, LHS, Dest); - - // Do not add new nodes to DAG combiner worklist. 
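Aside (illustrative, not from the patch): the compare-with-zero branch folding above (performBRCONDCombine) rewrites branches of this shape; the quoted instruction is the intended selection, not a guarantee in every context.

// Illustrative sketch: branch on a compare with zero. The SUBS + b.eq pair is
// folded into a single compare-and-branch:
//   cbz w0, .Lskip
extern void on_zero();
void branch_on_zero(int x) {
  if (x == 0)
    on_zero();
}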
- DCI.CombineTo(N, BR, false); - - return SDValue(); -} - -// vselect (v1i1 setcc) -> -// vselect (v1iXX setcc) (XX is the size of the compared operand type) -// FIXME: Currently the type legalizer can't handle VSELECT having v1i1 as -// condition. If it can legalize "VSELECT v1i1" correctly, no need to combine -// such VSELECT. -static SDValue performVSelectCombine(SDNode *N, SelectionDAG &DAG) { - SDValue N0 = N->getOperand(0); - EVT CCVT = N0.getValueType(); - - if (N0.getOpcode() != ISD::SETCC || CCVT.getVectorNumElements() != 1 || - CCVT.getVectorElementType() != MVT::i1) - return SDValue(); - - EVT ResVT = N->getValueType(0); - EVT CmpVT = N0.getOperand(0).getValueType(); - // Only combine when the result type is of the same size as the compared - // operands. - if (ResVT.getSizeInBits() != CmpVT.getSizeInBits()) - return SDValue(); - - SDValue IfTrue = N->getOperand(1); - SDValue IfFalse = N->getOperand(2); - SDValue SetCC = - DAG.getSetCC(SDLoc(N), CmpVT.changeVectorElementTypeToInteger(), - N0.getOperand(0), N0.getOperand(1), - cast(N0.getOperand(2))->get()); - return DAG.getNode(ISD::VSELECT, SDLoc(N), ResVT, SetCC, - IfTrue, IfFalse); -} - -/// A vector select: "(select vL, vR, (setcc LHS, RHS))" is best performed with -/// the compare-mask instructions rather than going via NZCV, even if LHS and -/// RHS are really scalar. This replaces any scalar setcc in the above pattern -/// with a vector one followed by a DUP shuffle on the result. -static SDValue performSelectCombine(SDNode *N, SelectionDAG &DAG) { - SDValue N0 = N->getOperand(0); - EVT ResVT = N->getValueType(0); - - if (!N->getOperand(1).getValueType().isVector()) - return SDValue(); - - if (N0.getOpcode() != ISD::SETCC || N0.getValueType() != MVT::i1) - return SDValue(); - - SDLoc DL(N0); - - EVT SrcVT = N0.getOperand(0).getValueType(); - SrcVT = EVT::getVectorVT(*DAG.getContext(), SrcVT, - ResVT.getSizeInBits() / SrcVT.getSizeInBits()); - EVT CCVT = SrcVT.changeVectorElementTypeToInteger(); - - // First perform a vector comparison, where lane 0 is the one we're interested - // in. - SDValue LHS = - DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, SrcVT, N0.getOperand(0)); - SDValue RHS = - DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, SrcVT, N0.getOperand(1)); - SDValue SetCC = DAG.getNode(ISD::SETCC, DL, CCVT, LHS, RHS, N0.getOperand(2)); - - // Now duplicate the comparison mask we want across all other lanes. 
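For context (illustrative only, not part of the patch), performSelectCombine above targets selects like the following sketch, where a scalar comparison chooses between two vectors; the name is invented.

#include <arm_neon.h>

// Illustrative sketch: a scalar fcmp deciding between two vectors. The combine
// performs the comparison in a vector register, broadcasts the resulting mask
// across the lanes, and selects with it, instead of going through NZCV.
float32x4_t select_by_scalar(float a, float b, float32x4_t t, float32x4_t f) {
  return a < b ? t : f;
}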
- SmallVector DUPMask(CCVT.getVectorNumElements(), 0); - SDValue Mask = DAG.getVectorShuffle(CCVT, DL, SetCC, SetCC, DUPMask.data()); - Mask = DAG.getNode(ISD::BITCAST, DL, ResVT.changeVectorElementTypeToInteger(), - Mask); - - return DAG.getSelect(DL, ResVT, Mask, N->getOperand(1), N->getOperand(2)); -} - -SDValue ARM64TargetLowering::PerformDAGCombine(SDNode *N, - DAGCombinerInfo &DCI) const { - SelectionDAG &DAG = DCI.DAG; - switch (N->getOpcode()) { - default: - break; - case ISD::ADD: - case ISD::SUB: - return performAddSubLongCombine(N, DCI, DAG); - case ISD::XOR: - return performXorCombine(N, DAG, DCI, Subtarget); - case ISD::MUL: - return performMulCombine(N, DAG, DCI, Subtarget); - case ISD::SINT_TO_FP: - case ISD::UINT_TO_FP: - return performIntToFpCombine(N, DAG); - case ISD::OR: - return performORCombine(N, DCI, Subtarget); - case ISD::INTRINSIC_WO_CHAIN: - return performIntrinsicCombine(N, DCI, Subtarget); - case ISD::ANY_EXTEND: - case ISD::ZERO_EXTEND: - case ISD::SIGN_EXTEND: - return performExtendCombine(N, DCI, DAG); - case ISD::BITCAST: - return performBitcastCombine(N, DCI, DAG); - case ISD::CONCAT_VECTORS: - return performConcatVectorsCombine(N, DCI, DAG); - case ISD::SELECT: - return performSelectCombine(N, DAG); - case ISD::VSELECT: - return performVSelectCombine(N, DCI.DAG); - case ISD::STORE: - return performSTORECombine(N, DCI, DAG, Subtarget); - case ARM64ISD::BRCOND: - return performBRCONDCombine(N, DCI, DAG); - case ARM64ISD::DUP: - return performPostLD1Combine(N, DCI, false); - case ISD::INSERT_VECTOR_ELT: - return performPostLD1Combine(N, DCI, true); - case ISD::INTRINSIC_VOID: - case ISD::INTRINSIC_W_CHAIN: - switch (cast(N->getOperand(1))->getZExtValue()) { - case Intrinsic::arm64_neon_ld2: - case Intrinsic::arm64_neon_ld3: - case Intrinsic::arm64_neon_ld4: - case Intrinsic::arm64_neon_ld1x2: - case Intrinsic::arm64_neon_ld1x3: - case Intrinsic::arm64_neon_ld1x4: - case Intrinsic::arm64_neon_ld2lane: - case Intrinsic::arm64_neon_ld3lane: - case Intrinsic::arm64_neon_ld4lane: - case Intrinsic::arm64_neon_ld2r: - case Intrinsic::arm64_neon_ld3r: - case Intrinsic::arm64_neon_ld4r: - case Intrinsic::arm64_neon_st2: - case Intrinsic::arm64_neon_st3: - case Intrinsic::arm64_neon_st4: - case Intrinsic::arm64_neon_st1x2: - case Intrinsic::arm64_neon_st1x3: - case Intrinsic::arm64_neon_st1x4: - case Intrinsic::arm64_neon_st2lane: - case Intrinsic::arm64_neon_st3lane: - case Intrinsic::arm64_neon_st4lane: - return performNEONPostLDSTCombine(N, DCI, DAG); - default: - break; - } - } - return SDValue(); -} - -// Check if the return value is used as only a return value, as otherwise -// we can't perform a tail-call. In particular, we need to check for -// target ISD nodes that are returns and any other "odd" constructs -// that the generic analysis code won't necessarily catch. -bool ARM64TargetLowering::isUsedByReturnOnly(SDNode *N, SDValue &Chain) const { - if (N->getNumValues() != 1) - return false; - if (!N->hasNUsesOfValue(1, 0)) - return false; - - SDValue TCChain = Chain; - SDNode *Copy = *N->use_begin(); - if (Copy->getOpcode() == ISD::CopyToReg) { - // If the copy has a glue operand, we conservatively assume it isn't safe to - // perform a tail call. 
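Illustrative aside (not from the patch): the return-only check above, together with the tail-call hook that follows, describes the usual sibling-call shape; the function names are invented and the "b" vs "bl; ret" contrast is the usual outcome rather than a guarantee.

// Illustrative sketch: the call's result feeds straight into our own return,
// so it can be emitted as a tail call ("b callee") instead of "bl callee; ret".
extern long callee(long);
long caller(long x) {
  return callee(x + 1);
}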
- if (Copy->getOperand(Copy->getNumOperands() - 1).getValueType() == - MVT::Glue) - return false; - TCChain = Copy->getOperand(0); - } else if (Copy->getOpcode() != ISD::FP_EXTEND) - return false; - - bool HasRet = false; - for (SDNode *Node : Copy->uses()) { - if (Node->getOpcode() != ARM64ISD::RET_FLAG) - return false; - HasRet = true; - } - - if (!HasRet) - return false; - - Chain = TCChain; - return true; -} - -// Return whether the an instruction can potentially be optimized to a tail -// call. This will cause the optimizers to attempt to move, or duplicate, -// return instructions to help enable tail call optimizations for this -// instruction. -bool ARM64TargetLowering::mayBeEmittedAsTailCall(CallInst *CI) const { - if (!CI->isTailCall()) - return false; - - return true; -} - -bool ARM64TargetLowering::getIndexedAddressParts(SDNode *Op, SDValue &Base, - SDValue &Offset, - ISD::MemIndexedMode &AM, - bool &IsInc, - SelectionDAG &DAG) const { - if (Op->getOpcode() != ISD::ADD && Op->getOpcode() != ISD::SUB) - return false; - - Base = Op->getOperand(0); - // All of the indexed addressing mode instructions take a signed - // 9 bit immediate offset. - if (ConstantSDNode *RHS = dyn_cast(Op->getOperand(1))) { - int64_t RHSC = (int64_t)RHS->getZExtValue(); - if (RHSC >= 256 || RHSC <= -256) - return false; - IsInc = (Op->getOpcode() == ISD::ADD); - Offset = Op->getOperand(1); - return true; - } - return false; -} - -bool ARM64TargetLowering::getPreIndexedAddressParts(SDNode *N, SDValue &Base, - SDValue &Offset, - ISD::MemIndexedMode &AM, - SelectionDAG &DAG) const { - EVT VT; - SDValue Ptr; - if (LoadSDNode *LD = dyn_cast(N)) { - VT = LD->getMemoryVT(); - Ptr = LD->getBasePtr(); - } else if (StoreSDNode *ST = dyn_cast(N)) { - VT = ST->getMemoryVT(); - Ptr = ST->getBasePtr(); - } else - return false; - - bool IsInc; - if (!getIndexedAddressParts(Ptr.getNode(), Base, Offset, AM, IsInc, DAG)) - return false; - AM = IsInc ? ISD::PRE_INC : ISD::PRE_DEC; - return true; -} - -bool ARM64TargetLowering::getPostIndexedAddressParts(SDNode *N, SDNode *Op, - SDValue &Base, - SDValue &Offset, - ISD::MemIndexedMode &AM, - SelectionDAG &DAG) const { - EVT VT; - SDValue Ptr; - if (LoadSDNode *LD = dyn_cast(N)) { - VT = LD->getMemoryVT(); - Ptr = LD->getBasePtr(); - } else if (StoreSDNode *ST = dyn_cast(N)) { - VT = ST->getMemoryVT(); - Ptr = ST->getBasePtr(); - } else - return false; - - bool IsInc; - if (!getIndexedAddressParts(Op, Base, Offset, AM, IsInc, DAG)) - return false; - // Post-indexing updates the base, so it's not a valid transform - // if that's not the same as the load's pointer. - if (Ptr != Base) - return false; - AM = IsInc ? ISD::POST_INC : ISD::POST_DEC; - return true; -} - -void ARM64TargetLowering::ReplaceNodeResults(SDNode *N, - SmallVectorImpl &Results, - SelectionDAG &DAG) const { - switch (N->getOpcode()) { - default: - llvm_unreachable("Don't know how to custom expand this"); - case ISD::FP_TO_UINT: - case ISD::FP_TO_SINT: - assert(N->getValueType(0) == MVT::i128 && "unexpected illegal conversion"); - // Let normal code take care of it by not adding anything to Results. 
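For context (illustrative only, not part of the patch), the pre/post-indexed addressing hooks above fold the classic load-and-advance pattern; the quoted assembly is illustrative, and the update must fit the signed 9-bit immediate range checked above.

// Illustrative sketch: load, then advance the pointer. The hooks let this
// select to a single post-indexed load such as
//   ldr w0, [x1], #4
int load_post_inc(int *&p) {
  return *p++;
}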
- return; - } -} - -bool ARM64TargetLowering::shouldExpandAtomicInIR(Instruction *Inst) const { - // Loads and stores less than 128-bits are already atomic; ones above that - // are doomed anyway, so defer to the default libcall and blame the OS when - // things go wrong: - if (StoreInst *SI = dyn_cast(Inst)) - return SI->getValueOperand()->getType()->getPrimitiveSizeInBits() == 128; - else if (LoadInst *LI = dyn_cast(Inst)) - return LI->getType()->getPrimitiveSizeInBits() == 128; - - // For the real atomic operations, we have ldxr/stxr up to 128 bits. - return Inst->getType()->getPrimitiveSizeInBits() <= 128; -} - -Value *ARM64TargetLowering::emitLoadLinked(IRBuilder<> &Builder, Value *Addr, - AtomicOrdering Ord) const { - Module *M = Builder.GetInsertBlock()->getParent()->getParent(); - Type *ValTy = cast(Addr->getType())->getElementType(); - bool IsAcquire = - Ord == Acquire || Ord == AcquireRelease || Ord == SequentiallyConsistent; - - // Since i128 isn't legal and intrinsics don't get type-lowered, the ldrexd - // intrinsic must return {i64, i64} and we have to recombine them into a - // single i128 here. - if (ValTy->getPrimitiveSizeInBits() == 128) { - Intrinsic::ID Int = - IsAcquire ? Intrinsic::arm64_ldaxp : Intrinsic::arm64_ldxp; - Function *Ldxr = llvm::Intrinsic::getDeclaration(M, Int); - - Addr = Builder.CreateBitCast(Addr, Type::getInt8PtrTy(M->getContext())); - Value *LoHi = Builder.CreateCall(Ldxr, Addr, "lohi"); - - Value *Lo = Builder.CreateExtractValue(LoHi, 0, "lo"); - Value *Hi = Builder.CreateExtractValue(LoHi, 1, "hi"); - Lo = Builder.CreateZExt(Lo, ValTy, "lo64"); - Hi = Builder.CreateZExt(Hi, ValTy, "hi64"); - return Builder.CreateOr( - Lo, Builder.CreateShl(Hi, ConstantInt::get(ValTy, 64)), "val64"); - } - - Type *Tys[] = { Addr->getType() }; - Intrinsic::ID Int = - IsAcquire ? Intrinsic::arm64_ldaxr : Intrinsic::arm64_ldxr; - Function *Ldxr = llvm::Intrinsic::getDeclaration(M, Int, Tys); - - return Builder.CreateTruncOrBitCast( - Builder.CreateCall(Ldxr, Addr), - cast(Addr->getType())->getElementType()); -} - -Value *ARM64TargetLowering::emitStoreConditional(IRBuilder<> &Builder, - Value *Val, Value *Addr, - AtomicOrdering Ord) const { - Module *M = Builder.GetInsertBlock()->getParent()->getParent(); - bool IsRelease = - Ord == Release || Ord == AcquireRelease || Ord == SequentiallyConsistent; - - // Since the intrinsics must have legal type, the i128 intrinsics take two - // parameters: "i64, i64". We must marshal Val into the appropriate form - // before the call. - if (Val->getType()->getPrimitiveSizeInBits() == 128) { - Intrinsic::ID Int = - IsRelease ? Intrinsic::arm64_stlxp : Intrinsic::arm64_stxp; - Function *Stxr = Intrinsic::getDeclaration(M, Int); - Type *Int64Ty = Type::getInt64Ty(M->getContext()); - - Value *Lo = Builder.CreateTrunc(Val, Int64Ty, "lo"); - Value *Hi = Builder.CreateTrunc(Builder.CreateLShr(Val, 64), Int64Ty, "hi"); - Addr = Builder.CreateBitCast(Addr, Type::getInt8PtrTy(M->getContext())); - return Builder.CreateCall3(Stxr, Lo, Hi, Addr); - } - - Intrinsic::ID Int = - IsRelease ? 
Intrinsic::arm64_stlxr : Intrinsic::arm64_stxr; - Type *Tys[] = { Addr->getType() }; - Function *Stxr = Intrinsic::getDeclaration(M, Int, Tys); - - return Builder.CreateCall2( - Stxr, Builder.CreateZExtOrBitCast( - Val, Stxr->getFunctionType()->getParamType(0)), - Addr); -} diff --git a/lib/Target/ARM64/ARM64ISelLowering.h b/lib/Target/ARM64/ARM64ISelLowering.h deleted file mode 100644 index b2402c9791c..00000000000 --- a/lib/Target/ARM64/ARM64ISelLowering.h +++ /dev/null @@ -1,464 +0,0 @@ -//==-- ARM64ISelLowering.h - ARM64 DAG Lowering Interface --------*- C++ -*-==// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -// This file defines the interfaces that ARM64 uses to lower LLVM code into a -// selection DAG. -// -//===----------------------------------------------------------------------===// - -#ifndef LLVM_TARGET_ARM64_ISELLOWERING_H -#define LLVM_TARGET_ARM64_ISELLOWERING_H - -#include "llvm/CodeGen/CallingConvLower.h" -#include "llvm/CodeGen/SelectionDAG.h" -#include "llvm/IR/CallingConv.h" -#include "llvm/Target/TargetLowering.h" - -namespace llvm { - -namespace ARM64ISD { - -enum { - FIRST_NUMBER = ISD::BUILTIN_OP_END, - WrapperLarge, // 4-instruction MOVZ/MOVK sequence for 64-bit addresses. - CALL, // Function call. - - // Almost the same as a normal call node, except that a TLSDesc relocation is - // needed so the linker can relax it correctly if possible. - TLSDESC_CALL, - ADRP, // Page address of a TargetGlobalAddress operand. - ADDlow, // Add the low 12 bits of a TargetGlobalAddress operand. - LOADgot, // Load from automatically generated descriptor (e.g. Global - // Offset Table, TLS record). - RET_FLAG, // Return with a flag operand. Operand 0 is the chain operand. - BRCOND, // Conditional branch instruction; "b.cond". - CSEL, - FCSEL, // Conditional move instruction. - CSINV, // Conditional select invert. - CSNEG, // Conditional select negate. - CSINC, // Conditional select increment. - - // Pointer to the thread's local storage area. Materialised from TPIDR_EL0 on - // ELF. - THREAD_POINTER, - ADC, - SBC, // adc, sbc instructions - - // Arithmetic instructions which write flags. - ADDS, - SUBS, - ADCS, - SBCS, - ANDS, - - // Floating point comparison - FCMP, - - // Floating point max and min instructions. - FMAX, - FMIN, - - // Scalar extract - EXTR, - - // Scalar-to-vector duplication - DUP, - DUPLANE8, - DUPLANE16, - DUPLANE32, - DUPLANE64, - - // Vector immedate moves - MOVI, - MOVIshift, - MOVIedit, - MOVImsl, - FMOV, - MVNIshift, - MVNImsl, - - // Vector immediate ops - BICi, - ORRi, - - // Vector bit select: similar to ISD::VSELECT but not all bits within an - // element must be identical. 
- BSL, - - // Vector arithmetic negation - NEG, - - // Vector shuffles - ZIP1, - ZIP2, - UZP1, - UZP2, - TRN1, - TRN2, - REV16, - REV32, - REV64, - EXT, - - // Vector shift by scalar - VSHL, - VLSHR, - VASHR, - - // Vector shift by scalar (again) - SQSHL_I, - UQSHL_I, - SQSHLU_I, - SRSHR_I, - URSHR_I, - - // Vector comparisons - CMEQ, - CMGE, - CMGT, - CMHI, - CMHS, - FCMEQ, - FCMGE, - FCMGT, - - // Vector zero comparisons - CMEQz, - CMGEz, - CMGTz, - CMLEz, - CMLTz, - FCMEQz, - FCMGEz, - FCMGTz, - FCMLEz, - FCMLTz, - - // Vector bitwise negation - NOT, - - // Vector bitwise selection - BIT, - - // Compare-and-branch - CBZ, - CBNZ, - TBZ, - TBNZ, - - // Tail calls - TC_RETURN, - - // Custom prefetch handling - PREFETCH, - - // {s|u}int to FP within a FP register. - SITOF, - UITOF, - - // NEON Load/Store with post-increment base updates - LD2post = ISD::FIRST_TARGET_MEMORY_OPCODE, - LD3post, - LD4post, - ST2post, - ST3post, - ST4post, - LD1x2post, - LD1x3post, - LD1x4post, - ST1x2post, - ST1x3post, - ST1x4post, - LD1DUPpost, - LD2DUPpost, - LD3DUPpost, - LD4DUPpost, - LD1LANEpost, - LD2LANEpost, - LD3LANEpost, - LD4LANEpost, - ST2LANEpost, - ST3LANEpost, - ST4LANEpost -}; - -} // end namespace ARM64ISD - -class ARM64Subtarget; -class ARM64TargetMachine; - -class ARM64TargetLowering : public TargetLowering { - bool RequireStrictAlign; - -public: - explicit ARM64TargetLowering(ARM64TargetMachine &TM); - - /// Selects the correct CCAssignFn for a the given CallingConvention - /// value. - CCAssignFn *CCAssignFnForCall(CallingConv::ID CC, bool IsVarArg) const; - - /// computeKnownBitsForTargetNode - Determine which of the bits specified in - /// Mask are known to be either zero or one and return them in the - /// KnownZero/KnownOne bitsets. - void computeKnownBitsForTargetNode(const SDValue Op, APInt &KnownZero, - APInt &KnownOne, const SelectionDAG &DAG, - unsigned Depth = 0) const override; - - MVT getScalarShiftAmountTy(EVT LHSTy) const override; - - /// allowsUnalignedMemoryAccesses - Returns true if the target allows - /// unaligned memory accesses. of the specified type. - bool allowsUnalignedMemoryAccesses(EVT VT, unsigned AddrSpace = 0, - bool *Fast = nullptr) const override { - if (RequireStrictAlign) - return false; - // FIXME: True for Cyclone, but not necessary others. - if (Fast) - *Fast = true; - return true; - } - - /// LowerOperation - Provide custom lowering hooks for some operations. - SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override; - - const char *getTargetNodeName(unsigned Opcode) const override; - - SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const override; - - /// getFunctionAlignment - Return the Log2 alignment of this function. - unsigned getFunctionAlignment(const Function *F) const; - - /// getMaximalGlobalOffset - Returns the maximal possible offset which can - /// be used for loads / stores from the global. - unsigned getMaximalGlobalOffset() const override; - - /// Returns true if a cast between SrcAS and DestAS is a noop. - bool isNoopAddrSpaceCast(unsigned SrcAS, unsigned DestAS) const override { - // Addrspacecasts are always noops. - return true; - } - - /// createFastISel - This method returns a target specific FastISel object, - /// or null if the target does not support "fast" ISel. 
- FastISel *createFastISel(FunctionLoweringInfo &funcInfo, - const TargetLibraryInfo *libInfo) const override; - - bool isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const override; - - bool isFPImmLegal(const APFloat &Imm, EVT VT) const override; - - /// isShuffleMaskLegal - Return true if the given shuffle mask can be - /// codegen'd directly, or if it should be stack expanded. - bool isShuffleMaskLegal(const SmallVectorImpl &M, EVT VT) const override; - - /// getSetCCResultType - Return the ISD::SETCC ValueType - EVT getSetCCResultType(LLVMContext &Context, EVT VT) const override; - - SDValue ReconstructShuffle(SDValue Op, SelectionDAG &DAG) const; - - MachineBasicBlock *EmitF128CSEL(MachineInstr *MI, - MachineBasicBlock *BB) const; - - MachineBasicBlock * - EmitInstrWithCustomInserter(MachineInstr *MI, - MachineBasicBlock *MBB) const override; - - bool getTgtMemIntrinsic(IntrinsicInfo &Info, const CallInst &I, - unsigned Intrinsic) const override; - - bool isTruncateFree(Type *Ty1, Type *Ty2) const override; - bool isTruncateFree(EVT VT1, EVT VT2) const override; - - bool isZExtFree(Type *Ty1, Type *Ty2) const override; - bool isZExtFree(EVT VT1, EVT VT2) const override; - bool isZExtFree(SDValue Val, EVT VT2) const override; - - bool hasPairedLoad(Type *LoadedType, - unsigned &RequiredAligment) const override; - bool hasPairedLoad(EVT LoadedType, unsigned &RequiredAligment) const override; - - bool isLegalAddImmediate(int64_t) const override; - bool isLegalICmpImmediate(int64_t) const override; - - EVT getOptimalMemOpType(uint64_t Size, unsigned DstAlign, unsigned SrcAlign, - bool IsMemset, bool ZeroMemset, bool MemcpyStrSrc, - MachineFunction &MF) const override; - - /// isLegalAddressingMode - Return true if the addressing mode represented - /// by AM is legal for this target, for a load/store of the specified type. - bool isLegalAddressingMode(const AddrMode &AM, Type *Ty) const override; - - /// \brief Return the cost of the scaling factor used in the addressing - /// mode represented by AM for this target, for a load/store - /// of the specified type. - /// If the AM is supported, the return value must be >= 0. - /// If the AM is not supported, it returns a negative value. - int getScalingFactorCost(const AddrMode &AM, Type *Ty) const override; - - /// isFMAFasterThanFMulAndFAdd - Return true if an FMA operation is faster - /// than a pair of fmul and fadd instructions. fmuladd intrinsics will be - /// expanded to FMAs when this method returns true, otherwise fmuladd is - /// expanded to fmul + fadd. - bool isFMAFasterThanFMulAndFAdd(EVT VT) const override; - - const MCPhysReg *getScratchRegisters(CallingConv::ID CC) const override; - - /// \brief Returns false if N is a bit extraction pattern of (X >> C) & Mask. - bool isDesirableToCommuteWithShift(const SDNode *N) const override; - - /// \brief Returns true if it is beneficial to convert a load of a constant - /// to just the constant itself. - bool shouldConvertConstantLoadToIntImm(const APInt &Imm, - Type *Ty) const override; - - Value *emitLoadLinked(IRBuilder<> &Builder, Value *Addr, - AtomicOrdering Ord) const override; - Value *emitStoreConditional(IRBuilder<> &Builder, Value *Val, - Value *Addr, AtomicOrdering Ord) const override; - - bool shouldExpandAtomicInIR(Instruction *Inst) const override; - -private: - /// Subtarget - Keep a pointer to the ARM64Subtarget around so that we can - /// make the right decision when generating code for different targets. 
- const ARM64Subtarget *Subtarget; - - void addTypeForNEON(EVT VT, EVT PromotedBitwiseVT); - void addDRTypeForNEON(MVT VT); - void addQRTypeForNEON(MVT VT); - - SDValue - LowerFormalArguments(SDValue Chain, CallingConv::ID CallConv, bool isVarArg, - const SmallVectorImpl &Ins, SDLoc DL, - SelectionDAG &DAG, - SmallVectorImpl &InVals) const override; - - SDValue LowerCall(CallLoweringInfo & /*CLI*/, - SmallVectorImpl &InVals) const override; - - SDValue LowerCallResult(SDValue Chain, SDValue InFlag, - CallingConv::ID CallConv, bool isVarArg, - const SmallVectorImpl &Ins, SDLoc DL, - SelectionDAG &DAG, SmallVectorImpl &InVals, - bool isThisReturn, SDValue ThisVal) const; - - bool isEligibleForTailCallOptimization( - SDValue Callee, CallingConv::ID CalleeCC, bool isVarArg, - bool isCalleeStructRet, bool isCallerStructRet, - const SmallVectorImpl &Outs, - const SmallVectorImpl &OutVals, - const SmallVectorImpl &Ins, SelectionDAG &DAG) const; - - /// Finds the incoming stack arguments which overlap the given fixed stack - /// object and incorporates their load into the current chain. This prevents - /// an upcoming store from clobbering the stack argument before it's used. - SDValue addTokenForArgument(SDValue Chain, SelectionDAG &DAG, - MachineFrameInfo *MFI, int ClobberedFI) const; - - bool DoesCalleeRestoreStack(CallingConv::ID CallCC, bool TailCallOpt) const; - - bool IsTailCallConvention(CallingConv::ID CallCC) const; - - void saveVarArgRegisters(CCState &CCInfo, SelectionDAG &DAG, SDLoc DL, - SDValue &Chain) const; - - bool CanLowerReturn(CallingConv::ID CallConv, MachineFunction &MF, - bool isVarArg, - const SmallVectorImpl &Outs, - LLVMContext &Context) const override; - - SDValue LowerReturn(SDValue Chain, CallingConv::ID CallConv, bool isVarArg, - const SmallVectorImpl &Outs, - const SmallVectorImpl &OutVals, SDLoc DL, - SelectionDAG &DAG) const override; - - SDValue LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const; - SDValue LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const; - SDValue LowerDarwinGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const; - SDValue LowerELFGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const; - SDValue LowerELFTLSDescCall(SDValue SymAddr, SDValue DescAddr, SDLoc DL, - SelectionDAG &DAG) const; - SDValue LowerSETCC(SDValue Op, SelectionDAG &DAG) const; - SDValue LowerBR_CC(SDValue Op, SelectionDAG &DAG) const; - SDValue LowerSELECT(SDValue Op, SelectionDAG &DAG) const; - SDValue LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const; - SDValue LowerJumpTable(SDValue Op, SelectionDAG &DAG) const; - SDValue LowerConstantPool(SDValue Op, SelectionDAG &DAG) const; - SDValue LowerBlockAddress(SDValue Op, SelectionDAG &DAG) const; - SDValue LowerAAPCS_VASTART(SDValue Op, SelectionDAG &DAG) const; - SDValue LowerDarwin_VASTART(SDValue Op, SelectionDAG &DAG) const; - SDValue LowerVASTART(SDValue Op, SelectionDAG &DAG) const; - SDValue LowerVACOPY(SDValue Op, SelectionDAG &DAG) const; - SDValue LowerVAARG(SDValue Op, SelectionDAG &DAG) const; - SDValue LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const; - SDValue LowerRETURNADDR(SDValue Op, SelectionDAG &DAG) const; - SDValue LowerINSERT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const; - SDValue LowerEXTRACT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const; - SDValue LowerSCALAR_TO_VECTOR(SDValue Op, SelectionDAG &DAG) const; - SDValue LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const; - SDValue LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const; - SDValue LowerEXTRACT_SUBVECTOR(SDValue Op, 
SelectionDAG &DAG) const; - SDValue LowerVectorSRA_SRL_SHL(SDValue Op, SelectionDAG &DAG) const; - SDValue LowerShiftLeftParts(SDValue Op, SelectionDAG &DAG) const; - SDValue LowerShiftRightParts(SDValue Op, SelectionDAG &DAG) const; - SDValue LowerVSETCC(SDValue Op, SelectionDAG &DAG) const; - SDValue LowerCTPOP(SDValue Op, SelectionDAG &DAG) const; - SDValue LowerF128Call(SDValue Op, SelectionDAG &DAG, - RTLIB::Libcall Call) const; - SDValue LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) const; - SDValue LowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) const; - SDValue LowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const; - SDValue LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG) const; - SDValue LowerINT_TO_FP(SDValue Op, SelectionDAG &DAG) const; - SDValue LowerVectorAND(SDValue Op, SelectionDAG &DAG) const; - SDValue LowerVectorOR(SDValue Op, SelectionDAG &DAG) const; - SDValue LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) const; - SDValue LowerFSINCOS(SDValue Op, SelectionDAG &DAG) const; - - ConstraintType - getConstraintType(const std::string &Constraint) const override; - unsigned getRegisterByName(const char* RegName, EVT VT) const override; - - /// Examine constraint string and operand type and determine a weight value. - /// The operand object must already have been set up with the operand type. - ConstraintWeight - getSingleConstraintMatchWeight(AsmOperandInfo &info, - const char *constraint) const override; - - std::pair - getRegForInlineAsmConstraint(const std::string &Constraint, - MVT VT) const override; - void LowerAsmOperandForConstraint(SDValue Op, std::string &Constraint, - std::vector &Ops, - SelectionDAG &DAG) const override; - - bool isUsedByReturnOnly(SDNode *N, SDValue &Chain) const override; - bool mayBeEmittedAsTailCall(CallInst *CI) const override; - bool getIndexedAddressParts(SDNode *Op, SDValue &Base, SDValue &Offset, - ISD::MemIndexedMode &AM, bool &IsInc, - SelectionDAG &DAG) const; - bool getPreIndexedAddressParts(SDNode *N, SDValue &Base, SDValue &Offset, - ISD::MemIndexedMode &AM, - SelectionDAG &DAG) const override; - bool getPostIndexedAddressParts(SDNode *N, SDNode *Op, SDValue &Base, - SDValue &Offset, ISD::MemIndexedMode &AM, - SelectionDAG &DAG) const override; - - void ReplaceNodeResults(SDNode *N, SmallVectorImpl &Results, - SelectionDAG &DAG) const override; -}; - -namespace ARM64 { -FastISel *createFastISel(FunctionLoweringInfo &funcInfo, - const TargetLibraryInfo *libInfo); -} // end namespace ARM64 - -} // end namespace llvm - -#endif // LLVM_TARGET_ARM64_ISELLOWERING_H diff --git a/lib/Target/ARM64/ARM64InstrAtomics.td b/lib/Target/ARM64/ARM64InstrAtomics.td deleted file mode 100644 index 1d1483ac126..00000000000 --- a/lib/Target/ARM64/ARM64InstrAtomics.td +++ /dev/null @@ -1,364 +0,0 @@ -//===- ARM64InstrAtomics.td - ARM64 Atomic codegen support -*- tablegen -*-===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -// ARM64 Atomic operand code-gen constructs. 
-// -//===----------------------------------------------------------------------===// - -//===---------------------------------- -// Atomic fences -//===---------------------------------- -def : Pat<(atomic_fence (i64 4), (imm)), (DMB (i32 0x9))>; -def : Pat<(atomic_fence (imm), (imm)), (DMB (i32 0xb))>; - -//===---------------------------------- -// Atomic loads -//===---------------------------------- - -// When they're actually atomic, only one addressing mode (GPR64sp) is -// supported, but when they're relaxed and anything can be used, all the -// standard modes would be valid and may give efficiency gains. - -// A atomic load operation that actually needs acquire semantics. -class acquiring_load - : PatFrag<(ops node:$ptr), (base node:$ptr), [{ - AtomicOrdering Ordering = cast(N)->getOrdering(); - assert(Ordering != AcquireRelease && "unexpected load ordering"); - return Ordering == Acquire || Ordering == SequentiallyConsistent; -}]>; - -// An atomic load operation that does not need either acquire or release -// semantics. -class relaxed_load - : PatFrag<(ops node:$ptr), (base node:$ptr), [{ - AtomicOrdering Ordering = cast(N)->getOrdering(); - return Ordering == Monotonic || Ordering == Unordered; -}]>; - -// 8-bit loads -def : Pat<(acquiring_load GPR64sp:$ptr), (LDARB GPR64sp:$ptr)>; -def : Pat<(relaxed_load (ro_Windexed8 GPR64sp:$Rn, GPR32:$Rm, - ro_Wextend8:$offset)), - (LDRBBroW GPR64sp:$Rn, GPR32:$Rm, ro_Wextend8:$offset)>; -def : Pat<(relaxed_load (ro_Xindexed8 GPR64sp:$Rn, GPR64:$Rm, - ro_Xextend8:$offset)), - (LDRBBroX GPR64sp:$Rn, GPR64:$Rm, ro_Xextend8:$offset)>; -def : Pat<(relaxed_load (am_indexed8 GPR64sp:$Rn, - uimm12s1:$offset)), - (LDRBBui GPR64sp:$Rn, uimm12s1:$offset)>; -def : Pat<(relaxed_load - (am_unscaled8 GPR64sp:$Rn, simm9:$offset)), - (LDURBBi GPR64sp:$Rn, simm9:$offset)>; - -// 16-bit loads -def : Pat<(acquiring_load GPR64sp:$ptr), (LDARH GPR64sp:$ptr)>; -def : Pat<(relaxed_load (ro_Windexed16 GPR64sp:$Rn, GPR32:$Rm, - ro_Wextend16:$extend)), - (LDRHHroW GPR64sp:$Rn, GPR32:$Rm, ro_Wextend16:$extend)>; -def : Pat<(relaxed_load (ro_Xindexed16 GPR64sp:$Rn, GPR64:$Rm, - ro_Xextend16:$extend)), - (LDRHHroX GPR64sp:$Rn, GPR64:$Rm, ro_Xextend16:$extend)>; -def : Pat<(relaxed_load (am_indexed16 GPR64sp:$Rn, - uimm12s2:$offset)), - (LDRHHui GPR64sp:$Rn, uimm12s2:$offset)>; -def : Pat<(relaxed_load - (am_unscaled16 GPR64sp:$Rn, simm9:$offset)), - (LDURHHi GPR64sp:$Rn, simm9:$offset)>; - -// 32-bit loads -def : Pat<(acquiring_load GPR64sp:$ptr), (LDARW GPR64sp:$ptr)>; -def : Pat<(relaxed_load (ro_Windexed32 GPR64sp:$Rn, GPR32:$Rm, - ro_Wextend32:$extend)), - (LDRWroW GPR64sp:$Rn, GPR32:$Rm, ro_Wextend32:$extend)>; -def : Pat<(relaxed_load (ro_Xindexed32 GPR64sp:$Rn, GPR64:$Rm, - ro_Xextend32:$extend)), - (LDRWroX GPR64sp:$Rn, GPR64:$Rm, ro_Xextend32:$extend)>; -def : Pat<(relaxed_load (am_indexed32 GPR64sp:$Rn, - uimm12s4:$offset)), - (LDRWui GPR64sp:$Rn, uimm12s4:$offset)>; -def : Pat<(relaxed_load - (am_unscaled32 GPR64sp:$Rn, simm9:$offset)), - (LDURWi GPR64sp:$Rn, simm9:$offset)>; - -// 64-bit loads -def : Pat<(acquiring_load GPR64sp:$ptr), (LDARX GPR64sp:$ptr)>; -def : Pat<(relaxed_load (ro_Windexed64 GPR64sp:$Rn, GPR32:$Rm, - ro_Wextend64:$extend)), - (LDRXroW GPR64sp:$Rn, GPR32:$Rm, ro_Wextend64:$extend)>; -def : Pat<(relaxed_load (ro_Xindexed64 GPR64sp:$Rn, GPR64:$Rm, - ro_Xextend64:$extend)), - (LDRXroX GPR64sp:$Rn, GPR64:$Rm, ro_Xextend64:$extend)>; -def : Pat<(relaxed_load (am_indexed64 GPR64sp:$Rn, - uimm12s8:$offset)), - (LDRXui GPR64sp:$Rn, 
uimm12s8:$offset)>; -def : Pat<(relaxed_load - (am_unscaled64 GPR64sp:$Rn, simm9:$offset)), - (LDURXi GPR64sp:$Rn, simm9:$offset)>; - -//===---------------------------------- -// Atomic stores -//===---------------------------------- - -// When they're actually atomic, only one addressing mode (GPR64sp) is -// supported, but when they're relaxed and anything can be used, all the -// standard modes would be valid and may give efficiency gains. - -// A store operation that actually needs release semantics. -class releasing_store - : PatFrag<(ops node:$ptr, node:$val), (base node:$ptr, node:$val), [{ - AtomicOrdering Ordering = cast(N)->getOrdering(); - assert(Ordering != AcquireRelease && "unexpected store ordering"); - return Ordering == Release || Ordering == SequentiallyConsistent; -}]>; - -// An atomic store operation that doesn't actually need to be atomic on ARM64. -class relaxed_store - : PatFrag<(ops node:$ptr, node:$val), (base node:$ptr, node:$val), [{ - AtomicOrdering Ordering = cast(N)->getOrdering(); - return Ordering == Monotonic || Ordering == Unordered; -}]>; - -// 8-bit stores -def : Pat<(releasing_store GPR64sp:$ptr, GPR32:$val), - (STLRB GPR32:$val, GPR64sp:$ptr)>; -def : Pat<(relaxed_store - (ro_Windexed8 GPR64sp:$Rn, GPR32:$Rm, ro_Wextend8:$extend), - GPR32:$val), - (STRBBroW GPR32:$val, GPR64sp:$Rn, GPR32:$Rm, ro_Wextend8:$extend)>; -def : Pat<(relaxed_store - (ro_Xindexed8 GPR64sp:$Rn, GPR64:$Rm, ro_Xextend8:$extend), - GPR32:$val), - (STRBBroX GPR32:$val, GPR64sp:$Rn, GPR64:$Rm, ro_Xextend8:$extend)>; -def : Pat<(relaxed_store - (am_indexed8 GPR64sp:$Rn, uimm12s1:$offset), GPR32:$val), - (STRBBui GPR32:$val, GPR64sp:$Rn, uimm12s1:$offset)>; -def : Pat<(relaxed_store - (am_unscaled8 GPR64sp:$Rn, simm9:$offset), GPR32:$val), - (STURBBi GPR32:$val, GPR64sp:$Rn, simm9:$offset)>; - -// 16-bit stores -def : Pat<(releasing_store GPR64sp:$ptr, GPR32:$val), - (STLRH GPR32:$val, GPR64sp:$ptr)>; -def : Pat<(relaxed_store (ro_Windexed16 GPR64sp:$Rn, GPR32:$Rm, - ro_Wextend16:$extend), - GPR32:$val), - (STRHHroW GPR32:$val, GPR64sp:$Rn, GPR32:$Rm, ro_Wextend16:$extend)>; -def : Pat<(relaxed_store (ro_Xindexed16 GPR64sp:$Rn, GPR64:$Rm, - ro_Xextend16:$extend), - GPR32:$val), - (STRHHroX GPR32:$val, GPR64sp:$Rn, GPR64:$Rm, ro_Xextend16:$extend)>; -def : Pat<(relaxed_store - (am_indexed16 GPR64sp:$Rn, uimm12s2:$offset), GPR32:$val), - (STRHHui GPR32:$val, GPR64sp:$Rn, uimm12s2:$offset)>; -def : Pat<(relaxed_store - (am_unscaled16 GPR64sp:$Rn, simm9:$offset), GPR32:$val), - (STURHHi GPR32:$val, GPR64sp:$Rn, simm9:$offset)>; - -// 32-bit stores -def : Pat<(releasing_store GPR64sp:$ptr, GPR32:$val), - (STLRW GPR32:$val, GPR64sp:$ptr)>; -def : Pat<(relaxed_store (ro_Windexed32 GPR64sp:$Rn, GPR32:$Rm, - ro_Wextend32:$extend), - GPR32:$val), - (STRWroW GPR32:$val, GPR64sp:$Rn, GPR32:$Rm, ro_Wextend32:$extend)>; -def : Pat<(relaxed_store (ro_Xindexed32 GPR64sp:$Rn, GPR64:$Rm, - ro_Xextend32:$extend), - GPR32:$val), - (STRWroX GPR32:$val, GPR64sp:$Rn, GPR64:$Rm, ro_Xextend32:$extend)>; -def : Pat<(relaxed_store - (am_indexed32 GPR64sp:$Rn, uimm12s4:$offset), GPR32:$val), - (STRWui GPR32:$val, GPR64sp:$Rn, uimm12s4:$offset)>; -def : Pat<(relaxed_store - (am_unscaled32 GPR64sp:$Rn, simm9:$offset), GPR32:$val), - (STURWi GPR32:$val, GPR64sp:$Rn, simm9:$offset)>; - -// 64-bit stores -def : Pat<(releasing_store GPR64sp:$ptr, GPR64:$val), - (STLRX GPR64:$val, GPR64sp:$ptr)>; -def : Pat<(relaxed_store (ro_Windexed64 GPR64sp:$Rn, GPR32:$Rm, - ro_Wextend16:$extend), - GPR64:$val), - (STRXroW 
GPR64:$val, GPR64sp:$Rn, GPR32:$Rm, ro_Wextend64:$extend)>; -def : Pat<(relaxed_store (ro_Xindexed64 GPR64sp:$Rn, GPR64:$Rm, - ro_Xextend16:$extend), - GPR64:$val), - (STRXroX GPR64:$val, GPR64sp:$Rn, GPR64:$Rm, ro_Xextend64:$extend)>; -def : Pat<(relaxed_store - (am_indexed64 GPR64sp:$Rn, uimm12s8:$offset), GPR64:$val), - (STRXui GPR64:$val, GPR64sp:$Rn, uimm12s8:$offset)>; -def : Pat<(relaxed_store - (am_unscaled64 GPR64sp:$Rn, simm9:$offset), GPR64:$val), - (STURXi GPR64:$val, GPR64sp:$Rn, simm9:$offset)>; - -//===---------------------------------- -// Low-level exclusive operations -//===---------------------------------- - -// Load-exclusives. - -def ldxr_1 : PatFrag<(ops node:$ptr), (int_arm64_ldxr node:$ptr), [{ - return cast(N)->getMemoryVT() == MVT::i8; -}]>; - -def ldxr_2 : PatFrag<(ops node:$ptr), (int_arm64_ldxr node:$ptr), [{ - return cast(N)->getMemoryVT() == MVT::i16; -}]>; - -def ldxr_4 : PatFrag<(ops node:$ptr), (int_arm64_ldxr node:$ptr), [{ - return cast(N)->getMemoryVT() == MVT::i32; -}]>; - -def ldxr_8 : PatFrag<(ops node:$ptr), (int_arm64_ldxr node:$ptr), [{ - return cast(N)->getMemoryVT() == MVT::i64; -}]>; - -def : Pat<(ldxr_1 GPR64sp:$addr), - (SUBREG_TO_REG (i64 0), (LDXRB GPR64sp:$addr), sub_32)>; -def : Pat<(ldxr_2 GPR64sp:$addr), - (SUBREG_TO_REG (i64 0), (LDXRH GPR64sp:$addr), sub_32)>; -def : Pat<(ldxr_4 GPR64sp:$addr), - (SUBREG_TO_REG (i64 0), (LDXRW GPR64sp:$addr), sub_32)>; -def : Pat<(ldxr_8 GPR64sp:$addr), (LDXRX GPR64sp:$addr)>; - -def : Pat<(and (ldxr_1 GPR64sp:$addr), 0xff), - (SUBREG_TO_REG (i64 0), (LDXRB GPR64sp:$addr), sub_32)>; -def : Pat<(and (ldxr_2 GPR64sp:$addr), 0xffff), - (SUBREG_TO_REG (i64 0), (LDXRH GPR64sp:$addr), sub_32)>; -def : Pat<(and (ldxr_4 GPR64sp:$addr), 0xffffffff), - (SUBREG_TO_REG (i64 0), (LDXRW GPR64sp:$addr), sub_32)>; - -// Load-exclusives. - -def ldaxr_1 : PatFrag<(ops node:$ptr), (int_arm64_ldaxr node:$ptr), [{ - return cast(N)->getMemoryVT() == MVT::i8; -}]>; - -def ldaxr_2 : PatFrag<(ops node:$ptr), (int_arm64_ldaxr node:$ptr), [{ - return cast(N)->getMemoryVT() == MVT::i16; -}]>; - -def ldaxr_4 : PatFrag<(ops node:$ptr), (int_arm64_ldaxr node:$ptr), [{ - return cast(N)->getMemoryVT() == MVT::i32; -}]>; - -def ldaxr_8 : PatFrag<(ops node:$ptr), (int_arm64_ldaxr node:$ptr), [{ - return cast(N)->getMemoryVT() == MVT::i64; -}]>; - -def : Pat<(ldaxr_1 GPR64sp:$addr), - (SUBREG_TO_REG (i64 0), (LDAXRB GPR64sp:$addr), sub_32)>; -def : Pat<(ldaxr_2 GPR64sp:$addr), - (SUBREG_TO_REG (i64 0), (LDAXRH GPR64sp:$addr), sub_32)>; -def : Pat<(ldaxr_4 GPR64sp:$addr), - (SUBREG_TO_REG (i64 0), (LDAXRW GPR64sp:$addr), sub_32)>; -def : Pat<(ldaxr_8 GPR64sp:$addr), (LDAXRX GPR64sp:$addr)>; - -def : Pat<(and (ldaxr_1 GPR64sp:$addr), 0xff), - (SUBREG_TO_REG (i64 0), (LDAXRB GPR64sp:$addr), sub_32)>; -def : Pat<(and (ldaxr_2 GPR64sp:$addr), 0xffff), - (SUBREG_TO_REG (i64 0), (LDAXRH GPR64sp:$addr), sub_32)>; -def : Pat<(and (ldaxr_4 GPR64sp:$addr), 0xffffffff), - (SUBREG_TO_REG (i64 0), (LDAXRW GPR64sp:$addr), sub_32)>; - -// Store-exclusives. 
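[Editor's note, not part of the patch] The acquiring_load/relaxed_load and releasing_store/relaxed_store fragments above all reduce to the same ordering test on the node's AtomicOrdering. A minimal standalone sketch of that classification, using a local enum in place of LLVM's types (names here are illustrative):

#include <cassert>

// Local stand-in for the atomic ordering values used by the PatFrag
// predicates above.
enum class Ordering { Unordered, Monotonic, Acquire, Release,
                      AcquireRelease, SequentiallyConsistent };

// Mirrors acquiring_load: the load must use LDAR*.
bool needsAcquire(Ordering O) {
  assert(O != Ordering::AcquireRelease && "unexpected load ordering");
  return O == Ordering::Acquire || O == Ordering::SequentiallyConsistent;
}

// Mirrors releasing_store: the store must use STLR*.
bool needsRelease(Ordering O) {
  assert(O != Ordering::AcquireRelease && "unexpected store ordering");
  return O == Ordering::Release || O == Ordering::SequentiallyConsistent;
}

// Mirrors relaxed_load / relaxed_store: any addressing mode is acceptable.
bool isRelaxed(Ordering O) {
  return O == Ordering::Monotonic || O == Ordering::Unordered;
}

Only the relaxed forms are allowed the register-offset and unscaled addressing modes; the acquire/release forms are restricted to a plain GPR64sp base, as the comment at the top of each section notes.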
- -def stxr_1 : PatFrag<(ops node:$val, node:$ptr), - (int_arm64_stxr node:$val, node:$ptr), [{ - return cast(N)->getMemoryVT() == MVT::i8; -}]>; - -def stxr_2 : PatFrag<(ops node:$val, node:$ptr), - (int_arm64_stxr node:$val, node:$ptr), [{ - return cast(N)->getMemoryVT() == MVT::i16; -}]>; - -def stxr_4 : PatFrag<(ops node:$val, node:$ptr), - (int_arm64_stxr node:$val, node:$ptr), [{ - return cast(N)->getMemoryVT() == MVT::i32; -}]>; - -def stxr_8 : PatFrag<(ops node:$val, node:$ptr), - (int_arm64_stxr node:$val, node:$ptr), [{ - return cast(N)->getMemoryVT() == MVT::i64; -}]>; - - -def : Pat<(stxr_1 GPR64:$val, GPR64sp:$addr), - (STXRB (EXTRACT_SUBREG GPR64:$val, sub_32), GPR64sp:$addr)>; -def : Pat<(stxr_2 GPR64:$val, GPR64sp:$addr), - (STXRH (EXTRACT_SUBREG GPR64:$val, sub_32), GPR64sp:$addr)>; -def : Pat<(stxr_4 GPR64:$val, GPR64sp:$addr), - (STXRW (EXTRACT_SUBREG GPR64:$val, sub_32), GPR64sp:$addr)>; -def : Pat<(stxr_8 GPR64:$val, GPR64sp:$addr), - (STXRX GPR64:$val, GPR64sp:$addr)>; - -def : Pat<(stxr_1 (zext (and GPR32:$val, 0xff)), GPR64sp:$addr), - (STXRB GPR32:$val, GPR64sp:$addr)>; -def : Pat<(stxr_2 (zext (and GPR32:$val, 0xffff)), GPR64sp:$addr), - (STXRH GPR32:$val, GPR64sp:$addr)>; -def : Pat<(stxr_4 (zext GPR32:$val), GPR64sp:$addr), - (STXRW GPR32:$val, GPR64sp:$addr)>; - -def : Pat<(stxr_1 (and GPR64:$val, 0xff), GPR64sp:$addr), - (STXRB (EXTRACT_SUBREG GPR64:$val, sub_32), GPR64sp:$addr)>; -def : Pat<(stxr_2 (and GPR64:$val, 0xffff), GPR64sp:$addr), - (STXRH (EXTRACT_SUBREG GPR64:$val, sub_32), GPR64sp:$addr)>; -def : Pat<(stxr_4 (and GPR64:$val, 0xffffffff), GPR64sp:$addr), - (STXRW (EXTRACT_SUBREG GPR64:$val, sub_32), GPR64sp:$addr)>; - -// Store-release-exclusives. - -def stlxr_1 : PatFrag<(ops node:$val, node:$ptr), - (int_arm64_stlxr node:$val, node:$ptr), [{ - return cast(N)->getMemoryVT() == MVT::i8; -}]>; - -def stlxr_2 : PatFrag<(ops node:$val, node:$ptr), - (int_arm64_stlxr node:$val, node:$ptr), [{ - return cast(N)->getMemoryVT() == MVT::i16; -}]>; - -def stlxr_4 : PatFrag<(ops node:$val, node:$ptr), - (int_arm64_stlxr node:$val, node:$ptr), [{ - return cast(N)->getMemoryVT() == MVT::i32; -}]>; - -def stlxr_8 : PatFrag<(ops node:$val, node:$ptr), - (int_arm64_stlxr node:$val, node:$ptr), [{ - return cast(N)->getMemoryVT() == MVT::i64; -}]>; - - -def : Pat<(stlxr_1 GPR64:$val, GPR64sp:$addr), - (STLXRB (EXTRACT_SUBREG GPR64:$val, sub_32), GPR64sp:$addr)>; -def : Pat<(stlxr_2 GPR64:$val, GPR64sp:$addr), - (STLXRH (EXTRACT_SUBREG GPR64:$val, sub_32), GPR64sp:$addr)>; -def : Pat<(stlxr_4 GPR64:$val, GPR64sp:$addr), - (STLXRW (EXTRACT_SUBREG GPR64:$val, sub_32), GPR64sp:$addr)>; -def : Pat<(stlxr_8 GPR64:$val, GPR64sp:$addr), - (STLXRX GPR64:$val, GPR64sp:$addr)>; - -def : Pat<(stlxr_1 (zext (and GPR32:$val, 0xff)), GPR64sp:$addr), - (STLXRB GPR32:$val, GPR64sp:$addr)>; -def : Pat<(stlxr_2 (zext (and GPR32:$val, 0xffff)), GPR64sp:$addr), - (STLXRH GPR32:$val, GPR64sp:$addr)>; -def : Pat<(stlxr_4 (zext GPR32:$val), GPR64sp:$addr), - (STLXRW GPR32:$val, GPR64sp:$addr)>; - -def : Pat<(stlxr_1 (and GPR64:$val, 0xff), GPR64sp:$addr), - (STLXRB (EXTRACT_SUBREG GPR64:$val, sub_32), GPR64sp:$addr)>; -def : Pat<(stlxr_2 (and GPR64:$val, 0xffff), GPR64sp:$addr), - (STLXRH (EXTRACT_SUBREG GPR64:$val, sub_32), GPR64sp:$addr)>; -def : Pat<(stlxr_4 (and GPR64:$val, 0xffffffff), GPR64sp:$addr), - (STLXRW (EXTRACT_SUBREG GPR64:$val, sub_32), GPR64sp:$addr)>; - - -// And clear exclusive. 
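[Editor's note, not part of the patch] The int_arm64_ldxr/ldaxr/stxr/stlxr intrinsics matched above are the building blocks of exclusive-monitor loops. As a rough illustration of what such a loop implements at the source level (plain std::atomic, not the intrinsics themselves):

#include <atomic>
#include <cstdint>

// A sequentially consistent compare-and-swap.  Without a single-instruction
// CAS, AArch64 expands this to an exclusive loop of roughly this shape:
//     loop: ldaxr   x8, [x0]        // load-acquire-exclusive
//           cmp     x8, x1
//           b.ne    out             // value differs: give up
//           stlxr   w9, x2, [x0]    // store-release-exclusive, w9 = status
//           cbnz    w9, loop        // monitor lost: retry
//     out:  ...
// i.e. the shape the ldaxr/stlxr intrinsic patterns above exist to express.
bool cas64(std::atomic<uint64_t> &V, uint64_t Expected, uint64_t Desired) {
  return V.compare_exchange_strong(Expected, Desired,
                                   std::memory_order_seq_cst);
}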
- -def : Pat<(int_arm64_clrex), (CLREX 0xf)>; diff --git a/lib/Target/ARM64/ARM64InstrFormats.td b/lib/Target/ARM64/ARM64InstrFormats.td deleted file mode 100644 index ea45b3d4fb2..00000000000 --- a/lib/Target/ARM64/ARM64InstrFormats.td +++ /dev/null @@ -1,8574 +0,0 @@ -//===- ARM64InstrFormats.td - ARM64 Instruction Formats ------*- tblgen -*-===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// - -//===----------------------------------------------------------------------===// -// Describe ARM64 instructions format here -// - -// Format specifies the encoding used by the instruction. This is part of the -// ad-hoc solution used to emit machine instruction encodings by our machine -// code emitter. -class Format val> { - bits<2> Value = val; -} - -def PseudoFrm : Format<0>; -def NormalFrm : Format<1>; // Do we need any others? - -// ARM64 Instruction Format -class ARM64Inst : Instruction { - field bits<32> Inst; // Instruction encoding. - // Mask of bits that cause an encoding to be UNPREDICTABLE. - // If a bit is set, then if the corresponding bit in the - // target encoding differs from its value in the "Inst" field, - // the instruction is UNPREDICTABLE (SoftFail in abstract parlance). - field bits<32> Unpredictable = 0; - // SoftFail is the generic name for this field, but we alias it so - // as to make it more obvious what it means in ARM-land. - field bits<32> SoftFail = Unpredictable; - let Namespace = "ARM64"; - Format F = f; - bits<2> Form = F.Value; - let Pattern = []; - let Constraints = cstr; -} - -// Pseudo instructions (don't have encoding information) -class Pseudo pattern, string cstr = ""> - : ARM64Inst { - dag OutOperandList = oops; - dag InOperandList = iops; - let Pattern = pattern; - let isCodeGenOnly = 1; -} - -// Real instructions (have encoding information) -class EncodedI pattern> : ARM64Inst { - let Pattern = pattern; - let Size = 4; -} - -// Normal instructions -class I pattern> - : EncodedI { - dag OutOperandList = oops; - dag InOperandList = iops; - let AsmString = !strconcat(asm, operands); -} - -class TriOpFrag : PatFrag<(ops node:$LHS, node:$MHS, node:$RHS), res>; -class BinOpFrag : PatFrag<(ops node:$LHS, node:$RHS), res>; -class UnOpFrag : PatFrag<(ops node:$LHS), res>; - -// Helper fragment for an extract of the high portion of a 128-bit vector. -def extract_high_v16i8 : - UnOpFrag<(extract_subvector (v16i8 node:$LHS), (i64 8))>; -def extract_high_v8i16 : - UnOpFrag<(extract_subvector (v8i16 node:$LHS), (i64 4))>; -def extract_high_v4i32 : - UnOpFrag<(extract_subvector (v4i32 node:$LHS), (i64 2))>; -def extract_high_v2i64 : - UnOpFrag<(extract_subvector (v2i64 node:$LHS), (i64 1))>; - -//===----------------------------------------------------------------------===// -// Asm Operand Classes. -// - -// Shifter operand for arithmetic shifted encodings. -def ShifterOperand : AsmOperandClass { - let Name = "Shifter"; -} - -// Shifter operand for mov immediate encodings. 
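[Editor's note, not part of the patch] The ARM64Inst/EncodedI base classes above build every instruction as a fixed 4-byte word assembled from "let Inst{hi-lo} = value" field assignments. A rough C++ analogue of that packing (not how TableGen actually implements it; the example layout is the branch-register format defined further below in this file):

#include <cassert>
#include <cstdint>

struct Encoding {
  uint32_t Inst = 0;

  // Equivalent of "let Inst{Hi-Lo} = Value".
  void setBits(unsigned Hi, unsigned Lo, uint32_t Value) {
    assert(Hi >= Lo && Hi < 32);
    unsigned Width = Hi - Lo + 1;
    uint32_t Mask = (Width == 32) ? ~0u : ((1u << Width) - 1u);
    assert((Value & ~Mask) == 0 && "field value too wide");
    Inst = (Inst & ~(Mask << Lo)) | (Value << Lo);
  }
};

// Illustrative only: the BaseBranchReg/BranchReg layout (BR/BLR/RET family).
uint32_t encodeBranchReg(uint32_t Opc4, uint32_t Rn) {
  Encoding E;
  E.setBits(31, 25, 0b1101011);
  E.setBits(24, 21, Opc4);
  E.setBits(20, 16, 0b11111);
  E.setBits(15, 10, 0b000000);
  E.setBits(9, 5, Rn);
  E.setBits(4, 0, 0b00000);
  return E.Inst;
}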
-def MovImm32ShifterOperand : AsmOperandClass { - let SuperClasses = [ShifterOperand]; - let Name = "MovImm32Shifter"; - let RenderMethod = "addShifterOperands"; - let DiagnosticType = "InvalidMovImm32Shift"; -} -def MovImm64ShifterOperand : AsmOperandClass { - let SuperClasses = [ShifterOperand]; - let Name = "MovImm64Shifter"; - let RenderMethod = "addShifterOperands"; - let DiagnosticType = "InvalidMovImm64Shift"; -} - -// Shifter operand for arithmetic register shifted encodings. -class ArithmeticShifterOperand : AsmOperandClass { - let SuperClasses = [ShifterOperand]; - let Name = "ArithmeticShifter" # width; - let PredicateMethod = "isArithmeticShifter<" # width # ">"; - let RenderMethod = "addShifterOperands"; - let DiagnosticType = "AddSubRegShift" # width; -} - -def ArithmeticShifterOperand32 : ArithmeticShifterOperand<32>; -def ArithmeticShifterOperand64 : ArithmeticShifterOperand<64>; - -// Shifter operand for logical register shifted encodings. -class LogicalShifterOperand : AsmOperandClass { - let SuperClasses = [ShifterOperand]; - let Name = "LogicalShifter" # width; - let PredicateMethod = "isLogicalShifter<" # width # ">"; - let RenderMethod = "addShifterOperands"; - let DiagnosticType = "AddSubRegShift" # width; -} - -def LogicalShifterOperand32 : LogicalShifterOperand<32>; -def LogicalShifterOperand64 : LogicalShifterOperand<64>; - -// Shifter operand for logical vector 128/64-bit shifted encodings. -def LogicalVecShifterOperand : AsmOperandClass { - let SuperClasses = [ShifterOperand]; - let Name = "LogicalVecShifter"; - let RenderMethod = "addShifterOperands"; -} -def LogicalVecHalfWordShifterOperand : AsmOperandClass { - let SuperClasses = [LogicalVecShifterOperand]; - let Name = "LogicalVecHalfWordShifter"; - let RenderMethod = "addShifterOperands"; -} - -// The "MSL" shifter on the vector MOVI instruction. -def MoveVecShifterOperand : AsmOperandClass { - let SuperClasses = [ShifterOperand]; - let Name = "MoveVecShifter"; - let RenderMethod = "addShifterOperands"; -} - -// Extend operand for arithmetic encodings. -def ExtendOperand : AsmOperandClass { - let Name = "Extend"; - let DiagnosticType = "AddSubRegExtendLarge"; -} -def ExtendOperand64 : AsmOperandClass { - let SuperClasses = [ExtendOperand]; - let Name = "Extend64"; - let DiagnosticType = "AddSubRegExtendSmall"; -} -// 'extend' that's a lsl of a 64-bit register. -def ExtendOperandLSL64 : AsmOperandClass { - let SuperClasses = [ExtendOperand]; - let Name = "ExtendLSL64"; - let RenderMethod = "addExtend64Operands"; - let DiagnosticType = "AddSubRegExtendLarge"; -} - -// 8-bit floating-point immediate encodings. -def FPImmOperand : AsmOperandClass { - let Name = "FPImm"; - let ParserMethod = "tryParseFPImm"; - let DiagnosticType = "InvalidFPImm"; -} - -def CondCode : AsmOperandClass { - let Name = "CondCode"; - let DiagnosticType = "InvalidCondCode"; -} - -// A 32-bit register pasrsed as 64-bit -def GPR32as64Operand : AsmOperandClass { - let Name = "GPR32as64"; -} -def GPR32as64 : RegisterOperand { - let ParserMatchClass = GPR32as64Operand; -} - -// 8-bit immediate for AdvSIMD where 64-bit values of the form: -// aaaaaaaa bbbbbbbb cccccccc dddddddd eeeeeeee ffffffff gggggggg hhhhhhhh -// are encoded as the eight bit value 'abcdefgh'. -def SIMDImmType10Operand : AsmOperandClass { let Name = "SIMDImmType10"; } - - -//===----------------------------------------------------------------------===// -// Operand Definitions. -// - -// ADR[P] instruction labels. 
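[Editor's note, not part of the patch] The adrlabel/adrplabel operands defined just below (and the ADRI format class later in this file, with its "page" bit) encode PC-relative targets for ADR and ADRP. A short sketch of the address arithmetic they stand for:

#include <cstdint>

// ADR forms a PC-relative address directly: PC + simm21.
uint64_t adrTarget(uint64_t PC, int64_t Imm21) {
  return PC + Imm21;
}

// ADRP forms the 4 KiB page address of a PC-relative target: the 21-bit
// immediate selects a page and the low 12 bits of the PC are ignored, so a
// full address is normally completed by a following ADD/LDR carrying the
// low-12-bit offset.
uint64_t adrpTarget(uint64_t PC, int64_t Imm21) {
  return (PC & ~uint64_t(0xfff)) + (uint64_t(Imm21) << 12);
}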
-def AdrpOperand : AsmOperandClass { - let Name = "AdrpLabel"; - let ParserMethod = "tryParseAdrpLabel"; - let DiagnosticType = "InvalidLabel"; -} -def adrplabel : Operand { - let EncoderMethod = "getAdrLabelOpValue"; - let PrintMethod = "printAdrpLabel"; - let ParserMatchClass = AdrpOperand; -} - -def AdrOperand : AsmOperandClass { - let Name = "AdrLabel"; - let ParserMethod = "tryParseAdrLabel"; - let DiagnosticType = "InvalidLabel"; -} -def adrlabel : Operand { - let EncoderMethod = "getAdrLabelOpValue"; - let ParserMatchClass = AdrOperand; -} - -// simm9 predicate - True if the immediate is in the range [-256, 255]. -def SImm9Operand : AsmOperandClass { - let Name = "SImm9"; - let DiagnosticType = "InvalidMemoryIndexedSImm9"; -} -def simm9 : Operand, ImmLeaf= -256 && Imm < 256; }]> { - let ParserMatchClass = SImm9Operand; -} - -// simm7sN predicate - True if the immediate is a multiple of N in the range -// [-64 * N, 63 * N]. -class SImm7Scaled : AsmOperandClass { - let Name = "SImm7s" # Scale; - let DiagnosticType = "InvalidMemoryIndexed" # Scale # "SImm7"; -} - -def SImm7s4Operand : SImm7Scaled<4>; -def SImm7s8Operand : SImm7Scaled<8>; -def SImm7s16Operand : SImm7Scaled<16>; - -def simm7s4 : Operand { - let ParserMatchClass = SImm7s4Operand; - let PrintMethod = "printImmScale<4>"; -} - -def simm7s8 : Operand { - let ParserMatchClass = SImm7s8Operand; - let PrintMethod = "printImmScale<8>"; -} - -def simm7s16 : Operand { - let ParserMatchClass = SImm7s16Operand; - let PrintMethod = "printImmScale<16>"; -} - -class AsmImmRange : AsmOperandClass { - let Name = "Imm" # Low # "_" # High; - let DiagnosticType = "InvalidImm" # Low # "_" # High; -} - -def Imm1_8Operand : AsmImmRange<1, 8>; -def Imm1_16Operand : AsmImmRange<1, 16>; -def Imm1_32Operand : AsmImmRange<1, 32>; -def Imm1_64Operand : AsmImmRange<1, 64>; - -def MovZSymbolG3AsmOperand : AsmOperandClass { - let Name = "MovZSymbolG3"; - let RenderMethod = "addImmOperands"; -} - -def movz_symbol_g3 : Operand { - let ParserMatchClass = MovZSymbolG3AsmOperand; -} - -def MovZSymbolG2AsmOperand : AsmOperandClass { - let Name = "MovZSymbolG2"; - let RenderMethod = "addImmOperands"; -} - -def movz_symbol_g2 : Operand { - let ParserMatchClass = MovZSymbolG2AsmOperand; -} - -def MovZSymbolG1AsmOperand : AsmOperandClass { - let Name = "MovZSymbolG1"; - let RenderMethod = "addImmOperands"; -} - -def movz_symbol_g1 : Operand { - let ParserMatchClass = MovZSymbolG1AsmOperand; -} - -def MovZSymbolG0AsmOperand : AsmOperandClass { - let Name = "MovZSymbolG0"; - let RenderMethod = "addImmOperands"; -} - -def movz_symbol_g0 : Operand { - let ParserMatchClass = MovZSymbolG0AsmOperand; -} - -def MovKSymbolG3AsmOperand : AsmOperandClass { - let Name = "MovKSymbolG3"; - let RenderMethod = "addImmOperands"; -} - -def movk_symbol_g3 : Operand { - let ParserMatchClass = MovKSymbolG3AsmOperand; -} - -def MovKSymbolG2AsmOperand : AsmOperandClass { - let Name = "MovKSymbolG2"; - let RenderMethod = "addImmOperands"; -} - -def movk_symbol_g2 : Operand { - let ParserMatchClass = MovKSymbolG2AsmOperand; -} - -def MovKSymbolG1AsmOperand : AsmOperandClass { - let Name = "MovKSymbolG1"; - let RenderMethod = "addImmOperands"; -} - -def movk_symbol_g1 : Operand { - let ParserMatchClass = MovKSymbolG1AsmOperand; -} - -def MovKSymbolG0AsmOperand : AsmOperandClass { - let Name = "MovKSymbolG0"; - let RenderMethod = "addImmOperands"; -} - -def movk_symbol_g0 : Operand { - let ParserMatchClass = MovKSymbolG0AsmOperand; -} - -class fixedpoint_i32 - : Operand, - ComplexPattern", 
[fpimm, ld]> { - let EncoderMethod = "getFixedPointScaleOpValue"; - let DecoderMethod = "DecodeFixedPointScaleImm32"; - let ParserMatchClass = Imm1_32Operand; -} - -class fixedpoint_i64 - : Operand, - ComplexPattern", [fpimm, ld]> { - let EncoderMethod = "getFixedPointScaleOpValue"; - let DecoderMethod = "DecodeFixedPointScaleImm64"; - let ParserMatchClass = Imm1_64Operand; -} - -def fixedpoint_f32_i32 : fixedpoint_i32; -def fixedpoint_f64_i32 : fixedpoint_i32; - -def fixedpoint_f32_i64 : fixedpoint_i64; -def fixedpoint_f64_i64 : fixedpoint_i64; - -def vecshiftR8 : Operand, ImmLeaf 0) && (((uint32_t)Imm) < 9); -}]> { - let EncoderMethod = "getVecShiftR8OpValue"; - let DecoderMethod = "DecodeVecShiftR8Imm"; - let ParserMatchClass = Imm1_8Operand; -} -def vecshiftR16 : Operand, ImmLeaf 0) && (((uint32_t)Imm) < 17); -}]> { - let EncoderMethod = "getVecShiftR16OpValue"; - let DecoderMethod = "DecodeVecShiftR16Imm"; - let ParserMatchClass = Imm1_16Operand; -} -def vecshiftR16Narrow : Operand, ImmLeaf 0) && (((uint32_t)Imm) < 9); -}]> { - let EncoderMethod = "getVecShiftR16OpValue"; - let DecoderMethod = "DecodeVecShiftR16ImmNarrow"; - let ParserMatchClass = Imm1_8Operand; -} -def vecshiftR32 : Operand, ImmLeaf 0) && (((uint32_t)Imm) < 33); -}]> { - let EncoderMethod = "getVecShiftR32OpValue"; - let DecoderMethod = "DecodeVecShiftR32Imm"; - let ParserMatchClass = Imm1_32Operand; -} -def vecshiftR32Narrow : Operand, ImmLeaf 0) && (((uint32_t)Imm) < 17); -}]> { - let EncoderMethod = "getVecShiftR32OpValue"; - let DecoderMethod = "DecodeVecShiftR32ImmNarrow"; - let ParserMatchClass = Imm1_16Operand; -} -def vecshiftR64 : Operand, ImmLeaf 0) && (((uint32_t)Imm) < 65); -}]> { - let EncoderMethod = "getVecShiftR64OpValue"; - let DecoderMethod = "DecodeVecShiftR64Imm"; - let ParserMatchClass = Imm1_64Operand; -} -def vecshiftR64Narrow : Operand, ImmLeaf 0) && (((uint32_t)Imm) < 33); -}]> { - let EncoderMethod = "getVecShiftR64OpValue"; - let DecoderMethod = "DecodeVecShiftR64ImmNarrow"; - let ParserMatchClass = Imm1_32Operand; -} - -def Imm0_7Operand : AsmImmRange<0, 7>; -def Imm0_15Operand : AsmImmRange<0, 15>; -def Imm0_31Operand : AsmImmRange<0, 31>; -def Imm0_63Operand : AsmImmRange<0, 63>; - -def vecshiftL8 : Operand, ImmLeaf { - let EncoderMethod = "getVecShiftL8OpValue"; - let DecoderMethod = "DecodeVecShiftL8Imm"; - let ParserMatchClass = Imm0_7Operand; -} -def vecshiftL16 : Operand, ImmLeaf { - let EncoderMethod = "getVecShiftL16OpValue"; - let DecoderMethod = "DecodeVecShiftL16Imm"; - let ParserMatchClass = Imm0_15Operand; -} -def vecshiftL32 : Operand, ImmLeaf { - let EncoderMethod = "getVecShiftL32OpValue"; - let DecoderMethod = "DecodeVecShiftL32Imm"; - let ParserMatchClass = Imm0_31Operand; -} -def vecshiftL64 : Operand, ImmLeaf { - let EncoderMethod = "getVecShiftL64OpValue"; - let DecoderMethod = "DecodeVecShiftL64Imm"; - let ParserMatchClass = Imm0_63Operand; -} - - -// Crazy immediate formats used by 32-bit and 64-bit logical immediate -// instructions for splatting repeating bit patterns across the immediate. 
-def logical_imm32_XFORM : SDNodeXFormgetZExtValue(), 32); - return CurDAG->getTargetConstant(enc, MVT::i32); -}]>; -def logical_imm64_XFORM : SDNodeXFormgetZExtValue(), 64); - return CurDAG->getTargetConstant(enc, MVT::i32); -}]>; - -def LogicalImm32Operand : AsmOperandClass { - let Name = "LogicalImm32"; - let DiagnosticType = "LogicalSecondSource"; -} -def LogicalImm64Operand : AsmOperandClass { - let Name = "LogicalImm64"; - let DiagnosticType = "LogicalSecondSource"; -} -def logical_imm32 : Operand, PatLeaf<(imm), [{ - return ARM64_AM::isLogicalImmediate(N->getZExtValue(), 32); -}], logical_imm32_XFORM> { - let PrintMethod = "printLogicalImm32"; - let ParserMatchClass = LogicalImm32Operand; -} -def logical_imm64 : Operand, PatLeaf<(imm), [{ - return ARM64_AM::isLogicalImmediate(N->getZExtValue(), 64); -}], logical_imm64_XFORM> { - let PrintMethod = "printLogicalImm64"; - let ParserMatchClass = LogicalImm64Operand; -} - -// imm0_65535 predicate - True if the immediate is in the range [0,65535]. -def Imm0_65535Operand : AsmImmRange<0, 65535>; -def imm0_65535 : Operand, ImmLeaf { - let ParserMatchClass = Imm0_65535Operand; - let PrintMethod = "printHexImm"; -} - -// imm0_255 predicate - True if the immediate is in the range [0,255]. -def Imm0_255Operand : AsmOperandClass { let Name = "Imm0_255"; } -def imm0_255 : Operand, ImmLeaf { - let ParserMatchClass = Imm0_255Operand; - let PrintMethod = "printHexImm"; -} - -// imm0_127 predicate - True if the immediate is in the range [0,127] -def Imm0_127Operand : AsmImmRange<0, 127>; -def imm0_127 : Operand, ImmLeaf { - let ParserMatchClass = Imm0_127Operand; - let PrintMethod = "printHexImm"; -} - -// NOTE: These imm0_N operands have to be of type i64 because i64 is the size -// for all shift-amounts. - -// imm0_63 predicate - True if the immediate is in the range [0,63] -def imm0_63 : Operand, ImmLeaf { - let ParserMatchClass = Imm0_63Operand; -} - -// imm0_31 predicate - True if the immediate is in the range [0,31] -def imm0_31 : Operand, ImmLeaf { - let ParserMatchClass = Imm0_31Operand; -} - -// imm0_15 predicate - True if the immediate is in the range [0,15] -def imm0_15 : Operand, ImmLeaf { - let ParserMatchClass = Imm0_15Operand; -} - -// imm0_7 predicate - True if the immediate is in the range [0,7] -def imm0_7 : Operand, ImmLeaf { - let ParserMatchClass = Imm0_7Operand; -} - -// An arithmetic shifter operand: -// {7-6} - shift type: 00 = lsl, 01 = lsr, 10 = asr -// {5-0} - imm6 -class arith_shift : Operand { - let PrintMethod = "printShifter"; - let ParserMatchClass = !cast( - "ArithmeticShifterOperand" # width); -} - -def arith_shift32 : arith_shift; -def arith_shift64 : arith_shift; - -class arith_shifted_reg - : Operand, - ComplexPattern { - let PrintMethod = "printShiftedRegister"; - let MIOperandInfo = (ops regclass, !cast("arith_shift" # width)); -} - -def arith_shifted_reg32 : arith_shifted_reg; -def arith_shifted_reg64 : arith_shifted_reg; - -// An arithmetic shifter operand: -// {7-6} - shift type: 00 = lsl, 01 = lsr, 10 = asr, 11 = ror -// {5-0} - imm6 -class logical_shift : Operand { - let PrintMethod = "printShifter"; - let ParserMatchClass = !cast( - "LogicalShifterOperand" # width); -} - -def logical_shift32 : logical_shift<32>; -def logical_shift64 : logical_shift<64>; - -class logical_shifted_reg - : Operand, - ComplexPattern { - let PrintMethod = "printShiftedRegister"; - let MIOperandInfo = (ops regclass, shiftop); -} - -def logical_shifted_reg32 : logical_shifted_reg; -def logical_shifted_reg64 : logical_shifted_reg; 
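[Editor's note, not part of the patch] The logical_imm32/logical_imm64 leaves above gate on ARM64_AM::isLogicalImmediate. A rough standalone equivalent of that test, under my reading of the encoding (the in-tree helper also produces the actual N:immr:imms value, which this sketch omits):

#include <cstdint>

// A value of the form 0...01...1 ?
static bool isMask(uint64_t V) { return V != 0 && ((V + 1) & V) == 0; }
// A single contiguous, non-wrapping run of ones somewhere in the word?
static bool isShiftedMask(uint64_t V) { return V != 0 && isMask((V - 1) | V); }

// Encodable as a logical immediate of the given register width: the value
// must be a replication, at some power-of-two element size, of an element
// that is a rotated run of ones, and must not be all-zeros or all-ones.
bool isLogicalImm(uint64_t Val, unsigned RegWidth) {
  uint64_t RegMask = (RegWidth == 64) ? ~0ULL : ((1ULL << RegWidth) - 1);
  Val &= RegMask;
  if (Val == 0 || Val == RegMask)
    return false;

  // Find the smallest power-of-two element size that replicates to Val.
  unsigned Size = RegWidth;
  while (Size > 2) {
    unsigned Half = Size / 2;
    uint64_t HalfMask = (1ULL << Half) - 1;
    if ((Val & HalfMask) != ((Val >> Half) & HalfMask))
      break;
    Size = Half;
  }

  // Within that element, either the ones or the zeros form one contiguous
  // run (a wrapping run of ones has contiguous zeros, and vice versa).
  uint64_t ElemMask = (Size == 64) ? ~0ULL : ((1ULL << Size) - 1);
  uint64_t Elem = Val & ElemMask;
  return isShiftedMask(Elem) || isShiftedMask(~Elem & ElemMask);
}

For example, isLogicalImm(0x00ff00ff00ff00ffULL, 64) and isLogicalImm(0xfffffff0u, 32) hold, while isLogicalImm(0xabcdefULL, 64) does not.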
- -// A logical vector shifter operand: -// {7-6} - shift type: 00 = lsl -// {5-0} - imm6: #0, #8, #16, or #24 -def logical_vec_shift : Operand { - let PrintMethod = "printShifter"; - let EncoderMethod = "getVecShifterOpValue"; - let ParserMatchClass = LogicalVecShifterOperand; -} - -// A logical vector half-word shifter operand: -// {7-6} - shift type: 00 = lsl -// {5-0} - imm6: #0 or #8 -def logical_vec_hw_shift : Operand { - let PrintMethod = "printShifter"; - let EncoderMethod = "getVecShifterOpValue"; - let ParserMatchClass = LogicalVecHalfWordShifterOperand; -} - -// A vector move shifter operand: -// {0} - imm1: #8 or #16 -def move_vec_shift : Operand { - let PrintMethod = "printShifter"; - let EncoderMethod = "getMoveVecShifterOpValue"; - let ParserMatchClass = MoveVecShifterOperand; -} - -def AddSubImmOperand : AsmOperandClass { - let Name = "AddSubImm"; - let ParserMethod = "tryParseAddSubImm"; - let DiagnosticType = "AddSubSecondSource"; -} -// An ADD/SUB immediate shifter operand: -// second operand: -// {7-6} - shift type: 00 = lsl -// {5-0} - imm6: #0 or #12 -class addsub_shifted_imm - : Operand, ComplexPattern { - let PrintMethod = "printAddSubImm"; - let EncoderMethod = "getAddSubImmOpValue"; - let ParserMatchClass = AddSubImmOperand; - let MIOperandInfo = (ops i32imm, i32imm); -} - -def addsub_shifted_imm32 : addsub_shifted_imm; -def addsub_shifted_imm64 : addsub_shifted_imm; - -class neg_addsub_shifted_imm - : Operand, ComplexPattern { - let PrintMethod = "printAddSubImm"; - let EncoderMethod = "getAddSubImmOpValue"; - let ParserMatchClass = AddSubImmOperand; - let MIOperandInfo = (ops i32imm, i32imm); -} - -def neg_addsub_shifted_imm32 : neg_addsub_shifted_imm; -def neg_addsub_shifted_imm64 : neg_addsub_shifted_imm; - -// An extend operand: -// {5-3} - extend type -// {2-0} - imm3 -def arith_extend : Operand { - let PrintMethod = "printArithExtend"; - let ParserMatchClass = ExtendOperand; -} -def arith_extend64 : Operand { - let PrintMethod = "printArithExtend"; - let ParserMatchClass = ExtendOperand64; -} - -// 'extend' that's a lsl of a 64-bit register. -def arith_extendlsl64 : Operand { - let PrintMethod = "printArithExtend"; - let ParserMatchClass = ExtendOperandLSL64; -} - -class arith_extended_reg32 : Operand, - ComplexPattern { - let PrintMethod = "printExtendedRegister"; - let MIOperandInfo = (ops GPR32, arith_extend); -} - -class arith_extended_reg32to64 : Operand, - ComplexPattern { - let PrintMethod = "printExtendedRegister"; - let MIOperandInfo = (ops GPR32, arith_extend64); -} - -// Floating-point immediate. 
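[Editor's note, not part of the patch] The fpimm32/fpimm64 operands that follow accept only constants representable as the 8-bit FMOV immediate (ARM64_AM::getFP32Imm/getFP64Imm return -1 otherwise). A standalone sketch of that format, assuming the standard expand-imm layout:

#include <cstdint>
#include <cstring>

// Expand an 8-bit immediate "abcdefgh" to the single-precision value it
// denotes: sign = a, exponent = NOT(b):b:b:b:b:b:c:d, mantissa = efgh0...0.
// Representable values are +/- (16..31)/16 * 2^(-3..4).
float expandFPImm8(uint8_t Imm) {
  uint32_t A = (Imm >> 7) & 1, B = (Imm >> 6) & 1;
  uint32_t CD = (Imm >> 4) & 3, EFGH = Imm & 0xf;
  uint32_t Exp = ((B ^ 1) << 7) | (B ? 0x7c : 0) | CD; // NOT(b), b x5, c, d
  uint32_t Bits = (A << 31) | (Exp << 23) | (EFGH << 19);
  float F;
  std::memcpy(&F, &Bits, sizeof F);
  return F;
}

// Brute-force version of the "is this constant encodable?" query, keeping the
// -1 convention used by the PatLeafs below.
int getFP32ImmApprox(float V) {
  for (int Imm = 0; Imm < 256; ++Imm)
    if (expandFPImm8(uint8_t(Imm)) == V)
      return Imm;
  return -1;
}

For instance, expandFPImm8(0x70) is 1.0f, so getFP32ImmApprox(1.0f) returns 0x70; +0.0 is deliberately not encodable here, which is why a separate fpimm0 leaf exists below.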
-def fpimm32 : Operand, - PatLeaf<(f32 fpimm), [{ - return ARM64_AM::getFP32Imm(N->getValueAPF()) != -1; - }], SDNodeXFormgetValueAPF(); - uint32_t enc = ARM64_AM::getFP32Imm(InVal); - return CurDAG->getTargetConstant(enc, MVT::i32); - }]>> { - let ParserMatchClass = FPImmOperand; - let PrintMethod = "printFPImmOperand"; -} -def fpimm64 : Operand, - PatLeaf<(f64 fpimm), [{ - return ARM64_AM::getFP64Imm(N->getValueAPF()) != -1; - }], SDNodeXFormgetValueAPF(); - uint32_t enc = ARM64_AM::getFP64Imm(InVal); - return CurDAG->getTargetConstant(enc, MVT::i32); - }]>> { - let ParserMatchClass = FPImmOperand; - let PrintMethod = "printFPImmOperand"; -} - -def fpimm8 : Operand { - let ParserMatchClass = FPImmOperand; - let PrintMethod = "printFPImmOperand"; -} - -def fpimm0 : PatLeaf<(fpimm), [{ - return N->isExactlyValue(+0.0); -}]>; - -// Vector lane operands -class AsmVectorIndex : AsmOperandClass { - let Name = "VectorIndex" # Suffix; - let DiagnosticType = "InvalidIndex" # Suffix; -} -def VectorIndex1Operand : AsmVectorIndex<"1">; -def VectorIndexBOperand : AsmVectorIndex<"B">; -def VectorIndexHOperand : AsmVectorIndex<"H">; -def VectorIndexSOperand : AsmVectorIndex<"S">; -def VectorIndexDOperand : AsmVectorIndex<"D">; - -def VectorIndex1 : Operand, ImmLeaf { - let ParserMatchClass = VectorIndex1Operand; - let PrintMethod = "printVectorIndex"; - let MIOperandInfo = (ops i64imm); -} -def VectorIndexB : Operand, ImmLeaf { - let ParserMatchClass = VectorIndexBOperand; - let PrintMethod = "printVectorIndex"; - let MIOperandInfo = (ops i64imm); -} -def VectorIndexH : Operand, ImmLeaf { - let ParserMatchClass = VectorIndexHOperand; - let PrintMethod = "printVectorIndex"; - let MIOperandInfo = (ops i64imm); -} -def VectorIndexS : Operand, ImmLeaf { - let ParserMatchClass = VectorIndexSOperand; - let PrintMethod = "printVectorIndex"; - let MIOperandInfo = (ops i64imm); -} -def VectorIndexD : Operand, ImmLeaf { - let ParserMatchClass = VectorIndexDOperand; - let PrintMethod = "printVectorIndex"; - let MIOperandInfo = (ops i64imm); -} - -// 8-bit immediate for AdvSIMD where 64-bit values of the form: -// aaaaaaaa bbbbbbbb cccccccc dddddddd eeeeeeee ffffffff gggggggg hhhhhhhh -// are encoded as the eight bit value 'abcdefgh'. -def simdimmtype10 : Operand, - PatLeaf<(f64 fpimm), [{ - return ARM64_AM::isAdvSIMDModImmType10(N->getValueAPF() - .bitcastToAPInt() - .getZExtValue()); - }], SDNodeXFormgetValueAPF(); - uint32_t enc = ARM64_AM::encodeAdvSIMDModImmType10(N->getValueAPF() - .bitcastToAPInt() - .getZExtValue()); - return CurDAG->getTargetConstant(enc, MVT::i32); - }]>> { - let ParserMatchClass = SIMDImmType10Operand; - let PrintMethod = "printSIMDType10Operand"; -} - - -//--- -// System management -//--- - -// Base encoding for system instruction operands. -let mayLoad = 0, mayStore = 0, hasSideEffects = 1 in -class BaseSystemI - : I { - let Inst{31-22} = 0b1101010100; - let Inst{21} = L; -} - -// System instructions which do not have an Rt register. -class SimpleSystemI - : BaseSystemI { - let Inst{4-0} = 0b11111; -} - -// System instructions which have an Rt register. -class RtSystemI - : BaseSystemI, - Sched<[WriteSys]> { - bits<5> Rt; - let Inst{4-0} = Rt; -} - -// Hint instructions that take both a CRm and a 3-bit immediate. -class HintI - : SimpleSystemI<0, (ins imm0_127:$imm), mnemonic#" $imm", "">, - Sched<[WriteHint]> { - bits <7> imm; - let Inst{20-12} = 0b000110010; - let Inst{11-5} = imm; -} - -// System instructions taking a single literal operand which encodes into -// CRm. 
op2 differentiates the opcodes. -def BarrierAsmOperand : AsmOperandClass { - let Name = "Barrier"; - let ParserMethod = "tryParseBarrierOperand"; -} -def barrier_op : Operand { - let PrintMethod = "printBarrierOption"; - let ParserMatchClass = BarrierAsmOperand; -} -class CRmSystemI opc, string asm> - : SimpleSystemI<0, (ins crmtype:$CRm), asm, "\t$CRm">, - Sched<[WriteBarrier]> { - bits<4> CRm; - let Inst{20-12} = 0b000110011; - let Inst{11-8} = CRm; - let Inst{7-5} = opc; -} - -// MRS/MSR system instructions. These have different operand classes because -// a different subset of registers can be accessed through each instruction. -def MRSSystemRegisterOperand : AsmOperandClass { - let Name = "MRSSystemRegister"; - let ParserMethod = "tryParseSysReg"; - let DiagnosticType = "MRS"; -} -// concatenation of 1, op0, op1, CRn, CRm, op2. 16-bit immediate. -def mrs_sysreg_op : Operand { - let ParserMatchClass = MRSSystemRegisterOperand; - let DecoderMethod = "DecodeMRSSystemRegister"; - let PrintMethod = "printMRSSystemRegister"; -} - -def MSRSystemRegisterOperand : AsmOperandClass { - let Name = "MSRSystemRegister"; - let ParserMethod = "tryParseSysReg"; - let DiagnosticType = "MSR"; -} -def msr_sysreg_op : Operand { - let ParserMatchClass = MSRSystemRegisterOperand; - let DecoderMethod = "DecodeMSRSystemRegister"; - let PrintMethod = "printMSRSystemRegister"; -} - -class MRSI : RtSystemI<1, (outs GPR64:$Rt), (ins mrs_sysreg_op:$systemreg), - "mrs", "\t$Rt, $systemreg"> { - bits<15> systemreg; - let Inst{20} = 1; - let Inst{19-5} = systemreg; -} - -// FIXME: Some of these def NZCV, others don't. Best way to model that? -// Explicitly modeling each of the system register as a register class -// would do it, but feels like overkill at this point. -class MSRI : RtSystemI<0, (outs), (ins msr_sysreg_op:$systemreg, GPR64:$Rt), - "msr", "\t$systemreg, $Rt"> { - bits<15> systemreg; - let Inst{20} = 1; - let Inst{19-5} = systemreg; -} - -def SystemPStateFieldOperand : AsmOperandClass { - let Name = "SystemPStateField"; - let ParserMethod = "tryParseSysReg"; -} -def pstatefield_op : Operand { - let ParserMatchClass = SystemPStateFieldOperand; - let PrintMethod = "printSystemPStateField"; -} - -let Defs = [NZCV] in -class MSRpstateI - : SimpleSystemI<0, (ins pstatefield_op:$pstate_field, imm0_15:$imm), - "msr", "\t$pstate_field, $imm">, - Sched<[WriteSys]> { - bits<6> pstatefield; - bits<4> imm; - let Inst{20-19} = 0b00; - let Inst{18-16} = pstatefield{5-3}; - let Inst{15-12} = 0b0100; - let Inst{11-8} = imm; - let Inst{7-5} = pstatefield{2-0}; - - let DecoderMethod = "DecodeSystemPStateInstruction"; -} - -// SYS and SYSL generic system instructions. 
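[Editor's note, not part of the patch] The MRS/MSR operand classes above describe the system-register immediate as "concatenation of 1, op0, op1, CRn, CRm, op2", with the leading 1 landing in Inst{20} and the remaining 15 bits in Inst{19-5}. A small packing helper following that comment (the sysreg name below is just an example):

#include <cassert>
#include <cstdint>

// Pack op0:op1:CRn:CRm:op2 into the 16-bit form 1:op0:op1:CRn:CRm:op2, where
// "op0" here is the low bit of the architectural 2-bit op0 (always 0b1x for
// MRS/MSR-visible registers).
uint16_t packSysReg(unsigned Op0, unsigned Op1, unsigned CRn, unsigned CRm,
                    unsigned Op2) {
  assert((Op0 == 2 || Op0 == 3) && Op1 < 8 && CRn < 16 && CRm < 16 && Op2 < 8);
  unsigned V = (1u << 15) | ((Op0 & 1u) << 14) | (Op1 << 11) | (CRn << 7) |
               (CRm << 3) | Op2;
  return uint16_t(V);
}

// Example: NZCV is op0=3, op1=3, CRn=4, CRm=2, op2=0, so packSysReg(3,3,4,2,0)
// yields the immediate behind "mrs x0, NZCV".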
-def SysCRAsmOperand : AsmOperandClass { - let Name = "SysCR"; - let ParserMethod = "tryParseSysCROperand"; -} - -def sys_cr_op : Operand { - let PrintMethod = "printSysCROperand"; - let ParserMatchClass = SysCRAsmOperand; -} - -class SystemXtI - : RtSystemI { - bits<3> op1; - bits<4> Cn; - bits<4> Cm; - bits<3> op2; - let Inst{20-19} = 0b01; - let Inst{18-16} = op1; - let Inst{15-12} = Cn; - let Inst{11-8} = Cm; - let Inst{7-5} = op2; -} - -class SystemLXtI - : RtSystemI { - bits<3> op1; - bits<4> Cn; - bits<4> Cm; - bits<3> op2; - let Inst{20-19} = 0b01; - let Inst{18-16} = op1; - let Inst{15-12} = Cn; - let Inst{11-8} = Cm; - let Inst{7-5} = op2; -} - - -// Branch (register) instructions: -// -// case opc of -// 0001 blr -// 0000 br -// 0101 dret -// 0100 eret -// 0010 ret -// otherwise UNDEFINED -class BaseBranchReg opc, dag oops, dag iops, string asm, - string operands, list pattern> - : I, Sched<[WriteBrReg]> { - let Inst{31-25} = 0b1101011; - let Inst{24-21} = opc; - let Inst{20-16} = 0b11111; - let Inst{15-10} = 0b000000; - let Inst{4-0} = 0b00000; -} - -class BranchReg opc, string asm, list pattern> - : BaseBranchReg { - bits<5> Rn; - let Inst{9-5} = Rn; -} - -let mayLoad = 0, mayStore = 0, hasSideEffects = 1, isReturn = 1 in -class SpecialReturn opc, string asm> - : BaseBranchReg { - let Inst{9-5} = 0b11111; -} - -//--- -// Conditional branch instruction. -//--- - -// Condition code. -// 4-bit immediate. Pretty-printed as -def ccode : Operand { - let PrintMethod = "printCondCode"; - let ParserMatchClass = CondCode; -} -def inv_ccode : Operand { - let PrintMethod = "printInverseCondCode"; - let ParserMatchClass = CondCode; -} - -// Conditional branch target. 19-bit immediate. The low two bits of the target -// offset are implied zero and so are not part of the immediate. -def PCRelLabel19Operand : AsmOperandClass { - let Name = "PCRelLabel19"; - let DiagnosticType = "InvalidLabel"; -} -def am_brcond : Operand { - let EncoderMethod = "getCondBranchTargetOpValue"; - let DecoderMethod = "DecodePCRelLabel19"; - let PrintMethod = "printAlignedLabel"; - let ParserMatchClass = PCRelLabel19Operand; -} - -class BranchCond : I<(outs), (ins ccode:$cond, am_brcond:$target), - "b", ".$cond\t$target", "", - [(ARM64brcond bb:$target, imm:$cond, NZCV)]>, - Sched<[WriteBr]> { - let isBranch = 1; - let isTerminator = 1; - let Uses = [NZCV]; - - bits<4> cond; - bits<19> target; - let Inst{31-24} = 0b01010100; - let Inst{23-5} = target; - let Inst{4} = 0; - let Inst{3-0} = cond; -} - -//--- -// Compare-and-branch instructions. -//--- -class BaseCmpBranch - : I<(outs), (ins regtype:$Rt, am_brcond:$target), - asm, "\t$Rt, $target", "", - [(node regtype:$Rt, bb:$target)]>, - Sched<[WriteBr]> { - let isBranch = 1; - let isTerminator = 1; - - bits<5> Rt; - bits<19> target; - let Inst{30-25} = 0b011010; - let Inst{24} = op; - let Inst{23-5} = target; - let Inst{4-0} = Rt; -} - -multiclass CmpBranch { - def W : BaseCmpBranch { - let Inst{31} = 0; - } - def X : BaseCmpBranch { - let Inst{31} = 1; - } -} - -//--- -// Test-bit-and-branch instructions. -//--- -// Test-and-branch target. 14-bit sign-extended immediate. The low two bits of -// the target offset are implied zero and so are not part of the immediate. 
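[Editor's note, not part of the patch] All of these branch-target operands encode word offsets; because the low two bits of the byte offset are implied zero, an N-bit signed immediate spans roughly +/- 2^(N-1) words from the branch. A one-liner making the ranges concrete:

#include <cstdint>

constexpr int64_t branchRangeBytes(unsigned ImmBits) {
  return (int64_t(1) << (ImmBits - 1)) * 4;
}

static_assert(branchRangeBytes(19) == 1 << 20, "B.cond/CBZ reach +/-1 MiB");
static_assert(branchRangeBytes(14) == 32 * 1024, "TBZ/TBNZ reach +/-32 KiB");
static_assert(branchRangeBytes(26) == 128 << 20, "B/BL reach +/-128 MiB");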
-def BranchTarget14Operand : AsmOperandClass { - let Name = "BranchTarget14"; -} -def am_tbrcond : Operand { - let EncoderMethod = "getTestBranchTargetOpValue"; - let PrintMethod = "printAlignedLabel"; - let ParserMatchClass = BranchTarget14Operand; -} - -// AsmOperand classes to emit (or not) special diagnostics -def TBZImm0_31Operand : AsmOperandClass { - let Name = "TBZImm0_31"; - let PredicateMethod = "isImm0_31"; - let RenderMethod = "addImm0_31Operands"; -} -def TBZImm32_63Operand : AsmOperandClass { - let Name = "Imm32_63"; - let DiagnosticType = "InvalidImm0_63"; -} - -class tbz_imm0_31 : Operand, ImmLeaf { - let ParserMatchClass = matcher; -} - -def tbz_imm0_31_diag : tbz_imm0_31; -def tbz_imm0_31_nodiag : tbz_imm0_31; - -def tbz_imm32_63 : Operand, ImmLeaf 31) && (((uint32_t)Imm) < 64); -}]> { - let ParserMatchClass = TBZImm32_63Operand; -} - -class BaseTestBranch - : I<(outs), (ins regtype:$Rt, immtype:$bit_off, am_tbrcond:$target), - asm, "\t$Rt, $bit_off, $target", "", - [(node regtype:$Rt, immtype:$bit_off, bb:$target)]>, - Sched<[WriteBr]> { - let isBranch = 1; - let isTerminator = 1; - - bits<5> Rt; - bits<6> bit_off; - bits<14> target; - - let Inst{30-25} = 0b011011; - let Inst{24} = op; - let Inst{23-19} = bit_off{4-0}; - let Inst{18-5} = target; - let Inst{4-0} = Rt; - - let DecoderMethod = "DecodeTestAndBranch"; -} - -multiclass TestBranch { - def W : BaseTestBranch { - let Inst{31} = 0; - } - - def X : BaseTestBranch { - let Inst{31} = 1; - } - - // Alias X-reg with 0-31 imm to W-Reg. - def : InstAlias(NAME#"W") GPR32as64:$Rd, - tbz_imm0_31_nodiag:$imm, am_tbrcond:$target), 0>; - def : Pat<(node GPR64:$Rn, tbz_imm0_31_diag:$imm, bb:$target), - (!cast(NAME#"W") (EXTRACT_SUBREG GPR64:$Rn, sub_32), - tbz_imm0_31_diag:$imm, bb:$target)>; -} - -//--- -// Unconditional branch (immediate) instructions. -//--- -def BranchTarget26Operand : AsmOperandClass { - let Name = "BranchTarget26"; - let DiagnosticType = "InvalidLabel"; -} -def am_b_target : Operand { - let EncoderMethod = "getBranchTargetOpValue"; - let PrintMethod = "printAlignedLabel"; - let ParserMatchClass = BranchTarget26Operand; -} -def am_bl_target : Operand { - let EncoderMethod = "getBranchTargetOpValue"; - let PrintMethod = "printAlignedLabel"; - let ParserMatchClass = BranchTarget26Operand; -} - -class BImm pattern> - : I<(outs), iops, asm, "\t$addr", "", pattern>, Sched<[WriteBr]> { - bits<26> addr; - let Inst{31} = op; - let Inst{30-26} = 0b00101; - let Inst{25-0} = addr; - - let DecoderMethod = "DecodeUnconditionalBranch"; -} - -class BranchImm pattern> - : BImm; -class CallImm pattern> - : BImm; - -//--- -// Basic one-operand data processing instructions. 
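[Editor's note, not part of the patch] In the test-bit-and-branch classes above, the bit number is six bits wide but only bit_off{4-0} is encoded; the sixth bit is the W/X selection bit (Inst{31}). That is why the alias and pattern above rewrite "test a low bit of an X register" onto the W form via EXTRACT_SUBREG ... sub_32. A sketch of the underlying observation:

#include <cassert>
#include <cstdint>

// Testing bit 0-31 of a 64-bit value only needs its low 32 bits, so the W
// form of TBZ/TBNZ suffices; bits 32-63 require the X form.
bool testBit(uint64_t Value, unsigned Bit) {
  assert(Bit < 64);
  if (Bit < 32)                      // TBZ/TBNZ Wn, #Bit, label
    return (uint32_t(Value) >> Bit) & 1;
  return (Value >> Bit) & 1;         // TBZ/TBNZ Xn, #Bit, label
}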
-//--- - -let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in -class BaseOneOperandData opc, RegisterClass regtype, string asm, - SDPatternOperator node> - : I<(outs regtype:$Rd), (ins regtype:$Rn), asm, "\t$Rd, $Rn", "", - [(set regtype:$Rd, (node regtype:$Rn))]>, - Sched<[WriteI, ReadI]> { - bits<5> Rd; - bits<5> Rn; - - let Inst{30-13} = 0b101101011000000000; - let Inst{12-10} = opc; - let Inst{9-5} = Rn; - let Inst{4-0} = Rd; -} - -let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in -multiclass OneOperandData opc, string asm, - SDPatternOperator node = null_frag> { - def Wr : BaseOneOperandData { - let Inst{31} = 0; - } - - def Xr : BaseOneOperandData { - let Inst{31} = 1; - } -} - -class OneWRegData opc, string asm, SDPatternOperator node> - : BaseOneOperandData { - let Inst{31} = 0; -} - -class OneXRegData opc, string asm, SDPatternOperator node> - : BaseOneOperandData { - let Inst{31} = 1; -} - -//--- -// Basic two-operand data processing instructions. -//--- -class BaseBaseAddSubCarry pattern> - : I<(outs regtype:$Rd), (ins regtype:$Rn, regtype:$Rm), - asm, "\t$Rd, $Rn, $Rm", "", pattern>, - Sched<[WriteI, ReadI, ReadI]> { - let Uses = [NZCV]; - bits<5> Rd; - bits<5> Rn; - bits<5> Rm; - let Inst{30} = isSub; - let Inst{28-21} = 0b11010000; - let Inst{20-16} = Rm; - let Inst{15-10} = 0; - let Inst{9-5} = Rn; - let Inst{4-0} = Rd; -} - -class BaseAddSubCarry - : BaseBaseAddSubCarry; - -class BaseAddSubCarrySetFlags - : BaseBaseAddSubCarry { - let Defs = [NZCV]; -} - -multiclass AddSubCarry { - def Wr : BaseAddSubCarry { - let Inst{31} = 0; - let Inst{29} = 0; - } - def Xr : BaseAddSubCarry { - let Inst{31} = 1; - let Inst{29} = 0; - } - - // Sets flags. - def SWr : BaseAddSubCarrySetFlags { - let Inst{31} = 0; - let Inst{29} = 1; - } - def SXr : BaseAddSubCarrySetFlags { - let Inst{31} = 1; - let Inst{29} = 1; - } -} - -class BaseTwoOperand opc, RegisterClass regtype, string asm, - SDPatternOperator OpNode> - : I<(outs regtype:$Rd), (ins regtype:$Rn, regtype:$Rm), - asm, "\t$Rd, $Rn, $Rm", "", - [(set regtype:$Rd, (OpNode regtype:$Rn, regtype:$Rm))]> { - bits<5> Rd; - bits<5> Rn; - bits<5> Rm; - let Inst{30-21} = 0b0011010110; - let Inst{20-16} = Rm; - let Inst{15-14} = 0b00; - let Inst{13-10} = opc; - let Inst{9-5} = Rn; - let Inst{4-0} = Rd; -} - -class BaseDiv - : BaseTwoOperand<{0,0,1,?}, regtype, asm, OpNode> { - let Inst{10} = isSigned; -} - -multiclass Div { - def Wr : BaseDiv, - Sched<[WriteID32, ReadID, ReadID]> { - let Inst{31} = 0; - } - def Xr : BaseDiv, - Sched<[WriteID64, ReadID, ReadID]> { - let Inst{31} = 1; - } -} - -class BaseShift shift_type, RegisterClass regtype, string asm, - SDPatternOperator OpNode = null_frag> - : BaseTwoOperand<{1,0,?,?}, regtype, asm, OpNode>, - Sched<[WriteIS, ReadI]> { - let Inst{11-10} = shift_type; -} - -multiclass Shift shift_type, string asm, SDNode OpNode> { - def Wr : BaseShift { - let Inst{31} = 0; - } - - def Xr : BaseShift { - let Inst{31} = 1; - } - - def : Pat<(i32 (OpNode GPR32:$Rn, i64:$Rm)), - (!cast(NAME # "Wr") GPR32:$Rn, - (EXTRACT_SUBREG i64:$Rm, sub_32))>; - - def : Pat<(i32 (OpNode GPR32:$Rn, (i64 (zext GPR32:$Rm)))), - (!cast(NAME # "Wr") GPR32:$Rn, GPR32:$Rm)>; - - def : Pat<(i32 (OpNode GPR32:$Rn, (i64 (anyext GPR32:$Rm)))), - (!cast(NAME # "Wr") GPR32:$Rn, GPR32:$Rm)>; - - def : Pat<(i32 (OpNode GPR32:$Rn, (i64 (sext GPR32:$Rm)))), - (!cast(NAME # "Wr") GPR32:$Rn, GPR32:$Rm)>; -} - -class ShiftAlias - : InstAlias; - -class BaseMulAccum opc, RegisterClass multype, - RegisterClass addtype, string asm, - list 
pattern> - : I<(outs addtype:$Rd), (ins multype:$Rn, multype:$Rm, addtype:$Ra), - asm, "\t$Rd, $Rn, $Rm, $Ra", "", pattern> { - bits<5> Rd; - bits<5> Rn; - bits<5> Rm; - bits<5> Ra; - let Inst{30-24} = 0b0011011; - let Inst{23-21} = opc; - let Inst{20-16} = Rm; - let Inst{15} = isSub; - let Inst{14-10} = Ra; - let Inst{9-5} = Rn; - let Inst{4-0} = Rd; -} - -multiclass MulAccum { - def Wrrr : BaseMulAccum, - Sched<[WriteIM32, ReadIMA, ReadIM, ReadIM]> { - let Inst{31} = 0; - } - - def Xrrr : BaseMulAccum, - Sched<[WriteIM64, ReadIMA, ReadIM, ReadIM]> { - let Inst{31} = 1; - } -} - -class WideMulAccum opc, string asm, - SDNode AccNode, SDNode ExtNode> - : BaseMulAccum, - Sched<[WriteIM32, ReadIMA, ReadIM, ReadIM]> { - let Inst{31} = 1; -} - -class MulHi opc, string asm, SDNode OpNode> - : I<(outs GPR64:$Rd), (ins GPR64:$Rn, GPR64:$Rm), - asm, "\t$Rd, $Rn, $Rm", "", - [(set GPR64:$Rd, (OpNode GPR64:$Rn, GPR64:$Rm))]>, - Sched<[WriteIM64, ReadIM, ReadIM]> { - bits<5> Rd; - bits<5> Rn; - bits<5> Rm; - let Inst{31-24} = 0b10011011; - let Inst{23-21} = opc; - let Inst{20-16} = Rm; - let Inst{15} = 0; - let Inst{9-5} = Rn; - let Inst{4-0} = Rd; - - // The Ra field of SMULH and UMULH is unused: it should be assembled as 31 - // (i.e. all bits 1) but is ignored by the processor. - let PostEncoderMethod = "fixMulHigh"; -} - -class MulAccumWAlias - : InstAlias; -class MulAccumXAlias - : InstAlias; -class WideMulAccumAlias - : InstAlias; - -class BaseCRC32 sz, bit C, RegisterClass StreamReg, - SDPatternOperator OpNode, string asm> - : I<(outs GPR32:$Rd), (ins GPR32:$Rn, StreamReg:$Rm), - asm, "\t$Rd, $Rn, $Rm", "", - [(set GPR32:$Rd, (OpNode GPR32:$Rn, StreamReg:$Rm))]>, - Sched<[WriteISReg, ReadI, ReadISReg]> { - bits<5> Rd; - bits<5> Rn; - bits<5> Rm; - - let Inst{31} = sf; - let Inst{30-21} = 0b0011010110; - let Inst{20-16} = Rm; - let Inst{15-13} = 0b010; - let Inst{12} = C; - let Inst{11-10} = sz; - let Inst{9-5} = Rn; - let Inst{4-0} = Rd; - let Predicates = [HasCRC]; -} - -//--- -// Address generation. -//--- - -class ADRI pattern> - : I<(outs GPR64:$Xd), (ins adr:$label), asm, "\t$Xd, $label", "", - pattern>, - Sched<[WriteI]> { - bits<5> Xd; - bits<21> label; - let Inst{31} = page; - let Inst{30-29} = label{1-0}; - let Inst{28-24} = 0b10000; - let Inst{23-5} = label{20-2}; - let Inst{4-0} = Xd; - - let DecoderMethod = "DecodeAdrInstruction"; -} - -//--- -// Move immediate. 
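[Editor's note, not part of the patch] The move-immediate classes that follow encode a 16-bit immediate plus a 16-bit-granule shift: MOVZ (MoveImmediate) zeroes the other slots, MOVK (InsertImmediate, note the "$src = $Rd" tie) overwrites just one slot. A hedged sketch of the obvious constant-materialization strategy built from them (illustrative only; a real selector also considers MOVN and logical immediates):

#include <cstdint>
#include <cstdio>

// Materialize a 64-bit constant as one MOVZ plus one MOVK per remaining
// non-zero 16-bit chunk.
void materialize(uint64_t Imm) {
  bool First = true;
  for (unsigned Shift = 0; Shift < 64; Shift += 16) {
    uint16_t Chunk = uint16_t(Imm >> Shift);
    if (Chunk == 0)
      continue;                    // zero slots come for free from the MOVZ
    std::printf("%s x0, #0x%x, lsl #%u\n", First ? "movz" : "movk",
                unsigned(Chunk), Shift);
    First = false;
  }
  if (First)
    std::printf("movz x0, #0\n");  // the constant was zero
}

For example, materialize(0x123400005678) prints a MOVZ for the low chunk and a single MOVK for the chunk at bit 32.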
-//--- - -def movimm32_imm : Operand { - let ParserMatchClass = Imm0_65535Operand; - let EncoderMethod = "getMoveWideImmOpValue"; - let PrintMethod = "printHexImm"; -} -def movimm32_shift : Operand { - let PrintMethod = "printShifter"; - let ParserMatchClass = MovImm32ShifterOperand; -} -def movimm64_shift : Operand { - let PrintMethod = "printShifter"; - let ParserMatchClass = MovImm64ShifterOperand; -} - -let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in -class BaseMoveImmediate opc, RegisterClass regtype, Operand shifter, - string asm> - : I<(outs regtype:$Rd), (ins movimm32_imm:$imm, shifter:$shift), - asm, "\t$Rd, $imm$shift", "", []>, - Sched<[WriteImm]> { - bits<5> Rd; - bits<16> imm; - bits<6> shift; - let Inst{30-29} = opc; - let Inst{28-23} = 0b100101; - let Inst{22-21} = shift{5-4}; - let Inst{20-5} = imm; - let Inst{4-0} = Rd; - - let DecoderMethod = "DecodeMoveImmInstruction"; -} - -multiclass MoveImmediate opc, string asm> { - def Wi : BaseMoveImmediate { - let Inst{31} = 0; - } - - def Xi : BaseMoveImmediate { - let Inst{31} = 1; - } -} - -let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in -class BaseInsertImmediate opc, RegisterClass regtype, Operand shifter, - string asm> - : I<(outs regtype:$Rd), - (ins regtype:$src, movimm32_imm:$imm, shifter:$shift), - asm, "\t$Rd, $imm$shift", "$src = $Rd", []>, - Sched<[WriteI, ReadI]> { - bits<5> Rd; - bits<16> imm; - bits<6> shift; - let Inst{30-29} = opc; - let Inst{28-23} = 0b100101; - let Inst{22-21} = shift{5-4}; - let Inst{20-5} = imm; - let Inst{4-0} = Rd; - - let DecoderMethod = "DecodeMoveImmInstruction"; -} - -multiclass InsertImmediate opc, string asm> { - def Wi : BaseInsertImmediate { - let Inst{31} = 0; - } - - def Xi : BaseInsertImmediate { - let Inst{31} = 1; - } -} - -//--- -// Add/Subtract -//--- - -class BaseAddSubImm - : I<(outs dstRegtype:$Rd), (ins srcRegtype:$Rn, immtype:$imm), - asm, "\t$Rd, $Rn, $imm", "", - [(set dstRegtype:$Rd, (OpNode srcRegtype:$Rn, immtype:$imm))]>, - Sched<[WriteI, ReadI]> { - bits<5> Rd; - bits<5> Rn; - bits<14> imm; - let Inst{30} = isSub; - let Inst{29} = setFlags; - let Inst{28-24} = 0b10001; - let Inst{23-22} = imm{13-12}; // '00' => lsl #0, '01' => lsl #12 - let Inst{21-10} = imm{11-0}; - let Inst{9-5} = Rn; - let Inst{4-0} = Rd; - let DecoderMethod = "DecodeBaseAddSubImm"; -} - -class BaseAddSubRegPseudo - : Pseudo<(outs regtype:$Rd), (ins regtype:$Rn, regtype:$Rm), - [(set regtype:$Rd, (OpNode regtype:$Rn, regtype:$Rm))]>, - Sched<[WriteI, ReadI, ReadI]>; - -class BaseAddSubSReg - : I<(outs regtype:$Rd), (ins regtype:$Rn, shifted_regtype:$Rm), - asm, "\t$Rd, $Rn, $Rm", "", - [(set regtype:$Rd, (OpNode regtype:$Rn, shifted_regtype:$Rm))]>, - Sched<[WriteISReg, ReadI, ReadISReg]> { - // The operands are in order to match the 'addr' MI operands, so we - // don't need an encoder method and by-name matching. Just use the default - // in-order handling. Since we're using by-order, make sure the names - // do not match. 
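[Editor's note, not part of the patch] BaseAddSubImm above carries a 12-bit immediate plus the two-bit shift field ('00' => lsl #0, '01' => lsl #12). The legality test that implies is simple enough to state directly:

#include <cstdint>

// ADD/SUB (immediate) accepts a 12-bit unsigned immediate, optionally shifted
// left by 12; anything else needs a register operand or a MOVZ/MOVK sequence.
bool isLegalAddSubImm(uint64_t Imm) {
  if ((Imm & ~uint64_t(0xfff)) == 0)
    return true;                            // uimm12, lsl #0
  return (Imm & ~uint64_t(0xfff000)) == 0;  // uimm12, lsl #12
}

// e.g. isLegalAddSubImm(0xabc)    -> true  (lsl #0)
//      isLegalAddSubImm(0xabc000) -> true  (lsl #12)
//      isLegalAddSubImm(0xabc001) -> false (needs two instructions)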
- bits<5> dst; - bits<5> src1; - bits<5> src2; - bits<8> shift; - let Inst{30} = isSub; - let Inst{29} = setFlags; - let Inst{28-24} = 0b01011; - let Inst{23-22} = shift{7-6}; - let Inst{21} = 0; - let Inst{20-16} = src2; - let Inst{15-10} = shift{5-0}; - let Inst{9-5} = src1; - let Inst{4-0} = dst; - - let DecoderMethod = "DecodeThreeAddrSRegInstruction"; -} - -class BaseAddSubEReg - : I<(outs dstRegtype:$R1), - (ins src1Regtype:$R2, src2Regtype:$R3), - asm, "\t$R1, $R2, $R3", "", - [(set dstRegtype:$R1, (OpNode src1Regtype:$R2, src2Regtype:$R3))]>, - Sched<[WriteIEReg, ReadI, ReadIEReg]> { - bits<5> Rd; - bits<5> Rn; - bits<5> Rm; - bits<6> ext; - let Inst{30} = isSub; - let Inst{29} = setFlags; - let Inst{28-24} = 0b01011; - let Inst{23-21} = 0b001; - let Inst{20-16} = Rm; - let Inst{15-13} = ext{5-3}; - let Inst{12-10} = ext{2-0}; - let Inst{9-5} = Rn; - let Inst{4-0} = Rd; - - let DecoderMethod = "DecodeAddSubERegInstruction"; -} - -let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in -class BaseAddSubEReg64 - : I<(outs dstRegtype:$Rd), - (ins src1Regtype:$Rn, src2Regtype:$Rm, ext_op:$ext), - asm, "\t$Rd, $Rn, $Rm$ext", "", []>, - Sched<[WriteIEReg, ReadI, ReadIEReg]> { - bits<5> Rd; - bits<5> Rn; - bits<5> Rm; - bits<6> ext; - let Inst{30} = isSub; - let Inst{29} = setFlags; - let Inst{28-24} = 0b01011; - let Inst{23-21} = 0b001; - let Inst{20-16} = Rm; - let Inst{15} = ext{5}; - let Inst{12-10} = ext{2-0}; - let Inst{9-5} = Rn; - let Inst{4-0} = Rd; - - let DecoderMethod = "DecodeAddSubERegInstruction"; -} - -// Aliases for register+register add/subtract. -class AddSubRegAlias - : InstAlias; - -multiclass AddSub { - let hasSideEffects = 0 in { - // Add/Subtract immediate - def Wri : BaseAddSubImm { - let Inst{31} = 0; - } - def Xri : BaseAddSubImm { - let Inst{31} = 1; - } - - // Add/Subtract register - Only used for CodeGen - def Wrr : BaseAddSubRegPseudo; - def Xrr : BaseAddSubRegPseudo; - - // Add/Subtract shifted register - def Wrs : BaseAddSubSReg { - let Inst{31} = 0; - } - def Xrs : BaseAddSubSReg { - let Inst{31} = 1; - } - } - - // Add/Subtract extended register - let AddedComplexity = 1, hasSideEffects = 0 in { - def Wrx : BaseAddSubEReg, mnemonic, OpNode> { - let Inst{31} = 0; - } - def Xrx : BaseAddSubEReg, mnemonic, OpNode> { - let Inst{31} = 1; - } - } - - def Xrx64 : BaseAddSubEReg64 { - // UXTX and SXTX only. - let Inst{14-13} = 0b11; - let Inst{31} = 1; - } - - // Register/register aliases with no shift when SP is not used. - def : AddSubRegAlias(NAME#"Wrs"), - GPR32, GPR32, GPR32, 0>; - def : AddSubRegAlias(NAME#"Xrs"), - GPR64, GPR64, GPR64, 0>; - - // Register/register aliases with no shift when either the destination or - // first source register is SP. 
- def : AddSubRegAlias(NAME#"Wrx"), - GPR32sponly, GPR32sp, GPR32, 16>; // UXTW #0 - def : AddSubRegAlias(NAME#"Wrx"), - GPR32sp, GPR32sponly, GPR32, 16>; // UXTW #0 - def : AddSubRegAlias(NAME#"Xrx64"), - GPR64sponly, GPR64sp, GPR64, 24>; // UXTX #0 - def : AddSubRegAlias(NAME#"Xrx64"), - GPR64sp, GPR64sponly, GPR64, 24>; // UXTX #0 -} - -multiclass AddSubS { - let isCompare = 1, Defs = [NZCV] in { - // Add/Subtract immediate - def Wri : BaseAddSubImm { - let Inst{31} = 0; - } - def Xri : BaseAddSubImm { - let Inst{31} = 1; - } - - // Add/Subtract register - def Wrr : BaseAddSubRegPseudo; - def Xrr : BaseAddSubRegPseudo; - - // Add/Subtract shifted register - def Wrs : BaseAddSubSReg { - let Inst{31} = 0; - } - def Xrs : BaseAddSubSReg { - let Inst{31} = 1; - } - - // Add/Subtract extended register - let AddedComplexity = 1 in { - def Wrx : BaseAddSubEReg, mnemonic, OpNode> { - let Inst{31} = 0; - } - def Xrx : BaseAddSubEReg, mnemonic, OpNode> { - let Inst{31} = 1; - } - } - - def Xrx64 : BaseAddSubEReg64 { - // UXTX and SXTX only. - let Inst{14-13} = 0b11; - let Inst{31} = 1; - } - } // Defs = [NZCV] - - // Compare aliases - def : InstAlias(NAME#"Wri") - WZR, GPR32sp:$src, addsub_shifted_imm32:$imm), 5>; - def : InstAlias(NAME#"Xri") - XZR, GPR64sp:$src, addsub_shifted_imm64:$imm), 5>; - def : InstAlias(NAME#"Wrx") - WZR, GPR32sp:$src1, GPR32:$src2, arith_extend:$sh), 4>; - def : InstAlias(NAME#"Xrx") - XZR, GPR64sp:$src1, GPR32:$src2, arith_extend:$sh), 4>; - def : InstAlias(NAME#"Xrx64") - XZR, GPR64sp:$src1, GPR64:$src2, arith_extendlsl64:$sh), 4>; - def : InstAlias(NAME#"Wrs") - WZR, GPR32:$src1, GPR32:$src2, arith_shift32:$sh), 4>; - def : InstAlias(NAME#"Xrs") - XZR, GPR64:$src1, GPR64:$src2, arith_shift64:$sh), 4>; - - // Compare shorthands - def : InstAlias(NAME#"Wrs") - WZR, GPR32:$src1, GPR32:$src2, 0), 5>; - def : InstAlias(NAME#"Xrs") - XZR, GPR64:$src1, GPR64:$src2, 0), 5>; - - // Register/register aliases with no shift when SP is not used. - def : AddSubRegAlias(NAME#"Wrs"), - GPR32, GPR32, GPR32, 0>; - def : AddSubRegAlias(NAME#"Xrs"), - GPR64, GPR64, GPR64, 0>; - - // Register/register aliases with no shift when the first source register - // is SP. - def : AddSubRegAlias(NAME#"Wrx"), - GPR32, GPR32sponly, GPR32, 16>; // UXTW #0 - def : AddSubRegAlias(NAME#"Xrx64"), - GPR64, GPR64sponly, GPR64, 24>; // UXTX #0 -} - -//--- -// Extract -//--- -def SDTA64EXTR : SDTypeProfile<1, 3, [SDTCisSameAs<0, 1>, SDTCisSameAs<0, 2>, - SDTCisPtrTy<3>]>; -def ARM64Extr : SDNode<"ARM64ISD::EXTR", SDTA64EXTR>; - -class BaseExtractImm patterns> - : I<(outs regtype:$Rd), (ins regtype:$Rn, regtype:$Rm, imm_type:$imm), - asm, "\t$Rd, $Rn, $Rm, $imm", "", patterns>, - Sched<[WriteExtr, ReadExtrHi]> { - bits<5> Rd; - bits<5> Rn; - bits<5> Rm; - bits<6> imm; - - let Inst{30-23} = 0b00100111; - let Inst{21} = 0; - let Inst{20-16} = Rm; - let Inst{15-10} = imm; - let Inst{9-5} = Rn; - let Inst{4-0} = Rd; -} - -multiclass ExtractImm { - def Wrri : BaseExtractImm { - let Inst{31} = 0; - let Inst{22} = 0; - // imm<5> must be zero. 
- let imm{5} = 0; - } - def Xrri : BaseExtractImm { - - let Inst{31} = 1; - let Inst{22} = 1; - } -} - -//--- -// Bitfield -//--- - -let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in -class BaseBitfieldImm opc, - RegisterClass regtype, Operand imm_type, string asm> - : I<(outs regtype:$Rd), (ins regtype:$Rn, imm_type:$immr, imm_type:$imms), - asm, "\t$Rd, $Rn, $immr, $imms", "", []>, - Sched<[WriteIS, ReadI]> { - bits<5> Rd; - bits<5> Rn; - bits<6> immr; - bits<6> imms; - - let Inst{30-29} = opc; - let Inst{28-23} = 0b100110; - let Inst{21-16} = immr; - let Inst{15-10} = imms; - let Inst{9-5} = Rn; - let Inst{4-0} = Rd; -} - -multiclass BitfieldImm opc, string asm> { - def Wri : BaseBitfieldImm { - let Inst{31} = 0; - let Inst{22} = 0; - // imms<5> and immr<5> must be zero, else ReservedValue(). - let Inst{21} = 0; - let Inst{15} = 0; - } - def Xri : BaseBitfieldImm { - let Inst{31} = 1; - let Inst{22} = 1; - } -} - -let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in -class BaseBitfieldImmWith2RegArgs opc, - RegisterClass regtype, Operand imm_type, string asm> - : I<(outs regtype:$Rd), (ins regtype:$src, regtype:$Rn, imm_type:$immr, - imm_type:$imms), - asm, "\t$Rd, $Rn, $immr, $imms", "$src = $Rd", []>, - Sched<[WriteIS, ReadI]> { - bits<5> Rd; - bits<5> Rn; - bits<6> immr; - bits<6> imms; - - let Inst{30-29} = opc; - let Inst{28-23} = 0b100110; - let Inst{21-16} = immr; - let Inst{15-10} = imms; - let Inst{9-5} = Rn; - let Inst{4-0} = Rd; -} - -multiclass BitfieldImmWith2RegArgs opc, string asm> { - def Wri : BaseBitfieldImmWith2RegArgs { - let Inst{31} = 0; - let Inst{22} = 0; - // imms<5> and immr<5> must be zero, else ReservedValue(). - let Inst{21} = 0; - let Inst{15} = 0; - } - def Xri : BaseBitfieldImmWith2RegArgs { - let Inst{31} = 1; - let Inst{22} = 1; - } -} - -//--- -// Logical -//--- - -// Logical (immediate) -class BaseLogicalImm opc, RegisterClass dregtype, - RegisterClass sregtype, Operand imm_type, string asm, - list pattern> - : I<(outs dregtype:$Rd), (ins sregtype:$Rn, imm_type:$imm), - asm, "\t$Rd, $Rn, $imm", "", pattern>, - Sched<[WriteI, ReadI]> { - bits<5> Rd; - bits<5> Rn; - bits<13> imm; - let Inst{30-29} = opc; - let Inst{28-23} = 0b100100; - let Inst{22} = imm{12}; - let Inst{21-16} = imm{11-6}; - let Inst{15-10} = imm{5-0}; - let Inst{9-5} = Rn; - let Inst{4-0} = Rd; - - let DecoderMethod = "DecodeLogicalImmInstruction"; -} - -// Logical (shifted register) -class BaseLogicalSReg opc, bit N, RegisterClass regtype, - logical_shifted_reg shifted_regtype, string asm, - list pattern> - : I<(outs regtype:$Rd), (ins regtype:$Rn, shifted_regtype:$Rm), - asm, "\t$Rd, $Rn, $Rm", "", pattern>, - Sched<[WriteISReg, ReadI, ReadISReg]> { - // The operands are in order to match the 'addr' MI operands, so we - // don't need an encoder method and by-name matching. Just use the default - // in-order handling. Since we're using by-order, make sure the names - // do not match. - bits<5> dst; - bits<5> src1; - bits<5> src2; - bits<8> shift; - let Inst{30-29} = opc; - let Inst{28-24} = 0b01010; - let Inst{23-22} = shift{7-6}; - let Inst{21} = N; - let Inst{20-16} = src2; - let Inst{15-10} = shift{5-0}; - let Inst{9-5} = src1; - let Inst{4-0} = dst; - - let DecoderMethod = "DecodeThreeAddrSRegInstruction"; -} - -// Aliases for register+register logical instructions. 
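The logical (immediate) format above carries its operand as one pre-encoded 13-bit value: bit 12 is the N bit (meaningful only for the 64-bit form, which is why the W variants pin Inst{22} to 0), bits 11-6 are immr and bits 5-0 are imms. A short C++ sketch of splitting such a value into the three instruction fields; splitLogicalImm is a name invented here for illustration:

#include <cassert>
#include <cstdint>

struct LogicalImmFields { unsigned N, Immr, Imms; };

// Split a pre-encoded 13-bit logical immediate (N:immr:imms) into the
// fields that land in Inst{22}, Inst{21-16} and Inst{15-10} above.
LogicalImmFields splitLogicalImm(uint16_t Enc13) {
  assert(Enc13 < (1u << 13) && "operand is 13 bits");
  return { (Enc13 >> 12) & 0x1u,    // imm{12}:   N
           (Enc13 >> 6)  & 0x3fu,   // imm{11-6}: immr
            Enc13        & 0x3fu }; // imm{5-0}:  imms
}

int main() {
  // #1 as a 64-bit logical immediate encodes as N=1, immr=0, imms=0.
  LogicalImmFields F = splitLogicalImm(1u << 12);
  assert(F.N == 1 && F.Immr == 0 && F.Imms == 0);
  return 0;
}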
-class LogicalRegAlias - : InstAlias; - -let AddedComplexity = 6 in -multiclass LogicalImm opc, string mnemonic, SDNode OpNode> { - def Wri : BaseLogicalImm { - let Inst{31} = 0; - let Inst{22} = 0; // 64-bit version has an additional bit of immediate. - } - def Xri : BaseLogicalImm { - let Inst{31} = 1; - } -} - -multiclass LogicalImmS opc, string mnemonic, SDNode OpNode> { - let isCompare = 1, Defs = [NZCV] in { - def Wri : BaseLogicalImm { - let Inst{31} = 0; - let Inst{22} = 0; // 64-bit version has an additional bit of immediate. - } - def Xri : BaseLogicalImm { - let Inst{31} = 1; - } - } // end Defs = [NZCV] -} - -class BaseLogicalRegPseudo - : Pseudo<(outs regtype:$Rd), (ins regtype:$Rn, regtype:$Rm), - [(set regtype:$Rd, (OpNode regtype:$Rn, regtype:$Rm))]>, - Sched<[WriteI, ReadI, ReadI]>; - -// Split from LogicalImm as not all instructions have both. -multiclass LogicalReg opc, bit N, string mnemonic, - SDPatternOperator OpNode> { - def Wrr : BaseLogicalRegPseudo; - def Xrr : BaseLogicalRegPseudo; - - def Wrs : BaseLogicalSReg { - let Inst{31} = 0; - } - def Xrs : BaseLogicalSReg { - let Inst{31} = 1; - } - - def : LogicalRegAlias(NAME#"Wrs"), GPR32>; - def : LogicalRegAlias(NAME#"Xrs"), GPR64>; -} - -// Split from LogicalReg to allow setting NZCV Defs -multiclass LogicalRegS opc, bit N, string mnemonic, - SDPatternOperator OpNode = null_frag> { - let Defs = [NZCV], mayLoad = 0, mayStore = 0, hasSideEffects = 0 in { - def Wrr : BaseLogicalRegPseudo; - def Xrr : BaseLogicalRegPseudo; - - def Wrs : BaseLogicalSReg { - let Inst{31} = 0; - } - def Xrs : BaseLogicalSReg { - let Inst{31} = 1; - } - } // Defs = [NZCV] - - def : LogicalRegAlias(NAME#"Wrs"), GPR32>; - def : LogicalRegAlias(NAME#"Xrs"), GPR64>; -} - -//--- -// Conditionally set flags -//--- - -let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in -class BaseCondSetFlagsImm - : I<(outs), (ins regtype:$Rn, imm0_31:$imm, imm0_15:$nzcv, ccode:$cond), - asm, "\t$Rn, $imm, $nzcv, $cond", "", []>, - Sched<[WriteI, ReadI]> { - let Uses = [NZCV]; - let Defs = [NZCV]; - - bits<5> Rn; - bits<5> imm; - bits<4> nzcv; - bits<4> cond; - - let Inst{30} = op; - let Inst{29-21} = 0b111010010; - let Inst{20-16} = imm; - let Inst{15-12} = cond; - let Inst{11-10} = 0b10; - let Inst{9-5} = Rn; - let Inst{4} = 0b0; - let Inst{3-0} = nzcv; -} - -multiclass CondSetFlagsImm { - def Wi : BaseCondSetFlagsImm { - let Inst{31} = 0; - } - def Xi : BaseCondSetFlagsImm { - let Inst{31} = 1; - } -} - -let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in -class BaseCondSetFlagsReg - : I<(outs), (ins regtype:$Rn, regtype:$Rm, imm0_15:$nzcv, ccode:$cond), - asm, "\t$Rn, $Rm, $nzcv, $cond", "", []>, - Sched<[WriteI, ReadI, ReadI]> { - let Uses = [NZCV]; - let Defs = [NZCV]; - - bits<5> Rn; - bits<5> Rm; - bits<4> nzcv; - bits<4> cond; - - let Inst{30} = op; - let Inst{29-21} = 0b111010010; - let Inst{20-16} = Rm; - let Inst{15-12} = cond; - let Inst{11-10} = 0b00; - let Inst{9-5} = Rn; - let Inst{4} = 0b0; - let Inst{3-0} = nzcv; -} - -multiclass CondSetFlagsReg { - def Wr : BaseCondSetFlagsReg { - let Inst{31} = 0; - } - def Xr : BaseCondSetFlagsReg { - let Inst{31} = 1; - } -} - -//--- -// Conditional select -//--- - -class BaseCondSelect op2, RegisterClass regtype, string asm> - : I<(outs regtype:$Rd), (ins regtype:$Rn, regtype:$Rm, ccode:$cond), - asm, "\t$Rd, $Rn, $Rm, $cond", "", - [(set regtype:$Rd, - (ARM64csel regtype:$Rn, regtype:$Rm, (i32 imm:$cond), NZCV))]>, - Sched<[WriteI, ReadI, ReadI]> { - let Uses = [NZCV]; - - bits<5> Rd; - bits<5> Rn; - 
bits<5> Rm; - bits<4> cond; - - let Inst{30} = op; - let Inst{29-21} = 0b011010100; - let Inst{20-16} = Rm; - let Inst{15-12} = cond; - let Inst{11-10} = op2; - let Inst{9-5} = Rn; - let Inst{4-0} = Rd; -} - -multiclass CondSelect op2, string asm> { - def Wr : BaseCondSelect { - let Inst{31} = 0; - } - def Xr : BaseCondSelect { - let Inst{31} = 1; - } -} - -class BaseCondSelectOp op2, RegisterClass regtype, string asm, - PatFrag frag> - : I<(outs regtype:$Rd), (ins regtype:$Rn, regtype:$Rm, ccode:$cond), - asm, "\t$Rd, $Rn, $Rm, $cond", "", - [(set regtype:$Rd, - (ARM64csel regtype:$Rn, (frag regtype:$Rm), - (i32 imm:$cond), NZCV))]>, - Sched<[WriteI, ReadI, ReadI]> { - let Uses = [NZCV]; - - bits<5> Rd; - bits<5> Rn; - bits<5> Rm; - bits<4> cond; - - let Inst{30} = op; - let Inst{29-21} = 0b011010100; - let Inst{20-16} = Rm; - let Inst{15-12} = cond; - let Inst{11-10} = op2; - let Inst{9-5} = Rn; - let Inst{4-0} = Rd; -} - -def inv_cond_XFORM : SDNodeXForm(N->getZExtValue()); - return CurDAG->getTargetConstant(ARM64CC::getInvertedCondCode(CC), MVT::i32); -}]>; - -multiclass CondSelectOp op2, string asm, PatFrag frag> { - def Wr : BaseCondSelectOp { - let Inst{31} = 0; - } - def Xr : BaseCondSelectOp { - let Inst{31} = 1; - } - - def : Pat<(ARM64csel (frag GPR32:$Rm), GPR32:$Rn, (i32 imm:$cond), NZCV), - (!cast(NAME # Wr) GPR32:$Rn, GPR32:$Rm, - (inv_cond_XFORM imm:$cond))>; - - def : Pat<(ARM64csel (frag GPR64:$Rm), GPR64:$Rn, (i32 imm:$cond), NZCV), - (!cast(NAME # Xr) GPR64:$Rn, GPR64:$Rm, - (inv_cond_XFORM imm:$cond))>; -} - -//--- -// Special Mask Value -//--- -def maski8_or_more : Operand, - ImmLeaf { -} -def maski16_or_more : Operand, - ImmLeaf { -} - - -//--- -// Load/store -//--- - -// (unsigned immediate) -// Indexed for 8-bit registers. offset is in range [0,4095]. 
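For these unsigned-offset forms the 12-bit field holds the byte offset divided by the access size, so larger accesses reach further (0-4095 bytes for byte loads, up to 65520 bytes for the 16-byte case) via the scaled operand classes defined next. A minimal sketch of the encoding check, assuming offsets that are negative, misaligned or out of range simply fail to match this form and fall through to other addressing modes:

#include <cstdint>
#include <optional>

// Encode a byte offset for the "unsigned offset" load/store form with the
// given access size in bytes (1, 2, 4, 8 or 16). Returns the 12-bit scaled
// immediate, or nullopt if this form cannot represent the offset.
std::optional<uint16_t> encodeUImm12(int64_t ByteOffset, unsigned Scale) {
  if (ByteOffset < 0 || ByteOffset % Scale != 0)
    return std::nullopt;               // negative or unaligned
  int64_t Scaled = ByteOffset / Scale;
  if (Scaled > 4095)
    return std::nullopt;               // the field is only 12 bits
  return static_cast<uint16_t>(Scaled);
}

int main() {
  // "ldr x0, [x1, #32760]" is the largest 8-byte unsigned offset: 4095 * 8.
  return (encodeUImm12(32760, 8).value_or(0) == 4095) ? 0 : 1;
}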
-def am_indexed8 : ComplexPattern; -def am_indexed16 : ComplexPattern; -def am_indexed32 : ComplexPattern; -def am_indexed64 : ComplexPattern; -def am_indexed128 : ComplexPattern; - -class UImm12OffsetOperand : AsmOperandClass { - let Name = "UImm12Offset" # Scale; - let RenderMethod = "addUImm12OffsetOperands<" # Scale # ">"; - let PredicateMethod = "isUImm12Offset<" # Scale # ">"; - let DiagnosticType = "InvalidMemoryIndexed" # Scale; -} - -def UImm12OffsetScale1Operand : UImm12OffsetOperand<1>; -def UImm12OffsetScale2Operand : UImm12OffsetOperand<2>; -def UImm12OffsetScale4Operand : UImm12OffsetOperand<4>; -def UImm12OffsetScale8Operand : UImm12OffsetOperand<8>; -def UImm12OffsetScale16Operand : UImm12OffsetOperand<16>; - -class uimm12_scaled : Operand { - let ParserMatchClass - = !cast("UImm12OffsetScale" # Scale # "Operand"); - let EncoderMethod - = "getLdStUImm12OpValue"; - let PrintMethod = "printUImm12Offset<" # Scale # ">"; -} - -def uimm12s1 : uimm12_scaled<1>; -def uimm12s2 : uimm12_scaled<2>; -def uimm12s4 : uimm12_scaled<4>; -def uimm12s8 : uimm12_scaled<8>; -def uimm12s16 : uimm12_scaled<16>; - -class BaseLoadStoreUI sz, bit V, bits<2> opc, dag oops, dag iops, - string asm, list pattern> - : I { - bits<5> Rt; - - bits<5> Rn; - bits<12> offset; - - let Inst{31-30} = sz; - let Inst{29-27} = 0b111; - let Inst{26} = V; - let Inst{25-24} = 0b01; - let Inst{23-22} = opc; - let Inst{21-10} = offset; - let Inst{9-5} = Rn; - let Inst{4-0} = Rt; - - let DecoderMethod = "DecodeUnsignedLdStInstruction"; -} - -multiclass LoadUI sz, bit V, bits<2> opc, RegisterClass regtype, - Operand indextype, string asm, list pattern> { - let AddedComplexity = 10, mayLoad = 1, mayStore = 0, hasSideEffects = 0 in - def ui : BaseLoadStoreUI, - Sched<[WriteLD]>; - - def : InstAlias(NAME # "ui") regtype:$Rt, GPR64sp:$Rn, 0)>; -} - -multiclass StoreUI sz, bit V, bits<2> opc, RegisterClass regtype, - Operand indextype, string asm, list pattern> { - let AddedComplexity = 10, mayLoad = 0, mayStore = 1, hasSideEffects = 0 in - def ui : BaseLoadStoreUI, - Sched<[WriteST]>; - - def : InstAlias(NAME # "ui") regtype:$Rt, GPR64sp:$Rn, 0)>; -} - -def PrefetchOperand : AsmOperandClass { - let Name = "Prefetch"; - let ParserMethod = "tryParsePrefetch"; -} -def prfop : Operand { - let PrintMethod = "printPrefetchOp"; - let ParserMatchClass = PrefetchOperand; -} - -let mayLoad = 0, mayStore = 0, hasSideEffects = 1 in -class PrefetchUI sz, bit V, bits<2> opc, string asm, list pat> - : BaseLoadStoreUI, - Sched<[WriteLD]>; - -//--- -// Load literal -//--- - -// Load literal address: 19-bit immediate. The low two bits of the target -// offset are implied zero and so are not part of the immediate. 
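As the comment above says, the literal forms store a word-aligned PC-relative offset: the 19-bit label field is the byte displacement divided by 4, sign-extended, giving roughly a +/-1MiB reach. A small sketch of that relationship; the helper names are invented for illustration:

#include <cassert>
#include <cstdint>

// Is this byte displacement representable in a 19-bit load-literal label?
bool isValidLiteralOffset(int64_t ByteOffset) {
  return (ByteOffset & 0x3) == 0 &&          // low two bits implied zero
         ByteOffset >= -(1 << 20) &&         // -1 MiB
         ByteOffset <= (1 << 20) - 4;        // +1 MiB minus one word
}

int32_t encodeLiteralLabel(int64_t ByteOffset) {
  assert(isValidLiteralOffset(ByteOffset));
  return static_cast<int32_t>(ByteOffset / 4); // the 19-bit 'label' field
}

int main() {
  assert(encodeLiteralLabel(1048572) == 262143);   // largest forward reach
  assert(encodeLiteralLabel(-1048576) == -262144); // largest backward reach
  return 0;
}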
-def am_ldrlit : Operand { - let EncoderMethod = "getLoadLiteralOpValue"; - let DecoderMethod = "DecodePCRelLabel19"; - let PrintMethod = "printAlignedLabel"; - let ParserMatchClass = PCRelLabel19Operand; -} - -let mayLoad = 1, mayStore = 0, hasSideEffects = 0 in -class LoadLiteral opc, bit V, RegisterClass regtype, string asm> - : I<(outs regtype:$Rt), (ins am_ldrlit:$label), - asm, "\t$Rt, $label", "", []>, - Sched<[WriteLD]> { - bits<5> Rt; - bits<19> label; - let Inst{31-30} = opc; - let Inst{29-27} = 0b011; - let Inst{26} = V; - let Inst{25-24} = 0b00; - let Inst{23-5} = label; - let Inst{4-0} = Rt; -} - -let mayLoad = 0, mayStore = 0, hasSideEffects = 1 in -class PrefetchLiteral opc, bit V, string asm, list pat> - : I<(outs), (ins prfop:$Rt, am_ldrlit:$label), - asm, "\t$Rt, $label", "", pat>, - Sched<[WriteLD]> { - bits<5> Rt; - bits<19> label; - let Inst{31-30} = opc; - let Inst{29-27} = 0b011; - let Inst{26} = V; - let Inst{25-24} = 0b00; - let Inst{23-5} = label; - let Inst{4-0} = Rt; -} - -//--- -// Load/store register offset -//--- - -def ro_Xindexed8 : ComplexPattern", []>; -def ro_Xindexed16 : ComplexPattern", []>; -def ro_Xindexed32 : ComplexPattern", []>; -def ro_Xindexed64 : ComplexPattern", []>; -def ro_Xindexed128 : ComplexPattern", []>; - -def ro_Windexed8 : ComplexPattern", []>; -def ro_Windexed16 : ComplexPattern", []>; -def ro_Windexed32 : ComplexPattern", []>; -def ro_Windexed64 : ComplexPattern", []>; -def ro_Windexed128 : ComplexPattern", []>; - -class MemExtendOperand : AsmOperandClass { - let Name = "Mem" # Reg # "Extend" # Width; - let PredicateMethod = "isMem" # Reg # "Extend<" # Width # ">"; - let RenderMethod = "addMemExtendOperands"; - let DiagnosticType = "InvalidMemory" # Reg # "Extend" # Width; -} - -def MemWExtend8Operand : MemExtendOperand<"W", 8> { - // The address "[x0, x1, lsl #0]" actually maps to the variant which performs - // the trivial shift. - let RenderMethod = "addMemExtend8Operands"; -} -def MemWExtend16Operand : MemExtendOperand<"W", 16>; -def MemWExtend32Operand : MemExtendOperand<"W", 32>; -def MemWExtend64Operand : MemExtendOperand<"W", 64>; -def MemWExtend128Operand : MemExtendOperand<"W", 128>; - -def MemXExtend8Operand : MemExtendOperand<"X", 8> { - // The address "[x0, x1, lsl #0]" actually maps to the variant which performs - // the trivial shift. - let RenderMethod = "addMemExtend8Operands"; -} -def MemXExtend16Operand : MemExtendOperand<"X", 16>; -def MemXExtend32Operand : MemExtendOperand<"X", 32>; -def MemXExtend64Operand : MemExtendOperand<"X", 64>; -def MemXExtend128Operand : MemExtendOperand<"X", 128>; - -class ro_extend - : Operand { - let ParserMatchClass = ParserClass; - let PrintMethod = "printMemExtend<'" # Reg # "', " # Width # ">"; - let DecoderMethod = "DecodeMemExtend"; - let EncoderMethod = "getMemExtendOpValue"; - let MIOperandInfo = (ops i32imm:$signed, i32imm:$doshift); -} - -def ro_Wextend8 : ro_extend; -def ro_Wextend16 : ro_extend; -def ro_Wextend32 : ro_extend; -def ro_Wextend64 : ro_extend; -def ro_Wextend128 : ro_extend; - -def ro_Xextend8 : ro_extend; -def ro_Xextend16 : ro_extend; -def ro_Xextend32 : ro_extend; -def ro_Xextend64 : ro_extend; -def ro_Xextend128 : ro_extend; - -class ROAddrMode { - // CodeGen-level pattern covering the entire addressing mode. - ComplexPattern Wpat = windex; - ComplexPattern Xpat = xindex; - - // Asm-level Operand covering the valid "uxtw #3" style syntax. 
- Operand Wext = wextend; - Operand Xext = xextend; -} - -def ro8 : ROAddrMode; -def ro16 : ROAddrMode; -def ro32 : ROAddrMode; -def ro64 : ROAddrMode; -def ro128 : ROAddrMode; - -class LoadStore8RO sz, bit V, bits<2> opc, RegisterClass regtype, - string asm, dag ins, dag outs, list pat> - : I { - bits<5> Rt; - bits<5> Rn; - bits<5> Rm; - bits<2> extend; - let Inst{31-30} = sz; - let Inst{29-27} = 0b111; - let Inst{26} = V; - let Inst{25-24} = 0b00; - let Inst{23-22} = opc; - let Inst{21} = 1; - let Inst{20-16} = Rm; - let Inst{15} = extend{1}; // sign extend Rm? - let Inst{14} = 1; - let Inst{12} = extend{0}; // do shift? - let Inst{11-10} = 0b10; - let Inst{9-5} = Rn; - let Inst{4-0} = Rt; -} - -class ROInstAlias - : InstAlias; - -multiclass Load8RO sz, bit V, bits<2> opc, RegisterClass regtype, - string asm, ValueType Ty, SDPatternOperator loadop> { - let AddedComplexity = 10 in - def roW : LoadStore8RO, - Sched<[WriteLDIdx, ReadAdrBase]> { - let Inst{13} = 0b0; - } - - let AddedComplexity = 10 in - def roX : LoadStore8RO, - Sched<[WriteLDIdx, ReadAdrBase]> { - let Inst{13} = 0b1; - } - - def : ROInstAlias(NAME # "roX")>; -} - -multiclass Store8RO sz, bit V, bits<2> opc, RegisterClass regtype, - string asm, ValueType Ty, SDPatternOperator storeop> { - let AddedComplexity = 10 in - def roW : LoadStore8RO, - Sched<[WriteSTIdx, ReadAdrBase]> { - let Inst{13} = 0b0; - } - - let AddedComplexity = 10 in - def roX : LoadStore8RO, - Sched<[WriteSTIdx, ReadAdrBase]> { - let Inst{13} = 0b1; - } - - def : ROInstAlias(NAME # "roX")>; -} - -class LoadStore16RO sz, bit V, bits<2> opc, RegisterClass regtype, - string asm, dag ins, dag outs, list pat> - : I { - bits<5> Rt; - bits<5> Rn; - bits<5> Rm; - bits<2> extend; - let Inst{31-30} = sz; - let Inst{29-27} = 0b111; - let Inst{26} = V; - let Inst{25-24} = 0b00; - let Inst{23-22} = opc; - let Inst{21} = 1; - let Inst{20-16} = Rm; - let Inst{15} = extend{1}; // sign extend Rm? - let Inst{14} = 1; - let Inst{12} = extend{0}; // do shift? - let Inst{11-10} = 0b10; - let Inst{9-5} = Rn; - let Inst{4-0} = Rt; -} - -multiclass Load16RO sz, bit V, bits<2> opc, RegisterClass regtype, - string asm, ValueType Ty, SDPatternOperator loadop> { - let AddedComplexity = 10 in - def roW : LoadStore16RO, - Sched<[WriteLDIdx, ReadAdrBase]> { - let Inst{13} = 0b0; - } - - let AddedComplexity = 10 in - def roX : LoadStore16RO, - Sched<[WriteLDIdx, ReadAdrBase]> { - let Inst{13} = 0b1; - } - - def : ROInstAlias(NAME # "roX")>; -} - -multiclass Store16RO sz, bit V, bits<2> opc, RegisterClass regtype, - string asm, ValueType Ty, SDPatternOperator storeop> { - let AddedComplexity = 10 in - def roW : LoadStore16RO, - Sched<[WriteSTIdx, ReadAdrBase]> { - let Inst{13} = 0b0; - } - - let AddedComplexity = 10 in - def roX : LoadStore16RO, - Sched<[WriteSTIdx, ReadAdrBase]> { - let Inst{13} = 0b1; - } - - def : ROInstAlias(NAME # "roX")>; -} - -class LoadStore32RO sz, bit V, bits<2> opc, RegisterClass regtype, - string asm, dag ins, dag outs, list pat> - : I { - bits<5> Rt; - bits<5> Rn; - bits<5> Rm; - bits<2> extend; - let Inst{31-30} = sz; - let Inst{29-27} = 0b111; - let Inst{26} = V; - let Inst{25-24} = 0b00; - let Inst{23-22} = opc; - let Inst{21} = 1; - let Inst{20-16} = Rm; - let Inst{15} = extend{1}; // sign extend Rm? - let Inst{14} = 1; - let Inst{12} = extend{0}; // do shift? 
- let Inst{11-10} = 0b10; - let Inst{9-5} = Rn; - let Inst{4-0} = Rt; -} - -multiclass Load32RO sz, bit V, bits<2> opc, RegisterClass regtype, - string asm, ValueType Ty, SDPatternOperator loadop> { - let AddedComplexity = 10 in - def roW : LoadStore32RO, - Sched<[WriteLDIdx, ReadAdrBase]> { - let Inst{13} = 0b0; - } - - let AddedComplexity = 10 in - def roX : LoadStore32RO, - Sched<[WriteLDIdx, ReadAdrBase]> { - let Inst{13} = 0b1; - } - - def : ROInstAlias(NAME # "roX")>; -} - -multiclass Store32RO sz, bit V, bits<2> opc, RegisterClass regtype, - string asm, ValueType Ty, SDPatternOperator storeop> { - let AddedComplexity = 10 in - def roW : LoadStore32RO, - Sched<[WriteSTIdx, ReadAdrBase]> { - let Inst{13} = 0b0; - } - - let AddedComplexity = 10 in - def roX : LoadStore32RO, - Sched<[WriteSTIdx, ReadAdrBase]> { - let Inst{13} = 0b1; - } - - def : ROInstAlias(NAME # "roX")>; -} - -class LoadStore64RO sz, bit V, bits<2> opc, RegisterClass regtype, - string asm, dag ins, dag outs, list pat> - : I { - bits<5> Rt; - bits<5> Rn; - bits<5> Rm; - bits<2> extend; - let Inst{31-30} = sz; - let Inst{29-27} = 0b111; - let Inst{26} = V; - let Inst{25-24} = 0b00; - let Inst{23-22} = opc; - let Inst{21} = 1; - let Inst{20-16} = Rm; - let Inst{15} = extend{1}; // sign extend Rm? - let Inst{14} = 1; - let Inst{12} = extend{0}; // do shift? - let Inst{11-10} = 0b10; - let Inst{9-5} = Rn; - let Inst{4-0} = Rt; -} - -multiclass Load64RO sz, bit V, bits<2> opc, RegisterClass regtype, - string asm, ValueType Ty, SDPatternOperator loadop> { - let AddedComplexity = 10, mayLoad = 1, mayStore = 0, hasSideEffects = 0 in - def roW : LoadStore64RO, - Sched<[WriteLDIdx, ReadAdrBase]> { - let Inst{13} = 0b0; - } - - let AddedComplexity = 10, mayLoad = 1, mayStore = 0, hasSideEffects = 0 in - def roX : LoadStore64RO, - Sched<[WriteLDIdx, ReadAdrBase]> { - let Inst{13} = 0b1; - } - - def : ROInstAlias(NAME # "roX")>; -} - -multiclass Store64RO sz, bit V, bits<2> opc, RegisterClass regtype, - string asm, ValueType Ty, SDPatternOperator storeop> { - let AddedComplexity = 10, mayLoad = 0, mayStore = 1, hasSideEffects = 0 in - def roW : LoadStore64RO, - Sched<[WriteSTIdx, ReadAdrBase]> { - let Inst{13} = 0b0; - } - - let AddedComplexity = 10, mayLoad = 0, mayStore = 1, hasSideEffects = 0 in - def roX : LoadStore64RO, - Sched<[WriteSTIdx, ReadAdrBase]> { - let Inst{13} = 0b1; - } - - def : ROInstAlias(NAME # "roX")>; -} - -class LoadStore128RO sz, bit V, bits<2> opc, RegisterClass regtype, - string asm, dag ins, dag outs, list pat> - : I { - bits<5> Rt; - bits<5> Rn; - bits<5> Rm; - bits<2> extend; - let Inst{31-30} = sz; - let Inst{29-27} = 0b111; - let Inst{26} = V; - let Inst{25-24} = 0b00; - let Inst{23-22} = opc; - let Inst{21} = 1; - let Inst{20-16} = Rm; - let Inst{15} = extend{1}; // sign extend Rm? - let Inst{14} = 1; - let Inst{12} = extend{0}; // do shift? 
- let Inst{11-10} = 0b10; - let Inst{9-5} = Rn; - let Inst{4-0} = Rt; -} - -multiclass Load128RO sz, bit V, bits<2> opc, RegisterClass regtype, - string asm, ValueType Ty, SDPatternOperator loadop> { - let AddedComplexity = 10, mayLoad = 1, mayStore = 0, hasSideEffects = 0 in - def roW : LoadStore128RO, - Sched<[WriteLDIdx, ReadAdrBase]> { - let Inst{13} = 0b0; - } - - let AddedComplexity = 10, mayLoad = 1, mayStore = 0, hasSideEffects = 0 in - def roX : LoadStore128RO, - Sched<[WriteLDIdx, ReadAdrBase]> { - let Inst{13} = 0b1; - } - - def : ROInstAlias(NAME # "roX")>; -} - -multiclass Store128RO sz, bit V, bits<2> opc, RegisterClass regtype, - string asm, ValueType Ty, SDPatternOperator storeop> { - let AddedComplexity = 10, mayLoad = 0, mayStore = 1, hasSideEffects = 0 in - def roW : LoadStore128RO, - Sched<[WriteSTIdx, ReadAdrBase]> { - let Inst{13} = 0b0; - } - - let AddedComplexity = 10, mayLoad = 0, mayStore = 1, hasSideEffects = 0 in - def roX : LoadStore128RO, - Sched<[WriteSTIdx, ReadAdrBase]> { - let Inst{13} = 0b1; - } - - def : ROInstAlias(NAME # "roX")>; -} - -let mayLoad = 0, mayStore = 0, hasSideEffects = 1 in -class BasePrefetchRO sz, bit V, bits<2> opc, dag outs, dag ins, - string asm, list pat> - : I, - Sched<[WriteLD]> { - bits<5> Rt; - bits<5> Rn; - bits<5> Rm; - bits<2> extend; - let Inst{31-30} = sz; - let Inst{29-27} = 0b111; - let Inst{26} = V; - let Inst{25-24} = 0b00; - let Inst{23-22} = opc; - let Inst{21} = 1; - let Inst{20-16} = Rm; - let Inst{15} = extend{1}; // sign extend Rm? - let Inst{14} = 1; - let Inst{12} = extend{0}; // do shift? - let Inst{11-10} = 0b10; - let Inst{9-5} = Rn; - let Inst{4-0} = Rt; -} - -multiclass PrefetchRO sz, bit V, bits<2> opc, string asm> { - def roW : BasePrefetchRO { - let Inst{13} = 0b0; - } - - def roX : BasePrefetchRO { - let Inst{13} = 0b1; - } - - def : InstAlias<"prfm $Rt, [$Rn, $Rm]", - (!cast(NAME # "roX") prfop:$Rt, - GPR64sp:$Rn, GPR64:$Rm, 0, 0)>; -} - -//--- -// Load/store unscaled immediate -//--- - -def am_unscaled8 : ComplexPattern; -def am_unscaled16 : ComplexPattern; -def am_unscaled32 : ComplexPattern; -def am_unscaled64 : ComplexPattern; -def am_unscaled128 :ComplexPattern; - -class BaseLoadStoreUnscale sz, bit V, bits<2> opc, dag oops, dag iops, - string asm, list pattern> - : I { - bits<5> Rt; - bits<5> Rn; - bits<9> offset; - let Inst{31-30} = sz; - let Inst{29-27} = 0b111; - let Inst{26} = V; - let Inst{25-24} = 0b00; - let Inst{23-22} = opc; - let Inst{21} = 0; - let Inst{20-12} = offset; - let Inst{11-10} = 0b00; - let Inst{9-5} = Rn; - let Inst{4-0} = Rt; - - let DecoderMethod = "DecodeSignedLdStInstruction"; -} - -multiclass LoadUnscaled sz, bit V, bits<2> opc, RegisterClass regtype, - string asm, list pattern> { - let AddedComplexity = 1 in // try this before LoadUI - def i : BaseLoadStoreUnscale, - Sched<[WriteLD]>; - - def : InstAlias(NAME # "i") regtype:$Rt, GPR64sp:$Rn, 0)>; -} - -multiclass StoreUnscaled sz, bit V, bits<2> opc, RegisterClass regtype, - string asm, list pattern> { - let AddedComplexity = 1 in // try this before StoreUI - def i : BaseLoadStoreUnscale, - Sched<[WriteST]>; - - def : InstAlias(NAME # "i") regtype:$Rt, GPR64sp:$Rn, 0)>; -} - -multiclass PrefetchUnscaled sz, bit V, bits<2> opc, string asm, - list pat> { - let mayLoad = 0, mayStore = 0, hasSideEffects = 1 in - def i : BaseLoadStoreUnscale, - Sched<[WriteLD]>; - - def : InstAlias(NAME # "i") prfop:$Rt, GPR64sp:$Rn, 0)>; -} - -//--- -// Load/store unscaled immediate, unprivileged -//--- - -class BaseLoadStoreUnprivileged 
sz, bit V, bits<2> opc, - dag oops, dag iops, string asm> - : I { - bits<5> Rt; - bits<5> Rn; - bits<9> offset; - let Inst{31-30} = sz; - let Inst{29-27} = 0b111; - let Inst{26} = V; - let Inst{25-24} = 0b00; - let Inst{23-22} = opc; - let Inst{21} = 0; - let Inst{20-12} = offset; - let Inst{11-10} = 0b10; - let Inst{9-5} = Rn; - let Inst{4-0} = Rt; - - let DecoderMethod = "DecodeSignedLdStInstruction"; -} - -multiclass LoadUnprivileged sz, bit V, bits<2> opc, - RegisterClass regtype, string asm> { - let mayStore = 0, mayLoad = 1, hasSideEffects = 0 in - def i : BaseLoadStoreUnprivileged, - Sched<[WriteLD]>; - - def : InstAlias(NAME # "i") regtype:$Rt, GPR64sp:$Rn, 0)>; -} - -multiclass StoreUnprivileged sz, bit V, bits<2> opc, - RegisterClass regtype, string asm> { - let mayStore = 1, mayLoad = 0, hasSideEffects = 0 in - def i : BaseLoadStoreUnprivileged, - Sched<[WriteST]>; - - def : InstAlias(NAME # "i") regtype:$Rt, GPR64sp:$Rn, 0)>; -} - -//--- -// Load/store pre-indexed -//--- - -class BaseLoadStorePreIdx sz, bit V, bits<2> opc, dag oops, dag iops, - string asm, string cstr, list pat> - : I { - bits<5> Rt; - bits<5> Rn; - bits<9> offset; - let Inst{31-30} = sz; - let Inst{29-27} = 0b111; - let Inst{26} = V; - let Inst{25-24} = 0; - let Inst{23-22} = opc; - let Inst{21} = 0; - let Inst{20-12} = offset; - let Inst{11-10} = 0b11; - let Inst{9-5} = Rn; - let Inst{4-0} = Rt; - - let DecoderMethod = "DecodeSignedLdStInstruction"; -} - -let hasSideEffects = 0 in { -let mayStore = 0, mayLoad = 1 in -class LoadPreIdx sz, bit V, bits<2> opc, RegisterClass regtype, - string asm> - : BaseLoadStorePreIdx, - Sched<[WriteLD, WriteAdr]>; - -let mayStore = 1, mayLoad = 0 in -class StorePreIdx sz, bit V, bits<2> opc, RegisterClass regtype, - string asm, SDPatternOperator storeop, ValueType Ty> - : BaseLoadStorePreIdx, - Sched<[WriteAdr, WriteST]>; -} // hasSideEffects = 0 - -//--- -// Load/store post-indexed -//--- - -// (pre-index) load/stores. 
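Both writeback forms share this 9-bit signed offset layout and differ only in when the base register is updated: the pre-indexed classes just defined add the offset before the access and write the new address back, while the post-indexed classes that follow access memory at the unmodified base and apply the offset afterwards. A rough C++ model of the two behaviours, purely to illustrate the semantics:

#include <cassert>
#include <cstdint>

// Toy 64-bit load used only to show the base-update ordering.
static uint64_t load64(const uint64_t *Mem, uint64_t Addr) { return Mem[Addr / 8]; }

// ldr Xt, [Xn, #imm]!   pre-indexed: update the base, then use it.
uint64_t loadPreIdx(const uint64_t *Mem, uint64_t &Base, int64_t Imm) {
  Base += Imm;
  return load64(Mem, Base);
}

// ldr Xt, [Xn], #imm    post-indexed: use the base, then update it.
uint64_t loadPostIdx(const uint64_t *Mem, uint64_t &Base, int64_t Imm) {
  uint64_t V = load64(Mem, Base);
  Base += Imm;
  return V;
}

int main() {
  uint64_t Mem[4] = {10, 11, 12, 13};
  uint64_t Base = 0;
  assert(loadPreIdx(Mem, Base, 8) == 11 && Base == 8);   // reads Mem[1]
  assert(loadPostIdx(Mem, Base, 8) == 11 && Base == 16); // reads Mem[1], then bumps
  return 0;
}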
-class BaseLoadStorePostIdx sz, bit V, bits<2> opc, dag oops, dag iops, - string asm, string cstr, list pat> - : I { - bits<5> Rt; - bits<5> Rn; - bits<9> offset; - let Inst{31-30} = sz; - let Inst{29-27} = 0b111; - let Inst{26} = V; - let Inst{25-24} = 0b00; - let Inst{23-22} = opc; - let Inst{21} = 0b0; - let Inst{20-12} = offset; - let Inst{11-10} = 0b01; - let Inst{9-5} = Rn; - let Inst{4-0} = Rt; - - let DecoderMethod = "DecodeSignedLdStInstruction"; -} - -let hasSideEffects = 0 in { -let mayStore = 0, mayLoad = 1 in -class LoadPostIdx sz, bit V, bits<2> opc, RegisterClass regtype, - string asm> - : BaseLoadStorePostIdx, - Sched<[WriteLD, WriteI]>; - -let mayStore = 1, mayLoad = 0 in -class StorePostIdx sz, bit V, bits<2> opc, RegisterClass regtype, - string asm, SDPatternOperator storeop, ValueType Ty> - : BaseLoadStorePostIdx, - Sched<[WriteAdr, WriteST, ReadAdrBase]>; -} // hasSideEffects = 0 - - -//--- -// Load/store pair -//--- - -// (indexed, offset) - -class BaseLoadStorePairOffset opc, bit V, bit L, dag oops, dag iops, - string asm> - : I { - bits<5> Rt; - bits<5> Rt2; - bits<5> Rn; - bits<7> offset; - let Inst{31-30} = opc; - let Inst{29-27} = 0b101; - let Inst{26} = V; - let Inst{25-23} = 0b010; - let Inst{22} = L; - let Inst{21-15} = offset; - let Inst{14-10} = Rt2; - let Inst{9-5} = Rn; - let Inst{4-0} = Rt; - - let DecoderMethod = "DecodePairLdStInstruction"; -} - -multiclass LoadPairOffset opc, bit V, RegisterClass regtype, - Operand indextype, string asm> { - let hasSideEffects = 0, mayStore = 0, mayLoad = 1 in - def i : BaseLoadStorePairOffset, - Sched<[WriteLD, WriteLDHi]>; - - def : InstAlias(NAME # "i") regtype:$Rt, regtype:$Rt2, - GPR64sp:$Rn, 0)>; -} - - -multiclass StorePairOffset opc, bit V, RegisterClass regtype, - Operand indextype, string asm> { - let hasSideEffects = 0, mayLoad = 0, mayStore = 1 in - def i : BaseLoadStorePairOffset, - Sched<[WriteSTP]>; - - def : InstAlias(NAME # "i") regtype:$Rt, regtype:$Rt2, - GPR64sp:$Rn, 0)>; -} - -// (pre-indexed) -class BaseLoadStorePairPreIdx opc, bit V, bit L, dag oops, dag iops, - string asm> - : I { - bits<5> Rt; - bits<5> Rt2; - bits<5> Rn; - bits<7> offset; - let Inst{31-30} = opc; - let Inst{29-27} = 0b101; - let Inst{26} = V; - let Inst{25-23} = 0b011; - let Inst{22} = L; - let Inst{21-15} = offset; - let Inst{14-10} = Rt2; - let Inst{9-5} = Rn; - let Inst{4-0} = Rt; - - let DecoderMethod = "DecodePairLdStInstruction"; -} - -let hasSideEffects = 0 in { -let mayStore = 0, mayLoad = 1 in -class LoadPairPreIdx opc, bit V, RegisterClass regtype, - Operand indextype, string asm> - : BaseLoadStorePairPreIdx, - Sched<[WriteLD, WriteLDHi, WriteAdr]>; - -let mayStore = 1, mayLoad = 0 in -class StorePairPreIdx opc, bit V, RegisterClass regtype, - Operand indextype, string asm> - : BaseLoadStorePairPreIdx, - Sched<[WriteAdr, WriteSTP]>; -} // hasSideEffects = 0 - -// (post-indexed) - -class BaseLoadStorePairPostIdx opc, bit V, bit L, dag oops, dag iops, - string asm> - : I { - bits<5> Rt; - bits<5> Rt2; - bits<5> Rn; - bits<7> offset; - let Inst{31-30} = opc; - let Inst{29-27} = 0b101; - let Inst{26} = V; - let Inst{25-23} = 0b001; - let Inst{22} = L; - let Inst{21-15} = offset; - let Inst{14-10} = Rt2; - let Inst{9-5} = Rn; - let Inst{4-0} = Rt; - - let DecoderMethod = "DecodePairLdStInstruction"; -} - -let hasSideEffects = 0 in { -let mayStore = 0, mayLoad = 1 in -class LoadPairPostIdx opc, bit V, RegisterClass regtype, - Operand idxtype, string asm> - : BaseLoadStorePairPostIdx, - Sched<[WriteLD, WriteLDHi, 
WriteAdr]>; - -let mayStore = 1, mayLoad = 0 in -class StorePairPostIdx opc, bit V, RegisterClass regtype, - Operand idxtype, string asm> - : BaseLoadStorePairPostIdx, - Sched<[WriteAdr, WriteSTP]>; -} // hasSideEffects = 0 - -// (no-allocate) - -class BaseLoadStorePairNoAlloc opc, bit V, bit L, dag oops, dag iops, - string asm> - : I { - bits<5> Rt; - bits<5> Rt2; - bits<5> Rn; - bits<7> offset; - let Inst{31-30} = opc; - let Inst{29-27} = 0b101; - let Inst{26} = V; - let Inst{25-23} = 0b000; - let Inst{22} = L; - let Inst{21-15} = offset; - let Inst{14-10} = Rt2; - let Inst{9-5} = Rn; - let Inst{4-0} = Rt; - - let DecoderMethod = "DecodePairLdStInstruction"; -} - -multiclass LoadPairNoAlloc opc, bit V, RegisterClass regtype, - Operand indextype, string asm> { - let hasSideEffects = 0, mayStore = 0, mayLoad = 1 in - def i : BaseLoadStorePairNoAlloc, - Sched<[WriteLD, WriteLDHi]>; - - - def : InstAlias(NAME # "i") regtype:$Rt, regtype:$Rt2, - GPR64sp:$Rn, 0)>; -} - -multiclass StorePairNoAlloc opc, bit V, RegisterClass regtype, - Operand indextype, string asm> { - let hasSideEffects = 0, mayStore = 1, mayLoad = 0 in - def i : BaseLoadStorePairNoAlloc, - Sched<[WriteSTP]>; - - def : InstAlias(NAME # "i") regtype:$Rt, regtype:$Rt2, - GPR64sp:$Rn, 0)>; -} - -//--- -// Load/store exclusive -//--- - -// True exclusive operations write to and/or read from the system's exclusive -// monitors, which as far as a compiler is concerned can be modelled as a -// random shared memory address. Hence LoadExclusive mayStore. -// -// Since these instructions have the undefined register bits set to 1 in -// their canonical form, we need a post encoder method to set those bits -// to 1 when encoding these instructions. We do this using the -// fixLoadStoreExclusive function. This function has template parameters: -// -// fixLoadStoreExclusive -// -// hasRs indicates that the instruction uses the Rs field, so we won't set -// it to 1 (and the same for Rt2). We don't need template parameters for -// the other register fields since Rt and Rn are always used. -// -let hasSideEffects = 1, mayLoad = 1, mayStore = 1 in -class BaseLoadStoreExclusive sz, bit o2, bit L, bit o1, bit o0, - dag oops, dag iops, string asm, string operands> - : I { - let Inst{31-30} = sz; - let Inst{29-24} = 0b001000; - let Inst{23} = o2; - let Inst{22} = L; - let Inst{21} = o1; - let Inst{15} = o0; - - let DecoderMethod = "DecodeExclusiveLdStInstruction"; -} - -// Neither Rs nor Rt2 operands. -class LoadStoreExclusiveSimple sz, bit o2, bit L, bit o1, bit o0, - dag oops, dag iops, string asm, string operands> - : BaseLoadStoreExclusive { - bits<5> Rt; - bits<5> Rn; - let Inst{9-5} = Rn; - let Inst{4-0} = Rt; - - let PostEncoderMethod = "fixLoadStoreExclusive<0,0>"; -} - -// Simple load acquires don't set the exclusive monitor -let mayLoad = 1, mayStore = 0 in -class LoadAcquire sz, bit o2, bit L, bit o1, bit o0, - RegisterClass regtype, string asm> - : LoadStoreExclusiveSimple, - Sched<[WriteLD]>; - -class LoadExclusive sz, bit o2, bit L, bit o1, bit o0, - RegisterClass regtype, string asm> - : LoadStoreExclusiveSimple, - Sched<[WriteLD]>; - -class LoadExclusivePair sz, bit o2, bit L, bit o1, bit o0, - RegisterClass regtype, string asm> - : BaseLoadStoreExclusive, - Sched<[WriteLD, WriteLDHi]> { - bits<5> Rt; - bits<5> Rt2; - bits<5> Rn; - let Inst{14-10} = Rt2; - let Inst{9-5} = Rn; - let Inst{4-0} = Rt; - - let PostEncoderMethod = "fixLoadStoreExclusive<0,1>"; -} - -// Simple store release operations do not check the exclusive monitor. 
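The block comment above explains that the canonical encodings set the unused Rs/Rt2 register fields to all ones, and that this is done by a fixLoadStoreExclusive<hasRs, hasRt2> post-encoder. A sketch of what that fix-up amounts to, assuming the usual field positions (Rs in bits 20-16, Rt2 in bits 14-10); the real routine lives in the MC code emitter and may differ in detail:

#include <cassert>
#include <cstdint>

// Force the unused register fields of a load/store-exclusive encoding to
// 0b11111, mirroring a fixLoadStoreExclusive<hasRs, hasRt2> post-encoder.
template <bool HasRs, bool HasRt2>
uint32_t fixLoadStoreExclusive(uint32_t EncodedInst) {
  if (!HasRs)
    EncodedInst |= 0x1fu << 16;  // Rs field, bits 20-16
  if (!HasRt2)
    EncodedInst |= 0x1fu << 10;  // Rt2 field, bits 14-10
  return EncodedInst;
}

int main() {
  // An LDXR-style instruction uses neither Rs nor Rt2.
  assert(fixLoadStoreExclusive<false, false>(0) ==
         ((0x1fu << 16) | (0x1fu << 10)));
  // An LDXP-style instruction uses Rt2 but not Rs.
  assert(fixLoadStoreExclusive<false, true>(0) == (0x1fu << 16));
  return 0;
}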
-let mayLoad = 0, mayStore = 1 in -class StoreRelease sz, bit o2, bit L, bit o1, bit o0, - RegisterClass regtype, string asm> - : LoadStoreExclusiveSimple, - Sched<[WriteST]>; - -let mayLoad = 1, mayStore = 1 in -class StoreExclusive sz, bit o2, bit L, bit o1, bit o0, - RegisterClass regtype, string asm> - : BaseLoadStoreExclusive, - Sched<[WriteSTX]> { - bits<5> Ws; - bits<5> Rt; - bits<5> Rn; - let Inst{20-16} = Ws; - let Inst{9-5} = Rn; - let Inst{4-0} = Rt; - - let Constraints = "@earlyclobber $Ws"; - let PostEncoderMethod = "fixLoadStoreExclusive<1,0>"; -} - -class StoreExclusivePair sz, bit o2, bit L, bit o1, bit o0, - RegisterClass regtype, string asm> - : BaseLoadStoreExclusive, - Sched<[WriteSTX]> { - bits<5> Ws; - bits<5> Rt; - bits<5> Rt2; - bits<5> Rn; - let Inst{20-16} = Ws; - let Inst{14-10} = Rt2; - let Inst{9-5} = Rn; - let Inst{4-0} = Rt; - - let Constraints = "@earlyclobber $Ws"; -} - -//--- -// Exception generation -//--- - -let mayLoad = 0, mayStore = 0, hasSideEffects = 1 in -class ExceptionGeneration op1, bits<2> ll, string asm> - : I<(outs), (ins imm0_65535:$imm), asm, "\t$imm", "", []>, - Sched<[WriteSys]> { - bits<16> imm; - let Inst{31-24} = 0b11010100; - let Inst{23-21} = op1; - let Inst{20-5} = imm; - let Inst{4-2} = 0b000; - let Inst{1-0} = ll; -} - -let Predicates = [HasFPARMv8] in { - -//--- -// Floating point to integer conversion -//--- - -class BaseFPToIntegerUnscaled type, bits<2> rmode, bits<3> opcode, - RegisterClass srcType, RegisterClass dstType, - string asm, list pattern> - : I<(outs dstType:$Rd), (ins srcType:$Rn), - asm, "\t$Rd, $Rn", "", pattern>, - Sched<[WriteFCvt]> { - bits<5> Rd; - bits<5> Rn; - let Inst{30-29} = 0b00; - let Inst{28-24} = 0b11110; - let Inst{23-22} = type; - let Inst{21} = 1; - let Inst{20-19} = rmode; - let Inst{18-16} = opcode; - let Inst{15-10} = 0; - let Inst{9-5} = Rn; - let Inst{4-0} = Rd; -} - -let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in -class BaseFPToInteger type, bits<2> rmode, bits<3> opcode, - RegisterClass srcType, RegisterClass dstType, - Operand immType, string asm, list pattern> - : I<(outs dstType:$Rd), (ins srcType:$Rn, immType:$scale), - asm, "\t$Rd, $Rn, $scale", "", pattern>, - Sched<[WriteFCvt]> { - bits<5> Rd; - bits<5> Rn; - bits<6> scale; - let Inst{30-29} = 0b00; - let Inst{28-24} = 0b11110; - let Inst{23-22} = type; - let Inst{21} = 0; - let Inst{20-19} = rmode; - let Inst{18-16} = opcode; - let Inst{15-10} = scale; - let Inst{9-5} = Rn; - let Inst{4-0} = Rd; -} - -multiclass FPToIntegerUnscaled rmode, bits<3> opcode, string asm, - SDPatternOperator OpN> { - // Unscaled single-precision to 32-bit - def UWSr : BaseFPToIntegerUnscaled<0b00, rmode, opcode, FPR32, GPR32, asm, - [(set GPR32:$Rd, (OpN FPR32:$Rn))]> { - let Inst{31} = 0; // 32-bit GPR flag - } - - // Unscaled single-precision to 64-bit - def UXSr : BaseFPToIntegerUnscaled<0b00, rmode, opcode, FPR32, GPR64, asm, - [(set GPR64:$Rd, (OpN FPR32:$Rn))]> { - let Inst{31} = 1; // 64-bit GPR flag - } - - // Unscaled double-precision to 32-bit - def UWDr : BaseFPToIntegerUnscaled<0b01, rmode, opcode, FPR64, GPR32, asm, - [(set GPR32:$Rd, (OpN (f64 FPR64:$Rn)))]> { - let Inst{31} = 0; // 32-bit GPR flag - } - - // Unscaled double-precision to 64-bit - def UXDr : BaseFPToIntegerUnscaled<0b01, rmode, opcode, FPR64, GPR64, asm, - [(set GPR64:$Rd, (OpN (f64 FPR64:$Rn)))]> { - let Inst{31} = 1; // 64-bit GPR flag - } -} - -multiclass FPToIntegerScaled rmode, bits<3> opcode, string asm, - SDPatternOperator OpN> { - // Scaled single-precision to 
32-bit - def SWSri : BaseFPToInteger<0b00, rmode, opcode, FPR32, GPR32, - fixedpoint_f32_i32, asm, - [(set GPR32:$Rd, (OpN (fmul FPR32:$Rn, - fixedpoint_f32_i32:$scale)))]> { - let Inst{31} = 0; // 32-bit GPR flag - let scale{5} = 1; - } - - // Scaled single-precision to 64-bit - def SXSri : BaseFPToInteger<0b00, rmode, opcode, FPR32, GPR64, - fixedpoint_f32_i64, asm, - [(set GPR64:$Rd, (OpN (fmul FPR32:$Rn, - fixedpoint_f32_i64:$scale)))]> { - let Inst{31} = 1; // 64-bit GPR flag - } - - // Scaled double-precision to 32-bit - def SWDri : BaseFPToInteger<0b01, rmode, opcode, FPR64, GPR32, - fixedpoint_f64_i32, asm, - [(set GPR32:$Rd, (OpN (fmul FPR64:$Rn, - fixedpoint_f64_i32:$scale)))]> { - let Inst{31} = 0; // 32-bit GPR flag - let scale{5} = 1; - } - - // Scaled double-precision to 64-bit - def SXDri : BaseFPToInteger<0b01, rmode, opcode, FPR64, GPR64, - fixedpoint_f64_i64, asm, - [(set GPR64:$Rd, (OpN (fmul FPR64:$Rn, - fixedpoint_f64_i64:$scale)))]> { - let Inst{31} = 1; // 64-bit GPR flag - } -} - -//--- -// Integer to floating point conversion -//--- - -let mayStore = 0, mayLoad = 0, hasSideEffects = 0 in -class BaseIntegerToFP pattern> - : I<(outs dstType:$Rd), (ins srcType:$Rn, immType:$scale), - asm, "\t$Rd, $Rn, $scale", "", pattern>, - Sched<[WriteFCvt]> { - bits<5> Rd; - bits<5> Rn; - bits<6> scale; - let Inst{30-23} = 0b00111100; - let Inst{21-17} = 0b00001; - let Inst{16} = isUnsigned; - let Inst{15-10} = scale; - let Inst{9-5} = Rn; - let Inst{4-0} = Rd; -} - -class BaseIntegerToFPUnscaled - : I<(outs dstType:$Rd), (ins srcType:$Rn), - asm, "\t$Rd, $Rn", "", [(set (dvt dstType:$Rd), (node srcType:$Rn))]>, - Sched<[WriteFCvt]> { - bits<5> Rd; - bits<5> Rn; - bits<6> scale; - let Inst{30-23} = 0b00111100; - let Inst{21-17} = 0b10001; - let Inst{16} = isUnsigned; - let Inst{15-10} = 0b000000; - let Inst{9-5} = Rn; - let Inst{4-0} = Rd; -} - -multiclass IntegerToFP { - // Unscaled - def UWSri: BaseIntegerToFPUnscaled { - let Inst{31} = 0; // 32-bit GPR flag - let Inst{22} = 0; // 32-bit FPR flag - } - - def UWDri: BaseIntegerToFPUnscaled { - let Inst{31} = 0; // 32-bit GPR flag - let Inst{22} = 1; // 64-bit FPR flag - } - - def UXSri: BaseIntegerToFPUnscaled { - let Inst{31} = 1; // 64-bit GPR flag - let Inst{22} = 0; // 32-bit FPR flag - } - - def UXDri: BaseIntegerToFPUnscaled { - let Inst{31} = 1; // 64-bit GPR flag - let Inst{22} = 1; // 64-bit FPR flag - } - - // Scaled - def SWSri: BaseIntegerToFP { - let Inst{31} = 0; // 32-bit GPR flag - let Inst{22} = 0; // 32-bit FPR flag - let scale{5} = 1; - } - - def SWDri: BaseIntegerToFP { - let Inst{31} = 0; // 32-bit GPR flag - let Inst{22} = 1; // 64-bit FPR flag - let scale{5} = 1; - } - - def SXSri: BaseIntegerToFP { - let Inst{31} = 1; // 64-bit GPR flag - let Inst{22} = 0; // 32-bit FPR flag - } - - def SXDri: BaseIntegerToFP { - let Inst{31} = 1; // 64-bit GPR flag - let Inst{22} = 1; // 64-bit FPR flag - } -} - -//--- -// Unscaled integer <-> floating point conversion (i.e. FMOV) -//--- - -let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in -class BaseUnscaledConversion rmode, bits<3> opcode, - RegisterClass srcType, RegisterClass dstType, - string asm> - : I<(outs dstType:$Rd), (ins srcType:$Rn), asm, "\t$Rd, $Rn", "", - // We use COPY_TO_REGCLASS for these bitconvert operations. - // copyPhysReg() expands the resultant COPY instructions after - // regalloc is done. This gives greater freedom for the allocator - // and related passes (coalescing, copy propagation, et. al.) to - // be more effective. 
- [/*(set (dvt dstType:$Rd), (bitconvert (svt srcType:$Rn)))*/]>, - Sched<[WriteFCopy]> { - bits<5> Rd; - bits<5> Rn; - let Inst{30-23} = 0b00111100; - let Inst{21} = 1; - let Inst{20-19} = rmode; - let Inst{18-16} = opcode; - let Inst{15-10} = 0b000000; - let Inst{9-5} = Rn; - let Inst{4-0} = Rd; -} - -let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in -class BaseUnscaledConversionToHigh rmode, bits<3> opcode, - RegisterClass srcType, RegisterOperand dstType, string asm, - string kind> - : I<(outs dstType:$Rd), (ins srcType:$Rn, VectorIndex1:$idx), asm, - "{\t$Rd"#kind#"$idx, $Rn|"#kind#"\t$Rd$idx, $Rn}", "", []>, - Sched<[WriteFCopy]> { - bits<5> Rd; - bits<5> Rn; - let Inst{30-23} = 0b00111101; - let Inst{21} = 1; - let Inst{20-19} = rmode; - let Inst{18-16} = opcode; - let Inst{15-10} = 0b000000; - let Inst{9-5} = Rn; - let Inst{4-0} = Rd; - - let DecoderMethod = "DecodeFMOVLaneInstruction"; -} - -let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in -class BaseUnscaledConversionFromHigh rmode, bits<3> opcode, - RegisterOperand srcType, RegisterClass dstType, string asm, - string kind> - : I<(outs dstType:$Rd), (ins srcType:$Rn, VectorIndex1:$idx), asm, - "{\t$Rd, $Rn"#kind#"$idx|"#kind#"\t$Rd, $Rn$idx}", "", []>, - Sched<[WriteFCopy]> { - bits<5> Rd; - bits<5> Rn; - let Inst{30-23} = 0b00111101; - let Inst{21} = 1; - let Inst{20-19} = rmode; - let Inst{18-16} = opcode; - let Inst{15-10} = 0b000000; - let Inst{9-5} = Rn; - let Inst{4-0} = Rd; - - let DecoderMethod = "DecodeFMOVLaneInstruction"; -} - - - -multiclass UnscaledConversion { - def WSr : BaseUnscaledConversion<0b00, 0b111, GPR32, FPR32, asm> { - let Inst{31} = 0; // 32-bit GPR flag - let Inst{22} = 0; // 32-bit FPR flag - } - - def XDr : BaseUnscaledConversion<0b00, 0b111, GPR64, FPR64, asm> { - let Inst{31} = 1; // 64-bit GPR flag - let Inst{22} = 1; // 64-bit FPR flag - } - - def SWr : BaseUnscaledConversion<0b00, 0b110, FPR32, GPR32, asm> { - let Inst{31} = 0; // 32-bit GPR flag - let Inst{22} = 0; // 32-bit FPR flag - } - - def DXr : BaseUnscaledConversion<0b00, 0b110, FPR64, GPR64, asm> { - let Inst{31} = 1; // 64-bit GPR flag - let Inst{22} = 1; // 64-bit FPR flag - } - - def XDHighr : BaseUnscaledConversionToHigh<0b01, 0b111, GPR64, V128, - asm, ".d"> { - let Inst{31} = 1; - let Inst{22} = 0; - } - - def DXHighr : BaseUnscaledConversionFromHigh<0b01, 0b110, V128, GPR64, - asm, ".d"> { - let Inst{31} = 1; - let Inst{22} = 0; - } -} - -//--- -// Floating point conversion -//--- - -class BaseFPConversion type, bits<2> opcode, RegisterClass dstType, - RegisterClass srcType, string asm, list pattern> - : I<(outs dstType:$Rd), (ins srcType:$Rn), asm, "\t$Rd, $Rn", "", pattern>, - Sched<[WriteFCvt]> { - bits<5> Rd; - bits<5> Rn; - let Inst{31-24} = 0b00011110; - let Inst{23-22} = type; - let Inst{21-17} = 0b10001; - let Inst{16-15} = opcode; - let Inst{14-10} = 0b10000; - let Inst{9-5} = Rn; - let Inst{4-0} = Rd; -} - -multiclass FPConversion { - // Double-precision to Half-precision - def HDr : BaseFPConversion<0b01, 0b11, FPR16, FPR64, asm, - [(set FPR16:$Rd, (fround FPR64:$Rn))]>; - - // Double-precision to Single-precision - def SDr : BaseFPConversion<0b01, 0b00, FPR32, FPR64, asm, - [(set FPR32:$Rd, (fround FPR64:$Rn))]>; - - // Half-precision to Double-precision - def DHr : BaseFPConversion<0b11, 0b01, FPR64, FPR16, asm, - [(set FPR64:$Rd, (fextend FPR16:$Rn))]>; - - // Half-precision to Single-precision - def SHr : BaseFPConversion<0b11, 0b00, FPR32, FPR16, asm, - [(set FPR32:$Rd, (fextend FPR16:$Rn))]>; - - // 
Single-precision to Double-precision - def DSr : BaseFPConversion<0b00, 0b01, FPR64, FPR32, asm, - [(set FPR64:$Rd, (fextend FPR32:$Rn))]>; - - // Single-precision to Half-precision - def HSr : BaseFPConversion<0b00, 0b11, FPR16, FPR32, asm, - [(set FPR16:$Rd, (fround FPR32:$Rn))]>; -} - -//--- -// Single operand floating point data processing -//--- - -let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in -class BaseSingleOperandFPData opcode, RegisterClass regtype, - ValueType vt, string asm, SDPatternOperator node> - : I<(outs regtype:$Rd), (ins regtype:$Rn), asm, "\t$Rd, $Rn", "", - [(set (vt regtype:$Rd), (node (vt regtype:$Rn)))]>, - Sched<[WriteF]> { - bits<5> Rd; - bits<5> Rn; - let Inst{31-23} = 0b000111100; - let Inst{21-19} = 0b100; - let Inst{18-15} = opcode; - let Inst{14-10} = 0b10000; - let Inst{9-5} = Rn; - let Inst{4-0} = Rd; -} - -multiclass SingleOperandFPData opcode, string asm, - SDPatternOperator node = null_frag> { - def Sr : BaseSingleOperandFPData { - let Inst{22} = 0; // 32-bit size flag - } - - def Dr : BaseSingleOperandFPData { - let Inst{22} = 1; // 64-bit size flag - } -} - -//--- -// Two operand floating point data processing -//--- - -let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in -class BaseTwoOperandFPData opcode, RegisterClass regtype, - string asm, list pat> - : I<(outs regtype:$Rd), (ins regtype:$Rn, regtype:$Rm), - asm, "\t$Rd, $Rn, $Rm", "", pat>, - Sched<[WriteF]> { - bits<5> Rd; - bits<5> Rn; - bits<5> Rm; - let Inst{31-23} = 0b000111100; - let Inst{21} = 1; - let Inst{20-16} = Rm; - let Inst{15-12} = opcode; - let Inst{11-10} = 0b10; - let Inst{9-5} = Rn; - let Inst{4-0} = Rd; -} - -multiclass TwoOperandFPData opcode, string asm, - SDPatternOperator node = null_frag> { - def Srr : BaseTwoOperandFPData { - let Inst{22} = 0; // 32-bit size flag - } - - def Drr : BaseTwoOperandFPData { - let Inst{22} = 1; // 64-bit size flag - } -} - -multiclass TwoOperandFPDataNeg opcode, string asm, SDNode node> { - def Srr : BaseTwoOperandFPData { - let Inst{22} = 0; // 32-bit size flag - } - - def Drr : BaseTwoOperandFPData { - let Inst{22} = 1; // 64-bit size flag - } -} - - -//--- -// Three operand floating point data processing -//--- - -class BaseThreeOperandFPData pat> - : I<(outs regtype:$Rd), (ins regtype:$Rn, regtype:$Rm, regtype: $Ra), - asm, "\t$Rd, $Rn, $Rm, $Ra", "", pat>, - Sched<[WriteFMul]> { - bits<5> Rd; - bits<5> Rn; - bits<5> Rm; - bits<5> Ra; - let Inst{31-23} = 0b000111110; - let Inst{21} = isNegated; - let Inst{20-16} = Rm; - let Inst{15} = isSub; - let Inst{14-10} = Ra; - let Inst{9-5} = Rn; - let Inst{4-0} = Rd; -} - -multiclass ThreeOperandFPData { - def Srrr : BaseThreeOperandFPData { - let Inst{22} = 0; // 32-bit size flag - } - - def Drrr : BaseThreeOperandFPData { - let Inst{22} = 1; // 64-bit size flag - } -} - -//--- -// Floating point data comparisons -//--- - -let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in -class BaseOneOperandFPComparison pat> - : I<(outs), (ins regtype:$Rn), asm, "\t$Rn, #0.0", "", pat>, - Sched<[WriteFCmp]> { - bits<5> Rn; - let Inst{31-23} = 0b000111100; - let Inst{21} = 1; - - let Inst{15-10} = 0b001000; - let Inst{9-5} = Rn; - let Inst{4} = signalAllNans; - let Inst{3-0} = 0b1000; - - // Rm should be 0b00000 canonically, but we need to accept any value. 
- let PostEncoderMethod = "fixOneOperandFPComparison"; -} - -let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in -class BaseTwoOperandFPComparison pat> - : I<(outs), (ins regtype:$Rn, regtype:$Rm), asm, "\t$Rn, $Rm", "", pat>, - Sched<[WriteFCmp]> { - bits<5> Rm; - bits<5> Rn; - let Inst{31-23} = 0b000111100; - let Inst{21} = 1; - let Inst{20-16} = Rm; - let Inst{15-10} = 0b001000; - let Inst{9-5} = Rn; - let Inst{4} = signalAllNans; - let Inst{3-0} = 0b0000; -} - -multiclass FPComparison { - let Defs = [NZCV] in { - def Srr : BaseTwoOperandFPComparison { - let Inst{22} = 0; - } - - def Sri : BaseOneOperandFPComparison { - let Inst{22} = 0; - } - - def Drr : BaseTwoOperandFPComparison { - let Inst{22} = 1; - } - - def Dri : BaseOneOperandFPComparison { - let Inst{22} = 1; - } - } // Defs = [NZCV] -} - -//--- -// Floating point conditional comparisons -//--- - -let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in -class BaseFPCondComparison - : I<(outs), (ins regtype:$Rn, regtype:$Rm, imm0_15:$nzcv, ccode:$cond), - asm, "\t$Rn, $Rm, $nzcv, $cond", "", []>, - Sched<[WriteFCmp]> { - bits<5> Rn; - bits<5> Rm; - bits<4> nzcv; - bits<4> cond; - - let Inst{31-23} = 0b000111100; - let Inst{21} = 1; - let Inst{20-16} = Rm; - let Inst{15-12} = cond; - let Inst{11-10} = 0b01; - let Inst{9-5} = Rn; - let Inst{4} = signalAllNans; - let Inst{3-0} = nzcv; -} - -multiclass FPCondComparison { - let Defs = [NZCV], Uses = [NZCV] in { - def Srr : BaseFPCondComparison { - let Inst{22} = 0; - } - - def Drr : BaseFPCondComparison { - let Inst{22} = 1; - } - } // Defs = [NZCV], Uses = [NZCV] -} - -//--- -// Floating point conditional select -//--- - -class BaseFPCondSelect - : I<(outs regtype:$Rd), (ins regtype:$Rn, regtype:$Rm, ccode:$cond), - asm, "\t$Rd, $Rn, $Rm, $cond", "", - [(set regtype:$Rd, - (ARM64csel (vt regtype:$Rn), regtype:$Rm, - (i32 imm:$cond), NZCV))]>, - Sched<[WriteF]> { - bits<5> Rd; - bits<5> Rn; - bits<5> Rm; - bits<4> cond; - - let Inst{31-23} = 0b000111100; - let Inst{21} = 1; - let Inst{20-16} = Rm; - let Inst{15-12} = cond; - let Inst{11-10} = 0b11; - let Inst{9-5} = Rn; - let Inst{4-0} = Rd; -} - -multiclass FPCondSelect { - let Uses = [NZCV] in { - def Srrr : BaseFPCondSelect { - let Inst{22} = 0; - } - - def Drrr : BaseFPCondSelect { - let Inst{22} = 1; - } - } // Uses = [NZCV] -} - -//--- -// Floating move immediate -//--- - -class BaseFPMoveImmediate - : I<(outs regtype:$Rd), (ins fpimmtype:$imm), asm, "\t$Rd, $imm", "", - [(set regtype:$Rd, fpimmtype:$imm)]>, - Sched<[WriteFImm]> { - bits<5> Rd; - bits<8> imm; - let Inst{31-23} = 0b000111100; - let Inst{21} = 1; - let Inst{20-13} = imm; - let Inst{12-5} = 0b10000000; - let Inst{4-0} = Rd; -} - -multiclass FPMoveImmediate { - def Si : BaseFPMoveImmediate { - let Inst{22} = 0; - } - - def Di : BaseFPMoveImmediate { - let Inst{22} = 1; - } -} -} // end of 'let Predicates = [HasFPARMv8]' - -//---------------------------------------------------------------------------- -// AdvSIMD -//---------------------------------------------------------------------------- - -let Predicates = [HasNEON] in { - -//---------------------------------------------------------------------------- -// AdvSIMD three register vector instructions -//---------------------------------------------------------------------------- - -let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in -class BaseSIMDThreeSameVector size, bits<5> opcode, - RegisterOperand regtype, string asm, string kind, - list pattern> - : I<(outs regtype:$Rd), (ins regtype:$Rn, regtype:$Rm), asm, 
- "{\t$Rd" # kind # ", $Rn" # kind # ", $Rm" # kind # - "|" # kind # "\t$Rd, $Rn, $Rm|}", "", pattern>, - Sched<[WriteV]> { - bits<5> Rd; - bits<5> Rn; - bits<5> Rm; - let Inst{31} = 0; - let Inst{30} = Q; - let Inst{29} = U; - let Inst{28-24} = 0b01110; - let Inst{23-22} = size; - let Inst{21} = 1; - let Inst{20-16} = Rm; - let Inst{15-11} = opcode; - let Inst{10} = 1; - let Inst{9-5} = Rn; - let Inst{4-0} = Rd; -} - -let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in -class BaseSIMDThreeSameVectorTied size, bits<5> opcode, - RegisterOperand regtype, string asm, string kind, - list pattern> - : I<(outs regtype:$dst), (ins regtype:$Rd, regtype:$Rn, regtype:$Rm), asm, - "{\t$Rd" # kind # ", $Rn" # kind # ", $Rm" # kind # - "|" # kind # "\t$Rd, $Rn, $Rm}", "$Rd = $dst", pattern>, - Sched<[WriteV]> { - bits<5> Rd; - bits<5> Rn; - bits<5> Rm; - let Inst{31} = 0; - let Inst{30} = Q; - let Inst{29} = U; - let Inst{28-24} = 0b01110; - let Inst{23-22} = size; - let Inst{21} = 1; - let Inst{20-16} = Rm; - let Inst{15-11} = opcode; - let Inst{10} = 1; - let Inst{9-5} = Rn; - let Inst{4-0} = Rd; -} - -// All operand sizes distinguished in the encoding. -multiclass SIMDThreeSameVector opc, string asm, - SDPatternOperator OpNode> { - def v8i8 : BaseSIMDThreeSameVector<0, U, 0b00, opc, V64, - asm, ".8b", - [(set (v8i8 V64:$Rd), (OpNode (v8i8 V64:$Rn), (v8i8 V64:$Rm)))]>; - def v16i8 : BaseSIMDThreeSameVector<1, U, 0b00, opc, V128, - asm, ".16b", - [(set (v16i8 V128:$Rd), (OpNode (v16i8 V128:$Rn), (v16i8 V128:$Rm)))]>; - def v4i16 : BaseSIMDThreeSameVector<0, U, 0b01, opc, V64, - asm, ".4h", - [(set (v4i16 V64:$Rd), (OpNode (v4i16 V64:$Rn), (v4i16 V64:$Rm)))]>; - def v8i16 : BaseSIMDThreeSameVector<1, U, 0b01, opc, V128, - asm, ".8h", - [(set (v8i16 V128:$Rd), (OpNode (v8i16 V128:$Rn), (v8i16 V128:$Rm)))]>; - def v2i32 : BaseSIMDThreeSameVector<0, U, 0b10, opc, V64, - asm, ".2s", - [(set (v2i32 V64:$Rd), (OpNode (v2i32 V64:$Rn), (v2i32 V64:$Rm)))]>; - def v4i32 : BaseSIMDThreeSameVector<1, U, 0b10, opc, V128, - asm, ".4s", - [(set (v4i32 V128:$Rd), (OpNode (v4i32 V128:$Rn), (v4i32 V128:$Rm)))]>; - def v2i64 : BaseSIMDThreeSameVector<1, U, 0b11, opc, V128, - asm, ".2d", - [(set (v2i64 V128:$Rd), (OpNode (v2i64 V128:$Rn), (v2i64 V128:$Rm)))]>; -} - -// As above, but D sized elements unsupported. 
-multiclass SIMDThreeSameVectorBHS opc, string asm, - SDPatternOperator OpNode> { - def v8i8 : BaseSIMDThreeSameVector<0, U, 0b00, opc, V64, - asm, ".8b", - [(set V64:$Rd, (v8i8 (OpNode (v8i8 V64:$Rn), (v8i8 V64:$Rm))))]>; - def v16i8 : BaseSIMDThreeSameVector<1, U, 0b00, opc, V128, - asm, ".16b", - [(set V128:$Rd, (v16i8 (OpNode (v16i8 V128:$Rn), (v16i8 V128:$Rm))))]>; - def v4i16 : BaseSIMDThreeSameVector<0, U, 0b01, opc, V64, - asm, ".4h", - [(set V64:$Rd, (v4i16 (OpNode (v4i16 V64:$Rn), (v4i16 V64:$Rm))))]>; - def v8i16 : BaseSIMDThreeSameVector<1, U, 0b01, opc, V128, - asm, ".8h", - [(set V128:$Rd, (v8i16 (OpNode (v8i16 V128:$Rn), (v8i16 V128:$Rm))))]>; - def v2i32 : BaseSIMDThreeSameVector<0, U, 0b10, opc, V64, - asm, ".2s", - [(set V64:$Rd, (v2i32 (OpNode (v2i32 V64:$Rn), (v2i32 V64:$Rm))))]>; - def v4i32 : BaseSIMDThreeSameVector<1, U, 0b10, opc, V128, - asm, ".4s", - [(set V128:$Rd, (v4i32 (OpNode (v4i32 V128:$Rn), (v4i32 V128:$Rm))))]>; -} - -multiclass SIMDThreeSameVectorBHSTied opc, string asm, - SDPatternOperator OpNode> { - def v8i8 : BaseSIMDThreeSameVectorTied<0, U, 0b00, opc, V64, - asm, ".8b", - [(set (v8i8 V64:$dst), - (OpNode (v8i8 V64:$Rd), (v8i8 V64:$Rn), (v8i8 V64:$Rm)))]>; - def v16i8 : BaseSIMDThreeSameVectorTied<1, U, 0b00, opc, V128, - asm, ".16b", - [(set (v16i8 V128:$dst), - (OpNode (v16i8 V128:$Rd), (v16i8 V128:$Rn), (v16i8 V128:$Rm)))]>; - def v4i16 : BaseSIMDThreeSameVectorTied<0, U, 0b01, opc, V64, - asm, ".4h", - [(set (v4i16 V64:$dst), - (OpNode (v4i16 V64:$Rd), (v4i16 V64:$Rn), (v4i16 V64:$Rm)))]>; - def v8i16 : BaseSIMDThreeSameVectorTied<1, U, 0b01, opc, V128, - asm, ".8h", - [(set (v8i16 V128:$dst), - (OpNode (v8i16 V128:$Rd), (v8i16 V128:$Rn), (v8i16 V128:$Rm)))]>; - def v2i32 : BaseSIMDThreeSameVectorTied<0, U, 0b10, opc, V64, - asm, ".2s", - [(set (v2i32 V64:$dst), - (OpNode (v2i32 V64:$Rd), (v2i32 V64:$Rn), (v2i32 V64:$Rm)))]>; - def v4i32 : BaseSIMDThreeSameVectorTied<1, U, 0b10, opc, V128, - asm, ".4s", - [(set (v4i32 V128:$dst), - (OpNode (v4i32 V128:$Rd), (v4i32 V128:$Rn), (v4i32 V128:$Rm)))]>; -} - -// As above, but only B sized elements supported. -multiclass SIMDThreeSameVectorB opc, string asm, - SDPatternOperator OpNode> { - def v8i8 : BaseSIMDThreeSameVector<0, U, 0b00, opc, V64, - asm, ".8b", - [(set (v8i8 V64:$Rd), (OpNode (v8i8 V64:$Rn), (v8i8 V64:$Rm)))]>; - def v16i8 : BaseSIMDThreeSameVector<1, U, 0b00, opc, V128, - asm, ".16b", - [(set (v16i8 V128:$Rd), - (OpNode (v16i8 V128:$Rn), (v16i8 V128:$Rm)))]>; -} - -// As above, but only S and D sized floating point elements supported. 
-multiclass SIMDThreeSameVectorFP opc, - string asm, SDPatternOperator OpNode> { - def v2f32 : BaseSIMDThreeSameVector<0, U, {S,0}, opc, V64, - asm, ".2s", - [(set (v2f32 V64:$Rd), (OpNode (v2f32 V64:$Rn), (v2f32 V64:$Rm)))]>; - def v4f32 : BaseSIMDThreeSameVector<1, U, {S,0}, opc, V128, - asm, ".4s", - [(set (v4f32 V128:$Rd), (OpNode (v4f32 V128:$Rn), (v4f32 V128:$Rm)))]>; - def v2f64 : BaseSIMDThreeSameVector<1, U, {S,1}, opc, V128, - asm, ".2d", - [(set (v2f64 V128:$Rd), (OpNode (v2f64 V128:$Rn), (v2f64 V128:$Rm)))]>; -} - -multiclass SIMDThreeSameVectorFPCmp opc, - string asm, - SDPatternOperator OpNode> { - def v2f32 : BaseSIMDThreeSameVector<0, U, {S,0}, opc, V64, - asm, ".2s", - [(set (v2i32 V64:$Rd), (OpNode (v2f32 V64:$Rn), (v2f32 V64:$Rm)))]>; - def v4f32 : BaseSIMDThreeSameVector<1, U, {S,0}, opc, V128, - asm, ".4s", - [(set (v4i32 V128:$Rd), (OpNode (v4f32 V128:$Rn), (v4f32 V128:$Rm)))]>; - def v2f64 : BaseSIMDThreeSameVector<1, U, {S,1}, opc, V128, - asm, ".2d", - [(set (v2i64 V128:$Rd), (OpNode (v2f64 V128:$Rn), (v2f64 V128:$Rm)))]>; -} - -multiclass SIMDThreeSameVectorFPTied opc, - string asm, SDPatternOperator OpNode> { - def v2f32 : BaseSIMDThreeSameVectorTied<0, U, {S,0}, opc, V64, - asm, ".2s", - [(set (v2f32 V64:$dst), - (OpNode (v2f32 V64:$Rd), (v2f32 V64:$Rn), (v2f32 V64:$Rm)))]>; - def v4f32 : BaseSIMDThreeSameVectorTied<1, U, {S,0}, opc, V128, - asm, ".4s", - [(set (v4f32 V128:$dst), - (OpNode (v4f32 V128:$Rd), (v4f32 V128:$Rn), (v4f32 V128:$Rm)))]>; - def v2f64 : BaseSIMDThreeSameVectorTied<1, U, {S,1}, opc, V128, - asm, ".2d", - [(set (v2f64 V128:$dst), - (OpNode (v2f64 V128:$Rd), (v2f64 V128:$Rn), (v2f64 V128:$Rm)))]>; -} - -// As above, but D and B sized elements unsupported. -multiclass SIMDThreeSameVectorHS opc, string asm, - SDPatternOperator OpNode> { - def v4i16 : BaseSIMDThreeSameVector<0, U, 0b01, opc, V64, - asm, ".4h", - [(set (v4i16 V64:$Rd), (OpNode (v4i16 V64:$Rn), (v4i16 V64:$Rm)))]>; - def v8i16 : BaseSIMDThreeSameVector<1, U, 0b01, opc, V128, - asm, ".8h", - [(set (v8i16 V128:$Rd), (OpNode (v8i16 V128:$Rn), (v8i16 V128:$Rm)))]>; - def v2i32 : BaseSIMDThreeSameVector<0, U, 0b10, opc, V64, - asm, ".2s", - [(set (v2i32 V64:$Rd), (OpNode (v2i32 V64:$Rn), (v2i32 V64:$Rm)))]>; - def v4i32 : BaseSIMDThreeSameVector<1, U, 0b10, opc, V128, - asm, ".4s", - [(set (v4i32 V128:$Rd), (OpNode (v4i32 V128:$Rn), (v4i32 V128:$Rm)))]>; -} - -// Logical three vector ops share opcode bits, and only use B sized elements. 
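Because bitwise operations do not care about element width, the logical multiclasses below define only the .8b and .16b encodings and add extra patterns that funnel v4i16, v2i32, v1i64 and their 128-bit counterparts onto those byte-sized instructions. The underlying identity is simply that reinterpreting the lanes cannot change a bitwise result; a tiny C++ illustration:

#include <cassert>
#include <cstdint>
#include <cstring>

int main() {
  // AND the same 64 bits once as four 16-bit lanes...
  uint16_t A[4] = {0x00ff, 0xff00, 0x1234, 0xffff};
  uint16_t B[4] = {0x0f0f, 0x0f0f, 0xff00, 0x0000};
  uint16_t R16[4];
  for (int i = 0; i < 4; ++i) R16[i] = A[i] & B[i];

  // ...and once as eight 8-bit lanes: the bit pattern is identical.
  uint8_t A8[8], B8[8], R8[8];
  std::memcpy(A8, A, 8); std::memcpy(B8, B, 8);
  for (int i = 0; i < 8; ++i) R8[i] = A8[i] & B8[i];

  assert(std::memcmp(R16, R8, 8) == 0);
  return 0;
}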
-multiclass SIMDLogicalThreeVector size, string asm, - SDPatternOperator OpNode = null_frag> { - def v8i8 : BaseSIMDThreeSameVector<0, U, size, 0b00011, V64, - asm, ".8b", - [(set (v8i8 V64:$Rd), (OpNode V64:$Rn, V64:$Rm))]>; - def v16i8 : BaseSIMDThreeSameVector<1, U, size, 0b00011, V128, - asm, ".16b", - [(set (v16i8 V128:$Rd), (OpNode V128:$Rn, V128:$Rm))]>; - - def : Pat<(v4i16 (OpNode V64:$LHS, V64:$RHS)), - (!cast(NAME#"v8i8") V64:$LHS, V64:$RHS)>; - def : Pat<(v2i32 (OpNode V64:$LHS, V64:$RHS)), - (!cast(NAME#"v8i8") V64:$LHS, V64:$RHS)>; - def : Pat<(v1i64 (OpNode V64:$LHS, V64:$RHS)), - (!cast(NAME#"v8i8") V64:$LHS, V64:$RHS)>; - - def : Pat<(v8i16 (OpNode V128:$LHS, V128:$RHS)), - (!cast(NAME#"v16i8") V128:$LHS, V128:$RHS)>; - def : Pat<(v4i32 (OpNode V128:$LHS, V128:$RHS)), - (!cast(NAME#"v16i8") V128:$LHS, V128:$RHS)>; - def : Pat<(v2i64 (OpNode V128:$LHS, V128:$RHS)), - (!cast(NAME#"v16i8") V128:$LHS, V128:$RHS)>; -} - -multiclass SIMDLogicalThreeVectorTied size, - string asm, SDPatternOperator OpNode> { - def v8i8 : BaseSIMDThreeSameVectorTied<0, U, size, 0b00011, V64, - asm, ".8b", - [(set (v8i8 V64:$dst), - (OpNode (v8i8 V64:$Rd), (v8i8 V64:$Rn), (v8i8 V64:$Rm)))]>; - def v16i8 : BaseSIMDThreeSameVectorTied<1, U, size, 0b00011, V128, - asm, ".16b", - [(set (v16i8 V128:$dst), - (OpNode (v16i8 V128:$Rd), (v16i8 V128:$Rn), - (v16i8 V128:$Rm)))]>; - - def : Pat<(v4i16 (OpNode (v4i16 V64:$LHS), (v4i16 V64:$MHS), - (v4i16 V64:$RHS))), - (!cast(NAME#"v8i8") - V64:$LHS, V64:$MHS, V64:$RHS)>; - def : Pat<(v2i32 (OpNode (v2i32 V64:$LHS), (v2i32 V64:$MHS), - (v2i32 V64:$RHS))), - (!cast(NAME#"v8i8") - V64:$LHS, V64:$MHS, V64:$RHS)>; - def : Pat<(v1i64 (OpNode (v1i64 V64:$LHS), (v1i64 V64:$MHS), - (v1i64 V64:$RHS))), - (!cast(NAME#"v8i8") - V64:$LHS, V64:$MHS, V64:$RHS)>; - - def : Pat<(v8i16 (OpNode (v8i16 V128:$LHS), (v8i16 V128:$MHS), - (v8i16 V128:$RHS))), - (!cast(NAME#"v16i8") - V128:$LHS, V128:$MHS, V128:$RHS)>; - def : Pat<(v4i32 (OpNode (v4i32 V128:$LHS), (v4i32 V128:$MHS), - (v4i32 V128:$RHS))), - (!cast(NAME#"v16i8") - V128:$LHS, V128:$MHS, V128:$RHS)>; - def : Pat<(v2i64 (OpNode (v2i64 V128:$LHS), (v2i64 V128:$MHS), - (v2i64 V128:$RHS))), - (!cast(NAME#"v16i8") - V128:$LHS, V128:$MHS, V128:$RHS)>; -} - - -//---------------------------------------------------------------------------- -// AdvSIMD two register vector instructions. 
-//---------------------------------------------------------------------------- - -let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in -class BaseSIMDTwoSameVector size, bits<5> opcode, - RegisterOperand regtype, string asm, string dstkind, - string srckind, list pattern> - : I<(outs regtype:$Rd), (ins regtype:$Rn), asm, - "{\t$Rd" # dstkind # ", $Rn" # srckind # - "|" # dstkind # "\t$Rd, $Rn}", "", pattern>, - Sched<[WriteV]> { - bits<5> Rd; - bits<5> Rn; - let Inst{31} = 0; - let Inst{30} = Q; - let Inst{29} = U; - let Inst{28-24} = 0b01110; - let Inst{23-22} = size; - let Inst{21-17} = 0b10000; - let Inst{16-12} = opcode; - let Inst{11-10} = 0b10; - let Inst{9-5} = Rn; - let Inst{4-0} = Rd; -} - -let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in -class BaseSIMDTwoSameVectorTied size, bits<5> opcode, - RegisterOperand regtype, string asm, string dstkind, - string srckind, list pattern> - : I<(outs regtype:$dst), (ins regtype:$Rd, regtype:$Rn), asm, - "{\t$Rd" # dstkind # ", $Rn" # srckind # - "|" # dstkind # "\t$Rd, $Rn}", "$Rd = $dst", pattern>, - Sched<[WriteV]> { - bits<5> Rd; - bits<5> Rn; - let Inst{31} = 0; - let Inst{30} = Q; - let Inst{29} = U; - let Inst{28-24} = 0b01110; - let Inst{23-22} = size; - let Inst{21-17} = 0b10000; - let Inst{16-12} = opcode; - let Inst{11-10} = 0b10; - let Inst{9-5} = Rn; - let Inst{4-0} = Rd; -} - -// Supports B, H, and S element sizes. -multiclass SIMDTwoVectorBHS opc, string asm, - SDPatternOperator OpNode> { - def v8i8 : BaseSIMDTwoSameVector<0, U, 0b00, opc, V64, - asm, ".8b", ".8b", - [(set (v8i8 V64:$Rd), (OpNode (v8i8 V64:$Rn)))]>; - def v16i8 : BaseSIMDTwoSameVector<1, U, 0b00, opc, V128, - asm, ".16b", ".16b", - [(set (v16i8 V128:$Rd), (OpNode (v16i8 V128:$Rn)))]>; - def v4i16 : BaseSIMDTwoSameVector<0, U, 0b01, opc, V64, - asm, ".4h", ".4h", - [(set (v4i16 V64:$Rd), (OpNode (v4i16 V64:$Rn)))]>; - def v8i16 : BaseSIMDTwoSameVector<1, U, 0b01, opc, V128, - asm, ".8h", ".8h", - [(set (v8i16 V128:$Rd), (OpNode (v8i16 V128:$Rn)))]>; - def v2i32 : BaseSIMDTwoSameVector<0, U, 0b10, opc, V64, - asm, ".2s", ".2s", - [(set (v2i32 V64:$Rd), (OpNode (v2i32 V64:$Rn)))]>; - def v4i32 : BaseSIMDTwoSameVector<1, U, 0b10, opc, V128, - asm, ".4s", ".4s", - [(set (v4i32 V128:$Rd), (OpNode (v4i32 V128:$Rn)))]>; -} - -class BaseSIMDVectorLShiftLongBySize size, - RegisterOperand regtype, string asm, string dstkind, - string srckind, string amount> - : I<(outs V128:$Rd), (ins regtype:$Rn), asm, - "{\t$Rd" # dstkind # ", $Rn" # srckind # ", #" # amount # - "|" # dstkind # "\t$Rd, $Rn, #" # amount # "}", "", []>, - Sched<[WriteV]> { - bits<5> Rd; - bits<5> Rn; - let Inst{31} = 0; - let Inst{30} = Q; - let Inst{29-24} = 0b101110; - let Inst{23-22} = size; - let Inst{21-10} = 0b100001001110; - let Inst{9-5} = Rn; - let Inst{4-0} = Rd; -} - -multiclass SIMDVectorLShiftLongBySizeBHS { - let neverHasSideEffects = 1 in { - def v8i8 : BaseSIMDVectorLShiftLongBySize<0, 0b00, V64, - "shll", ".8h", ".8b", "8">; - def v16i8 : BaseSIMDVectorLShiftLongBySize<1, 0b00, V128, - "shll2", ".8h", ".16b", "8">; - def v4i16 : BaseSIMDVectorLShiftLongBySize<0, 0b01, V64, - "shll", ".4s", ".4h", "16">; - def v8i16 : BaseSIMDVectorLShiftLongBySize<1, 0b01, V128, - "shll2", ".4s", ".8h", "16">; - def v2i32 : BaseSIMDVectorLShiftLongBySize<0, 0b10, V64, - "shll", ".2d", ".2s", "32">; - def v4i32 : BaseSIMDVectorLShiftLongBySize<1, 0b10, V128, - "shll2", ".2d", ".4s", "32">; - } -} - -// Supports all element sizes. 
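For reference, a Python sketch (not part of the patch; encode_two_same_vector is a hypothetical helper) that packs the fixed and variable fields assigned by BaseSIMDTwoSameVector above into the 32-bit instruction word:

def encode_two_same_vector(Q, U, size, opcode, Rn, Rd):
    """Pack the fields exactly as the 'let Inst{...}' lines in the class list them."""
    assert Q in (0, 1) and U in (0, 1)
    assert 0 <= size < 4 and 0 <= opcode < 32
    assert 0 <= Rn < 32 and 0 <= Rd < 32
    word = 0                      # Inst{31} = 0
    word |= Q       << 30         # Inst{30} = Q
    word |= U       << 29         # Inst{29} = U
    word |= 0b01110 << 24         # Inst{28-24}
    word |= size    << 22         # Inst{23-22}
    word |= 0b10000 << 17         # Inst{21-17}
    word |= opcode  << 12         # Inst{16-12}
    word |= 0b10    << 10         # Inst{11-10}
    word |= Rn      << 5          # Inst{9-5}
    word |= Rd                    # Inst{4-0}
    return word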
-multiclass SIMDLongTwoVector opc, string asm, - SDPatternOperator OpNode> { - def v8i8_v4i16 : BaseSIMDTwoSameVector<0, U, 0b00, opc, V64, - asm, ".4h", ".8b", - [(set (v4i16 V64:$Rd), (OpNode (v8i8 V64:$Rn)))]>; - def v16i8_v8i16 : BaseSIMDTwoSameVector<1, U, 0b00, opc, V128, - asm, ".8h", ".16b", - [(set (v8i16 V128:$Rd), (OpNode (v16i8 V128:$Rn)))]>; - def v4i16_v2i32 : BaseSIMDTwoSameVector<0, U, 0b01, opc, V64, - asm, ".2s", ".4h", - [(set (v2i32 V64:$Rd), (OpNode (v4i16 V64:$Rn)))]>; - def v8i16_v4i32 : BaseSIMDTwoSameVector<1, U, 0b01, opc, V128, - asm, ".4s", ".8h", - [(set (v4i32 V128:$Rd), (OpNode (v8i16 V128:$Rn)))]>; - def v2i32_v1i64 : BaseSIMDTwoSameVector<0, U, 0b10, opc, V64, - asm, ".1d", ".2s", - [(set (v1i64 V64:$Rd), (OpNode (v2i32 V64:$Rn)))]>; - def v4i32_v2i64 : BaseSIMDTwoSameVector<1, U, 0b10, opc, V128, - asm, ".2d", ".4s", - [(set (v2i64 V128:$Rd), (OpNode (v4i32 V128:$Rn)))]>; -} - -multiclass SIMDLongTwoVectorTied opc, string asm, - SDPatternOperator OpNode> { - def v8i8_v4i16 : BaseSIMDTwoSameVectorTied<0, U, 0b00, opc, V64, - asm, ".4h", ".8b", - [(set (v4i16 V64:$dst), (OpNode (v4i16 V64:$Rd), - (v8i8 V64:$Rn)))]>; - def v16i8_v8i16 : BaseSIMDTwoSameVectorTied<1, U, 0b00, opc, V128, - asm, ".8h", ".16b", - [(set (v8i16 V128:$dst), (OpNode (v8i16 V128:$Rd), - (v16i8 V128:$Rn)))]>; - def v4i16_v2i32 : BaseSIMDTwoSameVectorTied<0, U, 0b01, opc, V64, - asm, ".2s", ".4h", - [(set (v2i32 V64:$dst), (OpNode (v2i32 V64:$Rd), - (v4i16 V64:$Rn)))]>; - def v8i16_v4i32 : BaseSIMDTwoSameVectorTied<1, U, 0b01, opc, V128, - asm, ".4s", ".8h", - [(set (v4i32 V128:$dst), (OpNode (v4i32 V128:$Rd), - (v8i16 V128:$Rn)))]>; - def v2i32_v1i64 : BaseSIMDTwoSameVectorTied<0, U, 0b10, opc, V64, - asm, ".1d", ".2s", - [(set (v1i64 V64:$dst), (OpNode (v1i64 V64:$Rd), - (v2i32 V64:$Rn)))]>; - def v4i32_v2i64 : BaseSIMDTwoSameVectorTied<1, U, 0b10, opc, V128, - asm, ".2d", ".4s", - [(set (v2i64 V128:$dst), (OpNode (v2i64 V128:$Rd), - (v4i32 V128:$Rn)))]>; -} - -// Supports all element sizes, except 1xD. 
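The long two-vector multiclasses above pair each source arrangement with a destination that has half as many lanes of twice the width. A Python sketch of that mapping (not part of the patch):

# Source -> destination arrangements used by SIMDLongTwoVector/...Tied.
LONG_TWO_VECTOR = {
    ".8b":  ".4h",  ".16b": ".8h",
    ".4h":  ".2s",  ".8h":  ".4s",
    ".2s":  ".1d",  ".4s":  ".2d",
}

def widened(src):
    """Halve the lane count and double the lane width of an arrangement string."""
    count, kind = int(src[1:-1]), src[-1]
    wider = {"b": "h", "h": "s", "s": "d"}[kind]
    return ".%d%s" % (count // 2, wider)

assert all(widened(s) == d for s, d in LONG_TWO_VECTOR.items())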
-multiclass SIMDTwoVectorBHSDTied opc, string asm, - SDPatternOperator OpNode> { - def v8i8 : BaseSIMDTwoSameVectorTied<0, U, 0b00, opc, V64, - asm, ".8b", ".8b", - [(set (v8i8 V64:$dst), (OpNode (v8i8 V64:$Rd), (v8i8 V64:$Rn)))]>; - def v16i8 : BaseSIMDTwoSameVectorTied<1, U, 0b00, opc, V128, - asm, ".16b", ".16b", - [(set (v16i8 V128:$dst), (OpNode (v16i8 V128:$Rd), (v16i8 V128:$Rn)))]>; - def v4i16 : BaseSIMDTwoSameVectorTied<0, U, 0b01, opc, V64, - asm, ".4h", ".4h", - [(set (v4i16 V64:$dst), (OpNode (v4i16 V64:$Rd), (v4i16 V64:$Rn)))]>; - def v8i16 : BaseSIMDTwoSameVectorTied<1, U, 0b01, opc, V128, - asm, ".8h", ".8h", - [(set (v8i16 V128:$dst), (OpNode (v8i16 V128:$Rd), (v8i16 V128:$Rn)))]>; - def v2i32 : BaseSIMDTwoSameVectorTied<0, U, 0b10, opc, V64, - asm, ".2s", ".2s", - [(set (v2i32 V64:$dst), (OpNode (v2i32 V64:$Rd), (v2i32 V64:$Rn)))]>; - def v4i32 : BaseSIMDTwoSameVectorTied<1, U, 0b10, opc, V128, - asm, ".4s", ".4s", - [(set (v4i32 V128:$dst), (OpNode (v4i32 V128:$Rd), (v4i32 V128:$Rn)))]>; - def v2i64 : BaseSIMDTwoSameVectorTied<1, U, 0b11, opc, V128, - asm, ".2d", ".2d", - [(set (v2i64 V128:$dst), (OpNode (v2i64 V128:$Rd), (v2i64 V128:$Rn)))]>; -} - -multiclass SIMDTwoVectorBHSD opc, string asm, - SDPatternOperator OpNode = null_frag> { - def v8i8 : BaseSIMDTwoSameVector<0, U, 0b00, opc, V64, - asm, ".8b", ".8b", - [(set (v8i8 V64:$Rd), (OpNode (v8i8 V64:$Rn)))]>; - def v16i8 : BaseSIMDTwoSameVector<1, U, 0b00, opc, V128, - asm, ".16b", ".16b", - [(set (v16i8 V128:$Rd), (OpNode (v16i8 V128:$Rn)))]>; - def v4i16 : BaseSIMDTwoSameVector<0, U, 0b01, opc, V64, - asm, ".4h", ".4h", - [(set (v4i16 V64:$Rd), (OpNode (v4i16 V64:$Rn)))]>; - def v8i16 : BaseSIMDTwoSameVector<1, U, 0b01, opc, V128, - asm, ".8h", ".8h", - [(set (v8i16 V128:$Rd), (OpNode (v8i16 V128:$Rn)))]>; - def v2i32 : BaseSIMDTwoSameVector<0, U, 0b10, opc, V64, - asm, ".2s", ".2s", - [(set (v2i32 V64:$Rd), (OpNode (v2i32 V64:$Rn)))]>; - def v4i32 : BaseSIMDTwoSameVector<1, U, 0b10, opc, V128, - asm, ".4s", ".4s", - [(set (v4i32 V128:$Rd), (OpNode (v4i32 V128:$Rn)))]>; - def v2i64 : BaseSIMDTwoSameVector<1, U, 0b11, opc, V128, - asm, ".2d", ".2d", - [(set (v2i64 V128:$Rd), (OpNode (v2i64 V128:$Rn)))]>; -} - - -// Supports only B element sizes. -multiclass SIMDTwoVectorB size, bits<5> opc, string asm, - SDPatternOperator OpNode> { - def v8i8 : BaseSIMDTwoSameVector<0, U, size, opc, V64, - asm, ".8b", ".8b", - [(set (v8i8 V64:$Rd), (OpNode (v8i8 V64:$Rn)))]>; - def v16i8 : BaseSIMDTwoSameVector<1, U, size, opc, V128, - asm, ".16b", ".16b", - [(set (v16i8 V128:$Rd), (OpNode (v16i8 V128:$Rn)))]>; - -} - -// Supports only B and H element sizes. -multiclass SIMDTwoVectorBH opc, string asm, - SDPatternOperator OpNode> { - def v8i8 : BaseSIMDTwoSameVector<0, U, 0b00, opc, V64, - asm, ".8b", ".8b", - [(set (v8i8 V64:$Rd), (OpNode V64:$Rn))]>; - def v16i8 : BaseSIMDTwoSameVector<1, U, 0b00, opc, V128, - asm, ".16b", ".16b", - [(set (v16i8 V128:$Rd), (OpNode V128:$Rn))]>; - def v4i16 : BaseSIMDTwoSameVector<0, U, 0b01, opc, V64, - asm, ".4h", ".4h", - [(set (v4i16 V64:$Rd), (OpNode V64:$Rn))]>; - def v8i16 : BaseSIMDTwoSameVector<1, U, 0b01, opc, V128, - asm, ".8h", ".8h", - [(set (v8i16 V128:$Rd), (OpNode V128:$Rn))]>; -} - -// Supports only S and D element sizes, uses high bit of the size field -// as an extra opcode bit. 
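Concretely, the floating-point multiclasses below pass {S,0} for the single-precision arrangements and {S,1} for .2d, so the high bit of the size field carries the extra S opcode bit and the low bit selects the precision. A Python sketch (not part of the patch; fp_size_field is a hypothetical helper):

def fp_size_field(S, double_precision):
    """Build the two-bit size value {S,0} or {S,1} passed to the base class."""
    assert S in (0, 1)
    return (S << 1) | (1 if double_precision else 0)

assert fp_size_field(1, False) == 0b10   # .2s / .4s variants
assert fp_size_field(1, True)  == 0b11   # .2d variant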
-multiclass SIMDTwoVectorFP opc, string asm, - SDPatternOperator OpNode> { - def v2f32 : BaseSIMDTwoSameVector<0, U, {S,0}, opc, V64, - asm, ".2s", ".2s", - [(set (v2f32 V64:$Rd), (OpNode (v2f32 V64:$Rn)))]>; - def v4f32 : BaseSIMDTwoSameVector<1, U, {S,0}, opc, V128, - asm, ".4s", ".4s", - [(set (v4f32 V128:$Rd), (OpNode (v4f32 V128:$Rn)))]>; - def v2f64 : BaseSIMDTwoSameVector<1, U, {S,1}, opc, V128, - asm, ".2d", ".2d", - [(set (v2f64 V128:$Rd), (OpNode (v2f64 V128:$Rn)))]>; -} - -// Supports only S element size. -multiclass SIMDTwoVectorS opc, string asm, - SDPatternOperator OpNode> { - def v2i32 : BaseSIMDTwoSameVector<0, U, {S,0}, opc, V64, - asm, ".2s", ".2s", - [(set (v2i32 V64:$Rd), (OpNode (v2i32 V64:$Rn)))]>; - def v4i32 : BaseSIMDTwoSameVector<1, U, {S,0}, opc, V128, - asm, ".4s", ".4s", - [(set (v4i32 V128:$Rd), (OpNode (v4i32 V128:$Rn)))]>; -} - - -multiclass SIMDTwoVectorFPToInt opc, string asm, - SDPatternOperator OpNode> { - def v2f32 : BaseSIMDTwoSameVector<0, U, {S,0}, opc, V64, - asm, ".2s", ".2s", - [(set (v2i32 V64:$Rd), (OpNode (v2f32 V64:$Rn)))]>; - def v4f32 : BaseSIMDTwoSameVector<1, U, {S,0}, opc, V128, - asm, ".4s", ".4s", - [(set (v4i32 V128:$Rd), (OpNode (v4f32 V128:$Rn)))]>; - def v2f64 : BaseSIMDTwoSameVector<1, U, {S,1}, opc, V128, - asm, ".2d", ".2d", - [(set (v2i64 V128:$Rd), (OpNode (v2f64 V128:$Rn)))]>; -} - -multiclass SIMDTwoVectorIntToFP opc, string asm, - SDPatternOperator OpNode> { - def v2f32 : BaseSIMDTwoSameVector<0, U, {S,0}, opc, V64, - asm, ".2s", ".2s", - [(set (v2f32 V64:$Rd), (OpNode (v2i32 V64:$Rn)))]>; - def v4f32 : BaseSIMDTwoSameVector<1, U, {S,0}, opc, V128, - asm, ".4s", ".4s", - [(set (v4f32 V128:$Rd), (OpNode (v4i32 V128:$Rn)))]>; - def v2f64 : BaseSIMDTwoSameVector<1, U, {S,1}, opc, V128, - asm, ".2d", ".2d", - [(set (v2f64 V128:$Rd), (OpNode (v2i64 V128:$Rn)))]>; -} - - -class BaseSIMDMixedTwoVector size, bits<5> opcode, - RegisterOperand inreg, RegisterOperand outreg, - string asm, string outkind, string inkind, - list pattern> - : I<(outs outreg:$Rd), (ins inreg:$Rn), asm, - "{\t$Rd" # outkind # ", $Rn" # inkind # - "|" # outkind # "\t$Rd, $Rn}", "", pattern>, - Sched<[WriteV]> { - bits<5> Rd; - bits<5> Rn; - let Inst{31} = 0; - let Inst{30} = Q; - let Inst{29} = U; - let Inst{28-24} = 0b01110; - let Inst{23-22} = size; - let Inst{21-17} = 0b10000; - let Inst{16-12} = opcode; - let Inst{11-10} = 0b10; - let Inst{9-5} = Rn; - let Inst{4-0} = Rd; -} - -class BaseSIMDMixedTwoVectorTied size, bits<5> opcode, - RegisterOperand inreg, RegisterOperand outreg, - string asm, string outkind, string inkind, - list pattern> - : I<(outs outreg:$dst), (ins outreg:$Rd, inreg:$Rn), asm, - "{\t$Rd" # outkind # ", $Rn" # inkind # - "|" # outkind # "\t$Rd, $Rn}", "$Rd = $dst", pattern>, - Sched<[WriteV]> { - bits<5> Rd; - bits<5> Rn; - let Inst{31} = 0; - let Inst{30} = Q; - let Inst{29} = U; - let Inst{28-24} = 0b01110; - let Inst{23-22} = size; - let Inst{21-17} = 0b10000; - let Inst{16-12} = opcode; - let Inst{11-10} = 0b10; - let Inst{9-5} = Rn; - let Inst{4-0} = Rd; -} - -multiclass SIMDMixedTwoVector opc, string asm, - SDPatternOperator OpNode> { - def v8i8 : BaseSIMDMixedTwoVector<0, U, 0b00, opc, V128, V64, - asm, ".8b", ".8h", - [(set (v8i8 V64:$Rd), (OpNode (v8i16 V128:$Rn)))]>; - def v16i8 : BaseSIMDMixedTwoVectorTied<1, U, 0b00, opc, V128, V128, - asm#"2", ".16b", ".8h", []>; - def v4i16 : BaseSIMDMixedTwoVector<0, U, 0b01, opc, V128, V64, - asm, ".4h", ".4s", - [(set (v4i16 V64:$Rd), (OpNode (v4i32 V128:$Rn)))]>; - def v8i16 : 
BaseSIMDMixedTwoVectorTied<1, U, 0b01, opc, V128, V128, - asm#"2", ".8h", ".4s", []>; - def v2i32 : BaseSIMDMixedTwoVector<0, U, 0b10, opc, V128, V64, - asm, ".2s", ".2d", - [(set (v2i32 V64:$Rd), (OpNode (v2i64 V128:$Rn)))]>; - def v4i32 : BaseSIMDMixedTwoVectorTied<1, U, 0b10, opc, V128, V128, - asm#"2", ".4s", ".2d", []>; - - def : Pat<(concat_vectors (v8i8 V64:$Rd), (OpNode (v8i16 V128:$Rn))), - (!cast(NAME # "v16i8") - (INSERT_SUBREG (IMPLICIT_DEF), V64:$Rd, dsub), V128:$Rn)>; - def : Pat<(concat_vectors (v4i16 V64:$Rd), (OpNode (v4i32 V128:$Rn))), - (!cast(NAME # "v8i16") - (INSERT_SUBREG (IMPLICIT_DEF), V64:$Rd, dsub), V128:$Rn)>; - def : Pat<(concat_vectors (v2i32 V64:$Rd), (OpNode (v2i64 V128:$Rn))), - (!cast(NAME # "v4i32") - (INSERT_SUBREG (IMPLICIT_DEF), V64:$Rd, dsub), V128:$Rn)>; -} - -class BaseSIMDCmpTwoVector size, bits<5> opcode, - RegisterOperand regtype, - string asm, string kind, string zero, - ValueType dty, ValueType sty, SDNode OpNode> - : I<(outs regtype:$Rd), (ins regtype:$Rn), asm, - "{\t$Rd" # kind # ", $Rn" # kind # ", #" # zero # - "|" # kind # "\t$Rd, $Rn, #" # zero # "}", "", - [(set (dty regtype:$Rd), (OpNode (sty regtype:$Rn)))]>, - Sched<[WriteV]> { - bits<5> Rd; - bits<5> Rn; - let Inst{31} = 0; - let Inst{30} = Q; - let Inst{29} = U; - let Inst{28-24} = 0b01110; - let Inst{23-22} = size; - let Inst{21-17} = 0b10000; - let Inst{16-12} = opcode; - let Inst{11-10} = 0b10; - let Inst{9-5} = Rn; - let Inst{4-0} = Rd; -} - -// Comparisons support all element sizes, except 1xD. -multiclass SIMDCmpTwoVector opc, string asm, - SDNode OpNode> { - def v8i8rz : BaseSIMDCmpTwoVector<0, U, 0b00, opc, V64, - asm, ".8b", "0", - v8i8, v8i8, OpNode>; - def v16i8rz : BaseSIMDCmpTwoVector<1, U, 0b00, opc, V128, - asm, ".16b", "0", - v16i8, v16i8, OpNode>; - def v4i16rz : BaseSIMDCmpTwoVector<0, U, 0b01, opc, V64, - asm, ".4h", "0", - v4i16, v4i16, OpNode>; - def v8i16rz : BaseSIMDCmpTwoVector<1, U, 0b01, opc, V128, - asm, ".8h", "0", - v8i16, v8i16, OpNode>; - def v2i32rz : BaseSIMDCmpTwoVector<0, U, 0b10, opc, V64, - asm, ".2s", "0", - v2i32, v2i32, OpNode>; - def v4i32rz : BaseSIMDCmpTwoVector<1, U, 0b10, opc, V128, - asm, ".4s", "0", - v4i32, v4i32, OpNode>; - def v2i64rz : BaseSIMDCmpTwoVector<1, U, 0b11, opc, V128, - asm, ".2d", "0", - v2i64, v2i64, OpNode>; -} - -// FP Comparisons support only S and D element sizes. 
-multiclass SIMDFPCmpTwoVector opc, - string asm, SDNode OpNode> { - - def v2i32rz : BaseSIMDCmpTwoVector<0, U, {S,0}, opc, V64, - asm, ".2s", "0.0", - v2i32, v2f32, OpNode>; - def v4i32rz : BaseSIMDCmpTwoVector<1, U, {S,0}, opc, V128, - asm, ".4s", "0.0", - v4i32, v4f32, OpNode>; - def v2i64rz : BaseSIMDCmpTwoVector<1, U, {S,1}, opc, V128, - asm, ".2d", "0.0", - v2i64, v2f64, OpNode>; - - def : InstAlias(NAME # v2i32rz) V64:$Vd, V64:$Vn), 0>; - def : InstAlias(NAME # v4i32rz) V128:$Vd, V128:$Vn), 0>; - def : InstAlias(NAME # v2i64rz) V128:$Vd, V128:$Vn), 0>; - def : InstAlias(NAME # v2i32rz) V64:$Vd, V64:$Vn), 0>; - def : InstAlias(NAME # v4i32rz) V128:$Vd, V128:$Vn), 0>; - def : InstAlias(NAME # v2i64rz) V128:$Vd, V128:$Vn), 0>; -} - -let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in -class BaseSIMDFPCvtTwoVector size, bits<5> opcode, - RegisterOperand outtype, RegisterOperand intype, - string asm, string VdTy, string VnTy, - list pattern> - : I<(outs outtype:$Rd), (ins intype:$Rn), asm, - !strconcat("\t$Rd", VdTy, ", $Rn", VnTy), "", pattern>, - Sched<[WriteV]> { - bits<5> Rd; - bits<5> Rn; - let Inst{31} = 0; - let Inst{30} = Q; - let Inst{29} = U; - let Inst{28-24} = 0b01110; - let Inst{23-22} = size; - let Inst{21-17} = 0b10000; - let Inst{16-12} = opcode; - let Inst{11-10} = 0b10; - let Inst{9-5} = Rn; - let Inst{4-0} = Rd; -} - -class BaseSIMDFPCvtTwoVectorTied size, bits<5> opcode, - RegisterOperand outtype, RegisterOperand intype, - string asm, string VdTy, string VnTy, - list pattern> - : I<(outs outtype:$dst), (ins outtype:$Rd, intype:$Rn), asm, - !strconcat("\t$Rd", VdTy, ", $Rn", VnTy), "$Rd = $dst", pattern>, - Sched<[WriteV]> { - bits<5> Rd; - bits<5> Rn; - let Inst{31} = 0; - let Inst{30} = Q; - let Inst{29} = U; - let Inst{28-24} = 0b01110; - let Inst{23-22} = size; - let Inst{21-17} = 0b10000; - let Inst{16-12} = opcode; - let Inst{11-10} = 0b10; - let Inst{9-5} = Rn; - let Inst{4-0} = Rd; -} - -multiclass SIMDFPWidenTwoVector opc, string asm> { - def v4i16 : BaseSIMDFPCvtTwoVector<0, U, {S,0}, opc, V128, V64, - asm, ".4s", ".4h", []>; - def v8i16 : BaseSIMDFPCvtTwoVector<1, U, {S,0}, opc, V128, V128, - asm#"2", ".4s", ".8h", []>; - def v2i32 : BaseSIMDFPCvtTwoVector<0, U, {S,1}, opc, V128, V64, - asm, ".2d", ".2s", []>; - def v4i32 : BaseSIMDFPCvtTwoVector<1, U, {S,1}, opc, V128, V128, - asm#"2", ".2d", ".4s", []>; -} - -multiclass SIMDFPNarrowTwoVector opc, string asm> { - def v4i16 : BaseSIMDFPCvtTwoVector<0, U, {S,0}, opc, V64, V128, - asm, ".4h", ".4s", []>; - def v8i16 : BaseSIMDFPCvtTwoVectorTied<1, U, {S,0}, opc, V128, V128, - asm#"2", ".8h", ".4s", []>; - def v2i32 : BaseSIMDFPCvtTwoVector<0, U, {S,1}, opc, V64, V128, - asm, ".2s", ".2d", []>; - def v4i32 : BaseSIMDFPCvtTwoVectorTied<1, U, {S,1}, opc, V128, V128, - asm#"2", ".4s", ".2d", []>; -} - -multiclass SIMDFPInexactCvtTwoVector opc, string asm, - Intrinsic OpNode> { - def v2f32 : BaseSIMDFPCvtTwoVector<0, U, {S,1}, opc, V64, V128, - asm, ".2s", ".2d", - [(set (v2f32 V64:$Rd), (OpNode (v2f64 V128:$Rn)))]>; - def v4f32 : BaseSIMDFPCvtTwoVectorTied<1, U, {S,1}, opc, V128, V128, - asm#"2", ".4s", ".2d", []>; - - def : Pat<(concat_vectors (v2f32 V64:$Rd), (OpNode (v2f64 V128:$Rn))), - (!cast(NAME # "v4f32") - (INSERT_SUBREG (IMPLICIT_DEF), V64:$Rd, dsub), V128:$Rn)>; -} - -//---------------------------------------------------------------------------- -// AdvSIMD three register different-size vector instructions. 
-//---------------------------------------------------------------------------- - -let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in -class BaseSIMDDifferentThreeVector size, bits<4> opcode, - RegisterOperand outtype, RegisterOperand intype1, - RegisterOperand intype2, string asm, - string outkind, string inkind1, string inkind2, - list pattern> - : I<(outs outtype:$Rd), (ins intype1:$Rn, intype2:$Rm), asm, - "{\t$Rd" # outkind # ", $Rn" # inkind1 # ", $Rm" # inkind2 # - "|" # outkind # "\t$Rd, $Rn, $Rm}", "", pattern>, - Sched<[WriteV]> { - bits<5> Rd; - bits<5> Rn; - bits<5> Rm; - let Inst{31} = 0; - let Inst{30} = size{0}; - let Inst{29} = U; - let Inst{28-24} = 0b01110; - let Inst{23-22} = size{2-1}; - let Inst{21} = 1; - let Inst{20-16} = Rm; - let Inst{15-12} = opcode; - let Inst{11-10} = 0b00; - let Inst{9-5} = Rn; - let Inst{4-0} = Rd; -} - -let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in -class BaseSIMDDifferentThreeVectorTied size, bits<4> opcode, - RegisterOperand outtype, RegisterOperand intype1, - RegisterOperand intype2, string asm, - string outkind, string inkind1, string inkind2, - list pattern> - : I<(outs outtype:$dst), (ins outtype:$Rd, intype1:$Rn, intype2:$Rm), asm, - "{\t$Rd" # outkind # ", $Rn" # inkind1 # ", $Rm" # inkind2 # - "|" # outkind # "\t$Rd, $Rn, $Rm}", "$Rd = $dst", pattern>, - Sched<[WriteV]> { - bits<5> Rd; - bits<5> Rn; - bits<5> Rm; - let Inst{31} = 0; - let Inst{30} = size{0}; - let Inst{29} = U; - let Inst{28-24} = 0b01110; - let Inst{23-22} = size{2-1}; - let Inst{21} = 1; - let Inst{20-16} = Rm; - let Inst{15-12} = opcode; - let Inst{11-10} = 0b00; - let Inst{9-5} = Rn; - let Inst{4-0} = Rd; -} - -// FIXME: TableGen doesn't know how to deal with expanded types that also -// change the element count (in this case, placing the results in -// the high elements of the result register rather than the low -// elements). Until that's fixed, we can't code-gen those. -multiclass SIMDNarrowThreeVectorBHS opc, string asm, - Intrinsic IntOp> { - def v8i16_v8i8 : BaseSIMDDifferentThreeVector; - def v8i16_v16i8 : BaseSIMDDifferentThreeVectorTied; - def v4i32_v4i16 : BaseSIMDDifferentThreeVector; - def v4i32_v8i16 : BaseSIMDDifferentThreeVectorTied; - def v2i64_v2i32 : BaseSIMDDifferentThreeVector; - def v2i64_v4i32 : BaseSIMDDifferentThreeVectorTied; - - - // Patterns for the '2' variants involve INSERT_SUBREG, which you can't put in - // a version attached to an instruction. 
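A Python sketch (not part of the patch; the helper name is hypothetical) of the field packing spelled out by BaseSIMDDifferentThreeVector above. Note that the three-bit size parameter is split, with size{0} landing in bit 30, the position the other base classes in this file use for Q:

def encode_different_three_vector(U, size, opcode, Rm, Rn, Rd):
    """Pack the fields as listed in BaseSIMDDifferentThreeVector."""
    assert 0 <= size < 8 and 0 <= opcode < 16
    assert all(0 <= r < 32 for r in (Rm, Rn, Rd))
    word = 0                        # Inst{31} = 0
    word |= (size & 1)  << 30       # Inst{30} = size{0}
    word |= U           << 29       # Inst{29}
    word |= 0b01110     << 24       # Inst{28-24}
    word |= (size >> 1) << 22       # Inst{23-22} = size{2-1}
    word |= 1           << 21       # Inst{21}
    word |= Rm          << 16       # Inst{20-16}
    word |= opcode      << 12       # Inst{15-12}
    # Inst{11-10} = 0b00
    word |= Rn          << 5        # Inst{9-5}
    word |= Rd                      # Inst{4-0}
    return word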
- def : Pat<(concat_vectors (v8i8 V64:$Rd), (IntOp (v8i16 V128:$Rn), - (v8i16 V128:$Rm))), - (!cast(NAME # "v8i16_v16i8") - (INSERT_SUBREG (IMPLICIT_DEF), V64:$Rd, dsub), - V128:$Rn, V128:$Rm)>; - def : Pat<(concat_vectors (v4i16 V64:$Rd), (IntOp (v4i32 V128:$Rn), - (v4i32 V128:$Rm))), - (!cast(NAME # "v4i32_v8i16") - (INSERT_SUBREG (IMPLICIT_DEF), V64:$Rd, dsub), - V128:$Rn, V128:$Rm)>; - def : Pat<(concat_vectors (v2i32 V64:$Rd), (IntOp (v2i64 V128:$Rn), - (v2i64 V128:$Rm))), - (!cast(NAME # "v2i64_v4i32") - (INSERT_SUBREG (IMPLICIT_DEF), V64:$Rd, dsub), - V128:$Rn, V128:$Rm)>; -} - -multiclass SIMDDifferentThreeVectorBD opc, string asm, - Intrinsic IntOp> { - def v8i8 : BaseSIMDDifferentThreeVector; - def v16i8 : BaseSIMDDifferentThreeVector; - let Predicates = [HasCrypto] in { - def v1i64 : BaseSIMDDifferentThreeVector; - def v2i64 : BaseSIMDDifferentThreeVector; - } - - def : Pat<(v8i16 (IntOp (v8i8 (extract_high_v16i8 V128:$Rn)), - (v8i8 (extract_high_v16i8 V128:$Rm)))), - (!cast(NAME#"v16i8") V128:$Rn, V128:$Rm)>; -} - -multiclass SIMDLongThreeVectorHS opc, string asm, - SDPatternOperator OpNode> { - def v4i16_v4i32 : BaseSIMDDifferentThreeVector; - def v8i16_v4i32 : BaseSIMDDifferentThreeVector; - def v2i32_v2i64 : BaseSIMDDifferentThreeVector; - def v4i32_v2i64 : BaseSIMDDifferentThreeVector; -} - -multiclass SIMDLongThreeVectorBHSabdl opc, string asm, - SDPatternOperator OpNode = null_frag> { - def v8i8_v8i16 : BaseSIMDDifferentThreeVector; - def v16i8_v8i16 : BaseSIMDDifferentThreeVector; - def v4i16_v4i32 : BaseSIMDDifferentThreeVector; - def v8i16_v4i32 : BaseSIMDDifferentThreeVector; - def v2i32_v2i64 : BaseSIMDDifferentThreeVector; - def v4i32_v2i64 : BaseSIMDDifferentThreeVector; -} - -multiclass SIMDLongThreeVectorTiedBHSabal opc, - string asm, - SDPatternOperator OpNode> { - def v8i8_v8i16 : BaseSIMDDifferentThreeVectorTied; - def v16i8_v8i16 : BaseSIMDDifferentThreeVectorTied; - def v4i16_v4i32 : BaseSIMDDifferentThreeVectorTied; - def v8i16_v4i32 : BaseSIMDDifferentThreeVectorTied; - def v2i32_v2i64 : BaseSIMDDifferentThreeVectorTied; - def v4i32_v2i64 : BaseSIMDDifferentThreeVectorTied; -} - -multiclass SIMDLongThreeVectorBHS opc, string asm, - SDPatternOperator OpNode = null_frag> { - def v8i8_v8i16 : BaseSIMDDifferentThreeVector; - def v16i8_v8i16 : BaseSIMDDifferentThreeVector; - def v4i16_v4i32 : BaseSIMDDifferentThreeVector; - def v8i16_v4i32 : BaseSIMDDifferentThreeVector; - def v2i32_v2i64 : BaseSIMDDifferentThreeVector; - def v4i32_v2i64 : BaseSIMDDifferentThreeVector; -} - -multiclass SIMDLongThreeVectorTiedBHS opc, - string asm, - SDPatternOperator OpNode> { - def v8i8_v8i16 : BaseSIMDDifferentThreeVectorTied; - def v16i8_v8i16 : BaseSIMDDifferentThreeVectorTied; - def v4i16_v4i32 : BaseSIMDDifferentThreeVectorTied; - def v8i16_v4i32 : BaseSIMDDifferentThreeVectorTied; - def v2i32_v2i64 : BaseSIMDDifferentThreeVectorTied; - def v4i32_v2i64 : BaseSIMDDifferentThreeVectorTied; -} - -multiclass SIMDLongThreeVectorSQDMLXTiedHS opc, string asm, - SDPatternOperator Accum> { - def v4i16_v4i32 : BaseSIMDDifferentThreeVectorTied; - def v8i16_v4i32 : BaseSIMDDifferentThreeVectorTied; - def v2i32_v2i64 : BaseSIMDDifferentThreeVectorTied; - def v4i32_v2i64 : BaseSIMDDifferentThreeVectorTied; -} - -multiclass SIMDWideThreeVectorBHS opc, string asm, - SDPatternOperator OpNode> { - def v8i8_v8i16 : BaseSIMDDifferentThreeVector; - def v16i8_v8i16 : BaseSIMDDifferentThreeVector; - def v4i16_v4i32 : BaseSIMDDifferentThreeVector; - def v8i16_v4i32 : 
BaseSIMDDifferentThreeVector; - def v2i32_v2i64 : BaseSIMDDifferentThreeVector; - def v4i32_v2i64 : BaseSIMDDifferentThreeVector; -} - -//---------------------------------------------------------------------------- -// AdvSIMD bitwise extract from vector -//---------------------------------------------------------------------------- - -class BaseSIMDBitwiseExtract - : I<(outs regtype:$Rd), (ins regtype:$Rn, regtype:$Rm, i32imm:$imm), asm, - "{\t$Rd" # kind # ", $Rn" # kind # ", $Rm" # kind # ", $imm" # - "|" # kind # "\t$Rd, $Rn, $Rm, $imm}", "", - [(set (vty regtype:$Rd), - (ARM64ext regtype:$Rn, regtype:$Rm, (i32 imm:$imm)))]>, - Sched<[WriteV]> { - bits<5> Rd; - bits<5> Rn; - bits<5> Rm; - bits<4> imm; - let Inst{31} = 0; - let Inst{30} = size; - let Inst{29-21} = 0b101110000; - let Inst{20-16} = Rm; - let Inst{15} = 0; - let Inst{14-11} = imm; - let Inst{10} = 0; - let Inst{9-5} = Rn; - let Inst{4-0} = Rd; -} - - -multiclass SIMDBitwiseExtract { - def v8i8 : BaseSIMDBitwiseExtract<0, V64, v8i8, asm, ".8b"> { - let imm{3} = 0; - } - def v16i8 : BaseSIMDBitwiseExtract<1, V128, v16i8, asm, ".16b">; -} - -//---------------------------------------------------------------------------- -// AdvSIMD zip vector -//---------------------------------------------------------------------------- - -class BaseSIMDZipVector size, bits<3> opc, RegisterOperand regtype, - string asm, string kind, SDNode OpNode, ValueType valty> - : I<(outs regtype:$Rd), (ins regtype:$Rn, regtype:$Rm), asm, - "{\t$Rd" # kind # ", $Rn" # kind # ", $Rm" # kind # - "|" # kind # "\t$Rd, $Rn, $Rm}", "", - [(set (valty regtype:$Rd), (OpNode regtype:$Rn, regtype:$Rm))]>, - Sched<[WriteV]> { - bits<5> Rd; - bits<5> Rn; - bits<5> Rm; - let Inst{31} = 0; - let Inst{30} = size{0}; - let Inst{29-24} = 0b001110; - let Inst{23-22} = size{2-1}; - let Inst{21} = 0; - let Inst{20-16} = Rm; - let Inst{15} = 0; - let Inst{14-12} = opc; - let Inst{11-10} = 0b10; - let Inst{9-5} = Rn; - let Inst{4-0} = Rd; -} - -multiclass SIMDZipVectoropc, string asm, - SDNode OpNode> { - def v8i8 : BaseSIMDZipVector<0b000, opc, V64, - asm, ".8b", OpNode, v8i8>; - def v16i8 : BaseSIMDZipVector<0b001, opc, V128, - asm, ".16b", OpNode, v16i8>; - def v4i16 : BaseSIMDZipVector<0b010, opc, V64, - asm, ".4h", OpNode, v4i16>; - def v8i16 : BaseSIMDZipVector<0b011, opc, V128, - asm, ".8h", OpNode, v8i16>; - def v2i32 : BaseSIMDZipVector<0b100, opc, V64, - asm, ".2s", OpNode, v2i32>; - def v4i32 : BaseSIMDZipVector<0b101, opc, V128, - asm, ".4s", OpNode, v4i32>; - def v2i64 : BaseSIMDZipVector<0b111, opc, V128, - asm, ".2d", OpNode, v2i64>; - - def : Pat<(v2f32 (OpNode V64:$Rn, V64:$Rm)), - (!cast(NAME#"v2i32") V64:$Rn, V64:$Rm)>; - def : Pat<(v4f32 (OpNode V128:$Rn, V128:$Rm)), - (!cast(NAME#"v4i32") V128:$Rn, V128:$Rm)>; - def : Pat<(v2f64 (OpNode V128:$Rn, V128:$Rm)), - (!cast(NAME#"v2i64") V128:$Rn, V128:$Rm)>; -} - -//---------------------------------------------------------------------------- -// AdvSIMD three register scalar instructions -//---------------------------------------------------------------------------- - -let mayStore = 0, mayLoad = 0, hasSideEffects = 0 in -class BaseSIMDThreeScalar size, bits<5> opcode, - RegisterClass regtype, string asm, - list pattern> - : I<(outs regtype:$Rd), (ins regtype:$Rn, regtype:$Rm), asm, - "\t$Rd, $Rn, $Rm", "", pattern>, - Sched<[WriteV]> { - bits<5> Rd; - bits<5> Rn; - bits<5> Rm; - let Inst{31-30} = 0b01; - let Inst{29} = U; - let Inst{28-24} = 0b11110; - let Inst{23-22} = size; - let Inst{21} = 1; - let 
Inst{20-16} = Rm; - let Inst{15-11} = opcode; - let Inst{10} = 1; - let Inst{9-5} = Rn; - let Inst{4-0} = Rd; -} - -multiclass SIMDThreeScalarD opc, string asm, - SDPatternOperator OpNode> { - def v1i64 : BaseSIMDThreeScalar; -} - -multiclass SIMDThreeScalarBHSD opc, string asm, - SDPatternOperator OpNode> { - def v1i64 : BaseSIMDThreeScalar; - def v1i32 : BaseSIMDThreeScalar; - def v1i16 : BaseSIMDThreeScalar; - def v1i8 : BaseSIMDThreeScalar; - - def : Pat<(i64 (OpNode (i64 FPR64:$Rn), (i64 FPR64:$Rm))), - (!cast(NAME#"v1i64") FPR64:$Rn, FPR64:$Rm)>; - def : Pat<(i32 (OpNode (i32 FPR32:$Rn), (i32 FPR32:$Rm))), - (!cast(NAME#"v1i32") FPR32:$Rn, FPR32:$Rm)>; -} - -multiclass SIMDThreeScalarHS opc, string asm, - SDPatternOperator OpNode> { - def v1i32 : BaseSIMDThreeScalar; - def v1i16 : BaseSIMDThreeScalar; -} - -multiclass SIMDThreeScalarSD opc, string asm, - SDPatternOperator OpNode = null_frag> { - let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in { - def #NAME#64 : BaseSIMDThreeScalar; - def #NAME#32 : BaseSIMDThreeScalar; - } - - def : Pat<(v1f64 (OpNode (v1f64 FPR64:$Rn), (v1f64 FPR64:$Rm))), - (!cast(NAME # "64") FPR64:$Rn, FPR64:$Rm)>; -} - -multiclass SIMDThreeScalarFPCmp opc, string asm, - SDPatternOperator OpNode = null_frag> { - let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in { - def #NAME#64 : BaseSIMDThreeScalar; - def #NAME#32 : BaseSIMDThreeScalar; - } - - def : Pat<(v1i64 (OpNode (v1f64 FPR64:$Rn), (v1f64 FPR64:$Rm))), - (!cast(NAME # "64") FPR64:$Rn, FPR64:$Rm)>; -} - -class BaseSIMDThreeScalarMixed size, bits<5> opcode, - dag oops, dag iops, string asm, string cstr, list pat> - : I, - Sched<[WriteV]> { - bits<5> Rd; - bits<5> Rn; - bits<5> Rm; - let Inst{31-30} = 0b01; - let Inst{29} = U; - let Inst{28-24} = 0b11110; - let Inst{23-22} = size; - let Inst{21} = 1; - let Inst{20-16} = Rm; - let Inst{15-11} = opcode; - let Inst{10} = 0; - let Inst{9-5} = Rn; - let Inst{4-0} = Rd; -} - -let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in -multiclass SIMDThreeScalarMixedHS opc, string asm, - SDPatternOperator OpNode = null_frag> { - def i16 : BaseSIMDThreeScalarMixed; - def i32 : BaseSIMDThreeScalarMixed; -} - -let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in -multiclass SIMDThreeScalarMixedTiedHS opc, string asm, - SDPatternOperator OpNode = null_frag> { - def i16 : BaseSIMDThreeScalarMixed; - def i32 : BaseSIMDThreeScalarMixed; -} - -//---------------------------------------------------------------------------- -// AdvSIMD two register scalar instructions -//---------------------------------------------------------------------------- - -let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in -class BaseSIMDTwoScalar size, bits<5> opcode, - RegisterClass regtype, RegisterClass regtype2, - string asm, list pat> - : I<(outs regtype:$Rd), (ins regtype2:$Rn), asm, - "\t$Rd, $Rn", "", pat>, - Sched<[WriteV]> { - bits<5> Rd; - bits<5> Rn; - let Inst{31-30} = 0b01; - let Inst{29} = U; - let Inst{28-24} = 0b11110; - let Inst{23-22} = size; - let Inst{21-17} = 0b10000; - let Inst{16-12} = opcode; - let Inst{11-10} = 0b10; - let Inst{9-5} = Rn; - let Inst{4-0} = Rd; -} - -let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in -class BaseSIMDTwoScalarTied size, bits<5> opcode, - RegisterClass regtype, RegisterClass regtype2, - string asm, list pat> - : I<(outs regtype:$dst), (ins regtype:$Rd, regtype2:$Rn), asm, - "\t$Rd, $Rn", "$Rd = $dst", pat>, - Sched<[WriteV]> { - bits<5> Rd; - bits<5> Rn; - let Inst{31-30} = 0b01; - let Inst{29} = U; - let Inst{28-24} = 0b11110; - let 
Inst{23-22} = size; - let Inst{21-17} = 0b10000; - let Inst{16-12} = opcode; - let Inst{11-10} = 0b10; - let Inst{9-5} = Rn; - let Inst{4-0} = Rd; -} - - -let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in -class BaseSIMDCmpTwoScalar size, bits<5> opcode, - RegisterClass regtype, string asm, string zero> - : I<(outs regtype:$Rd), (ins regtype:$Rn), asm, - "\t$Rd, $Rn, #" # zero, "", []>, - Sched<[WriteV]> { - bits<5> Rd; - bits<5> Rn; - let Inst{31-30} = 0b01; - let Inst{29} = U; - let Inst{28-24} = 0b11110; - let Inst{23-22} = size; - let Inst{21-17} = 0b10000; - let Inst{16-12} = opcode; - let Inst{11-10} = 0b10; - let Inst{9-5} = Rn; - let Inst{4-0} = Rd; -} - -class SIMDInexactCvtTwoScalar opcode, string asm> - : I<(outs FPR32:$Rd), (ins FPR64:$Rn), asm, "\t$Rd, $Rn", "", - [(set (f32 FPR32:$Rd), (int_arm64_sisd_fcvtxn (f64 FPR64:$Rn)))]>, - Sched<[WriteV]> { - bits<5> Rd; - bits<5> Rn; - let Inst{31-17} = 0b011111100110000; - let Inst{16-12} = opcode; - let Inst{11-10} = 0b10; - let Inst{9-5} = Rn; - let Inst{4-0} = Rd; -} - -multiclass SIMDCmpTwoScalarD opc, string asm, - SDPatternOperator OpNode> { - def v1i64rz : BaseSIMDCmpTwoScalar; - - def : Pat<(v1i64 (OpNode FPR64:$Rn)), - (!cast(NAME # v1i64rz) FPR64:$Rn)>; -} - -multiclass SIMDCmpTwoScalarSD opc, string asm, - SDPatternOperator OpNode> { - def v1i64rz : BaseSIMDCmpTwoScalar; - def v1i32rz : BaseSIMDCmpTwoScalar; - - def : InstAlias(NAME # v1i64rz) FPR64:$Rd, FPR64:$Rn), 0>; - def : InstAlias(NAME # v1i32rz) FPR32:$Rd, FPR32:$Rn), 0>; - - def : Pat<(v1i64 (OpNode (v1f64 FPR64:$Rn))), - (!cast(NAME # v1i64rz) FPR64:$Rn)>; -} - -multiclass SIMDTwoScalarD opc, string asm, - SDPatternOperator OpNode = null_frag> { - def v1i64 : BaseSIMDTwoScalar; - - def : Pat<(i64 (OpNode (i64 FPR64:$Rn))), - (!cast(NAME # "v1i64") FPR64:$Rn)>; -} - -multiclass SIMDTwoScalarSD opc, string asm> { - def v1i64 : BaseSIMDTwoScalar; - def v1i32 : BaseSIMDTwoScalar; -} - -multiclass SIMDTwoScalarCVTSD opc, string asm, - SDPatternOperator OpNode> { - def v1i64 : BaseSIMDTwoScalar; - def v1i32 : BaseSIMDTwoScalar; -} - -multiclass SIMDTwoScalarBHSD opc, string asm, - SDPatternOperator OpNode = null_frag> { - let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in { - def v1i64 : BaseSIMDTwoScalar; - def v1i32 : BaseSIMDTwoScalar; - def v1i16 : BaseSIMDTwoScalar; - def v1i8 : BaseSIMDTwoScalar; - } - - def : Pat<(v1i64 (OpNode (v1i64 FPR64:$Rn))), - (!cast(NAME # v1i64) FPR64:$Rn)>; -} - -multiclass SIMDTwoScalarBHSDTied opc, string asm, - Intrinsic OpNode> { - let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in { - def v1i64 : BaseSIMDTwoScalarTied; - def v1i32 : BaseSIMDTwoScalarTied; - def v1i16 : BaseSIMDTwoScalarTied; - def v1i8 : BaseSIMDTwoScalarTied; - } - - def : Pat<(v1i64 (OpNode (v1i64 FPR64:$Rd), (v1i64 FPR64:$Rn))), - (!cast(NAME # v1i64) FPR64:$Rd, FPR64:$Rn)>; -} - - - -let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in -multiclass SIMDTwoScalarMixedBHS opc, string asm, - SDPatternOperator OpNode = null_frag> { - def v1i32 : BaseSIMDTwoScalar; - def v1i16 : BaseSIMDTwoScalar; - def v1i8 : BaseSIMDTwoScalar; -} - -//---------------------------------------------------------------------------- -// AdvSIMD scalar pairwise instructions -//---------------------------------------------------------------------------- - -let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in -class BaseSIMDPairwiseScalar size, bits<5> opcode, - RegisterOperand regtype, RegisterOperand vectype, - string asm, string kind> - : I<(outs regtype:$Rd), (ins vectype:$Rn), 
asm, - "{\t$Rd, $Rn" # kind # "|" # kind # "\t$Rd, $Rn}", "", []>, - Sched<[WriteV]> { - bits<5> Rd; - bits<5> Rn; - let Inst{31-30} = 0b01; - let Inst{29} = U; - let Inst{28-24} = 0b11110; - let Inst{23-22} = size; - let Inst{21-17} = 0b11000; - let Inst{16-12} = opcode; - let Inst{11-10} = 0b10; - let Inst{9-5} = Rn; - let Inst{4-0} = Rd; -} - -multiclass SIMDPairwiseScalarD opc, string asm> { - def v2i64p : BaseSIMDPairwiseScalar; -} - -multiclass SIMDPairwiseScalarSD opc, string asm> { - def v2i32p : BaseSIMDPairwiseScalar; - def v2i64p : BaseSIMDPairwiseScalar; -} - -//---------------------------------------------------------------------------- -// AdvSIMD across lanes instructions -//---------------------------------------------------------------------------- - -let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in -class BaseSIMDAcrossLanes size, bits<5> opcode, - RegisterClass regtype, RegisterOperand vectype, - string asm, string kind, list pattern> - : I<(outs regtype:$Rd), (ins vectype:$Rn), asm, - "{\t$Rd, $Rn" # kind # "|" # kind # "\t$Rd, $Rn}", "", pattern>, - Sched<[WriteV]> { - bits<5> Rd; - bits<5> Rn; - let Inst{31} = 0; - let Inst{30} = Q; - let Inst{29} = U; - let Inst{28-24} = 0b01110; - let Inst{23-22} = size; - let Inst{21-17} = 0b11000; - let Inst{16-12} = opcode; - let Inst{11-10} = 0b10; - let Inst{9-5} = Rn; - let Inst{4-0} = Rd; -} - -multiclass SIMDAcrossLanesBHS opcode, - string asm> { - def v8i8v : BaseSIMDAcrossLanes<0, U, 0b00, opcode, FPR8, V64, - asm, ".8b", []>; - def v16i8v : BaseSIMDAcrossLanes<1, U, 0b00, opcode, FPR8, V128, - asm, ".16b", []>; - def v4i16v : BaseSIMDAcrossLanes<0, U, 0b01, opcode, FPR16, V64, - asm, ".4h", []>; - def v8i16v : BaseSIMDAcrossLanes<1, U, 0b01, opcode, FPR16, V128, - asm, ".8h", []>; - def v4i32v : BaseSIMDAcrossLanes<1, U, 0b10, opcode, FPR32, V128, - asm, ".4s", []>; -} - -multiclass SIMDAcrossLanesHSD opcode, string asm> { - def v8i8v : BaseSIMDAcrossLanes<0, U, 0b00, opcode, FPR16, V64, - asm, ".8b", []>; - def v16i8v : BaseSIMDAcrossLanes<1, U, 0b00, opcode, FPR16, V128, - asm, ".16b", []>; - def v4i16v : BaseSIMDAcrossLanes<0, U, 0b01, opcode, FPR32, V64, - asm, ".4h", []>; - def v8i16v : BaseSIMDAcrossLanes<1, U, 0b01, opcode, FPR32, V128, - asm, ".8h", []>; - def v4i32v : BaseSIMDAcrossLanes<1, U, 0b10, opcode, FPR64, V128, - asm, ".4s", []>; -} - -multiclass SIMDAcrossLanesS opcode, bit sz1, string asm, - Intrinsic intOp> { - def v4i32v : BaseSIMDAcrossLanes<1, 1, {sz1, 0}, opcode, FPR32, V128, - asm, ".4s", - [(set FPR32:$Rd, (intOp (v4f32 V128:$Rn)))]>; -} - -//---------------------------------------------------------------------------- -// AdvSIMD INS/DUP instructions -//---------------------------------------------------------------------------- - -// FIXME: There has got to be a better way to factor these. ugh. 
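The per-size index placements written out case by case in the DUP, SMOV/UMOV and INS definitions below all follow one imm5 scheme in bits 20-16: a single set bit marks the element size, and the lane index sits above it. A Python sketch (not part of the patch; ins_dup_imm5 is a hypothetical helper):

def ins_dup_imm5(size_bytes, index):
    """imm5 for a lane of the given element size (1/2/4/8 bytes) and index."""
    shift = {1: 0, 2: 1, 4: 2, 8: 3}[size_bytes]
    assert 0 <= index < (16 >> shift)
    return (index << (shift + 1)) | (1 << shift)

assert ins_dup_imm5(1, 5) == 0b01011   # .b[5]: idx in bits 20-17, bit 16 = 1
assert ins_dup_imm5(4, 2) == 0b10100   # .s[2]: idx in bits 20-19, bits 18-16 = 0b100
assert ins_dup_imm5(8, 1) == 0b11000   # .d[1]: bit 20 = idx, bits 19-16 = 0b1000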
- -class BaseSIMDInsDup pattern> - : I, - Sched<[WriteV]> { - bits<5> Rd; - bits<5> Rn; - let Inst{31} = 0; - let Inst{30} = Q; - let Inst{29} = op; - let Inst{28-21} = 0b01110000; - let Inst{15} = 0; - let Inst{10} = 1; - let Inst{9-5} = Rn; - let Inst{4-0} = Rd; -} - -class SIMDDupFromMain imm5, string size, ValueType vectype, - RegisterOperand vecreg, RegisterClass regtype> - : BaseSIMDInsDup { - let Inst{20-16} = imm5; - let Inst{14-11} = 0b0001; -} - -class SIMDDupFromElement - : BaseSIMDInsDup { - let Inst{14-11} = 0b0000; -} - -class SIMDDup64FromElement - : SIMDDupFromElement<1, ".2d", ".d", v2i64, v2i64, V128, - VectorIndexD, i64, ARM64duplane64> { - bits<1> idx; - let Inst{20} = idx; - let Inst{19-16} = 0b1000; -} - -class SIMDDup32FromElement - : SIMDDupFromElement { - bits<2> idx; - let Inst{20-19} = idx; - let Inst{18-16} = 0b100; -} - -class SIMDDup16FromElement - : SIMDDupFromElement { - bits<3> idx; - let Inst{20-18} = idx; - let Inst{17-16} = 0b10; -} - -class SIMDDup8FromElement - : SIMDDupFromElement { - bits<4> idx; - let Inst{20-17} = idx; - let Inst{16} = 1; -} - -class BaseSIMDMov imm4, RegisterClass regtype, - Operand idxtype, string asm, list pattern> - : BaseSIMDInsDup { - let Inst{14-11} = imm4; -} - -class SIMDSMov - : BaseSIMDMov; -class SIMDUMov - : BaseSIMDMov; - -class SIMDMovAlias - : InstAlias; - -multiclass SMov { - def vi8to32 : SIMDSMov<0, ".b", GPR32, VectorIndexB> { - bits<4> idx; - let Inst{20-17} = idx; - let Inst{16} = 1; - } - def vi8to64 : SIMDSMov<1, ".b", GPR64, VectorIndexB> { - bits<4> idx; - let Inst{20-17} = idx; - let Inst{16} = 1; - } - def vi16to32 : SIMDSMov<0, ".h", GPR32, VectorIndexH> { - bits<3> idx; - let Inst{20-18} = idx; - let Inst{17-16} = 0b10; - } - def vi16to64 : SIMDSMov<1, ".h", GPR64, VectorIndexH> { - bits<3> idx; - let Inst{20-18} = idx; - let Inst{17-16} = 0b10; - } - def vi32to64 : SIMDSMov<1, ".s", GPR64, VectorIndexS> { - bits<2> idx; - let Inst{20-19} = idx; - let Inst{18-16} = 0b100; - } -} - -multiclass UMov { - def vi8 : SIMDUMov<0, ".b", v16i8, GPR32, VectorIndexB> { - bits<4> idx; - let Inst{20-17} = idx; - let Inst{16} = 1; - } - def vi16 : SIMDUMov<0, ".h", v8i16, GPR32, VectorIndexH> { - bits<3> idx; - let Inst{20-18} = idx; - let Inst{17-16} = 0b10; - } - def vi32 : SIMDUMov<0, ".s", v4i32, GPR32, VectorIndexS> { - bits<2> idx; - let Inst{20-19} = idx; - let Inst{18-16} = 0b100; - } - def vi64 : SIMDUMov<1, ".d", v2i64, GPR64, VectorIndexD> { - bits<1> idx; - let Inst{20} = idx; - let Inst{19-16} = 0b1000; - } - def : SIMDMovAlias<"mov", ".s", - !cast(NAME#"vi32"), - GPR32, VectorIndexS>; - def : SIMDMovAlias<"mov", ".d", - !cast(NAME#"vi64"), - GPR64, VectorIndexD>; -} - -class SIMDInsFromMain - : BaseSIMDInsDup<1, 0, (outs V128:$dst), - (ins V128:$Rd, idxtype:$idx, regtype:$Rn), "ins", - "{\t$Rd" # size # "$idx, $Rn" # - "|" # size # "\t$Rd$idx, $Rn}", - "$Rd = $dst", - [(set V128:$dst, - (vector_insert (vectype V128:$Rd), regtype:$Rn, idxtype:$idx))]> { - let Inst{14-11} = 0b0011; -} - -class SIMDInsFromElement - : BaseSIMDInsDup<1, 1, (outs V128:$dst), - (ins V128:$Rd, idxtype:$idx, V128:$Rn, idxtype:$idx2), "ins", - "{\t$Rd" # size # "$idx, $Rn" # size # "$idx2" # - "|" # size # "\t$Rd$idx, $Rn$idx2}", - "$Rd = $dst", - [(set V128:$dst, - (vector_insert - (vectype V128:$Rd), - (elttype (vector_extract (vectype V128:$Rn), idxtype:$idx2)), - idxtype:$idx))]>; - -class SIMDInsMainMovAlias - : InstAlias<"mov" # "{\t$dst" # size # "$idx, $src" # - "|" # size #"\t$dst$idx, $src}", - (inst V128:$dst, 
idxtype:$idx, regtype:$src)>; -class SIMDInsElementMovAlias - : InstAlias<"mov" # "{\t$dst" # size # "$idx, $src" # size # "$idx2" # - # "|" # size #" $dst$idx, $src$idx2}", - (inst V128:$dst, idxtype:$idx, V128:$src, idxtype:$idx2)>; - - -multiclass SIMDIns { - def vi8gpr : SIMDInsFromMain<".b", v16i8, GPR32, VectorIndexB> { - bits<4> idx; - let Inst{20-17} = idx; - let Inst{16} = 1; - } - def vi16gpr : SIMDInsFromMain<".h", v8i16, GPR32, VectorIndexH> { - bits<3> idx; - let Inst{20-18} = idx; - let Inst{17-16} = 0b10; - } - def vi32gpr : SIMDInsFromMain<".s", v4i32, GPR32, VectorIndexS> { - bits<2> idx; - let Inst{20-19} = idx; - let Inst{18-16} = 0b100; - } - def vi64gpr : SIMDInsFromMain<".d", v2i64, GPR64, VectorIndexD> { - bits<1> idx; - let Inst{20} = idx; - let Inst{19-16} = 0b1000; - } - - def vi8lane : SIMDInsFromElement<".b", v16i8, i32, VectorIndexB> { - bits<4> idx; - bits<4> idx2; - let Inst{20-17} = idx; - let Inst{16} = 1; - let Inst{14-11} = idx2; - } - def vi16lane : SIMDInsFromElement<".h", v8i16, i32, VectorIndexH> { - bits<3> idx; - bits<3> idx2; - let Inst{20-18} = idx; - let Inst{17-16} = 0b10; - let Inst{14-12} = idx2; - let Inst{11} = 0; - } - def vi32lane : SIMDInsFromElement<".s", v4i32, i32, VectorIndexS> { - bits<2> idx; - bits<2> idx2; - let Inst{20-19} = idx; - let Inst{18-16} = 0b100; - let Inst{14-13} = idx2; - let Inst{12-11} = 0; - } - def vi64lane : SIMDInsFromElement<".d", v2i64, i64, VectorIndexD> { - bits<1> idx; - bits<1> idx2; - let Inst{20} = idx; - let Inst{19-16} = 0b1000; - let Inst{14} = idx2; - let Inst{13-11} = 0; - } - - // For all forms of the INS instruction, the "mov" mnemonic is the - // preferred alias. Why they didn't just call the instruction "mov" in - // the first place is a very good question indeed... 
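For the element-to-element INS forms above, the second (source) index is packed separately into bits 14-11, shifted left by log2 of the element size with the vacated low bits zero. A Python sketch (not part of the patch; the helper is hypothetical):

def ins_elt_imm4(size_bytes, index2):
    """imm4 (bits 14-11) holding the source lane index of an INS element form."""
    shift = {1: 0, 2: 1, 4: 2, 8: 3}[size_bytes]
    assert 0 <= index2 < (16 >> shift)
    return index2 << shift

assert ins_elt_imm4(2, 3) == 0b0110   # .h[3]: idx2 in bits 14-12, bit 11 = 0
assert ins_elt_imm4(8, 1) == 0b1000   # .d[1]: idx2 in bit 14, bits 13-11 = 0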
- def : SIMDInsMainMovAlias<".b", !cast(NAME#"vi8gpr"), - GPR32, VectorIndexB>; - def : SIMDInsMainMovAlias<".h", !cast(NAME#"vi16gpr"), - GPR32, VectorIndexH>; - def : SIMDInsMainMovAlias<".s", !cast(NAME#"vi32gpr"), - GPR32, VectorIndexS>; - def : SIMDInsMainMovAlias<".d", !cast(NAME#"vi64gpr"), - GPR64, VectorIndexD>; - - def : SIMDInsElementMovAlias<".b", !cast(NAME#"vi8lane"), - VectorIndexB>; - def : SIMDInsElementMovAlias<".h", !cast(NAME#"vi16lane"), - VectorIndexH>; - def : SIMDInsElementMovAlias<".s", !cast(NAME#"vi32lane"), - VectorIndexS>; - def : SIMDInsElementMovAlias<".d", !cast(NAME#"vi64lane"), - VectorIndexD>; -} - -//---------------------------------------------------------------------------- -// AdvSIMD TBL/TBX -//---------------------------------------------------------------------------- - -let mayStore = 0, mayLoad = 0, hasSideEffects = 0 in -class BaseSIMDTableLookup len, bit op, RegisterOperand vectype, - RegisterOperand listtype, string asm, string kind> - : I<(outs vectype:$Vd), (ins listtype:$Vn, vectype:$Vm), asm, - "\t$Vd" # kind # ", $Vn, $Vm" # kind, "", []>, - Sched<[WriteV]> { - bits<5> Vd; - bits<5> Vn; - bits<5> Vm; - let Inst{31} = 0; - let Inst{30} = Q; - let Inst{29-21} = 0b001110000; - let Inst{20-16} = Vm; - let Inst{15} = 0; - let Inst{14-13} = len; - let Inst{12} = op; - let Inst{11-10} = 0b00; - let Inst{9-5} = Vn; - let Inst{4-0} = Vd; -} - -let mayStore = 0, mayLoad = 0, hasSideEffects = 0 in -class BaseSIMDTableLookupTied len, bit op, RegisterOperand vectype, - RegisterOperand listtype, string asm, string kind> - : I<(outs vectype:$dst), (ins vectype:$Vd, listtype:$Vn, vectype:$Vm), asm, - "\t$Vd" # kind # ", $Vn, $Vm" # kind, "$Vd = $dst", []>, - Sched<[WriteV]> { - bits<5> Vd; - bits<5> Vn; - bits<5> Vm; - let Inst{31} = 0; - let Inst{30} = Q; - let Inst{29-21} = 0b001110000; - let Inst{20-16} = Vm; - let Inst{15} = 0; - let Inst{14-13} = len; - let Inst{12} = op; - let Inst{11-10} = 0b00; - let Inst{9-5} = Vn; - let Inst{4-0} = Vd; -} - -class SIMDTableLookupAlias - : InstAlias; - -multiclass SIMDTableLookup { - def v8i8One : BaseSIMDTableLookup<0, 0b00, op, V64, VecListOne16b, - asm, ".8b">; - def v8i8Two : BaseSIMDTableLookup<0, 0b01, op, V64, VecListTwo16b, - asm, ".8b">; - def v8i8Three : BaseSIMDTableLookup<0, 0b10, op, V64, VecListThree16b, - asm, ".8b">; - def v8i8Four : BaseSIMDTableLookup<0, 0b11, op, V64, VecListFour16b, - asm, ".8b">; - def v16i8One : BaseSIMDTableLookup<1, 0b00, op, V128, VecListOne16b, - asm, ".16b">; - def v16i8Two : BaseSIMDTableLookup<1, 0b01, op, V128, VecListTwo16b, - asm, ".16b">; - def v16i8Three: BaseSIMDTableLookup<1, 0b10, op, V128, VecListThree16b, - asm, ".16b">; - def v16i8Four : BaseSIMDTableLookup<1, 0b11, op, V128, VecListFour16b, - asm, ".16b">; - - def : SIMDTableLookupAlias(NAME#"v8i8One"), - V64, VecListOne128>; - def : SIMDTableLookupAlias(NAME#"v8i8Two"), - V64, VecListTwo128>; - def : SIMDTableLookupAlias(NAME#"v8i8Three"), - V64, VecListThree128>; - def : SIMDTableLookupAlias(NAME#"v8i8Four"), - V64, VecListFour128>; - def : SIMDTableLookupAlias(NAME#"v16i8One"), - V128, VecListOne128>; - def : SIMDTableLookupAlias(NAME#"v16i8Two"), - V128, VecListTwo128>; - def : SIMDTableLookupAlias(NAME#"v16i8Three"), - V128, VecListThree128>; - def : SIMDTableLookupAlias(NAME#"v16i8Four"), - V128, VecListFour128>; -} - -multiclass SIMDTableLookupTied { - def v8i8One : BaseSIMDTableLookupTied<0, 0b00, op, V64, VecListOne16b, - asm, ".8b">; - def v8i8Two : BaseSIMDTableLookupTied<0, 0b01, op, V64, 
VecListTwo16b, - asm, ".8b">; - def v8i8Three : BaseSIMDTableLookupTied<0, 0b10, op, V64, VecListThree16b, - asm, ".8b">; - def v8i8Four : BaseSIMDTableLookupTied<0, 0b11, op, V64, VecListFour16b, - asm, ".8b">; - def v16i8One : BaseSIMDTableLookupTied<1, 0b00, op, V128, VecListOne16b, - asm, ".16b">; - def v16i8Two : BaseSIMDTableLookupTied<1, 0b01, op, V128, VecListTwo16b, - asm, ".16b">; - def v16i8Three: BaseSIMDTableLookupTied<1, 0b10, op, V128, VecListThree16b, - asm, ".16b">; - def v16i8Four : BaseSIMDTableLookupTied<1, 0b11, op, V128, VecListFour16b, - asm, ".16b">; - - def : SIMDTableLookupAlias(NAME#"v8i8One"), - V64, VecListOne128>; - def : SIMDTableLookupAlias(NAME#"v8i8Two"), - V64, VecListTwo128>; - def : SIMDTableLookupAlias(NAME#"v8i8Three"), - V64, VecListThree128>; - def : SIMDTableLookupAlias(NAME#"v8i8Four"), - V64, VecListFour128>; - def : SIMDTableLookupAlias(NAME#"v16i8One"), - V128, VecListOne128>; - def : SIMDTableLookupAlias(NAME#"v16i8Two"), - V128, VecListTwo128>; - def : SIMDTableLookupAlias(NAME#"v16i8Three"), - V128, VecListThree128>; - def : SIMDTableLookupAlias(NAME#"v16i8Four"), - V128, VecListFour128>; -} - - -//---------------------------------------------------------------------------- -// AdvSIMD scalar CPY -//---------------------------------------------------------------------------- -let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in -class BaseSIMDScalarCPY - : I<(outs regtype:$dst), (ins vectype:$src, idxtype:$idx), "mov", - "{\t$dst, $src" # kind # "$idx" # - "|\t$dst, $src$idx}", "", []>, - Sched<[WriteV]> { - bits<5> dst; - bits<5> src; - let Inst{31-21} = 0b01011110000; - let Inst{15-10} = 0b000001; - let Inst{9-5} = src; - let Inst{4-0} = dst; -} - -class SIMDScalarCPYAlias - : InstAlias; - - -multiclass SIMDScalarCPY { - def i8 : BaseSIMDScalarCPY { - bits<4> idx; - let Inst{20-17} = idx; - let Inst{16} = 1; - } - def i16 : BaseSIMDScalarCPY { - bits<3> idx; - let Inst{20-18} = idx; - let Inst{17-16} = 0b10; - } - def i32 : BaseSIMDScalarCPY { - bits<2> idx; - let Inst{20-19} = idx; - let Inst{18-16} = 0b100; - } - def i64 : BaseSIMDScalarCPY { - bits<1> idx; - let Inst{20} = idx; - let Inst{19-16} = 0b1000; - } - - def : Pat<(v1i64 (scalar_to_vector (i64 (vector_extract (v2i64 V128:$src), - VectorIndexD:$idx)))), - (!cast(NAME # i64) V128:$src, VectorIndexD:$idx)>; - - // 'DUP' mnemonic aliases. 
- def : SIMDScalarCPYAlias<"dup", ".b", - !cast(NAME#"i8"), - FPR8, V128, VectorIndexB>; - def : SIMDScalarCPYAlias<"dup", ".h", - !cast(NAME#"i16"), - FPR16, V128, VectorIndexH>; - def : SIMDScalarCPYAlias<"dup", ".s", - !cast(NAME#"i32"), - FPR32, V128, VectorIndexS>; - def : SIMDScalarCPYAlias<"dup", ".d", - !cast(NAME#"i64"), - FPR64, V128, VectorIndexD>; -} - -//---------------------------------------------------------------------------- -// AdvSIMD modified immediate instructions -//---------------------------------------------------------------------------- - -class BaseSIMDModifiedImm pattern> - : I, - Sched<[WriteV]> { - bits<5> Rd; - bits<8> imm8; - let Inst{31} = 0; - let Inst{30} = Q; - let Inst{29} = op; - let Inst{28-19} = 0b0111100000; - let Inst{18-16} = imm8{7-5}; - let Inst{11-10} = 0b01; - let Inst{9-5} = imm8{4-0}; - let Inst{4-0} = Rd; -} - -class BaseSIMDModifiedImmVector pattern> - : BaseSIMDModifiedImm { - let DecoderMethod = "DecodeModImmInstruction"; -} - -class BaseSIMDModifiedImmVectorTied pattern> - : BaseSIMDModifiedImm { - let DecoderMethod = "DecodeModImmTiedInstruction"; -} - -class BaseSIMDModifiedImmVectorShift b15_b12, - RegisterOperand vectype, string asm, - string kind, list pattern> - : BaseSIMDModifiedImmVector { - bits<2> shift; - let Inst{15} = b15_b12{1}; - let Inst{14-13} = shift; - let Inst{12} = b15_b12{0}; -} - -class BaseSIMDModifiedImmVectorShiftTied b15_b12, - RegisterOperand vectype, string asm, - string kind, list pattern> - : BaseSIMDModifiedImmVectorTied { - bits<2> shift; - let Inst{15} = b15_b12{1}; - let Inst{14-13} = shift; - let Inst{12} = b15_b12{0}; -} - - -class BaseSIMDModifiedImmVectorShiftHalf b15_b12, - RegisterOperand vectype, string asm, - string kind, list pattern> - : BaseSIMDModifiedImmVector { - bits<2> shift; - let Inst{15} = b15_b12{1}; - let Inst{14} = 0; - let Inst{13} = shift{0}; - let Inst{12} = b15_b12{0}; -} - -class BaseSIMDModifiedImmVectorShiftHalfTied b15_b12, - RegisterOperand vectype, string asm, - string kind, list pattern> - : BaseSIMDModifiedImmVectorTied { - bits<2> shift; - let Inst{15} = b15_b12{1}; - let Inst{14} = 0; - let Inst{13} = shift{0}; - let Inst{12} = b15_b12{0}; -} - -multiclass SIMDModifiedImmVectorShift hw_cmode, bits<2> w_cmode, - string asm> { - def v4i16 : BaseSIMDModifiedImmVectorShiftHalf<0, op, hw_cmode, V64, - asm, ".4h", []>; - def v8i16 : BaseSIMDModifiedImmVectorShiftHalf<1, op, hw_cmode, V128, - asm, ".8h", []>; - - def v2i32 : BaseSIMDModifiedImmVectorShift<0, op, w_cmode, V64, - asm, ".2s", []>; - def v4i32 : BaseSIMDModifiedImmVectorShift<1, op, w_cmode, V128, - asm, ".4s", []>; -} - -multiclass SIMDModifiedImmVectorShiftTied hw_cmode, - bits<2> w_cmode, string asm, - SDNode OpNode> { - def v4i16 : BaseSIMDModifiedImmVectorShiftHalfTied<0, op, hw_cmode, V64, - asm, ".4h", - [(set (v4i16 V64:$dst), (OpNode V64:$Rd, - imm0_255:$imm8, - (i32 imm:$shift)))]>; - def v8i16 : BaseSIMDModifiedImmVectorShiftHalfTied<1, op, hw_cmode, V128, - asm, ".8h", - [(set (v8i16 V128:$dst), (OpNode V128:$Rd, - imm0_255:$imm8, - (i32 imm:$shift)))]>; - - def v2i32 : BaseSIMDModifiedImmVectorShiftTied<0, op, w_cmode, V64, - asm, ".2s", - [(set (v2i32 V64:$dst), (OpNode V64:$Rd, - imm0_255:$imm8, - (i32 imm:$shift)))]>; - def v4i32 : BaseSIMDModifiedImmVectorShiftTied<1, op, w_cmode, V128, - asm, ".4s", - [(set (v4i32 V128:$dst), (OpNode V128:$Rd, - imm0_255:$imm8, - (i32 imm:$shift)))]>; -} - -class SIMDModifiedImmMoveMSL cmode, - RegisterOperand vectype, string asm, - string kind, list 
pattern> - : BaseSIMDModifiedImmVector { - bits<1> shift; - let Inst{15-13} = cmode{3-1}; - let Inst{12} = shift; -} - -class SIMDModifiedImmVectorNoShift cmode, - RegisterOperand vectype, - Operand imm_type, string asm, - string kind, list pattern> - : BaseSIMDModifiedImmVector { - let Inst{15-12} = cmode; -} - -class SIMDModifiedImmScalarNoShift cmode, string asm, - list pattern> - : BaseSIMDModifiedImm { - let Inst{15-12} = cmode; - let DecoderMethod = "DecodeModImmInstruction"; -} - -//---------------------------------------------------------------------------- -// AdvSIMD indexed element -//---------------------------------------------------------------------------- - -let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in -class BaseSIMDIndexed size, bits<4> opc, - RegisterOperand dst_reg, RegisterOperand lhs_reg, - RegisterOperand rhs_reg, Operand vec_idx, string asm, - string apple_kind, string dst_kind, string lhs_kind, - string rhs_kind, list pattern> - : I<(outs dst_reg:$Rd), (ins lhs_reg:$Rn, rhs_reg:$Rm, vec_idx:$idx), - asm, - "{\t$Rd" # dst_kind # ", $Rn" # lhs_kind # ", $Rm" # rhs_kind # "$idx" # - "|" # apple_kind # "\t$Rd, $Rn, $Rm$idx}", "", pattern>, - Sched<[WriteV]> { - bits<5> Rd; - bits<5> Rn; - bits<5> Rm; - - let Inst{31} = 0; - let Inst{30} = Q; - let Inst{29} = U; - let Inst{28} = Scalar; - let Inst{27-24} = 0b1111; - let Inst{23-22} = size; - // Bit 21 must be set by the derived class. - let Inst{20-16} = Rm; - let Inst{15-12} = opc; - // Bit 11 must be set by the derived class. - let Inst{10} = 0; - let Inst{9-5} = Rn; - let Inst{4-0} = Rd; -} - -let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in -class BaseSIMDIndexedTied size, bits<4> opc, - RegisterOperand dst_reg, RegisterOperand lhs_reg, - RegisterOperand rhs_reg, Operand vec_idx, string asm, - string apple_kind, string dst_kind, string lhs_kind, - string rhs_kind, list pattern> - : I<(outs dst_reg:$dst), - (ins dst_reg:$Rd, lhs_reg:$Rn, rhs_reg:$Rm, vec_idx:$idx), asm, - "{\t$Rd" # dst_kind # ", $Rn" # lhs_kind # ", $Rm" # rhs_kind # "$idx" # - "|" # apple_kind # "\t$Rd, $Rn, $Rm$idx}", "$Rd = $dst", pattern>, - Sched<[WriteV]> { - bits<5> Rd; - bits<5> Rn; - bits<5> Rm; - - let Inst{31} = 0; - let Inst{30} = Q; - let Inst{29} = U; - let Inst{28} = Scalar; - let Inst{27-24} = 0b1111; - let Inst{23-22} = size; - // Bit 21 must be set by the derived class. - let Inst{20-16} = Rm; - let Inst{15-12} = opc; - // Bit 11 must be set by the derived class. 
- let Inst{10} = 0; - let Inst{9-5} = Rn; - let Inst{4-0} = Rd; -} - -multiclass SIMDFPIndexedSD opc, string asm, - SDPatternOperator OpNode> { - def v2i32_indexed : BaseSIMDIndexed<0, U, 0, 0b10, opc, - V64, V64, - V128, VectorIndexS, - asm, ".2s", ".2s", ".2s", ".s", - [(set (v2f32 V64:$Rd), - (OpNode (v2f32 V64:$Rn), - (v2f32 (ARM64duplane32 (v4f32 V128:$Rm), VectorIndexS:$idx))))]> { - bits<2> idx; - let Inst{11} = idx{1}; - let Inst{21} = idx{0}; - } - - def v4i32_indexed : BaseSIMDIndexed<1, U, 0, 0b10, opc, - V128, V128, - V128, VectorIndexS, - asm, ".4s", ".4s", ".4s", ".s", - [(set (v4f32 V128:$Rd), - (OpNode (v4f32 V128:$Rn), - (v4f32 (ARM64duplane32 (v4f32 V128:$Rm), VectorIndexS:$idx))))]> { - bits<2> idx; - let Inst{11} = idx{1}; - let Inst{21} = idx{0}; - } - - def v2i64_indexed : BaseSIMDIndexed<1, U, 0, 0b11, opc, - V128, V128, - V128, VectorIndexD, - asm, ".2d", ".2d", ".2d", ".d", - [(set (v2f64 V128:$Rd), - (OpNode (v2f64 V128:$Rn), - (v2f64 (ARM64duplane64 (v2f64 V128:$Rm), VectorIndexD:$idx))))]> { - bits<1> idx; - let Inst{11} = idx{0}; - let Inst{21} = 0; - } - - def v1i32_indexed : BaseSIMDIndexed<1, U, 1, 0b10, opc, - FPR32Op, FPR32Op, V128, VectorIndexS, - asm, ".s", "", "", ".s", - [(set (f32 FPR32Op:$Rd), - (OpNode (f32 FPR32Op:$Rn), - (f32 (vector_extract (v4f32 V128:$Rm), - VectorIndexS:$idx))))]> { - bits<2> idx; - let Inst{11} = idx{1}; - let Inst{21} = idx{0}; - } - - def v1i64_indexed : BaseSIMDIndexed<1, U, 1, 0b11, opc, - FPR64Op, FPR64Op, V128, VectorIndexD, - asm, ".d", "", "", ".d", - [(set (f64 FPR64Op:$Rd), - (OpNode (f64 FPR64Op:$Rn), - (f64 (vector_extract (v2f64 V128:$Rm), - VectorIndexD:$idx))))]> { - bits<1> idx; - let Inst{11} = idx{0}; - let Inst{21} = 0; - } -} - -multiclass SIMDFPIndexedSDTiedPatterns { - // 2 variants for the .2s version: DUPLANE from 128-bit and DUP scalar. - def : Pat<(v2f32 (OpNode (v2f32 V64:$Rd), (v2f32 V64:$Rn), - (ARM64duplane32 (v4f32 V128:$Rm), - VectorIndexS:$idx))), - (!cast(INST # v2i32_indexed) - V64:$Rd, V64:$Rn, V128:$Rm, VectorIndexS:$idx)>; - def : Pat<(v2f32 (OpNode (v2f32 V64:$Rd), (v2f32 V64:$Rn), - (ARM64dup (f32 FPR32Op:$Rm)))), - (!cast(INST # "v2i32_indexed") V64:$Rd, V64:$Rn, - (SUBREG_TO_REG (i32 0), FPR32Op:$Rm, ssub), (i64 0))>; - - - // 2 variants for the .4s version: DUPLANE from 128-bit and DUP scalar. - def : Pat<(v4f32 (OpNode (v4f32 V128:$Rd), (v4f32 V128:$Rn), - (ARM64duplane32 (v4f32 V128:$Rm), - VectorIndexS:$idx))), - (!cast(INST # "v4i32_indexed") - V128:$Rd, V128:$Rn, V128:$Rm, VectorIndexS:$idx)>; - def : Pat<(v4f32 (OpNode (v4f32 V128:$Rd), (v4f32 V128:$Rn), - (ARM64dup (f32 FPR32Op:$Rm)))), - (!cast(INST # "v4i32_indexed") V128:$Rd, V128:$Rn, - (SUBREG_TO_REG (i32 0), FPR32Op:$Rm, ssub), (i64 0))>; - - // 2 variants for the .2d version: DUPLANE from 128-bit and DUP scalar. 
- def : Pat<(v2f64 (OpNode (v2f64 V128:$Rd), (v2f64 V128:$Rn), - (ARM64duplane64 (v2f64 V128:$Rm), - VectorIndexD:$idx))), - (!cast(INST # "v2i64_indexed") - V128:$Rd, V128:$Rn, V128:$Rm, VectorIndexS:$idx)>; - def : Pat<(v2f64 (OpNode (v2f64 V128:$Rd), (v2f64 V128:$Rn), - (ARM64dup (f64 FPR64Op:$Rm)))), - (!cast(INST # "v2i64_indexed") V128:$Rd, V128:$Rn, - (SUBREG_TO_REG (i32 0), FPR64Op:$Rm, dsub), (i64 0))>; - - // 2 variants for 32-bit scalar version: extract from .2s or from .4s - def : Pat<(f32 (OpNode (f32 FPR32:$Rd), (f32 FPR32:$Rn), - (vector_extract (v4f32 V128:$Rm), VectorIndexS:$idx))), - (!cast(INST # "v1i32_indexed") FPR32:$Rd, FPR32:$Rn, - V128:$Rm, VectorIndexS:$idx)>; - def : Pat<(f32 (OpNode (f32 FPR32:$Rd), (f32 FPR32:$Rn), - (vector_extract (v2f32 V64:$Rm), VectorIndexS:$idx))), - (!cast(INST # "v1i32_indexed") FPR32:$Rd, FPR32:$Rn, - (SUBREG_TO_REG (i32 0), V64:$Rm, dsub), VectorIndexS:$idx)>; - - // 1 variant for 64-bit scalar version: extract from .1d or from .2d - def : Pat<(f64 (OpNode (f64 FPR64:$Rd), (f64 FPR64:$Rn), - (vector_extract (v2f64 V128:$Rm), VectorIndexD:$idx))), - (!cast(INST # "v1i64_indexed") FPR64:$Rd, FPR64:$Rn, - V128:$Rm, VectorIndexD:$idx)>; -} - -multiclass SIMDFPIndexedSDTied opc, string asm> { - def v2i32_indexed : BaseSIMDIndexedTied<0, U, 0, 0b10, opc, V64, V64, - V128, VectorIndexS, - asm, ".2s", ".2s", ".2s", ".s", []> { - bits<2> idx; - let Inst{11} = idx{1}; - let Inst{21} = idx{0}; - } - - def v4i32_indexed : BaseSIMDIndexedTied<1, U, 0, 0b10, opc, - V128, V128, - V128, VectorIndexS, - asm, ".4s", ".4s", ".4s", ".s", []> { - bits<2> idx; - let Inst{11} = idx{1}; - let Inst{21} = idx{0}; - } - - def v2i64_indexed : BaseSIMDIndexedTied<1, U, 0, 0b11, opc, - V128, V128, - V128, VectorIndexD, - asm, ".2d", ".2d", ".2d", ".d", []> { - bits<1> idx; - let Inst{11} = idx{0}; - let Inst{21} = 0; - } - - - def v1i32_indexed : BaseSIMDIndexedTied<1, U, 1, 0b10, opc, - FPR32Op, FPR32Op, V128, VectorIndexS, - asm, ".s", "", "", ".s", []> { - bits<2> idx; - let Inst{11} = idx{1}; - let Inst{21} = idx{0}; - } - - def v1i64_indexed : BaseSIMDIndexedTied<1, U, 1, 0b11, opc, - FPR64Op, FPR64Op, V128, VectorIndexD, - asm, ".d", "", "", ".d", []> { - bits<1> idx; - let Inst{11} = idx{0}; - let Inst{21} = 0; - } -} - -multiclass SIMDIndexedHS opc, string asm, - SDPatternOperator OpNode> { - def v4i16_indexed : BaseSIMDIndexed<0, U, 0, 0b01, opc, V64, V64, - V128_lo, VectorIndexH, - asm, ".4h", ".4h", ".4h", ".h", - [(set (v4i16 V64:$Rd), - (OpNode (v4i16 V64:$Rn), - (v4i16 (ARM64duplane16 (v8i16 V128_lo:$Rm), VectorIndexH:$idx))))]> { - bits<3> idx; - let Inst{11} = idx{2}; - let Inst{21} = idx{1}; - let Inst{20} = idx{0}; - } - - def v8i16_indexed : BaseSIMDIndexed<1, U, 0, 0b01, opc, - V128, V128, - V128_lo, VectorIndexH, - asm, ".8h", ".8h", ".8h", ".h", - [(set (v8i16 V128:$Rd), - (OpNode (v8i16 V128:$Rn), - (v8i16 (ARM64duplane16 (v8i16 V128_lo:$Rm), VectorIndexH:$idx))))]> { - bits<3> idx; - let Inst{11} = idx{2}; - let Inst{21} = idx{1}; - let Inst{20} = idx{0}; - } - - def v2i32_indexed : BaseSIMDIndexed<0, U, 0, 0b10, opc, - V64, V64, - V128, VectorIndexS, - asm, ".2s", ".2s", ".2s", ".s", - [(set (v2i32 V64:$Rd), - (OpNode (v2i32 V64:$Rn), - (v2i32 (ARM64duplane32 (v4i32 V128:$Rm), VectorIndexS:$idx))))]> { - bits<2> idx; - let Inst{11} = idx{1}; - let Inst{21} = idx{0}; - } - - def v4i32_indexed : BaseSIMDIndexed<1, U, 0, 0b10, opc, - V128, V128, - V128, VectorIndexS, - asm, ".4s", ".4s", ".4s", ".s", - [(set (v4i32 V128:$Rd), - (OpNode (v4i32 
V128:$Rn), - (v4i32 (ARM64duplane32 (v4i32 V128:$Rm), VectorIndexS:$idx))))]> { - bits<2> idx; - let Inst{11} = idx{1}; - let Inst{21} = idx{0}; - } - - def v1i16_indexed : BaseSIMDIndexed<1, U, 1, 0b01, opc, - FPR16Op, FPR16Op, V128_lo, VectorIndexH, - asm, ".h", "", "", ".h", []> { - bits<3> idx; - let Inst{11} = idx{2}; - let Inst{21} = idx{1}; - let Inst{20} = idx{0}; - } - - def v1i32_indexed : BaseSIMDIndexed<1, U, 1, 0b10, opc, - FPR32Op, FPR32Op, V128, VectorIndexS, - asm, ".s", "", "", ".s", - [(set (i32 FPR32Op:$Rd), - (OpNode FPR32Op:$Rn, - (i32 (vector_extract (v4i32 V128:$Rm), - VectorIndexS:$idx))))]> { - bits<2> idx; - let Inst{11} = idx{1}; - let Inst{21} = idx{0}; - } -} - -multiclass SIMDVectorIndexedHS opc, string asm, - SDPatternOperator OpNode> { - def v4i16_indexed : BaseSIMDIndexed<0, U, 0, 0b01, opc, - V64, V64, - V128_lo, VectorIndexH, - asm, ".4h", ".4h", ".4h", ".h", - [(set (v4i16 V64:$Rd), - (OpNode (v4i16 V64:$Rn), - (v4i16 (ARM64duplane16 (v8i16 V128_lo:$Rm), VectorIndexH:$idx))))]> { - bits<3> idx; - let Inst{11} = idx{2}; - let Inst{21} = idx{1}; - let Inst{20} = idx{0}; - } - - def v8i16_indexed : BaseSIMDIndexed<1, U, 0, 0b01, opc, - V128, V128, - V128_lo, VectorIndexH, - asm, ".8h", ".8h", ".8h", ".h", - [(set (v8i16 V128:$Rd), - (OpNode (v8i16 V128:$Rn), - (v8i16 (ARM64duplane16 (v8i16 V128_lo:$Rm), VectorIndexH:$idx))))]> { - bits<3> idx; - let Inst{11} = idx{2}; - let Inst{21} = idx{1}; - let Inst{20} = idx{0}; - } - - def v2i32_indexed : BaseSIMDIndexed<0, U, 0, 0b10, opc, - V64, V64, - V128, VectorIndexS, - asm, ".2s", ".2s", ".2s", ".s", - [(set (v2i32 V64:$Rd), - (OpNode (v2i32 V64:$Rn), - (v2i32 (ARM64duplane32 (v4i32 V128:$Rm), VectorIndexS:$idx))))]> { - bits<2> idx; - let Inst{11} = idx{1}; - let Inst{21} = idx{0}; - } - - def v4i32_indexed : BaseSIMDIndexed<1, U, 0, 0b10, opc, - V128, V128, - V128, VectorIndexS, - asm, ".4s", ".4s", ".4s", ".s", - [(set (v4i32 V128:$Rd), - (OpNode (v4i32 V128:$Rn), - (v4i32 (ARM64duplane32 (v4i32 V128:$Rm), VectorIndexS:$idx))))]> { - bits<2> idx; - let Inst{11} = idx{1}; - let Inst{21} = idx{0}; - } -} - -multiclass SIMDVectorIndexedHSTied opc, string asm, - SDPatternOperator OpNode> { - def v4i16_indexed : BaseSIMDIndexedTied<0, U, 0, 0b01, opc, V64, V64, - V128_lo, VectorIndexH, - asm, ".4h", ".4h", ".4h", ".h", - [(set (v4i16 V64:$dst), - (OpNode (v4i16 V64:$Rd),(v4i16 V64:$Rn), - (v4i16 (ARM64duplane16 (v8i16 V128_lo:$Rm), VectorIndexH:$idx))))]> { - bits<3> idx; - let Inst{11} = idx{2}; - let Inst{21} = idx{1}; - let Inst{20} = idx{0}; - } - - def v8i16_indexed : BaseSIMDIndexedTied<1, U, 0, 0b01, opc, - V128, V128, - V128_lo, VectorIndexH, - asm, ".8h", ".8h", ".8h", ".h", - [(set (v8i16 V128:$dst), - (OpNode (v8i16 V128:$Rd), (v8i16 V128:$Rn), - (v8i16 (ARM64duplane16 (v8i16 V128_lo:$Rm), VectorIndexH:$idx))))]> { - bits<3> idx; - let Inst{11} = idx{2}; - let Inst{21} = idx{1}; - let Inst{20} = idx{0}; - } - - def v2i32_indexed : BaseSIMDIndexedTied<0, U, 0, 0b10, opc, - V64, V64, - V128, VectorIndexS, - asm, ".2s", ".2s", ".2s", ".s", - [(set (v2i32 V64:$dst), - (OpNode (v2i32 V64:$Rd), (v2i32 V64:$Rn), - (v2i32 (ARM64duplane32 (v4i32 V128:$Rm), VectorIndexS:$idx))))]> { - bits<2> idx; - let Inst{11} = idx{1}; - let Inst{21} = idx{0}; - } - - def v4i32_indexed : BaseSIMDIndexedTied<1, U, 0, 0b10, opc, - V128, V128, - V128, VectorIndexS, - asm, ".4s", ".4s", ".4s", ".s", - [(set (v4i32 V128:$dst), - (OpNode (v4i32 V128:$Rd), (v4i32 V128:$Rn), - (v4i32 (ARM64duplane32 (v4i32 V128:$Rm), 
VectorIndexS:$idx))))]> { - bits<2> idx; - let Inst{11} = idx{1}; - let Inst{21} = idx{0}; - } -} - -multiclass SIMDIndexedLongSD opc, string asm, - SDPatternOperator OpNode> { - def v4i16_indexed : BaseSIMDIndexed<0, U, 0, 0b01, opc, - V128, V64, - V128_lo, VectorIndexH, - asm, ".4s", ".4s", ".4h", ".h", - [(set (v4i32 V128:$Rd), - (OpNode (v4i16 V64:$Rn), - (v4i16 (ARM64duplane16 (v8i16 V128_lo:$Rm), VectorIndexH:$idx))))]> { - bits<3> idx; - let Inst{11} = idx{2}; - let Inst{21} = idx{1}; - let Inst{20} = idx{0}; - } - - def v8i16_indexed : BaseSIMDIndexed<1, U, 0, 0b01, opc, - V128, V128, - V128_lo, VectorIndexH, - asm#"2", ".4s", ".4s", ".8h", ".h", - [(set (v4i32 V128:$Rd), - (OpNode (extract_high_v8i16 V128:$Rn), - (extract_high_v8i16 (ARM64duplane16 (v8i16 V128_lo:$Rm), - VectorIndexH:$idx))))]> { - - bits<3> idx; - let Inst{11} = idx{2}; - let Inst{21} = idx{1}; - let Inst{20} = idx{0}; - } - - def v2i32_indexed : BaseSIMDIndexed<0, U, 0, 0b10, opc, - V128, V64, - V128, VectorIndexS, - asm, ".2d", ".2d", ".2s", ".s", - [(set (v2i64 V128:$Rd), - (OpNode (v2i32 V64:$Rn), - (v2i32 (ARM64duplane32 (v4i32 V128:$Rm), VectorIndexS:$idx))))]> { - bits<2> idx; - let Inst{11} = idx{1}; - let Inst{21} = idx{0}; - } - - def v4i32_indexed : BaseSIMDIndexed<1, U, 0, 0b10, opc, - V128, V128, - V128, VectorIndexS, - asm#"2", ".2d", ".2d", ".4s", ".s", - [(set (v2i64 V128:$Rd), - (OpNode (extract_high_v4i32 V128:$Rn), - (extract_high_v4i32 (ARM64duplane32 (v4i32 V128:$Rm), - VectorIndexS:$idx))))]> { - bits<2> idx; - let Inst{11} = idx{1}; - let Inst{21} = idx{0}; - } - - def v1i32_indexed : BaseSIMDIndexed<1, U, 1, 0b01, opc, - FPR32Op, FPR16Op, V128_lo, VectorIndexH, - asm, ".h", "", "", ".h", []> { - bits<3> idx; - let Inst{11} = idx{2}; - let Inst{21} = idx{1}; - let Inst{20} = idx{0}; - } - - def v1i64_indexed : BaseSIMDIndexed<1, U, 1, 0b10, opc, - FPR64Op, FPR32Op, V128, VectorIndexS, - asm, ".s", "", "", ".s", []> { - bits<2> idx; - let Inst{11} = idx{1}; - let Inst{21} = idx{0}; - } -} - -multiclass SIMDIndexedLongSQDMLXSDTied opc, string asm, - SDPatternOperator Accum> { - def v4i16_indexed : BaseSIMDIndexedTied<0, U, 0, 0b01, opc, - V128, V64, - V128_lo, VectorIndexH, - asm, ".4s", ".4s", ".4h", ".h", - [(set (v4i32 V128:$dst), - (Accum (v4i32 V128:$Rd), - (v4i32 (int_arm64_neon_sqdmull - (v4i16 V64:$Rn), - (v4i16 (ARM64duplane16 (v8i16 V128_lo:$Rm), - VectorIndexH:$idx))))))]> { - bits<3> idx; - let Inst{11} = idx{2}; - let Inst{21} = idx{1}; - let Inst{20} = idx{0}; - } - - // FIXME: it would be nice to use the scalar (v1i32) instruction here, but an - // intermediate EXTRACT_SUBREG would be untyped. 
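// Instead, the pattern below widens the scalar accumulator into a vector
// register with SUBREG_TO_REG, performs the accumulation on the
// v4i16_indexed instruction, and takes the low 32 bits (ssub) of the result
// with EXTRACT_SUBREG.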
- def : Pat<(i32 (Accum (i32 FPR32Op:$Rd), - (i32 (vector_extract (v4i32 - (int_arm64_neon_sqdmull (v4i16 V64:$Rn), - (v4i16 (ARM64duplane16 (v8i16 V128_lo:$Rm), - VectorIndexH:$idx)))), - (i64 0))))), - (EXTRACT_SUBREG - (!cast(NAME # v4i16_indexed) - (SUBREG_TO_REG (i32 0), FPR32Op:$Rd, ssub), V64:$Rn, - V128_lo:$Rm, VectorIndexH:$idx), - ssub)>; - - def v8i16_indexed : BaseSIMDIndexedTied<1, U, 0, 0b01, opc, - V128, V128, - V128_lo, VectorIndexH, - asm#"2", ".4s", ".4s", ".8h", ".h", - [(set (v4i32 V128:$dst), - (Accum (v4i32 V128:$Rd), - (v4i32 (int_arm64_neon_sqdmull - (extract_high_v8i16 V128:$Rn), - (extract_high_v8i16 - (ARM64duplane16 (v8i16 V128_lo:$Rm), - VectorIndexH:$idx))))))]> { - bits<3> idx; - let Inst{11} = idx{2}; - let Inst{21} = idx{1}; - let Inst{20} = idx{0}; - } - - def v2i32_indexed : BaseSIMDIndexedTied<0, U, 0, 0b10, opc, - V128, V64, - V128, VectorIndexS, - asm, ".2d", ".2d", ".2s", ".s", - [(set (v2i64 V128:$dst), - (Accum (v2i64 V128:$Rd), - (v2i64 (int_arm64_neon_sqdmull - (v2i32 V64:$Rn), - (v2i32 (ARM64duplane32 (v4i32 V128:$Rm), - VectorIndexS:$idx))))))]> { - bits<2> idx; - let Inst{11} = idx{1}; - let Inst{21} = idx{0}; - } - - def v4i32_indexed : BaseSIMDIndexedTied<1, U, 0, 0b10, opc, - V128, V128, - V128, VectorIndexS, - asm#"2", ".2d", ".2d", ".4s", ".s", - [(set (v2i64 V128:$dst), - (Accum (v2i64 V128:$Rd), - (v2i64 (int_arm64_neon_sqdmull - (extract_high_v4i32 V128:$Rn), - (extract_high_v4i32 - (ARM64duplane32 (v4i32 V128:$Rm), - VectorIndexS:$idx))))))]> { - bits<2> idx; - let Inst{11} = idx{1}; - let Inst{21} = idx{0}; - } - - def v1i32_indexed : BaseSIMDIndexedTied<1, U, 1, 0b01, opc, - FPR32Op, FPR16Op, V128_lo, VectorIndexH, - asm, ".h", "", "", ".h", []> { - bits<3> idx; - let Inst{11} = idx{2}; - let Inst{21} = idx{1}; - let Inst{20} = idx{0}; - } - - - def v1i64_indexed : BaseSIMDIndexedTied<1, U, 1, 0b10, opc, - FPR64Op, FPR32Op, V128, VectorIndexS, - asm, ".s", "", "", ".s", - [(set (i64 FPR64Op:$dst), - (Accum (i64 FPR64Op:$Rd), - (i64 (int_arm64_neon_sqdmulls_scalar - (i32 FPR32Op:$Rn), - (i32 (vector_extract (v4i32 V128:$Rm), - VectorIndexS:$idx))))))]> { - - bits<2> idx; - let Inst{11} = idx{1}; - let Inst{21} = idx{0}; - } -} - -multiclass SIMDVectorIndexedLongSD opc, string asm, - SDPatternOperator OpNode> { - let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in { - def v4i16_indexed : BaseSIMDIndexed<0, U, 0, 0b01, opc, - V128, V64, - V128_lo, VectorIndexH, - asm, ".4s", ".4s", ".4h", ".h", - [(set (v4i32 V128:$Rd), - (OpNode (v4i16 V64:$Rn), - (v4i16 (ARM64duplane16 (v8i16 V128_lo:$Rm), VectorIndexH:$idx))))]> { - bits<3> idx; - let Inst{11} = idx{2}; - let Inst{21} = idx{1}; - let Inst{20} = idx{0}; - } - - def v8i16_indexed : BaseSIMDIndexed<1, U, 0, 0b01, opc, - V128, V128, - V128_lo, VectorIndexH, - asm#"2", ".4s", ".4s", ".8h", ".h", - [(set (v4i32 V128:$Rd), - (OpNode (extract_high_v8i16 V128:$Rn), - (extract_high_v8i16 (ARM64duplane16 (v8i16 V128_lo:$Rm), - VectorIndexH:$idx))))]> { - - bits<3> idx; - let Inst{11} = idx{2}; - let Inst{21} = idx{1}; - let Inst{20} = idx{0}; - } - - def v2i32_indexed : BaseSIMDIndexed<0, U, 0, 0b10, opc, - V128, V64, - V128, VectorIndexS, - asm, ".2d", ".2d", ".2s", ".s", - [(set (v2i64 V128:$Rd), - (OpNode (v2i32 V64:$Rn), - (v2i32 (ARM64duplane32 (v4i32 V128:$Rm), VectorIndexS:$idx))))]> { - bits<2> idx; - let Inst{11} = idx{1}; - let Inst{21} = idx{0}; - } - - def v4i32_indexed : BaseSIMDIndexed<1, U, 0, 0b10, opc, - V128, V128, - V128, VectorIndexS, - asm#"2", ".2d", ".2d", ".4s", ".s", - 
[(set (v2i64 V128:$Rd), - (OpNode (extract_high_v4i32 V128:$Rn), - (extract_high_v4i32 (ARM64duplane32 (v4i32 V128:$Rm), - VectorIndexS:$idx))))]> { - bits<2> idx; - let Inst{11} = idx{1}; - let Inst{21} = idx{0}; - } - } -} - -multiclass SIMDVectorIndexedLongSDTied opc, string asm, - SDPatternOperator OpNode> { - let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in { - def v4i16_indexed : BaseSIMDIndexedTied<0, U, 0, 0b01, opc, - V128, V64, - V128_lo, VectorIndexH, - asm, ".4s", ".4s", ".4h", ".h", - [(set (v4i32 V128:$dst), - (OpNode (v4i32 V128:$Rd), (v4i16 V64:$Rn), - (v4i16 (ARM64duplane16 (v8i16 V128_lo:$Rm), VectorIndexH:$idx))))]> { - bits<3> idx; - let Inst{11} = idx{2}; - let Inst{21} = idx{1}; - let Inst{20} = idx{0}; - } - - def v8i16_indexed : BaseSIMDIndexedTied<1, U, 0, 0b01, opc, - V128, V128, - V128_lo, VectorIndexH, - asm#"2", ".4s", ".4s", ".8h", ".h", - [(set (v4i32 V128:$dst), - (OpNode (v4i32 V128:$Rd), - (extract_high_v8i16 V128:$Rn), - (extract_high_v8i16 (ARM64duplane16 (v8i16 V128_lo:$Rm), - VectorIndexH:$idx))))]> { - bits<3> idx; - let Inst{11} = idx{2}; - let Inst{21} = idx{1}; - let Inst{20} = idx{0}; - } - - def v2i32_indexed : BaseSIMDIndexedTied<0, U, 0, 0b10, opc, - V128, V64, - V128, VectorIndexS, - asm, ".2d", ".2d", ".2s", ".s", - [(set (v2i64 V128:$dst), - (OpNode (v2i64 V128:$Rd), (v2i32 V64:$Rn), - (v2i32 (ARM64duplane32 (v4i32 V128:$Rm), VectorIndexS:$idx))))]> { - bits<2> idx; - let Inst{11} = idx{1}; - let Inst{21} = idx{0}; - } - - def v4i32_indexed : BaseSIMDIndexedTied<1, U, 0, 0b10, opc, - V128, V128, - V128, VectorIndexS, - asm#"2", ".2d", ".2d", ".4s", ".s", - [(set (v2i64 V128:$dst), - (OpNode (v2i64 V128:$Rd), - (extract_high_v4i32 V128:$Rn), - (extract_high_v4i32 (ARM64duplane32 (v4i32 V128:$Rm), - VectorIndexS:$idx))))]> { - bits<2> idx; - let Inst{11} = idx{1}; - let Inst{21} = idx{0}; - } - } -} - -//---------------------------------------------------------------------------- -// AdvSIMD scalar shift by immediate -//---------------------------------------------------------------------------- - -let mayStore = 0, mayLoad = 0, hasSideEffects = 0 in -class BaseSIMDScalarShift opc, bits<7> fixed_imm, - RegisterClass regtype1, RegisterClass regtype2, - Operand immtype, string asm, list pattern> - : I<(outs regtype1:$Rd), (ins regtype2:$Rn, immtype:$imm), - asm, "\t$Rd, $Rn, $imm", "", pattern>, - Sched<[WriteV]> { - bits<5> Rd; - bits<5> Rn; - bits<7> imm; - let Inst{31-30} = 0b01; - let Inst{29} = U; - let Inst{28-23} = 0b111110; - let Inst{22-16} = fixed_imm; - let Inst{15-11} = opc; - let Inst{10} = 1; - let Inst{9-5} = Rn; - let Inst{4-0} = Rd; -} - -let mayStore = 0, mayLoad = 0, hasSideEffects = 0 in -class BaseSIMDScalarShiftTied opc, bits<7> fixed_imm, - RegisterClass regtype1, RegisterClass regtype2, - Operand immtype, string asm, list pattern> - : I<(outs regtype1:$dst), (ins regtype1:$Rd, regtype2:$Rn, immtype:$imm), - asm, "\t$Rd, $Rn, $imm", "$Rd = $dst", pattern>, - Sched<[WriteV]> { - bits<5> Rd; - bits<5> Rn; - bits<7> imm; - let Inst{31-30} = 0b01; - let Inst{29} = U; - let Inst{28-23} = 0b111110; - let Inst{22-16} = fixed_imm; - let Inst{15-11} = opc; - let Inst{10} = 1; - let Inst{9-5} = Rn; - let Inst{4-0} = Rd; -} - - -multiclass SIMDScalarRShiftSD opc, string asm> { - def s : BaseSIMDScalarShift { - let Inst{20-16} = imm{4-0}; - } - - def d : BaseSIMDScalarShift { - let Inst{21-16} = imm{5-0}; - } -} - -multiclass SIMDScalarRShiftD opc, string asm, - SDPatternOperator OpNode> { - def d : BaseSIMDScalarShift { - let 
Inst{21-16} = imm{5-0}; - } - - def : Pat<(v1i64 (OpNode (v1i64 FPR64:$Rn), (i32 vecshiftR64:$imm))), - (!cast(NAME # "d") FPR64:$Rn, vecshiftR64:$imm)>; -} - -multiclass SIMDScalarRShiftDTied opc, string asm, - SDPatternOperator OpNode = null_frag> { - def d : BaseSIMDScalarShiftTied { - let Inst{21-16} = imm{5-0}; - } - - def : Pat<(v1i64 (OpNode (v1i64 FPR64:$Rd), (v1i64 FPR64:$Rn), - (i32 vecshiftR64:$imm))), - (!cast(NAME # "d") FPR64:$Rd, FPR64:$Rn, - vecshiftR64:$imm)>; -} - -multiclass SIMDScalarLShiftD opc, string asm, - SDPatternOperator OpNode> { - def d : BaseSIMDScalarShift { - let Inst{21-16} = imm{5-0}; - } -} - -let mayStore = 0, mayLoad = 0, hasSideEffects = 0 in -multiclass SIMDScalarLShiftDTied opc, string asm> { - def d : BaseSIMDScalarShiftTied { - let Inst{21-16} = imm{5-0}; - } -} - -let mayStore = 0, mayLoad = 0, hasSideEffects = 0 in -multiclass SIMDScalarRShiftBHS opc, string asm, - SDPatternOperator OpNode = null_frag> { - def b : BaseSIMDScalarShift { - let Inst{18-16} = imm{2-0}; - } - - def h : BaseSIMDScalarShift { - let Inst{19-16} = imm{3-0}; - } - - def s : BaseSIMDScalarShift { - let Inst{20-16} = imm{4-0}; - } -} - -multiclass SIMDScalarLShiftBHSD opc, string asm, - SDPatternOperator OpNode> { - def b : BaseSIMDScalarShift { - let Inst{18-16} = imm{2-0}; - } - - def h : BaseSIMDScalarShift { - let Inst{19-16} = imm{3-0}; - } - - def s : BaseSIMDScalarShift { - let Inst{20-16} = imm{4-0}; - } - - def d : BaseSIMDScalarShift { - let Inst{21-16} = imm{5-0}; - } - - def : Pat<(v1i64 (OpNode (v1i64 FPR64:$Rn), (i32 vecshiftL64:$imm))), - (!cast(NAME # "d") FPR64:$Rn, vecshiftL64:$imm)>; -} - -multiclass SIMDScalarRShiftBHSD opc, string asm> { - def b : BaseSIMDScalarShift { - let Inst{18-16} = imm{2-0}; - } - - def h : BaseSIMDScalarShift { - let Inst{19-16} = imm{3-0}; - } - - def s : BaseSIMDScalarShift { - let Inst{20-16} = imm{4-0}; - } - - def d : BaseSIMDScalarShift { - let Inst{21-16} = imm{5-0}; - } -} - -//---------------------------------------------------------------------------- -// AdvSIMD vector x indexed element -//---------------------------------------------------------------------------- - -let mayStore = 0, mayLoad = 0, hasSideEffects = 0 in -class BaseSIMDVectorShift opc, bits<7> fixed_imm, - RegisterOperand dst_reg, RegisterOperand src_reg, - Operand immtype, - string asm, string dst_kind, string src_kind, - list pattern> - : I<(outs dst_reg:$Rd), (ins src_reg:$Rn, immtype:$imm), - asm, "{\t$Rd" # dst_kind # ", $Rn" # src_kind # ", $imm" # - "|" # dst_kind # "\t$Rd, $Rn, $imm}", "", pattern>, - Sched<[WriteV]> { - bits<5> Rd; - bits<5> Rn; - let Inst{31} = 0; - let Inst{30} = Q; - let Inst{29} = U; - let Inst{28-23} = 0b011110; - let Inst{22-16} = fixed_imm; - let Inst{15-11} = opc; - let Inst{10} = 1; - let Inst{9-5} = Rn; - let Inst{4-0} = Rd; -} - -let mayStore = 0, mayLoad = 0, hasSideEffects = 0 in -class BaseSIMDVectorShiftTied opc, bits<7> fixed_imm, - RegisterOperand vectype1, RegisterOperand vectype2, - Operand immtype, - string asm, string dst_kind, string src_kind, - list pattern> - : I<(outs vectype1:$dst), (ins vectype1:$Rd, vectype2:$Rn, immtype:$imm), - asm, "{\t$Rd" # dst_kind # ", $Rn" # src_kind # ", $imm" # - "|" # dst_kind # "\t$Rd, $Rn, $imm}", "$Rd = $dst", pattern>, - Sched<[WriteV]> { - bits<5> Rd; - bits<5> Rn; - let Inst{31} = 0; - let Inst{30} = Q; - let Inst{29} = U; - let Inst{28-23} = 0b011110; - let Inst{22-16} = fixed_imm; - let Inst{15-11} = opc; - let Inst{10} = 1; - let Inst{9-5} = Rn; - let Inst{4-0} = 
Rd; -} - -multiclass SIMDVectorRShiftSD opc, string asm, - Intrinsic OpNode> { - def v2i32_shift : BaseSIMDVectorShift<0, U, opc, {0,1,?,?,?,?,?}, - V64, V64, vecshiftR32, - asm, ".2s", ".2s", - [(set (v2i32 V64:$Rd), (OpNode (v2f32 V64:$Rn), (i32 imm:$imm)))]> { - bits<5> imm; - let Inst{20-16} = imm; - } - - def v4i32_shift : BaseSIMDVectorShift<1, U, opc, {0,1,?,?,?,?,?}, - V128, V128, vecshiftR32, - asm, ".4s", ".4s", - [(set (v4i32 V128:$Rd), (OpNode (v4f32 V128:$Rn), (i32 imm:$imm)))]> { - bits<5> imm; - let Inst{20-16} = imm; - } - - def v2i64_shift : BaseSIMDVectorShift<1, U, opc, {1,?,?,?,?,?,?}, - V128, V128, vecshiftR64, - asm, ".2d", ".2d", - [(set (v2i64 V128:$Rd), (OpNode (v2f64 V128:$Rn), (i32 imm:$imm)))]> { - bits<6> imm; - let Inst{21-16} = imm; - } -} - -multiclass SIMDVectorRShiftSDToFP opc, string asm, - Intrinsic OpNode> { - def v2i32_shift : BaseSIMDVectorShift<0, U, opc, {0,1,?,?,?,?,?}, - V64, V64, vecshiftR32, - asm, ".2s", ".2s", - [(set (v2f32 V64:$Rd), (OpNode (v2i32 V64:$Rn), (i32 imm:$imm)))]> { - bits<5> imm; - let Inst{20-16} = imm; - } - - def v4i32_shift : BaseSIMDVectorShift<1, U, opc, {0,1,?,?,?,?,?}, - V128, V128, vecshiftR32, - asm, ".4s", ".4s", - [(set (v4f32 V128:$Rd), (OpNode (v4i32 V128:$Rn), (i32 imm:$imm)))]> { - bits<5> imm; - let Inst{20-16} = imm; - } - - def v2i64_shift : BaseSIMDVectorShift<1, U, opc, {1,?,?,?,?,?,?}, - V128, V128, vecshiftR64, - asm, ".2d", ".2d", - [(set (v2f64 V128:$Rd), (OpNode (v2i64 V128:$Rn), (i32 imm:$imm)))]> { - bits<6> imm; - let Inst{21-16} = imm; - } -} - -multiclass SIMDVectorRShiftNarrowBHS opc, string asm, - SDPatternOperator OpNode> { - def v8i8_shift : BaseSIMDVectorShift<0, U, opc, {0,0,0,1,?,?,?}, - V64, V128, vecshiftR16Narrow, - asm, ".8b", ".8h", - [(set (v8i8 V64:$Rd), (OpNode (v8i16 V128:$Rn), vecshiftR16Narrow:$imm))]> { - bits<3> imm; - let Inst{18-16} = imm; - } - - def v16i8_shift : BaseSIMDVectorShiftTied<1, U, opc, {0,0,0,1,?,?,?}, - V128, V128, vecshiftR16Narrow, - asm#"2", ".16b", ".8h", []> { - bits<3> imm; - let Inst{18-16} = imm; - let hasSideEffects = 0; - } - - def v4i16_shift : BaseSIMDVectorShift<0, U, opc, {0,0,1,?,?,?,?}, - V64, V128, vecshiftR32Narrow, - asm, ".4h", ".4s", - [(set (v4i16 V64:$Rd), (OpNode (v4i32 V128:$Rn), vecshiftR32Narrow:$imm))]> { - bits<4> imm; - let Inst{19-16} = imm; - } - - def v8i16_shift : BaseSIMDVectorShiftTied<1, U, opc, {0,0,1,?,?,?,?}, - V128, V128, vecshiftR32Narrow, - asm#"2", ".8h", ".4s", []> { - bits<4> imm; - let Inst{19-16} = imm; - let hasSideEffects = 0; - } - - def v2i32_shift : BaseSIMDVectorShift<0, U, opc, {0,1,?,?,?,?,?}, - V64, V128, vecshiftR64Narrow, - asm, ".2s", ".2d", - [(set (v2i32 V64:$Rd), (OpNode (v2i64 V128:$Rn), vecshiftR64Narrow:$imm))]> { - bits<5> imm; - let Inst{20-16} = imm; - } - - def v4i32_shift : BaseSIMDVectorShiftTied<1, U, opc, {0,1,?,?,?,?,?}, - V128, V128, vecshiftR64Narrow, - asm#"2", ".4s", ".2d", []> { - bits<5> imm; - let Inst{20-16} = imm; - let hasSideEffects = 0; - } - - // TableGen doesn't like patters w/ INSERT_SUBREG on the instructions - // themselves, so put them here instead. - - // Patterns involving what's effectively an insert high and a normal - // intrinsic, represented by CONCAT_VECTORS. 
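// Each concat_vectors of an existing low half ($Rd) with a narrowing shift
// result is matched to the corresponding "2" (insert-high) instruction, with
// $Rd placed in the dsub subregister of the destination via INSERT_SUBREG.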
- def : Pat<(concat_vectors (v8i8 V64:$Rd),(OpNode (v8i16 V128:$Rn), - vecshiftR16Narrow:$imm)), - (!cast(NAME # "v16i8_shift") - (INSERT_SUBREG (IMPLICIT_DEF), V64:$Rd, dsub), - V128:$Rn, vecshiftR16Narrow:$imm)>; - def : Pat<(concat_vectors (v4i16 V64:$Rd), (OpNode (v4i32 V128:$Rn), - vecshiftR32Narrow:$imm)), - (!cast(NAME # "v8i16_shift") - (INSERT_SUBREG (IMPLICIT_DEF), V64:$Rd, dsub), - V128:$Rn, vecshiftR32Narrow:$imm)>; - def : Pat<(concat_vectors (v2i32 V64:$Rd), (OpNode (v2i64 V128:$Rn), - vecshiftR64Narrow:$imm)), - (!cast(NAME # "v4i32_shift") - (INSERT_SUBREG (IMPLICIT_DEF), V64:$Rd, dsub), - V128:$Rn, vecshiftR64Narrow:$imm)>; -} - -multiclass SIMDVectorLShiftBHSD opc, string asm, - SDPatternOperator OpNode> { - def v8i8_shift : BaseSIMDVectorShift<0, U, opc, {0,0,0,1,?,?,?}, - V64, V64, vecshiftL8, - asm, ".8b", ".8b", - [(set (v8i8 V64:$Rd), (OpNode (v8i8 V64:$Rn), - (i32 vecshiftL8:$imm)))]> { - bits<3> imm; - let Inst{18-16} = imm; - } - - def v16i8_shift : BaseSIMDVectorShift<1, U, opc, {0,0,0,1,?,?,?}, - V128, V128, vecshiftL8, - asm, ".16b", ".16b", - [(set (v16i8 V128:$Rd), (OpNode (v16i8 V128:$Rn), - (i32 vecshiftL8:$imm)))]> { - bits<3> imm; - let Inst{18-16} = imm; - } - - def v4i16_shift : BaseSIMDVectorShift<0, U, opc, {0,0,1,?,?,?,?}, - V64, V64, vecshiftL16, - asm, ".4h", ".4h", - [(set (v4i16 V64:$Rd), (OpNode (v4i16 V64:$Rn), - (i32 vecshiftL16:$imm)))]> { - bits<4> imm; - let Inst{19-16} = imm; - } - - def v8i16_shift : BaseSIMDVectorShift<1, U, opc, {0,0,1,?,?,?,?}, - V128, V128, vecshiftL16, - asm, ".8h", ".8h", - [(set (v8i16 V128:$Rd), (OpNode (v8i16 V128:$Rn), - (i32 vecshiftL16:$imm)))]> { - bits<4> imm; - let Inst{19-16} = imm; - } - - def v2i32_shift : BaseSIMDVectorShift<0, U, opc, {0,1,?,?,?,?,?}, - V64, V64, vecshiftL32, - asm, ".2s", ".2s", - [(set (v2i32 V64:$Rd), (OpNode (v2i32 V64:$Rn), - (i32 vecshiftL32:$imm)))]> { - bits<5> imm; - let Inst{20-16} = imm; - } - - def v4i32_shift : BaseSIMDVectorShift<1, U, opc, {0,1,?,?,?,?,?}, - V128, V128, vecshiftL32, - asm, ".4s", ".4s", - [(set (v4i32 V128:$Rd), (OpNode (v4i32 V128:$Rn), - (i32 vecshiftL32:$imm)))]> { - bits<5> imm; - let Inst{20-16} = imm; - } - - def v2i64_shift : BaseSIMDVectorShift<1, U, opc, {1,?,?,?,?,?,?}, - V128, V128, vecshiftL64, - asm, ".2d", ".2d", - [(set (v2i64 V128:$Rd), (OpNode (v2i64 V128:$Rn), - (i32 vecshiftL64:$imm)))]> { - bits<6> imm; - let Inst{21-16} = imm; - } -} - -multiclass SIMDVectorRShiftBHSD opc, string asm, - SDPatternOperator OpNode> { - def v8i8_shift : BaseSIMDVectorShift<0, U, opc, {0,0,0,1,?,?,?}, - V64, V64, vecshiftR8, - asm, ".8b", ".8b", - [(set (v8i8 V64:$Rd), (OpNode (v8i8 V64:$Rn), - (i32 vecshiftR8:$imm)))]> { - bits<3> imm; - let Inst{18-16} = imm; - } - - def v16i8_shift : BaseSIMDVectorShift<1, U, opc, {0,0,0,1,?,?,?}, - V128, V128, vecshiftR8, - asm, ".16b", ".16b", - [(set (v16i8 V128:$Rd), (OpNode (v16i8 V128:$Rn), - (i32 vecshiftR8:$imm)))]> { - bits<3> imm; - let Inst{18-16} = imm; - } - - def v4i16_shift : BaseSIMDVectorShift<0, U, opc, {0,0,1,?,?,?,?}, - V64, V64, vecshiftR16, - asm, ".4h", ".4h", - [(set (v4i16 V64:$Rd), (OpNode (v4i16 V64:$Rn), - (i32 vecshiftR16:$imm)))]> { - bits<4> imm; - let Inst{19-16} = imm; - } - - def v8i16_shift : BaseSIMDVectorShift<1, U, opc, {0,0,1,?,?,?,?}, - V128, V128, vecshiftR16, - asm, ".8h", ".8h", - [(set (v8i16 V128:$Rd), (OpNode (v8i16 V128:$Rn), - (i32 vecshiftR16:$imm)))]> { - bits<4> imm; - let Inst{19-16} = imm; - } - - def v2i32_shift : BaseSIMDVectorShift<0, U, opc, {0,1,?,?,?,?,?}, - 
V64, V64, vecshiftR32, - asm, ".2s", ".2s", - [(set (v2i32 V64:$Rd), (OpNode (v2i32 V64:$Rn), - (i32 vecshiftR32:$imm)))]> { - bits<5> imm; - let Inst{20-16} = imm; - } - - def v4i32_shift : BaseSIMDVectorShift<1, U, opc, {0,1,?,?,?,?,?}, - V128, V128, vecshiftR32, - asm, ".4s", ".4s", - [(set (v4i32 V128:$Rd), (OpNode (v4i32 V128:$Rn), - (i32 vecshiftR32:$imm)))]> { - bits<5> imm; - let Inst{20-16} = imm; - } - - def v2i64_shift : BaseSIMDVectorShift<1, U, opc, {1,?,?,?,?,?,?}, - V128, V128, vecshiftR64, - asm, ".2d", ".2d", - [(set (v2i64 V128:$Rd), (OpNode (v2i64 V128:$Rn), - (i32 vecshiftR64:$imm)))]> { - bits<6> imm; - let Inst{21-16} = imm; - } -} - -let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in -multiclass SIMDVectorRShiftBHSDTied opc, string asm, - SDPatternOperator OpNode = null_frag> { - def v8i8_shift : BaseSIMDVectorShiftTied<0, U, opc, {0,0,0,1,?,?,?}, - V64, V64, vecshiftR8, asm, ".8b", ".8b", - [(set (v8i8 V64:$dst), - (OpNode (v8i8 V64:$Rd), (v8i8 V64:$Rn), - (i32 vecshiftR8:$imm)))]> { - bits<3> imm; - let Inst{18-16} = imm; - } - - def v16i8_shift : BaseSIMDVectorShiftTied<1, U, opc, {0,0,0,1,?,?,?}, - V128, V128, vecshiftR8, asm, ".16b", ".16b", - [(set (v16i8 V128:$dst), - (OpNode (v16i8 V128:$Rd), (v16i8 V128:$Rn), - (i32 vecshiftR8:$imm)))]> { - bits<3> imm; - let Inst{18-16} = imm; - } - - def v4i16_shift : BaseSIMDVectorShiftTied<0, U, opc, {0,0,1,?,?,?,?}, - V64, V64, vecshiftR16, asm, ".4h", ".4h", - [(set (v4i16 V64:$dst), - (OpNode (v4i16 V64:$Rd), (v4i16 V64:$Rn), - (i32 vecshiftR16:$imm)))]> { - bits<4> imm; - let Inst{19-16} = imm; - } - - def v8i16_shift : BaseSIMDVectorShiftTied<1, U, opc, {0,0,1,?,?,?,?}, - V128, V128, vecshiftR16, asm, ".8h", ".8h", - [(set (v8i16 V128:$dst), - (OpNode (v8i16 V128:$Rd), (v8i16 V128:$Rn), - (i32 vecshiftR16:$imm)))]> { - bits<4> imm; - let Inst{19-16} = imm; - } - - def v2i32_shift : BaseSIMDVectorShiftTied<0, U, opc, {0,1,?,?,?,?,?}, - V64, V64, vecshiftR32, asm, ".2s", ".2s", - [(set (v2i32 V64:$dst), - (OpNode (v2i32 V64:$Rd), (v2i32 V64:$Rn), - (i32 vecshiftR32:$imm)))]> { - bits<5> imm; - let Inst{20-16} = imm; - } - - def v4i32_shift : BaseSIMDVectorShiftTied<1, U, opc, {0,1,?,?,?,?,?}, - V128, V128, vecshiftR32, asm, ".4s", ".4s", - [(set (v4i32 V128:$dst), - (OpNode (v4i32 V128:$Rd), (v4i32 V128:$Rn), - (i32 vecshiftR32:$imm)))]> { - bits<5> imm; - let Inst{20-16} = imm; - } - - def v2i64_shift : BaseSIMDVectorShiftTied<1, U, opc, {1,?,?,?,?,?,?}, - V128, V128, vecshiftR64, - asm, ".2d", ".2d", [(set (v2i64 V128:$dst), - (OpNode (v2i64 V128:$Rd), (v2i64 V128:$Rn), - (i32 vecshiftR64:$imm)))]> { - bits<6> imm; - let Inst{21-16} = imm; - } -} - -multiclass SIMDVectorLShiftBHSDTied opc, string asm, - SDPatternOperator OpNode = null_frag> { - def v8i8_shift : BaseSIMDVectorShiftTied<0, U, opc, {0,0,0,1,?,?,?}, - V64, V64, vecshiftL8, - asm, ".8b", ".8b", - [(set (v8i8 V64:$dst), - (OpNode (v8i8 V64:$Rd), (v8i8 V64:$Rn), - (i32 vecshiftL8:$imm)))]> { - bits<3> imm; - let Inst{18-16} = imm; - } - - def v16i8_shift : BaseSIMDVectorShiftTied<1, U, opc, {0,0,0,1,?,?,?}, - V128, V128, vecshiftL8, - asm, ".16b", ".16b", - [(set (v16i8 V128:$dst), - (OpNode (v16i8 V128:$Rd), (v16i8 V128:$Rn), - (i32 vecshiftL8:$imm)))]> { - bits<3> imm; - let Inst{18-16} = imm; - } - - def v4i16_shift : BaseSIMDVectorShiftTied<0, U, opc, {0,0,1,?,?,?,?}, - V64, V64, vecshiftL16, - asm, ".4h", ".4h", - [(set (v4i16 V64:$dst), - (OpNode (v4i16 V64:$Rd), (v4i16 V64:$Rn), - (i32 vecshiftL16:$imm)))]> { - bits<4> imm; - let Inst{19-16} = 
imm; - } - - def v8i16_shift : BaseSIMDVectorShiftTied<1, U, opc, {0,0,1,?,?,?,?}, - V128, V128, vecshiftL16, - asm, ".8h", ".8h", - [(set (v8i16 V128:$dst), - (OpNode (v8i16 V128:$Rd), (v8i16 V128:$Rn), - (i32 vecshiftL16:$imm)))]> { - bits<4> imm; - let Inst{19-16} = imm; - } - - def v2i32_shift : BaseSIMDVectorShiftTied<0, U, opc, {0,1,?,?,?,?,?}, - V64, V64, vecshiftL32, - asm, ".2s", ".2s", - [(set (v2i32 V64:$dst), - (OpNode (v2i32 V64:$Rd), (v2i32 V64:$Rn), - (i32 vecshiftL32:$imm)))]> { - bits<5> imm; - let Inst{20-16} = imm; - } - - def v4i32_shift : BaseSIMDVectorShiftTied<1, U, opc, {0,1,?,?,?,?,?}, - V128, V128, vecshiftL32, - asm, ".4s", ".4s", - [(set (v4i32 V128:$dst), - (OpNode (v4i32 V128:$Rd), (v4i32 V128:$Rn), - (i32 vecshiftL32:$imm)))]> { - bits<5> imm; - let Inst{20-16} = imm; - } - - def v2i64_shift : BaseSIMDVectorShiftTied<1, U, opc, {1,?,?,?,?,?,?}, - V128, V128, vecshiftL64, - asm, ".2d", ".2d", - [(set (v2i64 V128:$dst), - (OpNode (v2i64 V128:$Rd), (v2i64 V128:$Rn), - (i32 vecshiftL64:$imm)))]> { - bits<6> imm; - let Inst{21-16} = imm; - } -} - -multiclass SIMDVectorLShiftLongBHSD opc, string asm, - SDPatternOperator OpNode> { - def v8i8_shift : BaseSIMDVectorShift<0, U, opc, {0,0,0,1,?,?,?}, - V128, V64, vecshiftL8, asm, ".8h", ".8b", - [(set (v8i16 V128:$Rd), (OpNode (v8i8 V64:$Rn), vecshiftL8:$imm))]> { - bits<3> imm; - let Inst{18-16} = imm; - } - - def v16i8_shift : BaseSIMDVectorShift<1, U, opc, {0,0,0,1,?,?,?}, - V128, V128, vecshiftL8, - asm#"2", ".8h", ".16b", - [(set (v8i16 V128:$Rd), - (OpNode (extract_high_v16i8 V128:$Rn), vecshiftL8:$imm))]> { - bits<3> imm; - let Inst{18-16} = imm; - } - - def v4i16_shift : BaseSIMDVectorShift<0, U, opc, {0,0,1,?,?,?,?}, - V128, V64, vecshiftL16, asm, ".4s", ".4h", - [(set (v4i32 V128:$Rd), (OpNode (v4i16 V64:$Rn), vecshiftL16:$imm))]> { - bits<4> imm; - let Inst{19-16} = imm; - } - - def v8i16_shift : BaseSIMDVectorShift<1, U, opc, {0,0,1,?,?,?,?}, - V128, V128, vecshiftL16, - asm#"2", ".4s", ".8h", - [(set (v4i32 V128:$Rd), - (OpNode (extract_high_v8i16 V128:$Rn), vecshiftL16:$imm))]> { - - bits<4> imm; - let Inst{19-16} = imm; - } - - def v2i32_shift : BaseSIMDVectorShift<0, U, opc, {0,1,?,?,?,?,?}, - V128, V64, vecshiftL32, asm, ".2d", ".2s", - [(set (v2i64 V128:$Rd), (OpNode (v2i32 V64:$Rn), vecshiftL32:$imm))]> { - bits<5> imm; - let Inst{20-16} = imm; - } - - def v4i32_shift : BaseSIMDVectorShift<1, U, opc, {0,1,?,?,?,?,?}, - V128, V128, vecshiftL32, - asm#"2", ".2d", ".4s", - [(set (v2i64 V128:$Rd), - (OpNode (extract_high_v4i32 V128:$Rn), vecshiftL32:$imm))]> { - bits<5> imm; - let Inst{20-16} = imm; - } -} - - -//--- -// Vector load/store -//--- -// SIMD ldX/stX no-index memory references don't allow the optional -// ", #0" constant and handle post-indexing explicitly, so we use -// a more specialized parse method for them. Otherwise, it's the same as -// the general GPR64sp handling. 
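As the classes and aliases below spell out, the "#<imm>" write-back forms of
these structure loads/stores are simply the register post-indexed encoding
with XZR in the Rm field, and the immediate itself is fixed by the amount of
data transferred (NumRegs * 16 bytes for Q-register lists, NumRegs * 8 bytes
for D-register lists). A minimal C++ sketch of that convention, with
illustrative names only (this is not LLVM's MC layer):

#include <cassert>
#include <cstdint>

// Rm field (bits 20-16) of a post-indexed structure load/store.  The
// "#<imm>" syntax is encoded as register post-indexing from XZR (register
// number 31); otherwise Xm is the post-increment register.
static uint32_t postIndexRmField(bool immForm, uint32_t Xm) {
  return immForm ? 31u : Xm;
}

// Immediate implied by the immediate write-back form: the full transfer size.
static uint32_t impliedImmOffset(unsigned NumRegs, bool QForm) {
  return NumRegs * (QForm ? 16u : 8u);
}

int main() {
  // "ld1 { v0.16b, v1.16b }, [x1], #32"  ->  Rm = 31 (XZR), #32 implied.
  assert(postIndexRmField(/*immForm=*/true, /*Xm=*/0) == 31);
  assert(impliedImmOffset(/*NumRegs=*/2, /*QForm=*/true) == 32);
  // "ld1 { v0.8b, v1.8b }, [x1], x2"     ->  Rm = 2, no implied immediate.
  assert(postIndexRmField(false, 2) == 2);
  assert(impliedImmOffset(2, /*QForm=*/false) == 16);
  return 0;
}

This is why the InstAlias definitions for the immediate forms below simply
pass XZR as the post-index register operand.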
- -class BaseSIMDLdSt opcode, bits<2> size, - string asm, dag oops, dag iops, list pattern> - : I { - bits<5> Vt; - bits<5> Rn; - let Inst{31} = 0; - let Inst{30} = Q; - let Inst{29-23} = 0b0011000; - let Inst{22} = L; - let Inst{21-16} = 0b000000; - let Inst{15-12} = opcode; - let Inst{11-10} = size; - let Inst{9-5} = Rn; - let Inst{4-0} = Vt; -} - -class BaseSIMDLdStPost opcode, bits<2> size, - string asm, dag oops, dag iops> - : I { - bits<5> Vt; - bits<5> Rn; - bits<5> Xm; - let Inst{31} = 0; - let Inst{30} = Q; - let Inst{29-23} = 0b0011001; - let Inst{22} = L; - let Inst{21} = 0; - let Inst{20-16} = Xm; - let Inst{15-12} = opcode; - let Inst{11-10} = size; - let Inst{9-5} = Rn; - let Inst{4-0} = Vt; -} - -// The immediate form of AdvSIMD post-indexed addressing is encoded with -// register post-index addressing from the zero register. -multiclass SIMDLdStAliases { - // E.g. "ld1 { v0.8b, v1.8b }, [x1], #16" - // "ld1\t$Vt, [$Rn], #16" - // may get mapped to - // (LD1Twov8b_POST VecListTwo8b:$Vt, GPR64sp:$Rn, XZR) - def : InstAlias(NAME # Count # "v" # layout # "_POST") - GPR64sp:$Rn, - !cast("VecList" # Count # layout):$Vt, - XZR), 1>; - - // E.g. "ld1.8b { v0, v1 }, [x1], #16" - // "ld1.8b\t$Vt, [$Rn], #16" - // may get mapped to - // (LD1Twov8b_POST VecListTwo64:$Vt, GPR64sp:$Rn, XZR) - def : InstAlias(NAME # Count # "v" # layout # "_POST") - GPR64sp:$Rn, - !cast("VecList" # Count # Size):$Vt, - XZR), 0>; - - // E.g. "ld1.8b { v0, v1 }, [x1]" - // "ld1\t$Vt, [$Rn]" - // may get mapped to - // (LD1Twov8b VecListTwo64:$Vt, GPR64sp:$Rn) - def : InstAlias(NAME # Count # "v" # layout) - !cast("VecList" # Count # Size):$Vt, - GPR64sp:$Rn), 0>; - - // E.g. "ld1.8b { v0, v1 }, [x1], x2" - // "ld1\t$Vt, [$Rn], $Xm" - // may get mapped to - // (LD1Twov8b_POST VecListTwo64:$Vt, GPR64sp:$Rn, GPR64pi8:$Xm) - def : InstAlias(NAME # Count # "v" # layout # "_POST") - GPR64sp:$Rn, - !cast("VecList" # Count # Size):$Vt, - !cast("GPR64pi" # Offset):$Xm), 0>; -} - -multiclass BaseSIMDLdN opcode> { - let hasSideEffects = 0, mayLoad = 1, mayStore = 0 in { - def v16b: BaseSIMDLdSt<1, 1, opcode, 0b00, asm, - (outs !cast(veclist # "16b"):$Vt), - (ins GPR64sp:$Rn), []>; - def v8h : BaseSIMDLdSt<1, 1, opcode, 0b01, asm, - (outs !cast(veclist # "8h"):$Vt), - (ins GPR64sp:$Rn), []>; - def v4s : BaseSIMDLdSt<1, 1, opcode, 0b10, asm, - (outs !cast(veclist # "4s"):$Vt), - (ins GPR64sp:$Rn), []>; - def v2d : BaseSIMDLdSt<1, 1, opcode, 0b11, asm, - (outs !cast(veclist # "2d"):$Vt), - (ins GPR64sp:$Rn), []>; - def v8b : BaseSIMDLdSt<0, 1, opcode, 0b00, asm, - (outs !cast(veclist # "8b"):$Vt), - (ins GPR64sp:$Rn), []>; - def v4h : BaseSIMDLdSt<0, 1, opcode, 0b01, asm, - (outs !cast(veclist # "4h"):$Vt), - (ins GPR64sp:$Rn), []>; - def v2s : BaseSIMDLdSt<0, 1, opcode, 0b10, asm, - (outs !cast(veclist # "2s"):$Vt), - (ins GPR64sp:$Rn), []>; - - - def v16b_POST: BaseSIMDLdStPost<1, 1, opcode, 0b00, asm, - (outs GPR64sp:$wback, - !cast(veclist # "16b"):$Vt), - (ins GPR64sp:$Rn, - !cast("GPR64pi" # Offset128):$Xm)>; - def v8h_POST : BaseSIMDLdStPost<1, 1, opcode, 0b01, asm, - (outs GPR64sp:$wback, - !cast(veclist # "8h"):$Vt), - (ins GPR64sp:$Rn, - !cast("GPR64pi" # Offset128):$Xm)>; - def v4s_POST : BaseSIMDLdStPost<1, 1, opcode, 0b10, asm, - (outs GPR64sp:$wback, - !cast(veclist # "4s"):$Vt), - (ins GPR64sp:$Rn, - !cast("GPR64pi" # Offset128):$Xm)>; - def v2d_POST : BaseSIMDLdStPost<1, 1, opcode, 0b11, asm, - (outs GPR64sp:$wback, - !cast(veclist # "2d"):$Vt), - (ins GPR64sp:$Rn, - !cast("GPR64pi" # Offset128):$Xm)>; - 
def v8b_POST : BaseSIMDLdStPost<0, 1, opcode, 0b00, asm, - (outs GPR64sp:$wback, - !cast(veclist # "8b"):$Vt), - (ins GPR64sp:$Rn, - !cast("GPR64pi" # Offset64):$Xm)>; - def v4h_POST : BaseSIMDLdStPost<0, 1, opcode, 0b01, asm, - (outs GPR64sp:$wback, - !cast(veclist # "4h"):$Vt), - (ins GPR64sp:$Rn, - !cast("GPR64pi" # Offset64):$Xm)>; - def v2s_POST : BaseSIMDLdStPost<0, 1, opcode, 0b10, asm, - (outs GPR64sp:$wback, - !cast(veclist # "2s"):$Vt), - (ins GPR64sp:$Rn, - !cast("GPR64pi" # Offset64):$Xm)>; - } - - defm : SIMDLdStAliases; - defm : SIMDLdStAliases; - defm : SIMDLdStAliases; - defm : SIMDLdStAliases; - defm : SIMDLdStAliases; - defm : SIMDLdStAliases; - defm : SIMDLdStAliases; -} - -// Only ld1/st1 has a v1d version. -multiclass BaseSIMDStN opcode> { - let hasSideEffects = 0, mayStore = 1, mayLoad = 0 in { - def v16b : BaseSIMDLdSt<1, 0, opcode, 0b00, asm, (outs), - (ins !cast(veclist # "16b"):$Vt, - GPR64sp:$Rn), []>; - def v8h : BaseSIMDLdSt<1, 0, opcode, 0b01, asm, (outs), - (ins !cast(veclist # "8h"):$Vt, - GPR64sp:$Rn), []>; - def v4s : BaseSIMDLdSt<1, 0, opcode, 0b10, asm, (outs), - (ins !cast(veclist # "4s"):$Vt, - GPR64sp:$Rn), []>; - def v2d : BaseSIMDLdSt<1, 0, opcode, 0b11, asm, (outs), - (ins !cast(veclist # "2d"):$Vt, - GPR64sp:$Rn), []>; - def v8b : BaseSIMDLdSt<0, 0, opcode, 0b00, asm, (outs), - (ins !cast(veclist # "8b"):$Vt, - GPR64sp:$Rn), []>; - def v4h : BaseSIMDLdSt<0, 0, opcode, 0b01, asm, (outs), - (ins !cast(veclist # "4h"):$Vt, - GPR64sp:$Rn), []>; - def v2s : BaseSIMDLdSt<0, 0, opcode, 0b10, asm, (outs), - (ins !cast(veclist # "2s"):$Vt, - GPR64sp:$Rn), []>; - - def v16b_POST : BaseSIMDLdStPost<1, 0, opcode, 0b00, asm, - (outs GPR64sp:$wback), - (ins !cast(veclist # "16b"):$Vt, - GPR64sp:$Rn, - !cast("GPR64pi" # Offset128):$Xm)>; - def v8h_POST : BaseSIMDLdStPost<1, 0, opcode, 0b01, asm, - (outs GPR64sp:$wback), - (ins !cast(veclist # "8h"):$Vt, - GPR64sp:$Rn, - !cast("GPR64pi" # Offset128):$Xm)>; - def v4s_POST : BaseSIMDLdStPost<1, 0, opcode, 0b10, asm, - (outs GPR64sp:$wback), - (ins !cast(veclist # "4s"):$Vt, - GPR64sp:$Rn, - !cast("GPR64pi" # Offset128):$Xm)>; - def v2d_POST : BaseSIMDLdStPost<1, 0, opcode, 0b11, asm, - (outs GPR64sp:$wback), - (ins !cast(veclist # "2d"):$Vt, - GPR64sp:$Rn, - !cast("GPR64pi" # Offset128):$Xm)>; - def v8b_POST : BaseSIMDLdStPost<0, 0, opcode, 0b00, asm, - (outs GPR64sp:$wback), - (ins !cast(veclist # "8b"):$Vt, - GPR64sp:$Rn, - !cast("GPR64pi" # Offset64):$Xm)>; - def v4h_POST : BaseSIMDLdStPost<0, 0, opcode, 0b01, asm, - (outs GPR64sp:$wback), - (ins !cast(veclist # "4h"):$Vt, - GPR64sp:$Rn, - !cast("GPR64pi" # Offset64):$Xm)>; - def v2s_POST : BaseSIMDLdStPost<0, 0, opcode, 0b10, asm, - (outs GPR64sp:$wback), - (ins !cast(veclist # "2s"):$Vt, - GPR64sp:$Rn, - !cast("GPR64pi" # Offset64):$Xm)>; - } - - defm : SIMDLdStAliases; - defm : SIMDLdStAliases; - defm : SIMDLdStAliases; - defm : SIMDLdStAliases; - defm : SIMDLdStAliases; - defm : SIMDLdStAliases; - defm : SIMDLdStAliases; -} - -multiclass BaseSIMDLd1 opcode> - : BaseSIMDLdN { - - // LD1 instructions have extra "1d" variants. 
- let hasSideEffects = 0, mayLoad = 1, mayStore = 0 in { - def v1d : BaseSIMDLdSt<0, 1, opcode, 0b11, asm, - (outs !cast(veclist # "1d"):$Vt), - (ins GPR64sp:$Rn), []>; - - def v1d_POST : BaseSIMDLdStPost<0, 1, opcode, 0b11, asm, - (outs GPR64sp:$wback, - !cast(veclist # "1d"):$Vt), - (ins GPR64sp:$Rn, - !cast("GPR64pi" # Offset64):$Xm)>; - } - - defm : SIMDLdStAliases; -} - -multiclass BaseSIMDSt1 opcode> - : BaseSIMDStN { - - // ST1 instructions have extra "1d" variants. - let hasSideEffects = 0, mayLoad = 0, mayStore = 1 in { - def v1d : BaseSIMDLdSt<0, 0, opcode, 0b11, asm, (outs), - (ins !cast(veclist # "1d"):$Vt, - GPR64sp:$Rn), []>; - - def v1d_POST : BaseSIMDLdStPost<0, 0, opcode, 0b11, asm, - (outs GPR64sp:$wback), - (ins !cast(veclist # "1d"):$Vt, - GPR64sp:$Rn, - !cast("GPR64pi" # Offset64):$Xm)>; - } - - defm : SIMDLdStAliases; -} - -multiclass SIMDLd1Multiple { - defm One : BaseSIMDLd1<"One", asm, "VecListOne", 16, 8, 0b0111>; - defm Two : BaseSIMDLd1<"Two", asm, "VecListTwo", 32, 16, 0b1010>; - defm Three : BaseSIMDLd1<"Three", asm, "VecListThree", 48, 24, 0b0110>; - defm Four : BaseSIMDLd1<"Four", asm, "VecListFour", 64, 32, 0b0010>; -} - -multiclass SIMDSt1Multiple { - defm One : BaseSIMDSt1<"One", asm, "VecListOne", 16, 8, 0b0111>; - defm Two : BaseSIMDSt1<"Two", asm, "VecListTwo", 32, 16, 0b1010>; - defm Three : BaseSIMDSt1<"Three", asm, "VecListThree", 48, 24, 0b0110>; - defm Four : BaseSIMDSt1<"Four", asm, "VecListFour", 64, 32, 0b0010>; -} - -multiclass SIMDLd2Multiple { - defm Two : BaseSIMDLdN<"Two", asm, "VecListTwo", 32, 16, 0b1000>; -} - -multiclass SIMDSt2Multiple { - defm Two : BaseSIMDStN<"Two", asm, "VecListTwo", 32, 16, 0b1000>; -} - -multiclass SIMDLd3Multiple { - defm Three : BaseSIMDLdN<"Three", asm, "VecListThree", 48, 24, 0b0100>; -} - -multiclass SIMDSt3Multiple { - defm Three : BaseSIMDStN<"Three", asm, "VecListThree", 48, 24, 0b0100>; -} - -multiclass SIMDLd4Multiple { - defm Four : BaseSIMDLdN<"Four", asm, "VecListFour", 64, 32, 0b0000>; -} - -multiclass SIMDSt4Multiple { - defm Four : BaseSIMDStN<"Four", asm, "VecListFour", 64, 32, 0b0000>; -} - -//--- -// AdvSIMD Load/store single-element -//--- - -class BaseSIMDLdStSingle opcode, - string asm, string operands, string cst, - dag oops, dag iops, list pattern> - : I { - bits<5> Vt; - bits<5> Rn; - let Inst{31} = 0; - let Inst{29-24} = 0b001101; - let Inst{22} = L; - let Inst{21} = R; - let Inst{15-13} = opcode; - let Inst{9-5} = Rn; - let Inst{4-0} = Vt; -} - -class BaseSIMDLdStSingleTied opcode, - string asm, string operands, string cst, - dag oops, dag iops, list pattern> - : I { - bits<5> Vt; - bits<5> Rn; - let Inst{31} = 0; - let Inst{29-24} = 0b001101; - let Inst{22} = L; - let Inst{21} = R; - let Inst{15-13} = opcode; - let Inst{9-5} = Rn; - let Inst{4-0} = Vt; -} - - -let mayLoad = 1, mayStore = 0, hasSideEffects = 0 in -class BaseSIMDLdR opcode, bit S, bits<2> size, string asm, - Operand listtype> - : BaseSIMDLdStSingle<1, R, opcode, asm, "\t$Vt, [$Rn]", "", - (outs listtype:$Vt), (ins GPR64sp:$Rn), - []> { - let Inst{30} = Q; - let Inst{23} = 0; - let Inst{20-16} = 0b00000; - let Inst{12} = S; - let Inst{11-10} = size; -} -let mayLoad = 1, mayStore = 0, hasSideEffects = 0 in -class BaseSIMDLdRPost opcode, bit S, bits<2> size, - string asm, Operand listtype, Operand GPR64pi> - : BaseSIMDLdStSingle<1, R, opcode, asm, "\t$Vt, [$Rn], $Xm", - "$Rn = $wback", - (outs GPR64sp:$wback, listtype:$Vt), - (ins GPR64sp:$Rn, GPR64pi:$Xm), []> { - bits<5> Xm; - let Inst{30} = Q; - let Inst{23} = 1; - let 
Inst{20-16} = Xm; - let Inst{12} = S; - let Inst{11-10} = size; -} - -multiclass SIMDLdrAliases { - // E.g. "ld1r { v0.8b }, [x1], #1" - // "ld1r.8b\t$Vt, [$Rn], #1" - // may get mapped to - // (LD1Rv8b_POST VecListOne8b:$Vt, GPR64sp:$Rn, XZR) - def : InstAlias(NAME # "v" # layout # "_POST") - GPR64sp:$Rn, - !cast("VecList" # Count # layout):$Vt, - XZR), 1>; - - // E.g. "ld1r.8b { v0 }, [x1], #1" - // "ld1r.8b\t$Vt, [$Rn], #1" - // may get mapped to - // (LD1Rv8b_POST VecListOne64:$Vt, GPR64sp:$Rn, XZR) - def : InstAlias(NAME # "v" # layout # "_POST") - GPR64sp:$Rn, - !cast("VecList" # Count # Size):$Vt, - XZR), 0>; - - // E.g. "ld1r.8b { v0 }, [x1]" - // "ld1r.8b\t$Vt, [$Rn]" - // may get mapped to - // (LD1Rv8b VecListOne64:$Vt, GPR64sp:$Rn) - def : InstAlias(NAME # "v" # layout) - !cast("VecList" # Count # Size):$Vt, - GPR64sp:$Rn), 0>; - - // E.g. "ld1r.8b { v0 }, [x1], x2" - // "ld1r.8b\t$Vt, [$Rn], $Xm" - // may get mapped to - // (LD1Rv8b_POST VecListOne64:$Vt, GPR64sp:$Rn, GPR64pi1:$Xm) - def : InstAlias(NAME # "v" # layout # "_POST") - GPR64sp:$Rn, - !cast("VecList" # Count # Size):$Vt, - !cast("GPR64pi" # Offset):$Xm), 0>; -} - -multiclass SIMDLdR opcode, bit S, string asm, string Count, - int Offset1, int Offset2, int Offset4, int Offset8> { - def v8b : BaseSIMDLdR<0, R, opcode, S, 0b00, asm, - !cast("VecList" # Count # "8b")>; - def v16b: BaseSIMDLdR<1, R, opcode, S, 0b00, asm, - !cast("VecList" # Count #"16b")>; - def v4h : BaseSIMDLdR<0, R, opcode, S, 0b01, asm, - !cast("VecList" # Count #"4h")>; - def v8h : BaseSIMDLdR<1, R, opcode, S, 0b01, asm, - !cast("VecList" # Count #"8h")>; - def v2s : BaseSIMDLdR<0, R, opcode, S, 0b10, asm, - !cast("VecList" # Count #"2s")>; - def v4s : BaseSIMDLdR<1, R, opcode, S, 0b10, asm, - !cast("VecList" # Count #"4s")>; - def v1d : BaseSIMDLdR<0, R, opcode, S, 0b11, asm, - !cast("VecList" # Count #"1d")>; - def v2d : BaseSIMDLdR<1, R, opcode, S, 0b11, asm, - !cast("VecList" # Count #"2d")>; - - def v8b_POST : BaseSIMDLdRPost<0, R, opcode, S, 0b00, asm, - !cast("VecList" # Count # "8b"), - !cast("GPR64pi" # Offset1)>; - def v16b_POST: BaseSIMDLdRPost<1, R, opcode, S, 0b00, asm, - !cast("VecList" # Count # "16b"), - !cast("GPR64pi" # Offset1)>; - def v4h_POST : BaseSIMDLdRPost<0, R, opcode, S, 0b01, asm, - !cast("VecList" # Count # "4h"), - !cast("GPR64pi" # Offset2)>; - def v8h_POST : BaseSIMDLdRPost<1, R, opcode, S, 0b01, asm, - !cast("VecList" # Count # "8h"), - !cast("GPR64pi" # Offset2)>; - def v2s_POST : BaseSIMDLdRPost<0, R, opcode, S, 0b10, asm, - !cast("VecList" # Count # "2s"), - !cast("GPR64pi" # Offset4)>; - def v4s_POST : BaseSIMDLdRPost<1, R, opcode, S, 0b10, asm, - !cast("VecList" # Count # "4s"), - !cast("GPR64pi" # Offset4)>; - def v1d_POST : BaseSIMDLdRPost<0, R, opcode, S, 0b11, asm, - !cast("VecList" # Count # "1d"), - !cast("GPR64pi" # Offset8)>; - def v2d_POST : BaseSIMDLdRPost<1, R, opcode, S, 0b11, asm, - !cast("VecList" # Count # "2d"), - !cast("GPR64pi" # Offset8)>; - - defm : SIMDLdrAliases; - defm : SIMDLdrAliases; - defm : SIMDLdrAliases; - defm : SIMDLdrAliases; - defm : SIMDLdrAliases; - defm : SIMDLdrAliases; - defm : SIMDLdrAliases; - defm : SIMDLdrAliases; -} - -class SIMDLdStSingleB opcode, string asm, - dag oops, dag iops, list pattern> - : BaseSIMDLdStSingle { - // idx encoded in Q:S:size fields. 
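// For the byte variants below, the 4-bit lane index maps directly onto those
// fields: idx{3} -> Inst{30} (Q), idx{2} -> Inst{12} (S),
// idx{1-0} -> Inst{11-10} (size).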
- bits<4> idx; - let Inst{30} = idx{3}; - let Inst{23} = 0; - let Inst{20-16} = 0b00000; - let Inst{12} = idx{2}; - let Inst{11-10} = idx{1-0}; -} -class SIMDLdStSingleBTied opcode, string asm, - dag oops, dag iops, list pattern> - : BaseSIMDLdStSingleTied { - // idx encoded in Q:S:size fields. - bits<4> idx; - let Inst{30} = idx{3}; - let Inst{23} = 0; - let Inst{20-16} = 0b00000; - let Inst{12} = idx{2}; - let Inst{11-10} = idx{1-0}; -} -class SIMDLdStSingleBPost opcode, string asm, - dag oops, dag iops> - : BaseSIMDLdStSingle { - // idx encoded in Q:S:size fields. - bits<4> idx; - bits<5> Xm; - let Inst{30} = idx{3}; - let Inst{23} = 1; - let Inst{20-16} = Xm; - let Inst{12} = idx{2}; - let Inst{11-10} = idx{1-0}; -} -class SIMDLdStSingleBTiedPost opcode, string asm, - dag oops, dag iops> - : BaseSIMDLdStSingleTied { - // idx encoded in Q:S:size fields. - bits<4> idx; - bits<5> Xm; - let Inst{30} = idx{3}; - let Inst{23} = 1; - let Inst{20-16} = Xm; - let Inst{12} = idx{2}; - let Inst{11-10} = idx{1-0}; -} - -class SIMDLdStSingleH opcode, bit size, string asm, - dag oops, dag iops, list pattern> - : BaseSIMDLdStSingle { - // idx encoded in Q:S:size<1> fields. - bits<3> idx; - let Inst{30} = idx{2}; - let Inst{23} = 0; - let Inst{20-16} = 0b00000; - let Inst{12} = idx{1}; - let Inst{11} = idx{0}; - let Inst{10} = size; -} -class SIMDLdStSingleHTied opcode, bit size, string asm, - dag oops, dag iops, list pattern> - : BaseSIMDLdStSingleTied { - // idx encoded in Q:S:size<1> fields. - bits<3> idx; - let Inst{30} = idx{2}; - let Inst{23} = 0; - let Inst{20-16} = 0b00000; - let Inst{12} = idx{1}; - let Inst{11} = idx{0}; - let Inst{10} = size; -} - -class SIMDLdStSingleHPost opcode, bit size, string asm, - dag oops, dag iops> - : BaseSIMDLdStSingle { - // idx encoded in Q:S:size<1> fields. - bits<3> idx; - bits<5> Xm; - let Inst{30} = idx{2}; - let Inst{23} = 1; - let Inst{20-16} = Xm; - let Inst{12} = idx{1}; - let Inst{11} = idx{0}; - let Inst{10} = size; -} -class SIMDLdStSingleHTiedPost opcode, bit size, string asm, - dag oops, dag iops> - : BaseSIMDLdStSingleTied { - // idx encoded in Q:S:size<1> fields. - bits<3> idx; - bits<5> Xm; - let Inst{30} = idx{2}; - let Inst{23} = 1; - let Inst{20-16} = Xm; - let Inst{12} = idx{1}; - let Inst{11} = idx{0}; - let Inst{10} = size; -} -class SIMDLdStSingleS opcode, bits<2> size, string asm, - dag oops, dag iops, list pattern> - : BaseSIMDLdStSingle { - // idx encoded in Q:S fields. - bits<2> idx; - let Inst{30} = idx{1}; - let Inst{23} = 0; - let Inst{20-16} = 0b00000; - let Inst{12} = idx{0}; - let Inst{11-10} = size; -} -class SIMDLdStSingleSTied opcode, bits<2> size, string asm, - dag oops, dag iops, list pattern> - : BaseSIMDLdStSingleTied { - // idx encoded in Q:S fields. - bits<2> idx; - let Inst{30} = idx{1}; - let Inst{23} = 0; - let Inst{20-16} = 0b00000; - let Inst{12} = idx{0}; - let Inst{11-10} = size; -} -class SIMDLdStSingleSPost opcode, bits<2> size, - string asm, dag oops, dag iops> - : BaseSIMDLdStSingle { - // idx encoded in Q:S fields. - bits<2> idx; - bits<5> Xm; - let Inst{30} = idx{1}; - let Inst{23} = 1; - let Inst{20-16} = Xm; - let Inst{12} = idx{0}; - let Inst{11-10} = size; -} -class SIMDLdStSingleSTiedPost opcode, bits<2> size, - string asm, dag oops, dag iops> - : BaseSIMDLdStSingleTied { - // idx encoded in Q:S fields. 
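// For the 32-bit lane variants, only Q and S are needed for the index:
// idx{1} -> Inst{30} (Q), idx{0} -> Inst{12} (S), while Inst{11-10} hold the
// size field.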
- bits<2> idx; - bits<5> Xm; - let Inst{30} = idx{1}; - let Inst{23} = 1; - let Inst{20-16} = Xm; - let Inst{12} = idx{0}; - let Inst{11-10} = size; -} -class SIMDLdStSingleD opcode, bits<2> size, string asm, - dag oops, dag iops, list pattern> - : BaseSIMDLdStSingle { - // idx encoded in Q field. - bits<1> idx; - let Inst{30} = idx; - let Inst{23} = 0; - let Inst{20-16} = 0b00000; - let Inst{12} = 0; - let Inst{11-10} = size; -} -class SIMDLdStSingleDTied opcode, bits<2> size, string asm, - dag oops, dag iops, list pattern> - : BaseSIMDLdStSingleTied { - // idx encoded in Q field. - bits<1> idx; - let Inst{30} = idx; - let Inst{23} = 0; - let Inst{20-16} = 0b00000; - let Inst{12} = 0; - let Inst{11-10} = size; -} -class SIMDLdStSingleDPost opcode, bits<2> size, - string asm, dag oops, dag iops> - : BaseSIMDLdStSingle { - // idx encoded in Q field. - bits<1> idx; - bits<5> Xm; - let Inst{30} = idx; - let Inst{23} = 1; - let Inst{20-16} = Xm; - let Inst{12} = 0; - let Inst{11-10} = size; -} -class SIMDLdStSingleDTiedPost opcode, bits<2> size, - string asm, dag oops, dag iops> - : BaseSIMDLdStSingleTied { - // idx encoded in Q field. - bits<1> idx; - bits<5> Xm; - let Inst{30} = idx; - let Inst{23} = 1; - let Inst{20-16} = Xm; - let Inst{12} = 0; - let Inst{11-10} = size; -} - -let mayLoad = 1, mayStore = 0, hasSideEffects = 0 in -multiclass SIMDLdSingleBTied opcode, string asm, - RegisterOperand listtype, - RegisterOperand GPR64pi> { - def i8 : SIMDLdStSingleBTied<1, R, opcode, asm, - (outs listtype:$dst), - (ins listtype:$Vt, VectorIndexB:$idx, - GPR64sp:$Rn), []>; - - def i8_POST : SIMDLdStSingleBTiedPost<1, R, opcode, asm, - (outs GPR64sp:$wback, listtype:$dst), - (ins listtype:$Vt, VectorIndexB:$idx, - GPR64sp:$Rn, GPR64pi:$Xm)>; -} -let mayLoad = 1, mayStore = 0, hasSideEffects = 0 in -multiclass SIMDLdSingleHTied opcode, bit size, string asm, - RegisterOperand listtype, - RegisterOperand GPR64pi> { - def i16 : SIMDLdStSingleHTied<1, R, opcode, size, asm, - (outs listtype:$dst), - (ins listtype:$Vt, VectorIndexH:$idx, - GPR64sp:$Rn), []>; - - def i16_POST : SIMDLdStSingleHTiedPost<1, R, opcode, size, asm, - (outs GPR64sp:$wback, listtype:$dst), - (ins listtype:$Vt, VectorIndexH:$idx, - GPR64sp:$Rn, GPR64pi:$Xm)>; -} -let mayLoad = 1, mayStore = 0, hasSideEffects = 0 in -multiclass SIMDLdSingleSTied opcode, bits<2> size,string asm, - RegisterOperand listtype, - RegisterOperand GPR64pi> { - def i32 : SIMDLdStSingleSTied<1, R, opcode, size, asm, - (outs listtype:$dst), - (ins listtype:$Vt, VectorIndexS:$idx, - GPR64sp:$Rn), []>; - - def i32_POST : SIMDLdStSingleSTiedPost<1, R, opcode, size, asm, - (outs GPR64sp:$wback, listtype:$dst), - (ins listtype:$Vt, VectorIndexS:$idx, - GPR64sp:$Rn, GPR64pi:$Xm)>; -} -let mayLoad = 1, mayStore = 0, hasSideEffects = 0 in -multiclass SIMDLdSingleDTied opcode, bits<2> size, string asm, - RegisterOperand listtype, RegisterOperand GPR64pi> { - def i64 : SIMDLdStSingleDTied<1, R, opcode, size, asm, - (outs listtype:$dst), - (ins listtype:$Vt, VectorIndexD:$idx, - GPR64sp:$Rn), []>; - - def i64_POST : SIMDLdStSingleDTiedPost<1, R, opcode, size, asm, - (outs GPR64sp:$wback, listtype:$dst), - (ins listtype:$Vt, VectorIndexD:$idx, - GPR64sp:$Rn, GPR64pi:$Xm)>; -} -let mayLoad = 0, mayStore = 1, hasSideEffects = 0 in -multiclass SIMDStSingleB opcode, string asm, - RegisterOperand listtype, RegisterOperand GPR64pi> { - def i8 : SIMDLdStSingleB<0, R, opcode, asm, - (outs), (ins listtype:$Vt, VectorIndexB:$idx, - GPR64sp:$Rn), []>; - - def i8_POST : 
SIMDLdStSingleBPost<0, R, opcode, asm, - (outs GPR64sp:$wback), - (ins listtype:$Vt, VectorIndexB:$idx, - GPR64sp:$Rn, GPR64pi:$Xm)>; -} -let mayLoad = 0, mayStore = 1, hasSideEffects = 0 in -multiclass SIMDStSingleH opcode, bit size, string asm, - RegisterOperand listtype, RegisterOperand GPR64pi> { - def i16 : SIMDLdStSingleH<0, R, opcode, size, asm, - (outs), (ins listtype:$Vt, VectorIndexH:$idx, - GPR64sp:$Rn), []>; - - def i16_POST : SIMDLdStSingleHPost<0, R, opcode, size, asm, - (outs GPR64sp:$wback), - (ins listtype:$Vt, VectorIndexH:$idx, - GPR64sp:$Rn, GPR64pi:$Xm)>; -} -let mayLoad = 0, mayStore = 1, hasSideEffects = 0 in -multiclass SIMDStSingleS opcode, bits<2> size,string asm, - RegisterOperand listtype, RegisterOperand GPR64pi> { - def i32 : SIMDLdStSingleS<0, R, opcode, size, asm, - (outs), (ins listtype:$Vt, VectorIndexS:$idx, - GPR64sp:$Rn), []>; - - def i32_POST : SIMDLdStSingleSPost<0, R, opcode, size, asm, - (outs GPR64sp:$wback), - (ins listtype:$Vt, VectorIndexS:$idx, - GPR64sp:$Rn, GPR64pi:$Xm)>; -} -let mayLoad = 0, mayStore = 1, hasSideEffects = 0 in -multiclass SIMDStSingleD opcode, bits<2> size, string asm, - RegisterOperand listtype, RegisterOperand GPR64pi> { - def i64 : SIMDLdStSingleD<0, R, opcode, size, asm, - (outs), (ins listtype:$Vt, VectorIndexD:$idx, - GPR64sp:$Rn), []>; - - def i64_POST : SIMDLdStSingleDPost<0, R, opcode, size, asm, - (outs GPR64sp:$wback), - (ins listtype:$Vt, VectorIndexD:$idx, - GPR64sp:$Rn, GPR64pi:$Xm)>; -} - -multiclass SIMDLdStSingleAliases { - // E.g. "ld1 { v0.8b }[0], [x1], #1" - // "ld1\t$Vt, [$Rn], #1" - // may get mapped to - // (LD1Rv8b_POST VecListOne8b:$Vt, GPR64sp:$Rn, XZR) - def : InstAlias(NAME # Type # "_POST") - GPR64sp:$Rn, - !cast("VecList" # Count # layout):$Vt, - idxtype:$idx, XZR), 1>; - - // E.g. "ld1.8b { v0 }[0], [x1], #1" - // "ld1.8b\t$Vt, [$Rn], #1" - // may get mapped to - // (LD1Rv8b_POST VecListOne64:$Vt, GPR64sp:$Rn, XZR) - def : InstAlias(NAME # Type # "_POST") - GPR64sp:$Rn, - !cast("VecList" # Count # "128"):$Vt, - idxtype:$idx, XZR), 0>; - - // E.g. "ld1.8b { v0 }[0], [x1]" - // "ld1.8b\t$Vt, [$Rn]" - // may get mapped to - // (LD1Rv8b VecListOne64:$Vt, GPR64sp:$Rn) - def : InstAlias(NAME # Type) - !cast("VecList" # Count # "128"):$Vt, - idxtype:$idx, GPR64sp:$Rn), 0>; - - // E.g. 
"ld1.8b { v0 }[0], [x1], x2" - // "ld1.8b\t$Vt, [$Rn], $Xm" - // may get mapped to - // (LD1Rv8b_POST VecListOne64:$Vt, GPR64sp:$Rn, GPR64pi1:$Xm) - def : InstAlias(NAME # Type # "_POST") - GPR64sp:$Rn, - !cast("VecList" # Count # "128"):$Vt, - idxtype:$idx, - !cast("GPR64pi" # Offset):$Xm), 0>; -} - -multiclass SIMDLdSt1SingleAliases { - defm : SIMDLdStSingleAliases; - defm : SIMDLdStSingleAliases; - defm : SIMDLdStSingleAliases; - defm : SIMDLdStSingleAliases; -} - -multiclass SIMDLdSt2SingleAliases { - defm : SIMDLdStSingleAliases; - defm : SIMDLdStSingleAliases; - defm : SIMDLdStSingleAliases; - defm : SIMDLdStSingleAliases; -} - -multiclass SIMDLdSt3SingleAliases { - defm : SIMDLdStSingleAliases; - defm : SIMDLdStSingleAliases; - defm : SIMDLdStSingleAliases; - defm : SIMDLdStSingleAliases; -} - -multiclass SIMDLdSt4SingleAliases { - defm : SIMDLdStSingleAliases; - defm : SIMDLdStSingleAliases; - defm : SIMDLdStSingleAliases; - defm : SIMDLdStSingleAliases; -} -} // end of 'let Predicates = [HasNEON]' - -//---------------------------------------------------------------------------- -// Crypto extensions -//---------------------------------------------------------------------------- - -let Predicates = [HasCrypto] in { -let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in -class AESBase opc, string asm, dag outs, dag ins, string cstr, - list pat> - : I, - Sched<[WriteV]>{ - bits<5> Rd; - bits<5> Rn; - let Inst{31-16} = 0b0100111000101000; - let Inst{15-12} = opc; - let Inst{11-10} = 0b10; - let Inst{9-5} = Rn; - let Inst{4-0} = Rd; -} - -class AESInst opc, string asm, Intrinsic OpNode> - : AESBase; - -class AESTiedInst opc, string asm, Intrinsic OpNode> - : AESBase; - -let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in -class SHA3OpTiedInst opc, string asm, string dst_lhs_kind, - dag oops, dag iops, list pat> - : I, - Sched<[WriteV]>{ - bits<5> Rd; - bits<5> Rn; - bits<5> Rm; - let Inst{31-21} = 0b01011110000; - let Inst{20-16} = Rm; - let Inst{15} = 0; - let Inst{14-12} = opc; - let Inst{11-10} = 0b00; - let Inst{9-5} = Rn; - let Inst{4-0} = Rd; -} - -class SHATiedInstQSV opc, string asm, Intrinsic OpNode> - : SHA3OpTiedInst; - -class SHATiedInstVVV opc, string asm, Intrinsic OpNode> - : SHA3OpTiedInst; - -class SHATiedInstQQV opc, string asm, Intrinsic OpNode> - : SHA3OpTiedInst; - -let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in -class SHA2OpInst opc, string asm, string kind, - string cstr, dag oops, dag iops, - list pat> - : I, - Sched<[WriteV]>{ - bits<5> Rd; - bits<5> Rn; - let Inst{31-16} = 0b0101111000101000; - let Inst{15-12} = opc; - let Inst{11-10} = 0b10; - let Inst{9-5} = Rn; - let Inst{4-0} = Rd; -} - -class SHATiedInstVV opc, string asm, Intrinsic OpNode> - : SHA2OpInst; - -class SHAInstSS opc, string asm, Intrinsic OpNode> - : SHA2OpInst; -} // end of 'let Predicates = [HasCrypto]' - -// Allow the size specifier tokens to be upper case, not just lower. 
-def : TokenAlias<".8B", ".8b">; -def : TokenAlias<".4H", ".4h">; -def : TokenAlias<".2S", ".2s">; -def : TokenAlias<".1D", ".1d">; -def : TokenAlias<".16B", ".16b">; -def : TokenAlias<".8H", ".8h">; -def : TokenAlias<".4S", ".4s">; -def : TokenAlias<".2D", ".2d">; -def : TokenAlias<".1Q", ".1q">; -def : TokenAlias<".B", ".b">; -def : TokenAlias<".H", ".h">; -def : TokenAlias<".S", ".s">; -def : TokenAlias<".D", ".d">; -def : TokenAlias<".Q", ".q">; diff --git a/lib/Target/ARM64/ARM64InstrInfo.cpp b/lib/Target/ARM64/ARM64InstrInfo.cpp deleted file mode 100644 index fbbddd56660..00000000000 --- a/lib/Target/ARM64/ARM64InstrInfo.cpp +++ /dev/null @@ -1,2059 +0,0 @@ -//===- ARM64InstrInfo.cpp - ARM64 Instruction Information -----------------===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -// This file contains the ARM64 implementation of the TargetInstrInfo class. -// -//===----------------------------------------------------------------------===// - -#include "ARM64InstrInfo.h" -#include "ARM64Subtarget.h" -#include "MCTargetDesc/ARM64AddressingModes.h" -#include "llvm/CodeGen/MachineFrameInfo.h" -#include "llvm/CodeGen/MachineInstrBuilder.h" -#include "llvm/CodeGen/MachineMemOperand.h" -#include "llvm/CodeGen/MachineRegisterInfo.h" -#include "llvm/CodeGen/PseudoSourceValue.h" -#include "llvm/MC/MCInst.h" -#include "llvm/Support/ErrorHandling.h" -#include "llvm/Support/TargetRegistry.h" - -using namespace llvm; - -#define GET_INSTRINFO_CTOR_DTOR -#include "ARM64GenInstrInfo.inc" - -ARM64InstrInfo::ARM64InstrInfo(const ARM64Subtarget &STI) - : ARM64GenInstrInfo(ARM64::ADJCALLSTACKDOWN, ARM64::ADJCALLSTACKUP), - RI(this, &STI), Subtarget(STI) {} - -/// GetInstSize - Return the number of bytes of code the specified -/// instruction may be. This returns the maximum number of bytes. -unsigned ARM64InstrInfo::GetInstSizeInBytes(const MachineInstr *MI) const { - const MCInstrDesc &Desc = MI->getDesc(); - - switch (Desc.getOpcode()) { - default: - // Anything not explicitly designated otherwise is a nomal 4-byte insn. - return 4; - case TargetOpcode::DBG_VALUE: - case TargetOpcode::EH_LABEL: - case TargetOpcode::IMPLICIT_DEF: - case TargetOpcode::KILL: - return 0; - } - - llvm_unreachable("GetInstSizeInBytes()- Unable to determin insn size"); -} - -static void parseCondBranch(MachineInstr *LastInst, MachineBasicBlock *&Target, - SmallVectorImpl &Cond) { - // Block ends with fall-through condbranch. - switch (LastInst->getOpcode()) { - default: - llvm_unreachable("Unknown branch instruction?"); - case ARM64::Bcc: - Target = LastInst->getOperand(1).getMBB(); - Cond.push_back(LastInst->getOperand(0)); - break; - case ARM64::CBZW: - case ARM64::CBZX: - case ARM64::CBNZW: - case ARM64::CBNZX: - Target = LastInst->getOperand(1).getMBB(); - Cond.push_back(MachineOperand::CreateImm(-1)); - Cond.push_back(MachineOperand::CreateImm(LastInst->getOpcode())); - Cond.push_back(LastInst->getOperand(0)); - break; - case ARM64::TBZW: - case ARM64::TBZX: - case ARM64::TBNZW: - case ARM64::TBNZX: - Target = LastInst->getOperand(2).getMBB(); - Cond.push_back(MachineOperand::CreateImm(-1)); - Cond.push_back(MachineOperand::CreateImm(LastInst->getOpcode())); - Cond.push_back(LastInst->getOperand(0)); - Cond.push_back(LastInst->getOperand(1)); - } -} - -// Branch analysis. 
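For readers following the branch-analysis hooks in the deleted code below, the Cond vector written by parseCondBranch above uses a small ad-hoc layout that AnalyzeBranch, ReverseBranchCondition and insertSelect all share. The following is a standalone C++ sketch of just that convention; the enum and function names are invented for illustration and are not LLVM API.

    #include <cassert>
    #include <cstdint>
    #include <vector>

    // Mirror of the layout parseCondBranch stores into Cond:
    //   b.cc                    -> { cc }
    //   cbz/cbnz  Rt, label     -> { -1, opcode, Rt }
    //   tbz/tbnz  Rt, #b, label -> { -1, opcode, Rt, b }
    // A leading -1 marks the folded compare-and-branch forms; a plain Bcc
    // keeps only its condition code.
    enum class CondKind { Bcc, CompareAndBranch, TestAndBranch };

    static CondKind classifyCond(const std::vector<int64_t> &Cond) {
      if (Cond.size() == 1)
        return CondKind::Bcc;                 // Cond[0] is the condition code
      return Cond.size() == 4 ? CondKind::TestAndBranch     // reg + bit number
                              : CondKind::CompareAndBranch; // reg only
    }

    int main() {
      assert(classifyCond({/*cc=*/0}) == CondKind::Bcc);
      assert(classifyCond({-1, /*opc*/1, /*reg*/2}) == CondKind::CompareAndBranch);
      assert(classifyCond({-1, 1, 2, /*bit*/3}) == CondKind::TestAndBranch);
      return 0;
    }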
-bool ARM64InstrInfo::AnalyzeBranch(MachineBasicBlock &MBB, - MachineBasicBlock *&TBB, - MachineBasicBlock *&FBB, - SmallVectorImpl &Cond, - bool AllowModify) const { - // If the block has no terminators, it just falls into the block after it. - MachineBasicBlock::iterator I = MBB.end(); - if (I == MBB.begin()) - return false; - --I; - while (I->isDebugValue()) { - if (I == MBB.begin()) - return false; - --I; - } - if (!isUnpredicatedTerminator(I)) - return false; - - // Get the last instruction in the block. - MachineInstr *LastInst = I; - - // If there is only one terminator instruction, process it. - unsigned LastOpc = LastInst->getOpcode(); - if (I == MBB.begin() || !isUnpredicatedTerminator(--I)) { - if (isUncondBranchOpcode(LastOpc)) { - TBB = LastInst->getOperand(0).getMBB(); - return false; - } - if (isCondBranchOpcode(LastOpc)) { - // Block ends with fall-through condbranch. - parseCondBranch(LastInst, TBB, Cond); - return false; - } - return true; // Can't handle indirect branch. - } - - // Get the instruction before it if it is a terminator. - MachineInstr *SecondLastInst = I; - unsigned SecondLastOpc = SecondLastInst->getOpcode(); - - // If AllowModify is true and the block ends with two or more unconditional - // branches, delete all but the first unconditional branch. - if (AllowModify && isUncondBranchOpcode(LastOpc)) { - while (isUncondBranchOpcode(SecondLastOpc)) { - LastInst->eraseFromParent(); - LastInst = SecondLastInst; - LastOpc = LastInst->getOpcode(); - if (I == MBB.begin() || !isUnpredicatedTerminator(--I)) { - // Return now the only terminator is an unconditional branch. - TBB = LastInst->getOperand(0).getMBB(); - return false; - } else { - SecondLastInst = I; - SecondLastOpc = SecondLastInst->getOpcode(); - } - } - } - - // If there are three terminators, we don't know what sort of block this is. - if (SecondLastInst && I != MBB.begin() && isUnpredicatedTerminator(--I)) - return true; - - // If the block ends with a B and a Bcc, handle it. - if (isCondBranchOpcode(SecondLastOpc) && isUncondBranchOpcode(LastOpc)) { - parseCondBranch(SecondLastInst, TBB, Cond); - FBB = LastInst->getOperand(0).getMBB(); - return false; - } - - // If the block ends with two unconditional branches, handle it. The second - // one is not executed, so remove it. - if (isUncondBranchOpcode(SecondLastOpc) && isUncondBranchOpcode(LastOpc)) { - TBB = SecondLastInst->getOperand(0).getMBB(); - I = LastInst; - if (AllowModify) - I->eraseFromParent(); - return false; - } - - // ...likewise if it ends with an indirect branch followed by an unconditional - // branch. - if (isIndirectBranchOpcode(SecondLastOpc) && isUncondBranchOpcode(LastOpc)) { - I = LastInst; - if (AllowModify) - I->eraseFromParent(); - return true; - } - - // Otherwise, can't handle this. 
- return true; -} - -bool ARM64InstrInfo::ReverseBranchCondition( - SmallVectorImpl &Cond) const { - if (Cond[0].getImm() != -1) { - // Regular Bcc - ARM64CC::CondCode CC = (ARM64CC::CondCode)(int)Cond[0].getImm(); - Cond[0].setImm(ARM64CC::getInvertedCondCode(CC)); - } else { - // Folded compare-and-branch - switch (Cond[1].getImm()) { - default: - llvm_unreachable("Unknown conditional branch!"); - case ARM64::CBZW: - Cond[1].setImm(ARM64::CBNZW); - break; - case ARM64::CBNZW: - Cond[1].setImm(ARM64::CBZW); - break; - case ARM64::CBZX: - Cond[1].setImm(ARM64::CBNZX); - break; - case ARM64::CBNZX: - Cond[1].setImm(ARM64::CBZX); - break; - case ARM64::TBZW: - Cond[1].setImm(ARM64::TBNZW); - break; - case ARM64::TBNZW: - Cond[1].setImm(ARM64::TBZW); - break; - case ARM64::TBZX: - Cond[1].setImm(ARM64::TBNZX); - break; - case ARM64::TBNZX: - Cond[1].setImm(ARM64::TBZX); - break; - } - } - - return false; -} - -unsigned ARM64InstrInfo::RemoveBranch(MachineBasicBlock &MBB) const { - MachineBasicBlock::iterator I = MBB.end(); - if (I == MBB.begin()) - return 0; - --I; - while (I->isDebugValue()) { - if (I == MBB.begin()) - return 0; - --I; - } - if (!isUncondBranchOpcode(I->getOpcode()) && - !isCondBranchOpcode(I->getOpcode())) - return 0; - - // Remove the branch. - I->eraseFromParent(); - - I = MBB.end(); - - if (I == MBB.begin()) - return 1; - --I; - if (!isCondBranchOpcode(I->getOpcode())) - return 1; - - // Remove the branch. - I->eraseFromParent(); - return 2; -} - -void ARM64InstrInfo::instantiateCondBranch( - MachineBasicBlock &MBB, DebugLoc DL, MachineBasicBlock *TBB, - const SmallVectorImpl &Cond) const { - if (Cond[0].getImm() != -1) { - // Regular Bcc - BuildMI(&MBB, DL, get(ARM64::Bcc)).addImm(Cond[0].getImm()).addMBB(TBB); - } else { - // Folded compare-and-branch - const MachineInstrBuilder MIB = - BuildMI(&MBB, DL, get(Cond[1].getImm())).addReg(Cond[2].getReg()); - if (Cond.size() > 3) - MIB.addImm(Cond[3].getImm()); - MIB.addMBB(TBB); - } -} - -unsigned ARM64InstrInfo::InsertBranch( - MachineBasicBlock &MBB, MachineBasicBlock *TBB, MachineBasicBlock *FBB, - const SmallVectorImpl &Cond, DebugLoc DL) const { - // Shouldn't be a fall through. - assert(TBB && "InsertBranch must not be told to insert a fallthrough"); - - if (!FBB) { - if (Cond.empty()) // Unconditional branch? - BuildMI(&MBB, DL, get(ARM64::B)).addMBB(TBB); - else - instantiateCondBranch(MBB, DL, TBB, Cond); - return 1; - } - - // Two-way conditional branch. - instantiateCondBranch(MBB, DL, TBB, Cond); - BuildMI(&MBB, DL, get(ARM64::B)).addMBB(FBB); - return 2; -} - -// Find the original register that VReg is copied from. -static unsigned removeCopies(const MachineRegisterInfo &MRI, unsigned VReg) { - while (TargetRegisterInfo::isVirtualRegister(VReg)) { - const MachineInstr *DefMI = MRI.getVRegDef(VReg); - if (!DefMI->isFullCopy()) - return VReg; - VReg = DefMI->getOperand(1).getReg(); - } - return VReg; -} - -// Determine if VReg is defined by an instruction that can be folded into a -// csel instruction. If so, return the folded opcode, and the replacement -// register. 
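The folds this helper performs rest on three scalar identities: x + 1 can ride on a csinc, ~x (written as orn dst, xzr, src) on a csinv, and -x (sub dst, xzr, src) on a csneg. A minimal, self-contained C++ check of the same arithmetic, with no LLVM types involved:

    #include <cassert>
    #include <cstdint>

    // csinc picks b+1 when the condition is false, csinv picks ~b, csneg
    // picks -b; canFoldIntoCSel below recognises the defining instruction of
    // one select input as one of these forms so the extra operation is free.
    int64_t csincLike(bool cond, int64_t a, int64_t b) { return cond ? a : b + 1; }
    int64_t csinvLike(bool cond, int64_t a, int64_t b) { return cond ? a : ~b; }
    int64_t csnegLike(bool cond, int64_t a, int64_t b) { return cond ? a : -b; }

    int main() {
      assert(csincLike(false, 0, 41) == 42); // "add x, 1" folded into the select
      assert(csinvLike(false, 0, 0) == -1);  // "orn dst, xzr, src" folded
      assert(csnegLike(false, 0, 7) == -7);  // "sub dst, xzr, src" folded
      return 0;
    }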
-static unsigned canFoldIntoCSel(const MachineRegisterInfo &MRI, unsigned VReg, - unsigned *NewVReg = nullptr) { - VReg = removeCopies(MRI, VReg); - if (!TargetRegisterInfo::isVirtualRegister(VReg)) - return 0; - - bool Is64Bit = ARM64::GPR64allRegClass.hasSubClassEq(MRI.getRegClass(VReg)); - const MachineInstr *DefMI = MRI.getVRegDef(VReg); - unsigned Opc = 0; - unsigned SrcOpNum = 0; - switch (DefMI->getOpcode()) { - case ARM64::ADDSXri: - case ARM64::ADDSWri: - // if NZCV is used, do not fold. - if (DefMI->findRegisterDefOperandIdx(ARM64::NZCV, true) == -1) - return 0; - // fall-through to ADDXri and ADDWri. - case ARM64::ADDXri: - case ARM64::ADDWri: - // add x, 1 -> csinc. - if (!DefMI->getOperand(2).isImm() || DefMI->getOperand(2).getImm() != 1 || - DefMI->getOperand(3).getImm() != 0) - return 0; - SrcOpNum = 1; - Opc = Is64Bit ? ARM64::CSINCXr : ARM64::CSINCWr; - break; - - case ARM64::ORNXrr: - case ARM64::ORNWrr: { - // not x -> csinv, represented as orn dst, xzr, src. - unsigned ZReg = removeCopies(MRI, DefMI->getOperand(1).getReg()); - if (ZReg != ARM64::XZR && ZReg != ARM64::WZR) - return 0; - SrcOpNum = 2; - Opc = Is64Bit ? ARM64::CSINVXr : ARM64::CSINVWr; - break; - } - - case ARM64::SUBSXrr: - case ARM64::SUBSWrr: - // if NZCV is used, do not fold. - if (DefMI->findRegisterDefOperandIdx(ARM64::NZCV, true) == -1) - return 0; - // fall-through to SUBXrr and SUBWrr. - case ARM64::SUBXrr: - case ARM64::SUBWrr: { - // neg x -> csneg, represented as sub dst, xzr, src. - unsigned ZReg = removeCopies(MRI, DefMI->getOperand(1).getReg()); - if (ZReg != ARM64::XZR && ZReg != ARM64::WZR) - return 0; - SrcOpNum = 2; - Opc = Is64Bit ? ARM64::CSNEGXr : ARM64::CSNEGWr; - break; - } - default: - return 0; - } - assert(Opc && SrcOpNum && "Missing parameters"); - - if (NewVReg) - *NewVReg = DefMI->getOperand(SrcOpNum).getReg(); - return Opc; -} - -bool ARM64InstrInfo::canInsertSelect( - const MachineBasicBlock &MBB, const SmallVectorImpl &Cond, - unsigned TrueReg, unsigned FalseReg, int &CondCycles, int &TrueCycles, - int &FalseCycles) const { - // Check register classes. - const MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); - const TargetRegisterClass *RC = - RI.getCommonSubClass(MRI.getRegClass(TrueReg), MRI.getRegClass(FalseReg)); - if (!RC) - return false; - - // Expanding cbz/tbz requires an extra cycle of latency on the condition. - unsigned ExtraCondLat = Cond.size() != 1; - - // GPRs are handled by csel. - // FIXME: Fold in x+1, -x, and ~x when applicable. - if (ARM64::GPR64allRegClass.hasSubClassEq(RC) || - ARM64::GPR32allRegClass.hasSubClassEq(RC)) { - // Single-cycle csel, csinc, csinv, and csneg. - CondCycles = 1 + ExtraCondLat; - TrueCycles = FalseCycles = 1; - if (canFoldIntoCSel(MRI, TrueReg)) - TrueCycles = 0; - else if (canFoldIntoCSel(MRI, FalseReg)) - FalseCycles = 0; - return true; - } - - // Scalar floating point is handled by fcsel. - // FIXME: Form fabs, fmin, and fmax when applicable. - if (ARM64::FPR64RegClass.hasSubClassEq(RC) || - ARM64::FPR32RegClass.hasSubClassEq(RC)) { - CondCycles = 5 + ExtraCondLat; - TrueCycles = FalseCycles = 2; - return true; - } - - // Can't do vectors. - return false; -} - -void ARM64InstrInfo::insertSelect(MachineBasicBlock &MBB, - MachineBasicBlock::iterator I, DebugLoc DL, - unsigned DstReg, - const SmallVectorImpl &Cond, - unsigned TrueReg, unsigned FalseReg) const { - MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); - - // Parse the condition code, see parseCondBranch() above. 
- ARM64CC::CondCode CC; - switch (Cond.size()) { - default: - llvm_unreachable("Unknown condition opcode in Cond"); - case 1: // b.cc - CC = ARM64CC::CondCode(Cond[0].getImm()); - break; - case 3: { // cbz/cbnz - // We must insert a compare against 0. - bool Is64Bit; - switch (Cond[1].getImm()) { - default: - llvm_unreachable("Unknown branch opcode in Cond"); - case ARM64::CBZW: - Is64Bit = 0; - CC = ARM64CC::EQ; - break; - case ARM64::CBZX: - Is64Bit = 1; - CC = ARM64CC::EQ; - break; - case ARM64::CBNZW: - Is64Bit = 0; - CC = ARM64CC::NE; - break; - case ARM64::CBNZX: - Is64Bit = 1; - CC = ARM64CC::NE; - break; - } - unsigned SrcReg = Cond[2].getReg(); - if (Is64Bit) { - // cmp reg, #0 is actually subs xzr, reg, #0. - MRI.constrainRegClass(SrcReg, &ARM64::GPR64spRegClass); - BuildMI(MBB, I, DL, get(ARM64::SUBSXri), ARM64::XZR) - .addReg(SrcReg) - .addImm(0) - .addImm(0); - } else { - MRI.constrainRegClass(SrcReg, &ARM64::GPR32spRegClass); - BuildMI(MBB, I, DL, get(ARM64::SUBSWri), ARM64::WZR) - .addReg(SrcReg) - .addImm(0) - .addImm(0); - } - break; - } - case 4: { // tbz/tbnz - // We must insert a tst instruction. - switch (Cond[1].getImm()) { - default: - llvm_unreachable("Unknown branch opcode in Cond"); - case ARM64::TBZW: - case ARM64::TBZX: - CC = ARM64CC::EQ; - break; - case ARM64::TBNZW: - case ARM64::TBNZX: - CC = ARM64CC::NE; - break; - } - // cmp reg, #foo is actually ands xzr, reg, #1< 64 bit extension case, these instructions can do - // much more. - if (MI.getOperand(2).getImm() != 0 || MI.getOperand(3).getImm() != 31) - return false; - // This is a signed or unsigned 32 -> 64 bit extension. - SrcReg = MI.getOperand(1).getReg(); - DstReg = MI.getOperand(0).getReg(); - SubIdx = ARM64::sub_32; - return true; - } -} - -/// analyzeCompare - For a comparison instruction, return the source registers -/// in SrcReg and SrcReg2, and the value it compares against in CmpValue. -/// Return true if the comparison instruction can be analyzed. -bool ARM64InstrInfo::analyzeCompare(const MachineInstr *MI, unsigned &SrcReg, - unsigned &SrcReg2, int &CmpMask, - int &CmpValue) const { - switch (MI->getOpcode()) { - default: - break; - case ARM64::SUBSWrr: - case ARM64::SUBSWrs: - case ARM64::SUBSWrx: - case ARM64::SUBSXrr: - case ARM64::SUBSXrs: - case ARM64::SUBSXrx: - case ARM64::ADDSWrr: - case ARM64::ADDSWrs: - case ARM64::ADDSWrx: - case ARM64::ADDSXrr: - case ARM64::ADDSXrs: - case ARM64::ADDSXrx: - // Replace SUBSWrr with SUBWrr if NZCV is not used. - SrcReg = MI->getOperand(1).getReg(); - SrcReg2 = MI->getOperand(2).getReg(); - CmpMask = ~0; - CmpValue = 0; - return true; - case ARM64::SUBSWri: - case ARM64::ADDSWri: - case ARM64::SUBSXri: - case ARM64::ADDSXri: - SrcReg = MI->getOperand(1).getReg(); - SrcReg2 = 0; - CmpMask = ~0; - CmpValue = MI->getOperand(2).getImm(); - return true; - case ARM64::ANDSWri: - case ARM64::ANDSXri: - // ANDS does not use the same encoding scheme as the others xxxS - // instructions. - SrcReg = MI->getOperand(1).getReg(); - SrcReg2 = 0; - CmpMask = ~0; - CmpValue = ARM64_AM::decodeLogicalImmediate( - MI->getOperand(2).getImm(), - MI->getOpcode() == ARM64::ANDSWri ? 
32 : 64); - return true; - } - - return false; -} - -static bool UpdateOperandRegClass(MachineInstr *Instr) { - MachineBasicBlock *MBB = Instr->getParent(); - assert(MBB && "Can't get MachineBasicBlock here"); - MachineFunction *MF = MBB->getParent(); - assert(MF && "Can't get MachineFunction here"); - const TargetMachine *TM = &MF->getTarget(); - const TargetInstrInfo *TII = TM->getInstrInfo(); - const TargetRegisterInfo *TRI = TM->getRegisterInfo(); - MachineRegisterInfo *MRI = &MF->getRegInfo(); - - for (unsigned OpIdx = 0, EndIdx = Instr->getNumOperands(); OpIdx < EndIdx; - ++OpIdx) { - MachineOperand &MO = Instr->getOperand(OpIdx); - const TargetRegisterClass *OpRegCstraints = - Instr->getRegClassConstraint(OpIdx, TII, TRI); - - // If there's no constraint, there's nothing to do. - if (!OpRegCstraints) - continue; - // If the operand is a frame index, there's nothing to do here. - // A frame index operand will resolve correctly during PEI. - if (MO.isFI()) - continue; - - assert(MO.isReg() && - "Operand has register constraints without being a register!"); - - unsigned Reg = MO.getReg(); - if (TargetRegisterInfo::isPhysicalRegister(Reg)) { - if (!OpRegCstraints->contains(Reg)) - return false; - } else if (!OpRegCstraints->hasSubClassEq(MRI->getRegClass(Reg)) && - !MRI->constrainRegClass(Reg, OpRegCstraints)) - return false; - } - - return true; -} - -/// optimizeCompareInstr - Convert the instruction supplying the argument to the -/// comparison into one that sets the zero bit in the flags register. -bool ARM64InstrInfo::optimizeCompareInstr( - MachineInstr *CmpInstr, unsigned SrcReg, unsigned SrcReg2, int CmpMask, - int CmpValue, const MachineRegisterInfo *MRI) const { - - // Replace SUBSWrr with SUBWrr if NZCV is not used. - int Cmp_NZCV = CmpInstr->findRegisterDefOperandIdx(ARM64::NZCV, true); - if (Cmp_NZCV != -1) { - unsigned NewOpc; - switch (CmpInstr->getOpcode()) { - default: - return false; - case ARM64::ADDSWrr: NewOpc = ARM64::ADDWrr; break; - case ARM64::ADDSWri: NewOpc = ARM64::ADDWri; break; - case ARM64::ADDSWrs: NewOpc = ARM64::ADDWrs; break; - case ARM64::ADDSWrx: NewOpc = ARM64::ADDWrx; break; - case ARM64::ADDSXrr: NewOpc = ARM64::ADDXrr; break; - case ARM64::ADDSXri: NewOpc = ARM64::ADDXri; break; - case ARM64::ADDSXrs: NewOpc = ARM64::ADDXrs; break; - case ARM64::ADDSXrx: NewOpc = ARM64::ADDXrx; break; - case ARM64::SUBSWrr: NewOpc = ARM64::SUBWrr; break; - case ARM64::SUBSWri: NewOpc = ARM64::SUBWri; break; - case ARM64::SUBSWrs: NewOpc = ARM64::SUBWrs; break; - case ARM64::SUBSWrx: NewOpc = ARM64::SUBWrx; break; - case ARM64::SUBSXrr: NewOpc = ARM64::SUBXrr; break; - case ARM64::SUBSXri: NewOpc = ARM64::SUBXri; break; - case ARM64::SUBSXrs: NewOpc = ARM64::SUBXrs; break; - case ARM64::SUBSXrx: NewOpc = ARM64::SUBXrx; break; - } - - const MCInstrDesc &MCID = get(NewOpc); - CmpInstr->setDesc(MCID); - CmpInstr->RemoveOperand(Cmp_NZCV); - bool succeeded = UpdateOperandRegClass(CmpInstr); - (void)succeeded; - assert(succeeded && "Some operands reg class are incompatible!"); - return true; - } - - // Continue only if we have a "ri" where immediate is zero. - if (CmpValue != 0 || SrcReg2 != 0) - return false; - - // CmpInstr is a Compare instruction if destination register is not used. - if (!MRI->use_nodbg_empty(CmpInstr->getOperand(0).getReg())) - return false; - - // Get the unique definition of SrcReg. 
- MachineInstr *MI = MRI->getUniqueVRegDef(SrcReg); - if (!MI) - return false; - - // We iterate backward, starting from the instruction before CmpInstr and - // stop when reaching the definition of the source register or done with the - // basic block, to check whether NZCV is used or modified in between. - MachineBasicBlock::iterator I = CmpInstr, E = MI, - B = CmpInstr->getParent()->begin(); - - // Early exit if CmpInstr is at the beginning of the BB. - if (I == B) - return false; - - // Check whether the definition of SrcReg is in the same basic block as - // Compare. If not, we can't optimize away the Compare. - if (MI->getParent() != CmpInstr->getParent()) - return false; - - // Check that NZCV isn't set between the comparison instruction and the one we - // want to change. - const TargetRegisterInfo *TRI = &getRegisterInfo(); - for (--I; I != E; --I) { - const MachineInstr &Instr = *I; - - if (Instr.modifiesRegister(ARM64::NZCV, TRI) || - Instr.readsRegister(ARM64::NZCV, TRI)) - // This instruction modifies or uses NZCV after the one we want to - // change. We can't do this transformation. - return false; - if (I == B) - // The 'and' is below the comparison instruction. - return false; - } - - unsigned NewOpc = MI->getOpcode(); - switch (MI->getOpcode()) { - default: - return false; - case ARM64::ADDSWrr: - case ARM64::ADDSWri: - case ARM64::ADDSXrr: - case ARM64::ADDSXri: - case ARM64::SUBSWrr: - case ARM64::SUBSWri: - case ARM64::SUBSXrr: - case ARM64::SUBSXri: - break; - case ARM64::ADDWrr: NewOpc = ARM64::ADDSWrr; break; - case ARM64::ADDWri: NewOpc = ARM64::ADDSWri; break; - case ARM64::ADDXrr: NewOpc = ARM64::ADDSXrr; break; - case ARM64::ADDXri: NewOpc = ARM64::ADDSXri; break; - case ARM64::ADCWr: NewOpc = ARM64::ADCSWr; break; - case ARM64::ADCXr: NewOpc = ARM64::ADCSXr; break; - case ARM64::SUBWrr: NewOpc = ARM64::SUBSWrr; break; - case ARM64::SUBWri: NewOpc = ARM64::SUBSWri; break; - case ARM64::SUBXrr: NewOpc = ARM64::SUBSXrr; break; - case ARM64::SUBXri: NewOpc = ARM64::SUBSXri; break; - case ARM64::SBCWr: NewOpc = ARM64::SBCSWr; break; - case ARM64::SBCXr: NewOpc = ARM64::SBCSXr; break; - case ARM64::ANDWri: NewOpc = ARM64::ANDSWri; break; - case ARM64::ANDXri: NewOpc = ARM64::ANDSXri; break; - } - - // Scan forward for the use of NZCV. - // When checking against MI: if it's a conditional code requires - // checking of V bit, then this is not safe to do. - // It is safe to remove CmpInstr if NZCV is redefined or killed. - // If we are done with the basic block, we need to check whether NZCV is - // live-out. - bool IsSafe = false; - for (MachineBasicBlock::iterator I = CmpInstr, - E = CmpInstr->getParent()->end(); - !IsSafe && ++I != E;) { - const MachineInstr &Instr = *I; - for (unsigned IO = 0, EO = Instr.getNumOperands(); !IsSafe && IO != EO; - ++IO) { - const MachineOperand &MO = Instr.getOperand(IO); - if (MO.isRegMask() && MO.clobbersPhysReg(ARM64::NZCV)) { - IsSafe = true; - break; - } - if (!MO.isReg() || MO.getReg() != ARM64::NZCV) - continue; - if (MO.isDef()) { - IsSafe = true; - break; - } - - // Decode the condition code. 
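The safety check a few lines further down rejects any user of a V-reading condition: replacing "cmp reg, #0" with a flag-setting form of reg's defining instruction keeps N and Z (both are derived from the value of reg) but does not necessarily reproduce V. A tiny standalone table of which AArch64 conditions consult the overflow flag; the helper name is invented for illustration:

    #include <cassert>

    // Conditions whose truth depends on V cannot survive removing the compare.
    enum Cond { EQ, NE, HS, LO, MI, PL, VS, VC, HI, LS, GE, LT, GT, LE };

    static bool readsOverflowFlag(Cond C) {
      switch (C) {
      case VS: case VC:                   // explicit V tests
      case GE: case LT: case GT: case LE: // signed comparisons use N == V
        return true;
      default:
        return false;                     // EQ/NE/MI/PL/HS/LO/HI/LS ignore V
      }
    }

    int main() {
      assert(readsOverflowFlag(GE) && !readsOverflowFlag(EQ));
      return 0;
    }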
- unsigned Opc = Instr.getOpcode(); - ARM64CC::CondCode CC; - switch (Opc) { - default: - return false; - case ARM64::Bcc: - CC = (ARM64CC::CondCode)Instr.getOperand(IO - 2).getImm(); - break; - case ARM64::CSINVWr: - case ARM64::CSINVXr: - case ARM64::CSINCWr: - case ARM64::CSINCXr: - case ARM64::CSELWr: - case ARM64::CSELXr: - case ARM64::CSNEGWr: - case ARM64::CSNEGXr: - case ARM64::FCSELSrrr: - case ARM64::FCSELDrrr: - CC = (ARM64CC::CondCode)Instr.getOperand(IO - 1).getImm(); - break; - } - - // It is not safe to remove Compare instruction if Overflow(V) is used. - switch (CC) { - default: - // NZCV can be used multiple times, we should continue. - break; - case ARM64CC::VS: - case ARM64CC::VC: - case ARM64CC::GE: - case ARM64CC::LT: - case ARM64CC::GT: - case ARM64CC::LE: - return false; - } - } - } - - // If NZCV is not killed nor re-defined, we should check whether it is - // live-out. If it is live-out, do not optimize. - if (!IsSafe) { - MachineBasicBlock *ParentBlock = CmpInstr->getParent(); - for (auto *MBB : ParentBlock->successors()) - if (MBB->isLiveIn(ARM64::NZCV)) - return false; - } - - // Update the instruction to set NZCV. - MI->setDesc(get(NewOpc)); - CmpInstr->eraseFromParent(); - bool succeeded = UpdateOperandRegClass(MI); - (void)succeeded; - assert(succeeded && "Some operands reg class are incompatible!"); - MI->addRegisterDefined(ARM64::NZCV, TRI); - return true; -} - -/// Return true if this is this instruction has a non-zero immediate -bool ARM64InstrInfo::hasShiftedReg(const MachineInstr *MI) const { - switch (MI->getOpcode()) { - default: - break; - case ARM64::ADDSWrs: - case ARM64::ADDSXrs: - case ARM64::ADDWrs: - case ARM64::ADDXrs: - case ARM64::ANDSWrs: - case ARM64::ANDSXrs: - case ARM64::ANDWrs: - case ARM64::ANDXrs: - case ARM64::BICSWrs: - case ARM64::BICSXrs: - case ARM64::BICWrs: - case ARM64::BICXrs: - case ARM64::CRC32Brr: - case ARM64::CRC32CBrr: - case ARM64::CRC32CHrr: - case ARM64::CRC32CWrr: - case ARM64::CRC32CXrr: - case ARM64::CRC32Hrr: - case ARM64::CRC32Wrr: - case ARM64::CRC32Xrr: - case ARM64::EONWrs: - case ARM64::EONXrs: - case ARM64::EORWrs: - case ARM64::EORXrs: - case ARM64::ORNWrs: - case ARM64::ORNXrs: - case ARM64::ORRWrs: - case ARM64::ORRXrs: - case ARM64::SUBSWrs: - case ARM64::SUBSXrs: - case ARM64::SUBWrs: - case ARM64::SUBXrs: - if (MI->getOperand(3).isImm()) { - unsigned val = MI->getOperand(3).getImm(); - return (val != 0); - } - break; - } - return false; -} - -/// Return true if this is this instruction has a non-zero immediate -bool ARM64InstrInfo::hasExtendedReg(const MachineInstr *MI) const { - switch (MI->getOpcode()) { - default: - break; - case ARM64::ADDSWrx: - case ARM64::ADDSXrx: - case ARM64::ADDSXrx64: - case ARM64::ADDWrx: - case ARM64::ADDXrx: - case ARM64::ADDXrx64: - case ARM64::SUBSWrx: - case ARM64::SUBSXrx: - case ARM64::SUBSXrx64: - case ARM64::SUBWrx: - case ARM64::SUBXrx: - case ARM64::SUBXrx64: - if (MI->getOperand(3).isImm()) { - unsigned val = MI->getOperand(3).getImm(); - return (val != 0); - } - break; - } - - return false; -} - -// Return true if this instruction simply sets its single destination register -// to zero. This is equivalent to a register rename of the zero-register. 
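Since WZR/XZR always read as zero, each idiom recognised by the next function really is just "write 0"; a plain-arithmetic sketch of the three recognised forms (values illustrative only):

    #include <cassert>
    #include <cstdint>

    //   movz Rd, #0, lsl #0   ->  0 << 0   == 0
    //   and  Rd, wzr, #imm    ->  0 & imm  == 0
    //   copy Rd, wzr          ->  0
    uint64_t movz(uint16_t imm16, unsigned shift) { return uint64_t(imm16) << shift; }

    int main() {
      assert(movz(0, 0) == 0);                   // the MOVZWi/MOVZXi case
      assert((UINT64_C(0) & 0xff00ffull) == 0);  // the ANDWri/ANDXri-with-zero-reg case
      return 0;
    }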
-bool ARM64InstrInfo::isGPRZero(const MachineInstr *MI) const { - switch (MI->getOpcode()) { - default: - break; - case ARM64::MOVZWi: - case ARM64::MOVZXi: // movz Rd, #0 (LSL #0) - if (MI->getOperand(1).isImm() && MI->getOperand(1).getImm() == 0) { - assert(MI->getDesc().getNumOperands() == 3 && - MI->getOperand(2).getImm() == 0 && "invalid MOVZi operands"); - return true; - } - break; - case ARM64::ANDWri: // and Rd, Rzr, #imm - return MI->getOperand(1).getReg() == ARM64::WZR; - case ARM64::ANDXri: - return MI->getOperand(1).getReg() == ARM64::XZR; - case TargetOpcode::COPY: - return MI->getOperand(1).getReg() == ARM64::WZR; - } - return false; -} - -// Return true if this instruction simply renames a general register without -// modifying bits. -bool ARM64InstrInfo::isGPRCopy(const MachineInstr *MI) const { - switch (MI->getOpcode()) { - default: - break; - case TargetOpcode::COPY: { - // GPR32 copies will by lowered to ORRXrs - unsigned DstReg = MI->getOperand(0).getReg(); - return (ARM64::GPR32RegClass.contains(DstReg) || - ARM64::GPR64RegClass.contains(DstReg)); - } - case ARM64::ORRXrs: // orr Xd, Xzr, Xm (LSL #0) - if (MI->getOperand(1).getReg() == ARM64::XZR) { - assert(MI->getDesc().getNumOperands() == 4 && - MI->getOperand(3).getImm() == 0 && "invalid ORRrs operands"); - return true; - } - case ARM64::ADDXri: // add Xd, Xn, #0 (LSL #0) - if (MI->getOperand(2).getImm() == 0) { - assert(MI->getDesc().getNumOperands() == 4 && - MI->getOperand(3).getImm() == 0 && "invalid ADDXri operands"); - return true; - } - } - return false; -} - -// Return true if this instruction simply renames a general register without -// modifying bits. -bool ARM64InstrInfo::isFPRCopy(const MachineInstr *MI) const { - switch (MI->getOpcode()) { - default: - break; - case TargetOpcode::COPY: { - // FPR64 copies will by lowered to ORR.16b - unsigned DstReg = MI->getOperand(0).getReg(); - return (ARM64::FPR64RegClass.contains(DstReg) || - ARM64::FPR128RegClass.contains(DstReg)); - } - case ARM64::ORRv16i8: - if (MI->getOperand(1).getReg() == MI->getOperand(2).getReg()) { - assert(MI->getDesc().getNumOperands() == 3 && MI->getOperand(0).isReg() && - "invalid ORRv16i8 operands"); - return true; - } - } - return false; -} - -unsigned ARM64InstrInfo::isLoadFromStackSlot(const MachineInstr *MI, - int &FrameIndex) const { - switch (MI->getOpcode()) { - default: - break; - case ARM64::LDRWui: - case ARM64::LDRXui: - case ARM64::LDRBui: - case ARM64::LDRHui: - case ARM64::LDRSui: - case ARM64::LDRDui: - case ARM64::LDRQui: - if (MI->getOperand(0).getSubReg() == 0 && MI->getOperand(1).isFI() && - MI->getOperand(2).isImm() && MI->getOperand(2).getImm() == 0) { - FrameIndex = MI->getOperand(1).getIndex(); - return MI->getOperand(0).getReg(); - } - break; - } - - return 0; -} - -unsigned ARM64InstrInfo::isStoreToStackSlot(const MachineInstr *MI, - int &FrameIndex) const { - switch (MI->getOpcode()) { - default: - break; - case ARM64::STRWui: - case ARM64::STRXui: - case ARM64::STRBui: - case ARM64::STRHui: - case ARM64::STRSui: - case ARM64::STRDui: - case ARM64::STRQui: - if (MI->getOperand(0).getSubReg() == 0 && MI->getOperand(1).isFI() && - MI->getOperand(2).isImm() && MI->getOperand(2).getImm() == 0) { - FrameIndex = MI->getOperand(1).getIndex(); - return MI->getOperand(0).getReg(); - } - break; - } - return 0; -} - -/// Return true if this is load/store scales or extends its register offset. -/// This refers to scaling a dynamic index as opposed to scaled immediates. 
-/// MI should be a memory op that allows scaled addressing. -bool ARM64InstrInfo::isScaledAddr(const MachineInstr *MI) const { - switch (MI->getOpcode()) { - default: - break; - case ARM64::LDRBBroW: - case ARM64::LDRBroW: - case ARM64::LDRDroW: - case ARM64::LDRHHroW: - case ARM64::LDRHroW: - case ARM64::LDRQroW: - case ARM64::LDRSBWroW: - case ARM64::LDRSBXroW: - case ARM64::LDRSHWroW: - case ARM64::LDRSHXroW: - case ARM64::LDRSWroW: - case ARM64::LDRSroW: - case ARM64::LDRWroW: - case ARM64::LDRXroW: - case ARM64::STRBBroW: - case ARM64::STRBroW: - case ARM64::STRDroW: - case ARM64::STRHHroW: - case ARM64::STRHroW: - case ARM64::STRQroW: - case ARM64::STRSroW: - case ARM64::STRWroW: - case ARM64::STRXroW: - case ARM64::LDRBBroX: - case ARM64::LDRBroX: - case ARM64::LDRDroX: - case ARM64::LDRHHroX: - case ARM64::LDRHroX: - case ARM64::LDRQroX: - case ARM64::LDRSBWroX: - case ARM64::LDRSBXroX: - case ARM64::LDRSHWroX: - case ARM64::LDRSHXroX: - case ARM64::LDRSWroX: - case ARM64::LDRSroX: - case ARM64::LDRWroX: - case ARM64::LDRXroX: - case ARM64::STRBBroX: - case ARM64::STRBroX: - case ARM64::STRDroX: - case ARM64::STRHHroX: - case ARM64::STRHroX: - case ARM64::STRQroX: - case ARM64::STRSroX: - case ARM64::STRWroX: - case ARM64::STRXroX: - - unsigned Val = MI->getOperand(3).getImm(); - ARM64_AM::ShiftExtendType ExtType = ARM64_AM::getMemExtendType(Val); - return (ExtType != ARM64_AM::UXTX) || ARM64_AM::getMemDoShift(Val); - } - return false; -} - -/// Check all MachineMemOperands for a hint to suppress pairing. -bool ARM64InstrInfo::isLdStPairSuppressed(const MachineInstr *MI) const { - assert(MOSuppressPair < (1 << MachineMemOperand::MOTargetNumBits) && - "Too many target MO flags"); - for (auto *MM : MI->memoperands()) { - if (MM->getFlags() & - (MOSuppressPair << MachineMemOperand::MOTargetStartBit)) { - return true; - } - } - return false; -} - -/// Set a flag on the first MachineMemOperand to suppress pairing. -void ARM64InstrInfo::suppressLdStPair(MachineInstr *MI) const { - if (MI->memoperands_empty()) - return; - - assert(MOSuppressPair < (1 << MachineMemOperand::MOTargetNumBits) && - "Too many target MO flags"); - (*MI->memoperands_begin()) - ->setFlags(MOSuppressPair << MachineMemOperand::MOTargetStartBit); -} - -bool ARM64InstrInfo::getLdStBaseRegImmOfs(MachineInstr *LdSt, unsigned &BaseReg, - unsigned &Offset, - const TargetRegisterInfo *TRI) const { - switch (LdSt->getOpcode()) { - default: - return false; - case ARM64::STRSui: - case ARM64::STRDui: - case ARM64::STRQui: - case ARM64::STRXui: - case ARM64::STRWui: - case ARM64::LDRSui: - case ARM64::LDRDui: - case ARM64::LDRQui: - case ARM64::LDRXui: - case ARM64::LDRWui: - if (!LdSt->getOperand(1).isReg() || !LdSt->getOperand(2).isImm()) - return false; - BaseReg = LdSt->getOperand(1).getReg(); - MachineFunction &MF = *LdSt->getParent()->getParent(); - unsigned Width = getRegClass(LdSt->getDesc(), 0, TRI, MF)->getSize(); - Offset = LdSt->getOperand(2).getImm() * Width; - return true; - }; -} - -/// Detect opportunities for ldp/stp formation. -/// -/// Only called for LdSt for which getLdStBaseRegImmOfs returns true. -bool ARM64InstrInfo::shouldClusterLoads(MachineInstr *FirstLdSt, - MachineInstr *SecondLdSt, - unsigned NumLoads) const { - // Only cluster up to a single pair. - if (NumLoads > 1) - return false; - if (FirstLdSt->getOpcode() != SecondLdSt->getOpcode()) - return false; - // getLdStBaseRegImmOfs guarantees that oper 2 isImm. 
- unsigned Ofs1 = FirstLdSt->getOperand(2).getImm(); - // Allow 6 bits of positive range. - if (Ofs1 > 64) - return false; - // The caller should already have ordered First/SecondLdSt by offset. - unsigned Ofs2 = SecondLdSt->getOperand(2).getImm(); - return Ofs1 + 1 == Ofs2; -} - -bool ARM64InstrInfo::shouldScheduleAdjacent(MachineInstr *First, - MachineInstr *Second) const { - // Cyclone can fuse CMN, CMP followed by Bcc. - - // FIXME: B0 can also fuse: - // AND, BIC, ORN, ORR, or EOR (optional S) followed by Bcc or CBZ or CBNZ. - if (Second->getOpcode() != ARM64::Bcc) - return false; - switch (First->getOpcode()) { - default: - return false; - case ARM64::SUBSWri: - case ARM64::ADDSWri: - case ARM64::ANDSWri: - case ARM64::SUBSXri: - case ARM64::ADDSXri: - case ARM64::ANDSXri: - return true; - } -} - -MachineInstr *ARM64InstrInfo::emitFrameIndexDebugValue(MachineFunction &MF, - int FrameIx, - uint64_t Offset, - const MDNode *MDPtr, - DebugLoc DL) const { - MachineInstrBuilder MIB = BuildMI(MF, DL, get(ARM64::DBG_VALUE)) - .addFrameIndex(FrameIx) - .addImm(0) - .addImm(Offset) - .addMetadata(MDPtr); - return &*MIB; -} - -static const MachineInstrBuilder &AddSubReg(const MachineInstrBuilder &MIB, - unsigned Reg, unsigned SubIdx, - unsigned State, - const TargetRegisterInfo *TRI) { - if (!SubIdx) - return MIB.addReg(Reg, State); - - if (TargetRegisterInfo::isPhysicalRegister(Reg)) - return MIB.addReg(TRI->getSubReg(Reg, SubIdx), State); - return MIB.addReg(Reg, State, SubIdx); -} - -static bool forwardCopyWillClobberTuple(unsigned DestReg, unsigned SrcReg, - unsigned NumRegs) { - // We really want the positive remainder mod 32 here, that happens to be - // easily obtainable with a mask. - return ((DestReg - SrcReg) & 0x1f) < NumRegs; -} - -void ARM64InstrInfo::copyPhysRegTuple(MachineBasicBlock &MBB, - MachineBasicBlock::iterator I, - DebugLoc DL, unsigned DestReg, - unsigned SrcReg, bool KillSrc, - unsigned Opcode, - llvm::ArrayRef Indices) const { - assert(getSubTarget().hasNEON() && - "Unexpected register copy without NEON"); - const TargetRegisterInfo *TRI = &getRegisterInfo(); - uint16_t DestEncoding = TRI->getEncodingValue(DestReg); - uint16_t SrcEncoding = TRI->getEncodingValue(SrcReg); - unsigned NumRegs = Indices.size(); - - int SubReg = 0, End = NumRegs, Incr = 1; - if (forwardCopyWillClobberTuple(DestEncoding, SrcEncoding, NumRegs)) { - SubReg = NumRegs - 1; - End = -1; - Incr = -1; - } - - for (; SubReg != End; SubReg += Incr) { - const MachineInstrBuilder &MIB = BuildMI(MBB, I, DL, get(Opcode)); - AddSubReg(MIB, DestReg, Indices[SubReg], RegState::Define, TRI); - AddSubReg(MIB, SrcReg, Indices[SubReg], 0, TRI); - AddSubReg(MIB, SrcReg, Indices[SubReg], getKillRegState(KillSrc), TRI); - } -} - -void ARM64InstrInfo::copyPhysReg(MachineBasicBlock &MBB, - MachineBasicBlock::iterator I, DebugLoc DL, - unsigned DestReg, unsigned SrcReg, - bool KillSrc) const { - if (ARM64::GPR32spRegClass.contains(DestReg) && - (ARM64::GPR32spRegClass.contains(SrcReg) || SrcReg == ARM64::WZR)) { - const TargetRegisterInfo *TRI = &getRegisterInfo(); - - if (DestReg == ARM64::WSP || SrcReg == ARM64::WSP) { - // If either operand is WSP, expand to ADD #0. - if (Subtarget.hasZeroCycleRegMove()) { - // Cyclone recognizes "ADD Xd, Xn, #0" as a zero-cycle register move. 
- unsigned DestRegX = TRI->getMatchingSuperReg(DestReg, ARM64::sub_32, - &ARM64::GPR64spRegClass); - unsigned SrcRegX = TRI->getMatchingSuperReg(SrcReg, ARM64::sub_32, - &ARM64::GPR64spRegClass); - // This instruction is reading and writing X registers. This may upset - // the register scavenger and machine verifier, so we need to indicate - // that we are reading an undefined value from SrcRegX, but a proper - // value from SrcReg. - BuildMI(MBB, I, DL, get(ARM64::ADDXri), DestRegX) - .addReg(SrcRegX, RegState::Undef) - .addImm(0) - .addImm(ARM64_AM::getShifterImm(ARM64_AM::LSL, 0)) - .addReg(SrcReg, RegState::Implicit | getKillRegState(KillSrc)); - } else { - BuildMI(MBB, I, DL, get(ARM64::ADDWri), DestReg) - .addReg(SrcReg, getKillRegState(KillSrc)) - .addImm(0) - .addImm(ARM64_AM::getShifterImm(ARM64_AM::LSL, 0)); - } - } else if (SrcReg == ARM64::WZR && Subtarget.hasZeroCycleZeroing()) { - BuildMI(MBB, I, DL, get(ARM64::MOVZWi), DestReg).addImm(0).addImm( - ARM64_AM::getShifterImm(ARM64_AM::LSL, 0)); - } else { - if (Subtarget.hasZeroCycleRegMove()) { - // Cyclone recognizes "ORR Xd, XZR, Xm" as a zero-cycle register move. - unsigned DestRegX = TRI->getMatchingSuperReg(DestReg, ARM64::sub_32, - &ARM64::GPR64spRegClass); - unsigned SrcRegX = TRI->getMatchingSuperReg(SrcReg, ARM64::sub_32, - &ARM64::GPR64spRegClass); - // This instruction is reading and writing X registers. This may upset - // the register scavenger and machine verifier, so we need to indicate - // that we are reading an undefined value from SrcRegX, but a proper - // value from SrcReg. - BuildMI(MBB, I, DL, get(ARM64::ORRXrr), DestRegX) - .addReg(ARM64::XZR) - .addReg(SrcRegX, RegState::Undef) - .addReg(SrcReg, RegState::Implicit | getKillRegState(KillSrc)); - } else { - // Otherwise, expand to ORR WZR. - BuildMI(MBB, I, DL, get(ARM64::ORRWrr), DestReg) - .addReg(ARM64::WZR) - .addReg(SrcReg, getKillRegState(KillSrc)); - } - } - return; - } - - if (ARM64::GPR64spRegClass.contains(DestReg) && - (ARM64::GPR64spRegClass.contains(SrcReg) || SrcReg == ARM64::XZR)) { - if (DestReg == ARM64::SP || SrcReg == ARM64::SP) { - // If either operand is SP, expand to ADD #0. - BuildMI(MBB, I, DL, get(ARM64::ADDXri), DestReg) - .addReg(SrcReg, getKillRegState(KillSrc)) - .addImm(0) - .addImm(ARM64_AM::getShifterImm(ARM64_AM::LSL, 0)); - } else if (SrcReg == ARM64::XZR && Subtarget.hasZeroCycleZeroing()) { - BuildMI(MBB, I, DL, get(ARM64::MOVZXi), DestReg).addImm(0).addImm( - ARM64_AM::getShifterImm(ARM64_AM::LSL, 0)); - } else { - // Otherwise, expand to ORR XZR. - BuildMI(MBB, I, DL, get(ARM64::ORRXrr), DestReg) - .addReg(ARM64::XZR) - .addReg(SrcReg, getKillRegState(KillSrc)); - } - return; - } - - // Copy a DDDD register quad by copying the individual sub-registers. - if (ARM64::DDDDRegClass.contains(DestReg) && - ARM64::DDDDRegClass.contains(SrcReg)) { - static const unsigned Indices[] = { ARM64::dsub0, ARM64::dsub1, - ARM64::dsub2, ARM64::dsub3 }; - copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, ARM64::ORRv8i8, - Indices); - return; - } - - // Copy a DDD register triple by copying the individual sub-registers. - if (ARM64::DDDRegClass.contains(DestReg) && - ARM64::DDDRegClass.contains(SrcReg)) { - static const unsigned Indices[] = { ARM64::dsub0, ARM64::dsub1, - ARM64::dsub2 }; - copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, ARM64::ORRv8i8, - Indices); - return; - } - - // Copy a DD register pair by copying the individual sub-registers. 
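These tuple copies are expanded one sub-register at a time, so copyPhysRegTuple above has to pick a direction: forwardCopyWillClobberTuple uses the positive remainder mod 32 (obtained with the mask, as its comment notes) to detect when the destination tuple starts inside the source tuple, in which case the walk runs backwards, as with memmove. A standalone sketch of that check, with illustrative register encodings:

    #include <cassert>

    // True when a front-to-back copy would overwrite a source sub-register
    // before it has been read; D/Q register numbers wrap modulo 32.
    static bool forwardWillClobber(unsigned DestEnc, unsigned SrcEnc, unsigned N) {
      return ((DestEnc - SrcEnc) & 0x1f) < N;
    }

    int main() {
      // Copying {d1,d2} into {d2,d3} forwards would clobber d2 first.
      assert(forwardWillClobber(/*Dest*/2, /*Src*/1, /*NumRegs*/2));
      // Copying {d2,d3} into {d1,d2} is safe front to back.
      assert(!forwardWillClobber(1, 2, 2));
      return 0;
    }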
- if (ARM64::DDRegClass.contains(DestReg) && - ARM64::DDRegClass.contains(SrcReg)) { - static const unsigned Indices[] = { ARM64::dsub0, ARM64::dsub1 }; - copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, ARM64::ORRv8i8, - Indices); - return; - } - - // Copy a QQQQ register quad by copying the individual sub-registers. - if (ARM64::QQQQRegClass.contains(DestReg) && - ARM64::QQQQRegClass.contains(SrcReg)) { - static const unsigned Indices[] = { ARM64::qsub0, ARM64::qsub1, - ARM64::qsub2, ARM64::qsub3 }; - copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, ARM64::ORRv16i8, - Indices); - return; - } - - // Copy a QQQ register triple by copying the individual sub-registers. - if (ARM64::QQQRegClass.contains(DestReg) && - ARM64::QQQRegClass.contains(SrcReg)) { - static const unsigned Indices[] = { ARM64::qsub0, ARM64::qsub1, - ARM64::qsub2 }; - copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, ARM64::ORRv16i8, - Indices); - return; - } - - // Copy a QQ register pair by copying the individual sub-registers. - if (ARM64::QQRegClass.contains(DestReg) && - ARM64::QQRegClass.contains(SrcReg)) { - static const unsigned Indices[] = { ARM64::qsub0, ARM64::qsub1 }; - copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, ARM64::ORRv16i8, - Indices); - return; - } - - if (ARM64::FPR128RegClass.contains(DestReg) && - ARM64::FPR128RegClass.contains(SrcReg)) { - if(getSubTarget().hasNEON()) { - BuildMI(MBB, I, DL, get(ARM64::ORRv16i8), DestReg).addReg(SrcReg).addReg( - SrcReg, getKillRegState(KillSrc)); - } else { - BuildMI(MBB, I, DL, get(ARM64::STRQpre)) - .addReg(ARM64::SP, RegState::Define) - .addReg(SrcReg, getKillRegState(KillSrc)) - .addReg(ARM64::SP) - .addImm(-16); - BuildMI(MBB, I, DL, get(ARM64::LDRQpre)) - .addReg(ARM64::SP, RegState::Define) - .addReg(DestReg, RegState::Define) - .addReg(ARM64::SP) - .addImm(16); - } - return; - } - - if (ARM64::FPR64RegClass.contains(DestReg) && - ARM64::FPR64RegClass.contains(SrcReg)) { - if(getSubTarget().hasNEON()) { - DestReg = - RI.getMatchingSuperReg(DestReg, ARM64::dsub, &ARM64::FPR128RegClass); - SrcReg = - RI.getMatchingSuperReg(SrcReg, ARM64::dsub, &ARM64::FPR128RegClass); - BuildMI(MBB, I, DL, get(ARM64::ORRv16i8), DestReg).addReg(SrcReg).addReg( - SrcReg, getKillRegState(KillSrc)); - } else { - BuildMI(MBB, I, DL, get(ARM64::FMOVDr), DestReg) - .addReg(SrcReg, getKillRegState(KillSrc)); - } - return; - } - - if (ARM64::FPR32RegClass.contains(DestReg) && - ARM64::FPR32RegClass.contains(SrcReg)) { - if(getSubTarget().hasNEON()) { - DestReg = - RI.getMatchingSuperReg(DestReg, ARM64::ssub, &ARM64::FPR128RegClass); - SrcReg = - RI.getMatchingSuperReg(SrcReg, ARM64::ssub, &ARM64::FPR128RegClass); - BuildMI(MBB, I, DL, get(ARM64::ORRv16i8), DestReg).addReg(SrcReg).addReg( - SrcReg, getKillRegState(KillSrc)); - } else { - BuildMI(MBB, I, DL, get(ARM64::FMOVSr), DestReg) - .addReg(SrcReg, getKillRegState(KillSrc)); - } - return; - } - - if (ARM64::FPR16RegClass.contains(DestReg) && - ARM64::FPR16RegClass.contains(SrcReg)) { - if(getSubTarget().hasNEON()) { - DestReg = - RI.getMatchingSuperReg(DestReg, ARM64::hsub, &ARM64::FPR128RegClass); - SrcReg = - RI.getMatchingSuperReg(SrcReg, ARM64::hsub, &ARM64::FPR128RegClass); - BuildMI(MBB, I, DL, get(ARM64::ORRv16i8), DestReg).addReg(SrcReg).addReg( - SrcReg, getKillRegState(KillSrc)); - } else { - DestReg = - RI.getMatchingSuperReg(DestReg, ARM64::hsub, &ARM64::FPR32RegClass); - SrcReg = - RI.getMatchingSuperReg(SrcReg, ARM64::hsub, &ARM64::FPR32RegClass); - BuildMI(MBB, I, DL, get(ARM64::FMOVSr), 
DestReg) - .addReg(SrcReg, getKillRegState(KillSrc)); - } - return; - } - - if (ARM64::FPR8RegClass.contains(DestReg) && - ARM64::FPR8RegClass.contains(SrcReg)) { - if(getSubTarget().hasNEON()) { - DestReg = - RI.getMatchingSuperReg(DestReg, ARM64::bsub, &ARM64::FPR128RegClass); - SrcReg = - RI.getMatchingSuperReg(SrcReg, ARM64::bsub, &ARM64::FPR128RegClass); - BuildMI(MBB, I, DL, get(ARM64::ORRv16i8), DestReg).addReg(SrcReg).addReg( - SrcReg, getKillRegState(KillSrc)); - } else { - DestReg = - RI.getMatchingSuperReg(DestReg, ARM64::bsub, &ARM64::FPR32RegClass); - SrcReg = - RI.getMatchingSuperReg(SrcReg, ARM64::bsub, &ARM64::FPR32RegClass); - BuildMI(MBB, I, DL, get(ARM64::FMOVSr), DestReg) - .addReg(SrcReg, getKillRegState(KillSrc)); - } - return; - } - - // Copies between GPR64 and FPR64. - if (ARM64::FPR64RegClass.contains(DestReg) && - ARM64::GPR64RegClass.contains(SrcReg)) { - BuildMI(MBB, I, DL, get(ARM64::FMOVXDr), DestReg) - .addReg(SrcReg, getKillRegState(KillSrc)); - return; - } - if (ARM64::GPR64RegClass.contains(DestReg) && - ARM64::FPR64RegClass.contains(SrcReg)) { - BuildMI(MBB, I, DL, get(ARM64::FMOVDXr), DestReg) - .addReg(SrcReg, getKillRegState(KillSrc)); - return; - } - // Copies between GPR32 and FPR32. - if (ARM64::FPR32RegClass.contains(DestReg) && - ARM64::GPR32RegClass.contains(SrcReg)) { - BuildMI(MBB, I, DL, get(ARM64::FMOVWSr), DestReg) - .addReg(SrcReg, getKillRegState(KillSrc)); - return; - } - if (ARM64::GPR32RegClass.contains(DestReg) && - ARM64::FPR32RegClass.contains(SrcReg)) { - BuildMI(MBB, I, DL, get(ARM64::FMOVSWr), DestReg) - .addReg(SrcReg, getKillRegState(KillSrc)); - return; - } - - assert(0 && "unimplemented reg-to-reg copy"); -} - -void ARM64InstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB, - MachineBasicBlock::iterator MBBI, - unsigned SrcReg, bool isKill, int FI, - const TargetRegisterClass *RC, - const TargetRegisterInfo *TRI) const { - DebugLoc DL; - if (MBBI != MBB.end()) - DL = MBBI->getDebugLoc(); - MachineFunction &MF = *MBB.getParent(); - MachineFrameInfo &MFI = *MF.getFrameInfo(); - unsigned Align = MFI.getObjectAlignment(FI); - - MachinePointerInfo PtrInfo(PseudoSourceValue::getFixedStack(FI)); - MachineMemOperand *MMO = MF.getMachineMemOperand( - PtrInfo, MachineMemOperand::MOStore, MFI.getObjectSize(FI), Align); - unsigned Opc = 0; - bool Offset = true; - switch (RC->getSize()) { - case 1: - if (ARM64::FPR8RegClass.hasSubClassEq(RC)) - Opc = ARM64::STRBui; - break; - case 2: - if (ARM64::FPR16RegClass.hasSubClassEq(RC)) - Opc = ARM64::STRHui; - break; - case 4: - if (ARM64::GPR32allRegClass.hasSubClassEq(RC)) { - Opc = ARM64::STRWui; - if (TargetRegisterInfo::isVirtualRegister(SrcReg)) - MF.getRegInfo().constrainRegClass(SrcReg, &ARM64::GPR32RegClass); - else - assert(SrcReg != ARM64::WSP); - } else if (ARM64::FPR32RegClass.hasSubClassEq(RC)) - Opc = ARM64::STRSui; - break; - case 8: - if (ARM64::GPR64allRegClass.hasSubClassEq(RC)) { - Opc = ARM64::STRXui; - if (TargetRegisterInfo::isVirtualRegister(SrcReg)) - MF.getRegInfo().constrainRegClass(SrcReg, &ARM64::GPR64RegClass); - else - assert(SrcReg != ARM64::SP); - } else if (ARM64::FPR64RegClass.hasSubClassEq(RC)) - Opc = ARM64::STRDui; - break; - case 16: - if (ARM64::FPR128RegClass.hasSubClassEq(RC)) - Opc = ARM64::STRQui; - else if (ARM64::DDRegClass.hasSubClassEq(RC)) { - assert(getSubTarget().hasNEON() && - "Unexpected register store without NEON"); - Opc = ARM64::ST1Twov1d, Offset = false; - } - break; - case 24: - if (ARM64::DDDRegClass.hasSubClassEq(RC)) { - 
assert(getSubTarget().hasNEON() && - "Unexpected register store without NEON"); - Opc = ARM64::ST1Threev1d, Offset = false; - } - break; - case 32: - if (ARM64::DDDDRegClass.hasSubClassEq(RC)) { - assert(getSubTarget().hasNEON() && - "Unexpected register store without NEON"); - Opc = ARM64::ST1Fourv1d, Offset = false; - } else if (ARM64::QQRegClass.hasSubClassEq(RC)) { - assert(getSubTarget().hasNEON() && - "Unexpected register store without NEON"); - Opc = ARM64::ST1Twov2d, Offset = false; - } - break; - case 48: - if (ARM64::QQQRegClass.hasSubClassEq(RC)) { - assert(getSubTarget().hasNEON() && - "Unexpected register store without NEON"); - Opc = ARM64::ST1Threev2d, Offset = false; - } - break; - case 64: - if (ARM64::QQQQRegClass.hasSubClassEq(RC)) { - assert(getSubTarget().hasNEON() && - "Unexpected register store without NEON"); - Opc = ARM64::ST1Fourv2d, Offset = false; - } - break; - } - assert(Opc && "Unknown register class"); - - const MachineInstrBuilder &MI = BuildMI(MBB, MBBI, DL, get(Opc)) - .addReg(SrcReg, getKillRegState(isKill)) - .addFrameIndex(FI); - - if (Offset) - MI.addImm(0); - MI.addMemOperand(MMO); -} - -void ARM64InstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB, - MachineBasicBlock::iterator MBBI, - unsigned DestReg, int FI, - const TargetRegisterClass *RC, - const TargetRegisterInfo *TRI) const { - DebugLoc DL; - if (MBBI != MBB.end()) - DL = MBBI->getDebugLoc(); - MachineFunction &MF = *MBB.getParent(); - MachineFrameInfo &MFI = *MF.getFrameInfo(); - unsigned Align = MFI.getObjectAlignment(FI); - MachinePointerInfo PtrInfo(PseudoSourceValue::getFixedStack(FI)); - MachineMemOperand *MMO = MF.getMachineMemOperand( - PtrInfo, MachineMemOperand::MOLoad, MFI.getObjectSize(FI), Align); - - unsigned Opc = 0; - bool Offset = true; - switch (RC->getSize()) { - case 1: - if (ARM64::FPR8RegClass.hasSubClassEq(RC)) - Opc = ARM64::LDRBui; - break; - case 2: - if (ARM64::FPR16RegClass.hasSubClassEq(RC)) - Opc = ARM64::LDRHui; - break; - case 4: - if (ARM64::GPR32allRegClass.hasSubClassEq(RC)) { - Opc = ARM64::LDRWui; - if (TargetRegisterInfo::isVirtualRegister(DestReg)) - MF.getRegInfo().constrainRegClass(DestReg, &ARM64::GPR32RegClass); - else - assert(DestReg != ARM64::WSP); - } else if (ARM64::FPR32RegClass.hasSubClassEq(RC)) - Opc = ARM64::LDRSui; - break; - case 8: - if (ARM64::GPR64allRegClass.hasSubClassEq(RC)) { - Opc = ARM64::LDRXui; - if (TargetRegisterInfo::isVirtualRegister(DestReg)) - MF.getRegInfo().constrainRegClass(DestReg, &ARM64::GPR64RegClass); - else - assert(DestReg != ARM64::SP); - } else if (ARM64::FPR64RegClass.hasSubClassEq(RC)) - Opc = ARM64::LDRDui; - break; - case 16: - if (ARM64::FPR128RegClass.hasSubClassEq(RC)) - Opc = ARM64::LDRQui; - else if (ARM64::DDRegClass.hasSubClassEq(RC)) { - assert(getSubTarget().hasNEON() && - "Unexpected register load without NEON"); - Opc = ARM64::LD1Twov1d, Offset = false; - } - break; - case 24: - if (ARM64::DDDRegClass.hasSubClassEq(RC)) { - assert(getSubTarget().hasNEON() && - "Unexpected register load without NEON"); - Opc = ARM64::LD1Threev1d, Offset = false; - } - break; - case 32: - if (ARM64::DDDDRegClass.hasSubClassEq(RC)) { - assert(getSubTarget().hasNEON() && - "Unexpected register load without NEON"); - Opc = ARM64::LD1Fourv1d, Offset = false; - } else if (ARM64::QQRegClass.hasSubClassEq(RC)) { - assert(getSubTarget().hasNEON() && - "Unexpected register load without NEON"); - Opc = ARM64::LD1Twov2d, Offset = false; - } - break; - case 48: - if (ARM64::QQQRegClass.hasSubClassEq(RC)) { - 
assert(getSubTarget().hasNEON() && - "Unexpected register load without NEON"); - Opc = ARM64::LD1Threev2d, Offset = false; - } - break; - case 64: - if (ARM64::QQQQRegClass.hasSubClassEq(RC)) { - assert(getSubTarget().hasNEON() && - "Unexpected register load without NEON"); - Opc = ARM64::LD1Fourv2d, Offset = false; - } - break; - } - assert(Opc && "Unknown register class"); - - const MachineInstrBuilder &MI = BuildMI(MBB, MBBI, DL, get(Opc)) - .addReg(DestReg, getDefRegState(true)) - .addFrameIndex(FI); - if (Offset) - MI.addImm(0); - MI.addMemOperand(MMO); -} - -void llvm::emitFrameOffset(MachineBasicBlock &MBB, - MachineBasicBlock::iterator MBBI, DebugLoc DL, - unsigned DestReg, unsigned SrcReg, int Offset, - const ARM64InstrInfo *TII, MachineInstr::MIFlag Flag, - bool SetNZCV) { - if (DestReg == SrcReg && Offset == 0) - return; - - bool isSub = Offset < 0; - if (isSub) - Offset = -Offset; - - // FIXME: If the offset won't fit in 24-bits, compute the offset into a - // scratch register. If DestReg is a virtual register, use it as the - // scratch register; otherwise, create a new virtual register (to be - // replaced by the scavenger at the end of PEI). That case can be optimized - // slightly if DestReg is SP which is always 16-byte aligned, so the scratch - // register can be loaded with offset%8 and the add/sub can use an extending - // instruction with LSL#3. - // Currently the function handles any offsets but generates a poor sequence - // of code. - // assert(Offset < (1 << 24) && "unimplemented reg plus immediate"); - - unsigned Opc; - if (SetNZCV) - Opc = isSub ? ARM64::SUBSXri : ARM64::ADDSXri; - else - Opc = isSub ? ARM64::SUBXri : ARM64::ADDXri; - const unsigned MaxEncoding = 0xfff; - const unsigned ShiftSize = 12; - const unsigned MaxEncodableValue = MaxEncoding << ShiftSize; - while (((unsigned)Offset) >= (1 << ShiftSize)) { - unsigned ThisVal; - if (((unsigned)Offset) > MaxEncodableValue) { - ThisVal = MaxEncodableValue; - } else { - ThisVal = Offset & MaxEncodableValue; - } - assert((ThisVal >> ShiftSize) <= MaxEncoding && - "Encoding cannot handle value that big"); - BuildMI(MBB, MBBI, DL, TII->get(Opc), DestReg) - .addReg(SrcReg) - .addImm(ThisVal >> ShiftSize) - .addImm(ARM64_AM::getShifterImm(ARM64_AM::LSL, ShiftSize)) - .setMIFlag(Flag); - - SrcReg = DestReg; - Offset -= ThisVal; - if (Offset == 0) - return; - } - BuildMI(MBB, MBBI, DL, TII->get(Opc), DestReg) - .addReg(SrcReg) - .addImm(Offset) - .addImm(ARM64_AM::getShifterImm(ARM64_AM::LSL, 0)) - .setMIFlag(Flag); -} - -MachineInstr * -ARM64InstrInfo::foldMemoryOperandImpl(MachineFunction &MF, MachineInstr *MI, - const SmallVectorImpl &Ops, - int FrameIndex) const { - // This is a bit of a hack. Consider this instruction: - // - // %vreg0 = COPY %SP; GPR64all:%vreg0 - // - // We explicitly chose GPR64all for the virtual register so such a copy might - // be eliminated by RegisterCoalescer. However, that may not be possible, and - // %vreg0 may even spill. We can't spill %SP, and since it is in the GPR64all - // register class, TargetInstrInfo::foldMemoryOperand() is going to try. - // - // To prevent that, we are going to constrain the %vreg0 register class here. 
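As an aside on the emitFrameOffset helper defined just above: an ADD/SUB immediate encodes 12 bits, optionally shifted left by 12, so its loop peels a large offset into chunks of at most 0xfff << 12 and finishes with one unshifted add. A standalone sketch of that splitting, with an illustrative offset and invented function name:

    #include <cassert>
    #include <cstdint>
    #include <vector>

    // Returns the chunk emitted by each add/sub; their sum is the original
    // offset (each nonzero chunk except possibly the last is a multiple of
    // 0x1000 and becomes "add dst, src, #(chunk >> 12), lsl #12").
    std::vector<uint64_t> splitFrameOffset(uint64_t Offset) {
      const uint64_t MaxEncoding = 0xfff, ShiftSize = 12;
      const uint64_t MaxEncodableValue = MaxEncoding << ShiftSize; // 0xfff000
      std::vector<uint64_t> Chunks;
      while (Offset >= (uint64_t(1) << ShiftSize)) {
        uint64_t ThisVal = Offset > MaxEncodableValue
                               ? MaxEncodableValue
                               : (Offset & MaxEncodableValue);
        Chunks.push_back(ThisVal);
        Offset -= ThisVal;
        if (Offset == 0)
          return Chunks;
      }
      Chunks.push_back(Offset); // final "add dst, src, #Offset, lsl #0"
      return Chunks;
    }

    int main() {
      uint64_t Sum = 0;
      for (uint64_t V : splitFrameOffset(0x1234567))
        Sum += V;
      assert(Sum == 0x1234567); // the emitted adds reconstruct the full offset
      return 0;
    }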
- // - // - // - if (MI->isCopy()) { - unsigned DstReg = MI->getOperand(0).getReg(); - unsigned SrcReg = MI->getOperand(1).getReg(); - if (SrcReg == ARM64::SP && TargetRegisterInfo::isVirtualRegister(DstReg)) { - MF.getRegInfo().constrainRegClass(DstReg, &ARM64::GPR64RegClass); - return nullptr; - } - if (DstReg == ARM64::SP && TargetRegisterInfo::isVirtualRegister(SrcReg)) { - MF.getRegInfo().constrainRegClass(SrcReg, &ARM64::GPR64RegClass); - return nullptr; - } - } - - // Cannot fold. - return nullptr; -} - -int llvm::isARM64FrameOffsetLegal(const MachineInstr &MI, int &Offset, - bool *OutUseUnscaledOp, - unsigned *OutUnscaledOp, - int *EmittableOffset) { - int Scale = 1; - bool IsSigned = false; - // The ImmIdx should be changed case by case if it is not 2. - unsigned ImmIdx = 2; - unsigned UnscaledOp = 0; - // Set output values in case of early exit. - if (EmittableOffset) - *EmittableOffset = 0; - if (OutUseUnscaledOp) - *OutUseUnscaledOp = false; - if (OutUnscaledOp) - *OutUnscaledOp = 0; - switch (MI.getOpcode()) { - default: - assert(0 && "unhandled opcode in rewriteARM64FrameIndex"); - // Vector spills/fills can't take an immediate offset. - case ARM64::LD1Twov2d: - case ARM64::LD1Threev2d: - case ARM64::LD1Fourv2d: - case ARM64::LD1Twov1d: - case ARM64::LD1Threev1d: - case ARM64::LD1Fourv1d: - case ARM64::ST1Twov2d: - case ARM64::ST1Threev2d: - case ARM64::ST1Fourv2d: - case ARM64::ST1Twov1d: - case ARM64::ST1Threev1d: - case ARM64::ST1Fourv1d: - return ARM64FrameOffsetCannotUpdate; - case ARM64::PRFMui: - Scale = 8; - UnscaledOp = ARM64::PRFUMi; - break; - case ARM64::LDRXui: - Scale = 8; - UnscaledOp = ARM64::LDURXi; - break; - case ARM64::LDRWui: - Scale = 4; - UnscaledOp = ARM64::LDURWi; - break; - case ARM64::LDRBui: - Scale = 1; - UnscaledOp = ARM64::LDURBi; - break; - case ARM64::LDRHui: - Scale = 2; - UnscaledOp = ARM64::LDURHi; - break; - case ARM64::LDRSui: - Scale = 4; - UnscaledOp = ARM64::LDURSi; - break; - case ARM64::LDRDui: - Scale = 8; - UnscaledOp = ARM64::LDURDi; - break; - case ARM64::LDRQui: - Scale = 16; - UnscaledOp = ARM64::LDURQi; - break; - case ARM64::LDRBBui: - Scale = 1; - UnscaledOp = ARM64::LDURBBi; - break; - case ARM64::LDRHHui: - Scale = 2; - UnscaledOp = ARM64::LDURHHi; - break; - case ARM64::LDRSBXui: - Scale = 1; - UnscaledOp = ARM64::LDURSBXi; - break; - case ARM64::LDRSBWui: - Scale = 1; - UnscaledOp = ARM64::LDURSBWi; - break; - case ARM64::LDRSHXui: - Scale = 2; - UnscaledOp = ARM64::LDURSHXi; - break; - case ARM64::LDRSHWui: - Scale = 2; - UnscaledOp = ARM64::LDURSHWi; - break; - case ARM64::LDRSWui: - Scale = 4; - UnscaledOp = ARM64::LDURSWi; - break; - - case ARM64::STRXui: - Scale = 8; - UnscaledOp = ARM64::STURXi; - break; - case ARM64::STRWui: - Scale = 4; - UnscaledOp = ARM64::STURWi; - break; - case ARM64::STRBui: - Scale = 1; - UnscaledOp = ARM64::STURBi; - break; - case ARM64::STRHui: - Scale = 2; - UnscaledOp = ARM64::STURHi; - break; - case ARM64::STRSui: - Scale = 4; - UnscaledOp = ARM64::STURSi; - break; - case ARM64::STRDui: - Scale = 8; - UnscaledOp = ARM64::STURDi; - break; - case ARM64::STRQui: - Scale = 16; - UnscaledOp = ARM64::STURQi; - break; - case ARM64::STRBBui: - Scale = 1; - UnscaledOp = ARM64::STURBBi; - break; - case ARM64::STRHHui: - Scale = 2; - UnscaledOp = ARM64::STURHHi; - break; - - case ARM64::LDPXi: - case ARM64::LDPDi: - case ARM64::STPXi: - case ARM64::STPDi: - IsSigned = true; - Scale = 8; - break; - case ARM64::LDPQi: - case ARM64::STPQi: - IsSigned = true; - Scale = 16; - break; - case 
ARM64::LDPWi: - case ARM64::LDPSi: - case ARM64::STPWi: - case ARM64::STPSi: - IsSigned = true; - Scale = 4; - break; - - case ARM64::LDURXi: - case ARM64::LDURWi: - case ARM64::LDURBi: - case ARM64::LDURHi: - case ARM64::LDURSi: - case ARM64::LDURDi: - case ARM64::LDURQi: - case ARM64::LDURHHi: - case ARM64::LDURBBi: - case ARM64::LDURSBXi: - case ARM64::LDURSBWi: - case ARM64::LDURSHXi: - case ARM64::LDURSHWi: - case ARM64::LDURSWi: - case ARM64::STURXi: - case ARM64::STURWi: - case ARM64::STURBi: - case ARM64::STURHi: - case ARM64::STURSi: - case ARM64::STURDi: - case ARM64::STURQi: - case ARM64::STURBBi: - case ARM64::STURHHi: - Scale = 1; - break; - } - - Offset += MI.getOperand(ImmIdx).getImm() * Scale; - - bool useUnscaledOp = false; - // If the offset doesn't match the scale, we rewrite the instruction to - // use the unscaled instruction instead. Likewise, if we have a negative - // offset (and have an unscaled op to use). - if ((Offset & (Scale - 1)) != 0 || (Offset < 0 && UnscaledOp != 0)) - useUnscaledOp = true; - - // Use an unscaled addressing mode if the instruction has a negative offset - // (or if the instruction is already using an unscaled addressing mode). - unsigned MaskBits; - if (IsSigned) { - // ldp/stp instructions. - MaskBits = 7; - Offset /= Scale; - } else if (UnscaledOp == 0 || useUnscaledOp) { - MaskBits = 9; - IsSigned = true; - Scale = 1; - } else { - MaskBits = 12; - IsSigned = false; - Offset /= Scale; - } - - // Attempt to fold address computation. - int MaxOff = (1 << (MaskBits - IsSigned)) - 1; - int MinOff = (IsSigned ? (-MaxOff - 1) : 0); - if (Offset >= MinOff && Offset <= MaxOff) { - if (EmittableOffset) - *EmittableOffset = Offset; - Offset = 0; - } else { - int NewOff = Offset < 0 ? MinOff : MaxOff; - if (EmittableOffset) - *EmittableOffset = NewOff; - Offset = (Offset - NewOff) * Scale; - } - if (OutUseUnscaledOp) - *OutUseUnscaledOp = useUnscaledOp; - if (OutUnscaledOp) - *OutUnscaledOp = UnscaledOp; - return ARM64FrameOffsetCanUpdate | - (Offset == 0 ? ARM64FrameOffsetIsLegal : 0); -} - -bool llvm::rewriteARM64FrameIndex(MachineInstr &MI, unsigned FrameRegIdx, - unsigned FrameReg, int &Offset, - const ARM64InstrInfo *TII) { - unsigned Opcode = MI.getOpcode(); - unsigned ImmIdx = FrameRegIdx + 1; - - if (Opcode == ARM64::ADDSXri || Opcode == ARM64::ADDXri) { - Offset += MI.getOperand(ImmIdx).getImm(); - emitFrameOffset(*MI.getParent(), MI, MI.getDebugLoc(), - MI.getOperand(0).getReg(), FrameReg, Offset, TII, - MachineInstr::NoFlags, (Opcode == ARM64::ADDSXri)); - MI.eraseFromParent(); - Offset = 0; - return true; - } - - int NewOffset; - unsigned UnscaledOp; - bool UseUnscaledOp; - int Status = isARM64FrameOffsetLegal(MI, Offset, &UseUnscaledOp, &UnscaledOp, - &NewOffset); - if (Status & ARM64FrameOffsetCanUpdate) { - if (Status & ARM64FrameOffsetIsLegal) - // Replace the FrameIndex with FrameReg. 
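The tail of isARM64FrameOffsetLegal above clamps the scaled offset to whatever the chosen addressing form can encode and reports the remainder. A small standalone model of just that clamping step (not LLVM code; the field widths follow the hunk, the driver values are invented):

#include <cassert>
#include <cstdio>

struct SplitOffset {
  int Emittable;   // value that fits in the instruction's immediate field
  int Leftover;    // rest, to be folded into the base register separately
};

// MaskBits/IsSigned mirror the hunk: 12-bit unsigned for scaled loads/stores,
// 9-bit signed for the unscaled LDUR/STUR forms (Scale == 1), 7-bit signed for
// the LDP/STP pair forms. Offsets that do not match the scale are handled in
// the real code by switching to the unscaled form first.
static SplitOffset splitOffset(int Offset, int Scale, unsigned MaskBits,
                               bool IsSigned) {
  assert(Offset % Scale == 0 && "offset must already match the scale");
  int Scaled = Offset / Scale;
  int MaxOff = (1 << (MaskBits - (IsSigned ? 1 : 0))) - 1;
  int MinOff = IsSigned ? -MaxOff - 1 : 0;
  if (Scaled >= MinOff && Scaled <= MaxOff)
    return {Scaled, 0};
  int Clamped = Scaled < 0 ? MinOff : MaxOff;
  return {Clamped, (Scaled - Clamped) * Scale};
}

int main() {
  // A hypothetical 8-byte scaled load 40000 bytes from the base register:
  // only #4095 (scaled) fits, leaving 7240 bytes for an explicit add.
  SplitOffset S = splitOffset(40000, /*Scale=*/8, /*MaskBits=*/12, /*IsSigned=*/false);
  std::printf("imm=%d leftover=%d\n", S.Emittable, S.Leftover);
  return 0;
}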
- MI.getOperand(FrameRegIdx).ChangeToRegister(FrameReg, false); - if (UseUnscaledOp) - MI.setDesc(TII->get(UnscaledOp)); - - MI.getOperand(ImmIdx).ChangeToImmediate(NewOffset); - return Offset == 0; - } - - return false; -} - -void ARM64InstrInfo::getNoopForMachoTarget(MCInst &NopInst) const { - NopInst.setOpcode(ARM64::HINT); - NopInst.addOperand(MCOperand::CreateImm(0)); -} diff --git a/lib/Target/ARM64/ARM64InstrInfo.h b/lib/Target/ARM64/ARM64InstrInfo.h deleted file mode 100644 index ce195e763b2..00000000000 --- a/lib/Target/ARM64/ARM64InstrInfo.h +++ /dev/null @@ -1,231 +0,0 @@ -//===- ARM64InstrInfo.h - ARM64 Instruction Information ---------*- C++ -*-===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -// This file contains the ARM64 implementation of the TargetInstrInfo class. -// -//===----------------------------------------------------------------------===// - -#ifndef LLVM_TARGET_ARM64INSTRINFO_H -#define LLVM_TARGET_ARM64INSTRINFO_H - -#include "ARM64.h" -#include "ARM64RegisterInfo.h" -#include "llvm/Target/TargetInstrInfo.h" - -#define GET_INSTRINFO_HEADER -#include "ARM64GenInstrInfo.inc" - -namespace llvm { - -class ARM64Subtarget; -class ARM64TargetMachine; - -class ARM64InstrInfo : public ARM64GenInstrInfo { - // Reserve bits in the MachineMemOperand target hint flags, starting at 1. - // They will be shifted into MOTargetHintStart when accessed. - enum TargetMemOperandFlags { - MOSuppressPair = 1 - }; - - const ARM64RegisterInfo RI; - const ARM64Subtarget &Subtarget; - -public: - explicit ARM64InstrInfo(const ARM64Subtarget &STI); - - /// getRegisterInfo - TargetInstrInfo is a superset of MRegister info. As - /// such, whenever a client has an instance of instruction info, it should - /// always be able to get register info as well (through this method). - const ARM64RegisterInfo &getRegisterInfo() const { return RI; } - - const ARM64Subtarget &getSubTarget() const { return Subtarget; } - - unsigned GetInstSizeInBytes(const MachineInstr *MI) const; - - bool isCoalescableExtInstr(const MachineInstr &MI, unsigned &SrcReg, - unsigned &DstReg, unsigned &SubIdx) const override; - - unsigned isLoadFromStackSlot(const MachineInstr *MI, - int &FrameIndex) const override; - unsigned isStoreToStackSlot(const MachineInstr *MI, - int &FrameIndex) const override; - - /// Returns true if there is a shiftable register and that the shift value - /// is non-zero. - bool hasShiftedReg(const MachineInstr *MI) const; - - /// Returns true if there is an extendable register and that the extending value - /// is non-zero. - bool hasExtendedReg(const MachineInstr *MI) const; - - /// \brief Does this instruction set its full destination register to zero? - bool isGPRZero(const MachineInstr *MI) const; - - /// \brief Does this instruction rename a GPR without modifying bits? - bool isGPRCopy(const MachineInstr *MI) const; - - /// \brief Does this instruction rename an FPR without modifying bits? - bool isFPRCopy(const MachineInstr *MI) const; - - /// Return true if this is load/store scales or extends its register offset. - /// This refers to scaling a dynamic index as opposed to scaled immediates. - /// MI should be a memory op that allows scaled addressing. 
- bool isScaledAddr(const MachineInstr *MI) const; - - /// Return true if pairing the given load or store is hinted to be - /// unprofitable. - bool isLdStPairSuppressed(const MachineInstr *MI) const; - - /// Hint that pairing the given load or store is unprofitable. - void suppressLdStPair(MachineInstr *MI) const; - - bool getLdStBaseRegImmOfs(MachineInstr *LdSt, unsigned &BaseReg, - unsigned &Offset, - const TargetRegisterInfo *TRI) const override; - - bool enableClusterLoads() const override { return true; } - - bool shouldClusterLoads(MachineInstr *FirstLdSt, MachineInstr *SecondLdSt, - unsigned NumLoads) const override; - - bool shouldScheduleAdjacent(MachineInstr *First, - MachineInstr *Second) const override; - - MachineInstr *emitFrameIndexDebugValue(MachineFunction &MF, int FrameIx, - uint64_t Offset, const MDNode *MDPtr, - DebugLoc DL) const; - void copyPhysRegTuple(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, - DebugLoc DL, unsigned DestReg, unsigned SrcReg, - bool KillSrc, unsigned Opcode, - llvm::ArrayRef Indices) const; - void copyPhysReg(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, - DebugLoc DL, unsigned DestReg, unsigned SrcReg, - bool KillSrc) const override; - - void storeRegToStackSlot(MachineBasicBlock &MBB, - MachineBasicBlock::iterator MBBI, unsigned SrcReg, - bool isKill, int FrameIndex, - const TargetRegisterClass *RC, - const TargetRegisterInfo *TRI) const override; - - void loadRegFromStackSlot(MachineBasicBlock &MBB, - MachineBasicBlock::iterator MBBI, unsigned DestReg, - int FrameIndex, const TargetRegisterClass *RC, - const TargetRegisterInfo *TRI) const override; - - MachineInstr * - foldMemoryOperandImpl(MachineFunction &MF, MachineInstr *MI, - const SmallVectorImpl &Ops, - int FrameIndex) const override; - - bool AnalyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB, - MachineBasicBlock *&FBB, - SmallVectorImpl &Cond, - bool AllowModify = false) const override; - unsigned RemoveBranch(MachineBasicBlock &MBB) const override; - unsigned InsertBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB, - MachineBasicBlock *FBB, - const SmallVectorImpl &Cond, - DebugLoc DL) const override; - bool - ReverseBranchCondition(SmallVectorImpl &Cond) const override; - bool canInsertSelect(const MachineBasicBlock &, - const SmallVectorImpl &Cond, unsigned, - unsigned, int &, int &, int &) const override; - void insertSelect(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, - DebugLoc DL, unsigned DstReg, - const SmallVectorImpl &Cond, - unsigned TrueReg, unsigned FalseReg) const override; - void getNoopForMachoTarget(MCInst &NopInst) const override; - - /// analyzeCompare - For a comparison instruction, return the source registers - /// in SrcReg and SrcReg2, and the value it compares against in CmpValue. - /// Return true if the comparison instruction can be analyzed. - bool analyzeCompare(const MachineInstr *MI, unsigned &SrcReg, - unsigned &SrcReg2, int &CmpMask, - int &CmpValue) const override; - /// optimizeCompareInstr - Convert the instruction supplying the argument to - /// the comparison into one that sets the zero bit in the flags register. 
- bool optimizeCompareInstr(MachineInstr *CmpInstr, unsigned SrcReg, - unsigned SrcReg2, int CmpMask, int CmpValue, - const MachineRegisterInfo *MRI) const override; - -private: - void instantiateCondBranch(MachineBasicBlock &MBB, DebugLoc DL, - MachineBasicBlock *TBB, - const SmallVectorImpl &Cond) const; -}; - -/// emitFrameOffset - Emit instructions as needed to set DestReg to SrcReg -/// plus Offset. This is intended to be used from within the prolog/epilog -/// insertion (PEI) pass, where a virtual scratch register may be allocated -/// if necessary, to be replaced by the scavenger at the end of PEI. -void emitFrameOffset(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, - DebugLoc DL, unsigned DestReg, unsigned SrcReg, int Offset, - const ARM64InstrInfo *TII, - MachineInstr::MIFlag = MachineInstr::NoFlags, - bool SetNZCV = false); - -/// rewriteARM64FrameIndex - Rewrite MI to access 'Offset' bytes from the -/// FP. Return false if the offset could not be handled directly in MI, and -/// return the left-over portion by reference. -bool rewriteARM64FrameIndex(MachineInstr &MI, unsigned FrameRegIdx, - unsigned FrameReg, int &Offset, - const ARM64InstrInfo *TII); - -/// \brief Use to report the frame offset status in isARM64FrameOffsetLegal. -enum ARM64FrameOffsetStatus { - ARM64FrameOffsetCannotUpdate = 0x0, ///< Offset cannot apply. - ARM64FrameOffsetIsLegal = 0x1, ///< Offset is legal. - ARM64FrameOffsetCanUpdate = 0x2 ///< Offset can apply, at least partly. -}; - -/// \brief Check if the @p Offset is a valid frame offset for @p MI. -/// The returned value reports the validity of the frame offset for @p MI. -/// It uses the values defined by ARM64FrameOffsetStatus for that. -/// If result == ARM64FrameOffsetCannotUpdate, @p MI cannot be updated to -/// use an offset.eq -/// If result & ARM64FrameOffsetIsLegal, @p Offset can completely be -/// rewriten in @p MI. -/// If result & ARM64FrameOffsetCanUpdate, @p Offset contains the -/// amount that is off the limit of the legal offset. -/// If set, @p OutUseUnscaledOp will contain the whether @p MI should be -/// turned into an unscaled operator, which opcode is in @p OutUnscaledOp. -/// If set, @p EmittableOffset contains the amount that can be set in @p MI -/// (possibly with @p OutUnscaledOp if OutUseUnscaledOp is true) and that -/// is a legal offset. -int isARM64FrameOffsetLegal(const MachineInstr &MI, int &Offset, - bool *OutUseUnscaledOp = nullptr, - unsigned *OutUnscaledOp = nullptr, - int *EmittableOffset = nullptr); - -static inline bool isUncondBranchOpcode(int Opc) { return Opc == ARM64::B; } - -static inline bool isCondBranchOpcode(int Opc) { - switch (Opc) { - case ARM64::Bcc: - case ARM64::CBZW: - case ARM64::CBZX: - case ARM64::CBNZW: - case ARM64::CBNZX: - case ARM64::TBZW: - case ARM64::TBZX: - case ARM64::TBNZW: - case ARM64::TBNZX: - return true; - default: - return false; - } -} - -static inline bool isIndirectBranchOpcode(int Opc) { return Opc == ARM64::BR; } - -} // end namespace llvm - -#endif diff --git a/lib/Target/ARM64/ARM64InstrInfo.td b/lib/Target/ARM64/ARM64InstrInfo.td deleted file mode 100644 index e68980c83c5..00000000000 --- a/lib/Target/ARM64/ARM64InstrInfo.td +++ /dev/null @@ -1,5282 +0,0 @@ -//===- ARM64InstrInfo.td - Describe the ARM64 Instructions -*- tablegen -*-===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. 
-// -//===----------------------------------------------------------------------===// -// -// ARM64 Instruction definitions. -// -//===----------------------------------------------------------------------===// - -//===----------------------------------------------------------------------===// -// ARM Instruction Predicate Definitions. -// -def HasFPARMv8 : Predicate<"Subtarget->hasFPARMv8()">, - AssemblerPredicate<"FeatureFPARMv8", "fp-armv8">; -def HasNEON : Predicate<"Subtarget->hasNEON()">, - AssemblerPredicate<"FeatureNEON", "neon">; -def HasCrypto : Predicate<"Subtarget->hasCrypto()">, - AssemblerPredicate<"FeatureCrypto", "crypto">; -def HasCRC : Predicate<"Subtarget->hasCRC()">, - AssemblerPredicate<"FeatureCRC", "crc">; -def IsLE : Predicate<"Subtarget->isLittleEndian()">; -def IsBE : Predicate<"!Subtarget->isLittleEndian()">; - -//===----------------------------------------------------------------------===// -// ARM64-specific DAG Nodes. -// - -// SDTBinaryArithWithFlagsOut - RES1, FLAGS = op LHS, RHS -def SDTBinaryArithWithFlagsOut : SDTypeProfile<2, 2, - [SDTCisSameAs<0, 2>, - SDTCisSameAs<0, 3>, - SDTCisInt<0>, SDTCisVT<1, i32>]>; - -// SDTBinaryArithWithFlagsIn - RES1, FLAGS = op LHS, RHS, FLAGS -def SDTBinaryArithWithFlagsIn : SDTypeProfile<1, 3, - [SDTCisSameAs<0, 1>, - SDTCisSameAs<0, 2>, - SDTCisInt<0>, - SDTCisVT<3, i32>]>; - -// SDTBinaryArithWithFlagsInOut - RES1, FLAGS = op LHS, RHS, FLAGS -def SDTBinaryArithWithFlagsInOut : SDTypeProfile<2, 3, - [SDTCisSameAs<0, 2>, - SDTCisSameAs<0, 3>, - SDTCisInt<0>, - SDTCisVT<1, i32>, - SDTCisVT<4, i32>]>; - -def SDT_ARM64Brcond : SDTypeProfile<0, 3, - [SDTCisVT<0, OtherVT>, SDTCisVT<1, i32>, - SDTCisVT<2, i32>]>; -def SDT_ARM64cbz : SDTypeProfile<0, 2, [SDTCisInt<0>, SDTCisVT<1, OtherVT>]>; -def SDT_ARM64tbz : SDTypeProfile<0, 3, [SDTCisInt<0>, SDTCisInt<1>, - SDTCisVT<2, OtherVT>]>; - - -def SDT_ARM64CSel : SDTypeProfile<1, 4, - [SDTCisSameAs<0, 1>, - SDTCisSameAs<0, 2>, - SDTCisInt<3>, - SDTCisVT<4, i32>]>; -def SDT_ARM64FCmp : SDTypeProfile<0, 2, - [SDTCisFP<0>, - SDTCisSameAs<0, 1>]>; -def SDT_ARM64Dup : SDTypeProfile<1, 1, [SDTCisVec<0>]>; -def SDT_ARM64DupLane : SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisInt<2>]>; -def SDT_ARM64Zip : SDTypeProfile<1, 2, [SDTCisVec<0>, - SDTCisSameAs<0, 1>, - SDTCisSameAs<0, 2>]>; -def SDT_ARM64MOVIedit : SDTypeProfile<1, 1, [SDTCisInt<1>]>; -def SDT_ARM64MOVIshift : SDTypeProfile<1, 2, [SDTCisInt<1>, SDTCisInt<2>]>; -def SDT_ARM64vecimm : SDTypeProfile<1, 3, [SDTCisVec<0>, SDTCisSameAs<0,1>, - SDTCisInt<2>, SDTCisInt<3>]>; -def SDT_ARM64UnaryVec: SDTypeProfile<1, 1, [SDTCisVec<0>, SDTCisSameAs<0,1>]>; -def SDT_ARM64ExtVec: SDTypeProfile<1, 3, [SDTCisVec<0>, SDTCisSameAs<0,1>, - SDTCisSameAs<0,2>, SDTCisInt<3>]>; -def SDT_ARM64vshift : SDTypeProfile<1, 2, [SDTCisSameAs<0,1>, SDTCisInt<2>]>; - -def SDT_ARM64unvec : SDTypeProfile<1, 1, [SDTCisVec<0>, SDTCisSameAs<0,1>]>; -def SDT_ARM64fcmpz : SDTypeProfile<1, 1, []>; -def SDT_ARM64fcmp : SDTypeProfile<1, 2, [SDTCisSameAs<1,2>]>; -def SDT_ARM64binvec : SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisSameAs<0,1>, - SDTCisSameAs<0,2>]>; -def SDT_ARM64trivec : SDTypeProfile<1, 3, [SDTCisVec<0>, SDTCisSameAs<0,1>, - SDTCisSameAs<0,2>, - SDTCisSameAs<0,3>]>; -def SDT_ARM64TCRET : SDTypeProfile<0, 2, [SDTCisPtrTy<0>]>; -def SDT_ARM64PREFETCH : SDTypeProfile<0, 2, [SDTCisVT<0, i32>, SDTCisPtrTy<1>]>; - -def SDT_ARM64ITOF : SDTypeProfile<1, 1, [SDTCisFP<0>, SDTCisSameAs<0,1>]>; - -def SDT_ARM64TLSDescCall : SDTypeProfile<0, -2, [SDTCisPtrTy<0>, - 
SDTCisPtrTy<1>]>; -def SDT_ARM64WrapperLarge : SDTypeProfile<1, 4, - [SDTCisVT<0, i64>, SDTCisVT<1, i32>, - SDTCisSameAs<1, 2>, SDTCisSameAs<1, 3>, - SDTCisSameAs<1, 4>]>; - - -// Node definitions. -def ARM64adrp : SDNode<"ARM64ISD::ADRP", SDTIntUnaryOp, []>; -def ARM64addlow : SDNode<"ARM64ISD::ADDlow", SDTIntBinOp, []>; -def ARM64LOADgot : SDNode<"ARM64ISD::LOADgot", SDTIntUnaryOp>; -def ARM64callseq_start : SDNode<"ISD::CALLSEQ_START", - SDCallSeqStart<[ SDTCisVT<0, i32> ]>, - [SDNPHasChain, SDNPOutGlue]>; -def ARM64callseq_end : SDNode<"ISD::CALLSEQ_END", - SDCallSeqEnd<[ SDTCisVT<0, i32>, - SDTCisVT<1, i32> ]>, - [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue]>; -def ARM64call : SDNode<"ARM64ISD::CALL", - SDTypeProfile<0, -1, [SDTCisPtrTy<0>]>, - [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue, - SDNPVariadic]>; -def ARM64brcond : SDNode<"ARM64ISD::BRCOND", SDT_ARM64Brcond, - [SDNPHasChain]>; -def ARM64cbz : SDNode<"ARM64ISD::CBZ", SDT_ARM64cbz, - [SDNPHasChain]>; -def ARM64cbnz : SDNode<"ARM64ISD::CBNZ", SDT_ARM64cbz, - [SDNPHasChain]>; -def ARM64tbz : SDNode<"ARM64ISD::TBZ", SDT_ARM64tbz, - [SDNPHasChain]>; -def ARM64tbnz : SDNode<"ARM64ISD::TBNZ", SDT_ARM64tbz, - [SDNPHasChain]>; - - -def ARM64csel : SDNode<"ARM64ISD::CSEL", SDT_ARM64CSel>; -def ARM64csinv : SDNode<"ARM64ISD::CSINV", SDT_ARM64CSel>; -def ARM64csneg : SDNode<"ARM64ISD::CSNEG", SDT_ARM64CSel>; -def ARM64csinc : SDNode<"ARM64ISD::CSINC", SDT_ARM64CSel>; -def ARM64retflag : SDNode<"ARM64ISD::RET_FLAG", SDTNone, - [SDNPHasChain, SDNPOptInGlue, SDNPVariadic]>; -def ARM64adc : SDNode<"ARM64ISD::ADC", SDTBinaryArithWithFlagsIn >; -def ARM64sbc : SDNode<"ARM64ISD::SBC", SDTBinaryArithWithFlagsIn>; -def ARM64add_flag : SDNode<"ARM64ISD::ADDS", SDTBinaryArithWithFlagsOut, - [SDNPCommutative]>; -def ARM64sub_flag : SDNode<"ARM64ISD::SUBS", SDTBinaryArithWithFlagsOut>; -def ARM64and_flag : SDNode<"ARM64ISD::ANDS", SDTBinaryArithWithFlagsOut, - [SDNPCommutative]>; -def ARM64adc_flag : SDNode<"ARM64ISD::ADCS", SDTBinaryArithWithFlagsInOut>; -def ARM64sbc_flag : SDNode<"ARM64ISD::SBCS", SDTBinaryArithWithFlagsInOut>; - -def ARM64threadpointer : SDNode<"ARM64ISD::THREAD_POINTER", SDTPtrLeaf>; - -def ARM64fcmp : SDNode<"ARM64ISD::FCMP", SDT_ARM64FCmp>; - -def ARM64fmax : SDNode<"ARM64ISD::FMAX", SDTFPBinOp>; -def ARM64fmin : SDNode<"ARM64ISD::FMIN", SDTFPBinOp>; - -def ARM64dup : SDNode<"ARM64ISD::DUP", SDT_ARM64Dup>; -def ARM64duplane8 : SDNode<"ARM64ISD::DUPLANE8", SDT_ARM64DupLane>; -def ARM64duplane16 : SDNode<"ARM64ISD::DUPLANE16", SDT_ARM64DupLane>; -def ARM64duplane32 : SDNode<"ARM64ISD::DUPLANE32", SDT_ARM64DupLane>; -def ARM64duplane64 : SDNode<"ARM64ISD::DUPLANE64", SDT_ARM64DupLane>; - -def ARM64zip1 : SDNode<"ARM64ISD::ZIP1", SDT_ARM64Zip>; -def ARM64zip2 : SDNode<"ARM64ISD::ZIP2", SDT_ARM64Zip>; -def ARM64uzp1 : SDNode<"ARM64ISD::UZP1", SDT_ARM64Zip>; -def ARM64uzp2 : SDNode<"ARM64ISD::UZP2", SDT_ARM64Zip>; -def ARM64trn1 : SDNode<"ARM64ISD::TRN1", SDT_ARM64Zip>; -def ARM64trn2 : SDNode<"ARM64ISD::TRN2", SDT_ARM64Zip>; - -def ARM64movi_edit : SDNode<"ARM64ISD::MOVIedit", SDT_ARM64MOVIedit>; -def ARM64movi_shift : SDNode<"ARM64ISD::MOVIshift", SDT_ARM64MOVIshift>; -def ARM64movi_msl : SDNode<"ARM64ISD::MOVImsl", SDT_ARM64MOVIshift>; -def ARM64mvni_shift : SDNode<"ARM64ISD::MVNIshift", SDT_ARM64MOVIshift>; -def ARM64mvni_msl : SDNode<"ARM64ISD::MVNImsl", SDT_ARM64MOVIshift>; -def ARM64movi : SDNode<"ARM64ISD::MOVI", SDT_ARM64MOVIedit>; -def ARM64fmov : SDNode<"ARM64ISD::FMOV", SDT_ARM64MOVIedit>; - -def ARM64rev16 : 
SDNode<"ARM64ISD::REV16", SDT_ARM64UnaryVec>; -def ARM64rev32 : SDNode<"ARM64ISD::REV32", SDT_ARM64UnaryVec>; -def ARM64rev64 : SDNode<"ARM64ISD::REV64", SDT_ARM64UnaryVec>; -def ARM64ext : SDNode<"ARM64ISD::EXT", SDT_ARM64ExtVec>; - -def ARM64vashr : SDNode<"ARM64ISD::VASHR", SDT_ARM64vshift>; -def ARM64vlshr : SDNode<"ARM64ISD::VLSHR", SDT_ARM64vshift>; -def ARM64vshl : SDNode<"ARM64ISD::VSHL", SDT_ARM64vshift>; -def ARM64sqshli : SDNode<"ARM64ISD::SQSHL_I", SDT_ARM64vshift>; -def ARM64uqshli : SDNode<"ARM64ISD::UQSHL_I", SDT_ARM64vshift>; -def ARM64sqshlui : SDNode<"ARM64ISD::SQSHLU_I", SDT_ARM64vshift>; -def ARM64srshri : SDNode<"ARM64ISD::SRSHR_I", SDT_ARM64vshift>; -def ARM64urshri : SDNode<"ARM64ISD::URSHR_I", SDT_ARM64vshift>; - -def ARM64not: SDNode<"ARM64ISD::NOT", SDT_ARM64unvec>; -def ARM64bit: SDNode<"ARM64ISD::BIT", SDT_ARM64trivec>; -def ARM64bsl: SDNode<"ARM64ISD::BSL", SDT_ARM64trivec>; - -def ARM64cmeq: SDNode<"ARM64ISD::CMEQ", SDT_ARM64binvec>; -def ARM64cmge: SDNode<"ARM64ISD::CMGE", SDT_ARM64binvec>; -def ARM64cmgt: SDNode<"ARM64ISD::CMGT", SDT_ARM64binvec>; -def ARM64cmhi: SDNode<"ARM64ISD::CMHI", SDT_ARM64binvec>; -def ARM64cmhs: SDNode<"ARM64ISD::CMHS", SDT_ARM64binvec>; - -def ARM64fcmeq: SDNode<"ARM64ISD::FCMEQ", SDT_ARM64fcmp>; -def ARM64fcmge: SDNode<"ARM64ISD::FCMGE", SDT_ARM64fcmp>; -def ARM64fcmgt: SDNode<"ARM64ISD::FCMGT", SDT_ARM64fcmp>; - -def ARM64cmeqz: SDNode<"ARM64ISD::CMEQz", SDT_ARM64unvec>; -def ARM64cmgez: SDNode<"ARM64ISD::CMGEz", SDT_ARM64unvec>; -def ARM64cmgtz: SDNode<"ARM64ISD::CMGTz", SDT_ARM64unvec>; -def ARM64cmlez: SDNode<"ARM64ISD::CMLEz", SDT_ARM64unvec>; -def ARM64cmltz: SDNode<"ARM64ISD::CMLTz", SDT_ARM64unvec>; -def ARM64cmtst : PatFrag<(ops node:$LHS, node:$RHS), - (ARM64not (ARM64cmeqz (and node:$LHS, node:$RHS)))>; - -def ARM64fcmeqz: SDNode<"ARM64ISD::FCMEQz", SDT_ARM64fcmpz>; -def ARM64fcmgez: SDNode<"ARM64ISD::FCMGEz", SDT_ARM64fcmpz>; -def ARM64fcmgtz: SDNode<"ARM64ISD::FCMGTz", SDT_ARM64fcmpz>; -def ARM64fcmlez: SDNode<"ARM64ISD::FCMLEz", SDT_ARM64fcmpz>; -def ARM64fcmltz: SDNode<"ARM64ISD::FCMLTz", SDT_ARM64fcmpz>; - -def ARM64bici: SDNode<"ARM64ISD::BICi", SDT_ARM64vecimm>; -def ARM64orri: SDNode<"ARM64ISD::ORRi", SDT_ARM64vecimm>; - -def ARM64neg : SDNode<"ARM64ISD::NEG", SDT_ARM64unvec>; - -def ARM64tcret: SDNode<"ARM64ISD::TC_RETURN", SDT_ARM64TCRET, - [SDNPHasChain, SDNPOptInGlue, SDNPVariadic]>; - -def ARM64Prefetch : SDNode<"ARM64ISD::PREFETCH", SDT_ARM64PREFETCH, - [SDNPHasChain, SDNPSideEffect]>; - -def ARM64sitof: SDNode<"ARM64ISD::SITOF", SDT_ARM64ITOF>; -def ARM64uitof: SDNode<"ARM64ISD::UITOF", SDT_ARM64ITOF>; - -def ARM64tlsdesc_call : SDNode<"ARM64ISD::TLSDESC_CALL", SDT_ARM64TLSDescCall, - [SDNPInGlue, SDNPOutGlue, SDNPHasChain, - SDNPVariadic]>; - -def ARM64WrapperLarge : SDNode<"ARM64ISD::WrapperLarge", SDT_ARM64WrapperLarge>; - - -//===----------------------------------------------------------------------===// - -//===----------------------------------------------------------------------===// - -// ARM64 Instruction Predicate Definitions. 
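The ARM64cmtst PatFrag above expresses a vector bit-test compare as NOT(CMEQ-zero(AND(a, b))). A tiny standalone model of that identity on a single 64-bit lane (not LLVM code):

#include <cassert>
#include <cstdint>

// CMEQ against zero: an all-ones lane when the input lane is zero.
static uint64_t cmeqz(uint64_t lane) { return lane == 0 ? ~0ULL : 0; }

// CMTST per the PatFrag: a lane is all-ones exactly when a & b is non-zero.
static uint64_t cmtst(uint64_t a, uint64_t b) { return ~cmeqz(a & b); }

int main() {
  assert(cmtst(0b1010, 0b0101) == 0);      // no common set bits
  assert(cmtst(0b1010, 0b0010) == ~0ULL);  // at least one common set bit
  return 0;
}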
-// -def HasZCZ : Predicate<"Subtarget->hasZeroCycleZeroing()">; -def NoZCZ : Predicate<"!Subtarget->hasZeroCycleZeroing()">; -def IsDarwin : Predicate<"Subtarget->isTargetDarwin()">; -def IsNotDarwin: Predicate<"!Subtarget->isTargetDarwin()">; -def ForCodeSize : Predicate<"ForCodeSize">; -def NotForCodeSize : Predicate<"!ForCodeSize">; - -include "ARM64InstrFormats.td" - -//===----------------------------------------------------------------------===// - -//===----------------------------------------------------------------------===// -// Miscellaneous instructions. -//===----------------------------------------------------------------------===// - -let Defs = [SP], Uses = [SP], hasSideEffects = 1, isCodeGenOnly = 1 in { -def ADJCALLSTACKDOWN : Pseudo<(outs), (ins i32imm:$amt), - [(ARM64callseq_start timm:$amt)]>; -def ADJCALLSTACKUP : Pseudo<(outs), (ins i32imm:$amt1, i32imm:$amt2), - [(ARM64callseq_end timm:$amt1, timm:$amt2)]>; -} // Defs = [SP], Uses = [SP], hasSideEffects = 1, isCodeGenOnly = 1 - -let isReMaterializable = 1, isCodeGenOnly = 1 in { -// FIXME: The following pseudo instructions are only needed because remat -// cannot handle multiple instructions. When that changes, they can be -// removed, along with the ARM64Wrapper node. - -let AddedComplexity = 10 in -def LOADgot : Pseudo<(outs GPR64:$dst), (ins i64imm:$addr), - [(set GPR64:$dst, (ARM64LOADgot tglobaladdr:$addr))]>, - Sched<[WriteLDAdr]>; - -// The MOVaddr instruction should match only when the add is not folded -// into a load or store address. -def MOVaddr - : Pseudo<(outs GPR64:$dst), (ins i64imm:$hi, i64imm:$low), - [(set GPR64:$dst, (ARM64addlow (ARM64adrp tglobaladdr:$hi), - tglobaladdr:$low))]>, - Sched<[WriteAdrAdr]>; -def MOVaddrJT - : Pseudo<(outs GPR64:$dst), (ins i64imm:$hi, i64imm:$low), - [(set GPR64:$dst, (ARM64addlow (ARM64adrp tjumptable:$hi), - tjumptable:$low))]>, - Sched<[WriteAdrAdr]>; -def MOVaddrCP - : Pseudo<(outs GPR64:$dst), (ins i64imm:$hi, i64imm:$low), - [(set GPR64:$dst, (ARM64addlow (ARM64adrp tconstpool:$hi), - tconstpool:$low))]>, - Sched<[WriteAdrAdr]>; -def MOVaddrBA - : Pseudo<(outs GPR64:$dst), (ins i64imm:$hi, i64imm:$low), - [(set GPR64:$dst, (ARM64addlow (ARM64adrp tblockaddress:$hi), - tblockaddress:$low))]>, - Sched<[WriteAdrAdr]>; -def MOVaddrTLS - : Pseudo<(outs GPR64:$dst), (ins i64imm:$hi, i64imm:$low), - [(set GPR64:$dst, (ARM64addlow (ARM64adrp tglobaltlsaddr:$hi), - tglobaltlsaddr:$low))]>, - Sched<[WriteAdrAdr]>; -def MOVaddrEXT - : Pseudo<(outs GPR64:$dst), (ins i64imm:$hi, i64imm:$low), - [(set GPR64:$dst, (ARM64addlow (ARM64adrp texternalsym:$hi), - texternalsym:$low))]>, - Sched<[WriteAdrAdr]>; - -} // isReMaterializable, isCodeGenOnly - -def : Pat<(ARM64LOADgot tglobaltlsaddr:$addr), - (LOADgot tglobaltlsaddr:$addr)>; - -def : Pat<(ARM64LOADgot texternalsym:$addr), - (LOADgot texternalsym:$addr)>; - -def : Pat<(ARM64LOADgot tconstpool:$addr), - (LOADgot tconstpool:$addr)>; - -//===----------------------------------------------------------------------===// -// System instructions. -//===----------------------------------------------------------------------===// - -def HINT : HintI<"hint">; -def : InstAlias<"nop", (HINT 0b000)>; -def : InstAlias<"yield",(HINT 0b001)>; -def : InstAlias<"wfe", (HINT 0b010)>; -def : InstAlias<"wfi", (HINT 0b011)>; -def : InstAlias<"sev", (HINT 0b100)>; -def : InstAlias<"sevl", (HINT 0b101)>; - - // As far as LLVM is concerned this writes to the system's exclusive monitors. 
-let mayLoad = 1, mayStore = 1 in -def CLREX : CRmSystemI; - -def DMB : CRmSystemI; -def DSB : CRmSystemI; -def ISB : CRmSystemI; -def : InstAlias<"clrex", (CLREX 0xf)>; -def : InstAlias<"isb", (ISB 0xf)>; - -def MRS : MRSI; -def MSR : MSRI; -def MSRpstate: MSRpstateI; - -// The thread pointer (on Linux, at least, where this has been implemented) is -// TPIDR_EL0. -def : Pat<(ARM64threadpointer), (MRS 0xde82)>; - -// Generic system instructions -def SYSxt : SystemXtI<0, "sys">; -def SYSLxt : SystemLXtI<1, "sysl">; - -def : InstAlias<"sys $op1, $Cn, $Cm, $op2", - (SYSxt imm0_7:$op1, sys_cr_op:$Cn, - sys_cr_op:$Cm, imm0_7:$op2, XZR)>; - -//===----------------------------------------------------------------------===// -// Move immediate instructions. -//===----------------------------------------------------------------------===// - -defm MOVK : InsertImmediate<0b11, "movk">; -defm MOVN : MoveImmediate<0b00, "movn">; - -let PostEncoderMethod = "fixMOVZ" in -defm MOVZ : MoveImmediate<0b10, "movz">; - -// First group of aliases covers an implicit "lsl #0". -def : InstAlias<"movk $dst, $imm", (MOVKWi GPR32:$dst, imm0_65535:$imm, 0)>; -def : InstAlias<"movk $dst, $imm", (MOVKXi GPR64:$dst, imm0_65535:$imm, 0)>; -def : InstAlias<"movn $dst, $imm", (MOVNWi GPR32:$dst, imm0_65535:$imm, 0)>; -def : InstAlias<"movn $dst, $imm", (MOVNXi GPR64:$dst, imm0_65535:$imm, 0)>; -def : InstAlias<"movz $dst, $imm", (MOVZWi GPR32:$dst, imm0_65535:$imm, 0)>; -def : InstAlias<"movz $dst, $imm", (MOVZXi GPR64:$dst, imm0_65535:$imm, 0)>; - -// Next, we have various ELF relocations with the ":XYZ_g0:sym" syntax. -def : InstAlias<"movz $Rd, $sym", (MOVZXi GPR64:$Rd, movz_symbol_g3:$sym, 48)>; -def : InstAlias<"movz $Rd, $sym", (MOVZXi GPR64:$Rd, movz_symbol_g2:$sym, 32)>; -def : InstAlias<"movz $Rd, $sym", (MOVZXi GPR64:$Rd, movz_symbol_g1:$sym, 16)>; -def : InstAlias<"movz $Rd, $sym", (MOVZXi GPR64:$Rd, movz_symbol_g0:$sym, 0)>; - -def : InstAlias<"movn $Rd, $sym", (MOVNXi GPR64:$Rd, movz_symbol_g3:$sym, 48)>; -def : InstAlias<"movn $Rd, $sym", (MOVNXi GPR64:$Rd, movz_symbol_g2:$sym, 32)>; -def : InstAlias<"movn $Rd, $sym", (MOVNXi GPR64:$Rd, movz_symbol_g1:$sym, 16)>; -def : InstAlias<"movn $Rd, $sym", (MOVNXi GPR64:$Rd, movz_symbol_g0:$sym, 0)>; - -def : InstAlias<"movk $Rd, $sym", (MOVKXi GPR64:$Rd, movk_symbol_g3:$sym, 48)>; -def : InstAlias<"movk $Rd, $sym", (MOVKXi GPR64:$Rd, movk_symbol_g2:$sym, 32)>; -def : InstAlias<"movk $Rd, $sym", (MOVKXi GPR64:$Rd, movk_symbol_g1:$sym, 16)>; -def : InstAlias<"movk $Rd, $sym", (MOVKXi GPR64:$Rd, movk_symbol_g0:$sym, 0)>; - -def : InstAlias<"movz $Rd, $sym", (MOVZWi GPR32:$Rd, movz_symbol_g1:$sym, 16)>; -def : InstAlias<"movz $Rd, $sym", (MOVZWi GPR32:$Rd, movz_symbol_g0:$sym, 0)>; - -def : InstAlias<"movn $Rd, $sym", (MOVNWi GPR32:$Rd, movz_symbol_g1:$sym, 16)>; -def : InstAlias<"movn $Rd, $sym", (MOVNWi GPR32:$Rd, movz_symbol_g0:$sym, 0)>; - -def : InstAlias<"movk $Rd, $sym", (MOVKWi GPR32:$Rd, movk_symbol_g1:$sym, 16)>; -def : InstAlias<"movk $Rd, $sym", (MOVKWi GPR32:$Rd, movk_symbol_g0:$sym, 0)>; - -// Final group of aliases covers true "mov $Rd, $imm" cases. 
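The thread-pointer pattern above selects MRS with the raw immediate 0xde82. Assuming the operand packs the usual op0:op1:CRn:CRm:op2 system-register fields, a standalone check (not LLVM code) shows it decodes to S3_3_C13_C0_2, i.e. TPIDR_EL0, as the comment says:

#include <cassert>
#include <cstdio>

int main() {
  const unsigned Imm = 0xde82;
  unsigned Op0 = (Imm >> 14) & 0x3;
  unsigned Op1 = (Imm >> 11) & 0x7;
  unsigned CRn = (Imm >> 7) & 0xf;
  unsigned CRm = (Imm >> 3) & 0xf;
  unsigned Op2 = Imm & 0x7;
  std::printf("S%u_%u_C%u_C%u_%u\n", Op0, Op1, CRn, CRm, Op2);
  assert(Op0 == 3 && Op1 == 3 && CRn == 13 && CRm == 0 && Op2 == 2); // TPIDR_EL0
  return 0;
}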
-multiclass movw_mov_alias { - def _asmoperand : AsmOperandClass { - let Name = basename # width # "_lsl" # shift # "MovAlias"; - let PredicateMethod = "is" # basename # "MovAlias<" # width # ", " - # shift # ">"; - let RenderMethod = "add" # basename # "MovAliasOperands<" # shift # ">"; - } - - def _movimm : Operand { - let ParserMatchClass = !cast(NAME # "_asmoperand"); - } - - def : InstAlias<"mov $Rd, $imm", - (INST GPR:$Rd, !cast(NAME # "_movimm"):$imm, shift)>; -} - -defm : movw_mov_alias<"MOVZ", MOVZWi, GPR32, 32, 0>; -defm : movw_mov_alias<"MOVZ", MOVZWi, GPR32, 32, 16>; - -defm : movw_mov_alias<"MOVZ", MOVZXi, GPR64, 64, 0>; -defm : movw_mov_alias<"MOVZ", MOVZXi, GPR64, 64, 16>; -defm : movw_mov_alias<"MOVZ", MOVZXi, GPR64, 64, 32>; -defm : movw_mov_alias<"MOVZ", MOVZXi, GPR64, 64, 48>; - -defm : movw_mov_alias<"MOVN", MOVNWi, GPR32, 32, 0>; -defm : movw_mov_alias<"MOVN", MOVNWi, GPR32, 32, 16>; - -defm : movw_mov_alias<"MOVN", MOVNXi, GPR64, 64, 0>; -defm : movw_mov_alias<"MOVN", MOVNXi, GPR64, 64, 16>; -defm : movw_mov_alias<"MOVN", MOVNXi, GPR64, 64, 32>; -defm : movw_mov_alias<"MOVN", MOVNXi, GPR64, 64, 48>; - -let isReMaterializable = 1, isCodeGenOnly = 1, isMoveImm = 1, - isAsCheapAsAMove = 1 in { -// FIXME: The following pseudo instructions are only needed because remat -// cannot handle multiple instructions. When that changes, we can select -// directly to the real instructions and get rid of these pseudos. - -def MOVi32imm - : Pseudo<(outs GPR32:$dst), (ins i32imm:$src), - [(set GPR32:$dst, imm:$src)]>, - Sched<[WriteImm]>; -def MOVi64imm - : Pseudo<(outs GPR64:$dst), (ins i64imm:$src), - [(set GPR64:$dst, imm:$src)]>, - Sched<[WriteImm]>; -} // isReMaterializable, isCodeGenOnly - -// If possible, we want to use MOVi32imm even for 64-bit moves. This gives the -// eventual expansion code fewer bits to worry about getting right. Marshalling -// the types is a little tricky though: -def i64imm_32bit : ImmLeaf(Imm); -}]>; - -def trunc_imm : SDNodeXFormgetTargetConstant(N->getZExtValue(), MVT::i32); -}]>; - -def : Pat<(i64 i64imm_32bit:$src), - (SUBREG_TO_REG (i64 0), (MOVi32imm (trunc_imm imm:$src)), sub_32)>; - -// Deal with the various forms of (ELF) large addressing with MOVZ/MOVK -// sequences. -def : Pat<(ARM64WrapperLarge tglobaladdr:$g3, tglobaladdr:$g2, - tglobaladdr:$g1, tglobaladdr:$g0), - (MOVKXi (MOVKXi (MOVKXi (MOVZXi tglobaladdr:$g3, 48), - tglobaladdr:$g2, 32), - tglobaladdr:$g1, 16), - tglobaladdr:$g0, 0)>; - -def : Pat<(ARM64WrapperLarge tblockaddress:$g3, tblockaddress:$g2, - tblockaddress:$g1, tblockaddress:$g0), - (MOVKXi (MOVKXi (MOVKXi (MOVZXi tblockaddress:$g3, 48), - tblockaddress:$g2, 32), - tblockaddress:$g1, 16), - tblockaddress:$g0, 0)>; - -def : Pat<(ARM64WrapperLarge tconstpool:$g3, tconstpool:$g2, - tconstpool:$g1, tconstpool:$g0), - (MOVKXi (MOVKXi (MOVKXi (MOVZXi tconstpool:$g3, 48), - tconstpool:$g2, 32), - tconstpool:$g1, 16), - tconstpool:$g0, 0)>; - -def : Pat<(ARM64WrapperLarge tjumptable:$g3, tjumptable:$g2, - tjumptable:$g1, tjumptable:$g0), - (MOVKXi (MOVKXi (MOVKXi (MOVZXi tjumptable:$g3, 48), - tjumptable:$g2, 32), - tjumptable:$g1, 16), - tjumptable:$g0, 0)>; - - -//===----------------------------------------------------------------------===// -// Arithmetic instructions. -//===----------------------------------------------------------------------===// - -// Add/subtract with carry. 
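The ARM64WrapperLarge patterns above rebuild a 64-bit address from four 16-bit pieces with one MOVZ and three MOVKs. A standalone sketch of why that composition is exact (not LLVM code; the constant is arbitrary):

#include <cassert>
#include <cstdint>

// MOVZ Xd, #imm16, LSL #s writes the 16-bit field and zeroes everything else.
static uint64_t movz(uint16_t imm, unsigned shift) {
  return (uint64_t)imm << shift;
}

// MOVK Xd, #imm16, LSL #s replaces only the selected 16-bit field.
static uint64_t movk(uint64_t xd, uint16_t imm, unsigned shift) {
  uint64_t mask = 0xffffULL << shift;
  return (xd & ~mask) | ((uint64_t)imm << shift);
}

int main() {
  uint64_t addr = 0x0123456789abcdefULL;
  uint64_t xd = movz(addr >> 48, 48);            // g3 field (bits 63:48)
  xd = movk(xd, (addr >> 32) & 0xffff, 32);      // g2 field (bits 47:32)
  xd = movk(xd, (addr >> 16) & 0xffff, 16);      // g1 field (bits 31:16)
  xd = movk(xd, addr & 0xffff, 0);               // g0 field (bits 15:0)
  assert(xd == addr);
  return 0;
}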
-defm ADC : AddSubCarry<0, "adc", "adcs", ARM64adc, ARM64adc_flag>; -defm SBC : AddSubCarry<1, "sbc", "sbcs", ARM64sbc, ARM64sbc_flag>; - -def : InstAlias<"ngc $dst, $src", (SBCWr GPR32:$dst, WZR, GPR32:$src)>; -def : InstAlias<"ngc $dst, $src", (SBCXr GPR64:$dst, XZR, GPR64:$src)>; -def : InstAlias<"ngcs $dst, $src", (SBCSWr GPR32:$dst, WZR, GPR32:$src)>; -def : InstAlias<"ngcs $dst, $src", (SBCSXr GPR64:$dst, XZR, GPR64:$src)>; - -// Add/subtract -defm ADD : AddSub<0, "add", add>; -defm SUB : AddSub<1, "sub">; - -def : InstAlias<"mov $dst, $src", - (ADDWri GPR32sponly:$dst, GPR32sp:$src, 0, 0)>; -def : InstAlias<"mov $dst, $src", - (ADDWri GPR32sp:$dst, GPR32sponly:$src, 0, 0)>; -def : InstAlias<"mov $dst, $src", - (ADDXri GPR64sponly:$dst, GPR64sp:$src, 0, 0)>; -def : InstAlias<"mov $dst, $src", - (ADDXri GPR64sp:$dst, GPR64sponly:$src, 0, 0)>; - -defm ADDS : AddSubS<0, "adds", ARM64add_flag, "cmn">; -defm SUBS : AddSubS<1, "subs", ARM64sub_flag, "cmp">; - -// Use SUBS instead of SUB to enable CSE between SUBS and SUB. -def : Pat<(sub GPR32sp:$Rn, addsub_shifted_imm32:$imm), - (SUBSWri GPR32sp:$Rn, addsub_shifted_imm32:$imm)>; -def : Pat<(sub GPR64sp:$Rn, addsub_shifted_imm64:$imm), - (SUBSXri GPR64sp:$Rn, addsub_shifted_imm64:$imm)>; -def : Pat<(sub GPR32:$Rn, GPR32:$Rm), - (SUBSWrr GPR32:$Rn, GPR32:$Rm)>; -def : Pat<(sub GPR64:$Rn, GPR64:$Rm), - (SUBSXrr GPR64:$Rn, GPR64:$Rm)>; -def : Pat<(sub GPR32:$Rn, arith_shifted_reg32:$Rm), - (SUBSWrs GPR32:$Rn, arith_shifted_reg32:$Rm)>; -def : Pat<(sub GPR64:$Rn, arith_shifted_reg64:$Rm), - (SUBSXrs GPR64:$Rn, arith_shifted_reg64:$Rm)>; -def : Pat<(sub GPR32sp:$R2, arith_extended_reg32:$R3), - (SUBSWrx GPR32sp:$R2, arith_extended_reg32:$R3)>; -def : Pat<(sub GPR64sp:$R2, arith_extended_reg32to64:$R3), - (SUBSXrx GPR64sp:$R2, arith_extended_reg32to64:$R3)>; - -// Because of the immediate format for add/sub-imm instructions, the -// expression (add x, -1) must be transformed to (SUB{W,X}ri x, 1). -// These patterns capture that transformation. -let AddedComplexity = 1 in { -def : Pat<(add GPR32:$Rn, neg_addsub_shifted_imm32:$imm), - (SUBSWri GPR32:$Rn, neg_addsub_shifted_imm32:$imm)>; -def : Pat<(add GPR64:$Rn, neg_addsub_shifted_imm64:$imm), - (SUBSXri GPR64:$Rn, neg_addsub_shifted_imm64:$imm)>; -def : Pat<(sub GPR32:$Rn, neg_addsub_shifted_imm32:$imm), - (ADDWri GPR32:$Rn, neg_addsub_shifted_imm32:$imm)>; -def : Pat<(sub GPR64:$Rn, neg_addsub_shifted_imm64:$imm), - (ADDXri GPR64:$Rn, neg_addsub_shifted_imm64:$imm)>; -} - -// Because of the immediate format for add/sub-imm instructions, the -// expression (add x, -1) must be transformed to (SUB{W,X}ri x, 1). -// These patterns capture that transformation. 
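The comment above notes that (add x, -imm) has to become SUB with the negated immediate because the add/sub immediate field is unsigned. A tiny standalone illustration (not LLVM code):

#include <cassert>
#include <cstdint>

// An ADD/SUB immediate is a 12-bit unsigned value, optionally shifted left by 12.
static bool isLegalAddSubImm(int64_t v) {
  uint64_t u = (uint64_t)v;
  return (u & ~0xfffULL) == 0 || (u & ~0xfff000ULL) == 0;
}

int main() {
  int64_t c = -1;
  assert(!isLegalAddSubImm(c));   // "add x0, x1, #-1" is not encodable...
  assert(isLegalAddSubImm(-c));   // ...but "sub x0, x1, #1" is
  return 0;
}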
-let AddedComplexity = 1 in { -def : Pat<(ARM64add_flag GPR32:$Rn, neg_addsub_shifted_imm32:$imm), - (SUBSWri GPR32:$Rn, neg_addsub_shifted_imm32:$imm)>; -def : Pat<(ARM64add_flag GPR64:$Rn, neg_addsub_shifted_imm64:$imm), - (SUBSXri GPR64:$Rn, neg_addsub_shifted_imm64:$imm)>; -def : Pat<(ARM64sub_flag GPR32:$Rn, neg_addsub_shifted_imm32:$imm), - (ADDSWri GPR32:$Rn, neg_addsub_shifted_imm32:$imm)>; -def : Pat<(ARM64sub_flag GPR64:$Rn, neg_addsub_shifted_imm64:$imm), - (ADDSXri GPR64:$Rn, neg_addsub_shifted_imm64:$imm)>; -} - -def : InstAlias<"neg $dst, $src", (SUBWrs GPR32:$dst, WZR, GPR32:$src, 0), 3>; -def : InstAlias<"neg $dst, $src", (SUBXrs GPR64:$dst, XZR, GPR64:$src, 0), 3>; -def : InstAlias<"neg $dst, $src$shift", - (SUBWrs GPR32:$dst, WZR, GPR32:$src, arith_shift32:$shift), 2>; -def : InstAlias<"neg $dst, $src$shift", - (SUBXrs GPR64:$dst, XZR, GPR64:$src, arith_shift64:$shift), 2>; - -def : InstAlias<"negs $dst, $src", (SUBSWrs GPR32:$dst, WZR, GPR32:$src, 0), 3>; -def : InstAlias<"negs $dst, $src", (SUBSXrs GPR64:$dst, XZR, GPR64:$src, 0), 3>; -def : InstAlias<"negs $dst, $src$shift", - (SUBSWrs GPR32:$dst, WZR, GPR32:$src, arith_shift32:$shift), 2>; -def : InstAlias<"negs $dst, $src$shift", - (SUBSXrs GPR64:$dst, XZR, GPR64:$src, arith_shift64:$shift), 2>; - - -// Unsigned/Signed divide -defm UDIV : Div<0, "udiv", udiv>; -defm SDIV : Div<1, "sdiv", sdiv>; -let isCodeGenOnly = 1 in { -defm UDIV_Int : Div<0, "udiv", int_arm64_udiv>; -defm SDIV_Int : Div<1, "sdiv", int_arm64_sdiv>; -} - -// Variable shift -defm ASRV : Shift<0b10, "asr", sra>; -defm LSLV : Shift<0b00, "lsl", shl>; -defm LSRV : Shift<0b01, "lsr", srl>; -defm RORV : Shift<0b11, "ror", rotr>; - -def : ShiftAlias<"asrv", ASRVWr, GPR32>; -def : ShiftAlias<"asrv", ASRVXr, GPR64>; -def : ShiftAlias<"lslv", LSLVWr, GPR32>; -def : ShiftAlias<"lslv", LSLVXr, GPR64>; -def : ShiftAlias<"lsrv", LSRVWr, GPR32>; -def : ShiftAlias<"lsrv", LSRVXr, GPR64>; -def : ShiftAlias<"rorv", RORVWr, GPR32>; -def : ShiftAlias<"rorv", RORVXr, GPR64>; - -// Multiply-add -let AddedComplexity = 7 in { -defm MADD : MulAccum<0, "madd", add>; -defm MSUB : MulAccum<1, "msub", sub>; - -def : Pat<(i32 (mul GPR32:$Rn, GPR32:$Rm)), - (MADDWrrr GPR32:$Rn, GPR32:$Rm, WZR)>; -def : Pat<(i64 (mul GPR64:$Rn, GPR64:$Rm)), - (MADDXrrr GPR64:$Rn, GPR64:$Rm, XZR)>; - -def : Pat<(i32 (ineg (mul GPR32:$Rn, GPR32:$Rm))), - (MSUBWrrr GPR32:$Rn, GPR32:$Rm, WZR)>; -def : Pat<(i64 (ineg (mul GPR64:$Rn, GPR64:$Rm))), - (MSUBXrrr GPR64:$Rn, GPR64:$Rm, XZR)>; -} // AddedComplexity = 7 - -let AddedComplexity = 5 in { -def SMADDLrrr : WideMulAccum<0, 0b001, "smaddl", add, sext>; -def SMSUBLrrr : WideMulAccum<1, 0b001, "smsubl", sub, sext>; -def UMADDLrrr : WideMulAccum<0, 0b101, "umaddl", add, zext>; -def UMSUBLrrr : WideMulAccum<1, 0b101, "umsubl", sub, zext>; - -def : Pat<(i64 (mul (sext GPR32:$Rn), (sext GPR32:$Rm))), - (SMADDLrrr GPR32:$Rn, GPR32:$Rm, XZR)>; -def : Pat<(i64 (mul (zext GPR32:$Rn), (zext GPR32:$Rm))), - (UMADDLrrr GPR32:$Rn, GPR32:$Rm, XZR)>; - -def : Pat<(i64 (ineg (mul (sext GPR32:$Rn), (sext GPR32:$Rm)))), - (SMSUBLrrr GPR32:$Rn, GPR32:$Rm, XZR)>; -def : Pat<(i64 (ineg (mul (zext GPR32:$Rn), (zext GPR32:$Rm)))), - (UMSUBLrrr GPR32:$Rn, GPR32:$Rm, XZR)>; -} // AddedComplexity = 5 - -def : MulAccumWAlias<"mul", MADDWrrr>; -def : MulAccumXAlias<"mul", MADDXrrr>; -def : MulAccumWAlias<"mneg", MSUBWrrr>; -def : MulAccumXAlias<"mneg", MSUBXrrr>; -def : WideMulAccumAlias<"smull", SMADDLrrr>; -def : WideMulAccumAlias<"smnegl", SMSUBLrrr>; -def : 
WideMulAccumAlias<"umull", UMADDLrrr>; -def : WideMulAccumAlias<"umnegl", UMSUBLrrr>; - -// Multiply-high -def SMULHrr : MulHi<0b010, "smulh", mulhs>; -def UMULHrr : MulHi<0b110, "umulh", mulhu>; - -// CRC32 -def CRC32Brr : BaseCRC32<0, 0b00, 0, GPR32, int_arm64_crc32b, "crc32b">; -def CRC32Hrr : BaseCRC32<0, 0b01, 0, GPR32, int_arm64_crc32h, "crc32h">; -def CRC32Wrr : BaseCRC32<0, 0b10, 0, GPR32, int_arm64_crc32w, "crc32w">; -def CRC32Xrr : BaseCRC32<1, 0b11, 0, GPR64, int_arm64_crc32x, "crc32x">; - -def CRC32CBrr : BaseCRC32<0, 0b00, 1, GPR32, int_arm64_crc32cb, "crc32cb">; -def CRC32CHrr : BaseCRC32<0, 0b01, 1, GPR32, int_arm64_crc32ch, "crc32ch">; -def CRC32CWrr : BaseCRC32<0, 0b10, 1, GPR32, int_arm64_crc32cw, "crc32cw">; -def CRC32CXrr : BaseCRC32<1, 0b11, 1, GPR64, int_arm64_crc32cx, "crc32cx">; - - -//===----------------------------------------------------------------------===// -// Logical instructions. -//===----------------------------------------------------------------------===// - -// (immediate) -defm ANDS : LogicalImmS<0b11, "ands", ARM64and_flag>; -defm AND : LogicalImm<0b00, "and", and>; -defm EOR : LogicalImm<0b10, "eor", xor>; -defm ORR : LogicalImm<0b01, "orr", or>; - -// FIXME: these aliases *are* canonical sometimes (when movz can't be -// used). Actually, it seems to be working right now, but putting logical_immXX -// here is a bit dodgy on the AsmParser side too. -def : InstAlias<"mov $dst, $imm", (ORRWri GPR32sp:$dst, WZR, - logical_imm32:$imm), 0>; -def : InstAlias<"mov $dst, $imm", (ORRXri GPR64sp:$dst, XZR, - logical_imm64:$imm), 0>; - - -// (register) -defm ANDS : LogicalRegS<0b11, 0, "ands", ARM64and_flag>; -defm BICS : LogicalRegS<0b11, 1, "bics", - BinOpFrag<(ARM64and_flag node:$LHS, (not node:$RHS))>>; -defm AND : LogicalReg<0b00, 0, "and", and>; -defm BIC : LogicalReg<0b00, 1, "bic", - BinOpFrag<(and node:$LHS, (not node:$RHS))>>; -defm EON : LogicalReg<0b10, 1, "eon", - BinOpFrag<(xor node:$LHS, (not node:$RHS))>>; -defm EOR : LogicalReg<0b10, 0, "eor", xor>; -defm ORN : LogicalReg<0b01, 1, "orn", - BinOpFrag<(or node:$LHS, (not node:$RHS))>>; -defm ORR : LogicalReg<0b01, 0, "orr", or>; - -def : InstAlias<"mov $dst, $src", (ORRWrs GPR32:$dst, WZR, GPR32:$src, 0), 2>; -def : InstAlias<"mov $dst, $src", (ORRXrs GPR64:$dst, XZR, GPR64:$src, 0), 2>; - -def : InstAlias<"mvn $Wd, $Wm", (ORNWrs GPR32:$Wd, WZR, GPR32:$Wm, 0), 3>; -def : InstAlias<"mvn $Xd, $Xm", (ORNXrs GPR64:$Xd, XZR, GPR64:$Xm, 0), 3>; - -def : InstAlias<"mvn $Wd, $Wm$sh", - (ORNWrs GPR32:$Wd, WZR, GPR32:$Wm, logical_shift32:$sh), 2>; -def : InstAlias<"mvn $Xd, $Xm$sh", - (ORNXrs GPR64:$Xd, XZR, GPR64:$Xm, logical_shift64:$sh), 2>; - -def : InstAlias<"tst $src1, $src2", - (ANDSWri WZR, GPR32:$src1, logical_imm32:$src2), 2>; -def : InstAlias<"tst $src1, $src2", - (ANDSXri XZR, GPR64:$src1, logical_imm64:$src2), 2>; - -def : InstAlias<"tst $src1, $src2", - (ANDSWrs WZR, GPR32:$src1, GPR32:$src2, 0), 3>; -def : InstAlias<"tst $src1, $src2", - (ANDSXrs XZR, GPR64:$src1, GPR64:$src2, 0), 3>; - -def : InstAlias<"tst $src1, $src2$sh", - (ANDSWrs WZR, GPR32:$src1, GPR32:$src2, logical_shift32:$sh), 2>; -def : InstAlias<"tst $src1, $src2$sh", - (ANDSXrs XZR, GPR64:$src1, GPR64:$src2, logical_shift64:$sh), 2>; - - -def : Pat<(not GPR32:$Wm), (ORNWrr WZR, GPR32:$Wm)>; -def : Pat<(not GPR64:$Xm), (ORNXrr XZR, GPR64:$Xm)>; - - -//===----------------------------------------------------------------------===// -// One operand data processing instructions. 
-//===----------------------------------------------------------------------===// - -defm CLS : OneOperandData<0b101, "cls">; -defm CLZ : OneOperandData<0b100, "clz", ctlz>; -defm RBIT : OneOperandData<0b000, "rbit">; -def REV16Wr : OneWRegData<0b001, "rev16", - UnOpFrag<(rotr (bswap node:$LHS), (i64 16))>>; -def REV16Xr : OneXRegData<0b001, "rev16", null_frag>; - -def : Pat<(cttz GPR32:$Rn), - (CLZWr (RBITWr GPR32:$Rn))>; -def : Pat<(cttz GPR64:$Rn), - (CLZXr (RBITXr GPR64:$Rn))>; -def : Pat<(ctlz (or (shl (xor (sra GPR32:$Rn, (i64 31)), GPR32:$Rn), (i64 1)), - (i32 1))), - (CLSWr GPR32:$Rn)>; -def : Pat<(ctlz (or (shl (xor (sra GPR64:$Rn, (i64 63)), GPR64:$Rn), (i64 1)), - (i64 1))), - (CLSXr GPR64:$Rn)>; - -// Unlike the other one operand instructions, the instructions with the "rev" -// mnemonic do *not* just different in the size bit, but actually use different -// opcode bits for the different sizes. -def REVWr : OneWRegData<0b010, "rev", bswap>; -def REVXr : OneXRegData<0b011, "rev", bswap>; -def REV32Xr : OneXRegData<0b010, "rev32", - UnOpFrag<(rotr (bswap node:$LHS), (i64 32))>>; - -// The bswap commutes with the rotr so we want a pattern for both possible -// orders. -def : Pat<(bswap (rotr GPR32:$Rn, (i64 16))), (REV16Wr GPR32:$Rn)>; -def : Pat<(bswap (rotr GPR64:$Rn, (i64 32))), (REV32Xr GPR64:$Rn)>; - -//===----------------------------------------------------------------------===// -// Bitfield immediate extraction instruction. -//===----------------------------------------------------------------------===// -let neverHasSideEffects = 1 in -defm EXTR : ExtractImm<"extr">; -def : InstAlias<"ror $dst, $src, $shift", - (EXTRWrri GPR32:$dst, GPR32:$src, GPR32:$src, imm0_31:$shift)>; -def : InstAlias<"ror $dst, $src, $shift", - (EXTRXrri GPR64:$dst, GPR64:$src, GPR64:$src, imm0_63:$shift)>; - -def : Pat<(rotr GPR32:$Rn, (i64 imm0_31:$imm)), - (EXTRWrri GPR32:$Rn, GPR32:$Rn, imm0_31:$imm)>; -def : Pat<(rotr GPR64:$Rn, (i64 imm0_63:$imm)), - (EXTRXrri GPR64:$Rn, GPR64:$Rn, imm0_63:$imm)>; - -//===----------------------------------------------------------------------===// -// Other bitfield immediate instructions. -//===----------------------------------------------------------------------===// -let neverHasSideEffects = 1 in { -defm BFM : BitfieldImmWith2RegArgs<0b01, "bfm">; -defm SBFM : BitfieldImm<0b00, "sbfm">; -defm UBFM : BitfieldImm<0b10, "ubfm">; -} - -def i32shift_a : Operand, SDNodeXFormgetZExtValue()) & 0x1f; - return CurDAG->getTargetConstant(enc, MVT::i64); -}]>; - -def i32shift_b : Operand, SDNodeXFormgetZExtValue(); - return CurDAG->getTargetConstant(enc, MVT::i64); -}]>; - -// min(7, 31 - shift_amt) -def i32shift_sext_i8 : Operand, SDNodeXFormgetZExtValue(); - enc = enc > 7 ? 7 : enc; - return CurDAG->getTargetConstant(enc, MVT::i64); -}]>; - -// min(15, 31 - shift_amt) -def i32shift_sext_i16 : Operand, SDNodeXFormgetZExtValue(); - enc = enc > 15 ? 15 : enc; - return CurDAG->getTargetConstant(enc, MVT::i64); -}]>; - -def i64shift_a : Operand, SDNodeXFormgetZExtValue()) & 0x3f; - return CurDAG->getTargetConstant(enc, MVT::i64); -}]>; - -def i64shift_b : Operand, SDNodeXFormgetZExtValue(); - return CurDAG->getTargetConstant(enc, MVT::i64); -}]>; - -// min(7, 63 - shift_amt) -def i64shift_sext_i8 : Operand, SDNodeXFormgetZExtValue(); - enc = enc > 7 ? 7 : enc; - return CurDAG->getTargetConstant(enc, MVT::i64); -}]>; - -// min(15, 63 - shift_amt) -def i64shift_sext_i16 : Operand, SDNodeXFormgetZExtValue(); - enc = enc > 15 ? 
15 : enc; - return CurDAG->getTargetConstant(enc, MVT::i64); -}]>; - -// min(31, 63 - shift_amt) -def i64shift_sext_i32 : Operand, SDNodeXFormgetZExtValue(); - enc = enc > 31 ? 31 : enc; - return CurDAG->getTargetConstant(enc, MVT::i64); -}]>; - -def : Pat<(shl GPR32:$Rn, (i64 imm0_31:$imm)), - (UBFMWri GPR32:$Rn, (i64 (i32shift_a imm0_31:$imm)), - (i64 (i32shift_b imm0_31:$imm)))>; -def : Pat<(shl GPR64:$Rn, (i64 imm0_63:$imm)), - (UBFMXri GPR64:$Rn, (i64 (i64shift_a imm0_63:$imm)), - (i64 (i64shift_b imm0_63:$imm)))>; - -let AddedComplexity = 10 in { -def : Pat<(sra GPR32:$Rn, (i64 imm0_31:$imm)), - (SBFMWri GPR32:$Rn, imm0_31:$imm, 31)>; -def : Pat<(sra GPR64:$Rn, (i64 imm0_63:$imm)), - (SBFMXri GPR64:$Rn, imm0_63:$imm, 63)>; -} - -def : InstAlias<"asr $dst, $src, $shift", - (SBFMWri GPR32:$dst, GPR32:$src, imm0_31:$shift, 31)>; -def : InstAlias<"asr $dst, $src, $shift", - (SBFMXri GPR64:$dst, GPR64:$src, imm0_63:$shift, 63)>; -def : InstAlias<"sxtb $dst, $src", (SBFMWri GPR32:$dst, GPR32:$src, 0, 7)>; -def : InstAlias<"sxtb $dst, $src", (SBFMXri GPR64:$dst, GPR64:$src, 0, 7)>; -def : InstAlias<"sxth $dst, $src", (SBFMWri GPR32:$dst, GPR32:$src, 0, 15)>; -def : InstAlias<"sxth $dst, $src", (SBFMXri GPR64:$dst, GPR64:$src, 0, 15)>; -def : InstAlias<"sxtw $dst, $src", (SBFMXri GPR64:$dst, GPR64:$src, 0, 31)>; - -def : Pat<(srl GPR32:$Rn, (i64 imm0_31:$imm)), - (UBFMWri GPR32:$Rn, imm0_31:$imm, 31)>; -def : Pat<(srl GPR64:$Rn, (i64 imm0_63:$imm)), - (UBFMXri GPR64:$Rn, imm0_63:$imm, 63)>; - -def : InstAlias<"lsr $dst, $src, $shift", - (UBFMWri GPR32:$dst, GPR32:$src, imm0_31:$shift, 31)>; -def : InstAlias<"lsr $dst, $src, $shift", - (UBFMXri GPR64:$dst, GPR64:$src, imm0_63:$shift, 63)>; -def : InstAlias<"uxtb $dst, $src", (UBFMWri GPR32:$dst, GPR32:$src, 0, 7)>; -def : InstAlias<"uxtb $dst, $src", (UBFMXri GPR64:$dst, GPR64:$src, 0, 7)>; -def : InstAlias<"uxth $dst, $src", (UBFMWri GPR32:$dst, GPR32:$src, 0, 15)>; -def : InstAlias<"uxth $dst, $src", (UBFMXri GPR64:$dst, GPR64:$src, 0, 15)>; -def : InstAlias<"uxtw $dst, $src", (UBFMXri GPR64:$dst, GPR64:$src, 0, 31)>; - -//===----------------------------------------------------------------------===// -// Conditionally set flags instructions. -//===----------------------------------------------------------------------===// -defm CCMN : CondSetFlagsImm<0, "ccmn">; -defm CCMP : CondSetFlagsImm<1, "ccmp">; - -defm CCMN : CondSetFlagsReg<0, "ccmn">; -defm CCMP : CondSetFlagsReg<1, "ccmp">; - -//===----------------------------------------------------------------------===// -// Conditional select instructions. 
-//===----------------------------------------------------------------------===// -defm CSEL : CondSelect<0, 0b00, "csel">; - -def inc : PatFrag<(ops node:$in), (add node:$in, 1)>; -defm CSINC : CondSelectOp<0, 0b01, "csinc", inc>; -defm CSINV : CondSelectOp<1, 0b00, "csinv", not>; -defm CSNEG : CondSelectOp<1, 0b01, "csneg", ineg>; - -def : Pat<(ARM64csinv GPR32:$tval, GPR32:$fval, (i32 imm:$cc), NZCV), - (CSINVWr GPR32:$tval, GPR32:$fval, (i32 imm:$cc))>; -def : Pat<(ARM64csinv GPR64:$tval, GPR64:$fval, (i32 imm:$cc), NZCV), - (CSINVXr GPR64:$tval, GPR64:$fval, (i32 imm:$cc))>; -def : Pat<(ARM64csneg GPR32:$tval, GPR32:$fval, (i32 imm:$cc), NZCV), - (CSNEGWr GPR32:$tval, GPR32:$fval, (i32 imm:$cc))>; -def : Pat<(ARM64csneg GPR64:$tval, GPR64:$fval, (i32 imm:$cc), NZCV), - (CSNEGXr GPR64:$tval, GPR64:$fval, (i32 imm:$cc))>; -def : Pat<(ARM64csinc GPR32:$tval, GPR32:$fval, (i32 imm:$cc), NZCV), - (CSINCWr GPR32:$tval, GPR32:$fval, (i32 imm:$cc))>; -def : Pat<(ARM64csinc GPR64:$tval, GPR64:$fval, (i32 imm:$cc), NZCV), - (CSINCXr GPR64:$tval, GPR64:$fval, (i32 imm:$cc))>; - -def : Pat<(ARM64csel (i32 0), (i32 1), (i32 imm:$cc), NZCV), - (CSINCWr WZR, WZR, (i32 imm:$cc))>; -def : Pat<(ARM64csel (i64 0), (i64 1), (i32 imm:$cc), NZCV), - (CSINCXr XZR, XZR, (i32 imm:$cc))>; -def : Pat<(ARM64csel (i32 0), (i32 -1), (i32 imm:$cc), NZCV), - (CSINVWr WZR, WZR, (i32 imm:$cc))>; -def : Pat<(ARM64csel (i64 0), (i64 -1), (i32 imm:$cc), NZCV), - (CSINVXr XZR, XZR, (i32 imm:$cc))>; - -// The inverse of the condition code from the alias instruction is what is used -// in the aliased instruction. The parser all ready inverts the condition code -// for these aliases. -def : InstAlias<"cset $dst, $cc", - (CSINCWr GPR32:$dst, WZR, WZR, inv_ccode:$cc)>; -def : InstAlias<"cset $dst, $cc", - (CSINCXr GPR64:$dst, XZR, XZR, inv_ccode:$cc)>; - -def : InstAlias<"csetm $dst, $cc", - (CSINVWr GPR32:$dst, WZR, WZR, inv_ccode:$cc)>; -def : InstAlias<"csetm $dst, $cc", - (CSINVXr GPR64:$dst, XZR, XZR, inv_ccode:$cc)>; - -def : InstAlias<"cinc $dst, $src, $cc", - (CSINCWr GPR32:$dst, GPR32:$src, GPR32:$src, inv_ccode:$cc)>; -def : InstAlias<"cinc $dst, $src, $cc", - (CSINCXr GPR64:$dst, GPR64:$src, GPR64:$src, inv_ccode:$cc)>; - -def : InstAlias<"cinv $dst, $src, $cc", - (CSINVWr GPR32:$dst, GPR32:$src, GPR32:$src, inv_ccode:$cc)>; -def : InstAlias<"cinv $dst, $src, $cc", - (CSINVXr GPR64:$dst, GPR64:$src, GPR64:$src, inv_ccode:$cc)>; - -def : InstAlias<"cneg $dst, $src, $cc", - (CSNEGWr GPR32:$dst, GPR32:$src, GPR32:$src, inv_ccode:$cc)>; -def : InstAlias<"cneg $dst, $src, $cc", - (CSNEGXr GPR64:$dst, GPR64:$src, GPR64:$src, inv_ccode:$cc)>; - -//===----------------------------------------------------------------------===// -// PC-relative instructions. -//===----------------------------------------------------------------------===// -let isReMaterializable = 1 in { -let neverHasSideEffects = 1, mayStore = 0, mayLoad = 0 in { -def ADR : ADRI<0, "adr", adrlabel, []>; -} // neverHasSideEffects = 1 - -def ADRP : ADRI<1, "adrp", adrplabel, - [(set GPR64:$Xd, (ARM64adrp tglobaladdr:$label))]>; -} // isReMaterializable = 1 - -// page address of a constant pool entry, block address -def : Pat<(ARM64adrp tconstpool:$cp), (ADRP tconstpool:$cp)>; -def : Pat<(ARM64adrp tblockaddress:$cp), (ADRP tblockaddress:$cp)>; - -//===----------------------------------------------------------------------===// -// Unconditional branch (register) instructions. 
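The cset/csetm/cinc/cinv/cneg aliases earlier in this hunk store the inverted condition in the underlying CSINC/CSINV/CSNEG, as the comment explains. A minimal standalone model of the cset case (not LLVM code; CSINC semantics are Rd = cond ? Rn : Rm + 1):

#include <cassert>

static int csinc(int rn, int rm, bool cond) {
  return cond ? rn : rm + 1;
}

// "cset Wd, cc" is the alias for "csinc Wd, wzr, wzr, inv(cc)": it yields 1
// when cc holds (the CSINC condition fails, so WZR + 1) and 0 otherwise.
static int cset(bool cc) {
  return csinc(/*wzr=*/0, /*wzr=*/0, /*cond=*/!cc);
}

int main() {
  assert(cset(true) == 1);
  assert(cset(false) == 0);
  return 0;
}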
-//===----------------------------------------------------------------------===// - -let isReturn = 1, isTerminator = 1, isBarrier = 1 in { -def RET : BranchReg<0b0010, "ret", []>; -def DRPS : SpecialReturn<0b0101, "drps">; -def ERET : SpecialReturn<0b0100, "eret">; -} // isReturn = 1, isTerminator = 1, isBarrier = 1 - -// Default to the LR register. -def : InstAlias<"ret", (RET LR)>; - -let isCall = 1, Defs = [LR], Uses = [SP] in { -def BLR : BranchReg<0b0001, "blr", [(ARM64call GPR64:$Rn)]>; -} // isCall - -let isBranch = 1, isTerminator = 1, isBarrier = 1, isIndirectBranch = 1 in { -def BR : BranchReg<0b0000, "br", [(brind GPR64:$Rn)]>; -} // isBranch, isTerminator, isBarrier, isIndirectBranch - -// Create a separate pseudo-instruction for codegen to use so that we don't -// flag lr as used in every function. It'll be restored before the RET by the -// epilogue if it's legitimately used. -def RET_ReallyLR : Pseudo<(outs), (ins), [(ARM64retflag)]> { - let isTerminator = 1; - let isBarrier = 1; - let isReturn = 1; -} - -// This is a directive-like pseudo-instruction. The purpose is to insert an -// R_AARCH64_TLSDESC_CALL relocation at the offset of the following instruction -// (which in the usual case is a BLR). -let hasSideEffects = 1 in -def TLSDESCCALL : Pseudo<(outs), (ins i64imm:$sym), []> { - let AsmString = ".tlsdesccall $sym"; -} - -// Pseudo-instruction representing a BLR with attached TLSDESC relocation. It -// gets expanded to two MCInsts during lowering. -let isCall = 1, Defs = [LR] in -def TLSDESC_BLR - : Pseudo<(outs), (ins GPR64:$dest, i64imm:$sym), - [(ARM64tlsdesc_call GPR64:$dest, tglobaltlsaddr:$sym)]>; - -def : Pat<(ARM64tlsdesc_call GPR64:$dest, texternalsym:$sym), - (TLSDESC_BLR GPR64:$dest, texternalsym:$sym)>; -//===----------------------------------------------------------------------===// -// Conditional branch (immediate) instruction. -//===----------------------------------------------------------------------===// -def Bcc : BranchCond; - -//===----------------------------------------------------------------------===// -// Compare-and-branch instructions. -//===----------------------------------------------------------------------===// -defm CBZ : CmpBranch<0, "cbz", ARM64cbz>; -defm CBNZ : CmpBranch<1, "cbnz", ARM64cbnz>; - -//===----------------------------------------------------------------------===// -// Test-bit-and-branch instructions. -//===----------------------------------------------------------------------===// -defm TBZ : TestBranch<0, "tbz", ARM64tbz>; -defm TBNZ : TestBranch<1, "tbnz", ARM64tbnz>; - -//===----------------------------------------------------------------------===// -// Unconditional branch (immediate) instructions. -//===----------------------------------------------------------------------===// -let isBranch = 1, isTerminator = 1, isBarrier = 1 in { -def B : BranchImm<0, "b", [(br bb:$addr)]>; -} // isBranch, isTerminator, isBarrier - -let isCall = 1, Defs = [LR], Uses = [SP] in { -def BL : CallImm<1, "bl", [(ARM64call tglobaladdr:$addr)]>; -} // isCall -def : Pat<(ARM64call texternalsym:$func), (BL texternalsym:$func)>; - -//===----------------------------------------------------------------------===// -// Exception generation instructions. 
-//===----------------------------------------------------------------------===// -def BRK : ExceptionGeneration<0b001, 0b00, "brk">; -def DCPS1 : ExceptionGeneration<0b101, 0b01, "dcps1">; -def DCPS2 : ExceptionGeneration<0b101, 0b10, "dcps2">; -def DCPS3 : ExceptionGeneration<0b101, 0b11, "dcps3">; -def HLT : ExceptionGeneration<0b010, 0b00, "hlt">; -def HVC : ExceptionGeneration<0b000, 0b10, "hvc">; -def SMC : ExceptionGeneration<0b000, 0b11, "smc">; -def SVC : ExceptionGeneration<0b000, 0b01, "svc">; - -// DCPSn defaults to an immediate operand of zero if unspecified. -def : InstAlias<"dcps1", (DCPS1 0)>; -def : InstAlias<"dcps2", (DCPS2 0)>; -def : InstAlias<"dcps3", (DCPS3 0)>; - -//===----------------------------------------------------------------------===// -// Load instructions. -//===----------------------------------------------------------------------===// - -// Pair (indexed, offset) -defm LDPW : LoadPairOffset<0b00, 0, GPR32, simm7s4, "ldp">; -defm LDPX : LoadPairOffset<0b10, 0, GPR64, simm7s8, "ldp">; -defm LDPS : LoadPairOffset<0b00, 1, FPR32, simm7s4, "ldp">; -defm LDPD : LoadPairOffset<0b01, 1, FPR64, simm7s8, "ldp">; -defm LDPQ : LoadPairOffset<0b10, 1, FPR128, simm7s16, "ldp">; - -defm LDPSW : LoadPairOffset<0b01, 0, GPR64, simm7s4, "ldpsw">; - -// Pair (pre-indexed) -def LDPWpre : LoadPairPreIdx<0b00, 0, GPR32, simm7s4, "ldp">; -def LDPXpre : LoadPairPreIdx<0b10, 0, GPR64, simm7s8, "ldp">; -def LDPSpre : LoadPairPreIdx<0b00, 1, FPR32, simm7s4, "ldp">; -def LDPDpre : LoadPairPreIdx<0b01, 1, FPR64, simm7s8, "ldp">; -def LDPQpre : LoadPairPreIdx<0b10, 1, FPR128, simm7s16, "ldp">; - -def LDPSWpre : LoadPairPreIdx<0b01, 0, GPR64, simm7s4, "ldpsw">; - -// Pair (post-indexed) -def LDPWpost : LoadPairPostIdx<0b00, 0, GPR32, simm7s4, "ldp">; -def LDPXpost : LoadPairPostIdx<0b10, 0, GPR64, simm7s8, "ldp">; -def LDPSpost : LoadPairPostIdx<0b00, 1, FPR32, simm7s4, "ldp">; -def LDPDpost : LoadPairPostIdx<0b01, 1, FPR64, simm7s8, "ldp">; -def LDPQpost : LoadPairPostIdx<0b10, 1, FPR128, simm7s16, "ldp">; - -def LDPSWpost : LoadPairPostIdx<0b01, 0, GPR64, simm7s4, "ldpsw">; - - -// Pair (no allocate) -defm LDNPW : LoadPairNoAlloc<0b00, 0, GPR32, simm7s4, "ldnp">; -defm LDNPX : LoadPairNoAlloc<0b10, 0, GPR64, simm7s8, "ldnp">; -defm LDNPS : LoadPairNoAlloc<0b00, 1, FPR32, simm7s4, "ldnp">; -defm LDNPD : LoadPairNoAlloc<0b01, 1, FPR64, simm7s8, "ldnp">; -defm LDNPQ : LoadPairNoAlloc<0b10, 1, FPR128, simm7s16, "ldnp">; - -//--- -// (register offset) -//--- - -// Integer -defm LDRBB : Load8RO<0b00, 0, 0b01, GPR32, "ldrb", i32, zextloadi8>; -defm LDRHH : Load16RO<0b01, 0, 0b01, GPR32, "ldrh", i32, zextloadi16>; -defm LDRW : Load32RO<0b10, 0, 0b01, GPR32, "ldr", i32, load>; -defm LDRX : Load64RO<0b11, 0, 0b01, GPR64, "ldr", i64, load>; - -// Floating-point -defm LDRB : Load8RO<0b00, 1, 0b01, FPR8, "ldr", untyped, load>; -defm LDRH : Load16RO<0b01, 1, 0b01, FPR16, "ldr", f16, load>; -defm LDRS : Load32RO<0b10, 1, 0b01, FPR32, "ldr", f32, load>; -defm LDRD : Load64RO<0b11, 1, 0b01, FPR64, "ldr", f64, load>; -defm LDRQ : Load128RO<0b00, 1, 0b11, FPR128, "ldr", f128, load>; - -// Load sign-extended half-word -defm LDRSHW : Load16RO<0b01, 0, 0b11, GPR32, "ldrsh", i32, sextloadi16>; -defm LDRSHX : Load16RO<0b01, 0, 0b10, GPR64, "ldrsh", i64, sextloadi16>; - -// Load sign-extended byte -defm LDRSBW : Load8RO<0b00, 0, 0b11, GPR32, "ldrsb", i32, sextloadi8>; -defm LDRSBX : Load8RO<0b00, 0, 0b10, GPR64, "ldrsb", i64, sextloadi8>; - -// Load sign-extended word -defm LDRSW : Load32RO<0b10, 0, 0b10, 
GPR64, "ldrsw", i64, sextloadi32>; - -// Pre-fetch. -defm PRFM : PrefetchRO<0b11, 0, 0b10, "prfm">; - -// For regular load, we do not have any alignment requirement. -// Thus, it is safe to directly map the vector loads with interesting -// addressing modes. -// FIXME: We could do the same for bitconvert to floating point vectors. -multiclass ScalToVecROLoadPat { - def : Pat<(VecTy (scalar_to_vector (ScalTy - (loadop (ro.Wpat GPR64sp:$Rn, GPR32:$Rm, ro.Wext:$offset))))), - (INSERT_SUBREG (VecTy (IMPLICIT_DEF)), - (LOADW GPR64sp:$Rn, GPR32:$Rm, ro.Wext:$offset), - sub)>; - - def : Pat<(VecTy (scalar_to_vector (ScalTy - (loadop (ro.Xpat GPR64sp:$Rn, GPR64:$Rm, ro.Xext:$offset))))), - (INSERT_SUBREG (VecTy (IMPLICIT_DEF)), - (LOADX GPR64sp:$Rn, GPR64:$Rm, ro.Xext:$offset), - sub)>; -} - -let AddedComplexity = 10 in { -defm : ScalToVecROLoadPat; -defm : ScalToVecROLoadPat; - -defm : ScalToVecROLoadPat; -defm : ScalToVecROLoadPat; - -defm : ScalToVecROLoadPat; -defm : ScalToVecROLoadPat; - -defm : ScalToVecROLoadPat; -defm : ScalToVecROLoadPat; - -defm : ScalToVecROLoadPat; - -defm : ScalToVecROLoadPat; - - -def : Pat <(v1i64 (scalar_to_vector (i64 - (load (ro_Windexed64 GPR64sp:$Rn, GPR32:$Rm, - ro_Wextend64:$extend))))), - (LDRDroW GPR64sp:$Rn, GPR32:$Rm, ro_Wextend64:$extend)>; - -def : Pat <(v1i64 (scalar_to_vector (i64 - (load (ro_Xindexed64 GPR64sp:$Rn, GPR64:$Rm, - ro_Xextend64:$extend))))), - (LDRDroX GPR64sp:$Rn, GPR64:$Rm, ro_Xextend64:$extend)>; -} - -// Match all load 64 bits width whose type is compatible with FPR64 -multiclass VecROLoadPat { - - def : Pat<(VecTy (load (ro.Wpat GPR64sp:$Rn, GPR32:$Rm, ro.Wext:$extend))), - (LOADW GPR64sp:$Rn, GPR32:$Rm, ro.Wext:$extend)>; - - def : Pat<(VecTy (load (ro.Xpat GPR64sp:$Rn, GPR64:$Rm, ro.Xext:$extend))), - (LOADX GPR64sp:$Rn, GPR64:$Rm, ro.Xext:$extend)>; -} - -let AddedComplexity = 10 in { -let Predicates = [IsLE] in { - // We must do vector loads with LD1 in big-endian. - defm : VecROLoadPat; - defm : VecROLoadPat; - defm : VecROLoadPat; - defm : VecROLoadPat; -} - -defm : VecROLoadPat; -defm : VecROLoadPat; - -// Match all load 128 bits width whose type is compatible with FPR128 -let Predicates = [IsLE] in { - // We must do vector loads with LD1 in big-endian. 
- defm : VecROLoadPat; - defm : VecROLoadPat; - defm : VecROLoadPat; - defm : VecROLoadPat; - defm : VecROLoadPat; - defm : VecROLoadPat; -} -} // AddedComplexity = 10 - -// zextload -> i64 -multiclass ExtLoadTo64ROPat { - def : Pat<(i64 (loadop (ro.Wpat GPR64sp:$Rn, GPR32:$Rm, ro.Wext:$extend))), - (SUBREG_TO_REG (i64 0), - (INSTW GPR64sp:$Rn, GPR32:$Rm, ro.Wext:$extend), - sub_32)>; - - def : Pat<(i64 (loadop (ro.Xpat GPR64sp:$Rn, GPR64:$Rm, ro.Xext:$extend))), - (SUBREG_TO_REG (i64 0), - (INSTX GPR64sp:$Rn, GPR64:$Rm, ro.Xext:$extend), - sub_32)>; -} - -let AddedComplexity = 10 in { - defm : ExtLoadTo64ROPat; - defm : ExtLoadTo64ROPat; - defm : ExtLoadTo64ROPat; - - // zextloadi1 -> zextloadi8 - defm : ExtLoadTo64ROPat; - - // extload -> zextload - defm : ExtLoadTo64ROPat; - defm : ExtLoadTo64ROPat; - defm : ExtLoadTo64ROPat; - - // extloadi1 -> zextloadi8 - defm : ExtLoadTo64ROPat; -} - - -// zextload -> i64 -multiclass ExtLoadTo32ROPat { - def : Pat<(i32 (loadop (ro.Wpat GPR64sp:$Rn, GPR32:$Rm, ro.Wext:$extend))), - (INSTW GPR64sp:$Rn, GPR32:$Rm, ro.Wext:$extend)>; - - def : Pat<(i32 (loadop (ro.Xpat GPR64sp:$Rn, GPR64:$Rm, ro.Xext:$extend))), - (INSTX GPR64sp:$Rn, GPR64:$Rm, ro.Xext:$extend)>; - -} - -let AddedComplexity = 10 in { - // extload -> zextload - defm : ExtLoadTo32ROPat; - defm : ExtLoadTo32ROPat; - defm : ExtLoadTo32ROPat; - - // zextloadi1 -> zextloadi8 - defm : ExtLoadTo32ROPat; -} - -//--- -// (unsigned immediate) -//--- -defm LDRX : LoadUI<0b11, 0, 0b01, GPR64, uimm12s8, "ldr", - [(set GPR64:$Rt, - (load (am_indexed64 GPR64sp:$Rn, uimm12s8:$offset)))]>; -defm LDRW : LoadUI<0b10, 0, 0b01, GPR32, uimm12s4, "ldr", - [(set GPR32:$Rt, - (load (am_indexed32 GPR64sp:$Rn, uimm12s4:$offset)))]>; -defm LDRB : LoadUI<0b00, 1, 0b01, FPR8, uimm12s1, "ldr", - [(set FPR8:$Rt, - (load (am_indexed8 GPR64sp:$Rn, uimm12s1:$offset)))]>; -defm LDRH : LoadUI<0b01, 1, 0b01, FPR16, uimm12s2, "ldr", - [(set (f16 FPR16:$Rt), - (load (am_indexed16 GPR64sp:$Rn, uimm12s2:$offset)))]>; -defm LDRS : LoadUI<0b10, 1, 0b01, FPR32, uimm12s4, "ldr", - [(set (f32 FPR32:$Rt), - (load (am_indexed32 GPR64sp:$Rn, uimm12s4:$offset)))]>; -defm LDRD : LoadUI<0b11, 1, 0b01, FPR64, uimm12s8, "ldr", - [(set (f64 FPR64:$Rt), - (load (am_indexed64 GPR64sp:$Rn, uimm12s8:$offset)))]>; -defm LDRQ : LoadUI<0b00, 1, 0b11, FPR128, uimm12s16, "ldr", - [(set (f128 FPR128:$Rt), - (load (am_indexed128 GPR64sp:$Rn, uimm12s16:$offset)))]>; - -// For regular load, we do not have any alignment requirement. -// Thus, it is safe to directly map the vector loads with interesting -// addressing modes. -// FIXME: We could do the same for bitconvert to floating point vectors. 
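The scalar_to_vector patterns here rely on the register file layout rather than on any extra instruction: the B/H/S/D FP registers are sub-registers of the 128-bit V registers, so a plain scalar FP/SIMD load already leaves its value in lane 0. A hypothetical assembly sketch of the effect (illustration only, not part of this patch):

    ldr     s0, [x0, #4]          // illustration: a 32-bit FP/SIMD load puts the value in bits [31:0]
                                  // of v0, i.e. lane 0 of v0.4s, and zeroes the remaining lanes
    add     v1.4s, v1.4s, v0.4s   // v0 is immediately usable as a vector operand, no INS/DUP needed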
-def : Pat <(v8i8 (scalar_to_vector (i32 - (extloadi8 (am_indexed8 GPR64sp:$Rn, uimm12s1:$offset))))), - (INSERT_SUBREG (v8i8 (IMPLICIT_DEF)), - (LDRBui GPR64sp:$Rn, uimm12s1:$offset), bsub)>; -def : Pat <(v16i8 (scalar_to_vector (i32 - (extloadi8 (am_indexed8 GPR64sp:$Rn, uimm12s1:$offset))))), - (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)), - (LDRBui GPR64sp:$Rn, uimm12s1:$offset), bsub)>; -def : Pat <(v4i16 (scalar_to_vector (i32 - (extloadi16 (am_indexed16 GPR64sp:$Rn, uimm12s2:$offset))))), - (INSERT_SUBREG (v4i16 (IMPLICIT_DEF)), - (LDRHui GPR64sp:$Rn, uimm12s2:$offset), hsub)>; -def : Pat <(v8i16 (scalar_to_vector (i32 - (extloadi16 (am_indexed16 GPR64sp:$Rn, uimm12s2:$offset))))), - (INSERT_SUBREG (v8i16 (IMPLICIT_DEF)), - (LDRHui GPR64sp:$Rn, uimm12s2:$offset), hsub)>; -def : Pat <(v2i32 (scalar_to_vector (i32 - (load (am_indexed32 GPR64sp:$Rn, uimm12s4:$offset))))), - (INSERT_SUBREG (v2i32 (IMPLICIT_DEF)), - (LDRSui GPR64sp:$Rn, uimm12s4:$offset), ssub)>; -def : Pat <(v4i32 (scalar_to_vector (i32 - (load (am_indexed32 GPR64sp:$Rn, uimm12s4:$offset))))), - (INSERT_SUBREG (v4i32 (IMPLICIT_DEF)), - (LDRSui GPR64sp:$Rn, uimm12s4:$offset), ssub)>; -def : Pat <(v1i64 (scalar_to_vector (i64 - (load (am_indexed64 GPR64sp:$Rn, uimm12s8:$offset))))), - (LDRDui GPR64sp:$Rn, uimm12s8:$offset)>; -def : Pat <(v2i64 (scalar_to_vector (i64 - (load (am_indexed64 GPR64sp:$Rn, uimm12s8:$offset))))), - (INSERT_SUBREG (v2i64 (IMPLICIT_DEF)), - (LDRDui GPR64sp:$Rn, uimm12s8:$offset), dsub)>; - -// Match all load 64 bits width whose type is compatible with FPR64 -let Predicates = [IsLE] in { - // We must use LD1 to perform vector loads in big-endian. - def : Pat<(v2f32 (load (am_indexed64 GPR64sp:$Rn, uimm12s8:$offset))), - (LDRDui GPR64sp:$Rn, uimm12s8:$offset)>; - def : Pat<(v8i8 (load (am_indexed64 GPR64sp:$Rn, uimm12s8:$offset))), - (LDRDui GPR64sp:$Rn, uimm12s8:$offset)>; - def : Pat<(v4i16 (load (am_indexed64 GPR64sp:$Rn, uimm12s8:$offset))), - (LDRDui GPR64sp:$Rn, uimm12s8:$offset)>; - def : Pat<(v2i32 (load (am_indexed64 GPR64sp:$Rn, uimm12s8:$offset))), - (LDRDui GPR64sp:$Rn, uimm12s8:$offset)>; -} -def : Pat<(v1f64 (load (am_indexed64 GPR64sp:$Rn, uimm12s8:$offset))), - (LDRDui GPR64sp:$Rn, uimm12s8:$offset)>; -def : Pat<(v1i64 (load (am_indexed64 GPR64sp:$Rn, uimm12s8:$offset))), - (LDRDui GPR64sp:$Rn, uimm12s8:$offset)>; - -// Match all load 128 bits width whose type is compatible with FPR128 -let Predicates = [IsLE] in { - // We must use LD1 to perform vector loads in big-endian. 
- def : Pat<(v4f32 (load (am_indexed128 GPR64sp:$Rn, uimm12s16:$offset))), - (LDRQui GPR64sp:$Rn, uimm12s16:$offset)>; - def : Pat<(v2f64 (load (am_indexed128 GPR64sp:$Rn, uimm12s16:$offset))), - (LDRQui GPR64sp:$Rn, uimm12s16:$offset)>; - def : Pat<(v16i8 (load (am_indexed128 GPR64sp:$Rn, uimm12s16:$offset))), - (LDRQui GPR64sp:$Rn, uimm12s16:$offset)>; - def : Pat<(v8i16 (load (am_indexed128 GPR64sp:$Rn, uimm12s16:$offset))), - (LDRQui GPR64sp:$Rn, uimm12s16:$offset)>; - def : Pat<(v4i32 (load (am_indexed128 GPR64sp:$Rn, uimm12s16:$offset))), - (LDRQui GPR64sp:$Rn, uimm12s16:$offset)>; - def : Pat<(v2i64 (load (am_indexed128 GPR64sp:$Rn, uimm12s16:$offset))), - (LDRQui GPR64sp:$Rn, uimm12s16:$offset)>; -} -def : Pat<(f128 (load (am_indexed128 GPR64sp:$Rn, uimm12s16:$offset))), - (LDRQui GPR64sp:$Rn, uimm12s16:$offset)>; - -defm LDRHH : LoadUI<0b01, 0, 0b01, GPR32, uimm12s2, "ldrh", - [(set GPR32:$Rt, - (zextloadi16 (am_indexed16 GPR64sp:$Rn, - uimm12s2:$offset)))]>; -defm LDRBB : LoadUI<0b00, 0, 0b01, GPR32, uimm12s1, "ldrb", - [(set GPR32:$Rt, - (zextloadi8 (am_indexed8 GPR64sp:$Rn, - uimm12s1:$offset)))]>; -// zextload -> i64 -def : Pat<(i64 (zextloadi8 (am_indexed8 GPR64sp:$Rn, uimm12s1:$offset))), - (SUBREG_TO_REG (i64 0), (LDRBBui GPR64sp:$Rn, uimm12s1:$offset), sub_32)>; -def : Pat<(i64 (zextloadi16 (am_indexed16 GPR64sp:$Rn, uimm12s2:$offset))), - (SUBREG_TO_REG (i64 0), (LDRHHui GPR64sp:$Rn, uimm12s2:$offset), sub_32)>; - -// zextloadi1 -> zextloadi8 -def : Pat<(i32 (zextloadi1 (am_indexed8 GPR64sp:$Rn, uimm12s1:$offset))), - (LDRBBui GPR64sp:$Rn, uimm12s1:$offset)>; -def : Pat<(i64 (zextloadi1 (am_indexed8 GPR64sp:$Rn, uimm12s1:$offset))), - (SUBREG_TO_REG (i64 0), (LDRBBui GPR64sp:$Rn, uimm12s1:$offset), sub_32)>; - -// extload -> zextload -def : Pat<(i32 (extloadi16 (am_indexed16 GPR64sp:$Rn, uimm12s2:$offset))), - (LDRHHui GPR64sp:$Rn, uimm12s2:$offset)>; -def : Pat<(i32 (extloadi8 (am_indexed8 GPR64sp:$Rn, uimm12s1:$offset))), - (LDRBBui GPR64sp:$Rn, uimm12s1:$offset)>; -def : Pat<(i32 (extloadi1 (am_indexed8 GPR64sp:$Rn, uimm12s1:$offset))), - (LDRBBui GPR64sp:$Rn, uimm12s1:$offset)>; -def : Pat<(i64 (extloadi32 (am_indexed32 GPR64sp:$Rn, uimm12s4:$offset))), - (SUBREG_TO_REG (i64 0), (LDRWui GPR64sp:$Rn, uimm12s4:$offset), sub_32)>; -def : Pat<(i64 (extloadi16 (am_indexed16 GPR64sp:$Rn, uimm12s2:$offset))), - (SUBREG_TO_REG (i64 0), (LDRHHui GPR64sp:$Rn, uimm12s2:$offset), sub_32)>; -def : Pat<(i64 (extloadi8 (am_indexed8 GPR64sp:$Rn, uimm12s1:$offset))), - (SUBREG_TO_REG (i64 0), (LDRBBui GPR64sp:$Rn, uimm12s1:$offset), sub_32)>; -def : Pat<(i64 (extloadi1 (am_indexed8 GPR64sp:$Rn, uimm12s1:$offset))), - (SUBREG_TO_REG (i64 0), (LDRBBui GPR64sp:$Rn, uimm12s1:$offset), sub_32)>; - -// load sign-extended half-word -defm LDRSHW : LoadUI<0b01, 0, 0b11, GPR32, uimm12s2, "ldrsh", - [(set GPR32:$Rt, - (sextloadi16 (am_indexed16 GPR64sp:$Rn, - uimm12s2:$offset)))]>; -defm LDRSHX : LoadUI<0b01, 0, 0b10, GPR64, uimm12s2, "ldrsh", - [(set GPR64:$Rt, - (sextloadi16 (am_indexed16 GPR64sp:$Rn, - uimm12s2:$offset)))]>; - -// load sign-extended byte -defm LDRSBW : LoadUI<0b00, 0, 0b11, GPR32, uimm12s1, "ldrsb", - [(set GPR32:$Rt, - (sextloadi8 (am_indexed8 GPR64sp:$Rn, - uimm12s1:$offset)))]>; -defm LDRSBX : LoadUI<0b00, 0, 0b10, GPR64, uimm12s1, "ldrsb", - [(set GPR64:$Rt, - (sextloadi8 (am_indexed8 GPR64sp:$Rn, - uimm12s1:$offset)))]>; - -// load sign-extended word -defm LDRSW : LoadUI<0b10, 0, 0b10, GPR64, uimm12s4, "ldrsw", - [(set GPR64:$Rt, - (sextloadi32 (am_indexed32 GPR64sp:$Rn, 
- uimm12s4:$offset)))]>; - -// load zero-extended word -def : Pat<(i64 (zextloadi32 (am_indexed32 GPR64sp:$Rn, uimm12s4:$offset))), - (SUBREG_TO_REG (i64 0), (LDRWui GPR64sp:$Rn, uimm12s4:$offset), sub_32)>; - -// Pre-fetch. -def PRFMui : PrefetchUI<0b11, 0, 0b10, "prfm", - [(ARM64Prefetch imm:$Rt, - (am_indexed64 GPR64sp:$Rn, - uimm12s8:$offset))]>; - -def : InstAlias<"prfm $Rt, [$Rn]", (PRFMui prfop:$Rt, GPR64sp:$Rn, 0)>; - -//--- -// (literal) -def LDRWl : LoadLiteral<0b00, 0, GPR32, "ldr">; -def LDRXl : LoadLiteral<0b01, 0, GPR64, "ldr">; -def LDRSl : LoadLiteral<0b00, 1, FPR32, "ldr">; -def LDRDl : LoadLiteral<0b01, 1, FPR64, "ldr">; -def LDRQl : LoadLiteral<0b10, 1, FPR128, "ldr">; - -// load sign-extended word -def LDRSWl : LoadLiteral<0b10, 0, GPR64, "ldrsw">; - -// prefetch -def PRFMl : PrefetchLiteral<0b11, 0, "prfm", []>; -// [(ARM64Prefetch imm:$Rt, tglobaladdr:$label)]>; - -//--- -// (unscaled immediate) -defm LDURX : LoadUnscaled<0b11, 0, 0b01, GPR64, "ldur", - [(set GPR64:$Rt, - (load (am_unscaled64 GPR64sp:$Rn, simm9:$offset)))]>; -defm LDURW : LoadUnscaled<0b10, 0, 0b01, GPR32, "ldur", - [(set GPR32:$Rt, - (load (am_unscaled32 GPR64sp:$Rn, simm9:$offset)))]>; -defm LDURB : LoadUnscaled<0b00, 1, 0b01, FPR8, "ldur", - [(set FPR8:$Rt, - (load (am_unscaled8 GPR64sp:$Rn, simm9:$offset)))]>; -defm LDURH : LoadUnscaled<0b01, 1, 0b01, FPR16, "ldur", - [(set FPR16:$Rt, - (load (am_unscaled16 GPR64sp:$Rn, simm9:$offset)))]>; -defm LDURS : LoadUnscaled<0b10, 1, 0b01, FPR32, "ldur", - [(set (f32 FPR32:$Rt), - (load (am_unscaled32 GPR64sp:$Rn, simm9:$offset)))]>; -defm LDURD : LoadUnscaled<0b11, 1, 0b01, FPR64, "ldur", - [(set (f64 FPR64:$Rt), - (load (am_unscaled64 GPR64sp:$Rn, simm9:$offset)))]>; -defm LDURQ : LoadUnscaled<0b00, 1, 0b11, FPR128, "ldur", - [(set (f128 FPR128:$Rt), - (load (am_unscaled128 GPR64sp:$Rn, simm9:$offset)))]>; - -defm LDURHH - : LoadUnscaled<0b01, 0, 0b01, GPR32, "ldurh", - [(set GPR32:$Rt, - (zextloadi16 (am_unscaled16 GPR64sp:$Rn, simm9:$offset)))]>; -defm LDURBB - : LoadUnscaled<0b00, 0, 0b01, GPR32, "ldurb", - [(set GPR32:$Rt, - (zextloadi8 (am_unscaled16 GPR64sp:$Rn, simm9:$offset)))]>; - -// Match all load 64 bits width whose type is compatible with FPR64 -let Predicates = [IsLE] in { - def : Pat<(v2f32 (load (am_unscaled64 GPR64sp:$Rn, simm9:$offset))), - (LDURDi GPR64sp:$Rn, simm9:$offset)>; - def : Pat<(v2i32 (load (am_unscaled64 GPR64sp:$Rn, simm9:$offset))), - (LDURDi GPR64sp:$Rn, simm9:$offset)>; - def : Pat<(v4i16 (load (am_unscaled64 GPR64sp:$Rn, simm9:$offset))), - (LDURDi GPR64sp:$Rn, simm9:$offset)>; - def : Pat<(v8i8 (load (am_unscaled64 GPR64sp:$Rn, simm9:$offset))), - (LDURDi GPR64sp:$Rn, simm9:$offset)>; -} -def : Pat<(v1f64 (load (am_unscaled64 GPR64sp:$Rn, simm9:$offset))), - (LDURDi GPR64sp:$Rn, simm9:$offset)>; -def : Pat<(v1i64 (load (am_unscaled64 GPR64sp:$Rn, simm9:$offset))), - (LDURDi GPR64sp:$Rn, simm9:$offset)>; - -// Match all load 128 bits width whose type is compatible with FPR128 -let Predicates = [IsLE] in { - def : Pat<(v2f64 (load (am_unscaled128 GPR64sp:$Rn, simm9:$offset))), - (LDURQi GPR64sp:$Rn, simm9:$offset)>; - def : Pat<(v2i64 (load (am_unscaled128 GPR64sp:$Rn, simm9:$offset))), - (LDURQi GPR64sp:$Rn, simm9:$offset)>; - def : Pat<(v4f32 (load (am_unscaled128 GPR64sp:$Rn, simm9:$offset))), - (LDURQi GPR64sp:$Rn, simm9:$offset)>; - def : Pat<(v4i32 (load (am_unscaled128 GPR64sp:$Rn, simm9:$offset))), - (LDURQi GPR64sp:$Rn, simm9:$offset)>; - def : Pat<(v8i16 (load (am_unscaled128 GPR64sp:$Rn, simm9:$offset))), - 
(LDURQi GPR64sp:$Rn, simm9:$offset)>; - def : Pat<(v16i8 (load (am_unscaled128 GPR64sp:$Rn, simm9:$offset))), - (LDURQi GPR64sp:$Rn, simm9:$offset)>; -} - -// anyext -> zext -def : Pat<(i32 (extloadi16 (am_unscaled16 GPR64sp:$Rn, simm9:$offset))), - (LDURHHi GPR64sp:$Rn, simm9:$offset)>; -def : Pat<(i32 (extloadi8 (am_unscaled8 GPR64sp:$Rn, simm9:$offset))), - (LDURBBi GPR64sp:$Rn, simm9:$offset)>; -def : Pat<(i32 (extloadi1 (am_unscaled8 GPR64sp:$Rn, simm9:$offset))), - (LDURBBi GPR64sp:$Rn, simm9:$offset)>; -def : Pat<(i64 (extloadi32 (am_unscaled32 GPR64sp:$Rn, simm9:$offset))), - (SUBREG_TO_REG (i64 0), (LDURWi GPR64sp:$Rn, simm9:$offset), sub_32)>; -def : Pat<(i64 (extloadi16 (am_unscaled16 GPR64sp:$Rn, simm9:$offset))), - (SUBREG_TO_REG (i64 0), (LDURHHi GPR64sp:$Rn, simm9:$offset), sub_32)>; -def : Pat<(i64 (extloadi8 (am_unscaled8 GPR64sp:$Rn, simm9:$offset))), - (SUBREG_TO_REG (i64 0), (LDURBBi GPR64sp:$Rn, simm9:$offset), sub_32)>; -def : Pat<(i64 (extloadi1 (am_unscaled8 GPR64sp:$Rn, simm9:$offset))), - (SUBREG_TO_REG (i64 0), (LDURBBi GPR64sp:$Rn, simm9:$offset), sub_32)>; -// unscaled zext -def : Pat<(i32 (zextloadi16 (am_unscaled16 GPR64sp:$Rn, simm9:$offset))), - (LDURHHi GPR64sp:$Rn, simm9:$offset)>; -def : Pat<(i32 (zextloadi8 (am_unscaled8 GPR64sp:$Rn, simm9:$offset))), - (LDURBBi GPR64sp:$Rn, simm9:$offset)>; -def : Pat<(i32 (zextloadi1 (am_unscaled8 GPR64sp:$Rn, simm9:$offset))), - (LDURBBi GPR64sp:$Rn, simm9:$offset)>; -def : Pat<(i64 (zextloadi32 (am_unscaled32 GPR64sp:$Rn, simm9:$offset))), - (SUBREG_TO_REG (i64 0), (LDURWi GPR64sp:$Rn, simm9:$offset), sub_32)>; -def : Pat<(i64 (zextloadi16 (am_unscaled16 GPR64sp:$Rn, simm9:$offset))), - (SUBREG_TO_REG (i64 0), (LDURHHi GPR64sp:$Rn, simm9:$offset), sub_32)>; -def : Pat<(i64 (zextloadi8 (am_unscaled8 GPR64sp:$Rn, simm9:$offset))), - (SUBREG_TO_REG (i64 0), (LDURBBi GPR64sp:$Rn, simm9:$offset), sub_32)>; -def : Pat<(i64 (zextloadi1 (am_unscaled8 GPR64sp:$Rn, simm9:$offset))), - (SUBREG_TO_REG (i64 0), (LDURBBi GPR64sp:$Rn, simm9:$offset), sub_32)>; - - -//--- -// LDR mnemonics fall back to LDUR for negative or unaligned offsets. - -// Define new assembler match classes as we want to only match these when -// the don't otherwise match the scaled addressing mode for LDR/STR. Don't -// associate a DiagnosticType either, as we want the diagnostic for the -// canonical form (the scaled operand) to take precedence. 
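From the assembler's point of view the fall-back behaves as sketched below (hypothetical input, not taken from the patch or its tests): an offset that fits the scaled unsigned-immediate form keeps the LDR encoding, while a negative or unaligned offset is accepted through these aliases and emitted as LDUR.

    ldr     x0, [x1, #8]          // illustration: positive, 8-byte-aligned offset -> scaled LDR (unsigned immediate)
    ldr     x0, [x1, #-8]         // negative offset -> matched by the fall-back alias, encoded as ldur x0, [x1, #-8]
    ldr     w2, [x1, #3]          // offset not a multiple of the access size -> encoded as ldur w2, [x1, #3]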
-class SImm9OffsetOperand : AsmOperandClass { - let Name = "SImm9OffsetFB" # Width; - let PredicateMethod = "isSImm9OffsetFB<" # Width # ">"; - let RenderMethod = "addImmOperands"; -} - -def SImm9OffsetFB8Operand : SImm9OffsetOperand<8>; -def SImm9OffsetFB16Operand : SImm9OffsetOperand<16>; -def SImm9OffsetFB32Operand : SImm9OffsetOperand<32>; -def SImm9OffsetFB64Operand : SImm9OffsetOperand<64>; -def SImm9OffsetFB128Operand : SImm9OffsetOperand<128>; - -def simm9_offset_fb8 : Operand { - let ParserMatchClass = SImm9OffsetFB8Operand; -} -def simm9_offset_fb16 : Operand { - let ParserMatchClass = SImm9OffsetFB16Operand; -} -def simm9_offset_fb32 : Operand { - let ParserMatchClass = SImm9OffsetFB32Operand; -} -def simm9_offset_fb64 : Operand { - let ParserMatchClass = SImm9OffsetFB64Operand; -} -def simm9_offset_fb128 : Operand { - let ParserMatchClass = SImm9OffsetFB128Operand; -} - -def : InstAlias<"ldr $Rt, [$Rn, $offset]", - (LDURXi GPR64:$Rt, GPR64sp:$Rn, simm9_offset_fb64:$offset), 0>; -def : InstAlias<"ldr $Rt, [$Rn, $offset]", - (LDURWi GPR32:$Rt, GPR64sp:$Rn, simm9_offset_fb32:$offset), 0>; -def : InstAlias<"ldr $Rt, [$Rn, $offset]", - (LDURBi FPR8:$Rt, GPR64sp:$Rn, simm9_offset_fb8:$offset), 0>; -def : InstAlias<"ldr $Rt, [$Rn, $offset]", - (LDURHi FPR16:$Rt, GPR64sp:$Rn, simm9_offset_fb16:$offset), 0>; -def : InstAlias<"ldr $Rt, [$Rn, $offset]", - (LDURSi FPR32:$Rt, GPR64sp:$Rn, simm9_offset_fb32:$offset), 0>; -def : InstAlias<"ldr $Rt, [$Rn, $offset]", - (LDURDi FPR64:$Rt, GPR64sp:$Rn, simm9_offset_fb64:$offset), 0>; -def : InstAlias<"ldr $Rt, [$Rn, $offset]", - (LDURQi FPR128:$Rt, GPR64sp:$Rn, simm9_offset_fb128:$offset), 0>; - -// zextload -> i64 -def : Pat<(i64 (zextloadi8 (am_unscaled8 GPR64sp:$Rn, simm9:$offset))), - (SUBREG_TO_REG (i64 0), (LDURBBi GPR64sp:$Rn, simm9:$offset), sub_32)>; -def : Pat<(i64 (zextloadi16 (am_unscaled16 GPR64sp:$Rn, simm9:$offset))), - (SUBREG_TO_REG (i64 0), (LDURHHi GPR64sp:$Rn, simm9:$offset), sub_32)>; - -// load sign-extended half-word -defm LDURSHW - : LoadUnscaled<0b01, 0, 0b11, GPR32, "ldursh", - [(set GPR32:$Rt, - (sextloadi16 (am_unscaled16 GPR64sp:$Rn, simm9:$offset)))]>; -defm LDURSHX - : LoadUnscaled<0b01, 0, 0b10, GPR64, "ldursh", - [(set GPR64:$Rt, - (sextloadi16 (am_unscaled16 GPR64sp:$Rn, simm9:$offset)))]>; - -// load sign-extended byte -defm LDURSBW - : LoadUnscaled<0b00, 0, 0b11, GPR32, "ldursb", - [(set GPR32:$Rt, - (sextloadi8 (am_unscaled8 GPR64sp:$Rn, simm9:$offset)))]>; -defm LDURSBX - : LoadUnscaled<0b00, 0, 0b10, GPR64, "ldursb", - [(set GPR64:$Rt, - (sextloadi8 (am_unscaled8 GPR64sp:$Rn, simm9:$offset)))]>; - -// load sign-extended word -defm LDURSW - : LoadUnscaled<0b10, 0, 0b10, GPR64, "ldursw", - [(set GPR64:$Rt, - (sextloadi32 (am_unscaled32 GPR64sp:$Rn, simm9:$offset)))]>; - -// zero and sign extending aliases from generic LDR* mnemonics to LDUR*. 
-def : InstAlias<"ldrb $Rt, [$Rn, $offset]", - (LDURBBi GPR32:$Rt, GPR64sp:$Rn, simm9_offset_fb8:$offset), 0>; -def : InstAlias<"ldrh $Rt, [$Rn, $offset]", - (LDURHHi GPR32:$Rt, GPR64sp:$Rn, simm9_offset_fb16:$offset), 0>; -def : InstAlias<"ldrsb $Rt, [$Rn, $offset]", - (LDURSBWi GPR32:$Rt, GPR64sp:$Rn, simm9_offset_fb8:$offset), 0>; -def : InstAlias<"ldrsb $Rt, [$Rn, $offset]", - (LDURSBXi GPR64:$Rt, GPR64sp:$Rn, simm9_offset_fb8:$offset), 0>; -def : InstAlias<"ldrsh $Rt, [$Rn, $offset]", - (LDURSHWi GPR32:$Rt, GPR64sp:$Rn, simm9_offset_fb16:$offset), 0>; -def : InstAlias<"ldrsh $Rt, [$Rn, $offset]", - (LDURSHXi GPR64:$Rt, GPR64sp:$Rn, simm9_offset_fb16:$offset), 0>; -def : InstAlias<"ldrsw $Rt, [$Rn, $offset]", - (LDURSWi GPR64:$Rt, GPR64sp:$Rn, simm9_offset_fb32:$offset), 0>; - -// Pre-fetch. -defm PRFUM : PrefetchUnscaled<0b11, 0, 0b10, "prfum", - [(ARM64Prefetch imm:$Rt, - (am_unscaled64 GPR64sp:$Rn, simm9:$offset))]>; - -//--- -// (unscaled immediate, unprivileged) -defm LDTRX : LoadUnprivileged<0b11, 0, 0b01, GPR64, "ldtr">; -defm LDTRW : LoadUnprivileged<0b10, 0, 0b01, GPR32, "ldtr">; - -defm LDTRH : LoadUnprivileged<0b01, 0, 0b01, GPR32, "ldtrh">; -defm LDTRB : LoadUnprivileged<0b00, 0, 0b01, GPR32, "ldtrb">; - -// load sign-extended half-word -defm LDTRSHW : LoadUnprivileged<0b01, 0, 0b11, GPR32, "ldtrsh">; -defm LDTRSHX : LoadUnprivileged<0b01, 0, 0b10, GPR64, "ldtrsh">; - -// load sign-extended byte -defm LDTRSBW : LoadUnprivileged<0b00, 0, 0b11, GPR32, "ldtrsb">; -defm LDTRSBX : LoadUnprivileged<0b00, 0, 0b10, GPR64, "ldtrsb">; - -// load sign-extended word -defm LDTRSW : LoadUnprivileged<0b10, 0, 0b10, GPR64, "ldtrsw">; - -//--- -// (immediate pre-indexed) -def LDRWpre : LoadPreIdx<0b10, 0, 0b01, GPR32, "ldr">; -def LDRXpre : LoadPreIdx<0b11, 0, 0b01, GPR64, "ldr">; -def LDRBpre : LoadPreIdx<0b00, 1, 0b01, FPR8, "ldr">; -def LDRHpre : LoadPreIdx<0b01, 1, 0b01, FPR16, "ldr">; -def LDRSpre : LoadPreIdx<0b10, 1, 0b01, FPR32, "ldr">; -def LDRDpre : LoadPreIdx<0b11, 1, 0b01, FPR64, "ldr">; -def LDRQpre : LoadPreIdx<0b00, 1, 0b11, FPR128, "ldr">; - -// load sign-extended half-word -def LDRSHWpre : LoadPreIdx<0b01, 0, 0b11, GPR32, "ldrsh">; -def LDRSHXpre : LoadPreIdx<0b01, 0, 0b10, GPR64, "ldrsh">; - -// load sign-extended byte -def LDRSBWpre : LoadPreIdx<0b00, 0, 0b11, GPR32, "ldrsb">; -def LDRSBXpre : LoadPreIdx<0b00, 0, 0b10, GPR64, "ldrsb">; - -// load zero-extended byte -def LDRBBpre : LoadPreIdx<0b00, 0, 0b01, GPR32, "ldrb">; -def LDRHHpre : LoadPreIdx<0b01, 0, 0b01, GPR32, "ldrh">; - -// load sign-extended word -def LDRSWpre : LoadPreIdx<0b10, 0, 0b10, GPR64, "ldrsw">; - -//--- -// (immediate post-indexed) -def LDRWpost : LoadPostIdx<0b10, 0, 0b01, GPR32, "ldr">; -def LDRXpost : LoadPostIdx<0b11, 0, 0b01, GPR64, "ldr">; -def LDRBpost : LoadPostIdx<0b00, 1, 0b01, FPR8, "ldr">; -def LDRHpost : LoadPostIdx<0b01, 1, 0b01, FPR16, "ldr">; -def LDRSpost : LoadPostIdx<0b10, 1, 0b01, FPR32, "ldr">; -def LDRDpost : LoadPostIdx<0b11, 1, 0b01, FPR64, "ldr">; -def LDRQpost : LoadPostIdx<0b00, 1, 0b11, FPR128, "ldr">; - -// load sign-extended half-word -def LDRSHWpost : LoadPostIdx<0b01, 0, 0b11, GPR32, "ldrsh">; -def LDRSHXpost : LoadPostIdx<0b01, 0, 0b10, GPR64, "ldrsh">; - -// load sign-extended byte -def LDRSBWpost : LoadPostIdx<0b00, 0, 0b11, GPR32, "ldrsb">; -def LDRSBXpost : LoadPostIdx<0b00, 0, 0b10, GPR64, "ldrsb">; - -// load zero-extended byte -def LDRBBpost : LoadPostIdx<0b00, 0, 0b01, GPR32, "ldrb">; -def LDRHHpost : LoadPostIdx<0b01, 0, 0b01, GPR32, "ldrh">; - -// load 
sign-extended word -def LDRSWpost : LoadPostIdx<0b10, 0, 0b10, GPR64, "ldrsw">; - -//===----------------------------------------------------------------------===// -// Store instructions. -//===----------------------------------------------------------------------===// - -// Pair (indexed, offset) -// FIXME: Use dedicated range-checked addressing mode operand here. -defm STPW : StorePairOffset<0b00, 0, GPR32, simm7s4, "stp">; -defm STPX : StorePairOffset<0b10, 0, GPR64, simm7s8, "stp">; -defm STPS : StorePairOffset<0b00, 1, FPR32, simm7s4, "stp">; -defm STPD : StorePairOffset<0b01, 1, FPR64, simm7s8, "stp">; -defm STPQ : StorePairOffset<0b10, 1, FPR128, simm7s16, "stp">; - -// Pair (pre-indexed) -def STPWpre : StorePairPreIdx<0b00, 0, GPR32, simm7s4, "stp">; -def STPXpre : StorePairPreIdx<0b10, 0, GPR64, simm7s8, "stp">; -def STPSpre : StorePairPreIdx<0b00, 1, FPR32, simm7s4, "stp">; -def STPDpre : StorePairPreIdx<0b01, 1, FPR64, simm7s8, "stp">; -def STPQpre : StorePairPreIdx<0b10, 1, FPR128, simm7s16, "stp">; - -// Pair (pre-indexed) -def STPWpost : StorePairPostIdx<0b00, 0, GPR32, simm7s4, "stp">; -def STPXpost : StorePairPostIdx<0b10, 0, GPR64, simm7s8, "stp">; -def STPSpost : StorePairPostIdx<0b00, 1, FPR32, simm7s4, "stp">; -def STPDpost : StorePairPostIdx<0b01, 1, FPR64, simm7s8, "stp">; -def STPQpost : StorePairPostIdx<0b10, 1, FPR128, simm7s16, "stp">; - -// Pair (no allocate) -defm STNPW : StorePairNoAlloc<0b00, 0, GPR32, simm7s4, "stnp">; -defm STNPX : StorePairNoAlloc<0b10, 0, GPR64, simm7s8, "stnp">; -defm STNPS : StorePairNoAlloc<0b00, 1, FPR32, simm7s4, "stnp">; -defm STNPD : StorePairNoAlloc<0b01, 1, FPR64, simm7s8, "stnp">; -defm STNPQ : StorePairNoAlloc<0b10, 1, FPR128, simm7s16, "stnp">; - -//--- -// (Register offset) - -// Integer -defm STRBB : Store8RO< 0b00, 0, 0b00, GPR32, "strb", i32, truncstorei8>; -defm STRHH : Store16RO<0b01, 0, 0b00, GPR32, "strh", i32, truncstorei16>; -defm STRW : Store32RO<0b10, 0, 0b00, GPR32, "str", i32, store>; -defm STRX : Store64RO<0b11, 0, 0b00, GPR64, "str", i64, store>; - - -// Floating-point -defm STRB : Store8RO< 0b00, 1, 0b00, FPR8, "str", untyped, store>; -defm STRH : Store16RO<0b01, 1, 0b00, FPR16, "str", f16, store>; -defm STRS : Store32RO<0b10, 1, 0b00, FPR32, "str", f32, store>; -defm STRD : Store64RO<0b11, 1, 0b00, FPR64, "str", f64, store>; -defm STRQ : Store128RO<0b00, 1, 0b10, FPR128, "str", f128, store>; - -multiclass TruncStoreFrom64ROPat { - - def : Pat<(storeop GPR64:$Rt, - (ro.Wpat GPR64sp:$Rn, GPR32:$Rm, ro.Wext:$extend)), - (STRW (EXTRACT_SUBREG GPR64:$Rt, sub_32), - GPR64sp:$Rn, GPR32:$Rm, ro.Wext:$extend)>; - - def : Pat<(storeop GPR64:$Rt, - (ro.Xpat GPR64sp:$Rn, GPR64:$Rm, ro.Xext:$extend)), - (STRX (EXTRACT_SUBREG GPR64:$Rt, sub_32), - GPR64sp:$Rn, GPR64:$Rm, ro.Xext:$extend)>; -} - -let AddedComplexity = 10 in { - // truncstore i64 - defm : TruncStoreFrom64ROPat; - defm : TruncStoreFrom64ROPat; - defm : TruncStoreFrom64ROPat; -} - -multiclass VecROStorePat { - def : Pat<(store (VecTy FPR:$Rt), - (ro.Wpat GPR64sp:$Rn, GPR32:$Rm, ro.Wext:$extend)), - (STRW FPR:$Rt, GPR64sp:$Rn, GPR32:$Rm, ro.Wext:$extend)>; - - def : Pat<(store (VecTy FPR:$Rt), - (ro.Xpat GPR64sp:$Rn, GPR64:$Rm, ro.Xext:$extend)), - (STRX FPR:$Rt, GPR64sp:$Rn, GPR64:$Rm, ro.Xext:$extend)>; -} - -let AddedComplexity = 10 in { -// Match all store 64 bits width whose type is compatible with FPR64 -let Predicates = [IsLE] in { - // We must use ST1 to store vectors in big-endian. 
- defm : VecROStorePat; - defm : VecROStorePat; - defm : VecROStorePat; - defm : VecROStorePat; -} - -defm : VecROStorePat; -defm : VecROStorePat; - -// Match all store 128 bits width whose type is compatible with FPR128 -let Predicates = [IsLE] in { - // We must use ST1 to store vectors in big-endian. - defm : VecROStorePat; - defm : VecROStorePat; - defm : VecROStorePat; - defm : VecROStorePat; - defm : VecROStorePat; - defm : VecROStorePat; -} -} // AddedComplexity = 10 - -//--- -// (unsigned immediate) -defm STRX : StoreUI<0b11, 0, 0b00, GPR64, uimm12s8, "str", - [(store GPR64:$Rt, - (am_indexed64 GPR64sp:$Rn, uimm12s8:$offset))]>; -defm STRW : StoreUI<0b10, 0, 0b00, GPR32, uimm12s4, "str", - [(store GPR32:$Rt, - (am_indexed32 GPR64sp:$Rn, uimm12s4:$offset))]>; -defm STRB : StoreUI<0b00, 1, 0b00, FPR8, uimm12s1, "str", - [(store FPR8:$Rt, - (am_indexed8 GPR64sp:$Rn, uimm12s1:$offset))]>; -defm STRH : StoreUI<0b01, 1, 0b00, FPR16, uimm12s2, "str", - [(store (f16 FPR16:$Rt), - (am_indexed16 GPR64sp:$Rn, uimm12s2:$offset))]>; -defm STRS : StoreUI<0b10, 1, 0b00, FPR32, uimm12s4, "str", - [(store (f32 FPR32:$Rt), - (am_indexed32 GPR64sp:$Rn, uimm12s4:$offset))]>; -defm STRD : StoreUI<0b11, 1, 0b00, FPR64, uimm12s8, "str", - [(store (f64 FPR64:$Rt), - (am_indexed64 GPR64sp:$Rn, uimm12s8:$offset))]>; -defm STRQ : StoreUI<0b00, 1, 0b10, FPR128, uimm12s16, "str", []>; - -defm STRHH : StoreUI<0b01, 0, 0b00, GPR32, uimm12s2, "strh", - [(truncstorei16 GPR32:$Rt, - (am_indexed16 GPR64sp:$Rn, - uimm12s2:$offset))]>; -defm STRBB : StoreUI<0b00, 0, 0b00, GPR32, uimm12s1, "strb", - [(truncstorei8 GPR32:$Rt, - (am_indexed8 GPR64sp:$Rn, - uimm12s1:$offset))]>; - -// Match all store 64 bits width whose type is compatible with FPR64 -let AddedComplexity = 10 in { -let Predicates = [IsLE] in { - // We must use ST1 to store vectors in big-endian. - def : Pat<(store (v2f32 FPR64:$Rt), - (am_indexed64 GPR64sp:$Rn, uimm12s8:$offset)), - (STRDui FPR64:$Rt, GPR64sp:$Rn, uimm12s8:$offset)>; - def : Pat<(store (v8i8 FPR64:$Rt), - (am_indexed64 GPR64sp:$Rn, uimm12s8:$offset)), - (STRDui FPR64:$Rt, GPR64sp:$Rn, uimm12s8:$offset)>; - def : Pat<(store (v4i16 FPR64:$Rt), - (am_indexed64 GPR64sp:$Rn, uimm12s8:$offset)), - (STRDui FPR64:$Rt, GPR64sp:$Rn, uimm12s8:$offset)>; - def : Pat<(store (v2i32 FPR64:$Rt), - (am_indexed64 GPR64sp:$Rn, uimm12s8:$offset)), - (STRDui FPR64:$Rt, GPR64sp:$Rn, uimm12s8:$offset)>; -} -def : Pat<(store (v1f64 FPR64:$Rt), - (am_indexed64 GPR64sp:$Rn, uimm12s8:$offset)), - (STRDui FPR64:$Rt, GPR64sp:$Rn, uimm12s8:$offset)>; -def : Pat<(store (v1i64 FPR64:$Rt), - (am_indexed64 GPR64sp:$Rn, uimm12s8:$offset)), - (STRDui FPR64:$Rt, GPR64sp:$Rn, uimm12s8:$offset)>; - -// Match all store 128 bits width whose type is compatible with FPR128 -let Predicates = [IsLE] in { - // We must use ST1 to store vectors in big-endian. 
- def : Pat<(store (v4f32 FPR128:$Rt), - (am_indexed128 GPR64sp:$Rn, uimm12s16:$offset)), - (STRQui FPR128:$Rt, GPR64sp:$Rn, uimm12s16:$offset)>; - def : Pat<(store (v2f64 FPR128:$Rt), - (am_indexed128 GPR64sp:$Rn, uimm12s16:$offset)), - (STRQui FPR128:$Rt, GPR64sp:$Rn, uimm12s16:$offset)>; - def : Pat<(store (v16i8 FPR128:$Rt), - (am_indexed128 GPR64sp:$Rn, uimm12s16:$offset)), - (STRQui FPR128:$Rt, GPR64sp:$Rn, uimm12s16:$offset)>; - def : Pat<(store (v8i16 FPR128:$Rt), - (am_indexed128 GPR64sp:$Rn, uimm12s16:$offset)), - (STRQui FPR128:$Rt, GPR64sp:$Rn, uimm12s16:$offset)>; - def : Pat<(store (v4i32 FPR128:$Rt), - (am_indexed128 GPR64sp:$Rn, uimm12s16:$offset)), - (STRQui FPR128:$Rt, GPR64sp:$Rn, uimm12s16:$offset)>; - def : Pat<(store (v2i64 FPR128:$Rt), - (am_indexed128 GPR64sp:$Rn, uimm12s16:$offset)), - (STRQui FPR128:$Rt, GPR64sp:$Rn, uimm12s16:$offset)>; -} -def : Pat<(store (f128 FPR128:$Rt), - (am_indexed128 GPR64sp:$Rn, uimm12s16:$offset)), - (STRQui FPR128:$Rt, GPR64sp:$Rn, uimm12s16:$offset)>; - -// truncstore i64 -def : Pat<(truncstorei32 GPR64:$Rt, - (am_indexed32 GPR64sp:$Rn, uimm12s4:$offset)), - (STRWui (EXTRACT_SUBREG GPR64:$Rt, sub_32), GPR64sp:$Rn, uimm12s4:$offset)>; -def : Pat<(truncstorei16 GPR64:$Rt, - (am_indexed16 GPR64sp:$Rn, uimm12s2:$offset)), - (STRHHui (EXTRACT_SUBREG GPR64:$Rt, sub_32), GPR64sp:$Rn, uimm12s2:$offset)>; -def : Pat<(truncstorei8 GPR64:$Rt, (am_indexed8 GPR64sp:$Rn, uimm12s1:$offset)), - (STRBBui (EXTRACT_SUBREG GPR64:$Rt, sub_32), GPR64sp:$Rn, uimm12s1:$offset)>; - -} // AddedComplexity = 10 - -//--- -// (unscaled immediate) -defm STURX : StoreUnscaled<0b11, 0, 0b00, GPR64, "stur", - [(store GPR64:$Rt, - (am_unscaled64 GPR64sp:$Rn, simm9:$offset))]>; -defm STURW : StoreUnscaled<0b10, 0, 0b00, GPR32, "stur", - [(store GPR32:$Rt, - (am_unscaled32 GPR64sp:$Rn, simm9:$offset))]>; -defm STURB : StoreUnscaled<0b00, 1, 0b00, FPR8, "stur", - [(store FPR8:$Rt, - (am_unscaled8 GPR64sp:$Rn, simm9:$offset))]>; -defm STURH : StoreUnscaled<0b01, 1, 0b00, FPR16, "stur", - [(store (f16 FPR16:$Rt), - (am_unscaled16 GPR64sp:$Rn, simm9:$offset))]>; -defm STURS : StoreUnscaled<0b10, 1, 0b00, FPR32, "stur", - [(store (f32 FPR32:$Rt), - (am_unscaled32 GPR64sp:$Rn, simm9:$offset))]>; -defm STURD : StoreUnscaled<0b11, 1, 0b00, FPR64, "stur", - [(store (f64 FPR64:$Rt), - (am_unscaled64 GPR64sp:$Rn, simm9:$offset))]>; -defm STURQ : StoreUnscaled<0b00, 1, 0b10, FPR128, "stur", - [(store (f128 FPR128:$Rt), - (am_unscaled128 GPR64sp:$Rn, simm9:$offset))]>; -defm STURHH : StoreUnscaled<0b01, 0, 0b00, GPR32, "sturh", - [(truncstorei16 GPR32:$Rt, - (am_unscaled16 GPR64sp:$Rn, simm9:$offset))]>; -defm STURBB : StoreUnscaled<0b00, 0, 0b00, GPR32, "sturb", - [(truncstorei8 GPR32:$Rt, - (am_unscaled8 GPR64sp:$Rn, simm9:$offset))]>; - -// Match all store 64 bits width whose type is compatible with FPR64 -let Predicates = [IsLE] in { - // We must use ST1 to store vectors in big-endian. 
- def : Pat<(store (v2f32 FPR64:$Rt), - (am_unscaled64 GPR64sp:$Rn, simm9:$offset)), - (STURDi FPR64:$Rt, GPR64sp:$Rn, simm9:$offset)>; - def : Pat<(store (v8i8 FPR64:$Rt), - (am_unscaled64 GPR64sp:$Rn, simm9:$offset)), - (STURDi FPR64:$Rt, GPR64sp:$Rn, simm9:$offset)>; - def : Pat<(store (v4i16 FPR64:$Rt), - (am_unscaled64 GPR64sp:$Rn, simm9:$offset)), - (STURDi FPR64:$Rt, GPR64sp:$Rn, simm9:$offset)>; - def : Pat<(store (v2i32 FPR64:$Rt), - (am_unscaled64 GPR64sp:$Rn, simm9:$offset)), - (STURDi FPR64:$Rt, GPR64sp:$Rn, simm9:$offset)>; -} -def : Pat<(store (v1f64 FPR64:$Rt), (am_unscaled64 GPR64sp:$Rn, simm9:$offset)), - (STURDi FPR64:$Rt, GPR64sp:$Rn, simm9:$offset)>; -def : Pat<(store (v1i64 FPR64:$Rt), (am_unscaled64 GPR64sp:$Rn, simm9:$offset)), - (STURDi FPR64:$Rt, GPR64sp:$Rn, simm9:$offset)>; - -// Match all store 128 bits width whose type is compatible with FPR128 -let Predicates = [IsLE] in { - // We must use ST1 to store vectors in big-endian. - def : Pat<(store (v4f32 FPR128:$Rt), - (am_unscaled128 GPR64sp:$Rn, simm9:$offset)), - (STURQi FPR128:$Rt, GPR64sp:$Rn, simm9:$offset)>; - def : Pat<(store (v2f64 FPR128:$Rt), - (am_unscaled128 GPR64sp:$Rn, simm9:$offset)), - (STURQi FPR128:$Rt, GPR64sp:$Rn, simm9:$offset)>; - def : Pat<(store (v16i8 FPR128:$Rt), - (am_unscaled128 GPR64sp:$Rn, simm9:$offset)), - (STURQi FPR128:$Rt, GPR64sp:$Rn, simm9:$offset)>; - def : Pat<(store (v8i16 FPR128:$Rt), - (am_unscaled128 GPR64sp:$Rn, simm9:$offset)), - (STURQi FPR128:$Rt, GPR64sp:$Rn, simm9:$offset)>; - def : Pat<(store (v4i32 FPR128:$Rt), - (am_unscaled128 GPR64sp:$Rn, simm9:$offset)), - (STURQi FPR128:$Rt, GPR64sp:$Rn, simm9:$offset)>; - def : Pat<(store (v2i64 FPR128:$Rt), - (am_unscaled128 GPR64sp:$Rn, simm9:$offset)), - (STURQi FPR128:$Rt, GPR64sp:$Rn, simm9:$offset)>; - def : Pat<(store (v2f64 FPR128:$Rt), - (am_unscaled128 GPR64sp:$Rn, simm9:$offset)), - (STURQi FPR128:$Rt, GPR64sp:$Rn, simm9:$offset)>; -} - -// unscaled i64 truncating stores -def : Pat<(truncstorei32 GPR64:$Rt, (am_unscaled32 GPR64sp:$Rn, simm9:$offset)), - (STURWi (EXTRACT_SUBREG GPR64:$Rt, sub_32), GPR64sp:$Rn, simm9:$offset)>; -def : Pat<(truncstorei16 GPR64:$Rt, (am_unscaled16 GPR64sp:$Rn, simm9:$offset)), - (STURHHi (EXTRACT_SUBREG GPR64:$Rt, sub_32), GPR64sp:$Rn, simm9:$offset)>; -def : Pat<(truncstorei8 GPR64:$Rt, (am_unscaled8 GPR64sp:$Rn, simm9:$offset)), - (STURBBi (EXTRACT_SUBREG GPR64:$Rt, sub_32), GPR64sp:$Rn, simm9:$offset)>; - -//--- -// STR mnemonics fall back to STUR for negative or unaligned offsets. 
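A hypothetical snippet (not from the patch) showing both points at the assembly level: a truncating store of an i64 just stores the W (or halfword/byte) view of the same register, and the unscaled STUR forms accept the negative offsets that the scaled STR encoding cannot.

    stur    w0, [x1, #-4]         // illustration: stores bits [31:0] of x0; the truncation is simply using w0
    sturh   w0, [x1, #-2]         // stores bits [15:0] of x0
    stur    q0, [x1, #-16]        // the unscaled form is also available for the FP/SIMD registers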
-def : InstAlias<"str $Rt, [$Rn, $offset]", - (STURXi GPR64:$Rt, GPR64sp:$Rn, simm9_offset_fb64:$offset), 0>; -def : InstAlias<"str $Rt, [$Rn, $offset]", - (STURWi GPR32:$Rt, GPR64sp:$Rn, simm9_offset_fb32:$offset), 0>; -def : InstAlias<"str $Rt, [$Rn, $offset]", - (STURBi FPR8:$Rt, GPR64sp:$Rn, simm9_offset_fb8:$offset), 0>; -def : InstAlias<"str $Rt, [$Rn, $offset]", - (STURHi FPR16:$Rt, GPR64sp:$Rn, simm9_offset_fb16:$offset), 0>; -def : InstAlias<"str $Rt, [$Rn, $offset]", - (STURSi FPR32:$Rt, GPR64sp:$Rn, simm9_offset_fb32:$offset), 0>; -def : InstAlias<"str $Rt, [$Rn, $offset]", - (STURDi FPR64:$Rt, GPR64sp:$Rn, simm9_offset_fb64:$offset), 0>; -def : InstAlias<"str $Rt, [$Rn, $offset]", - (STURQi FPR128:$Rt, GPR64sp:$Rn, simm9_offset_fb128:$offset), 0>; - -def : InstAlias<"strb $Rt, [$Rn, $offset]", - (STURBBi GPR32:$Rt, GPR64sp:$Rn, simm9_offset_fb8:$offset), 0>; -def : InstAlias<"strh $Rt, [$Rn, $offset]", - (STURHHi GPR32:$Rt, GPR64sp:$Rn, simm9_offset_fb16:$offset), 0>; - -//--- -// (unscaled immediate, unprivileged) -defm STTRW : StoreUnprivileged<0b10, 0, 0b00, GPR32, "sttr">; -defm STTRX : StoreUnprivileged<0b11, 0, 0b00, GPR64, "sttr">; - -defm STTRH : StoreUnprivileged<0b01, 0, 0b00, GPR32, "sttrh">; -defm STTRB : StoreUnprivileged<0b00, 0, 0b00, GPR32, "sttrb">; - -//--- -// (immediate pre-indexed) -def STRWpre : StorePreIdx<0b10, 0, 0b00, GPR32, "str", pre_store, i32>; -def STRXpre : StorePreIdx<0b11, 0, 0b00, GPR64, "str", pre_store, i64>; -def STRBpre : StorePreIdx<0b00, 1, 0b00, FPR8, "str", pre_store, untyped>; -def STRHpre : StorePreIdx<0b01, 1, 0b00, FPR16, "str", pre_store, f16>; -def STRSpre : StorePreIdx<0b10, 1, 0b00, FPR32, "str", pre_store, f32>; -def STRDpre : StorePreIdx<0b11, 1, 0b00, FPR64, "str", pre_store, f64>; -def STRQpre : StorePreIdx<0b00, 1, 0b10, FPR128, "str", pre_store, f128>; - -def STRBBpre : StorePreIdx<0b00, 0, 0b00, GPR32, "strb", pre_truncsti8, i32>; -def STRHHpre : StorePreIdx<0b01, 0, 0b00, GPR32, "strh", pre_truncsti16, i32>; - -// truncstore i64 -def : Pat<(pre_truncsti32 GPR64:$Rt, GPR64sp:$addr, simm9:$off), - (STRWpre (EXTRACT_SUBREG GPR64:$Rt, sub_32), GPR64sp:$addr, - simm9:$off)>; -def : Pat<(pre_truncsti16 GPR64:$Rt, GPR64sp:$addr, simm9:$off), - (STRHHpre (EXTRACT_SUBREG GPR64:$Rt, sub_32), GPR64sp:$addr, - simm9:$off)>; -def : Pat<(pre_truncsti8 GPR64:$Rt, GPR64sp:$addr, simm9:$off), - (STRBBpre (EXTRACT_SUBREG GPR64:$Rt, sub_32), GPR64sp:$addr, - simm9:$off)>; - -def : Pat<(pre_store (v8i8 FPR64:$Rt), GPR64sp:$addr, simm9:$off), - (STRDpre FPR64:$Rt, GPR64sp:$addr, simm9:$off)>; -def : Pat<(pre_store (v4i16 FPR64:$Rt), GPR64sp:$addr, simm9:$off), - (STRDpre FPR64:$Rt, GPR64sp:$addr, simm9:$off)>; -def : Pat<(pre_store (v2i32 FPR64:$Rt), GPR64sp:$addr, simm9:$off), - (STRDpre FPR64:$Rt, GPR64sp:$addr, simm9:$off)>; -def : Pat<(pre_store (v2f32 FPR64:$Rt), GPR64sp:$addr, simm9:$off), - (STRDpre FPR64:$Rt, GPR64sp:$addr, simm9:$off)>; -def : Pat<(pre_store (v1i64 FPR64:$Rt), GPR64sp:$addr, simm9:$off), - (STRDpre FPR64:$Rt, GPR64sp:$addr, simm9:$off)>; -def : Pat<(pre_store (v1f64 FPR64:$Rt), GPR64sp:$addr, simm9:$off), - (STRDpre FPR64:$Rt, GPR64sp:$addr, simm9:$off)>; - -def : Pat<(pre_store (v16i8 FPR128:$Rt), GPR64sp:$addr, simm9:$off), - (STRQpre FPR128:$Rt, GPR64sp:$addr, simm9:$off)>; -def : Pat<(pre_store (v8i16 FPR128:$Rt), GPR64sp:$addr, simm9:$off), - (STRQpre FPR128:$Rt, GPR64sp:$addr, simm9:$off)>; -def : Pat<(pre_store (v4i32 FPR128:$Rt), GPR64sp:$addr, simm9:$off), - (STRQpre FPR128:$Rt, GPR64sp:$addr, 
simm9:$off)>; -def : Pat<(pre_store (v4f32 FPR128:$Rt), GPR64sp:$addr, simm9:$off), - (STRQpre FPR128:$Rt, GPR64sp:$addr, simm9:$off)>; -def : Pat<(pre_store (v2i64 FPR128:$Rt), GPR64sp:$addr, simm9:$off), - (STRQpre FPR128:$Rt, GPR64sp:$addr, simm9:$off)>; -def : Pat<(pre_store (v2f64 FPR128:$Rt), GPR64sp:$addr, simm9:$off), - (STRQpre FPR128:$Rt, GPR64sp:$addr, simm9:$off)>; - -//--- -// (immediate post-indexed) -def STRWpost : StorePostIdx<0b10, 0, 0b00, GPR32, "str", post_store, i32>; -def STRXpost : StorePostIdx<0b11, 0, 0b00, GPR64, "str", post_store, i64>; -def STRBpost : StorePostIdx<0b00, 1, 0b00, FPR8, "str", post_store, untyped>; -def STRHpost : StorePostIdx<0b01, 1, 0b00, FPR16, "str", post_store, f16>; -def STRSpost : StorePostIdx<0b10, 1, 0b00, FPR32, "str", post_store, f32>; -def STRDpost : StorePostIdx<0b11, 1, 0b00, FPR64, "str", post_store, f64>; -def STRQpost : StorePostIdx<0b00, 1, 0b10, FPR128, "str", post_store, f128>; - -def STRBBpost : StorePostIdx<0b00, 0, 0b00, GPR32, "strb", post_truncsti8, i32>; -def STRHHpost : StorePostIdx<0b01, 0, 0b00, GPR32, "strh", post_truncsti16, i32>; - -// truncstore i64 -def : Pat<(post_truncsti32 GPR64:$Rt, GPR64sp:$addr, simm9:$off), - (STRWpost (EXTRACT_SUBREG GPR64:$Rt, sub_32), GPR64sp:$addr, - simm9:$off)>; -def : Pat<(post_truncsti16 GPR64:$Rt, GPR64sp:$addr, simm9:$off), - (STRHHpost (EXTRACT_SUBREG GPR64:$Rt, sub_32), GPR64sp:$addr, - simm9:$off)>; -def : Pat<(post_truncsti8 GPR64:$Rt, GPR64sp:$addr, simm9:$off), - (STRBBpost (EXTRACT_SUBREG GPR64:$Rt, sub_32), GPR64sp:$addr, - simm9:$off)>; - -def : Pat<(post_store (v8i8 FPR64:$Rt), GPR64sp:$addr, simm9:$off), - (STRDpost FPR64:$Rt, GPR64sp:$addr, simm9:$off)>; -def : Pat<(post_store (v4i16 FPR64:$Rt), GPR64sp:$addr, simm9:$off), - (STRDpost FPR64:$Rt, GPR64sp:$addr, simm9:$off)>; -def : Pat<(post_store (v2i32 FPR64:$Rt), GPR64sp:$addr, simm9:$off), - (STRDpost FPR64:$Rt, GPR64sp:$addr, simm9:$off)>; -def : Pat<(post_store (v2f32 FPR64:$Rt), GPR64sp:$addr, simm9:$off), - (STRDpost FPR64:$Rt, GPR64sp:$addr, simm9:$off)>; -def : Pat<(post_store (v1i64 FPR64:$Rt), GPR64sp:$addr, simm9:$off), - (STRDpost FPR64:$Rt, GPR64sp:$addr, simm9:$off)>; -def : Pat<(post_store (v1f64 FPR64:$Rt), GPR64sp:$addr, simm9:$off), - (STRDpost FPR64:$Rt, GPR64sp:$addr, simm9:$off)>; - -def : Pat<(post_store (v16i8 FPR128:$Rt), GPR64sp:$addr, simm9:$off), - (STRQpost FPR128:$Rt, GPR64sp:$addr, simm9:$off)>; -def : Pat<(post_store (v8i16 FPR128:$Rt), GPR64sp:$addr, simm9:$off), - (STRQpost FPR128:$Rt, GPR64sp:$addr, simm9:$off)>; -def : Pat<(post_store (v4i32 FPR128:$Rt), GPR64sp:$addr, simm9:$off), - (STRQpost FPR128:$Rt, GPR64sp:$addr, simm9:$off)>; -def : Pat<(post_store (v4f32 FPR128:$Rt), GPR64sp:$addr, simm9:$off), - (STRQpost FPR128:$Rt, GPR64sp:$addr, simm9:$off)>; -def : Pat<(post_store (v2i64 FPR128:$Rt), GPR64sp:$addr, simm9:$off), - (STRQpost FPR128:$Rt, GPR64sp:$addr, simm9:$off)>; -def : Pat<(post_store (v2f64 FPR128:$Rt), GPR64sp:$addr, simm9:$off), - (STRQpost FPR128:$Rt, GPR64sp:$addr, simm9:$off)>; - -//===----------------------------------------------------------------------===// -// Load/store exclusive instructions. 
-//===----------------------------------------------------------------------===// - -def LDARW : LoadAcquire <0b10, 1, 1, 0, 1, GPR32, "ldar">; -def LDARX : LoadAcquire <0b11, 1, 1, 0, 1, GPR64, "ldar">; -def LDARB : LoadAcquire <0b00, 1, 1, 0, 1, GPR32, "ldarb">; -def LDARH : LoadAcquire <0b01, 1, 1, 0, 1, GPR32, "ldarh">; - -def LDAXRW : LoadExclusive <0b10, 0, 1, 0, 1, GPR32, "ldaxr">; -def LDAXRX : LoadExclusive <0b11, 0, 1, 0, 1, GPR64, "ldaxr">; -def LDAXRB : LoadExclusive <0b00, 0, 1, 0, 1, GPR32, "ldaxrb">; -def LDAXRH : LoadExclusive <0b01, 0, 1, 0, 1, GPR32, "ldaxrh">; - -def LDXRW : LoadExclusive <0b10, 0, 1, 0, 0, GPR32, "ldxr">; -def LDXRX : LoadExclusive <0b11, 0, 1, 0, 0, GPR64, "ldxr">; -def LDXRB : LoadExclusive <0b00, 0, 1, 0, 0, GPR32, "ldxrb">; -def LDXRH : LoadExclusive <0b01, 0, 1, 0, 0, GPR32, "ldxrh">; - -def STLRW : StoreRelease <0b10, 1, 0, 0, 1, GPR32, "stlr">; -def STLRX : StoreRelease <0b11, 1, 0, 0, 1, GPR64, "stlr">; -def STLRB : StoreRelease <0b00, 1, 0, 0, 1, GPR32, "stlrb">; -def STLRH : StoreRelease <0b01, 1, 0, 0, 1, GPR32, "stlrh">; - -def STLXRW : StoreExclusive<0b10, 0, 0, 0, 1, GPR32, "stlxr">; -def STLXRX : StoreExclusive<0b11, 0, 0, 0, 1, GPR64, "stlxr">; -def STLXRB : StoreExclusive<0b00, 0, 0, 0, 1, GPR32, "stlxrb">; -def STLXRH : StoreExclusive<0b01, 0, 0, 0, 1, GPR32, "stlxrh">; - -def STXRW : StoreExclusive<0b10, 0, 0, 0, 0, GPR32, "stxr">; -def STXRX : StoreExclusive<0b11, 0, 0, 0, 0, GPR64, "stxr">; -def STXRB : StoreExclusive<0b00, 0, 0, 0, 0, GPR32, "stxrb">; -def STXRH : StoreExclusive<0b01, 0, 0, 0, 0, GPR32, "stxrh">; - -def LDAXPW : LoadExclusivePair<0b10, 0, 1, 1, 1, GPR32, "ldaxp">; -def LDAXPX : LoadExclusivePair<0b11, 0, 1, 1, 1, GPR64, "ldaxp">; - -def LDXPW : LoadExclusivePair<0b10, 0, 1, 1, 0, GPR32, "ldxp">; -def LDXPX : LoadExclusivePair<0b11, 0, 1, 1, 0, GPR64, "ldxp">; - -def STLXPW : StoreExclusivePair<0b10, 0, 0, 1, 1, GPR32, "stlxp">; -def STLXPX : StoreExclusivePair<0b11, 0, 0, 1, 1, GPR64, "stlxp">; - -def STXPW : StoreExclusivePair<0b10, 0, 0, 1, 0, GPR32, "stxp">; -def STXPX : StoreExclusivePair<0b11, 0, 0, 1, 0, GPR64, "stxp">; - -//===----------------------------------------------------------------------===// -// Scaled floating point to integer conversion instructions. 
-//===----------------------------------------------------------------------===// - -defm FCVTAS : FPToIntegerUnscaled<0b00, 0b100, "fcvtas", int_arm64_neon_fcvtas>; -defm FCVTAU : FPToIntegerUnscaled<0b00, 0b101, "fcvtau", int_arm64_neon_fcvtau>; -defm FCVTMS : FPToIntegerUnscaled<0b10, 0b000, "fcvtms", int_arm64_neon_fcvtms>; -defm FCVTMU : FPToIntegerUnscaled<0b10, 0b001, "fcvtmu", int_arm64_neon_fcvtmu>; -defm FCVTNS : FPToIntegerUnscaled<0b00, 0b000, "fcvtns", int_arm64_neon_fcvtns>; -defm FCVTNU : FPToIntegerUnscaled<0b00, 0b001, "fcvtnu", int_arm64_neon_fcvtnu>; -defm FCVTPS : FPToIntegerUnscaled<0b01, 0b000, "fcvtps", int_arm64_neon_fcvtps>; -defm FCVTPU : FPToIntegerUnscaled<0b01, 0b001, "fcvtpu", int_arm64_neon_fcvtpu>; -defm FCVTZS : FPToIntegerUnscaled<0b11, 0b000, "fcvtzs", fp_to_sint>; -defm FCVTZU : FPToIntegerUnscaled<0b11, 0b001, "fcvtzu", fp_to_uint>; -defm FCVTZS : FPToIntegerScaled<0b11, 0b000, "fcvtzs", fp_to_sint>; -defm FCVTZU : FPToIntegerScaled<0b11, 0b001, "fcvtzu", fp_to_uint>; -let isCodeGenOnly = 1 in { -defm FCVTZS_Int : FPToIntegerUnscaled<0b11, 0b000, "fcvtzs", int_arm64_neon_fcvtzs>; -defm FCVTZU_Int : FPToIntegerUnscaled<0b11, 0b001, "fcvtzu", int_arm64_neon_fcvtzu>; -defm FCVTZS_Int : FPToIntegerScaled<0b11, 0b000, "fcvtzs", int_arm64_neon_fcvtzs>; -defm FCVTZU_Int : FPToIntegerScaled<0b11, 0b001, "fcvtzu", int_arm64_neon_fcvtzu>; -} - -//===----------------------------------------------------------------------===// -// Scaled integer to floating point conversion instructions. -//===----------------------------------------------------------------------===// - -defm SCVTF : IntegerToFP<0, "scvtf", sint_to_fp>; -defm UCVTF : IntegerToFP<1, "ucvtf", uint_to_fp>; - -//===----------------------------------------------------------------------===// -// Unscaled integer to floating point conversion instruction. -//===----------------------------------------------------------------------===// - -defm FMOV : UnscaledConversion<"fmov">; - -def : Pat<(f32 (fpimm0)), (FMOVWSr WZR)>, Requires<[NoZCZ]>; -def : Pat<(f64 (fpimm0)), (FMOVXDr XZR)>, Requires<[NoZCZ]>; - -//===----------------------------------------------------------------------===// -// Floating point conversion instruction. -//===----------------------------------------------------------------------===// - -defm FCVT : FPConversion<"fcvt">; - -def : Pat<(f32_to_f16 FPR32:$Rn), - (i32 (COPY_TO_REGCLASS - (f32 (SUBREG_TO_REG (i32 0), (FCVTHSr FPR32:$Rn), hsub)), - GPR32))>; - -def FCVTSHpseudo : Pseudo<(outs FPR32:$Rd), (ins FPR32:$Rn), - [(set (f32 FPR32:$Rd), (f16_to_f32 i32:$Rn))]>; - -//===----------------------------------------------------------------------===// -// Floating point single operand instructions. -//===----------------------------------------------------------------------===// - -defm FABS : SingleOperandFPData<0b0001, "fabs", fabs>; -defm FMOV : SingleOperandFPData<0b0000, "fmov">; -defm FNEG : SingleOperandFPData<0b0010, "fneg", fneg>; -defm FRINTA : SingleOperandFPData<0b1100, "frinta", frnd>; -defm FRINTI : SingleOperandFPData<0b1111, "frinti", fnearbyint>; -defm FRINTM : SingleOperandFPData<0b1010, "frintm", ffloor>; -defm FRINTN : SingleOperandFPData<0b1000, "frintn", int_arm64_neon_frintn>; -defm FRINTP : SingleOperandFPData<0b1001, "frintp", fceil>; - -def : Pat<(v1f64 (int_arm64_neon_frintn (v1f64 FPR64:$Rn))), - (FRINTNDr FPR64:$Rn)>; - -// FRINTX is inserted to set the flags as required by FENV_ACCESS ON behavior -// in the C spec. 
Setting hasSideEffects ensures it is not DCE'd. -// -// TODO: We should really model the FPSR flags correctly. This is really ugly. -let hasSideEffects = 1 in { -defm FRINTX : SingleOperandFPData<0b1110, "frintx", frint>; -} - -defm FRINTZ : SingleOperandFPData<0b1011, "frintz", ftrunc>; - -let SchedRW = [WriteFDiv] in { -defm FSQRT : SingleOperandFPData<0b0011, "fsqrt", fsqrt>; -} - -//===----------------------------------------------------------------------===// -// Floating point two operand instructions. -//===----------------------------------------------------------------------===// - -defm FADD : TwoOperandFPData<0b0010, "fadd", fadd>; -let SchedRW = [WriteFDiv] in { -defm FDIV : TwoOperandFPData<0b0001, "fdiv", fdiv>; -} -defm FMAXNM : TwoOperandFPData<0b0110, "fmaxnm", int_arm64_neon_fmaxnm>; -defm FMAX : TwoOperandFPData<0b0100, "fmax", ARM64fmax>; -defm FMINNM : TwoOperandFPData<0b0111, "fminnm", int_arm64_neon_fminnm>; -defm FMIN : TwoOperandFPData<0b0101, "fmin", ARM64fmin>; -let SchedRW = [WriteFMul] in { -defm FMUL : TwoOperandFPData<0b0000, "fmul", fmul>; -defm FNMUL : TwoOperandFPDataNeg<0b1000, "fnmul", fmul>; -} -defm FSUB : TwoOperandFPData<0b0011, "fsub", fsub>; - -def : Pat<(v1f64 (ARM64fmax (v1f64 FPR64:$Rn), (v1f64 FPR64:$Rm))), - (FMAXDrr FPR64:$Rn, FPR64:$Rm)>; -def : Pat<(v1f64 (ARM64fmin (v1f64 FPR64:$Rn), (v1f64 FPR64:$Rm))), - (FMINDrr FPR64:$Rn, FPR64:$Rm)>; -def : Pat<(v1f64 (int_arm64_neon_fmaxnm (v1f64 FPR64:$Rn), (v1f64 FPR64:$Rm))), - (FMAXNMDrr FPR64:$Rn, FPR64:$Rm)>; -def : Pat<(v1f64 (int_arm64_neon_fminnm (v1f64 FPR64:$Rn), (v1f64 FPR64:$Rm))), - (FMINNMDrr FPR64:$Rn, FPR64:$Rm)>; - -//===----------------------------------------------------------------------===// -// Floating point three operand instructions. -//===----------------------------------------------------------------------===// - -defm FMADD : ThreeOperandFPData<0, 0, "fmadd", fma>; -defm FMSUB : ThreeOperandFPData<0, 1, "fmsub", - TriOpFrag<(fma node:$LHS, (fneg node:$MHS), node:$RHS)> >; -defm FNMADD : ThreeOperandFPData<1, 0, "fnmadd", - TriOpFrag<(fneg (fma node:$LHS, node:$MHS, node:$RHS))> >; -defm FNMSUB : ThreeOperandFPData<1, 1, "fnmsub", - TriOpFrag<(fma node:$LHS, node:$MHS, (fneg node:$RHS))> >; - -// The following def pats catch the case where the LHS of an FMA is negated. -// The TriOpFrag above catches the case where the middle operand is negated. - -// N.b. FMSUB etc have the accumulator at the *end* of (outs), unlike -// the NEON variant. -def : Pat<(f32 (fma (fneg FPR32:$Rn), FPR32:$Rm, FPR32:$Ra)), - (FMSUBSrrr FPR32:$Rn, FPR32:$Rm, FPR32:$Ra)>; - -def : Pat<(f64 (fma (fneg FPR64:$Rn), FPR64:$Rm, FPR64:$Ra)), - (FMSUBDrrr FPR64:$Rn, FPR64:$Rm, FPR64:$Ra)>; - -// We handled -(a + b*c) for FNMADD above, now it's time for "(-a) + (-b)*c" and -// "(-a) + b*(-c)". -def : Pat<(f32 (fma (fneg FPR32:$Rn), FPR32:$Rm, (fneg FPR32:$Ra))), - (FNMADDSrrr FPR32:$Rn, FPR32:$Rm, FPR32:$Ra)>; - -def : Pat<(f64 (fma (fneg FPR64:$Rn), FPR64:$Rm, (fneg FPR64:$Ra))), - (FNMADDDrrr FPR64:$Rn, FPR64:$Rm, FPR64:$Ra)>; - -def : Pat<(f32 (fma FPR32:$Rn, (fneg FPR32:$Rm), (fneg FPR32:$Ra))), - (FNMADDSrrr FPR32:$Rn, FPR32:$Rm, FPR32:$Ra)>; - -def : Pat<(f64 (fma FPR64:$Rn, (fneg FPR64:$Rm), (fneg FPR64:$Ra))), - (FNMADDDrrr FPR64:$Rn, FPR64:$Rm, FPR64:$Ra)>; - -//===----------------------------------------------------------------------===// -// Floating point comparison instructions. 
-//===----------------------------------------------------------------------===// - -defm FCMPE : FPComparison<1, "fcmpe">; -defm FCMP : FPComparison<0, "fcmp", ARM64fcmp>; - -//===----------------------------------------------------------------------===// -// Floating point conditional comparison instructions. -//===----------------------------------------------------------------------===// - -defm FCCMPE : FPCondComparison<1, "fccmpe">; -defm FCCMP : FPCondComparison<0, "fccmp">; - -//===----------------------------------------------------------------------===// -// Floating point conditional select instruction. -//===----------------------------------------------------------------------===// - -defm FCSEL : FPCondSelect<"fcsel">; - -// CSEL instructions providing f128 types need to be handled by a -// pseudo-instruction since the eventual code will need to introduce basic -// blocks and control flow. -def F128CSEL : Pseudo<(outs FPR128:$Rd), - (ins FPR128:$Rn, FPR128:$Rm, ccode:$cond), - [(set (f128 FPR128:$Rd), - (ARM64csel FPR128:$Rn, FPR128:$Rm, - (i32 imm:$cond), NZCV))]> { - let Uses = [NZCV]; - let usesCustomInserter = 1; -} - - -//===----------------------------------------------------------------------===// -// Floating point immediate move. -//===----------------------------------------------------------------------===// - -let isReMaterializable = 1 in { -defm FMOV : FPMoveImmediate<"fmov">; -} - -//===----------------------------------------------------------------------===// -// Advanced SIMD two vector instructions. -//===----------------------------------------------------------------------===// - -defm ABS : SIMDTwoVectorBHSD<0, 0b01011, "abs", int_arm64_neon_abs>; -defm CLS : SIMDTwoVectorBHS<0, 0b00100, "cls", int_arm64_neon_cls>; -defm CLZ : SIMDTwoVectorBHS<1, 0b00100, "clz", ctlz>; -defm CMEQ : SIMDCmpTwoVector<0, 0b01001, "cmeq", ARM64cmeqz>; -defm CMGE : SIMDCmpTwoVector<1, 0b01000, "cmge", ARM64cmgez>; -defm CMGT : SIMDCmpTwoVector<0, 0b01000, "cmgt", ARM64cmgtz>; -defm CMLE : SIMDCmpTwoVector<1, 0b01001, "cmle", ARM64cmlez>; -defm CMLT : SIMDCmpTwoVector<0, 0b01010, "cmlt", ARM64cmltz>; -defm CNT : SIMDTwoVectorB<0, 0b00, 0b00101, "cnt", ctpop>; -defm FABS : SIMDTwoVectorFP<0, 1, 0b01111, "fabs", fabs>; - -defm FCMEQ : SIMDFPCmpTwoVector<0, 1, 0b01101, "fcmeq", ARM64fcmeqz>; -defm FCMGE : SIMDFPCmpTwoVector<1, 1, 0b01100, "fcmge", ARM64fcmgez>; -defm FCMGT : SIMDFPCmpTwoVector<0, 1, 0b01100, "fcmgt", ARM64fcmgtz>; -defm FCMLE : SIMDFPCmpTwoVector<1, 1, 0b01101, "fcmle", ARM64fcmlez>; -defm FCMLT : SIMDFPCmpTwoVector<0, 1, 0b01110, "fcmlt", ARM64fcmltz>; -defm FCVTAS : SIMDTwoVectorFPToInt<0,0,0b11100, "fcvtas",int_arm64_neon_fcvtas>; -defm FCVTAU : SIMDTwoVectorFPToInt<1,0,0b11100, "fcvtau",int_arm64_neon_fcvtau>; -defm FCVTL : SIMDFPWidenTwoVector<0, 0, 0b10111, "fcvtl">; -def : Pat<(v4f32 (int_arm64_neon_vcvthf2fp (v4i16 V64:$Rn))), - (FCVTLv4i16 V64:$Rn)>; -def : Pat<(v4f32 (int_arm64_neon_vcvthf2fp (extract_subvector (v8i16 V128:$Rn), - (i64 4)))), - (FCVTLv8i16 V128:$Rn)>; -def : Pat<(v2f64 (fextend (v2f32 V64:$Rn))), (FCVTLv2i32 V64:$Rn)>; -def : Pat<(v2f64 (fextend (v2f32 (extract_subvector (v4f32 V128:$Rn), - (i64 2))))), - (FCVTLv4i32 V128:$Rn)>; - -defm FCVTMS : SIMDTwoVectorFPToInt<0,0,0b11011, "fcvtms",int_arm64_neon_fcvtms>; -defm FCVTMU : SIMDTwoVectorFPToInt<1,0,0b11011, "fcvtmu",int_arm64_neon_fcvtmu>; -defm FCVTNS : SIMDTwoVectorFPToInt<0,0,0b11010, "fcvtns",int_arm64_neon_fcvtns>; -defm FCVTNU : SIMDTwoVectorFPToInt<1,0,0b11010, 
"fcvtnu",int_arm64_neon_fcvtnu>; -defm FCVTN : SIMDFPNarrowTwoVector<0, 0, 0b10110, "fcvtn">; -def : Pat<(v4i16 (int_arm64_neon_vcvtfp2hf (v4f32 V128:$Rn))), - (FCVTNv4i16 V128:$Rn)>; -def : Pat<(concat_vectors V64:$Rd, - (v4i16 (int_arm64_neon_vcvtfp2hf (v4f32 V128:$Rn)))), - (FCVTNv8i16 (INSERT_SUBREG (IMPLICIT_DEF), V64:$Rd, dsub), V128:$Rn)>; -def : Pat<(v2f32 (fround (v2f64 V128:$Rn))), (FCVTNv2i32 V128:$Rn)>; -def : Pat<(concat_vectors V64:$Rd, (v2f32 (fround (v2f64 V128:$Rn)))), - (FCVTNv4i32 (INSERT_SUBREG (IMPLICIT_DEF), V64:$Rd, dsub), V128:$Rn)>; -defm FCVTPS : SIMDTwoVectorFPToInt<0,1,0b11010, "fcvtps",int_arm64_neon_fcvtps>; -defm FCVTPU : SIMDTwoVectorFPToInt<1,1,0b11010, "fcvtpu",int_arm64_neon_fcvtpu>; -defm FCVTXN : SIMDFPInexactCvtTwoVector<1, 0, 0b10110, "fcvtxn", - int_arm64_neon_fcvtxn>; -defm FCVTZS : SIMDTwoVectorFPToInt<0, 1, 0b11011, "fcvtzs", fp_to_sint>; -defm FCVTZU : SIMDTwoVectorFPToInt<1, 1, 0b11011, "fcvtzu", fp_to_uint>; -let isCodeGenOnly = 1 in { -defm FCVTZS_Int : SIMDTwoVectorFPToInt<0, 1, 0b11011, "fcvtzs", - int_arm64_neon_fcvtzs>; -defm FCVTZU_Int : SIMDTwoVectorFPToInt<1, 1, 0b11011, "fcvtzu", - int_arm64_neon_fcvtzu>; -} -defm FNEG : SIMDTwoVectorFP<1, 1, 0b01111, "fneg", fneg>; -defm FRECPE : SIMDTwoVectorFP<0, 1, 0b11101, "frecpe", int_arm64_neon_frecpe>; -defm FRINTA : SIMDTwoVectorFP<1, 0, 0b11000, "frinta", frnd>; -defm FRINTI : SIMDTwoVectorFP<1, 1, 0b11001, "frinti", fnearbyint>; -defm FRINTM : SIMDTwoVectorFP<0, 0, 0b11001, "frintm", ffloor>; -defm FRINTN : SIMDTwoVectorFP<0, 0, 0b11000, "frintn", int_arm64_neon_frintn>; -defm FRINTP : SIMDTwoVectorFP<0, 1, 0b11000, "frintp", fceil>; -defm FRINTX : SIMDTwoVectorFP<1, 0, 0b11001, "frintx", frint>; -defm FRINTZ : SIMDTwoVectorFP<0, 1, 0b11001, "frintz", ftrunc>; -defm FRSQRTE: SIMDTwoVectorFP<1, 1, 0b11101, "frsqrte", int_arm64_neon_frsqrte>; -defm FSQRT : SIMDTwoVectorFP<1, 1, 0b11111, "fsqrt", fsqrt>; -defm NEG : SIMDTwoVectorBHSD<1, 0b01011, "neg", - UnOpFrag<(sub immAllZerosV, node:$LHS)> >; -defm NOT : SIMDTwoVectorB<1, 0b00, 0b00101, "not", vnot>; -// Aliases for MVN -> NOT. 
-def : InstAlias<"mvn{ $Vd.8b, $Vn.8b|.8b $Vd, $Vn}", - (NOTv8i8 V64:$Vd, V64:$Vn)>; -def : InstAlias<"mvn{ $Vd.16b, $Vn.16b|.16b $Vd, $Vn}", - (NOTv16i8 V128:$Vd, V128:$Vn)>; - -def : Pat<(ARM64neg (v8i8 V64:$Rn)), (NEGv8i8 V64:$Rn)>; -def : Pat<(ARM64neg (v16i8 V128:$Rn)), (NEGv16i8 V128:$Rn)>; -def : Pat<(ARM64neg (v4i16 V64:$Rn)), (NEGv4i16 V64:$Rn)>; -def : Pat<(ARM64neg (v8i16 V128:$Rn)), (NEGv8i16 V128:$Rn)>; -def : Pat<(ARM64neg (v2i32 V64:$Rn)), (NEGv2i32 V64:$Rn)>; -def : Pat<(ARM64neg (v4i32 V128:$Rn)), (NEGv4i32 V128:$Rn)>; -def : Pat<(ARM64neg (v2i64 V128:$Rn)), (NEGv2i64 V128:$Rn)>; - -def : Pat<(ARM64not (v8i8 V64:$Rn)), (NOTv8i8 V64:$Rn)>; -def : Pat<(ARM64not (v16i8 V128:$Rn)), (NOTv16i8 V128:$Rn)>; -def : Pat<(ARM64not (v4i16 V64:$Rn)), (NOTv8i8 V64:$Rn)>; -def : Pat<(ARM64not (v8i16 V128:$Rn)), (NOTv16i8 V128:$Rn)>; -def : Pat<(ARM64not (v2i32 V64:$Rn)), (NOTv8i8 V64:$Rn)>; -def : Pat<(ARM64not (v1i64 V64:$Rn)), (NOTv8i8 V64:$Rn)>; -def : Pat<(ARM64not (v4i32 V128:$Rn)), (NOTv16i8 V128:$Rn)>; -def : Pat<(ARM64not (v2i64 V128:$Rn)), (NOTv16i8 V128:$Rn)>; - -def : Pat<(vnot (v4i16 V64:$Rn)), (NOTv8i8 V64:$Rn)>; -def : Pat<(vnot (v8i16 V128:$Rn)), (NOTv16i8 V128:$Rn)>; -def : Pat<(vnot (v2i32 V64:$Rn)), (NOTv8i8 V64:$Rn)>; -def : Pat<(vnot (v4i32 V128:$Rn)), (NOTv16i8 V128:$Rn)>; -def : Pat<(vnot (v2i64 V128:$Rn)), (NOTv16i8 V128:$Rn)>; - -defm RBIT : SIMDTwoVectorB<1, 0b01, 0b00101, "rbit", int_arm64_neon_rbit>; -defm REV16 : SIMDTwoVectorB<0, 0b00, 0b00001, "rev16", ARM64rev16>; -defm REV32 : SIMDTwoVectorBH<1, 0b00000, "rev32", ARM64rev32>; -defm REV64 : SIMDTwoVectorBHS<0, 0b00000, "rev64", ARM64rev64>; -defm SADALP : SIMDLongTwoVectorTied<0, 0b00110, "sadalp", - BinOpFrag<(add node:$LHS, (int_arm64_neon_saddlp node:$RHS))> >; -defm SADDLP : SIMDLongTwoVector<0, 0b00010, "saddlp", int_arm64_neon_saddlp>; -defm SCVTF : SIMDTwoVectorIntToFP<0, 0, 0b11101, "scvtf", sint_to_fp>; -defm SHLL : SIMDVectorLShiftLongBySizeBHS; -defm SQABS : SIMDTwoVectorBHSD<0, 0b00111, "sqabs", int_arm64_neon_sqabs>; -defm SQNEG : SIMDTwoVectorBHSD<1, 0b00111, "sqneg", int_arm64_neon_sqneg>; -defm SQXTN : SIMDMixedTwoVector<0, 0b10100, "sqxtn", int_arm64_neon_sqxtn>; -defm SQXTUN : SIMDMixedTwoVector<1, 0b10010, "sqxtun", int_arm64_neon_sqxtun>; -defm SUQADD : SIMDTwoVectorBHSDTied<0, 0b00011, "suqadd",int_arm64_neon_suqadd>; -defm UADALP : SIMDLongTwoVectorTied<1, 0b00110, "uadalp", - BinOpFrag<(add node:$LHS, (int_arm64_neon_uaddlp node:$RHS))> >; -defm UADDLP : SIMDLongTwoVector<1, 0b00010, "uaddlp", - int_arm64_neon_uaddlp>; -defm UCVTF : SIMDTwoVectorIntToFP<1, 0, 0b11101, "ucvtf", uint_to_fp>; -defm UQXTN : SIMDMixedTwoVector<1, 0b10100, "uqxtn", int_arm64_neon_uqxtn>; -defm URECPE : SIMDTwoVectorS<0, 1, 0b11100, "urecpe", int_arm64_neon_urecpe>; -defm URSQRTE: SIMDTwoVectorS<1, 1, 0b11100, "ursqrte", int_arm64_neon_ursqrte>; -defm USQADD : SIMDTwoVectorBHSDTied<1, 0b00011, "usqadd",int_arm64_neon_usqadd>; -defm XTN : SIMDMixedTwoVector<0, 0b10010, "xtn", trunc>; - -def : Pat<(v2f32 (ARM64rev64 V64:$Rn)), (REV64v2i32 V64:$Rn)>; -def : Pat<(v4f32 (ARM64rev64 V128:$Rn)), (REV64v4i32 V128:$Rn)>; - -// Patterns for vector long shift (by element width). These need to match all -// three of zext, sext and anyext so it's easier to pull the patterns out of the -// definition. 
-multiclass SIMDVectorLShiftLongBySizeBHSPats<SDPatternOperator ext> {
-  def : Pat<(ARM64vshl (v8i16 (ext (v8i8 V64:$Rn))), (i32 8)),
-            (SHLLv8i8 V64:$Rn)>;
-  def : Pat<(ARM64vshl (v8i16 (ext (extract_high_v16i8 V128:$Rn))), (i32 8)),
-            (SHLLv16i8 V128:$Rn)>;
-  def : Pat<(ARM64vshl (v4i32 (ext (v4i16 V64:$Rn))), (i32 16)),
-            (SHLLv4i16 V64:$Rn)>;
-  def : Pat<(ARM64vshl (v4i32 (ext (extract_high_v8i16 V128:$Rn))), (i32 16)),
-            (SHLLv8i16 V128:$Rn)>;
-  def : Pat<(ARM64vshl (v2i64 (ext (v2i32 V64:$Rn))), (i32 32)),
-            (SHLLv2i32 V64:$Rn)>;
-  def : Pat<(ARM64vshl (v2i64 (ext (extract_high_v4i32 V128:$Rn))), (i32 32)),
-            (SHLLv4i32 V128:$Rn)>;
-}
-
-defm : SIMDVectorLShiftLongBySizeBHSPats<anyext>;
-defm : SIMDVectorLShiftLongBySizeBHSPats<zext>;
-defm : SIMDVectorLShiftLongBySizeBHSPats<sext>;
-
-//===----------------------------------------------------------------------===//
-// Advanced SIMD three vector instructions.
-//===----------------------------------------------------------------------===//
-
-defm ADD : SIMDThreeSameVector<0, 0b10000, "add", add>;
-defm ADDP : SIMDThreeSameVector<0, 0b10111, "addp", int_arm64_neon_addp>;
-defm CMEQ : SIMDThreeSameVector<1, 0b10001, "cmeq", ARM64cmeq>;
-defm CMGE : SIMDThreeSameVector<0, 0b00111, "cmge", ARM64cmge>;
-defm CMGT : SIMDThreeSameVector<0, 0b00110, "cmgt", ARM64cmgt>;
-defm CMHI : SIMDThreeSameVector<1, 0b00110, "cmhi", ARM64cmhi>;
-defm CMHS : SIMDThreeSameVector<1, 0b00111, "cmhs", ARM64cmhs>;
-defm CMTST : SIMDThreeSameVector<0, 0b10001, "cmtst", ARM64cmtst>;
-defm FABD : SIMDThreeSameVectorFP<1,1,0b11010,"fabd", int_arm64_neon_fabd>;
-defm FACGE : SIMDThreeSameVectorFPCmp<1,0,0b11101,"facge",int_arm64_neon_facge>;
-defm FACGT : SIMDThreeSameVectorFPCmp<1,1,0b11101,"facgt",int_arm64_neon_facgt>;
-defm FADDP : SIMDThreeSameVectorFP<1,0,0b11010,"faddp",int_arm64_neon_addp>;
-defm FADD : SIMDThreeSameVectorFP<0,0,0b11010,"fadd", fadd>;
-defm FCMEQ : SIMDThreeSameVectorFPCmp<0, 0, 0b11100, "fcmeq", ARM64fcmeq>;
-defm FCMGE : SIMDThreeSameVectorFPCmp<1, 0, 0b11100, "fcmge", ARM64fcmge>;
-defm FCMGT : SIMDThreeSameVectorFPCmp<1, 1, 0b11100, "fcmgt", ARM64fcmgt>;
-defm FDIV : SIMDThreeSameVectorFP<1,0,0b11111,"fdiv", fdiv>;
-defm FMAXNMP : SIMDThreeSameVectorFP<1,0,0b11000,"fmaxnmp", int_arm64_neon_fmaxnmp>;
-defm FMAXNM : SIMDThreeSameVectorFP<0,0,0b11000,"fmaxnm", int_arm64_neon_fmaxnm>;
-defm FMAXP : SIMDThreeSameVectorFP<1,0,0b11110,"fmaxp", int_arm64_neon_fmaxp>;
-defm FMAX : SIMDThreeSameVectorFP<0,0,0b11110,"fmax", ARM64fmax>;
-defm FMINNMP : SIMDThreeSameVectorFP<1,1,0b11000,"fminnmp", int_arm64_neon_fminnmp>;
-defm FMINNM : SIMDThreeSameVectorFP<0,1,0b11000,"fminnm", int_arm64_neon_fminnm>;
-defm FMINP : SIMDThreeSameVectorFP<1,1,0b11110,"fminp", int_arm64_neon_fminp>;
-defm FMIN : SIMDThreeSameVectorFP<0,1,0b11110,"fmin", ARM64fmin>;
-
-// NOTE: The operands of the PatFrag are reordered on FMLA/FMLS because the
-// instruction expects the addend first, while the fma intrinsic puts it last.
-defm FMLA : SIMDThreeSameVectorFPTied<0, 0, 0b11001, "fmla",
-    TriOpFrag<(fma node:$RHS, node:$MHS, node:$LHS)> >;
-defm FMLS : SIMDThreeSameVectorFPTied<0, 1, 0b11001, "fmls",
-    TriOpFrag<(fma node:$MHS, (fneg node:$RHS), node:$LHS)> >;
-
-// The following def pats catch the case where the LHS of an FMA is negated.
-// The TriOpFrag above catches the case where the middle operand is negated.
-def : Pat<(v2f32 (fma (fneg V64:$Rn), V64:$Rm, V64:$Rd)), - (FMLSv2f32 V64:$Rd, V64:$Rn, V64:$Rm)>; - -def : Pat<(v4f32 (fma (fneg V128:$Rn), V128:$Rm, V128:$Rd)), - (FMLSv4f32 V128:$Rd, V128:$Rn, V128:$Rm)>; - -def : Pat<(v2f64 (fma (fneg V128:$Rn), V128:$Rm, V128:$Rd)), - (FMLSv2f64 V128:$Rd, V128:$Rn, V128:$Rm)>; - -defm FMULX : SIMDThreeSameVectorFP<0,0,0b11011,"fmulx", int_arm64_neon_fmulx>; -defm FMUL : SIMDThreeSameVectorFP<1,0,0b11011,"fmul", fmul>; -defm FRECPS : SIMDThreeSameVectorFP<0,0,0b11111,"frecps", int_arm64_neon_frecps>; -defm FRSQRTS : SIMDThreeSameVectorFP<0,1,0b11111,"frsqrts", int_arm64_neon_frsqrts>; -defm FSUB : SIMDThreeSameVectorFP<0,1,0b11010,"fsub", fsub>; -defm MLA : SIMDThreeSameVectorBHSTied<0, 0b10010, "mla", - TriOpFrag<(add node:$LHS, (mul node:$MHS, node:$RHS))> >; -defm MLS : SIMDThreeSameVectorBHSTied<1, 0b10010, "mls", - TriOpFrag<(sub node:$LHS, (mul node:$MHS, node:$RHS))> >; -defm MUL : SIMDThreeSameVectorBHS<0, 0b10011, "mul", mul>; -defm PMUL : SIMDThreeSameVectorB<1, 0b10011, "pmul", int_arm64_neon_pmul>; -defm SABA : SIMDThreeSameVectorBHSTied<0, 0b01111, "saba", - TriOpFrag<(add node:$LHS, (int_arm64_neon_sabd node:$MHS, node:$RHS))> >; -defm SABD : SIMDThreeSameVectorBHS<0,0b01110,"sabd", int_arm64_neon_sabd>; -defm SHADD : SIMDThreeSameVectorBHS<0,0b00000,"shadd", int_arm64_neon_shadd>; -defm SHSUB : SIMDThreeSameVectorBHS<0,0b00100,"shsub", int_arm64_neon_shsub>; -defm SMAXP : SIMDThreeSameVectorBHS<0,0b10100,"smaxp", int_arm64_neon_smaxp>; -defm SMAX : SIMDThreeSameVectorBHS<0,0b01100,"smax", int_arm64_neon_smax>; -defm SMINP : SIMDThreeSameVectorBHS<0,0b10101,"sminp", int_arm64_neon_sminp>; -defm SMIN : SIMDThreeSameVectorBHS<0,0b01101,"smin", int_arm64_neon_smin>; -defm SQADD : SIMDThreeSameVector<0,0b00001,"sqadd", int_arm64_neon_sqadd>; -defm SQDMULH : SIMDThreeSameVectorHS<0,0b10110,"sqdmulh",int_arm64_neon_sqdmulh>; -defm SQRDMULH : SIMDThreeSameVectorHS<1,0b10110,"sqrdmulh",int_arm64_neon_sqrdmulh>; -defm SQRSHL : SIMDThreeSameVector<0,0b01011,"sqrshl", int_arm64_neon_sqrshl>; -defm SQSHL : SIMDThreeSameVector<0,0b01001,"sqshl", int_arm64_neon_sqshl>; -defm SQSUB : SIMDThreeSameVector<0,0b00101,"sqsub", int_arm64_neon_sqsub>; -defm SRHADD : SIMDThreeSameVectorBHS<0,0b00010,"srhadd",int_arm64_neon_srhadd>; -defm SRSHL : SIMDThreeSameVector<0,0b01010,"srshl", int_arm64_neon_srshl>; -defm SSHL : SIMDThreeSameVector<0,0b01000,"sshl", int_arm64_neon_sshl>; -defm SUB : SIMDThreeSameVector<1,0b10000,"sub", sub>; -defm UABA : SIMDThreeSameVectorBHSTied<1, 0b01111, "uaba", - TriOpFrag<(add node:$LHS, (int_arm64_neon_uabd node:$MHS, node:$RHS))> >; -defm UABD : SIMDThreeSameVectorBHS<1,0b01110,"uabd", int_arm64_neon_uabd>; -defm UHADD : SIMDThreeSameVectorBHS<1,0b00000,"uhadd", int_arm64_neon_uhadd>; -defm UHSUB : SIMDThreeSameVectorBHS<1,0b00100,"uhsub", int_arm64_neon_uhsub>; -defm UMAXP : SIMDThreeSameVectorBHS<1,0b10100,"umaxp", int_arm64_neon_umaxp>; -defm UMAX : SIMDThreeSameVectorBHS<1,0b01100,"umax", int_arm64_neon_umax>; -defm UMINP : SIMDThreeSameVectorBHS<1,0b10101,"uminp", int_arm64_neon_uminp>; -defm UMIN : SIMDThreeSameVectorBHS<1,0b01101,"umin", int_arm64_neon_umin>; -defm UQADD : SIMDThreeSameVector<1,0b00001,"uqadd", int_arm64_neon_uqadd>; -defm UQRSHL : SIMDThreeSameVector<1,0b01011,"uqrshl", int_arm64_neon_uqrshl>; -defm UQSHL : SIMDThreeSameVector<1,0b01001,"uqshl", int_arm64_neon_uqshl>; -defm UQSUB : SIMDThreeSameVector<1,0b00101,"uqsub", int_arm64_neon_uqsub>; -defm URHADD : SIMDThreeSameVectorBHS<1,0b00010,"urhadd", 
int_arm64_neon_urhadd>; -defm URSHL : SIMDThreeSameVector<1,0b01010,"urshl", int_arm64_neon_urshl>; -defm USHL : SIMDThreeSameVector<1,0b01000,"ushl", int_arm64_neon_ushl>; - -defm AND : SIMDLogicalThreeVector<0, 0b00, "and", and>; -defm BIC : SIMDLogicalThreeVector<0, 0b01, "bic", - BinOpFrag<(and node:$LHS, (vnot node:$RHS))> >; -defm BIF : SIMDLogicalThreeVector<1, 0b11, "bif">; -defm BIT : SIMDLogicalThreeVectorTied<1, 0b10, "bit", ARM64bit>; -defm BSL : SIMDLogicalThreeVectorTied<1, 0b01, "bsl", - TriOpFrag<(or (and node:$LHS, node:$MHS), (and (vnot node:$LHS), node:$RHS))>>; -defm EOR : SIMDLogicalThreeVector<1, 0b00, "eor", xor>; -defm ORN : SIMDLogicalThreeVector<0, 0b11, "orn", - BinOpFrag<(or node:$LHS, (vnot node:$RHS))> >; -defm ORR : SIMDLogicalThreeVector<0, 0b10, "orr", or>; - -def : Pat<(ARM64bsl (v8i8 V64:$Rd), V64:$Rn, V64:$Rm), - (BSLv8i8 V64:$Rd, V64:$Rn, V64:$Rm)>; -def : Pat<(ARM64bsl (v4i16 V64:$Rd), V64:$Rn, V64:$Rm), - (BSLv8i8 V64:$Rd, V64:$Rn, V64:$Rm)>; -def : Pat<(ARM64bsl (v2i32 V64:$Rd), V64:$Rn, V64:$Rm), - (BSLv8i8 V64:$Rd, V64:$Rn, V64:$Rm)>; -def : Pat<(ARM64bsl (v1i64 V64:$Rd), V64:$Rn, V64:$Rm), - (BSLv8i8 V64:$Rd, V64:$Rn, V64:$Rm)>; - -def : Pat<(ARM64bsl (v16i8 V128:$Rd), V128:$Rn, V128:$Rm), - (BSLv16i8 V128:$Rd, V128:$Rn, V128:$Rm)>; -def : Pat<(ARM64bsl (v8i16 V128:$Rd), V128:$Rn, V128:$Rm), - (BSLv16i8 V128:$Rd, V128:$Rn, V128:$Rm)>; -def : Pat<(ARM64bsl (v4i32 V128:$Rd), V128:$Rn, V128:$Rm), - (BSLv16i8 V128:$Rd, V128:$Rn, V128:$Rm)>; -def : Pat<(ARM64bsl (v2i64 V128:$Rd), V128:$Rn, V128:$Rm), - (BSLv16i8 V128:$Rd, V128:$Rn, V128:$Rm)>; - -def : InstAlias<"mov{\t$dst.16b, $src.16b|.16b\t$dst, $src}", - (ORRv16i8 V128:$dst, V128:$src, V128:$src), 1>; -def : InstAlias<"mov{\t$dst.8h, $src.8h|.8h\t$dst, $src}", - (ORRv16i8 V128:$dst, V128:$src, V128:$src), 0>; -def : InstAlias<"mov{\t$dst.4s, $src.4s|.4s\t$dst, $src}", - (ORRv16i8 V128:$dst, V128:$src, V128:$src), 0>; -def : InstAlias<"mov{\t$dst.2d, $src.2d|.2d\t$dst, $src}", - (ORRv16i8 V128:$dst, V128:$src, V128:$src), 0>; - -def : InstAlias<"mov{\t$dst.8b, $src.8b|.8b\t$dst, $src}", - (ORRv8i8 V64:$dst, V64:$src, V64:$src), 1>; -def : InstAlias<"mov{\t$dst.4h, $src.4h|.4h\t$dst, $src}", - (ORRv8i8 V64:$dst, V64:$src, V64:$src), 0>; -def : InstAlias<"mov{\t$dst.2s, $src.2s|.2s\t$dst, $src}", - (ORRv8i8 V64:$dst, V64:$src, V64:$src), 0>; -def : InstAlias<"mov{\t$dst.1d, $src.1d|.1d\t$dst, $src}", - (ORRv8i8 V64:$dst, V64:$src, V64:$src), 0>; - -def : InstAlias<"{cmls\t$dst.8b, $src1.8b, $src2.8b" # - "|cmls.8b\t$dst, $src1, $src2}", - (CMHSv8i8 V64:$dst, V64:$src2, V64:$src1), 0>; -def : InstAlias<"{cmls\t$dst.16b, $src1.16b, $src2.16b" # - "|cmls.16b\t$dst, $src1, $src2}", - (CMHSv16i8 V128:$dst, V128:$src2, V128:$src1), 0>; -def : InstAlias<"{cmls\t$dst.4h, $src1.4h, $src2.4h" # - "|cmls.4h\t$dst, $src1, $src2}", - (CMHSv4i16 V64:$dst, V64:$src2, V64:$src1), 0>; -def : InstAlias<"{cmls\t$dst.8h, $src1.8h, $src2.8h" # - "|cmls.8h\t$dst, $src1, $src2}", - (CMHSv8i16 V128:$dst, V128:$src2, V128:$src1), 0>; -def : InstAlias<"{cmls\t$dst.2s, $src1.2s, $src2.2s" # - "|cmls.2s\t$dst, $src1, $src2}", - (CMHSv2i32 V64:$dst, V64:$src2, V64:$src1), 0>; -def : InstAlias<"{cmls\t$dst.4s, $src1.4s, $src2.4s" # - "|cmls.4s\t$dst, $src1, $src2}", - (CMHSv4i32 V128:$dst, V128:$src2, V128:$src1), 0>; -def : InstAlias<"{cmls\t$dst.2d, $src1.2d, $src2.2d" # - "|cmls.2d\t$dst, $src1, $src2}", - (CMHSv2i64 V128:$dst, V128:$src2, V128:$src1), 0>; - -def : InstAlias<"{cmlo\t$dst.8b, $src1.8b, $src2.8b" # - 
"|cmlo.8b\t$dst, $src1, $src2}", - (CMHIv8i8 V64:$dst, V64:$src2, V64:$src1), 0>; -def : InstAlias<"{cmlo\t$dst.16b, $src1.16b, $src2.16b" # - "|cmlo.16b\t$dst, $src1, $src2}", - (CMHIv16i8 V128:$dst, V128:$src2, V128:$src1), 0>; -def : InstAlias<"{cmlo\t$dst.4h, $src1.4h, $src2.4h" # - "|cmlo.4h\t$dst, $src1, $src2}", - (CMHIv4i16 V64:$dst, V64:$src2, V64:$src1), 0>; -def : InstAlias<"{cmlo\t$dst.8h, $src1.8h, $src2.8h" # - "|cmlo.8h\t$dst, $src1, $src2}", - (CMHIv8i16 V128:$dst, V128:$src2, V128:$src1), 0>; -def : InstAlias<"{cmlo\t$dst.2s, $src1.2s, $src2.2s" # - "|cmlo.2s\t$dst, $src1, $src2}", - (CMHIv2i32 V64:$dst, V64:$src2, V64:$src1), 0>; -def : InstAlias<"{cmlo\t$dst.4s, $src1.4s, $src2.4s" # - "|cmlo.4s\t$dst, $src1, $src2}", - (CMHIv4i32 V128:$dst, V128:$src2, V128:$src1), 0>; -def : InstAlias<"{cmlo\t$dst.2d, $src1.2d, $src2.2d" # - "|cmlo.2d\t$dst, $src1, $src2}", - (CMHIv2i64 V128:$dst, V128:$src2, V128:$src1), 0>; - -def : InstAlias<"{cmle\t$dst.8b, $src1.8b, $src2.8b" # - "|cmle.8b\t$dst, $src1, $src2}", - (CMGEv8i8 V64:$dst, V64:$src2, V64:$src1), 0>; -def : InstAlias<"{cmle\t$dst.16b, $src1.16b, $src2.16b" # - "|cmle.16b\t$dst, $src1, $src2}", - (CMGEv16i8 V128:$dst, V128:$src2, V128:$src1), 0>; -def : InstAlias<"{cmle\t$dst.4h, $src1.4h, $src2.4h" # - "|cmle.4h\t$dst, $src1, $src2}", - (CMGEv4i16 V64:$dst, V64:$src2, V64:$src1), 0>; -def : InstAlias<"{cmle\t$dst.8h, $src1.8h, $src2.8h" # - "|cmle.8h\t$dst, $src1, $src2}", - (CMGEv8i16 V128:$dst, V128:$src2, V128:$src1), 0>; -def : InstAlias<"{cmle\t$dst.2s, $src1.2s, $src2.2s" # - "|cmle.2s\t$dst, $src1, $src2}", - (CMGEv2i32 V64:$dst, V64:$src2, V64:$src1), 0>; -def : InstAlias<"{cmle\t$dst.4s, $src1.4s, $src2.4s" # - "|cmle.4s\t$dst, $src1, $src2}", - (CMGEv4i32 V128:$dst, V128:$src2, V128:$src1), 0>; -def : InstAlias<"{cmle\t$dst.2d, $src1.2d, $src2.2d" # - "|cmle.2d\t$dst, $src1, $src2}", - (CMGEv2i64 V128:$dst, V128:$src2, V128:$src1), 0>; - -def : InstAlias<"{cmlt\t$dst.8b, $src1.8b, $src2.8b" # - "|cmlt.8b\t$dst, $src1, $src2}", - (CMGTv8i8 V64:$dst, V64:$src2, V64:$src1), 0>; -def : InstAlias<"{cmlt\t$dst.16b, $src1.16b, $src2.16b" # - "|cmlt.16b\t$dst, $src1, $src2}", - (CMGTv16i8 V128:$dst, V128:$src2, V128:$src1), 0>; -def : InstAlias<"{cmlt\t$dst.4h, $src1.4h, $src2.4h" # - "|cmlt.4h\t$dst, $src1, $src2}", - (CMGTv4i16 V64:$dst, V64:$src2, V64:$src1), 0>; -def : InstAlias<"{cmlt\t$dst.8h, $src1.8h, $src2.8h" # - "|cmlt.8h\t$dst, $src1, $src2}", - (CMGTv8i16 V128:$dst, V128:$src2, V128:$src1), 0>; -def : InstAlias<"{cmlt\t$dst.2s, $src1.2s, $src2.2s" # - "|cmlt.2s\t$dst, $src1, $src2}", - (CMGTv2i32 V64:$dst, V64:$src2, V64:$src1), 0>; -def : InstAlias<"{cmlt\t$dst.4s, $src1.4s, $src2.4s" # - "|cmlt.4s\t$dst, $src1, $src2}", - (CMGTv4i32 V128:$dst, V128:$src2, V128:$src1), 0>; -def : InstAlias<"{cmlt\t$dst.2d, $src1.2d, $src2.2d" # - "|cmlt.2d\t$dst, $src1, $src2}", - (CMGTv2i64 V128:$dst, V128:$src2, V128:$src1), 0>; - -def : InstAlias<"{fcmle\t$dst.2s, $src1.2s, $src2.2s" # - "|fcmle.2s\t$dst, $src1, $src2}", - (FCMGEv2f32 V64:$dst, V64:$src2, V64:$src1), 0>; -def : InstAlias<"{fcmle\t$dst.4s, $src1.4s, $src2.4s" # - "|fcmle.4s\t$dst, $src1, $src2}", - (FCMGEv4f32 V128:$dst, V128:$src2, V128:$src1), 0>; -def : InstAlias<"{fcmle\t$dst.2d, $src1.2d, $src2.2d" # - "|fcmle.2d\t$dst, $src1, $src2}", - (FCMGEv2f64 V128:$dst, V128:$src2, V128:$src1), 0>; - -def : InstAlias<"{fcmlt\t$dst.2s, $src1.2s, $src2.2s" # - "|fcmlt.2s\t$dst, $src1, $src2}", - (FCMGTv2f32 V64:$dst, V64:$src2, V64:$src1), 0>; -def : 
InstAlias<"{fcmlt\t$dst.4s, $src1.4s, $src2.4s" # - "|fcmlt.4s\t$dst, $src1, $src2}", - (FCMGTv4f32 V128:$dst, V128:$src2, V128:$src1), 0>; -def : InstAlias<"{fcmlt\t$dst.2d, $src1.2d, $src2.2d" # - "|fcmlt.2d\t$dst, $src1, $src2}", - (FCMGTv2f64 V128:$dst, V128:$src2, V128:$src1), 0>; - -def : InstAlias<"{facle\t$dst.2s, $src1.2s, $src2.2s" # - "|facle.2s\t$dst, $src1, $src2}", - (FACGEv2f32 V64:$dst, V64:$src2, V64:$src1), 0>; -def : InstAlias<"{facle\t$dst.4s, $src1.4s, $src2.4s" # - "|facle.4s\t$dst, $src1, $src2}", - (FACGEv4f32 V128:$dst, V128:$src2, V128:$src1), 0>; -def : InstAlias<"{facle\t$dst.2d, $src1.2d, $src2.2d" # - "|facle.2d\t$dst, $src1, $src2}", - (FACGEv2f64 V128:$dst, V128:$src2, V128:$src1), 0>; - -def : InstAlias<"{faclt\t$dst.2s, $src1.2s, $src2.2s" # - "|faclt.2s\t$dst, $src1, $src2}", - (FACGTv2f32 V64:$dst, V64:$src2, V64:$src1), 0>; -def : InstAlias<"{faclt\t$dst.4s, $src1.4s, $src2.4s" # - "|faclt.4s\t$dst, $src1, $src2}", - (FACGTv4f32 V128:$dst, V128:$src2, V128:$src1), 0>; -def : InstAlias<"{faclt\t$dst.2d, $src1.2d, $src2.2d" # - "|faclt.2d\t$dst, $src1, $src2}", - (FACGTv2f64 V128:$dst, V128:$src2, V128:$src1), 0>; - -//===----------------------------------------------------------------------===// -// Advanced SIMD three scalar instructions. -//===----------------------------------------------------------------------===// - -defm ADD : SIMDThreeScalarD<0, 0b10000, "add", add>; -defm CMEQ : SIMDThreeScalarD<1, 0b10001, "cmeq", ARM64cmeq>; -defm CMGE : SIMDThreeScalarD<0, 0b00111, "cmge", ARM64cmge>; -defm CMGT : SIMDThreeScalarD<0, 0b00110, "cmgt", ARM64cmgt>; -defm CMHI : SIMDThreeScalarD<1, 0b00110, "cmhi", ARM64cmhi>; -defm CMHS : SIMDThreeScalarD<1, 0b00111, "cmhs", ARM64cmhs>; -defm CMTST : SIMDThreeScalarD<0, 0b10001, "cmtst", ARM64cmtst>; -defm FABD : SIMDThreeScalarSD<1, 1, 0b11010, "fabd", int_arm64_sisd_fabd>; -def : Pat<(v1f64 (int_arm64_neon_fabd (v1f64 FPR64:$Rn), (v1f64 FPR64:$Rm))), - (FABD64 FPR64:$Rn, FPR64:$Rm)>; -defm FACGE : SIMDThreeScalarFPCmp<1, 0, 0b11101, "facge", - int_arm64_neon_facge>; -defm FACGT : SIMDThreeScalarFPCmp<1, 1, 0b11101, "facgt", - int_arm64_neon_facgt>; -defm FCMEQ : SIMDThreeScalarFPCmp<0, 0, 0b11100, "fcmeq", ARM64fcmeq>; -defm FCMGE : SIMDThreeScalarFPCmp<1, 0, 0b11100, "fcmge", ARM64fcmge>; -defm FCMGT : SIMDThreeScalarFPCmp<1, 1, 0b11100, "fcmgt", ARM64fcmgt>; -defm FMULX : SIMDThreeScalarSD<0, 0, 0b11011, "fmulx", int_arm64_neon_fmulx>; -defm FRECPS : SIMDThreeScalarSD<0, 0, 0b11111, "frecps", int_arm64_neon_frecps>; -defm FRSQRTS : SIMDThreeScalarSD<0, 1, 0b11111, "frsqrts", int_arm64_neon_frsqrts>; -defm SQADD : SIMDThreeScalarBHSD<0, 0b00001, "sqadd", int_arm64_neon_sqadd>; -defm SQDMULH : SIMDThreeScalarHS< 0, 0b10110, "sqdmulh", int_arm64_neon_sqdmulh>; -defm SQRDMULH : SIMDThreeScalarHS< 1, 0b10110, "sqrdmulh", int_arm64_neon_sqrdmulh>; -defm SQRSHL : SIMDThreeScalarBHSD<0, 0b01011, "sqrshl",int_arm64_neon_sqrshl>; -defm SQSHL : SIMDThreeScalarBHSD<0, 0b01001, "sqshl", int_arm64_neon_sqshl>; -defm SQSUB : SIMDThreeScalarBHSD<0, 0b00101, "sqsub", int_arm64_neon_sqsub>; -defm SRSHL : SIMDThreeScalarD< 0, 0b01010, "srshl", int_arm64_neon_srshl>; -defm SSHL : SIMDThreeScalarD< 0, 0b01000, "sshl", int_arm64_neon_sshl>; -defm SUB : SIMDThreeScalarD< 1, 0b10000, "sub", sub>; -defm UQADD : SIMDThreeScalarBHSD<1, 0b00001, "uqadd", int_arm64_neon_uqadd>; -defm UQRSHL : SIMDThreeScalarBHSD<1, 0b01011, "uqrshl",int_arm64_neon_uqrshl>; -defm UQSHL : SIMDThreeScalarBHSD<1, 0b01001, "uqshl", int_arm64_neon_uqshl>; 
-defm UQSUB : SIMDThreeScalarBHSD<1, 0b00101, "uqsub", int_arm64_neon_uqsub>; -defm URSHL : SIMDThreeScalarD< 1, 0b01010, "urshl", int_arm64_neon_urshl>; -defm USHL : SIMDThreeScalarD< 1, 0b01000, "ushl", int_arm64_neon_ushl>; - -def : InstAlias<"cmls $dst, $src1, $src2", - (CMHSv1i64 FPR64:$dst, FPR64:$src2, FPR64:$src1), 0>; -def : InstAlias<"cmle $dst, $src1, $src2", - (CMGEv1i64 FPR64:$dst, FPR64:$src2, FPR64:$src1), 0>; -def : InstAlias<"cmlo $dst, $src1, $src2", - (CMHIv1i64 FPR64:$dst, FPR64:$src2, FPR64:$src1), 0>; -def : InstAlias<"cmlt $dst, $src1, $src2", - (CMGTv1i64 FPR64:$dst, FPR64:$src2, FPR64:$src1), 0>; -def : InstAlias<"fcmle $dst, $src1, $src2", - (FCMGE32 FPR32:$dst, FPR32:$src2, FPR32:$src1), 0>; -def : InstAlias<"fcmle $dst, $src1, $src2", - (FCMGE64 FPR64:$dst, FPR64:$src2, FPR64:$src1), 0>; -def : InstAlias<"fcmlt $dst, $src1, $src2", - (FCMGT32 FPR32:$dst, FPR32:$src2, FPR32:$src1), 0>; -def : InstAlias<"fcmlt $dst, $src1, $src2", - (FCMGT64 FPR64:$dst, FPR64:$src2, FPR64:$src1), 0>; -def : InstAlias<"facle $dst, $src1, $src2", - (FACGE32 FPR32:$dst, FPR32:$src2, FPR32:$src1), 0>; -def : InstAlias<"facle $dst, $src1, $src2", - (FACGE64 FPR64:$dst, FPR64:$src2, FPR64:$src1), 0>; -def : InstAlias<"faclt $dst, $src1, $src2", - (FACGT32 FPR32:$dst, FPR32:$src2, FPR32:$src1), 0>; -def : InstAlias<"faclt $dst, $src1, $src2", - (FACGT64 FPR64:$dst, FPR64:$src2, FPR64:$src1), 0>; - -//===----------------------------------------------------------------------===// -// Advanced SIMD three scalar instructions (mixed operands). -//===----------------------------------------------------------------------===// -defm SQDMULL : SIMDThreeScalarMixedHS<0, 0b11010, "sqdmull", - int_arm64_neon_sqdmulls_scalar>; -defm SQDMLAL : SIMDThreeScalarMixedTiedHS<0, 0b10010, "sqdmlal">; -defm SQDMLSL : SIMDThreeScalarMixedTiedHS<0, 0b10110, "sqdmlsl">; - -def : Pat<(i64 (int_arm64_neon_sqadd (i64 FPR64:$Rd), - (i64 (int_arm64_neon_sqdmulls_scalar (i32 FPR32:$Rn), - (i32 FPR32:$Rm))))), - (SQDMLALi32 FPR64:$Rd, FPR32:$Rn, FPR32:$Rm)>; -def : Pat<(i64 (int_arm64_neon_sqsub (i64 FPR64:$Rd), - (i64 (int_arm64_neon_sqdmulls_scalar (i32 FPR32:$Rn), - (i32 FPR32:$Rm))))), - (SQDMLSLi32 FPR64:$Rd, FPR32:$Rn, FPR32:$Rm)>; - -//===----------------------------------------------------------------------===// -// Advanced SIMD two scalar instructions. 
-//===----------------------------------------------------------------------===// - -defm ABS : SIMDTwoScalarD< 0, 0b01011, "abs", int_arm64_neon_abs>; -defm CMEQ : SIMDCmpTwoScalarD< 0, 0b01001, "cmeq", ARM64cmeqz>; -defm CMGE : SIMDCmpTwoScalarD< 1, 0b01000, "cmge", ARM64cmgez>; -defm CMGT : SIMDCmpTwoScalarD< 0, 0b01000, "cmgt", ARM64cmgtz>; -defm CMLE : SIMDCmpTwoScalarD< 1, 0b01001, "cmle", ARM64cmlez>; -defm CMLT : SIMDCmpTwoScalarD< 0, 0b01010, "cmlt", ARM64cmltz>; -defm FCMEQ : SIMDCmpTwoScalarSD<0, 1, 0b01101, "fcmeq", ARM64fcmeqz>; -defm FCMGE : SIMDCmpTwoScalarSD<1, 1, 0b01100, "fcmge", ARM64fcmgez>; -defm FCMGT : SIMDCmpTwoScalarSD<0, 1, 0b01100, "fcmgt", ARM64fcmgtz>; -defm FCMLE : SIMDCmpTwoScalarSD<1, 1, 0b01101, "fcmle", ARM64fcmlez>; -defm FCMLT : SIMDCmpTwoScalarSD<0, 1, 0b01110, "fcmlt", ARM64fcmltz>; -defm FCVTAS : SIMDTwoScalarSD< 0, 0, 0b11100, "fcvtas">; -defm FCVTAU : SIMDTwoScalarSD< 1, 0, 0b11100, "fcvtau">; -defm FCVTMS : SIMDTwoScalarSD< 0, 0, 0b11011, "fcvtms">; -defm FCVTMU : SIMDTwoScalarSD< 1, 0, 0b11011, "fcvtmu">; -defm FCVTNS : SIMDTwoScalarSD< 0, 0, 0b11010, "fcvtns">; -defm FCVTNU : SIMDTwoScalarSD< 1, 0, 0b11010, "fcvtnu">; -defm FCVTPS : SIMDTwoScalarSD< 0, 1, 0b11010, "fcvtps">; -defm FCVTPU : SIMDTwoScalarSD< 1, 1, 0b11010, "fcvtpu">; -def FCVTXNv1i64 : SIMDInexactCvtTwoScalar<0b10110, "fcvtxn">; -defm FCVTZS : SIMDTwoScalarSD< 0, 1, 0b11011, "fcvtzs">; -defm FCVTZU : SIMDTwoScalarSD< 1, 1, 0b11011, "fcvtzu">; -defm FRECPE : SIMDTwoScalarSD< 0, 1, 0b11101, "frecpe">; -defm FRECPX : SIMDTwoScalarSD< 0, 1, 0b11111, "frecpx">; -defm FRSQRTE : SIMDTwoScalarSD< 1, 1, 0b11101, "frsqrte">; -defm NEG : SIMDTwoScalarD< 1, 0b01011, "neg", - UnOpFrag<(sub immAllZerosV, node:$LHS)> >; -defm SCVTF : SIMDTwoScalarCVTSD< 0, 0, 0b11101, "scvtf", ARM64sitof>; -defm SQABS : SIMDTwoScalarBHSD< 0, 0b00111, "sqabs", int_arm64_neon_sqabs>; -defm SQNEG : SIMDTwoScalarBHSD< 1, 0b00111, "sqneg", int_arm64_neon_sqneg>; -defm SQXTN : SIMDTwoScalarMixedBHS< 0, 0b10100, "sqxtn", int_arm64_neon_scalar_sqxtn>; -defm SQXTUN : SIMDTwoScalarMixedBHS< 1, 0b10010, "sqxtun", int_arm64_neon_scalar_sqxtun>; -defm SUQADD : SIMDTwoScalarBHSDTied< 0, 0b00011, "suqadd", - int_arm64_neon_suqadd>; -defm UCVTF : SIMDTwoScalarCVTSD< 1, 0, 0b11101, "ucvtf", ARM64uitof>; -defm UQXTN : SIMDTwoScalarMixedBHS<1, 0b10100, "uqxtn", int_arm64_neon_scalar_uqxtn>; -defm USQADD : SIMDTwoScalarBHSDTied< 1, 0b00011, "usqadd", - int_arm64_neon_usqadd>; - -def : Pat<(ARM64neg (v1i64 V64:$Rn)), (NEGv1i64 V64:$Rn)>; - -def : Pat<(v1i64 (int_arm64_neon_fcvtas (v1f64 FPR64:$Rn))), - (FCVTASv1i64 FPR64:$Rn)>; -def : Pat<(v1i64 (int_arm64_neon_fcvtau (v1f64 FPR64:$Rn))), - (FCVTAUv1i64 FPR64:$Rn)>; -def : Pat<(v1i64 (int_arm64_neon_fcvtms (v1f64 FPR64:$Rn))), - (FCVTMSv1i64 FPR64:$Rn)>; -def : Pat<(v1i64 (int_arm64_neon_fcvtmu (v1f64 FPR64:$Rn))), - (FCVTMUv1i64 FPR64:$Rn)>; -def : Pat<(v1i64 (int_arm64_neon_fcvtns (v1f64 FPR64:$Rn))), - (FCVTNSv1i64 FPR64:$Rn)>; -def : Pat<(v1i64 (int_arm64_neon_fcvtnu (v1f64 FPR64:$Rn))), - (FCVTNUv1i64 FPR64:$Rn)>; -def : Pat<(v1i64 (int_arm64_neon_fcvtps (v1f64 FPR64:$Rn))), - (FCVTPSv1i64 FPR64:$Rn)>; -def : Pat<(v1i64 (int_arm64_neon_fcvtpu (v1f64 FPR64:$Rn))), - (FCVTPUv1i64 FPR64:$Rn)>; - -def : Pat<(f32 (int_arm64_neon_frecpe (f32 FPR32:$Rn))), - (FRECPEv1i32 FPR32:$Rn)>; -def : Pat<(f64 (int_arm64_neon_frecpe (f64 FPR64:$Rn))), - (FRECPEv1i64 FPR64:$Rn)>; -def : Pat<(v1f64 (int_arm64_neon_frecpe (v1f64 FPR64:$Rn))), - (FRECPEv1i64 FPR64:$Rn)>; - -def : Pat<(f32 
(int_arm64_neon_frecpx (f32 FPR32:$Rn))), - (FRECPXv1i32 FPR32:$Rn)>; -def : Pat<(f64 (int_arm64_neon_frecpx (f64 FPR64:$Rn))), - (FRECPXv1i64 FPR64:$Rn)>; - -def : Pat<(f32 (int_arm64_neon_frsqrte (f32 FPR32:$Rn))), - (FRSQRTEv1i32 FPR32:$Rn)>; -def : Pat<(f64 (int_arm64_neon_frsqrte (f64 FPR64:$Rn))), - (FRSQRTEv1i64 FPR64:$Rn)>; -def : Pat<(v1f64 (int_arm64_neon_frsqrte (v1f64 FPR64:$Rn))), - (FRSQRTEv1i64 FPR64:$Rn)>; - -// If an integer is about to be converted to a floating point value, -// just load it on the floating point unit. -// Here are the patterns for 8 and 16-bits to float. -// 8-bits -> float. -multiclass UIntToFPROLoadPat { - def : Pat<(DstTy (uint_to_fp (SrcTy - (loadop (ro.Wpat GPR64sp:$Rn, GPR32:$Rm, - ro.Wext:$extend))))), - (UCVTF (INSERT_SUBREG (DstTy (IMPLICIT_DEF)), - (LDRW GPR64sp:$Rn, GPR32:$Rm, ro.Wext:$extend), - sub))>; - - def : Pat<(DstTy (uint_to_fp (SrcTy - (loadop (ro.Xpat GPR64sp:$Rn, GPR64:$Rm, - ro.Wext:$extend))))), - (UCVTF (INSERT_SUBREG (DstTy (IMPLICIT_DEF)), - (LDRX GPR64sp:$Rn, GPR64:$Rm, ro.Xext:$extend), - sub))>; -} - -defm : UIntToFPROLoadPat; -def : Pat <(f32 (uint_to_fp (i32 - (zextloadi8 (am_indexed8 GPR64sp:$Rn, uimm12s1:$offset))))), - (UCVTFv1i32 (INSERT_SUBREG (f32 (IMPLICIT_DEF)), - (LDRBui GPR64sp:$Rn, uimm12s1:$offset), bsub))>; -def : Pat <(f32 (uint_to_fp (i32 - (zextloadi8 (am_unscaled8 GPR64sp:$Rn, simm9:$offset))))), - (UCVTFv1i32 (INSERT_SUBREG (f32 (IMPLICIT_DEF)), - (LDURBi GPR64sp:$Rn, simm9:$offset), bsub))>; -// 16-bits -> float. -defm : UIntToFPROLoadPat; -def : Pat <(f32 (uint_to_fp (i32 - (zextloadi16 (am_indexed16 GPR64sp:$Rn, uimm12s2:$offset))))), - (UCVTFv1i32 (INSERT_SUBREG (f32 (IMPLICIT_DEF)), - (LDRHui GPR64sp:$Rn, uimm12s2:$offset), hsub))>; -def : Pat <(f32 (uint_to_fp (i32 - (zextloadi16 (am_unscaled16 GPR64sp:$Rn, simm9:$offset))))), - (UCVTFv1i32 (INSERT_SUBREG (f32 (IMPLICIT_DEF)), - (LDURHi GPR64sp:$Rn, simm9:$offset), hsub))>; -// 32-bits are handled in target specific dag combine: -// performIntToFpCombine. -// 64-bits integer to 32-bits floating point, not possible with -// UCVTF on floating point registers (both source and destination -// must have the same size). - -// Here are the patterns for 8, 16, 32, and 64-bits to double. -// 8-bits -> double. -defm : UIntToFPROLoadPat; -def : Pat <(f64 (uint_to_fp (i32 - (zextloadi8 (am_indexed8 GPR64sp:$Rn, uimm12s1:$offset))))), - (UCVTFv1i64 (INSERT_SUBREG (f64 (IMPLICIT_DEF)), - (LDRBui GPR64sp:$Rn, uimm12s1:$offset), bsub))>; -def : Pat <(f64 (uint_to_fp (i32 - (zextloadi8 (am_unscaled8 GPR64sp:$Rn, simm9:$offset))))), - (UCVTFv1i64 (INSERT_SUBREG (f64 (IMPLICIT_DEF)), - (LDURBi GPR64sp:$Rn, simm9:$offset), bsub))>; -// 16-bits -> double. -defm : UIntToFPROLoadPat; -def : Pat <(f64 (uint_to_fp (i32 - (zextloadi16 (am_indexed16 GPR64sp:$Rn, uimm12s2:$offset))))), - (UCVTFv1i64 (INSERT_SUBREG (f64 (IMPLICIT_DEF)), - (LDRHui GPR64sp:$Rn, uimm12s2:$offset), hsub))>; -def : Pat <(f64 (uint_to_fp (i32 - (zextloadi16 (am_unscaled16 GPR64sp:$Rn, simm9:$offset))))), - (UCVTFv1i64 (INSERT_SUBREG (f64 (IMPLICIT_DEF)), - (LDURHi GPR64sp:$Rn, simm9:$offset), hsub))>; -// 32-bits -> double. 
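For reference, the source-level shapes these load-then-convert patterns are aimed at look roughly like the C below (illustrative sketch only, not part of the patch); the point is that the narrow zero-extending load can feed UCVTF directly in an FP register instead of going through a GPR:

/* Illustrative only: unsigned narrow loads converted straight to FP. */
#include <stdint.h>

float  u8_to_f32 (const uint8_t  *p) { return (float)*p;  }   /* 8 bits  -> float  */
float  u16_to_f32(const uint16_t *p) { return (float)*p;  }   /* 16 bits -> float  */
double u8_to_f64 (const uint8_t  *p) { return (double)*p; }   /* 8 bits  -> double */
double u16_to_f64(const uint16_t *p) { return (double)*p; }   /* 16 bits -> double */
double u32_to_f64(const uint32_t *p) { return (double)*p; }   /* 32 bits -> double */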
-defm : UIntToFPROLoadPat; -def : Pat <(f64 (uint_to_fp (i32 - (load (am_indexed32 GPR64sp:$Rn, uimm12s4:$offset))))), - (UCVTFv1i64 (INSERT_SUBREG (f64 (IMPLICIT_DEF)), - (LDRSui GPR64sp:$Rn, uimm12s4:$offset), ssub))>; -def : Pat <(f64 (uint_to_fp (i32 - (load (am_unscaled32 GPR64sp:$Rn, simm9:$offset))))), - (UCVTFv1i64 (INSERT_SUBREG (f64 (IMPLICIT_DEF)), - (LDURSi GPR64sp:$Rn, simm9:$offset), ssub))>; -// 64-bits -> double are handled in target specific dag combine: -// performIntToFpCombine. - -//===----------------------------------------------------------------------===// -// Advanced SIMD three different-sized vector instructions. -//===----------------------------------------------------------------------===// - -defm ADDHN : SIMDNarrowThreeVectorBHS<0,0b0100,"addhn", int_arm64_neon_addhn>; -defm SUBHN : SIMDNarrowThreeVectorBHS<0,0b0110,"subhn", int_arm64_neon_subhn>; -defm RADDHN : SIMDNarrowThreeVectorBHS<1,0b0100,"raddhn",int_arm64_neon_raddhn>; -defm RSUBHN : SIMDNarrowThreeVectorBHS<1,0b0110,"rsubhn",int_arm64_neon_rsubhn>; -defm PMULL : SIMDDifferentThreeVectorBD<0,0b1110,"pmull",int_arm64_neon_pmull>; -defm SABAL : SIMDLongThreeVectorTiedBHSabal<0,0b0101,"sabal", - int_arm64_neon_sabd>; -defm SABDL : SIMDLongThreeVectorBHSabdl<0, 0b0111, "sabdl", - int_arm64_neon_sabd>; -defm SADDL : SIMDLongThreeVectorBHS< 0, 0b0000, "saddl", - BinOpFrag<(add (sext node:$LHS), (sext node:$RHS))>>; -defm SADDW : SIMDWideThreeVectorBHS< 0, 0b0001, "saddw", - BinOpFrag<(add node:$LHS, (sext node:$RHS))>>; -defm SMLAL : SIMDLongThreeVectorTiedBHS<0, 0b1000, "smlal", - TriOpFrag<(add node:$LHS, (int_arm64_neon_smull node:$MHS, node:$RHS))>>; -defm SMLSL : SIMDLongThreeVectorTiedBHS<0, 0b1010, "smlsl", - TriOpFrag<(sub node:$LHS, (int_arm64_neon_smull node:$MHS, node:$RHS))>>; -defm SMULL : SIMDLongThreeVectorBHS<0, 0b1100, "smull", int_arm64_neon_smull>; -defm SQDMLAL : SIMDLongThreeVectorSQDMLXTiedHS<0, 0b1001, "sqdmlal", - int_arm64_neon_sqadd>; -defm SQDMLSL : SIMDLongThreeVectorSQDMLXTiedHS<0, 0b1011, "sqdmlsl", - int_arm64_neon_sqsub>; -defm SQDMULL : SIMDLongThreeVectorHS<0, 0b1101, "sqdmull", - int_arm64_neon_sqdmull>; -defm SSUBL : SIMDLongThreeVectorBHS<0, 0b0010, "ssubl", - BinOpFrag<(sub (sext node:$LHS), (sext node:$RHS))>>; -defm SSUBW : SIMDWideThreeVectorBHS<0, 0b0011, "ssubw", - BinOpFrag<(sub node:$LHS, (sext node:$RHS))>>; -defm UABAL : SIMDLongThreeVectorTiedBHSabal<1, 0b0101, "uabal", - int_arm64_neon_uabd>; -defm UABDL : SIMDLongThreeVectorBHSabdl<1, 0b0111, "uabdl", - int_arm64_neon_uabd>; -defm UADDL : SIMDLongThreeVectorBHS<1, 0b0000, "uaddl", - BinOpFrag<(add (zext node:$LHS), (zext node:$RHS))>>; -defm UADDW : SIMDWideThreeVectorBHS<1, 0b0001, "uaddw", - BinOpFrag<(add node:$LHS, (zext node:$RHS))>>; -defm UMLAL : SIMDLongThreeVectorTiedBHS<1, 0b1000, "umlal", - TriOpFrag<(add node:$LHS, (int_arm64_neon_umull node:$MHS, node:$RHS))>>; -defm UMLSL : SIMDLongThreeVectorTiedBHS<1, 0b1010, "umlsl", - TriOpFrag<(sub node:$LHS, (int_arm64_neon_umull node:$MHS, node:$RHS))>>; -defm UMULL : SIMDLongThreeVectorBHS<1, 0b1100, "umull", int_arm64_neon_umull>; -defm USUBL : SIMDLongThreeVectorBHS<1, 0b0010, "usubl", - BinOpFrag<(sub (zext node:$LHS), (zext node:$RHS))>>; -defm USUBW : SIMDWideThreeVectorBHS< 1, 0b0011, "usubw", - BinOpFrag<(sub node:$LHS, (zext node:$RHS))>>; - -// Patterns for 64-bit pmull -def : Pat<(int_arm64_neon_pmull64 V64:$Rn, V64:$Rm), - (PMULLv1i64 V64:$Rn, V64:$Rm)>; -def : Pat<(int_arm64_neon_pmull64 (vector_extract (v2i64 V128:$Rn), (i64 1)), - 
(vector_extract (v2i64 V128:$Rm), (i64 1))), - (PMULLv2i64 V128:$Rn, V128:$Rm)>; - -// CodeGen patterns for addhn and subhn instructions, which can actually be -// written in LLVM IR without too much difficulty. - -// ADDHN -def : Pat<(v8i8 (trunc (v8i16 (ARM64vlshr (add V128:$Rn, V128:$Rm), (i32 8))))), - (ADDHNv8i16_v8i8 V128:$Rn, V128:$Rm)>; -def : Pat<(v4i16 (trunc (v4i32 (ARM64vlshr (add V128:$Rn, V128:$Rm), - (i32 16))))), - (ADDHNv4i32_v4i16 V128:$Rn, V128:$Rm)>; -def : Pat<(v2i32 (trunc (v2i64 (ARM64vlshr (add V128:$Rn, V128:$Rm), - (i32 32))))), - (ADDHNv2i64_v2i32 V128:$Rn, V128:$Rm)>; -def : Pat<(concat_vectors (v8i8 V64:$Rd), - (trunc (v8i16 (ARM64vlshr (add V128:$Rn, V128:$Rm), - (i32 8))))), - (ADDHNv8i16_v16i8 (SUBREG_TO_REG (i32 0), V64:$Rd, dsub), - V128:$Rn, V128:$Rm)>; -def : Pat<(concat_vectors (v4i16 V64:$Rd), - (trunc (v4i32 (ARM64vlshr (add V128:$Rn, V128:$Rm), - (i32 16))))), - (ADDHNv4i32_v8i16 (SUBREG_TO_REG (i32 0), V64:$Rd, dsub), - V128:$Rn, V128:$Rm)>; -def : Pat<(concat_vectors (v2i32 V64:$Rd), - (trunc (v2i64 (ARM64vlshr (add V128:$Rn, V128:$Rm), - (i32 32))))), - (ADDHNv2i64_v4i32 (SUBREG_TO_REG (i32 0), V64:$Rd, dsub), - V128:$Rn, V128:$Rm)>; - -// SUBHN -def : Pat<(v8i8 (trunc (v8i16 (ARM64vlshr (sub V128:$Rn, V128:$Rm), (i32 8))))), - (SUBHNv8i16_v8i8 V128:$Rn, V128:$Rm)>; -def : Pat<(v4i16 (trunc (v4i32 (ARM64vlshr (sub V128:$Rn, V128:$Rm), - (i32 16))))), - (SUBHNv4i32_v4i16 V128:$Rn, V128:$Rm)>; -def : Pat<(v2i32 (trunc (v2i64 (ARM64vlshr (sub V128:$Rn, V128:$Rm), - (i32 32))))), - (SUBHNv2i64_v2i32 V128:$Rn, V128:$Rm)>; -def : Pat<(concat_vectors (v8i8 V64:$Rd), - (trunc (v8i16 (ARM64vlshr (sub V128:$Rn, V128:$Rm), - (i32 8))))), - (SUBHNv8i16_v16i8 (SUBREG_TO_REG (i32 0), V64:$Rd, dsub), - V128:$Rn, V128:$Rm)>; -def : Pat<(concat_vectors (v4i16 V64:$Rd), - (trunc (v4i32 (ARM64vlshr (sub V128:$Rn, V128:$Rm), - (i32 16))))), - (SUBHNv4i32_v8i16 (SUBREG_TO_REG (i32 0), V64:$Rd, dsub), - V128:$Rn, V128:$Rm)>; -def : Pat<(concat_vectors (v2i32 V64:$Rd), - (trunc (v2i64 (ARM64vlshr (sub V128:$Rn, V128:$Rm), - (i32 32))))), - (SUBHNv2i64_v4i32 (SUBREG_TO_REG (i32 0), V64:$Rd, dsub), - V128:$Rn, V128:$Rm)>; - -//---------------------------------------------------------------------------- -// AdvSIMD bitwise extract from vector instruction. -//---------------------------------------------------------------------------- - -defm EXT : SIMDBitwiseExtract<"ext">; - -def : Pat<(v4i16 (ARM64ext V64:$Rn, V64:$Rm, (i32 imm:$imm))), - (EXTv8i8 V64:$Rn, V64:$Rm, imm:$imm)>; -def : Pat<(v8i16 (ARM64ext V128:$Rn, V128:$Rm, (i32 imm:$imm))), - (EXTv16i8 V128:$Rn, V128:$Rm, imm:$imm)>; -def : Pat<(v2i32 (ARM64ext V64:$Rn, V64:$Rm, (i32 imm:$imm))), - (EXTv8i8 V64:$Rn, V64:$Rm, imm:$imm)>; -def : Pat<(v2f32 (ARM64ext V64:$Rn, V64:$Rm, (i32 imm:$imm))), - (EXTv8i8 V64:$Rn, V64:$Rm, imm:$imm)>; -def : Pat<(v4i32 (ARM64ext V128:$Rn, V128:$Rm, (i32 imm:$imm))), - (EXTv16i8 V128:$Rn, V128:$Rm, imm:$imm)>; -def : Pat<(v4f32 (ARM64ext V128:$Rn, V128:$Rm, (i32 imm:$imm))), - (EXTv16i8 V128:$Rn, V128:$Rm, imm:$imm)>; -def : Pat<(v2i64 (ARM64ext V128:$Rn, V128:$Rm, (i32 imm:$imm))), - (EXTv16i8 V128:$Rn, V128:$Rm, imm:$imm)>; -def : Pat<(v2f64 (ARM64ext V128:$Rn, V128:$Rm, (i32 imm:$imm))), - (EXTv16i8 V128:$Rn, V128:$Rm, imm:$imm)>; - -// We use EXT to handle extract_subvector to copy the upper 64-bits of a -// 128-bit vector. 
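As a rough illustration (not part of the patch), the per-lane C equivalent of the IR idiom the addhn/subhn patterns above match is: add or subtract in the wide type, shift right by half the element width, then truncate.

/* Illustrative only: scalar shape of the addhn/subhn idiom, one 16-bit lane. */
#include <stdint.h>

uint8_t addhn_lane(uint16_t a, uint16_t b) {
  return (uint8_t)((uint16_t)(a + b) >> 8);   /* high half of the wrapping sum */
}

uint8_t subhn_lane(uint16_t a, uint16_t b) {
  return (uint8_t)((uint16_t)(a - b) >> 8);   /* high half of the wrapping difference */
}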
-def : Pat<(v8i8 (extract_subvector V128:$Rn, (i64 8))), - (EXTRACT_SUBREG (EXTv16i8 V128:$Rn, V128:$Rn, 8), dsub)>; -def : Pat<(v4i16 (extract_subvector V128:$Rn, (i64 4))), - (EXTRACT_SUBREG (EXTv16i8 V128:$Rn, V128:$Rn, 8), dsub)>; -def : Pat<(v2i32 (extract_subvector V128:$Rn, (i64 2))), - (EXTRACT_SUBREG (EXTv16i8 V128:$Rn, V128:$Rn, 8), dsub)>; -def : Pat<(v1i64 (extract_subvector V128:$Rn, (i64 1))), - (EXTRACT_SUBREG (EXTv16i8 V128:$Rn, V128:$Rn, 8), dsub)>; -def : Pat<(v2f32 (extract_subvector V128:$Rn, (i64 2))), - (EXTRACT_SUBREG (EXTv16i8 V128:$Rn, V128:$Rn, 8), dsub)>; -def : Pat<(v1f64 (extract_subvector V128:$Rn, (i64 1))), - (EXTRACT_SUBREG (EXTv16i8 V128:$Rn, V128:$Rn, 8), dsub)>; - - -//---------------------------------------------------------------------------- -// AdvSIMD zip vector -//---------------------------------------------------------------------------- - -defm TRN1 : SIMDZipVector<0b010, "trn1", ARM64trn1>; -defm TRN2 : SIMDZipVector<0b110, "trn2", ARM64trn2>; -defm UZP1 : SIMDZipVector<0b001, "uzp1", ARM64uzp1>; -defm UZP2 : SIMDZipVector<0b101, "uzp2", ARM64uzp2>; -defm ZIP1 : SIMDZipVector<0b011, "zip1", ARM64zip1>; -defm ZIP2 : SIMDZipVector<0b111, "zip2", ARM64zip2>; - -//---------------------------------------------------------------------------- -// AdvSIMD TBL/TBX instructions -//---------------------------------------------------------------------------- - -defm TBL : SIMDTableLookup< 0, "tbl">; -defm TBX : SIMDTableLookupTied<1, "tbx">; - -def : Pat<(v8i8 (int_arm64_neon_tbl1 (v16i8 VecListOne128:$Rn), (v8i8 V64:$Ri))), - (TBLv8i8One VecListOne128:$Rn, V64:$Ri)>; -def : Pat<(v16i8 (int_arm64_neon_tbl1 (v16i8 V128:$Ri), (v16i8 V128:$Rn))), - (TBLv16i8One V128:$Ri, V128:$Rn)>; - -def : Pat<(v8i8 (int_arm64_neon_tbx1 (v8i8 V64:$Rd), - (v16i8 VecListOne128:$Rn), (v8i8 V64:$Ri))), - (TBXv8i8One V64:$Rd, VecListOne128:$Rn, V64:$Ri)>; -def : Pat<(v16i8 (int_arm64_neon_tbx1 (v16i8 V128:$Rd), - (v16i8 V128:$Ri), (v16i8 V128:$Rn))), - (TBXv16i8One V128:$Rd, V128:$Ri, V128:$Rn)>; - - -//---------------------------------------------------------------------------- -// AdvSIMD scalar CPY instruction -//---------------------------------------------------------------------------- - -defm CPY : SIMDScalarCPY<"cpy">; - -//---------------------------------------------------------------------------- -// AdvSIMD scalar pairwise instructions -//---------------------------------------------------------------------------- - -defm ADDP : SIMDPairwiseScalarD<0, 0b11011, "addp">; -defm FADDP : SIMDPairwiseScalarSD<1, 0, 0b01101, "faddp">; -defm FMAXNMP : SIMDPairwiseScalarSD<1, 0, 0b01100, "fmaxnmp">; -defm FMAXP : SIMDPairwiseScalarSD<1, 0, 0b01111, "fmaxp">; -defm FMINNMP : SIMDPairwiseScalarSD<1, 1, 0b01100, "fminnmp">; -defm FMINP : SIMDPairwiseScalarSD<1, 1, 0b01111, "fminp">; -def : Pat<(i64 (int_arm64_neon_saddv (v2i64 V128:$Rn))), - (ADDPv2i64p V128:$Rn)>; -def : Pat<(i64 (int_arm64_neon_uaddv (v2i64 V128:$Rn))), - (ADDPv2i64p V128:$Rn)>; -def : Pat<(f32 (int_arm64_neon_faddv (v2f32 V64:$Rn))), - (FADDPv2i32p V64:$Rn)>; -def : Pat<(f32 (int_arm64_neon_faddv (v4f32 V128:$Rn))), - (FADDPv2i32p (EXTRACT_SUBREG (FADDPv4f32 V128:$Rn, V128:$Rn), dsub))>; -def : Pat<(f64 (int_arm64_neon_faddv (v2f64 V128:$Rn))), - (FADDPv2i64p V128:$Rn)>; -def : Pat<(f32 (int_arm64_neon_fmaxnmv (v2f32 V64:$Rn))), - (FMAXNMPv2i32p V64:$Rn)>; -def : Pat<(f64 (int_arm64_neon_fmaxnmv (v2f64 V128:$Rn))), - (FMAXNMPv2i64p V128:$Rn)>; -def : Pat<(f32 (int_arm64_neon_fmaxv (v2f32 V64:$Rn))), - 
(FMAXPv2i32p V64:$Rn)>; -def : Pat<(f64 (int_arm64_neon_fmaxv (v2f64 V128:$Rn))), - (FMAXPv2i64p V128:$Rn)>; -def : Pat<(f32 (int_arm64_neon_fminnmv (v2f32 V64:$Rn))), - (FMINNMPv2i32p V64:$Rn)>; -def : Pat<(f64 (int_arm64_neon_fminnmv (v2f64 V128:$Rn))), - (FMINNMPv2i64p V128:$Rn)>; -def : Pat<(f32 (int_arm64_neon_fminv (v2f32 V64:$Rn))), - (FMINPv2i32p V64:$Rn)>; -def : Pat<(f64 (int_arm64_neon_fminv (v2f64 V128:$Rn))), - (FMINPv2i64p V128:$Rn)>; - -//---------------------------------------------------------------------------- -// AdvSIMD INS/DUP instructions -//---------------------------------------------------------------------------- - -def DUPv8i8gpr : SIMDDupFromMain<0, 0b00001, ".8b", v8i8, V64, GPR32>; -def DUPv16i8gpr : SIMDDupFromMain<1, 0b00001, ".16b", v16i8, V128, GPR32>; -def DUPv4i16gpr : SIMDDupFromMain<0, 0b00010, ".4h", v4i16, V64, GPR32>; -def DUPv8i16gpr : SIMDDupFromMain<1, 0b00010, ".8h", v8i16, V128, GPR32>; -def DUPv2i32gpr : SIMDDupFromMain<0, 0b00100, ".2s", v2i32, V64, GPR32>; -def DUPv4i32gpr : SIMDDupFromMain<1, 0b00100, ".4s", v4i32, V128, GPR32>; -def DUPv2i64gpr : SIMDDupFromMain<1, 0b01000, ".2d", v2i64, V128, GPR64>; - -def DUPv2i64lane : SIMDDup64FromElement; -def DUPv2i32lane : SIMDDup32FromElement<0, ".2s", v2i32, V64>; -def DUPv4i32lane : SIMDDup32FromElement<1, ".4s", v4i32, V128>; -def DUPv4i16lane : SIMDDup16FromElement<0, ".4h", v4i16, V64>; -def DUPv8i16lane : SIMDDup16FromElement<1, ".8h", v8i16, V128>; -def DUPv8i8lane : SIMDDup8FromElement <0, ".8b", v8i8, V64>; -def DUPv16i8lane : SIMDDup8FromElement <1, ".16b", v16i8, V128>; - -def : Pat<(v2f32 (ARM64dup (f32 FPR32:$Rn))), - (v2f32 (DUPv2i32lane - (INSERT_SUBREG (v4i32 (IMPLICIT_DEF)), FPR32:$Rn, ssub), - (i64 0)))>; -def : Pat<(v4f32 (ARM64dup (f32 FPR32:$Rn))), - (v4f32 (DUPv4i32lane - (INSERT_SUBREG (v4i32 (IMPLICIT_DEF)), FPR32:$Rn, ssub), - (i64 0)))>; -def : Pat<(v2f64 (ARM64dup (f64 FPR64:$Rn))), - (v2f64 (DUPv2i64lane - (INSERT_SUBREG (v4i32 (IMPLICIT_DEF)), FPR64:$Rn, dsub), - (i64 0)))>; - -def : Pat<(v2f32 (ARM64duplane32 (v4f32 V128:$Rn), VectorIndexS:$imm)), - (DUPv2i32lane V128:$Rn, VectorIndexS:$imm)>; -def : Pat<(v4f32 (ARM64duplane32 (v4f32 V128:$Rn), VectorIndexS:$imm)), - (DUPv4i32lane V128:$Rn, VectorIndexS:$imm)>; -def : Pat<(v2f64 (ARM64duplane64 (v2f64 V128:$Rn), VectorIndexD:$imm)), - (DUPv2i64lane V128:$Rn, VectorIndexD:$imm)>; - -// If there's an (ARM64dup (vector_extract ...) ...), we can use a duplane -// instruction even if the types don't match: we just have to remap the lane -// carefully. N.b. this trick only applies to truncations. 
-def VecIndex_x2 : SDNodeXForm<imm, [{
-  return CurDAG->getTargetConstant(2 * N->getZExtValue(), MVT::i64);
-}]>;
-def VecIndex_x4 : SDNodeXForm<imm, [{
-  return CurDAG->getTargetConstant(4 * N->getZExtValue(), MVT::i64);
-}]>;
-def VecIndex_x8 : SDNodeXForm<imm, [{
-  return CurDAG->getTargetConstant(8 * N->getZExtValue(), MVT::i64);
-}]>;
-
-multiclass DUPWithTruncPats<ValueType ResVT, ValueType Src64VT, ValueType Src128VT,
-                            ValueType ScalVT, Instruction DUP, SDNodeXForm IdxXFORM> {
-  def : Pat<(ResVT (ARM64dup (ScalVT (vector_extract (Src128VT V128:$Rn), imm:$idx)))),
-            (DUP V128:$Rn, (IdxXFORM imm:$idx))>;
-
-  def : Pat<(ResVT (ARM64dup (ScalVT (vector_extract (Src64VT V64:$Rn), imm:$idx)))),
-            (DUP (SUBREG_TO_REG (i64 0), V64:$Rn, dsub), (IdxXFORM imm:$idx))>;
-}
-
-defm : DUPWithTruncPats;
-defm : DUPWithTruncPats;
-defm : DUPWithTruncPats;
-
-defm : DUPWithTruncPats;
-defm : DUPWithTruncPats;
-defm : DUPWithTruncPats;
-
-multiclass DUPWithTrunci64Pats<ValueType ResVT, Instruction DUP, SDNodeXForm IdxXFORM> {
-  def : Pat<(ResVT (ARM64dup (i32 (trunc (vector_extract (v2i64 V128:$Rn), imm:$idx))))),
-            (DUP V128:$Rn, (IdxXFORM imm:$idx))>;
-
-  def : Pat<(ResVT (ARM64dup (i32 (trunc (vector_extract (v1i64 V64:$Rn), imm:$idx))))),
-            (DUP (SUBREG_TO_REG (i64 0), V64:$Rn, dsub), (IdxXFORM imm:$idx))>;
-}
-
-defm : DUPWithTrunci64Pats;
-defm : DUPWithTrunci64Pats;
-defm : DUPWithTrunci64Pats;
-
-defm : DUPWithTrunci64Pats;
-defm : DUPWithTrunci64Pats;
-defm : DUPWithTrunci64Pats;
-
-// SMOV and UMOV definitions, with some extra patterns for convenience
-defm SMOV : SMov;
-defm UMOV : UMov;
-
-def : Pat<(sext_inreg (vector_extract (v16i8 V128:$Rn), VectorIndexB:$idx), i8),
-          (i32 (SMOVvi8to32 V128:$Rn, VectorIndexB:$idx))>;
-def : Pat<(sext_inreg (vector_extract (v16i8 V128:$Rn), VectorIndexB:$idx), i8),
-          (i64 (SMOVvi8to64 V128:$Rn, VectorIndexB:$idx))>;
-def : Pat<(sext_inreg (vector_extract (v8i16 V128:$Rn), VectorIndexH:$idx),i16),
-          (i32 (SMOVvi16to32 V128:$Rn, VectorIndexH:$idx))>;
-def : Pat<(sext_inreg (vector_extract (v8i16 V128:$Rn), VectorIndexH:$idx),i16),
-          (i64 (SMOVvi16to64 V128:$Rn, VectorIndexH:$idx))>;
-def : Pat<(sext_inreg (vector_extract (v8i16 V128:$Rn), VectorIndexH:$idx),i16),
-          (i32 (SMOVvi16to32 V128:$Rn, VectorIndexH:$idx))>;
-def : Pat<(sext (i32 (vector_extract (v4i32 V128:$Rn), VectorIndexS:$idx))),
-          (i64 (SMOVvi32to64 V128:$Rn, VectorIndexS:$idx))>;
-
-// Extracting i8 or i16 elements will have the zero-extend transformed to
-// an 'and' mask by type legalization since neither i8 nor i16 are legal types
-// for ARM64. Match these patterns here since UMOV already zeroes out the high
-// bits of the destination register.
-def : Pat<(and (vector_extract (v16i8 V128:$Rn), VectorIndexB:$idx), - (i32 0xff)), - (i32 (UMOVvi8 V128:$Rn, VectorIndexB:$idx))>; -def : Pat<(and (vector_extract (v8i16 V128:$Rn), VectorIndexH:$idx), - (i32 0xffff)), - (i32 (UMOVvi16 V128:$Rn, VectorIndexH:$idx))>; - -defm INS : SIMDIns; - -def : Pat<(v16i8 (scalar_to_vector GPR32:$Rn)), - (SUBREG_TO_REG (i32 0), - (f32 (COPY_TO_REGCLASS GPR32:$Rn, FPR32)), ssub)>; -def : Pat<(v8i8 (scalar_to_vector GPR32:$Rn)), - (SUBREG_TO_REG (i32 0), - (f32 (COPY_TO_REGCLASS GPR32:$Rn, FPR32)), ssub)>; - -def : Pat<(v8i16 (scalar_to_vector GPR32:$Rn)), - (SUBREG_TO_REG (i32 0), - (f32 (COPY_TO_REGCLASS GPR32:$Rn, FPR32)), ssub)>; -def : Pat<(v4i16 (scalar_to_vector GPR32:$Rn)), - (SUBREG_TO_REG (i32 0), - (f32 (COPY_TO_REGCLASS GPR32:$Rn, FPR32)), ssub)>; - -def : Pat<(v2i32 (scalar_to_vector (i32 FPR32:$Rn))), - (v2i32 (INSERT_SUBREG (v2i32 (IMPLICIT_DEF)), - (i32 FPR32:$Rn), ssub))>; -def : Pat<(v4i32 (scalar_to_vector (i32 FPR32:$Rn))), - (v4i32 (INSERT_SUBREG (v4i32 (IMPLICIT_DEF)), - (i32 FPR32:$Rn), ssub))>; -def : Pat<(v2i64 (scalar_to_vector (i64 FPR64:$Rn))), - (v2i64 (INSERT_SUBREG (v2i64 (IMPLICIT_DEF)), - (i64 FPR64:$Rn), dsub))>; - -def : Pat<(v4f32 (scalar_to_vector (f32 FPR32:$Rn))), - (INSERT_SUBREG (v4f32 (IMPLICIT_DEF)), FPR32:$Rn, ssub)>; -def : Pat<(v2f32 (scalar_to_vector (f32 FPR32:$Rn))), - (INSERT_SUBREG (v2f32 (IMPLICIT_DEF)), FPR32:$Rn, ssub)>; -def : Pat<(v2f64 (scalar_to_vector (f64 FPR64:$Rn))), - (INSERT_SUBREG (v2f64 (IMPLICIT_DEF)), FPR64:$Rn, dsub)>; - -def : Pat<(v2f32 (vector_insert (v2f32 V64:$Rn), - (f32 FPR32:$Rm), (i64 VectorIndexS:$imm))), - (EXTRACT_SUBREG - (INSvi32lane - (v4f32 (INSERT_SUBREG (v4f32 (IMPLICIT_DEF)), V64:$Rn, dsub)), - VectorIndexS:$imm, - (v4f32 (INSERT_SUBREG (v4f32 (IMPLICIT_DEF)), FPR32:$Rm, ssub)), - (i64 0)), - dsub)>; -def : Pat<(v4f32 (vector_insert (v4f32 V128:$Rn), - (f32 FPR32:$Rm), (i64 VectorIndexS:$imm))), - (INSvi32lane - V128:$Rn, VectorIndexS:$imm, - (v4f32 (INSERT_SUBREG (v4f32 (IMPLICIT_DEF)), FPR32:$Rm, ssub)), - (i64 0))>; -def : Pat<(v2f64 (vector_insert (v2f64 V128:$Rn), - (f64 FPR64:$Rm), (i64 VectorIndexD:$imm))), - (INSvi64lane - V128:$Rn, VectorIndexD:$imm, - (v2f64 (INSERT_SUBREG (v2f64 (IMPLICIT_DEF)), FPR64:$Rm, dsub)), - (i64 0))>; - -// Copy an element at a constant index in one vector into a constant indexed -// element of another. 
-// FIXME refactor to a shared class/dev parameterized on vector type, vector -// index type and INS extension -def : Pat<(v16i8 (int_arm64_neon_vcopy_lane - (v16i8 V128:$Vd), VectorIndexB:$idx, (v16i8 V128:$Vs), - VectorIndexB:$idx2)), - (v16i8 (INSvi8lane - V128:$Vd, VectorIndexB:$idx, V128:$Vs, VectorIndexB:$idx2) - )>; -def : Pat<(v8i16 (int_arm64_neon_vcopy_lane - (v8i16 V128:$Vd), VectorIndexH:$idx, (v8i16 V128:$Vs), - VectorIndexH:$idx2)), - (v8i16 (INSvi16lane - V128:$Vd, VectorIndexH:$idx, V128:$Vs, VectorIndexH:$idx2) - )>; -def : Pat<(v4i32 (int_arm64_neon_vcopy_lane - (v4i32 V128:$Vd), VectorIndexS:$idx, (v4i32 V128:$Vs), - VectorIndexS:$idx2)), - (v4i32 (INSvi32lane - V128:$Vd, VectorIndexS:$idx, V128:$Vs, VectorIndexS:$idx2) - )>; -def : Pat<(v2i64 (int_arm64_neon_vcopy_lane - (v2i64 V128:$Vd), VectorIndexD:$idx, (v2i64 V128:$Vs), - VectorIndexD:$idx2)), - (v2i64 (INSvi64lane - V128:$Vd, VectorIndexD:$idx, V128:$Vs, VectorIndexD:$idx2) - )>; - -multiclass Neon_INS_elt_pattern { - def : Pat<(VT128 (vector_insert V128:$src, - (VTScal (vector_extract (VT128 V128:$Rn), imm:$Immn)), - imm:$Immd)), - (INS V128:$src, imm:$Immd, V128:$Rn, imm:$Immn)>; - - def : Pat<(VT128 (vector_insert V128:$src, - (VTScal (vector_extract (VT64 V64:$Rn), imm:$Immn)), - imm:$Immd)), - (INS V128:$src, imm:$Immd, - (SUBREG_TO_REG (i64 0), V64:$Rn, dsub), imm:$Immn)>; - - def : Pat<(VT64 (vector_insert V64:$src, - (VTScal (vector_extract (VT128 V128:$Rn), imm:$Immn)), - imm:$Immd)), - (EXTRACT_SUBREG (INS (SUBREG_TO_REG (i64 0), V64:$src, dsub), - imm:$Immd, V128:$Rn, imm:$Immn), - dsub)>; - - def : Pat<(VT64 (vector_insert V64:$src, - (VTScal (vector_extract (VT64 V64:$Rn), imm:$Immn)), - imm:$Immd)), - (EXTRACT_SUBREG - (INS (SUBREG_TO_REG (i64 0), V64:$src, dsub), imm:$Immd, - (SUBREG_TO_REG (i64 0), V64:$Rn, dsub), imm:$Immn), - dsub)>; -} - -defm : Neon_INS_elt_pattern; -defm : Neon_INS_elt_pattern; -defm : Neon_INS_elt_pattern; -defm : Neon_INS_elt_pattern; -defm : Neon_INS_elt_pattern; -defm : Neon_INS_elt_pattern; - - -// Floating point vector extractions are codegen'd as either a sequence of -// subregister extractions, possibly fed by an INS if the lane number is -// anything other than zero. -def : Pat<(vector_extract (v2f64 V128:$Rn), 0), - (f64 (EXTRACT_SUBREG V128:$Rn, dsub))>; -def : Pat<(vector_extract (v4f32 V128:$Rn), 0), - (f32 (EXTRACT_SUBREG V128:$Rn, ssub))>; -def : Pat<(vector_extract (v2f64 V128:$Rn), VectorIndexD:$idx), - (f64 (EXTRACT_SUBREG - (INSvi64lane (v2f64 (IMPLICIT_DEF)), 0, - V128:$Rn, VectorIndexD:$idx), - dsub))>; -def : Pat<(vector_extract (v4f32 V128:$Rn), VectorIndexS:$idx), - (f32 (EXTRACT_SUBREG - (INSvi32lane (v4f32 (IMPLICIT_DEF)), 0, - V128:$Rn, VectorIndexS:$idx), - ssub))>; - -// All concat_vectors operations are canonicalised to act on i64 vectors for -// ARM64. In the general case we need an instruction, which had just as well be -// INS. 
-class ConcatPat - : Pat<(DstTy (concat_vectors (SrcTy V64:$Rd), V64:$Rn)), - (INSvi64lane (INSERT_SUBREG (IMPLICIT_DEF), V64:$Rd, dsub), 1, - (INSERT_SUBREG (IMPLICIT_DEF), V64:$Rn, dsub), 0)>; - -def : ConcatPat; -def : ConcatPat; -def : ConcatPat; -def : ConcatPat; -def : ConcatPat; -def : ConcatPat; - -// If the high lanes are undef, though, we can just ignore them: -class ConcatUndefPat - : Pat<(DstTy (concat_vectors (SrcTy V64:$Rn), undef)), - (INSERT_SUBREG (IMPLICIT_DEF), V64:$Rn, dsub)>; - -def : ConcatUndefPat; -def : ConcatUndefPat; -def : ConcatUndefPat; -def : ConcatUndefPat; -def : ConcatUndefPat; -def : ConcatUndefPat; - -//---------------------------------------------------------------------------- -// AdvSIMD across lanes instructions -//---------------------------------------------------------------------------- - -defm ADDV : SIMDAcrossLanesBHS<0, 0b11011, "addv">; -defm SMAXV : SIMDAcrossLanesBHS<0, 0b01010, "smaxv">; -defm SMINV : SIMDAcrossLanesBHS<0, 0b11010, "sminv">; -defm UMAXV : SIMDAcrossLanesBHS<1, 0b01010, "umaxv">; -defm UMINV : SIMDAcrossLanesBHS<1, 0b11010, "uminv">; -defm SADDLV : SIMDAcrossLanesHSD<0, 0b00011, "saddlv">; -defm UADDLV : SIMDAcrossLanesHSD<1, 0b00011, "uaddlv">; -defm FMAXNMV : SIMDAcrossLanesS<0b01100, 0, "fmaxnmv", int_arm64_neon_fmaxnmv>; -defm FMAXV : SIMDAcrossLanesS<0b01111, 0, "fmaxv", int_arm64_neon_fmaxv>; -defm FMINNMV : SIMDAcrossLanesS<0b01100, 1, "fminnmv", int_arm64_neon_fminnmv>; -defm FMINV : SIMDAcrossLanesS<0b01111, 1, "fminv", int_arm64_neon_fminv>; - -multiclass SIMDAcrossLanesSignedIntrinsic { -// If there is a sign extension after this intrinsic, consume it as smov already -// performed it - def : Pat<(i32 (sext_inreg (i32 (intOp (v8i8 V64:$Rn))), i8)), - (i32 (SMOVvi8to32 - (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)), - (!cast(!strconcat(baseOpc, "v8i8v")) V64:$Rn), bsub), - (i64 0)))>; - def : Pat<(i32 (intOp (v8i8 V64:$Rn))), - (i32 (SMOVvi8to32 - (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)), - (!cast(!strconcat(baseOpc, "v8i8v")) V64:$Rn), bsub), - (i64 0)))>; -// If there is a sign extension after this intrinsic, consume it as smov already -// performed it -def : Pat<(i32 (sext_inreg (i32 (intOp (v16i8 V128:$Rn))), i8)), - (i32 (SMOVvi8to32 - (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)), - (!cast(!strconcat(baseOpc, "v16i8v")) V128:$Rn), bsub), - (i64 0)))>; -def : Pat<(i32 (intOp (v16i8 V128:$Rn))), - (i32 (SMOVvi8to32 - (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)), - (!cast(!strconcat(baseOpc, "v16i8v")) V128:$Rn), bsub), - (i64 0)))>; -// If there is a sign extension after this intrinsic, consume it as smov already -// performed it -def : Pat<(i32 (sext_inreg (i32 (intOp (v4i16 V64:$Rn))), i16)), - (i32 (SMOVvi16to32 - (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)), - (!cast(!strconcat(baseOpc, "v4i16v")) V64:$Rn), hsub), - (i64 0)))>; -def : Pat<(i32 (intOp (v4i16 V64:$Rn))), - (i32 (SMOVvi16to32 - (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)), - (!cast(!strconcat(baseOpc, "v4i16v")) V64:$Rn), hsub), - (i64 0)))>; -// If there is a sign extension after this intrinsic, consume it as smov already -// performed it -def : Pat<(i32 (sext_inreg (i32 (intOp (v8i16 V128:$Rn))), i16)), - (i32 (SMOVvi16to32 - (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)), - (!cast(!strconcat(baseOpc, "v8i16v")) V128:$Rn), hsub), - (i64 0)))>; -def : Pat<(i32 (intOp (v8i16 V128:$Rn))), - (i32 (SMOVvi16to32 - (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)), - (!cast(!strconcat(baseOpc, "v8i16v")) V128:$Rn), hsub), - (i64 0)))>; - -def : Pat<(i32 (intOp (v4i32 V128:$Rn))), - (i32 (EXTRACT_SUBREG - 
(INSERT_SUBREG (v16i8 (IMPLICIT_DEF)), - (!cast(!strconcat(baseOpc, "v4i32v")) V128:$Rn), ssub), - ssub))>; -} - -multiclass SIMDAcrossLanesUnsignedIntrinsic { -// If there is a masking operation keeping only what has been actually -// generated, consume it. - def : Pat<(i32 (and (i32 (intOp (v8i8 V64:$Rn))), maski8_or_more)), - (i32 (EXTRACT_SUBREG - (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)), - (!cast(!strconcat(baseOpc, "v8i8v")) V64:$Rn), bsub), - ssub))>; - def : Pat<(i32 (intOp (v8i8 V64:$Rn))), - (i32 (EXTRACT_SUBREG - (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)), - (!cast(!strconcat(baseOpc, "v8i8v")) V64:$Rn), bsub), - ssub))>; -// If there is a masking operation keeping only what has been actually -// generated, consume it. -def : Pat<(i32 (and (i32 (intOp (v16i8 V128:$Rn))), maski8_or_more)), - (i32 (EXTRACT_SUBREG - (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)), - (!cast(!strconcat(baseOpc, "v16i8v")) V128:$Rn), bsub), - ssub))>; -def : Pat<(i32 (intOp (v16i8 V128:$Rn))), - (i32 (EXTRACT_SUBREG - (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)), - (!cast(!strconcat(baseOpc, "v16i8v")) V128:$Rn), bsub), - ssub))>; - -// If there is a masking operation keeping only what has been actually -// generated, consume it. -def : Pat<(i32 (and (i32 (intOp (v4i16 V64:$Rn))), maski16_or_more)), - (i32 (EXTRACT_SUBREG - (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)), - (!cast(!strconcat(baseOpc, "v4i16v")) V64:$Rn), hsub), - ssub))>; -def : Pat<(i32 (intOp (v4i16 V64:$Rn))), - (i32 (EXTRACT_SUBREG - (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)), - (!cast(!strconcat(baseOpc, "v4i16v")) V64:$Rn), hsub), - ssub))>; -// If there is a masking operation keeping only what has been actually -// generated, consume it. -def : Pat<(i32 (and (i32 (intOp (v8i16 V128:$Rn))), maski16_or_more)), - (i32 (EXTRACT_SUBREG - (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)), - (!cast(!strconcat(baseOpc, "v8i16v")) V128:$Rn), hsub), - ssub))>; -def : Pat<(i32 (intOp (v8i16 V128:$Rn))), - (i32 (EXTRACT_SUBREG - (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)), - (!cast(!strconcat(baseOpc, "v8i16v")) V128:$Rn), hsub), - ssub))>; - -def : Pat<(i32 (intOp (v4i32 V128:$Rn))), - (i32 (EXTRACT_SUBREG - (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)), - (!cast(!strconcat(baseOpc, "v4i32v")) V128:$Rn), ssub), - ssub))>; - -} - -multiclass SIMDAcrossLanesSignedLongIntrinsic { - def : Pat<(i32 (intOp (v8i8 V64:$Rn))), - (i32 (SMOVvi16to32 - (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)), - (!cast(!strconcat(baseOpc, "v8i8v")) V64:$Rn), hsub), - (i64 0)))>; -def : Pat<(i32 (intOp (v16i8 V128:$Rn))), - (i32 (SMOVvi16to32 - (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)), - (!cast(!strconcat(baseOpc, "v16i8v")) V128:$Rn), hsub), - (i64 0)))>; - -def : Pat<(i32 (intOp (v4i16 V64:$Rn))), - (i32 (EXTRACT_SUBREG - (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)), - (!cast(!strconcat(baseOpc, "v4i16v")) V64:$Rn), ssub), - ssub))>; -def : Pat<(i32 (intOp (v8i16 V128:$Rn))), - (i32 (EXTRACT_SUBREG - (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)), - (!cast(!strconcat(baseOpc, "v8i16v")) V128:$Rn), ssub), - ssub))>; - -def : Pat<(i64 (intOp (v4i32 V128:$Rn))), - (i64 (EXTRACT_SUBREG - (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)), - (!cast(!strconcat(baseOpc, "v4i32v")) V128:$Rn), dsub), - dsub))>; -} - -multiclass SIMDAcrossLanesUnsignedLongIntrinsic { - def : Pat<(i32 (intOp (v8i8 V64:$Rn))), - (i32 (EXTRACT_SUBREG - (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)), - (!cast(!strconcat(baseOpc, "v8i8v")) V64:$Rn), hsub), - ssub))>; -def : Pat<(i32 (intOp (v16i8 V128:$Rn))), - (i32 (EXTRACT_SUBREG - (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)), - (!cast(!strconcat(baseOpc, 
"v16i8v")) V128:$Rn), hsub), - ssub))>; - -def : Pat<(i32 (intOp (v4i16 V64:$Rn))), - (i32 (EXTRACT_SUBREG - (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)), - (!cast(!strconcat(baseOpc, "v4i16v")) V64:$Rn), ssub), - ssub))>; -def : Pat<(i32 (intOp (v8i16 V128:$Rn))), - (i32 (EXTRACT_SUBREG - (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)), - (!cast(!strconcat(baseOpc, "v8i16v")) V128:$Rn), ssub), - ssub))>; - -def : Pat<(i64 (intOp (v4i32 V128:$Rn))), - (i64 (EXTRACT_SUBREG - (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)), - (!cast(!strconcat(baseOpc, "v4i32v")) V128:$Rn), dsub), - dsub))>; -} - -defm : SIMDAcrossLanesSignedIntrinsic<"ADDV", int_arm64_neon_saddv>; -// vaddv_[su]32 is special; -> ADDP Vd.2S,Vn.2S,Vm.2S; return Vd.s[0];Vn==Vm -def : Pat<(i32 (int_arm64_neon_saddv (v2i32 V64:$Rn))), - (EXTRACT_SUBREG (ADDPv2i32 V64:$Rn, V64:$Rn), ssub)>; - -defm : SIMDAcrossLanesUnsignedIntrinsic<"ADDV", int_arm64_neon_uaddv>; -// vaddv_[su]32 is special; -> ADDP Vd.2S,Vn.2S,Vm.2S; return Vd.s[0];Vn==Vm -def : Pat<(i32 (int_arm64_neon_uaddv (v2i32 V64:$Rn))), - (EXTRACT_SUBREG (ADDPv2i32 V64:$Rn, V64:$Rn), ssub)>; - -defm : SIMDAcrossLanesSignedIntrinsic<"SMAXV", int_arm64_neon_smaxv>; -def : Pat<(i32 (int_arm64_neon_smaxv (v2i32 V64:$Rn))), - (EXTRACT_SUBREG (SMAXPv2i32 V64:$Rn, V64:$Rn), ssub)>; - -defm : SIMDAcrossLanesSignedIntrinsic<"SMINV", int_arm64_neon_sminv>; -def : Pat<(i32 (int_arm64_neon_sminv (v2i32 V64:$Rn))), - (EXTRACT_SUBREG (SMINPv2i32 V64:$Rn, V64:$Rn), ssub)>; - -defm : SIMDAcrossLanesUnsignedIntrinsic<"UMAXV", int_arm64_neon_umaxv>; -def : Pat<(i32 (int_arm64_neon_umaxv (v2i32 V64:$Rn))), - (EXTRACT_SUBREG (UMAXPv2i32 V64:$Rn, V64:$Rn), ssub)>; - -defm : SIMDAcrossLanesUnsignedIntrinsic<"UMINV", int_arm64_neon_uminv>; -def : Pat<(i32 (int_arm64_neon_uminv (v2i32 V64:$Rn))), - (EXTRACT_SUBREG (UMINPv2i32 V64:$Rn, V64:$Rn), ssub)>; - -defm : SIMDAcrossLanesSignedLongIntrinsic<"SADDLV", int_arm64_neon_saddlv>; -defm : SIMDAcrossLanesUnsignedLongIntrinsic<"UADDLV", int_arm64_neon_uaddlv>; - -// The vaddlv_s32 intrinsic gets mapped to SADDLP. -def : Pat<(i64 (int_arm64_neon_saddlv (v2i32 V64:$Rn))), - (i64 (EXTRACT_SUBREG - (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)), - (SADDLPv2i32_v1i64 V64:$Rn), dsub), - dsub))>; -// The vaddlv_u32 intrinsic gets mapped to UADDLP. 
-def : Pat<(i64 (int_arm64_neon_uaddlv (v2i32 V64:$Rn))), - (i64 (EXTRACT_SUBREG - (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)), - (UADDLPv2i32_v1i64 V64:$Rn), dsub), - dsub))>; - -//------------------------------------------------------------------------------ -// AdvSIMD modified immediate instructions -//------------------------------------------------------------------------------ - -// AdvSIMD BIC -defm BIC : SIMDModifiedImmVectorShiftTied<1, 0b11, 0b01, "bic", ARM64bici>; -// AdvSIMD ORR -defm ORR : SIMDModifiedImmVectorShiftTied<0, 0b11, 0b01, "orr", ARM64orri>; - -def : InstAlias<"bic $Vd.4h, $imm", (BICv4i16 V64:$Vd, imm0_255:$imm, 0)>; -def : InstAlias<"bic $Vd.8h, $imm", (BICv8i16 V128:$Vd, imm0_255:$imm, 0)>; -def : InstAlias<"bic $Vd.2s, $imm", (BICv2i32 V64:$Vd, imm0_255:$imm, 0)>; -def : InstAlias<"bic $Vd.4s, $imm", (BICv4i32 V128:$Vd, imm0_255:$imm, 0)>; - -def : InstAlias<"bic.4h $Vd, $imm", (BICv4i16 V64:$Vd, imm0_255:$imm, 0), 0>; -def : InstAlias<"bic.8h $Vd, $imm", (BICv8i16 V128:$Vd, imm0_255:$imm, 0), 0>; -def : InstAlias<"bic.2s $Vd, $imm", (BICv2i32 V64:$Vd, imm0_255:$imm, 0), 0>; -def : InstAlias<"bic.4s $Vd, $imm", (BICv4i32 V128:$Vd, imm0_255:$imm, 0), 0>; - -def : InstAlias<"orr $Vd.4h, $imm", (ORRv4i16 V64:$Vd, imm0_255:$imm, 0)>; -def : InstAlias<"orr $Vd.8h, $imm", (ORRv8i16 V128:$Vd, imm0_255:$imm, 0)>; -def : InstAlias<"orr $Vd.2s, $imm", (ORRv2i32 V64:$Vd, imm0_255:$imm, 0)>; -def : InstAlias<"orr $Vd.4s, $imm", (ORRv4i32 V128:$Vd, imm0_255:$imm, 0)>; - -def : InstAlias<"orr.4h $Vd, $imm", (ORRv4i16 V64:$Vd, imm0_255:$imm, 0), 0>; -def : InstAlias<"orr.8h $Vd, $imm", (ORRv8i16 V128:$Vd, imm0_255:$imm, 0), 0>; -def : InstAlias<"orr.2s $Vd, $imm", (ORRv2i32 V64:$Vd, imm0_255:$imm, 0), 0>; -def : InstAlias<"orr.4s $Vd, $imm", (ORRv4i32 V128:$Vd, imm0_255:$imm, 0), 0>; - -// AdvSIMD FMOV -def FMOVv2f64_ns : SIMDModifiedImmVectorNoShift<1, 1, 0b1111, V128, fpimm8, - "fmov", ".2d", - [(set (v2f64 V128:$Rd), (ARM64fmov imm0_255:$imm8))]>; -def FMOVv2f32_ns : SIMDModifiedImmVectorNoShift<0, 0, 0b1111, V64, fpimm8, - "fmov", ".2s", - [(set (v2f32 V64:$Rd), (ARM64fmov imm0_255:$imm8))]>; -def FMOVv4f32_ns : SIMDModifiedImmVectorNoShift<1, 0, 0b1111, V128, fpimm8, - "fmov", ".4s", - [(set (v4f32 V128:$Rd), (ARM64fmov imm0_255:$imm8))]>; - -// AdvSIMD MOVI - -// EDIT byte mask: scalar -let isReMaterializable = 1, isAsCheapAsAMove = 1 in -def MOVID : SIMDModifiedImmScalarNoShift<0, 1, 0b1110, "movi", - [(set FPR64:$Rd, simdimmtype10:$imm8)]>; -// The movi_edit node has the immediate value already encoded, so we use -// a plain imm0_255 here. -def : Pat<(f64 (ARM64movi_edit imm0_255:$shift)), - (MOVID imm0_255:$shift)>; - -def : Pat<(v1i64 immAllZerosV), (MOVID (i32 0))>; -def : Pat<(v2i32 immAllZerosV), (MOVID (i32 0))>; -def : Pat<(v4i16 immAllZerosV), (MOVID (i32 0))>; -def : Pat<(v8i8 immAllZerosV), (MOVID (i32 0))>; - -def : Pat<(v1i64 immAllOnesV), (MOVID (i32 255))>; -def : Pat<(v2i32 immAllOnesV), (MOVID (i32 255))>; -def : Pat<(v4i16 immAllOnesV), (MOVID (i32 255))>; -def : Pat<(v8i8 immAllOnesV), (MOVID (i32 255))>; - -// EDIT byte mask: 2d - -// The movi_edit node has the immediate value already encoded, so we use -// a plain imm0_255 in the pattern -let isReMaterializable = 1, isAsCheapAsAMove = 1 in -def MOVIv2d_ns : SIMDModifiedImmVectorNoShift<1, 1, 0b1110, V128, - simdimmtype10, - "movi", ".2d", - [(set (v2i64 V128:$Rd), (ARM64movi_edit imm0_255:$imm8))]>; - - -// Use movi.2d to materialize 0.0 if the HW does zero-cycle zeroing. 
-// Complexity is added to break a tie with a plain MOVI. -let AddedComplexity = 1 in { -def : Pat<(f32 fpimm0), - (f32 (EXTRACT_SUBREG (v2i64 (MOVIv2d_ns (i32 0))), ssub))>, - Requires<[HasZCZ]>; -def : Pat<(f64 fpimm0), - (f64 (EXTRACT_SUBREG (v2i64 (MOVIv2d_ns (i32 0))), dsub))>, - Requires<[HasZCZ]>; -} - -def : Pat<(v2i64 immAllZerosV), (MOVIv2d_ns (i32 0))>; -def : Pat<(v4i32 immAllZerosV), (MOVIv2d_ns (i32 0))>; -def : Pat<(v8i16 immAllZerosV), (MOVIv2d_ns (i32 0))>; -def : Pat<(v16i8 immAllZerosV), (MOVIv2d_ns (i32 0))>; - -def : Pat<(v2i64 immAllOnesV), (MOVIv2d_ns (i32 255))>; -def : Pat<(v4i32 immAllOnesV), (MOVIv2d_ns (i32 255))>; -def : Pat<(v8i16 immAllOnesV), (MOVIv2d_ns (i32 255))>; -def : Pat<(v16i8 immAllOnesV), (MOVIv2d_ns (i32 255))>; - -def : Pat<(v2f64 (ARM64dup (f64 fpimm0))), (MOVIv2d_ns (i32 0))>; -def : Pat<(v4f32 (ARM64dup (f32 fpimm0))), (MOVIv2d_ns (i32 0))>; - -// EDIT per word & halfword: 2s, 4h, 4s, & 8h -defm MOVI : SIMDModifiedImmVectorShift<0, 0b10, 0b00, "movi">; - -def : InstAlias<"movi $Vd.4h, $imm", (MOVIv4i16 V64:$Vd, imm0_255:$imm, 0), 0>; -def : InstAlias<"movi $Vd.8h, $imm", (MOVIv8i16 V128:$Vd, imm0_255:$imm, 0), 0>; -def : InstAlias<"movi $Vd.2s, $imm", (MOVIv2i32 V64:$Vd, imm0_255:$imm, 0), 0>; -def : InstAlias<"movi $Vd.4s, $imm", (MOVIv4i32 V128:$Vd, imm0_255:$imm, 0), 0>; - -def : InstAlias<"movi.4h $Vd, $imm", (MOVIv4i16 V64:$Vd, imm0_255:$imm, 0), 0>; -def : InstAlias<"movi.8h $Vd, $imm", (MOVIv8i16 V128:$Vd, imm0_255:$imm, 0), 0>; -def : InstAlias<"movi.2s $Vd, $imm", (MOVIv2i32 V64:$Vd, imm0_255:$imm, 0), 0>; -def : InstAlias<"movi.4s $Vd, $imm", (MOVIv4i32 V128:$Vd, imm0_255:$imm, 0), 0>; - -def : Pat<(v2i32 (ARM64movi_shift imm0_255:$imm8, (i32 imm:$shift))), - (MOVIv2i32 imm0_255:$imm8, imm:$shift)>; -def : Pat<(v4i32 (ARM64movi_shift imm0_255:$imm8, (i32 imm:$shift))), - (MOVIv4i32 imm0_255:$imm8, imm:$shift)>; -def : Pat<(v4i16 (ARM64movi_shift imm0_255:$imm8, (i32 imm:$shift))), - (MOVIv4i16 imm0_255:$imm8, imm:$shift)>; -def : Pat<(v8i16 (ARM64movi_shift imm0_255:$imm8, (i32 imm:$shift))), - (MOVIv8i16 imm0_255:$imm8, imm:$shift)>; - -// EDIT per word: 2s & 4s with MSL shifter -def MOVIv2s_msl : SIMDModifiedImmMoveMSL<0, 0, {1,1,0,?}, V64, "movi", ".2s", - [(set (v2i32 V64:$Rd), - (ARM64movi_msl imm0_255:$imm8, (i32 imm:$shift)))]>; -def MOVIv4s_msl : SIMDModifiedImmMoveMSL<1, 0, {1,1,0,?}, V128, "movi", ".4s", - [(set (v4i32 V128:$Rd), - (ARM64movi_msl imm0_255:$imm8, (i32 imm:$shift)))]>; - -// Per byte: 8b & 16b -def MOVIv8b_ns : SIMDModifiedImmVectorNoShift<0, 0, 0b1110, V64, imm0_255, - "movi", ".8b", - [(set (v8i8 V64:$Rd), (ARM64movi imm0_255:$imm8))]>; -def MOVIv16b_ns : SIMDModifiedImmVectorNoShift<1, 0, 0b1110, V128, imm0_255, - "movi", ".16b", - [(set (v16i8 V128:$Rd), (ARM64movi imm0_255:$imm8))]>; - -// AdvSIMD MVNI - -// EDIT per word & halfword: 2s, 4h, 4s, & 8h -defm MVNI : SIMDModifiedImmVectorShift<1, 0b10, 0b00, "mvni">; - -def : InstAlias<"mvni $Vd.4h, $imm", (MVNIv4i16 V64:$Vd, imm0_255:$imm, 0), 0>; -def : InstAlias<"mvni $Vd.8h, $imm", (MVNIv8i16 V128:$Vd, imm0_255:$imm, 0), 0>; -def : InstAlias<"mvni $Vd.2s, $imm", (MVNIv2i32 V64:$Vd, imm0_255:$imm, 0), 0>; -def : InstAlias<"mvni $Vd.4s, $imm", (MVNIv4i32 V128:$Vd, imm0_255:$imm, 0), 0>; - -def : InstAlias<"mvni.4h $Vd, $imm", (MVNIv4i16 V64:$Vd, imm0_255:$imm, 0), 0>; -def : InstAlias<"mvni.8h $Vd, $imm", (MVNIv8i16 V128:$Vd, imm0_255:$imm, 0), 0>; -def : InstAlias<"mvni.2s $Vd, $imm", (MVNIv2i32 V64:$Vd, imm0_255:$imm, 0), 0>; -def : InstAlias<"mvni.4s 
$Vd, $imm", (MVNIv4i32 V128:$Vd, imm0_255:$imm, 0), 0>; - -def : Pat<(v2i32 (ARM64mvni_shift imm0_255:$imm8, (i32 imm:$shift))), - (MVNIv2i32 imm0_255:$imm8, imm:$shift)>; -def : Pat<(v4i32 (ARM64mvni_shift imm0_255:$imm8, (i32 imm:$shift))), - (MVNIv4i32 imm0_255:$imm8, imm:$shift)>; -def : Pat<(v4i16 (ARM64mvni_shift imm0_255:$imm8, (i32 imm:$shift))), - (MVNIv4i16 imm0_255:$imm8, imm:$shift)>; -def : Pat<(v8i16 (ARM64mvni_shift imm0_255:$imm8, (i32 imm:$shift))), - (MVNIv8i16 imm0_255:$imm8, imm:$shift)>; - -// EDIT per word: 2s & 4s with MSL shifter -def MVNIv2s_msl : SIMDModifiedImmMoveMSL<0, 1, {1,1,0,?}, V64, "mvni", ".2s", - [(set (v2i32 V64:$Rd), - (ARM64mvni_msl imm0_255:$imm8, (i32 imm:$shift)))]>; -def MVNIv4s_msl : SIMDModifiedImmMoveMSL<1, 1, {1,1,0,?}, V128, "mvni", ".4s", - [(set (v4i32 V128:$Rd), - (ARM64mvni_msl imm0_255:$imm8, (i32 imm:$shift)))]>; - -//---------------------------------------------------------------------------- -// AdvSIMD indexed element -//---------------------------------------------------------------------------- - -let neverHasSideEffects = 1 in { - defm FMLA : SIMDFPIndexedSDTied<0, 0b0001, "fmla">; - defm FMLS : SIMDFPIndexedSDTied<0, 0b0101, "fmls">; -} - -// NOTE: Operands are reordered in the FMLA/FMLS PatFrags because the -// instruction expects the addend first, while the intrinsic expects it last. - -// On the other hand, there are quite a few valid combinatorial options due to -// the commutativity of multiplication and the fact that (-x) * y = x * (-y). -defm : SIMDFPIndexedSDTiedPatterns<"FMLA", - TriOpFrag<(fma node:$RHS, node:$MHS, node:$LHS)>>; -defm : SIMDFPIndexedSDTiedPatterns<"FMLA", - TriOpFrag<(fma node:$MHS, node:$RHS, node:$LHS)>>; - -defm : SIMDFPIndexedSDTiedPatterns<"FMLS", - TriOpFrag<(fma node:$MHS, (fneg node:$RHS), node:$LHS)> >; -defm : SIMDFPIndexedSDTiedPatterns<"FMLS", - TriOpFrag<(fma node:$RHS, (fneg node:$MHS), node:$LHS)> >; -defm : SIMDFPIndexedSDTiedPatterns<"FMLS", - TriOpFrag<(fma (fneg node:$RHS), node:$MHS, node:$LHS)> >; -defm : SIMDFPIndexedSDTiedPatterns<"FMLS", - TriOpFrag<(fma (fneg node:$MHS), node:$RHS, node:$LHS)> >; - -multiclass FMLSIndexedAfterNegPatterns { - // 3 variants for the .2s version: DUPLANE from 128-bit, DUPLANE from 64-bit - // and DUP scalar. - def : Pat<(v2f32 (OpNode (v2f32 V64:$Rd), (v2f32 V64:$Rn), - (ARM64duplane32 (v4f32 (fneg V128:$Rm)), - VectorIndexS:$idx))), - (FMLSv2i32_indexed V64:$Rd, V64:$Rn, V128:$Rm, VectorIndexS:$idx)>; - def : Pat<(v2f32 (OpNode (v2f32 V64:$Rd), (v2f32 V64:$Rn), - (v2f32 (ARM64duplane32 - (v4f32 (insert_subvector undef, - (v2f32 (fneg V64:$Rm)), - (i32 0))), - VectorIndexS:$idx)))), - (FMLSv2i32_indexed V64:$Rd, V64:$Rn, - (SUBREG_TO_REG (i32 0), V64:$Rm, dsub), - VectorIndexS:$idx)>; - def : Pat<(v2f32 (OpNode (v2f32 V64:$Rd), (v2f32 V64:$Rn), - (ARM64dup (f32 (fneg FPR32Op:$Rm))))), - (FMLSv2i32_indexed V64:$Rd, V64:$Rn, - (SUBREG_TO_REG (i32 0), FPR32Op:$Rm, ssub), (i64 0))>; - - // 3 variants for the .4s version: DUPLANE from 128-bit, DUPLANE from 64-bit - // and DUP scalar. 
- def : Pat<(v4f32 (OpNode (v4f32 V128:$Rd), (v4f32 V128:$Rn), - (ARM64duplane32 (v4f32 (fneg V128:$Rm)), - VectorIndexS:$idx))), - (FMLSv4i32_indexed V128:$Rd, V128:$Rn, V128:$Rm, - VectorIndexS:$idx)>; - def : Pat<(v4f32 (OpNode (v4f32 V128:$Rd), (v4f32 V128:$Rn), - (v4f32 (ARM64duplane32 - (v4f32 (insert_subvector undef, - (v2f32 (fneg V64:$Rm)), - (i32 0))), - VectorIndexS:$idx)))), - (FMLSv4i32_indexed V128:$Rd, V128:$Rn, - (SUBREG_TO_REG (i32 0), V64:$Rm, dsub), - VectorIndexS:$idx)>; - def : Pat<(v4f32 (OpNode (v4f32 V128:$Rd), (v4f32 V128:$Rn), - (ARM64dup (f32 (fneg FPR32Op:$Rm))))), - (FMLSv4i32_indexed V128:$Rd, V128:$Rn, - (SUBREG_TO_REG (i32 0), FPR32Op:$Rm, ssub), (i64 0))>; - - // 2 variants for the .2d version: DUPLANE from 128-bit, and DUP scalar - // (DUPLANE from 64-bit would be trivial). - def : Pat<(v2f64 (OpNode (v2f64 V128:$Rd), (v2f64 V128:$Rn), - (ARM64duplane64 (v2f64 (fneg V128:$Rm)), - VectorIndexD:$idx))), - (FMLSv2i64_indexed - V128:$Rd, V128:$Rn, V128:$Rm, VectorIndexS:$idx)>; - def : Pat<(v2f64 (OpNode (v2f64 V128:$Rd), (v2f64 V128:$Rn), - (ARM64dup (f64 (fneg FPR64Op:$Rm))))), - (FMLSv2i64_indexed V128:$Rd, V128:$Rn, - (SUBREG_TO_REG (i32 0), FPR64Op:$Rm, dsub), (i64 0))>; - - // 2 variants for 32-bit scalar version: extract from .2s or from .4s - def : Pat<(f32 (OpNode (f32 FPR32:$Rd), (f32 FPR32:$Rn), - (vector_extract (v4f32 (fneg V128:$Rm)), - VectorIndexS:$idx))), - (FMLSv1i32_indexed FPR32:$Rd, FPR32:$Rn, - V128:$Rm, VectorIndexS:$idx)>; - def : Pat<(f32 (OpNode (f32 FPR32:$Rd), (f32 FPR32:$Rn), - (vector_extract (v2f32 (fneg V64:$Rm)), - VectorIndexS:$idx))), - (FMLSv1i32_indexed FPR32:$Rd, FPR32:$Rn, - (SUBREG_TO_REG (i32 0), V64:$Rm, dsub), VectorIndexS:$idx)>; - - // 1 variant for 64-bit scalar version: extract from .1d or from .2d - def : Pat<(f64 (OpNode (f64 FPR64:$Rd), (f64 FPR64:$Rn), - (vector_extract (v2f64 (fneg V128:$Rm)), - VectorIndexS:$idx))), - (FMLSv1i64_indexed FPR64:$Rd, FPR64:$Rn, - V128:$Rm, VectorIndexS:$idx)>; -} - -defm : FMLSIndexedAfterNegPatterns< - TriOpFrag<(fma node:$RHS, node:$MHS, node:$LHS)> >; -defm : FMLSIndexedAfterNegPatterns< - TriOpFrag<(fma node:$MHS, node:$RHS, node:$LHS)> >; - -defm FMULX : SIMDFPIndexedSD<1, 0b1001, "fmulx", int_arm64_neon_fmulx>; -defm FMUL : SIMDFPIndexedSD<0, 0b1001, "fmul", fmul>; - -def : Pat<(v2f32 (fmul V64:$Rn, (ARM64dup (f32 FPR32:$Rm)))), - (FMULv2i32_indexed V64:$Rn, - (INSERT_SUBREG (v4i32 (IMPLICIT_DEF)), FPR32:$Rm, ssub), - (i64 0))>; -def : Pat<(v4f32 (fmul V128:$Rn, (ARM64dup (f32 FPR32:$Rm)))), - (FMULv4i32_indexed V128:$Rn, - (INSERT_SUBREG (v4i32 (IMPLICIT_DEF)), FPR32:$Rm, ssub), - (i64 0))>; -def : Pat<(v2f64 (fmul V128:$Rn, (ARM64dup (f64 FPR64:$Rm)))), - (FMULv2i64_indexed V128:$Rn, - (INSERT_SUBREG (v4i32 (IMPLICIT_DEF)), FPR64:$Rm, dsub), - (i64 0))>; - -defm SQDMULH : SIMDIndexedHS<0, 0b1100, "sqdmulh", int_arm64_neon_sqdmulh>; -defm SQRDMULH : SIMDIndexedHS<0, 0b1101, "sqrdmulh", int_arm64_neon_sqrdmulh>; -defm MLA : SIMDVectorIndexedHSTied<1, 0b0000, "mla", - TriOpFrag<(add node:$LHS, (mul node:$MHS, node:$RHS))>>; -defm MLS : SIMDVectorIndexedHSTied<1, 0b0100, "mls", - TriOpFrag<(sub node:$LHS, (mul node:$MHS, node:$RHS))>>; -defm MUL : SIMDVectorIndexedHS<0, 0b1000, "mul", mul>; -defm SMLAL : SIMDVectorIndexedLongSDTied<0, 0b0010, "smlal", - TriOpFrag<(add node:$LHS, (int_arm64_neon_smull node:$MHS, node:$RHS))>>; -defm SMLSL : SIMDVectorIndexedLongSDTied<0, 0b0110, "smlsl", - TriOpFrag<(sub node:$LHS, (int_arm64_neon_smull node:$MHS, node:$RHS))>>; -defm SMULL 
: SIMDVectorIndexedLongSD<0, 0b1010, "smull", - int_arm64_neon_smull>; -defm SQDMLAL : SIMDIndexedLongSQDMLXSDTied<0, 0b0011, "sqdmlal", - int_arm64_neon_sqadd>; -defm SQDMLSL : SIMDIndexedLongSQDMLXSDTied<0, 0b0111, "sqdmlsl", - int_arm64_neon_sqsub>; -defm SQDMULL : SIMDIndexedLongSD<0, 0b1011, "sqdmull", int_arm64_neon_sqdmull>; -defm UMLAL : SIMDVectorIndexedLongSDTied<1, 0b0010, "umlal", - TriOpFrag<(add node:$LHS, (int_arm64_neon_umull node:$MHS, node:$RHS))>>; -defm UMLSL : SIMDVectorIndexedLongSDTied<1, 0b0110, "umlsl", - TriOpFrag<(sub node:$LHS, (int_arm64_neon_umull node:$MHS, node:$RHS))>>; -defm UMULL : SIMDVectorIndexedLongSD<1, 0b1010, "umull", - int_arm64_neon_umull>; - -// A scalar sqdmull with the second operand being a vector lane can be -// handled directly with the indexed instruction encoding. -def : Pat<(int_arm64_neon_sqdmulls_scalar (i32 FPR32:$Rn), - (vector_extract (v4i32 V128:$Vm), - VectorIndexS:$idx)), - (SQDMULLv1i64_indexed FPR32:$Rn, V128:$Vm, VectorIndexS:$idx)>; - -//---------------------------------------------------------------------------- -// AdvSIMD scalar shift instructions -//---------------------------------------------------------------------------- -defm FCVTZS : SIMDScalarRShiftSD<0, 0b11111, "fcvtzs">; -defm FCVTZU : SIMDScalarRShiftSD<1, 0b11111, "fcvtzu">; -defm SCVTF : SIMDScalarRShiftSD<0, 0b11100, "scvtf">; -defm UCVTF : SIMDScalarRShiftSD<1, 0b11100, "ucvtf">; -// Codegen patterns for the above. We don't put these directly on the -// instructions because TableGen's type inference can't handle the truth. -// Having the same base pattern for fp <--> int totally freaks it out. -def : Pat<(int_arm64_neon_vcvtfp2fxs FPR32:$Rn, vecshiftR32:$imm), - (FCVTZSs FPR32:$Rn, vecshiftR32:$imm)>; -def : Pat<(int_arm64_neon_vcvtfp2fxu FPR32:$Rn, vecshiftR32:$imm), - (FCVTZUs FPR32:$Rn, vecshiftR32:$imm)>; -def : Pat<(i64 (int_arm64_neon_vcvtfp2fxs (f64 FPR64:$Rn), vecshiftR64:$imm)), - (FCVTZSd FPR64:$Rn, vecshiftR64:$imm)>; -def : Pat<(i64 (int_arm64_neon_vcvtfp2fxu (f64 FPR64:$Rn), vecshiftR64:$imm)), - (FCVTZUd FPR64:$Rn, vecshiftR64:$imm)>; -def : Pat<(v1i64 (int_arm64_neon_vcvtfp2fxs (v1f64 FPR64:$Rn), - vecshiftR64:$imm)), - (FCVTZSd FPR64:$Rn, vecshiftR64:$imm)>; -def : Pat<(v1i64 (int_arm64_neon_vcvtfp2fxu (v1f64 FPR64:$Rn), - vecshiftR64:$imm)), - (FCVTZUd FPR64:$Rn, vecshiftR64:$imm)>; -def : Pat<(int_arm64_neon_vcvtfxs2fp FPR32:$Rn, vecshiftR32:$imm), - (SCVTFs FPR32:$Rn, vecshiftR32:$imm)>; -def : Pat<(int_arm64_neon_vcvtfxu2fp FPR32:$Rn, vecshiftR32:$imm), - (UCVTFs FPR32:$Rn, vecshiftR32:$imm)>; -def : Pat<(f64 (int_arm64_neon_vcvtfxs2fp (i64 FPR64:$Rn), vecshiftR64:$imm)), - (SCVTFd FPR64:$Rn, vecshiftR64:$imm)>; -def : Pat<(f64 (int_arm64_neon_vcvtfxu2fp (i64 FPR64:$Rn), vecshiftR64:$imm)), - (UCVTFd FPR64:$Rn, vecshiftR64:$imm)>; -def : Pat<(v1f64 (int_arm64_neon_vcvtfxs2fp (v1i64 FPR64:$Rn), - vecshiftR64:$imm)), - (SCVTFd FPR64:$Rn, vecshiftR64:$imm)>; -def : Pat<(v1f64 (int_arm64_neon_vcvtfxu2fp (v1i64 FPR64:$Rn), - vecshiftR64:$imm)), - (UCVTFd FPR64:$Rn, vecshiftR64:$imm)>; - -defm SHL : SIMDScalarLShiftD< 0, 0b01010, "shl", ARM64vshl>; -defm SLI : SIMDScalarLShiftDTied<1, 0b01010, "sli">; -defm SQRSHRN : SIMDScalarRShiftBHS< 0, 0b10011, "sqrshrn", - int_arm64_neon_sqrshrn>; -defm SQRSHRUN : SIMDScalarRShiftBHS< 1, 0b10001, "sqrshrun", - int_arm64_neon_sqrshrun>; -defm SQSHLU : SIMDScalarLShiftBHSD<1, 0b01100, "sqshlu", ARM64sqshlui>; -defm SQSHL : SIMDScalarLShiftBHSD<0, 0b01110, "sqshl", ARM64sqshli>; -defm SQSHRN : 
SIMDScalarRShiftBHS< 0, 0b10010, "sqshrn", - int_arm64_neon_sqshrn>; -defm SQSHRUN : SIMDScalarRShiftBHS< 1, 0b10000, "sqshrun", - int_arm64_neon_sqshrun>; -defm SRI : SIMDScalarRShiftDTied< 1, 0b01000, "sri">; -defm SRSHR : SIMDScalarRShiftD< 0, 0b00100, "srshr", ARM64srshri>; -defm SRSRA : SIMDScalarRShiftDTied< 0, 0b00110, "srsra", - TriOpFrag<(add node:$LHS, - (ARM64srshri node:$MHS, node:$RHS))>>; -defm SSHR : SIMDScalarRShiftD< 0, 0b00000, "sshr", ARM64vashr>; -defm SSRA : SIMDScalarRShiftDTied< 0, 0b00010, "ssra", - TriOpFrag<(add node:$LHS, - (ARM64vashr node:$MHS, node:$RHS))>>; -defm UQRSHRN : SIMDScalarRShiftBHS< 1, 0b10011, "uqrshrn", - int_arm64_neon_uqrshrn>; -defm UQSHL : SIMDScalarLShiftBHSD<1, 0b01110, "uqshl", ARM64uqshli>; -defm UQSHRN : SIMDScalarRShiftBHS< 1, 0b10010, "uqshrn", - int_arm64_neon_uqshrn>; -defm URSHR : SIMDScalarRShiftD< 1, 0b00100, "urshr", ARM64urshri>; -defm URSRA : SIMDScalarRShiftDTied< 1, 0b00110, "ursra", - TriOpFrag<(add node:$LHS, - (ARM64urshri node:$MHS, node:$RHS))>>; -defm USHR : SIMDScalarRShiftD< 1, 0b00000, "ushr", ARM64vlshr>; -defm USRA : SIMDScalarRShiftDTied< 1, 0b00010, "usra", - TriOpFrag<(add node:$LHS, - (ARM64vlshr node:$MHS, node:$RHS))>>; - -//---------------------------------------------------------------------------- -// AdvSIMD vector shift instructions -//---------------------------------------------------------------------------- -defm FCVTZS:SIMDVectorRShiftSD<0, 0b11111, "fcvtzs", int_arm64_neon_vcvtfp2fxs>; -defm FCVTZU:SIMDVectorRShiftSD<1, 0b11111, "fcvtzu", int_arm64_neon_vcvtfp2fxu>; -defm SCVTF: SIMDVectorRShiftSDToFP<0, 0b11100, "scvtf", - int_arm64_neon_vcvtfxs2fp>; -defm RSHRN : SIMDVectorRShiftNarrowBHS<0, 0b10001, "rshrn", - int_arm64_neon_rshrn>; -defm SHL : SIMDVectorLShiftBHSD<0, 0b01010, "shl", ARM64vshl>; -defm SHRN : SIMDVectorRShiftNarrowBHS<0, 0b10000, "shrn", - BinOpFrag<(trunc (ARM64vashr node:$LHS, node:$RHS))>>; -defm SLI : SIMDVectorLShiftBHSDTied<1, 0b01010, "sli", int_arm64_neon_vsli>; -def : Pat<(v1i64 (int_arm64_neon_vsli (v1i64 FPR64:$Rd), (v1i64 FPR64:$Rn), - (i32 vecshiftL64:$imm))), - (SLId FPR64:$Rd, FPR64:$Rn, vecshiftL64:$imm)>; -defm SQRSHRN : SIMDVectorRShiftNarrowBHS<0, 0b10011, "sqrshrn", - int_arm64_neon_sqrshrn>; -defm SQRSHRUN: SIMDVectorRShiftNarrowBHS<1, 0b10001, "sqrshrun", - int_arm64_neon_sqrshrun>; -defm SQSHLU : SIMDVectorLShiftBHSD<1, 0b01100, "sqshlu", ARM64sqshlui>; -defm SQSHL : SIMDVectorLShiftBHSD<0, 0b01110, "sqshl", ARM64sqshli>; -defm SQSHRN : SIMDVectorRShiftNarrowBHS<0, 0b10010, "sqshrn", - int_arm64_neon_sqshrn>; -defm SQSHRUN : SIMDVectorRShiftNarrowBHS<1, 0b10000, "sqshrun", - int_arm64_neon_sqshrun>; -defm SRI : SIMDVectorRShiftBHSDTied<1, 0b01000, "sri", int_arm64_neon_vsri>; -def : Pat<(v1i64 (int_arm64_neon_vsri (v1i64 FPR64:$Rd), (v1i64 FPR64:$Rn), - (i32 vecshiftR64:$imm))), - (SRId FPR64:$Rd, FPR64:$Rn, vecshiftR64:$imm)>; -defm SRSHR : SIMDVectorRShiftBHSD<0, 0b00100, "srshr", ARM64srshri>; -defm SRSRA : SIMDVectorRShiftBHSDTied<0, 0b00110, "srsra", - TriOpFrag<(add node:$LHS, - (ARM64srshri node:$MHS, node:$RHS))> >; -defm SSHLL : SIMDVectorLShiftLongBHSD<0, 0b10100, "sshll", - BinOpFrag<(ARM64vshl (sext node:$LHS), node:$RHS)>>; - -defm SSHR : SIMDVectorRShiftBHSD<0, 0b00000, "sshr", ARM64vashr>; -defm SSRA : SIMDVectorRShiftBHSDTied<0, 0b00010, "ssra", - TriOpFrag<(add node:$LHS, (ARM64vashr node:$MHS, node:$RHS))>>; -defm UCVTF : SIMDVectorRShiftSDToFP<1, 0b11100, "ucvtf", - int_arm64_neon_vcvtfxu2fp>; -defm UQRSHRN : SIMDVectorRShiftNarrowBHS<1, 
0b10011, "uqrshrn", - int_arm64_neon_uqrshrn>; -defm UQSHL : SIMDVectorLShiftBHSD<1, 0b01110, "uqshl", ARM64uqshli>; -defm UQSHRN : SIMDVectorRShiftNarrowBHS<1, 0b10010, "uqshrn", - int_arm64_neon_uqshrn>; -defm URSHR : SIMDVectorRShiftBHSD<1, 0b00100, "urshr", ARM64urshri>; -defm URSRA : SIMDVectorRShiftBHSDTied<1, 0b00110, "ursra", - TriOpFrag<(add node:$LHS, - (ARM64urshri node:$MHS, node:$RHS))> >; -defm USHLL : SIMDVectorLShiftLongBHSD<1, 0b10100, "ushll", - BinOpFrag<(ARM64vshl (zext node:$LHS), node:$RHS)>>; -defm USHR : SIMDVectorRShiftBHSD<1, 0b00000, "ushr", ARM64vlshr>; -defm USRA : SIMDVectorRShiftBHSDTied<1, 0b00010, "usra", - TriOpFrag<(add node:$LHS, (ARM64vlshr node:$MHS, node:$RHS))> >; - -// SHRN patterns for when a logical right shift was used instead of arithmetic -// (the immediate guarantees no sign bits actually end up in the result so it -// doesn't matter). -def : Pat<(v8i8 (trunc (ARM64vlshr (v8i16 V128:$Rn), vecshiftR16Narrow:$imm))), - (SHRNv8i8_shift V128:$Rn, vecshiftR16Narrow:$imm)>; -def : Pat<(v4i16 (trunc (ARM64vlshr (v4i32 V128:$Rn), vecshiftR32Narrow:$imm))), - (SHRNv4i16_shift V128:$Rn, vecshiftR32Narrow:$imm)>; -def : Pat<(v2i32 (trunc (ARM64vlshr (v2i64 V128:$Rn), vecshiftR64Narrow:$imm))), - (SHRNv2i32_shift V128:$Rn, vecshiftR64Narrow:$imm)>; - -def : Pat<(v16i8 (concat_vectors (v8i8 V64:$Rd), - (trunc (ARM64vlshr (v8i16 V128:$Rn), - vecshiftR16Narrow:$imm)))), - (SHRNv16i8_shift (INSERT_SUBREG (IMPLICIT_DEF), V64:$Rd, dsub), - V128:$Rn, vecshiftR16Narrow:$imm)>; -def : Pat<(v8i16 (concat_vectors (v4i16 V64:$Rd), - (trunc (ARM64vlshr (v4i32 V128:$Rn), - vecshiftR32Narrow:$imm)))), - (SHRNv8i16_shift (INSERT_SUBREG (IMPLICIT_DEF), V64:$Rd, dsub), - V128:$Rn, vecshiftR32Narrow:$imm)>; -def : Pat<(v4i32 (concat_vectors (v2i32 V64:$Rd), - (trunc (ARM64vlshr (v2i64 V128:$Rn), - vecshiftR64Narrow:$imm)))), - (SHRNv4i32_shift (INSERT_SUBREG (IMPLICIT_DEF), V64:$Rd, dsub), - V128:$Rn, vecshiftR32Narrow:$imm)>; - -// Vector sign and zero extensions are implemented with SSHLL and USSHLL. -// Anyexts are implemented as zexts. -def : Pat<(v8i16 (sext (v8i8 V64:$Rn))), (SSHLLv8i8_shift V64:$Rn, (i32 0))>; -def : Pat<(v8i16 (zext (v8i8 V64:$Rn))), (USHLLv8i8_shift V64:$Rn, (i32 0))>; -def : Pat<(v8i16 (anyext (v8i8 V64:$Rn))), (USHLLv8i8_shift V64:$Rn, (i32 0))>; -def : Pat<(v4i32 (sext (v4i16 V64:$Rn))), (SSHLLv4i16_shift V64:$Rn, (i32 0))>; -def : Pat<(v4i32 (zext (v4i16 V64:$Rn))), (USHLLv4i16_shift V64:$Rn, (i32 0))>; -def : Pat<(v4i32 (anyext (v4i16 V64:$Rn))), (USHLLv4i16_shift V64:$Rn, (i32 0))>; -def : Pat<(v2i64 (sext (v2i32 V64:$Rn))), (SSHLLv2i32_shift V64:$Rn, (i32 0))>; -def : Pat<(v2i64 (zext (v2i32 V64:$Rn))), (USHLLv2i32_shift V64:$Rn, (i32 0))>; -def : Pat<(v2i64 (anyext (v2i32 V64:$Rn))), (USHLLv2i32_shift V64:$Rn, (i32 0))>; -// Also match an extend from the upper half of a 128 bit source register. 
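
To make the narrowing and widening shift patterns above concrete, a small hedged C sketch with ACLE intrinsics (the upper-half extend patterns referred to in the last comment follow immediately below); the function names are illustrative and the instructions in the comments are assumptions about what these patterns select, not guarantees:

    #include <arm_neon.h>

    /* Narrowing right shift: expected to select SHRN Vd.8B, Vn.8H, #3.
       Whether the source shift was logical or arithmetic does not matter,
       since the immediate guarantees no sign bits reach the result. */
    uint8x8_t narrow(uint16x8_t v) { return vshrn_n_u16(v, 3); }

    /* Sign extension of the low half: expected SSHLL #0 (the SXTL alias). */
    int16x8_t widen_lo(int8x8_t v) { return vmovl_s8(v); }

    /* Extension from the upper half of a 128-bit register: expected to use
       the second-half form (SSHLL2/SXTL2) via the extract_subvector
       patterns below. */
    int32x4_t widen_hi(int16x8_t v) { return vmovl_s16(vget_high_s16(v)); }
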
-def : Pat<(v8i16 (anyext (v8i8 (extract_subvector V128:$Rn, (i64 8)) ))), - (USHLLv16i8_shift V128:$Rn, (i32 0))>; -def : Pat<(v8i16 (zext (v8i8 (extract_subvector V128:$Rn, (i64 8)) ))), - (USHLLv16i8_shift V128:$Rn, (i32 0))>; -def : Pat<(v8i16 (sext (v8i8 (extract_subvector V128:$Rn, (i64 8)) ))), - (SSHLLv16i8_shift V128:$Rn, (i32 0))>; -def : Pat<(v4i32 (anyext (v4i16 (extract_subvector V128:$Rn, (i64 4)) ))), - (USHLLv8i16_shift V128:$Rn, (i32 0))>; -def : Pat<(v4i32 (zext (v4i16 (extract_subvector V128:$Rn, (i64 4)) ))), - (USHLLv8i16_shift V128:$Rn, (i32 0))>; -def : Pat<(v4i32 (sext (v4i16 (extract_subvector V128:$Rn, (i64 4)) ))), - (SSHLLv8i16_shift V128:$Rn, (i32 0))>; -def : Pat<(v2i64 (anyext (v2i32 (extract_subvector V128:$Rn, (i64 2)) ))), - (USHLLv4i32_shift V128:$Rn, (i32 0))>; -def : Pat<(v2i64 (zext (v2i32 (extract_subvector V128:$Rn, (i64 2)) ))), - (USHLLv4i32_shift V128:$Rn, (i32 0))>; -def : Pat<(v2i64 (sext (v2i32 (extract_subvector V128:$Rn, (i64 2)) ))), - (SSHLLv4i32_shift V128:$Rn, (i32 0))>; - -// Vector shift sxtl aliases -def : InstAlias<"sxtl.8h $dst, $src1", - (SSHLLv8i8_shift V128:$dst, V64:$src1, 0)>; -def : InstAlias<"sxtl $dst.8h, $src1.8b", - (SSHLLv8i8_shift V128:$dst, V64:$src1, 0)>; -def : InstAlias<"sxtl.4s $dst, $src1", - (SSHLLv4i16_shift V128:$dst, V64:$src1, 0)>; -def : InstAlias<"sxtl $dst.4s, $src1.4h", - (SSHLLv4i16_shift V128:$dst, V64:$src1, 0)>; -def : InstAlias<"sxtl.2d $dst, $src1", - (SSHLLv2i32_shift V128:$dst, V64:$src1, 0)>; -def : InstAlias<"sxtl $dst.2d, $src1.2s", - (SSHLLv2i32_shift V128:$dst, V64:$src1, 0)>; - -// Vector shift sxtl2 aliases -def : InstAlias<"sxtl2.8h $dst, $src1", - (SSHLLv16i8_shift V128:$dst, V128:$src1, 0)>; -def : InstAlias<"sxtl2 $dst.8h, $src1.16b", - (SSHLLv16i8_shift V128:$dst, V128:$src1, 0)>; -def : InstAlias<"sxtl2.4s $dst, $src1", - (SSHLLv8i16_shift V128:$dst, V128:$src1, 0)>; -def : InstAlias<"sxtl2 $dst.4s, $src1.8h", - (SSHLLv8i16_shift V128:$dst, V128:$src1, 0)>; -def : InstAlias<"sxtl2.2d $dst, $src1", - (SSHLLv4i32_shift V128:$dst, V128:$src1, 0)>; -def : InstAlias<"sxtl2 $dst.2d, $src1.4s", - (SSHLLv4i32_shift V128:$dst, V128:$src1, 0)>; - -// Vector shift uxtl aliases -def : InstAlias<"uxtl.8h $dst, $src1", - (USHLLv8i8_shift V128:$dst, V64:$src1, 0)>; -def : InstAlias<"uxtl $dst.8h, $src1.8b", - (USHLLv8i8_shift V128:$dst, V64:$src1, 0)>; -def : InstAlias<"uxtl.4s $dst, $src1", - (USHLLv4i16_shift V128:$dst, V64:$src1, 0)>; -def : InstAlias<"uxtl $dst.4s, $src1.4h", - (USHLLv4i16_shift V128:$dst, V64:$src1, 0)>; -def : InstAlias<"uxtl.2d $dst, $src1", - (USHLLv2i32_shift V128:$dst, V64:$src1, 0)>; -def : InstAlias<"uxtl $dst.2d, $src1.2s", - (USHLLv2i32_shift V128:$dst, V64:$src1, 0)>; - -// Vector shift uxtl2 aliases -def : InstAlias<"uxtl2.8h $dst, $src1", - (USHLLv16i8_shift V128:$dst, V128:$src1, 0)>; -def : InstAlias<"uxtl2 $dst.8h, $src1.16b", - (USHLLv16i8_shift V128:$dst, V128:$src1, 0)>; -def : InstAlias<"uxtl2.4s $dst, $src1", - (USHLLv8i16_shift V128:$dst, V128:$src1, 0)>; -def : InstAlias<"uxtl2 $dst.4s, $src1.8h", - (USHLLv8i16_shift V128:$dst, V128:$src1, 0)>; -def : InstAlias<"uxtl2.2d $dst, $src1", - (USHLLv4i32_shift V128:$dst, V128:$src1, 0)>; -def : InstAlias<"uxtl2 $dst.2d, $src1.4s", - (USHLLv4i32_shift V128:$dst, V128:$src1, 0)>; - -// If an integer is about to be converted to a floating point value, -// just load it on the floating point unit. -// These patterns are more complex because floating point loads do not -// support sign extension. 
-// The sign extension has to be explicitly added and is only supported for
-// one step: byte-to-half, half-to-word, word-to-doubleword.
-// SCVTF GPR -> FPR is 9 cycles.
-// SCVTF FPR -> FPR is 4 cycles.
-// (sign extension with lengthen) SXTL FPR -> FPR is 2 cycles.
-// Therefore, we can do 2 sign extensions and one SCVTF FPR -> FPR
-// and still be faster.
-// However, this is not good for code size.
-// 8-bits -> float. 2 sizes step-up.
-class SExtLoadi8CVTf32Pat<dag addrmode, dag INST>
-  : Pat<(f32 (sint_to_fp (i32 (sextloadi8 addrmode)))),
-        (SCVTFv1i32 (f32 (EXTRACT_SUBREG
-                            (SSHLLv4i16_shift
-                              (f64
-                                (EXTRACT_SUBREG
-                                  (SSHLLv8i8_shift
-                                    (INSERT_SUBREG (f64 (IMPLICIT_DEF)),
-                                        INST,
-                                        bsub),
-                                    0),
-                                  dsub)),
-                              0),
-                            ssub)))>, Requires<[NotForCodeSize]>;
-
-def : SExtLoadi8CVTf32Pat<(ro8.Wpat GPR64sp:$Rn, GPR32:$Rm, ro8.Wext:$ext),
-                          (LDRBroW GPR64sp:$Rn, GPR32:$Rm, ro8.Wext:$ext)>;
-def : SExtLoadi8CVTf32Pat<(ro8.Xpat GPR64sp:$Rn, GPR64:$Rm, ro8.Xext:$ext),
-                          (LDRBroX GPR64sp:$Rn, GPR64:$Rm, ro8.Xext:$ext)>;
-def : SExtLoadi8CVTf32Pat<(am_indexed8 GPR64sp:$Rn, uimm12s1:$offset),
-                          (LDRBui GPR64sp:$Rn, uimm12s1:$offset)>;
-def : SExtLoadi8CVTf32Pat<(am_unscaled8 GPR64sp:$Rn, simm9:$offset),
-                          (LDURBi GPR64sp:$Rn, simm9:$offset)>;
-
-// 16-bits -> float. 1 size step-up.
-class SExtLoadi16CVTf32Pat<dag addrmode, dag INST>
-  : Pat<(f32 (sint_to_fp (i32 (sextloadi16 addrmode)))),
-        (SCVTFv1i32 (f32 (EXTRACT_SUBREG
-                            (SSHLLv4i16_shift
-                                (INSERT_SUBREG (f64 (IMPLICIT_DEF)),
-                                  INST,
-                                  hsub),
-                                0),
-                            ssub)))>, Requires<[NotForCodeSize]>;
-
-def : SExtLoadi16CVTf32Pat<(ro16.Wpat GPR64sp:$Rn, GPR32:$Rm, ro16.Wext:$ext),
-                           (LDRHroW GPR64sp:$Rn, GPR32:$Rm, ro16.Wext:$ext)>;
-def : SExtLoadi16CVTf32Pat<(ro16.Xpat GPR64sp:$Rn, GPR64:$Rm, ro16.Xext:$ext),
-                           (LDRHroX GPR64sp:$Rn, GPR64:$Rm, ro16.Xext:$ext)>;
-def : SExtLoadi16CVTf32Pat<(am_indexed16 GPR64sp:$Rn, uimm12s2:$offset),
-                           (LDRHui GPR64sp:$Rn, uimm12s2:$offset)>;
-def : SExtLoadi16CVTf32Pat<(am_unscaled16 GPR64sp:$Rn, simm9:$offset),
-                           (LDURHi GPR64sp:$Rn, simm9:$offset)>;
-
-// 32-bits to 32-bits are handled in target specific dag combine:
-// performIntToFpCombine.
-// 64-bits integer to 32-bits floating point, not possible with
-// SCVTF on floating point registers (both source and destination
-// must have the same size).
-
-// Here are the patterns for 8, 16, 32, and 64-bits to double.
-// 8-bits -> double. 3 size step-up: give up.
-// 16-bits -> double. 2 size step.
-class SExtLoadi16CVTf64Pat<dag addrmode, dag INST>
-  : Pat <(f64 (sint_to_fp (i32 (sextloadi16 addrmode)))),
-         (SCVTFv1i64 (f64 (EXTRACT_SUBREG
-                             (SSHLLv2i32_shift
-                                (f64
-                                  (EXTRACT_SUBREG
-                                    (SSHLLv4i16_shift
-                                     (INSERT_SUBREG (f64 (IMPLICIT_DEF)),
-                                      INST,
-                                      hsub),
-                                     0),
-                                   dsub)),
-                               0),
-                             dsub)))>, Requires<[NotForCodeSize]>;
-
-def : SExtLoadi16CVTf64Pat<(ro16.Wpat GPR64sp:$Rn, GPR32:$Rm, ro16.Wext:$ext),
-                           (LDRHroW GPR64sp:$Rn, GPR32:$Rm, ro16.Wext:$ext)>;
-def : SExtLoadi16CVTf64Pat<(ro16.Xpat GPR64sp:$Rn, GPR64:$Rm, ro16.Xext:$ext),
-                           (LDRHroX GPR64sp:$Rn, GPR64:$Rm, ro16.Xext:$ext)>;
-def : SExtLoadi16CVTf64Pat<(am_indexed16 GPR64sp:$Rn, uimm12s2:$offset),
-                           (LDRHui GPR64sp:$Rn, uimm12s2:$offset)>;
-def : SExtLoadi16CVTf64Pat<(am_unscaled16 GPR64sp:$Rn, simm9:$offset),
-                           (LDURHi GPR64sp:$Rn, simm9:$offset)>;
-// 32-bits -> double. 1 size step-up.
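
Before the 32-bit-to-double class that follows, here is a rough C-level picture of the code the cycle-count reasoning above is about. The instruction sequences in the comments are expectations based on those comments, assume an arm64/aarch64 target, and only apply when not optimizing for size (the Requires<[NotForCodeSize]> predicate):

    #include <stdint.h>

    /* Expected: load the byte straight into an FP/SIMD register (LDR Bd),
       sign-extend twice on the vector unit (SXTL b->h, then h->s) and
       convert with SCVTF FPR->FPR, instead of LDRSB Wn + SCVTF GPR->FPR. */
    float byte_to_float(const int8_t *p) { return (float)*p; }

    /* One size step up: LDR Hd, a single SXTL, then SCVTF FPR->FPR. */
    float half_to_float(const int16_t *p) { return (float)*p; }
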
-class SExtLoadi32CVTf64Pat - : Pat <(f64 (sint_to_fp (i32 (load addrmode)))), - (SCVTFv1i64 (f64 (EXTRACT_SUBREG - (SSHLLv2i32_shift - (INSERT_SUBREG (f64 (IMPLICIT_DEF)), - INST, - ssub), - 0), - dsub)))>, Requires<[NotForCodeSize]>; - -def : SExtLoadi32CVTf64Pat<(ro32.Wpat GPR64sp:$Rn, GPR32:$Rm, ro32.Wext:$ext), - (LDRSroW GPR64sp:$Rn, GPR32:$Rm, ro32.Wext:$ext)>; -def : SExtLoadi32CVTf64Pat<(ro32.Xpat GPR64sp:$Rn, GPR64:$Rm, ro32.Xext:$ext), - (LDRSroX GPR64sp:$Rn, GPR64:$Rm, ro32.Xext:$ext)>; -def : SExtLoadi32CVTf64Pat<(am_indexed32 GPR64sp:$Rn, uimm12s4:$offset), - (LDRSui GPR64sp:$Rn, uimm12s4:$offset)>; -def : SExtLoadi32CVTf64Pat<(am_unscaled32 GPR64sp:$Rn, simm9:$offset), - (LDURSi GPR64sp:$Rn, simm9:$offset)>; - -// 64-bits -> double are handled in target specific dag combine: -// performIntToFpCombine. - - -//---------------------------------------------------------------------------- -// AdvSIMD Load-Store Structure -//---------------------------------------------------------------------------- -defm LD1 : SIMDLd1Multiple<"ld1">; -defm LD2 : SIMDLd2Multiple<"ld2">; -defm LD3 : SIMDLd3Multiple<"ld3">; -defm LD4 : SIMDLd4Multiple<"ld4">; - -defm ST1 : SIMDSt1Multiple<"st1">; -defm ST2 : SIMDSt2Multiple<"st2">; -defm ST3 : SIMDSt3Multiple<"st3">; -defm ST4 : SIMDSt4Multiple<"st4">; - -class Ld1Pat - : Pat<(ty (load GPR64sp:$Rn)), (INST GPR64sp:$Rn)>; - -def : Ld1Pat; -def : Ld1Pat; -def : Ld1Pat; -def : Ld1Pat; -def : Ld1Pat; -def : Ld1Pat; -def : Ld1Pat; -def : Ld1Pat; - -class St1Pat - : Pat<(store ty:$Vt, GPR64sp:$Rn), - (INST ty:$Vt, GPR64sp:$Rn)>; - -def : St1Pat; -def : St1Pat; -def : St1Pat; -def : St1Pat; -def : St1Pat; -def : St1Pat; -def : St1Pat; -def : St1Pat; - -//--- -// Single-element -//--- - -defm LD1R : SIMDLdR<0, 0b110, 0, "ld1r", "One", 1, 2, 4, 8>; -defm LD2R : SIMDLdR<1, 0b110, 0, "ld2r", "Two", 2, 4, 8, 16>; -defm LD3R : SIMDLdR<0, 0b111, 0, "ld3r", "Three", 3, 6, 12, 24>; -defm LD4R : SIMDLdR<1, 0b111, 0, "ld4r", "Four", 4, 8, 16, 32>; -let mayLoad = 1, neverHasSideEffects = 1 in { -defm LD1 : SIMDLdSingleBTied<0, 0b000, "ld1", VecListOneb, GPR64pi1>; -defm LD1 : SIMDLdSingleHTied<0, 0b010, 0, "ld1", VecListOneh, GPR64pi2>; -defm LD1 : SIMDLdSingleSTied<0, 0b100, 0b00, "ld1", VecListOnes, GPR64pi4>; -defm LD1 : SIMDLdSingleDTied<0, 0b100, 0b01, "ld1", VecListOned, GPR64pi8>; -defm LD2 : SIMDLdSingleBTied<1, 0b000, "ld2", VecListTwob, GPR64pi2>; -defm LD2 : SIMDLdSingleHTied<1, 0b010, 0, "ld2", VecListTwoh, GPR64pi4>; -defm LD2 : SIMDLdSingleSTied<1, 0b100, 0b00, "ld2", VecListTwos, GPR64pi8>; -defm LD2 : SIMDLdSingleDTied<1, 0b100, 0b01, "ld2", VecListTwod, GPR64pi16>; -defm LD3 : SIMDLdSingleBTied<0, 0b001, "ld3", VecListThreeb, GPR64pi3>; -defm LD3 : SIMDLdSingleHTied<0, 0b011, 0, "ld3", VecListThreeh, GPR64pi6>; -defm LD3 : SIMDLdSingleSTied<0, 0b101, 0b00, "ld3", VecListThrees, GPR64pi12>; -defm LD3 : SIMDLdSingleDTied<0, 0b101, 0b01, "ld3", VecListThreed, GPR64pi24>; -defm LD4 : SIMDLdSingleBTied<1, 0b001, "ld4", VecListFourb, GPR64pi4>; -defm LD4 : SIMDLdSingleHTied<1, 0b011, 0, "ld4", VecListFourh, GPR64pi8>; -defm LD4 : SIMDLdSingleSTied<1, 0b101, 0b00, "ld4", VecListFours, GPR64pi16>; -defm LD4 : SIMDLdSingleDTied<1, 0b101, 0b01, "ld4", VecListFourd, GPR64pi32>; -} - -def : Pat<(v8i8 (ARM64dup (i32 (extloadi8 GPR64sp:$Rn)))), - (LD1Rv8b GPR64sp:$Rn)>; -def : Pat<(v16i8 (ARM64dup (i32 (extloadi8 GPR64sp:$Rn)))), - (LD1Rv16b GPR64sp:$Rn)>; -def : Pat<(v4i16 (ARM64dup (i32 (extloadi16 GPR64sp:$Rn)))), - (LD1Rv4h GPR64sp:$Rn)>; -def : Pat<(v8i16 
(ARM64dup (i32 (extloadi16 GPR64sp:$Rn)))), - (LD1Rv8h GPR64sp:$Rn)>; -def : Pat<(v2i32 (ARM64dup (i32 (load GPR64sp:$Rn)))), - (LD1Rv2s GPR64sp:$Rn)>; -def : Pat<(v4i32 (ARM64dup (i32 (load GPR64sp:$Rn)))), - (LD1Rv4s GPR64sp:$Rn)>; -def : Pat<(v2i64 (ARM64dup (i64 (load GPR64sp:$Rn)))), - (LD1Rv2d GPR64sp:$Rn)>; -def : Pat<(v1i64 (ARM64dup (i64 (load GPR64sp:$Rn)))), - (LD1Rv1d GPR64sp:$Rn)>; -// Grab the floating point version too -def : Pat<(v2f32 (ARM64dup (f32 (load GPR64sp:$Rn)))), - (LD1Rv2s GPR64sp:$Rn)>; -def : Pat<(v4f32 (ARM64dup (f32 (load GPR64sp:$Rn)))), - (LD1Rv4s GPR64sp:$Rn)>; -def : Pat<(v2f64 (ARM64dup (f64 (load GPR64sp:$Rn)))), - (LD1Rv2d GPR64sp:$Rn)>; -def : Pat<(v1f64 (ARM64dup (f64 (load GPR64sp:$Rn)))), - (LD1Rv1d GPR64sp:$Rn)>; - -class Ld1Lane128Pat - : Pat<(vector_insert (VTy VecListOne128:$Rd), - (STy (scalar_load GPR64sp:$Rn)), VecIndex:$idx), - (LD1 VecListOne128:$Rd, VecIndex:$idx, GPR64sp:$Rn)>; - -def : Ld1Lane128Pat; -def : Ld1Lane128Pat; -def : Ld1Lane128Pat; -def : Ld1Lane128Pat; -def : Ld1Lane128Pat; -def : Ld1Lane128Pat; - -class Ld1Lane64Pat - : Pat<(vector_insert (VTy VecListOne64:$Rd), - (STy (scalar_load GPR64sp:$Rn)), VecIndex:$idx), - (EXTRACT_SUBREG - (LD1 (SUBREG_TO_REG (i32 0), VecListOne64:$Rd, dsub), - VecIndex:$idx, GPR64sp:$Rn), - dsub)>; - -def : Ld1Lane64Pat; -def : Ld1Lane64Pat; -def : Ld1Lane64Pat; -def : Ld1Lane64Pat; - - -defm LD1 : SIMDLdSt1SingleAliases<"ld1">; -defm LD2 : SIMDLdSt2SingleAliases<"ld2">; -defm LD3 : SIMDLdSt3SingleAliases<"ld3">; -defm LD4 : SIMDLdSt4SingleAliases<"ld4">; - -// Stores -defm ST1 : SIMDStSingleB<0, 0b000, "st1", VecListOneb, GPR64pi1>; -defm ST1 : SIMDStSingleH<0, 0b010, 0, "st1", VecListOneh, GPR64pi2>; -defm ST1 : SIMDStSingleS<0, 0b100, 0b00, "st1", VecListOnes, GPR64pi4>; -defm ST1 : SIMDStSingleD<0, 0b100, 0b01, "st1", VecListOned, GPR64pi8>; - -let AddedComplexity = 15 in -class St1Lane128Pat - : Pat<(scalar_store - (STy (vector_extract (VTy VecListOne128:$Vt), VecIndex:$idx)), - GPR64sp:$Rn), - (ST1 VecListOne128:$Vt, VecIndex:$idx, GPR64sp:$Rn)>; - -def : St1Lane128Pat; -def : St1Lane128Pat; -def : St1Lane128Pat; -def : St1Lane128Pat; -def : St1Lane128Pat; -def : St1Lane128Pat; - -let AddedComplexity = 15 in -class St1Lane64Pat - : Pat<(scalar_store - (STy (vector_extract (VTy VecListOne64:$Vt), VecIndex:$idx)), - GPR64sp:$Rn), - (ST1 (SUBREG_TO_REG (i32 0), VecListOne64:$Vt, dsub), - VecIndex:$idx, GPR64sp:$Rn)>; - -def : St1Lane64Pat; -def : St1Lane64Pat; -def : St1Lane64Pat; -def : St1Lane64Pat; - -multiclass St1LanePost64Pat { - def : Pat<(scalar_store - (STy (vector_extract (VTy VecListOne64:$Vt), VecIndex:$idx)), - GPR64sp:$Rn, offset), - (ST1 (SUBREG_TO_REG (i32 0), VecListOne64:$Vt, dsub), - VecIndex:$idx, GPR64sp:$Rn, XZR)>; - - def : Pat<(scalar_store - (STy (vector_extract (VTy VecListOne64:$Vt), VecIndex:$idx)), - GPR64sp:$Rn, GPR64:$Rm), - (ST1 (SUBREG_TO_REG (i32 0), VecListOne64:$Vt, dsub), - VecIndex:$idx, GPR64sp:$Rn, $Rm)>; -} - -defm : St1LanePost64Pat; -defm : St1LanePost64Pat; -defm : St1LanePost64Pat; -defm : St1LanePost64Pat; -defm : St1LanePost64Pat; -defm : St1LanePost64Pat; - -multiclass St1LanePost128Pat { - def : Pat<(scalar_store - (STy (vector_extract (VTy VecListOne128:$Vt), VecIndex:$idx)), - GPR64sp:$Rn, offset), - (ST1 VecListOne128:$Vt, VecIndex:$idx, GPR64sp:$Rn, XZR)>; - - def : Pat<(scalar_store - (STy (vector_extract (VTy VecListOne128:$Vt), VecIndex:$idx)), - GPR64sp:$Rn, GPR64:$Rm), - (ST1 VecListOne128:$Vt, VecIndex:$idx, GPR64sp:$Rn, $Rm)>; -} - 
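
The LD1R dup-load patterns and the single-lane load/store patterns above correspond, roughly, to the following intrinsic-level operations; a minimal sketch with invented function names, where the selections in the comments are the expected outcome on arm64/aarch64 rather than a guarantee:

    #include <arm_neon.h>

    /* Load one float and replicate it to all four lanes: expected LD1R { v.4s }. */
    float32x4_t splat_load(const float *p) { return vld1q_dup_f32(p); }

    /* Load into a single lane of an existing vector: expected LD1 { v.h }[2]. */
    int16x8_t load_lane2(int16x8_t v, const int16_t *p) {
      return vld1q_lane_s16(p, v, 2);
    }

    /* Store a single lane: expected ST1 { v.s }[1]. */
    void store_lane1(float *p, float32x4_t v) { vst1q_lane_f32(p, v, 1); }
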
-defm : St1LanePost128Pat;
-defm : St1LanePost128Pat;
-defm : St1LanePost128Pat;
-defm : St1LanePost128Pat;
-defm : St1LanePost128Pat;
-defm : St1LanePost128Pat;
-
-let mayStore = 1, neverHasSideEffects = 1 in {
-defm ST2 : SIMDStSingleB<1, 0b000, "st2", VecListTwob, GPR64pi2>;
-defm ST2 : SIMDStSingleH<1, 0b010, 0, "st2", VecListTwoh, GPR64pi4>;
-defm ST2 : SIMDStSingleS<1, 0b100, 0b00, "st2", VecListTwos, GPR64pi8>;
-defm ST2 : SIMDStSingleD<1, 0b100, 0b01, "st2", VecListTwod, GPR64pi16>;
-defm ST3 : SIMDStSingleB<0, 0b001, "st3", VecListThreeb, GPR64pi3>;
-defm ST3 : SIMDStSingleH<0, 0b011, 0, "st3", VecListThreeh, GPR64pi6>;
-defm ST3 : SIMDStSingleS<0, 0b101, 0b00, "st3", VecListThrees, GPR64pi12>;
-defm ST3 : SIMDStSingleD<0, 0b101, 0b01, "st3", VecListThreed, GPR64pi24>;
-defm ST4 : SIMDStSingleB<1, 0b001, "st4", VecListFourb, GPR64pi4>;
-defm ST4 : SIMDStSingleH<1, 0b011, 0, "st4", VecListFourh, GPR64pi8>;
-defm ST4 : SIMDStSingleS<1, 0b101, 0b00, "st4", VecListFours, GPR64pi16>;
-defm ST4 : SIMDStSingleD<1, 0b101, 0b01, "st4", VecListFourd, GPR64pi32>;
-}
-
-defm ST1 : SIMDLdSt1SingleAliases<"st1">;
-defm ST2 : SIMDLdSt2SingleAliases<"st2">;
-defm ST3 : SIMDLdSt3SingleAliases<"st3">;
-defm ST4 : SIMDLdSt4SingleAliases<"st4">;
-
-//----------------------------------------------------------------------------
-// Crypto extensions
-//----------------------------------------------------------------------------
-
-def AESErr : AESTiedInst<0b0100, "aese", int_arm64_crypto_aese>;
-def AESDrr : AESTiedInst<0b0101, "aesd", int_arm64_crypto_aesd>;
-def AESMCrr : AESInst< 0b0110, "aesmc", int_arm64_crypto_aesmc>;
-def AESIMCrr : AESInst< 0b0111, "aesimc", int_arm64_crypto_aesimc>;
-
-def SHA1Crrr : SHATiedInstQSV<0b000, "sha1c", int_arm64_crypto_sha1c>;
-def SHA1Prrr : SHATiedInstQSV<0b001, "sha1p", int_arm64_crypto_sha1p>;
-def SHA1Mrrr : SHATiedInstQSV<0b010, "sha1m", int_arm64_crypto_sha1m>;
-def SHA1SU0rrr : SHATiedInstVVV<0b011, "sha1su0", int_arm64_crypto_sha1su0>;
-def SHA256Hrrr : SHATiedInstQQV<0b100, "sha256h", int_arm64_crypto_sha256h>;
-def SHA256H2rrr : SHATiedInstQQV<0b101, "sha256h2",int_arm64_crypto_sha256h2>;
-def SHA256SU1rrr :SHATiedInstVVV<0b110, "sha256su1",int_arm64_crypto_sha256su1>;
-
-def SHA1Hrr : SHAInstSS< 0b0000, "sha1h", int_arm64_crypto_sha1h>;
-def SHA1SU1rr : SHATiedInstVV<0b0001, "sha1su1", int_arm64_crypto_sha1su1>;
-def SHA256SU0rr : SHATiedInstVV<0b0010, "sha256su0",int_arm64_crypto_sha256su0>;
-
-//----------------------------------------------------------------------------
-// Compiler-pseudos
-//----------------------------------------------------------------------------
-// FIXME: Like for X86, these should go in their own separate .td file.
-
-// Any instruction that defines a 32-bit result leaves the high half of the
-// register. Truncate can be lowered to EXTRACT_SUBREG. CopyFromReg may
-// be copying from a truncate. But any other 32-bit operation will zero-extend
-// up to 64 bits.
-// FIXME: X86 also checks for CMOV here. Do we need something similar?
-def def32 : PatLeaf<(i32 GPR32:$src), [{
-  return N->getOpcode() != ISD::TRUNCATE &&
-         N->getOpcode() != TargetOpcode::EXTRACT_SUBREG &&
-         N->getOpcode() != ISD::CopyFromReg;
-}]>;
-
-// In the case of a 32-bit def that is known to implicitly zero-extend,
-// we can use a SUBREG_TO_REG.
-def : Pat<(i64 (zext def32:$src)), (SUBREG_TO_REG (i64 0), GPR32:$src, sub_32)>;
-
-// For an anyext, we don't care what the high bits are, so we can perform an
-// INSERT_SUBREG into an IMPLICIT_DEF.
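
At the C level, the def32/SUBREG_TO_REG trick above and the explicit UBFM/SBFM patterns that follow are what make these conversions cheap; a minimal sketch, with the comments describing the expected (not guaranteed) selection:

    #include <stdint.h>

    /* The 32-bit ADD already zeroes the upper half, so the cast is expected
       to be free: just a SUBREG_TO_REG at the MI level, no extra instruction. */
    uint64_t add_zext(uint32_t a, uint32_t b) { return (uint64_t)(a + b); }

    /* When the producer is not known to zero-extend (e.g. an incoming
       argument, i.e. a CopyFromReg), the explicit zero-extension pattern
       below is expected to use UBFM. */
    uint64_t zext_arg(uint32_t a) { return a; }

    /* Sign extension is expected to use SBFM, printed as SXTW. */
    int64_t sext_arg(int32_t a) { return a; }
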
-def : Pat<(i64 (anyext GPR32:$src)), - (INSERT_SUBREG (i64 (IMPLICIT_DEF)), GPR32:$src, sub_32)>; - -// When we need to explicitly zero-extend, we use an unsigned bitfield move -// instruction (UBFM) on the enclosing super-reg. -def : Pat<(i64 (zext GPR32:$src)), - (UBFMXri (INSERT_SUBREG (i64 (IMPLICIT_DEF)), GPR32:$src, sub_32), 0, 31)>; - -// To sign extend, we use a signed bitfield move instruction (SBFM) on the -// containing super-reg. -def : Pat<(i64 (sext GPR32:$src)), - (SBFMXri (INSERT_SUBREG (i64 (IMPLICIT_DEF)), GPR32:$src, sub_32), 0, 31)>; -def : Pat<(i64 (sext_inreg GPR64:$src, i32)), (SBFMXri GPR64:$src, 0, 31)>; -def : Pat<(i64 (sext_inreg GPR64:$src, i16)), (SBFMXri GPR64:$src, 0, 15)>; -def : Pat<(i64 (sext_inreg GPR64:$src, i8)), (SBFMXri GPR64:$src, 0, 7)>; -def : Pat<(i64 (sext_inreg GPR64:$src, i1)), (SBFMXri GPR64:$src, 0, 0)>; -def : Pat<(i32 (sext_inreg GPR32:$src, i16)), (SBFMWri GPR32:$src, 0, 15)>; -def : Pat<(i32 (sext_inreg GPR32:$src, i8)), (SBFMWri GPR32:$src, 0, 7)>; -def : Pat<(i32 (sext_inreg GPR32:$src, i1)), (SBFMWri GPR32:$src, 0, 0)>; - -def : Pat<(shl (sext_inreg GPR32:$Rn, i8), (i64 imm0_31:$imm)), - (SBFMWri GPR32:$Rn, (i64 (i32shift_a imm0_31:$imm)), - (i64 (i32shift_sext_i8 imm0_31:$imm)))>; -def : Pat<(shl (sext_inreg GPR64:$Rn, i8), (i64 imm0_63:$imm)), - (SBFMXri GPR64:$Rn, (i64 (i64shift_a imm0_63:$imm)), - (i64 (i64shift_sext_i8 imm0_63:$imm)))>; - -def : Pat<(shl (sext_inreg GPR32:$Rn, i16), (i64 imm0_31:$imm)), - (SBFMWri GPR32:$Rn, (i64 (i32shift_a imm0_31:$imm)), - (i64 (i32shift_sext_i16 imm0_31:$imm)))>; -def : Pat<(shl (sext_inreg GPR64:$Rn, i16), (i64 imm0_63:$imm)), - (SBFMXri GPR64:$Rn, (i64 (i64shift_a imm0_63:$imm)), - (i64 (i64shift_sext_i16 imm0_63:$imm)))>; - -def : Pat<(shl (i64 (sext GPR32:$Rn)), (i64 imm0_63:$imm)), - (SBFMXri (INSERT_SUBREG (i64 (IMPLICIT_DEF)), GPR32:$Rn, sub_32), - (i64 (i64shift_a imm0_63:$imm)), - (i64 (i64shift_sext_i32 imm0_63:$imm)))>; - -// sra patterns have an AddedComplexity of 10, so make sure we have a higher -// AddedComplexity for the following patterns since we want to match sext + sra -// patterns before we attempt to match a single sra node. -let AddedComplexity = 20 in { -// We support all sext + sra combinations which preserve at least one bit of the -// original value which is to be sign extended. E.g. we support shifts up to -// bitwidth-1 bits. -def : Pat<(sra (sext_inreg GPR32:$Rn, i8), (i64 imm0_7:$imm)), - (SBFMWri GPR32:$Rn, (i64 imm0_7:$imm), 7)>; -def : Pat<(sra (sext_inreg GPR64:$Rn, i8), (i64 imm0_7:$imm)), - (SBFMXri GPR64:$Rn, (i64 imm0_7:$imm), 7)>; - -def : Pat<(sra (sext_inreg GPR32:$Rn, i16), (i64 imm0_15:$imm)), - (SBFMWri GPR32:$Rn, (i64 imm0_15:$imm), 15)>; -def : Pat<(sra (sext_inreg GPR64:$Rn, i16), (i64 imm0_15:$imm)), - (SBFMXri GPR64:$Rn, (i64 imm0_15:$imm), 15)>; - -def : Pat<(sra (i64 (sext GPR32:$Rn)), (i64 imm0_31:$imm)), - (SBFMXri (INSERT_SUBREG (i64 (IMPLICIT_DEF)), GPR32:$Rn, sub_32), - (i64 imm0_31:$imm), 31)>; -} // AddedComplexity = 20 - -// To truncate, we can simply extract from a subregister. -def : Pat<(i32 (trunc GPR64sp:$src)), - (i32 (EXTRACT_SUBREG GPR64sp:$src, sub_32))>; - -// __builtin_trap() uses the BRK instruction on ARM64. -def : Pat<(trap), (BRK 1)>; - -// Conversions within AdvSIMD types in the same register size are free. -// But because we need a consistent lane ordering, in big endian many -// conversions require one or more REV instructions. -// -// Consider a simple memory load followed by a bitconvert then a store. 
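
At the source level, the sequence being considered is roughly the following (a hypothetical sketch; the comments restate the IR steps that the explanation below walks through):

    #include <arm_neon.h>

    void bitcast_roundtrip(const int32_t *src, int16_t *dst) {
      int32x2_t v0 = vld1_s32(src);             /* v0 = load v2i32          */
      int16x4_t v1 = vreinterpret_s16_s32(v0);  /* v1 = BITCAST v0 to v4i16 */
      vst1_s16(dst, v1);                        /* store v4i16 v1           */
    }
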
-// v0 = load v2i32 -// v1 = BITCAST v2i32 v0 to v4i16 -// store v4i16 v2 -// -// In big endian mode every memory access has an implicit byte swap. LDR and -// STR do a 64-bit byte swap, whereas LD1/ST1 do a byte swap per lane - that -// is, they treat the vector as a sequence of elements to be byte-swapped. -// The two pairs of instructions are fundamentally incompatible. We've decided -// to use LD1/ST1 only to simplify compiler implementation. -// -// LD1/ST1 perform the equivalent of a sequence of LDR/STR + REV. This makes -// the original code sequence: -// v0 = load v2i32 -// v1 = REV v2i32 (implicit) -// v2 = BITCAST v2i32 v1 to v4i16 -// v3 = REV v4i16 v2 (implicit) -// store v4i16 v3 -// -// But this is now broken - the value stored is different to the value loaded -// due to lane reordering. To fix this, on every BITCAST we must perform two -// other REVs: -// v0 = load v2i32 -// v1 = REV v2i32 (implicit) -// v2 = REV v2i32 -// v3 = BITCAST v2i32 v2 to v4i16 -// v4 = REV v4i16 -// v5 = REV v4i16 v4 (implicit) -// store v4i16 v5 -// -// This means an extra two instructions, but actually in most cases the two REV -// instructions can be combined into one. For example: -// (REV64_2s (REV64_4h X)) === (REV32_4h X) -// -// There is also no 128-bit REV instruction. This must be synthesized with an -// EXT instruction. -// -// Most bitconverts require some sort of conversion. The only exceptions are: -// a) Identity conversions - vNfX <-> vNiX -// b) Single-lane-to-scalar - v1fX <-> fX or v1iX <-> iX -// - -let Predicates = [IsLE] in { -def : Pat<(v8i8 (bitconvert GPR64:$Xn)), (COPY_TO_REGCLASS GPR64:$Xn, FPR64)>; -def : Pat<(v4i16 (bitconvert GPR64:$Xn)), (COPY_TO_REGCLASS GPR64:$Xn, FPR64)>; -def : Pat<(v2i32 (bitconvert GPR64:$Xn)), (COPY_TO_REGCLASS GPR64:$Xn, FPR64)>; -def : Pat<(v2f32 (bitconvert GPR64:$Xn)), (COPY_TO_REGCLASS GPR64:$Xn, FPR64)>; - -def : Pat<(i64 (bitconvert (v8i8 V64:$Vn))), - (COPY_TO_REGCLASS V64:$Vn, GPR64)>; -def : Pat<(i64 (bitconvert (v4i16 V64:$Vn))), - (COPY_TO_REGCLASS V64:$Vn, GPR64)>; -def : Pat<(i64 (bitconvert (v2i32 V64:$Vn))), - (COPY_TO_REGCLASS V64:$Vn, GPR64)>; -def : Pat<(i64 (bitconvert (v2f32 V64:$Vn))), - (COPY_TO_REGCLASS V64:$Vn, GPR64)>; -def : Pat<(i64 (bitconvert (v1f64 V64:$Vn))), - (COPY_TO_REGCLASS V64:$Vn, GPR64)>; -} -let Predicates = [IsBE] in { -def : Pat<(v8i8 (bitconvert GPR64:$Xn)), - (REV64v8i8 (COPY_TO_REGCLASS GPR64:$Xn, FPR64))>; -def : Pat<(v4i16 (bitconvert GPR64:$Xn)), - (REV64v4i16 (COPY_TO_REGCLASS GPR64:$Xn, FPR64))>; -def : Pat<(v2i32 (bitconvert GPR64:$Xn)), - (REV64v2i32 (COPY_TO_REGCLASS GPR64:$Xn, FPR64))>; -def : Pat<(v2f32 (bitconvert GPR64:$Xn)), - (REV64v2i32 (COPY_TO_REGCLASS GPR64:$Xn, FPR64))>; - -def : Pat<(i64 (bitconvert (v8i8 V64:$Vn))), - (REV64v8i8 (COPY_TO_REGCLASS V64:$Vn, GPR64))>; -def : Pat<(i64 (bitconvert (v4i16 V64:$Vn))), - (REV64v4i16 (COPY_TO_REGCLASS V64:$Vn, GPR64))>; -def : Pat<(i64 (bitconvert (v2i32 V64:$Vn))), - (REV64v2i32 (COPY_TO_REGCLASS V64:$Vn, GPR64))>; -def : Pat<(i64 (bitconvert (v2f32 V64:$Vn))), - (REV64v2i32 (COPY_TO_REGCLASS V64:$Vn, GPR64))>; -} -def : Pat<(v1i64 (bitconvert GPR64:$Xn)), (COPY_TO_REGCLASS GPR64:$Xn, FPR64)>; -def : Pat<(v1f64 (bitconvert GPR64:$Xn)), (COPY_TO_REGCLASS GPR64:$Xn, FPR64)>; -def : Pat<(i64 (bitconvert (v1i64 V64:$Vn))), - (COPY_TO_REGCLASS V64:$Vn, GPR64)>; -def : Pat<(v1i64 (scalar_to_vector GPR64:$Xn)), - (COPY_TO_REGCLASS GPR64:$Xn, FPR64)>; -def : Pat<(v1f64 (scalar_to_vector GPR64:$Xn)), - (COPY_TO_REGCLASS GPR64:$Xn, FPR64)>; -def 
: Pat<(v1f64 (scalar_to_vector (f64 FPR64:$Xn))), (v1f64 FPR64:$Xn)>; - -def : Pat<(f32 (bitconvert (i32 GPR32:$Xn))), - (COPY_TO_REGCLASS GPR32:$Xn, FPR32)>; -def : Pat<(i32 (bitconvert (f32 FPR32:$Xn))), - (COPY_TO_REGCLASS FPR32:$Xn, GPR32)>; -def : Pat<(f64 (bitconvert (i64 GPR64:$Xn))), - (COPY_TO_REGCLASS GPR64:$Xn, FPR64)>; -def : Pat<(i64 (bitconvert (f64 FPR64:$Xn))), - (COPY_TO_REGCLASS FPR64:$Xn, GPR64)>; -def : Pat<(i64 (bitconvert (v1f64 V64:$Vn))), - (COPY_TO_REGCLASS V64:$Vn, GPR64)>; - -let Predicates = [IsLE] in { -def : Pat<(v1i64 (bitconvert (v2i32 FPR64:$src))), (v1i64 FPR64:$src)>; -def : Pat<(v1i64 (bitconvert (v4i16 FPR64:$src))), (v1i64 FPR64:$src)>; -def : Pat<(v1i64 (bitconvert (v8i8 FPR64:$src))), (v1i64 FPR64:$src)>; -def : Pat<(v1i64 (bitconvert (v2f32 FPR64:$src))), (v1i64 FPR64:$src)>; -} -let Predicates = [IsBE] in { -def : Pat<(v1i64 (bitconvert (v2i32 FPR64:$src))), - (v1i64 (REV64v2i32 FPR64:$src))>; -def : Pat<(v1i64 (bitconvert (v4i16 FPR64:$src))), - (v1i64 (REV64v4i16 FPR64:$src))>; -def : Pat<(v1i64 (bitconvert (v8i8 FPR64:$src))), - (v1i64 (REV64v8i8 FPR64:$src))>; -def : Pat<(v1i64 (bitconvert (v2f32 FPR64:$src))), - (v1i64 (REV64v2i32 FPR64:$src))>; -} -def : Pat<(v1i64 (bitconvert (v1f64 FPR64:$src))), (v1i64 FPR64:$src)>; -def : Pat<(v1i64 (bitconvert (f64 FPR64:$src))), (v1i64 FPR64:$src)>; - -let Predicates = [IsLE] in { -def : Pat<(v2i32 (bitconvert (v1i64 FPR64:$src))), (v2i32 FPR64:$src)>; -def : Pat<(v2i32 (bitconvert (v4i16 FPR64:$src))), (v2i32 FPR64:$src)>; -def : Pat<(v2i32 (bitconvert (v8i8 FPR64:$src))), (v2i32 FPR64:$src)>; -def : Pat<(v2i32 (bitconvert (f64 FPR64:$src))), (v2i32 FPR64:$src)>; -def : Pat<(v2i32 (bitconvert (v1f64 FPR64:$src))), (v2i32 FPR64:$src)>; -} -let Predicates = [IsBE] in { -def : Pat<(v2i32 (bitconvert (v1i64 FPR64:$src))), - (v2i32 (REV64v2i32 FPR64:$src))>; -def : Pat<(v2i32 (bitconvert (v4i16 FPR64:$src))), - (v2i32 (REV32v4i16 FPR64:$src))>; -def : Pat<(v2i32 (bitconvert (v8i8 FPR64:$src))), - (v2i32 (REV32v8i8 FPR64:$src))>; -def : Pat<(v2i32 (bitconvert (f64 FPR64:$src))), - (v2i32 (REV64v2i32 FPR64:$src))>; -def : Pat<(v2i32 (bitconvert (v1f64 FPR64:$src))), - (v2i32 (REV64v2i32 FPR64:$src))>; -} -def : Pat<(v2i32 (bitconvert (v2f32 FPR64:$src))), (v2i32 FPR64:$src)>; - -let Predicates = [IsLE] in { -def : Pat<(v4i16 (bitconvert (v1i64 FPR64:$src))), (v4i16 FPR64:$src)>; -def : Pat<(v4i16 (bitconvert (v2i32 FPR64:$src))), (v4i16 FPR64:$src)>; -def : Pat<(v4i16 (bitconvert (v8i8 FPR64:$src))), (v4i16 FPR64:$src)>; -def : Pat<(v4i16 (bitconvert (f64 FPR64:$src))), (v4i16 FPR64:$src)>; -def : Pat<(v4i16 (bitconvert (v2f32 FPR64:$src))), (v4i16 FPR64:$src)>; -def : Pat<(v4i16 (bitconvert (v1f64 FPR64:$src))), (v4i16 FPR64:$src)>; -} -let Predicates = [IsBE] in { -def : Pat<(v4i16 (bitconvert (v1i64 FPR64:$src))), - (v4i16 (REV64v4i16 FPR64:$src))>; -def : Pat<(v4i16 (bitconvert (v2i32 FPR64:$src))), - (v4i16 (REV32v4i16 FPR64:$src))>; -def : Pat<(v4i16 (bitconvert (v8i8 FPR64:$src))), - (v4i16 (REV16v8i8 FPR64:$src))>; -def : Pat<(v4i16 (bitconvert (f64 FPR64:$src))), - (v4i16 (REV64v4i16 FPR64:$src))>; -def : Pat<(v4i16 (bitconvert (v2f32 FPR64:$src))), - (v4i16 (REV32v4i16 FPR64:$src))>; -def : Pat<(v4i16 (bitconvert (v1f64 FPR64:$src))), - (v4i16 (REV64v4i16 FPR64:$src))>; -} - -let Predicates = [IsLE] in { -def : Pat<(v8i8 (bitconvert (v1i64 FPR64:$src))), (v8i8 FPR64:$src)>; -def : Pat<(v8i8 (bitconvert (v2i32 FPR64:$src))), (v8i8 FPR64:$src)>; -def : Pat<(v8i8 (bitconvert (v4i16 FPR64:$src))), (v8i8 
FPR64:$src)>; -def : Pat<(v8i8 (bitconvert (f64 FPR64:$src))), (v8i8 FPR64:$src)>; -def : Pat<(v8i8 (bitconvert (v2f32 FPR64:$src))), (v8i8 FPR64:$src)>; -def : Pat<(v8i8 (bitconvert (v1f64 FPR64:$src))), (v8i8 FPR64:$src)>; -} -let Predicates = [IsBE] in { -def : Pat<(v8i8 (bitconvert (v1i64 FPR64:$src))), - (v8i8 (REV64v8i8 FPR64:$src))>; -def : Pat<(v8i8 (bitconvert (v2i32 FPR64:$src))), - (v8i8 (REV32v8i8 FPR64:$src))>; -def : Pat<(v8i8 (bitconvert (v4i16 FPR64:$src))), - (v8i8 (REV16v8i8 FPR64:$src))>; -def : Pat<(v8i8 (bitconvert (f64 FPR64:$src))), - (v8i8 (REV64v8i8 FPR64:$src))>; -def : Pat<(v8i8 (bitconvert (v2f32 FPR64:$src))), - (v8i8 (REV32v8i8 FPR64:$src))>; -def : Pat<(v8i8 (bitconvert (v1f64 FPR64:$src))), - (v8i8 (REV64v8i8 FPR64:$src))>; -} - -let Predicates = [IsLE] in { -def : Pat<(f64 (bitconvert (v2i32 FPR64:$src))), (f64 FPR64:$src)>; -def : Pat<(f64 (bitconvert (v4i16 FPR64:$src))), (f64 FPR64:$src)>; -def : Pat<(f64 (bitconvert (v2f32 FPR64:$src))), (f64 FPR64:$src)>; -def : Pat<(f64 (bitconvert (v8i8 FPR64:$src))), (f64 FPR64:$src)>; -} -let Predicates = [IsBE] in { -def : Pat<(f64 (bitconvert (v2i32 FPR64:$src))), - (f64 (REV64v2i32 FPR64:$src))>; -def : Pat<(f64 (bitconvert (v4i16 FPR64:$src))), - (f64 (REV64v4i16 FPR64:$src))>; -def : Pat<(f64 (bitconvert (v2f32 FPR64:$src))), - (f64 (REV64v2i32 FPR64:$src))>; -def : Pat<(f64 (bitconvert (v8i8 FPR64:$src))), - (f64 (REV64v8i8 FPR64:$src))>; -} -def : Pat<(f64 (bitconvert (v1i64 FPR64:$src))), (f64 FPR64:$src)>; -def : Pat<(f64 (bitconvert (v1f64 FPR64:$src))), (f64 FPR64:$src)>; - -let Predicates = [IsLE] in { -def : Pat<(v1f64 (bitconvert (v2i32 FPR64:$src))), (v1f64 FPR64:$src)>; -def : Pat<(v1f64 (bitconvert (v4i16 FPR64:$src))), (v1f64 FPR64:$src)>; -def : Pat<(v1f64 (bitconvert (v8i8 FPR64:$src))), (v1f64 FPR64:$src)>; -def : Pat<(v1f64 (bitconvert (v2f32 FPR64:$src))), (v1f64 FPR64:$src)>; -} -let Predicates = [IsBE] in { -def : Pat<(v1f64 (bitconvert (v2i32 FPR64:$src))), - (v1f64 (REV64v2i32 FPR64:$src))>; -def : Pat<(v1f64 (bitconvert (v4i16 FPR64:$src))), - (v1f64 (REV64v4i16 FPR64:$src))>; -def : Pat<(v1f64 (bitconvert (v8i8 FPR64:$src))), - (v1f64 (REV64v8i8 FPR64:$src))>; -def : Pat<(v1f64 (bitconvert (v2f32 FPR64:$src))), - (v1f64 (REV64v2i32 FPR64:$src))>; -} -def : Pat<(v1f64 (bitconvert (v1i64 FPR64:$src))), (v1f64 FPR64:$src)>; -def : Pat<(v1f64 (bitconvert (f64 FPR64:$src))), (v1f64 FPR64:$src)>; - -let Predicates = [IsLE] in { -def : Pat<(v2f32 (bitconvert (v1i64 FPR64:$src))), (v2f32 FPR64:$src)>; -def : Pat<(v2f32 (bitconvert (v4i16 FPR64:$src))), (v2f32 FPR64:$src)>; -def : Pat<(v2f32 (bitconvert (v8i8 FPR64:$src))), (v2f32 FPR64:$src)>; -def : Pat<(v2f32 (bitconvert (v1f64 FPR64:$src))), (v2f32 FPR64:$src)>; -def : Pat<(v2f32 (bitconvert (f64 FPR64:$src))), (v2f32 FPR64:$src)>; -} -let Predicates = [IsBE] in { -def : Pat<(v2f32 (bitconvert (v1i64 FPR64:$src))), - (v2f32 (REV64v2i32 FPR64:$src))>; -def : Pat<(v2f32 (bitconvert (v4i16 FPR64:$src))), - (v2f32 (REV32v4i16 FPR64:$src))>; -def : Pat<(v2f32 (bitconvert (v8i8 FPR64:$src))), - (v2f32 (REV32v8i8 FPR64:$src))>; -def : Pat<(v2f32 (bitconvert (v1f64 FPR64:$src))), - (v2f32 (REV64v2i32 FPR64:$src))>; -def : Pat<(v2f32 (bitconvert (f64 FPR64:$src))), - (v2f32 (REV64v2i32 FPR64:$src))>; -} -def : Pat<(v2f32 (bitconvert (v2i32 FPR64:$src))), (v2f32 FPR64:$src)>; - -let Predicates = [IsLE] in { -def : Pat<(f128 (bitconvert (v2i64 FPR128:$src))), (f128 FPR128:$src)>; -def : Pat<(f128 (bitconvert (v4i32 FPR128:$src))), (f128 FPR128:$src)>; 
-def : Pat<(f128 (bitconvert (v8i16 FPR128:$src))), (f128 FPR128:$src)>; -def : Pat<(f128 (bitconvert (v2f64 FPR128:$src))), (f128 FPR128:$src)>; -def : Pat<(f128 (bitconvert (v4f32 FPR128:$src))), (f128 FPR128:$src)>; -def : Pat<(f128 (bitconvert (v16i8 FPR128:$src))), (f128 FPR128:$src)>; -} -let Predicates = [IsBE] in { -def : Pat<(f128 (bitconvert (v2i64 FPR128:$src))), - (f128 (EXTv16i8 FPR128:$src, FPR128:$src, (i32 8)))>; -def : Pat<(f128 (bitconvert (v4i32 FPR128:$src))), - (f128 (EXTv16i8 (REV64v4i32 FPR128:$src), - (REV64v4i32 FPR128:$src), (i32 8)))>; -def : Pat<(f128 (bitconvert (v8i16 FPR128:$src))), - (f128 (EXTv16i8 (REV64v8i16 FPR128:$src), - (REV64v8i16 FPR128:$src), (i32 8)))>; -def : Pat<(f128 (bitconvert (v2f64 FPR128:$src))), - (f128 (EXTv16i8 FPR128:$src, FPR128:$src, (i32 8)))>; -def : Pat<(f128 (bitconvert (v4f32 FPR128:$src))), - (f128 (EXTv16i8 (REV64v4i32 FPR128:$src), - (REV64v4i32 FPR128:$src), (i32 8)))>; -def : Pat<(f128 (bitconvert (v16i8 FPR128:$src))), - (f128 (EXTv16i8 (REV64v16i8 FPR128:$src), - (REV64v16i8 FPR128:$src), (i32 8)))>; -} - -let Predicates = [IsLE] in { -def : Pat<(v2f64 (bitconvert (f128 FPR128:$src))), (v2f64 FPR128:$src)>; -def : Pat<(v2f64 (bitconvert (v4i32 FPR128:$src))), (v2f64 FPR128:$src)>; -def : Pat<(v2f64 (bitconvert (v8i16 FPR128:$src))), (v2f64 FPR128:$src)>; -def : Pat<(v2f64 (bitconvert (v16i8 FPR128:$src))), (v2f64 FPR128:$src)>; -def : Pat<(v2f64 (bitconvert (v4f32 FPR128:$src))), (v2f64 FPR128:$src)>; -} -let Predicates = [IsBE] in { -def : Pat<(v2f64 (bitconvert (f128 FPR128:$src))), - (v2f64 (EXTv16i8 FPR128:$src, - FPR128:$src, (i32 8)))>; -def : Pat<(v2f64 (bitconvert (v4i32 FPR128:$src))), - (v2f64 (REV64v4i32 FPR128:$src))>; -def : Pat<(v2f64 (bitconvert (v8i16 FPR128:$src))), - (v2f64 (REV64v8i16 FPR128:$src))>; -def : Pat<(v2f64 (bitconvert (v16i8 FPR128:$src))), - (v2f64 (REV64v16i8 FPR128:$src))>; -def : Pat<(v2f64 (bitconvert (v4f32 FPR128:$src))), - (v2f64 (REV64v4i32 FPR128:$src))>; -} -def : Pat<(v2f64 (bitconvert (v2i64 FPR128:$src))), (v2f64 FPR128:$src)>; - -let Predicates = [IsLE] in { -def : Pat<(v4f32 (bitconvert (f128 FPR128:$src))), (v4f32 FPR128:$src)>; -def : Pat<(v4f32 (bitconvert (v8i16 FPR128:$src))), (v4f32 FPR128:$src)>; -def : Pat<(v4f32 (bitconvert (v16i8 FPR128:$src))), (v4f32 FPR128:$src)>; -def : Pat<(v4f32 (bitconvert (v2i64 FPR128:$src))), (v4f32 FPR128:$src)>; -def : Pat<(v4f32 (bitconvert (v2f64 FPR128:$src))), (v4f32 FPR128:$src)>; -} -let Predicates = [IsBE] in { -def : Pat<(v4f32 (bitconvert (f128 FPR128:$src))), - (v4f32 (EXTv16i8 (REV64v4i32 FPR128:$src), - (REV64v4i32 FPR128:$src), (i32 8)))>; -def : Pat<(v4f32 (bitconvert (v8i16 FPR128:$src))), - (v4f32 (REV32v8i16 FPR128:$src))>; -def : Pat<(v4f32 (bitconvert (v16i8 FPR128:$src))), - (v4f32 (REV32v16i8 FPR128:$src))>; -def : Pat<(v4f32 (bitconvert (v2i64 FPR128:$src))), - (v4f32 (REV64v4i32 FPR128:$src))>; -def : Pat<(v4f32 (bitconvert (v2f64 FPR128:$src))), - (v4f32 (REV64v4i32 FPR128:$src))>; -} -def : Pat<(v4f32 (bitconvert (v4i32 FPR128:$src))), (v4f32 FPR128:$src)>; - -let Predicates = [IsLE] in { -def : Pat<(v2i64 (bitconvert (f128 FPR128:$src))), (v2i64 FPR128:$src)>; -def : Pat<(v2i64 (bitconvert (v4i32 FPR128:$src))), (v2i64 FPR128:$src)>; -def : Pat<(v2i64 (bitconvert (v8i16 FPR128:$src))), (v2i64 FPR128:$src)>; -def : Pat<(v2i64 (bitconvert (v16i8 FPR128:$src))), (v2i64 FPR128:$src)>; -def : Pat<(v2i64 (bitconvert (v4f32 FPR128:$src))), (v2i64 FPR128:$src)>; -} -let Predicates = [IsBE] in { -def : Pat<(v2i64 
(bitconvert (f128 FPR128:$src))), - (v2i64 (EXTv16i8 FPR128:$src, - FPR128:$src, (i32 8)))>; -def : Pat<(v2i64 (bitconvert (v4i32 FPR128:$src))), - (v2i64 (REV64v4i32 FPR128:$src))>; -def : Pat<(v2i64 (bitconvert (v8i16 FPR128:$src))), - (v2i64 (REV64v8i16 FPR128:$src))>; -def : Pat<(v2i64 (bitconvert (v16i8 FPR128:$src))), - (v2i64 (REV64v16i8 FPR128:$src))>; -def : Pat<(v2i64 (bitconvert (v4f32 FPR128:$src))), - (v2i64 (REV64v4i32 FPR128:$src))>; -} -def : Pat<(v2i64 (bitconvert (v2f64 FPR128:$src))), (v2i64 FPR128:$src)>; - -let Predicates = [IsLE] in { -def : Pat<(v4i32 (bitconvert (f128 FPR128:$src))), (v4i32 FPR128:$src)>; -def : Pat<(v4i32 (bitconvert (v2i64 FPR128:$src))), (v4i32 FPR128:$src)>; -def : Pat<(v4i32 (bitconvert (v8i16 FPR128:$src))), (v4i32 FPR128:$src)>; -def : Pat<(v4i32 (bitconvert (v16i8 FPR128:$src))), (v4i32 FPR128:$src)>; -def : Pat<(v4i32 (bitconvert (v2f64 FPR128:$src))), (v4i32 FPR128:$src)>; -} -let Predicates = [IsBE] in { -def : Pat<(v4i32 (bitconvert (f128 FPR128:$src))), - (v4i32 (EXTv16i8 (REV64v4i32 FPR128:$src), - (REV64v4i32 FPR128:$src), - (i32 8)))>; -def : Pat<(v4i32 (bitconvert (v2i64 FPR128:$src))), - (v4i32 (REV64v4i32 FPR128:$src))>; -def : Pat<(v4i32 (bitconvert (v8i16 FPR128:$src))), - (v4i32 (REV32v8i16 FPR128:$src))>; -def : Pat<(v4i32 (bitconvert (v16i8 FPR128:$src))), - (v4i32 (REV32v16i8 FPR128:$src))>; -def : Pat<(v4i32 (bitconvert (v2f64 FPR128:$src))), - (v4i32 (REV64v4i32 FPR128:$src))>; -} -def : Pat<(v4i32 (bitconvert (v4f32 FPR128:$src))), (v4i32 FPR128:$src)>; - -let Predicates = [IsLE] in { -def : Pat<(v8i16 (bitconvert (f128 FPR128:$src))), (v8i16 FPR128:$src)>; -def : Pat<(v8i16 (bitconvert (v2i64 FPR128:$src))), (v8i16 FPR128:$src)>; -def : Pat<(v8i16 (bitconvert (v4i32 FPR128:$src))), (v8i16 FPR128:$src)>; -def : Pat<(v8i16 (bitconvert (v16i8 FPR128:$src))), (v8i16 FPR128:$src)>; -def : Pat<(v8i16 (bitconvert (v2f64 FPR128:$src))), (v8i16 FPR128:$src)>; -def : Pat<(v8i16 (bitconvert (v4f32 FPR128:$src))), (v8i16 FPR128:$src)>; -} -let Predicates = [IsBE] in { -def : Pat<(v8i16 (bitconvert (f128 FPR128:$src))), - (v8i16 (EXTv16i8 (REV64v8i16 FPR128:$src), - (REV64v8i16 FPR128:$src), - (i32 8)))>; -def : Pat<(v8i16 (bitconvert (v2i64 FPR128:$src))), - (v8i16 (REV64v8i16 FPR128:$src))>; -def : Pat<(v8i16 (bitconvert (v4i32 FPR128:$src))), - (v8i16 (REV32v8i16 FPR128:$src))>; -def : Pat<(v8i16 (bitconvert (v16i8 FPR128:$src))), - (v8i16 (REV16v16i8 FPR128:$src))>; -def : Pat<(v8i16 (bitconvert (v2f64 FPR128:$src))), - (v8i16 (REV64v8i16 FPR128:$src))>; -def : Pat<(v8i16 (bitconvert (v4f32 FPR128:$src))), - (v8i16 (REV32v8i16 FPR128:$src))>; -} - -let Predicates = [IsLE] in { -def : Pat<(v16i8 (bitconvert (f128 FPR128:$src))), (v16i8 FPR128:$src)>; -def : Pat<(v16i8 (bitconvert (v2i64 FPR128:$src))), (v16i8 FPR128:$src)>; -def : Pat<(v16i8 (bitconvert (v4i32 FPR128:$src))), (v16i8 FPR128:$src)>; -def : Pat<(v16i8 (bitconvert (v8i16 FPR128:$src))), (v16i8 FPR128:$src)>; -def : Pat<(v16i8 (bitconvert (v2f64 FPR128:$src))), (v16i8 FPR128:$src)>; -def : Pat<(v16i8 (bitconvert (v4f32 FPR128:$src))), (v16i8 FPR128:$src)>; -} -let Predicates = [IsBE] in { -def : Pat<(v16i8 (bitconvert (f128 FPR128:$src))), - (v16i8 (EXTv16i8 (REV64v16i8 FPR128:$src), - (REV64v16i8 FPR128:$src), - (i32 8)))>; -def : Pat<(v16i8 (bitconvert (v2i64 FPR128:$src))), - (v16i8 (REV64v16i8 FPR128:$src))>; -def : Pat<(v16i8 (bitconvert (v4i32 FPR128:$src))), - (v16i8 (REV32v16i8 FPR128:$src))>; -def : Pat<(v16i8 (bitconvert (v8i16 FPR128:$src))), - (v16i8 
(REV16v16i8 FPR128:$src))>; -def : Pat<(v16i8 (bitconvert (v2f64 FPR128:$src))), - (v16i8 (REV64v16i8 FPR128:$src))>; -def : Pat<(v16i8 (bitconvert (v4f32 FPR128:$src))), - (v16i8 (REV32v16i8 FPR128:$src))>; -} - -def : Pat<(v8i8 (extract_subvector (v16i8 FPR128:$Rn), (i64 1))), - (EXTRACT_SUBREG (DUPv2i64lane FPR128:$Rn, 1), dsub)>; -def : Pat<(v4i16 (extract_subvector (v8i16 FPR128:$Rn), (i64 1))), - (EXTRACT_SUBREG (DUPv2i64lane FPR128:$Rn, 1), dsub)>; -def : Pat<(v2i32 (extract_subvector (v4i32 FPR128:$Rn), (i64 1))), - (EXTRACT_SUBREG (DUPv2i64lane FPR128:$Rn, 1), dsub)>; -def : Pat<(v1i64 (extract_subvector (v2i64 FPR128:$Rn), (i64 1))), - (EXTRACT_SUBREG (DUPv2i64lane FPR128:$Rn, 1), dsub)>; - -// A 64-bit subvector insert to the first 128-bit vector position -// is a subregister copy that needs no instruction. -def : Pat<(insert_subvector undef, (v1i64 FPR64:$src), (i32 0)), - (INSERT_SUBREG (v2i64 (IMPLICIT_DEF)), FPR64:$src, dsub)>; -def : Pat<(insert_subvector undef, (v1f64 FPR64:$src), (i32 0)), - (INSERT_SUBREG (v2f64 (IMPLICIT_DEF)), FPR64:$src, dsub)>; -def : Pat<(insert_subvector undef, (v2i32 FPR64:$src), (i32 0)), - (INSERT_SUBREG (v4i32 (IMPLICIT_DEF)), FPR64:$src, dsub)>; -def : Pat<(insert_subvector undef, (v2f32 FPR64:$src), (i32 0)), - (INSERT_SUBREG (v4f32 (IMPLICIT_DEF)), FPR64:$src, dsub)>; -def : Pat<(insert_subvector undef, (v4i16 FPR64:$src), (i32 0)), - (INSERT_SUBREG (v8i16 (IMPLICIT_DEF)), FPR64:$src, dsub)>; -def : Pat<(insert_subvector undef, (v8i8 FPR64:$src), (i32 0)), - (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)), FPR64:$src, dsub)>; - -// Use pair-wise add instructions when summing up the lanes for v2f64, v2i64 -// or v2f32. -def : Pat<(i64 (add (vector_extract (v2i64 FPR128:$Rn), (i64 0)), - (vector_extract (v2i64 FPR128:$Rn), (i64 1)))), - (i64 (ADDPv2i64p (v2i64 FPR128:$Rn)))>; -def : Pat<(f64 (fadd (vector_extract (v2f64 FPR128:$Rn), (i64 0)), - (vector_extract (v2f64 FPR128:$Rn), (i64 1)))), - (f64 (FADDPv2i64p (v2f64 FPR128:$Rn)))>; - // vector_extract on 64-bit vectors gets promoted to a 128 bit vector, - // so we match on v4f32 here, not v2f32. This will also catch adding - // the low two lanes of a true v4f32 vector. -def : Pat<(fadd (vector_extract (v4f32 FPR128:$Rn), (i64 0)), - (vector_extract (v4f32 FPR128:$Rn), (i64 1))), - (f32 (FADDPv2i32p (EXTRACT_SUBREG FPR128:$Rn, dsub)))>; - -// Scalar 64-bit shifts in FPR64 registers. -def : Pat<(i64 (int_arm64_neon_sshl (i64 FPR64:$Rn), (i64 FPR64:$Rm))), - (SSHLv1i64 FPR64:$Rn, FPR64:$Rm)>; -def : Pat<(i64 (int_arm64_neon_ushl (i64 FPR64:$Rn), (i64 FPR64:$Rm))), - (USHLv1i64 FPR64:$Rn, FPR64:$Rm)>; -def : Pat<(i64 (int_arm64_neon_srshl (i64 FPR64:$Rn), (i64 FPR64:$Rm))), - (SRSHLv1i64 FPR64:$Rn, FPR64:$Rm)>; -def : Pat<(i64 (int_arm64_neon_urshl (i64 FPR64:$Rn), (i64 FPR64:$Rm))), - (URSHLv1i64 FPR64:$Rn, FPR64:$Rm)>; - -// Tail call return handling. These are all compiler pseudo-instructions, -// so no encoding information or anything like that. 
-let isCall = 1, isTerminator = 1, isReturn = 1, isBarrier = 1, Uses = [SP] in { - def TCRETURNdi : Pseudo<(outs), (ins i64imm:$dst, i32imm:$FPDiff),[]>; - def TCRETURNri : Pseudo<(outs), (ins tcGPR64:$dst, i32imm:$FPDiff), []>; -} - -def : Pat<(ARM64tcret tcGPR64:$dst, (i32 timm:$FPDiff)), - (TCRETURNri tcGPR64:$dst, imm:$FPDiff)>; -def : Pat<(ARM64tcret tglobaladdr:$dst, (i32 timm:$FPDiff)), - (TCRETURNdi texternalsym:$dst, imm:$FPDiff)>; -def : Pat<(ARM64tcret texternalsym:$dst, (i32 timm:$FPDiff)), - (TCRETURNdi texternalsym:$dst, imm:$FPDiff)>; - -include "ARM64InstrAtomics.td" diff --git a/lib/Target/ARM64/ARM64LoadStoreOptimizer.cpp b/lib/Target/ARM64/ARM64LoadStoreOptimizer.cpp deleted file mode 100644 index e2c4b13f036..00000000000 --- a/lib/Target/ARM64/ARM64LoadStoreOptimizer.cpp +++ /dev/null @@ -1,944 +0,0 @@ -//===-- ARM64LoadStoreOptimizer.cpp - ARM64 load/store opt. pass --*- C++ -*-=// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -// This file contains a pass that performs load / store related peephole -// optimizations. This pass should be run after register allocation. -// -//===----------------------------------------------------------------------===// - -#include "ARM64InstrInfo.h" -#include "MCTargetDesc/ARM64AddressingModes.h" -#include "llvm/ADT/BitVector.h" -#include "llvm/CodeGen/MachineBasicBlock.h" -#include "llvm/CodeGen/MachineFunctionPass.h" -#include "llvm/CodeGen/MachineInstr.h" -#include "llvm/CodeGen/MachineInstrBuilder.h" -#include "llvm/Target/TargetInstrInfo.h" -#include "llvm/Target/TargetMachine.h" -#include "llvm/Target/TargetRegisterInfo.h" -#include "llvm/Support/CommandLine.h" -#include "llvm/Support/Debug.h" -#include "llvm/Support/ErrorHandling.h" -#include "llvm/Support/raw_ostream.h" -#include "llvm/ADT/Statistic.h" -using namespace llvm; - -#define DEBUG_TYPE "arm64-ldst-opt" - -/// ARM64AllocLoadStoreOpt - Post-register allocation pass to combine -/// load / store instructions to form ldp / stp instructions. - -STATISTIC(NumPairCreated, "Number of load/store pair instructions generated"); -STATISTIC(NumPostFolded, "Number of post-index updates folded"); -STATISTIC(NumPreFolded, "Number of pre-index updates folded"); -STATISTIC(NumUnscaledPairCreated, - "Number of load/store from unscaled generated"); - -static cl::opt ScanLimit("arm64-load-store-scan-limit", cl::init(20), - cl::Hidden); - -// Place holder while testing unscaled load/store combining -static cl::opt -EnableARM64UnscaledMemOp("arm64-unscaled-mem-op", cl::Hidden, - cl::desc("Allow ARM64 unscaled load/store combining"), - cl::init(true)); - -namespace { -struct ARM64LoadStoreOpt : public MachineFunctionPass { - static char ID; - ARM64LoadStoreOpt() : MachineFunctionPass(ID) {} - - const ARM64InstrInfo *TII; - const TargetRegisterInfo *TRI; - - // Scan the instructions looking for a load/store that can be combined - // with the current instruction into a load/store pair. - // Return the matching instruction if one is found, else MBB->end(). - // If a matching instruction is found, mergeForward is set to true if the - // merge is to remove the first instruction and replace the second with - // a pair-wise insn, and false if the reverse is true. 
- MachineBasicBlock::iterator findMatchingInsn(MachineBasicBlock::iterator I, - bool &mergeForward, - unsigned Limit); - // Merge the two instructions indicated into a single pair-wise instruction. - // If mergeForward is true, erase the first instruction and fold its - // operation into the second. If false, the reverse. Return the instruction - // following the first instruction (which may change during processing). - MachineBasicBlock::iterator - mergePairedInsns(MachineBasicBlock::iterator I, - MachineBasicBlock::iterator Paired, bool mergeForward); - - // Scan the instruction list to find a base register update that can - // be combined with the current instruction (a load or store) using - // pre or post indexed addressing with writeback. Scan forwards. - MachineBasicBlock::iterator - findMatchingUpdateInsnForward(MachineBasicBlock::iterator I, unsigned Limit, - int Value); - - // Scan the instruction list to find a base register update that can - // be combined with the current instruction (a load or store) using - // pre or post indexed addressing with writeback. Scan backwards. - MachineBasicBlock::iterator - findMatchingUpdateInsnBackward(MachineBasicBlock::iterator I, unsigned Limit); - - // Merge a pre-index base register update into a ld/st instruction. - MachineBasicBlock::iterator - mergePreIdxUpdateInsn(MachineBasicBlock::iterator I, - MachineBasicBlock::iterator Update); - - // Merge a post-index base register update into a ld/st instruction. - MachineBasicBlock::iterator - mergePostIdxUpdateInsn(MachineBasicBlock::iterator I, - MachineBasicBlock::iterator Update); - - bool optimizeBlock(MachineBasicBlock &MBB); - - bool runOnMachineFunction(MachineFunction &Fn) override; - - const char *getPassName() const override { - return "ARM64 load / store optimization pass"; - } - -private: - int getMemSize(MachineInstr *MemMI); -}; -char ARM64LoadStoreOpt::ID = 0; -} - -static bool isUnscaledLdst(unsigned Opc) { - switch (Opc) { - default: - return false; - case ARM64::STURSi: - return true; - case ARM64::STURDi: - return true; - case ARM64::STURQi: - return true; - case ARM64::STURWi: - return true; - case ARM64::STURXi: - return true; - case ARM64::LDURSi: - return true; - case ARM64::LDURDi: - return true; - case ARM64::LDURQi: - return true; - case ARM64::LDURWi: - return true; - case ARM64::LDURXi: - return true; - } -} - -// Size in bytes of the data moved by an unscaled load or store -int ARM64LoadStoreOpt::getMemSize(MachineInstr *MemMI) { - switch (MemMI->getOpcode()) { - default: - llvm_unreachable("Opcode has has unknown size!"); - case ARM64::STRSui: - case ARM64::STURSi: - return 4; - case ARM64::STRDui: - case ARM64::STURDi: - return 8; - case ARM64::STRQui: - case ARM64::STURQi: - return 16; - case ARM64::STRWui: - case ARM64::STURWi: - return 4; - case ARM64::STRXui: - case ARM64::STURXi: - return 8; - case ARM64::LDRSui: - case ARM64::LDURSi: - return 4; - case ARM64::LDRDui: - case ARM64::LDURDi: - return 8; - case ARM64::LDRQui: - case ARM64::LDURQi: - return 16; - case ARM64::LDRWui: - case ARM64::LDURWi: - return 4; - case ARM64::LDRXui: - case ARM64::LDURXi: - return 8; - } -} - -static unsigned getMatchingPairOpcode(unsigned Opc) { - switch (Opc) { - default: - llvm_unreachable("Opcode has no pairwise equivalent!"); - case ARM64::STRSui: - case ARM64::STURSi: - return ARM64::STPSi; - case ARM64::STRDui: - case ARM64::STURDi: - return ARM64::STPDi; - case ARM64::STRQui: - case ARM64::STURQi: - return ARM64::STPQi; - case ARM64::STRWui: - case ARM64::STURWi: - 
return ARM64::STPWi; - case ARM64::STRXui: - case ARM64::STURXi: - return ARM64::STPXi; - case ARM64::LDRSui: - case ARM64::LDURSi: - return ARM64::LDPSi; - case ARM64::LDRDui: - case ARM64::LDURDi: - return ARM64::LDPDi; - case ARM64::LDRQui: - case ARM64::LDURQi: - return ARM64::LDPQi; - case ARM64::LDRWui: - case ARM64::LDURWi: - return ARM64::LDPWi; - case ARM64::LDRXui: - case ARM64::LDURXi: - return ARM64::LDPXi; - } -} - -static unsigned getPreIndexedOpcode(unsigned Opc) { - switch (Opc) { - default: - llvm_unreachable("Opcode has no pre-indexed equivalent!"); - case ARM64::STRSui: return ARM64::STRSpre; - case ARM64::STRDui: return ARM64::STRDpre; - case ARM64::STRQui: return ARM64::STRQpre; - case ARM64::STRWui: return ARM64::STRWpre; - case ARM64::STRXui: return ARM64::STRXpre; - case ARM64::LDRSui: return ARM64::LDRSpre; - case ARM64::LDRDui: return ARM64::LDRDpre; - case ARM64::LDRQui: return ARM64::LDRQpre; - case ARM64::LDRWui: return ARM64::LDRWpre; - case ARM64::LDRXui: return ARM64::LDRXpre; - } -} - -static unsigned getPostIndexedOpcode(unsigned Opc) { - switch (Opc) { - default: - llvm_unreachable("Opcode has no post-indexed wise equivalent!"); - case ARM64::STRSui: - return ARM64::STRSpost; - case ARM64::STRDui: - return ARM64::STRDpost; - case ARM64::STRQui: - return ARM64::STRQpost; - case ARM64::STRWui: - return ARM64::STRWpost; - case ARM64::STRXui: - return ARM64::STRXpost; - case ARM64::LDRSui: - return ARM64::LDRSpost; - case ARM64::LDRDui: - return ARM64::LDRDpost; - case ARM64::LDRQui: - return ARM64::LDRQpost; - case ARM64::LDRWui: - return ARM64::LDRWpost; - case ARM64::LDRXui: - return ARM64::LDRXpost; - } -} - -MachineBasicBlock::iterator -ARM64LoadStoreOpt::mergePairedInsns(MachineBasicBlock::iterator I, - MachineBasicBlock::iterator Paired, - bool mergeForward) { - MachineBasicBlock::iterator NextI = I; - ++NextI; - // If NextI is the second of the two instructions to be merged, we need - // to skip one further. Either way we merge will invalidate the iterator, - // and we don't need to scan the new instruction, as it's a pairwise - // instruction, which we're not considering for further action anyway. - if (NextI == Paired) - ++NextI; - - bool IsUnscaled = isUnscaledLdst(I->getOpcode()); - int OffsetStride = IsUnscaled && EnableARM64UnscaledMemOp ? getMemSize(I) : 1; - - unsigned NewOpc = getMatchingPairOpcode(I->getOpcode()); - // Insert our new paired instruction after whichever of the paired - // instructions mergeForward indicates. - MachineBasicBlock::iterator InsertionPoint = mergeForward ? Paired : I; - // Also based on mergeForward is from where we copy the base register operand - // so we get the flags compatible with the input code. - MachineOperand &BaseRegOp = - mergeForward ? Paired->getOperand(1) : I->getOperand(1); - - // Which register is Rt and which is Rt2 depends on the offset order. - MachineInstr *RtMI, *Rt2MI; - if (I->getOperand(2).getImm() == - Paired->getOperand(2).getImm() + OffsetStride) { - RtMI = Paired; - Rt2MI = I; - } else { - RtMI = I; - Rt2MI = Paired; - } - // Handle Unscaled - int OffsetImm = RtMI->getOperand(2).getImm(); - if (IsUnscaled && EnableARM64UnscaledMemOp) - OffsetImm /= OffsetStride; - - // Construct the new instruction. 
- MachineInstrBuilder MIB = BuildMI(*I->getParent(), InsertionPoint, - I->getDebugLoc(), TII->get(NewOpc)) - .addOperand(RtMI->getOperand(0)) - .addOperand(Rt2MI->getOperand(0)) - .addOperand(BaseRegOp) - .addImm(OffsetImm); - (void)MIB; - - // FIXME: Do we need/want to copy the mem operands from the source - // instructions? Probably. What uses them after this? - - DEBUG(dbgs() << "Creating pair load/store. Replacing instructions:\n "); - DEBUG(I->print(dbgs())); - DEBUG(dbgs() << " "); - DEBUG(Paired->print(dbgs())); - DEBUG(dbgs() << " with instruction:\n "); - DEBUG(((MachineInstr *)MIB)->print(dbgs())); - DEBUG(dbgs() << "\n"); - - // Erase the old instructions. - I->eraseFromParent(); - Paired->eraseFromParent(); - - return NextI; -} - -/// trackRegDefsUses - Remember what registers the specified instruction uses -/// and modifies. -static void trackRegDefsUses(MachineInstr *MI, BitVector &ModifiedRegs, - BitVector &UsedRegs, - const TargetRegisterInfo *TRI) { - for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) { - MachineOperand &MO = MI->getOperand(i); - if (MO.isRegMask()) - ModifiedRegs.setBitsNotInMask(MO.getRegMask()); - - if (!MO.isReg()) - continue; - unsigned Reg = MO.getReg(); - if (MO.isDef()) { - for (MCRegAliasIterator AI(Reg, TRI, true); AI.isValid(); ++AI) - ModifiedRegs.set(*AI); - } else { - assert(MO.isUse() && "Reg operand not a def and not a use?!?"); - for (MCRegAliasIterator AI(Reg, TRI, true); AI.isValid(); ++AI) - UsedRegs.set(*AI); - } - } -} - -static bool inBoundsForPair(bool IsUnscaled, int Offset, int OffsetStride) { - if (!IsUnscaled && (Offset > 63 || Offset < -64)) - return false; - if (IsUnscaled) { - // Convert the byte-offset used by unscaled into an "element" offset used - // by the scaled pair load/store instructions. - int elemOffset = Offset / OffsetStride; - if (elemOffset > 63 || elemOffset < -64) - return false; - } - return true; -} - -// Do alignment, specialized to power of 2 and for signed ints, -// avoiding having to do a C-style cast from uint_64t to int when -// using RoundUpToAlignment from include/llvm/Support/MathExtras.h. -// FIXME: Move this function to include/MathExtras.h? -static int alignTo(int Num, int PowOf2) { - return (Num + PowOf2 - 1) & ~(PowOf2 - 1); -} - -/// findMatchingInsn - Scan the instructions looking for a load/store that can -/// be combined with the current instruction into a load/store pair. -MachineBasicBlock::iterator -ARM64LoadStoreOpt::findMatchingInsn(MachineBasicBlock::iterator I, - bool &mergeForward, unsigned Limit) { - MachineBasicBlock::iterator E = I->getParent()->end(); - MachineBasicBlock::iterator MBBI = I; - MachineInstr *FirstMI = I; - ++MBBI; - - int Opc = FirstMI->getOpcode(); - bool mayLoad = FirstMI->mayLoad(); - bool IsUnscaled = isUnscaledLdst(Opc); - unsigned Reg = FirstMI->getOperand(0).getReg(); - unsigned BaseReg = FirstMI->getOperand(1).getReg(); - int Offset = FirstMI->getOperand(2).getImm(); - - // Early exit if the first instruction modifies the base register. - // e.g., ldr x0, [x0] - // Early exit if the offset if not possible to match. (6 bits of positive - // range, plus allow an extra one in case we find a later insn that matches - // with Offset-1 - if (FirstMI->modifiesRegister(BaseReg, TRI)) - return E; - int OffsetStride = - IsUnscaled && EnableARM64UnscaledMemOp ? 
getMemSize(FirstMI) : 1; - if (!inBoundsForPair(IsUnscaled, Offset, OffsetStride)) - return E; - - // Track which registers have been modified and used between the first insn - // (inclusive) and the second insn. - BitVector ModifiedRegs, UsedRegs; - ModifiedRegs.resize(TRI->getNumRegs()); - UsedRegs.resize(TRI->getNumRegs()); - for (unsigned Count = 0; MBBI != E && Count < Limit; ++MBBI) { - MachineInstr *MI = MBBI; - // Skip DBG_VALUE instructions. Otherwise debug info can affect the - // optimization by changing how far we scan. - if (MI->isDebugValue()) - continue; - - // Now that we know this is a real instruction, count it. - ++Count; - - if (Opc == MI->getOpcode() && MI->getOperand(2).isImm()) { - // If we've found another instruction with the same opcode, check to see - // if the base and offset are compatible with our starting instruction. - // These instructions all have scaled immediate operands, so we just - // check for +1/-1. Make sure to check the new instruction offset is - // actually an immediate and not a symbolic reference destined for - // a relocation. - // - // Pairwise instructions have a 7-bit signed offset field. Single insns - // have a 12-bit unsigned offset field. To be a valid combine, the - // final offset must be in range. - unsigned MIBaseReg = MI->getOperand(1).getReg(); - int MIOffset = MI->getOperand(2).getImm(); - if (BaseReg == MIBaseReg && ((Offset == MIOffset + OffsetStride) || - (Offset + OffsetStride == MIOffset))) { - int MinOffset = Offset < MIOffset ? Offset : MIOffset; - // If this is a volatile load/store that otherwise matched, stop looking - // as something is going on that we don't have enough information to - // safely transform. Similarly, stop if we see a hint to avoid pairs. - if (MI->hasOrderedMemoryRef() || TII->isLdStPairSuppressed(MI)) - return E; - // If the resultant immediate offset of merging these instructions - // is out of range for a pairwise instruction, bail and keep looking. - bool MIIsUnscaled = isUnscaledLdst(MI->getOpcode()); - if (!inBoundsForPair(MIIsUnscaled, MinOffset, OffsetStride)) { - trackRegDefsUses(MI, ModifiedRegs, UsedRegs, TRI); - continue; - } - // If the alignment requirements of the paired (scaled) instruction - // can't express the offset of the unscaled input, bail and keep - // looking. - if (IsUnscaled && EnableARM64UnscaledMemOp && - (alignTo(MinOffset, OffsetStride) != MinOffset)) { - trackRegDefsUses(MI, ModifiedRegs, UsedRegs, TRI); - continue; - } - // If the destination register of the loads is the same register, bail - // and keep looking. A load-pair instruction with both destination - // registers the same is UNPREDICTABLE and will result in an exception. - if (mayLoad && Reg == MI->getOperand(0).getReg()) { - trackRegDefsUses(MI, ModifiedRegs, UsedRegs, TRI); - continue; - } - - // If the Rt of the second instruction was not modified or used between - // the two instructions, we can combine the second into the first. - if (!ModifiedRegs[MI->getOperand(0).getReg()] && - !UsedRegs[MI->getOperand(0).getReg()]) { - mergeForward = false; - return MBBI; - } - - // Likewise, if the Rt of the first instruction is not modified or used - // between the two instructions, we can combine the first into the - // second. - if (!ModifiedRegs[FirstMI->getOperand(0).getReg()] && - !UsedRegs[FirstMI->getOperand(0).getReg()]) { - mergeForward = true; - return MBBI; - } - // Unable to combine these instructions due to interference in between. - // Keep looking. 
- } - } - - // If the instruction wasn't a matching load or store, but does (or can) - // modify memory, stop searching, as we don't have alias analysis or - // anything like that to tell us whether the access is tromping on the - // locations we care about. The big one we want to catch is calls. - // - // FIXME: Theoretically, we can do better than that for SP and FP based - // references since we can effectively know where those are touching. It's - // unclear if it's worth the extra code, though. Most paired instructions - // will be sequential, perhaps with a few intervening non-memory related - // instructions. - if (MI->mayStore() || MI->isCall()) - return E; - // Likewise, if we're matching a store instruction, we don't want to - // move across a load, as it may be reading the same location. - if (FirstMI->mayStore() && MI->mayLoad()) - return E; - - // Update modified / uses register lists. - trackRegDefsUses(MI, ModifiedRegs, UsedRegs, TRI); - - // Otherwise, if the base register is modified, we have no match, so - // return early. - if (ModifiedRegs[BaseReg]) - return E; - } - return E; -} - -MachineBasicBlock::iterator -ARM64LoadStoreOpt::mergePreIdxUpdateInsn(MachineBasicBlock::iterator I, - MachineBasicBlock::iterator Update) { - assert((Update->getOpcode() == ARM64::ADDXri || - Update->getOpcode() == ARM64::SUBXri) && - "Unexpected base register update instruction to merge!"); - MachineBasicBlock::iterator NextI = I; - // Return the instruction following the merged instruction, which is - // the instruction following our unmerged load. Unless that's the add/sub - // instruction we're merging, in which case it's the one after that. - if (++NextI == Update) - ++NextI; - - int Value = Update->getOperand(2).getImm(); - assert(ARM64_AM::getShiftValue(Update->getOperand(3).getImm()) == 0 && - "Can't merge 1 << 12 offset into pre-indexed load / store"); - if (Update->getOpcode() == ARM64::SUBXri) - Value = -Value; - - unsigned NewOpc = getPreIndexedOpcode(I->getOpcode()); - MachineInstrBuilder MIB = - BuildMI(*I->getParent(), I, I->getDebugLoc(), TII->get(NewOpc)) - .addOperand(Update->getOperand(0)) - .addOperand(I->getOperand(0)) - .addOperand(I->getOperand(1)) - .addImm(Value); - (void)MIB; - - DEBUG(dbgs() << "Creating pre-indexed load/store."); - DEBUG(dbgs() << " Replacing instructions:\n "); - DEBUG(I->print(dbgs())); - DEBUG(dbgs() << " "); - DEBUG(Update->print(dbgs())); - DEBUG(dbgs() << " with instruction:\n "); - DEBUG(((MachineInstr *)MIB)->print(dbgs())); - DEBUG(dbgs() << "\n"); - - // Erase the old instructions for the block. - I->eraseFromParent(); - Update->eraseFromParent(); - - return NextI; -} - -MachineBasicBlock::iterator -ARM64LoadStoreOpt::mergePostIdxUpdateInsn(MachineBasicBlock::iterator I, - MachineBasicBlock::iterator Update) { - assert((Update->getOpcode() == ARM64::ADDXri || - Update->getOpcode() == ARM64::SUBXri) && - "Unexpected base register update instruction to merge!"); - MachineBasicBlock::iterator NextI = I; - // Return the instruction following the merged instruction, which is - // the instruction following our unmerged load. Unless that's the add/sub - // instruction we're merging, in which case it's the one after that. 
- if (++NextI == Update) - ++NextI; - - int Value = Update->getOperand(2).getImm(); - assert(ARM64_AM::getShiftValue(Update->getOperand(3).getImm()) == 0 && - "Can't merge 1 << 12 offset into post-indexed load / store"); - if (Update->getOpcode() == ARM64::SUBXri) - Value = -Value; - - unsigned NewOpc = getPostIndexedOpcode(I->getOpcode()); - MachineInstrBuilder MIB = - BuildMI(*I->getParent(), I, I->getDebugLoc(), TII->get(NewOpc)) - .addOperand(Update->getOperand(0)) - .addOperand(I->getOperand(0)) - .addOperand(I->getOperand(1)) - .addImm(Value); - (void)MIB; - - DEBUG(dbgs() << "Creating post-indexed load/store."); - DEBUG(dbgs() << " Replacing instructions:\n "); - DEBUG(I->print(dbgs())); - DEBUG(dbgs() << " "); - DEBUG(Update->print(dbgs())); - DEBUG(dbgs() << " with instruction:\n "); - DEBUG(((MachineInstr *)MIB)->print(dbgs())); - DEBUG(dbgs() << "\n"); - - // Erase the old instructions for the block. - I->eraseFromParent(); - Update->eraseFromParent(); - - return NextI; -} - -static bool isMatchingUpdateInsn(MachineInstr *MI, unsigned BaseReg, - int Offset) { - switch (MI->getOpcode()) { - default: - break; - case ARM64::SUBXri: - // Negate the offset for a SUB instruction. - Offset *= -1; - // FALLTHROUGH - case ARM64::ADDXri: - // Make sure it's a vanilla immediate operand, not a relocation or - // anything else we can't handle. - if (!MI->getOperand(2).isImm()) - break; - // Watch out for 1 << 12 shifted value. - if (ARM64_AM::getShiftValue(MI->getOperand(3).getImm())) - break; - // If the instruction has the base register as source and dest and the - // immediate will fit in a signed 9-bit integer, then we have a match. - if (MI->getOperand(0).getReg() == BaseReg && - MI->getOperand(1).getReg() == BaseReg && - MI->getOperand(2).getImm() <= 255 && - MI->getOperand(2).getImm() >= -256) { - // If we have a non-zero Offset, we check that it matches the amount - // we're adding to the register. - if (!Offset || Offset == MI->getOperand(2).getImm()) - return true; - } - break; - } - return false; -} - -MachineBasicBlock::iterator -ARM64LoadStoreOpt::findMatchingUpdateInsnForward(MachineBasicBlock::iterator I, - unsigned Limit, int Value) { - MachineBasicBlock::iterator E = I->getParent()->end(); - MachineInstr *MemMI = I; - MachineBasicBlock::iterator MBBI = I; - const MachineFunction &MF = *MemMI->getParent()->getParent(); - - unsigned DestReg = MemMI->getOperand(0).getReg(); - unsigned BaseReg = MemMI->getOperand(1).getReg(); - int Offset = MemMI->getOperand(2).getImm() * - TII->getRegClass(MemMI->getDesc(), 0, TRI, MF)->getSize(); - - // If the base register overlaps the destination register, we can't - // merge the update. - if (DestReg == BaseReg || TRI->isSubRegister(BaseReg, DestReg)) - return E; - - // Scan forward looking for post-index opportunities. - // Updating instructions can't be formed if the memory insn already - // has an offset other than the value we're looking for. - if (Offset != Value) - return E; - - // Track which registers have been modified and used between the first insn - // (inclusive) and the second insn. - BitVector ModifiedRegs, UsedRegs; - ModifiedRegs.resize(TRI->getNumRegs()); - UsedRegs.resize(TRI->getNumRegs()); - ++MBBI; - for (unsigned Count = 0; MBBI != E; ++MBBI) { - MachineInstr *MI = MBBI; - // Skip DBG_VALUE instructions. Otherwise debug info can affect the - // optimization by changing how far we scan. - if (MI->isDebugValue()) - continue; - - // Now that we know this is a real instruction, count it. 
- ++Count; - - // If we found a match, return it. - if (isMatchingUpdateInsn(MI, BaseReg, Value)) - return MBBI; - - // Update the status of what the instruction clobbered and used. - trackRegDefsUses(MI, ModifiedRegs, UsedRegs, TRI); - - // Otherwise, if the base register is used or modified, we have no match, so - // return early. - if (ModifiedRegs[BaseReg] || UsedRegs[BaseReg]) - return E; - } - return E; -} - -MachineBasicBlock::iterator -ARM64LoadStoreOpt::findMatchingUpdateInsnBackward(MachineBasicBlock::iterator I, - unsigned Limit) { - MachineBasicBlock::iterator B = I->getParent()->begin(); - MachineBasicBlock::iterator E = I->getParent()->end(); - MachineInstr *MemMI = I; - MachineBasicBlock::iterator MBBI = I; - const MachineFunction &MF = *MemMI->getParent()->getParent(); - - unsigned DestReg = MemMI->getOperand(0).getReg(); - unsigned BaseReg = MemMI->getOperand(1).getReg(); - int Offset = MemMI->getOperand(2).getImm(); - unsigned RegSize = TII->getRegClass(MemMI->getDesc(), 0, TRI, MF)->getSize(); - - // If the load/store is the first instruction in the block, there's obviously - // not any matching update. Ditto if the memory offset isn't zero. - if (MBBI == B || Offset != 0) - return E; - // If the base register overlaps the destination register, we can't - // merge the update. - if (DestReg == BaseReg || TRI->isSubRegister(BaseReg, DestReg)) - return E; - - // Track which registers have been modified and used between the first insn - // (inclusive) and the second insn. - BitVector ModifiedRegs, UsedRegs; - ModifiedRegs.resize(TRI->getNumRegs()); - UsedRegs.resize(TRI->getNumRegs()); - --MBBI; - for (unsigned Count = 0; MBBI != B; --MBBI) { - MachineInstr *MI = MBBI; - // Skip DBG_VALUE instructions. Otherwise debug info can affect the - // optimization by changing how far we scan. - if (MI->isDebugValue()) - continue; - - // Now that we know this is a real instruction, count it. - ++Count; - - // If we found a match, return it. - if (isMatchingUpdateInsn(MI, BaseReg, RegSize)) - return MBBI; - - // Update the status of what the instruction clobbered and used. - trackRegDefsUses(MI, ModifiedRegs, UsedRegs, TRI); - - // Otherwise, if the base register is used or modified, we have no match, so - // return early. - if (ModifiedRegs[BaseReg] || UsedRegs[BaseReg]) - return E; - } - return E; -} - -bool ARM64LoadStoreOpt::optimizeBlock(MachineBasicBlock &MBB) { - bool Modified = false; - // Two tranformations to do here: - // 1) Find loads and stores that can be merged into a single load or store - // pair instruction. - // e.g., - // ldr x0, [x2] - // ldr x1, [x2, #8] - // ; becomes - // ldp x0, x1, [x2] - // 2) Find base register updates that can be merged into the load or store - // as a base-reg writeback. - // e.g., - // ldr x0, [x2] - // add x2, x2, #4 - // ; becomes - // ldr x0, [x2], #4 - - for (MachineBasicBlock::iterator MBBI = MBB.begin(), E = MBB.end(); - MBBI != E;) { - MachineInstr *MI = MBBI; - switch (MI->getOpcode()) { - default: - // Just move on to the next instruction. 
- ++MBBI; - break; - case ARM64::STRSui: - case ARM64::STRDui: - case ARM64::STRQui: - case ARM64::STRXui: - case ARM64::STRWui: - case ARM64::LDRSui: - case ARM64::LDRDui: - case ARM64::LDRQui: - case ARM64::LDRXui: - case ARM64::LDRWui: - // do the unscaled versions as well - case ARM64::STURSi: - case ARM64::STURDi: - case ARM64::STURQi: - case ARM64::STURWi: - case ARM64::STURXi: - case ARM64::LDURSi: - case ARM64::LDURDi: - case ARM64::LDURQi: - case ARM64::LDURWi: - case ARM64::LDURXi: { - // If this is a volatile load/store, don't mess with it. - if (MI->hasOrderedMemoryRef()) { - ++MBBI; - break; - } - // Make sure this is a reg+imm (as opposed to an address reloc). - if (!MI->getOperand(2).isImm()) { - ++MBBI; - break; - } - // Check if this load/store has a hint to avoid pair formation. - // MachineMemOperands hints are set by the ARM64StorePairSuppress pass. - if (TII->isLdStPairSuppressed(MI)) { - ++MBBI; - break; - } - // Look ahead up to ScanLimit instructions for a pairable instruction. - bool mergeForward = false; - MachineBasicBlock::iterator Paired = - findMatchingInsn(MBBI, mergeForward, ScanLimit); - if (Paired != E) { - // Merge the loads into a pair. Keeping the iterator straight is a - // pain, so we let the merge routine tell us what the next instruction - // is after it's done mucking about. - MBBI = mergePairedInsns(MBBI, Paired, mergeForward); - - Modified = true; - ++NumPairCreated; - if (isUnscaledLdst(MI->getOpcode())) - ++NumUnscaledPairCreated; - break; - } - ++MBBI; - break; - } - // FIXME: Do the other instructions. - } - } - - for (MachineBasicBlock::iterator MBBI = MBB.begin(), E = MBB.end(); - MBBI != E;) { - MachineInstr *MI = MBBI; - // Do update merging. It's simpler to keep this separate from the above - // switch, though not strictly necessary. - int Opc = MI->getOpcode(); - switch (Opc) { - default: - // Just move on to the next instruction. - ++MBBI; - break; - case ARM64::STRSui: - case ARM64::STRDui: - case ARM64::STRQui: - case ARM64::STRXui: - case ARM64::STRWui: - case ARM64::LDRSui: - case ARM64::LDRDui: - case ARM64::LDRQui: - case ARM64::LDRXui: - case ARM64::LDRWui: - // do the unscaled versions as well - case ARM64::STURSi: - case ARM64::STURDi: - case ARM64::STURQi: - case ARM64::STURWi: - case ARM64::STURXi: - case ARM64::LDURSi: - case ARM64::LDURDi: - case ARM64::LDURQi: - case ARM64::LDURWi: - case ARM64::LDURXi: { - // Make sure this is a reg+imm (as opposed to an address reloc). - if (!MI->getOperand(2).isImm()) { - ++MBBI; - break; - } - // Look ahead up to ScanLimit instructions for a mergable instruction. - MachineBasicBlock::iterator Update = - findMatchingUpdateInsnForward(MBBI, ScanLimit, 0); - if (Update != E) { - // Merge the update into the ld/st. - MBBI = mergePostIdxUpdateInsn(MBBI, Update); - Modified = true; - ++NumPostFolded; - break; - } - // Don't know how to handle pre/post-index versions, so move to the next - // instruction. - if (isUnscaledLdst(Opc)) { - ++MBBI; - break; - } - - // Look back to try to find a pre-index instruction. For example, - // add x0, x0, #8 - // ldr x1, [x0] - // merged into: - // ldr x1, [x0, #8]! - Update = findMatchingUpdateInsnBackward(MBBI, ScanLimit); - if (Update != E) { - // Merge the update into the ld/st. - MBBI = mergePreIdxUpdateInsn(MBBI, Update); - Modified = true; - ++NumPreFolded; - break; - } - - // Look forward to try to find a post-index instruction. For example, - // ldr x1, [x0, #64] - // add x0, x0, #64 - // merged into: - // ldr x1, [x0, #64]! 
- - // The immediate in the load/store is scaled by the size of the register - // being loaded. The immediate in the add we're looking for, - // however, is not, so adjust here. - int Value = MI->getOperand(2).getImm() * - TII->getRegClass(MI->getDesc(), 0, TRI, *(MBB.getParent())) - ->getSize(); - Update = findMatchingUpdateInsnForward(MBBI, ScanLimit, Value); - if (Update != E) { - // Merge the update into the ld/st. - MBBI = mergePreIdxUpdateInsn(MBBI, Update); - Modified = true; - ++NumPreFolded; - break; - } - - // Nothing found. Just move to the next instruction. - ++MBBI; - break; - } - // FIXME: Do the other instructions. - } - } - - return Modified; -} - -bool ARM64LoadStoreOpt::runOnMachineFunction(MachineFunction &Fn) { - const TargetMachine &TM = Fn.getTarget(); - TII = static_cast(TM.getInstrInfo()); - TRI = TM.getRegisterInfo(); - - bool Modified = false; - for (auto &MBB : Fn) - Modified |= optimizeBlock(MBB); - - return Modified; -} - -// FIXME: Do we need/want a pre-alloc pass like ARM has to try to keep -// loads and stores near one another? - -/// createARMLoadStoreOptimizationPass - returns an instance of the load / store -/// optimization pass. -FunctionPass *llvm::createARM64LoadStoreOptimizationPass() { - return new ARM64LoadStoreOpt(); -} diff --git a/lib/Target/ARM64/ARM64MCInstLower.cpp b/lib/Target/ARM64/ARM64MCInstLower.cpp deleted file mode 100644 index 525f484ca4c..00000000000 --- a/lib/Target/ARM64/ARM64MCInstLower.cpp +++ /dev/null @@ -1,201 +0,0 @@ -//===-- ARM64MCInstLower.cpp - Convert ARM64 MachineInstr to an MCInst---===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -// This file contains code to lower ARM64 MachineInstrs to their corresponding -// MCInst records. -// -//===----------------------------------------------------------------------===// - -#include "ARM64MCInstLower.h" -#include "MCTargetDesc/ARM64MCExpr.h" -#include "Utils/ARM64BaseInfo.h" -#include "llvm/CodeGen/AsmPrinter.h" -#include "llvm/CodeGen/MachineBasicBlock.h" -#include "llvm/CodeGen/MachineInstr.h" -#include "llvm/IR/Mangler.h" -#include "llvm/MC/MCExpr.h" -#include "llvm/MC/MCInst.h" -#include "llvm/Support/CodeGen.h" -#include "llvm/Target/TargetMachine.h" -using namespace llvm; - -ARM64MCInstLower::ARM64MCInstLower(MCContext &ctx, Mangler &mang, - AsmPrinter &printer) - : Ctx(ctx), Printer(printer), TargetTriple(printer.getTargetTriple()) {} - -MCSymbol * -ARM64MCInstLower::GetGlobalAddressSymbol(const MachineOperand &MO) const { - return Printer.getSymbol(MO.getGlobal()); -} - -MCSymbol * -ARM64MCInstLower::GetExternalSymbolSymbol(const MachineOperand &MO) const { - return Printer.GetExternalSymbolSymbol(MO.getSymbolName()); -} - -MCOperand ARM64MCInstLower::lowerSymbolOperandDarwin(const MachineOperand &MO, - MCSymbol *Sym) const { - // FIXME: We would like an efficient form for this, so we don't have to do a - // lot of extra uniquing. 
- MCSymbolRefExpr::VariantKind RefKind = MCSymbolRefExpr::VK_None; - if ((MO.getTargetFlags() & ARM64II::MO_GOT) != 0) { - if ((MO.getTargetFlags() & ARM64II::MO_FRAGMENT) == ARM64II::MO_PAGE) - RefKind = MCSymbolRefExpr::VK_GOTPAGE; - else if ((MO.getTargetFlags() & ARM64II::MO_FRAGMENT) == - ARM64II::MO_PAGEOFF) - RefKind = MCSymbolRefExpr::VK_GOTPAGEOFF; - else - assert(0 && "Unexpected target flags with MO_GOT on GV operand"); - } else if ((MO.getTargetFlags() & ARM64II::MO_TLS) != 0) { - if ((MO.getTargetFlags() & ARM64II::MO_FRAGMENT) == ARM64II::MO_PAGE) - RefKind = MCSymbolRefExpr::VK_TLVPPAGE; - else if ((MO.getTargetFlags() & ARM64II::MO_FRAGMENT) == - ARM64II::MO_PAGEOFF) - RefKind = MCSymbolRefExpr::VK_TLVPPAGEOFF; - else - llvm_unreachable("Unexpected target flags with MO_TLS on GV operand"); - } else { - if ((MO.getTargetFlags() & ARM64II::MO_FRAGMENT) == ARM64II::MO_PAGE) - RefKind = MCSymbolRefExpr::VK_PAGE; - else if ((MO.getTargetFlags() & ARM64II::MO_FRAGMENT) == - ARM64II::MO_PAGEOFF) - RefKind = MCSymbolRefExpr::VK_PAGEOFF; - } - const MCExpr *Expr = MCSymbolRefExpr::Create(Sym, RefKind, Ctx); - if (!MO.isJTI() && MO.getOffset()) - Expr = MCBinaryExpr::CreateAdd( - Expr, MCConstantExpr::Create(MO.getOffset(), Ctx), Ctx); - return MCOperand::CreateExpr(Expr); -} - -MCOperand ARM64MCInstLower::lowerSymbolOperandELF(const MachineOperand &MO, - MCSymbol *Sym) const { - uint32_t RefFlags = 0; - - if (MO.getTargetFlags() & ARM64II::MO_GOT) - RefFlags |= ARM64MCExpr::VK_GOT; - else if (MO.getTargetFlags() & ARM64II::MO_TLS) { - TLSModel::Model Model; - if (MO.isGlobal()) { - const GlobalValue *GV = MO.getGlobal(); - Model = Printer.TM.getTLSModel(GV); - } else { - assert(MO.isSymbol() && - StringRef(MO.getSymbolName()) == "_TLS_MODULE_BASE_" && - "unexpected external TLS symbol"); - Model = TLSModel::GeneralDynamic; - } - switch (Model) { - case TLSModel::InitialExec: - RefFlags |= ARM64MCExpr::VK_GOTTPREL; - break; - case TLSModel::LocalExec: - RefFlags |= ARM64MCExpr::VK_TPREL; - break; - case TLSModel::LocalDynamic: - RefFlags |= ARM64MCExpr::VK_DTPREL; - break; - case TLSModel::GeneralDynamic: - RefFlags |= ARM64MCExpr::VK_TLSDESC; - break; - } - } else { - // No modifier means this is a generic reference, classified as absolute for - // the cases where it matters (:abs_g0: etc). 
- RefFlags |= ARM64MCExpr::VK_ABS; - } - - if ((MO.getTargetFlags() & ARM64II::MO_FRAGMENT) == ARM64II::MO_PAGE) - RefFlags |= ARM64MCExpr::VK_PAGE; - else if ((MO.getTargetFlags() & ARM64II::MO_FRAGMENT) == ARM64II::MO_PAGEOFF) - RefFlags |= ARM64MCExpr::VK_PAGEOFF; - else if ((MO.getTargetFlags() & ARM64II::MO_FRAGMENT) == ARM64II::MO_G3) - RefFlags |= ARM64MCExpr::VK_G3; - else if ((MO.getTargetFlags() & ARM64II::MO_FRAGMENT) == ARM64II::MO_G2) - RefFlags |= ARM64MCExpr::VK_G2; - else if ((MO.getTargetFlags() & ARM64II::MO_FRAGMENT) == ARM64II::MO_G1) - RefFlags |= ARM64MCExpr::VK_G1; - else if ((MO.getTargetFlags() & ARM64II::MO_FRAGMENT) == ARM64II::MO_G0) - RefFlags |= ARM64MCExpr::VK_G0; - - if (MO.getTargetFlags() & ARM64II::MO_NC) - RefFlags |= ARM64MCExpr::VK_NC; - - const MCExpr *Expr = - MCSymbolRefExpr::Create(Sym, MCSymbolRefExpr::VK_None, Ctx); - if (!MO.isJTI() && MO.getOffset()) - Expr = MCBinaryExpr::CreateAdd( - Expr, MCConstantExpr::Create(MO.getOffset(), Ctx), Ctx); - - ARM64MCExpr::VariantKind RefKind; - RefKind = static_cast(RefFlags); - Expr = ARM64MCExpr::Create(Expr, RefKind, Ctx); - - return MCOperand::CreateExpr(Expr); -} - -MCOperand ARM64MCInstLower::LowerSymbolOperand(const MachineOperand &MO, - MCSymbol *Sym) const { - if (TargetTriple.isOSDarwin()) - return lowerSymbolOperandDarwin(MO, Sym); - - assert(TargetTriple.isOSBinFormatELF() && "Expect Darwin or ELF target"); - return lowerSymbolOperandELF(MO, Sym); -} - -bool ARM64MCInstLower::lowerOperand(const MachineOperand &MO, - MCOperand &MCOp) const { - switch (MO.getType()) { - default: - assert(0 && "unknown operand type"); - case MachineOperand::MO_Register: - // Ignore all implicit register operands. - if (MO.isImplicit()) - return false; - MCOp = MCOperand::CreateReg(MO.getReg()); - break; - case MachineOperand::MO_RegisterMask: - // Regmasks are like implicit defs. - return false; - case MachineOperand::MO_Immediate: - MCOp = MCOperand::CreateImm(MO.getImm()); - break; - case MachineOperand::MO_MachineBasicBlock: - MCOp = MCOperand::CreateExpr( - MCSymbolRefExpr::Create(MO.getMBB()->getSymbol(), Ctx)); - break; - case MachineOperand::MO_GlobalAddress: - MCOp = LowerSymbolOperand(MO, GetGlobalAddressSymbol(MO)); - break; - case MachineOperand::MO_ExternalSymbol: - MCOp = LowerSymbolOperand(MO, GetExternalSymbolSymbol(MO)); - break; - case MachineOperand::MO_JumpTableIndex: - MCOp = LowerSymbolOperand(MO, Printer.GetJTISymbol(MO.getIndex())); - break; - case MachineOperand::MO_ConstantPoolIndex: - MCOp = LowerSymbolOperand(MO, Printer.GetCPISymbol(MO.getIndex())); - break; - case MachineOperand::MO_BlockAddress: - MCOp = LowerSymbolOperand( - MO, Printer.GetBlockAddressSymbol(MO.getBlockAddress())); - break; - } - return true; -} - -void ARM64MCInstLower::Lower(const MachineInstr *MI, MCInst &OutMI) const { - OutMI.setOpcode(MI->getOpcode()); - - for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) { - MCOperand MCOp; - if (lowerOperand(MI->getOperand(i), MCOp)) - OutMI.addOperand(MCOp); - } -} diff --git a/lib/Target/ARM64/ARM64MCInstLower.h b/lib/Target/ARM64/ARM64MCInstLower.h deleted file mode 100644 index 7e3a2c8e54f..00000000000 --- a/lib/Target/ARM64/ARM64MCInstLower.h +++ /dev/null @@ -1,52 +0,0 @@ -//===-- ARM64MCInstLower.h - Lower MachineInstr to MCInst ----------------===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. 
-// -//===----------------------------------------------------------------------===// - -#ifndef ARM64_MCINSTLOWER_H -#define ARM64_MCINSTLOWER_H - -#include "llvm/ADT/Triple.h" -#include "llvm/Support/Compiler.h" - -namespace llvm { -class AsmPrinter; -class MCAsmInfo; -class MCContext; -class MCInst; -class MCOperand; -class MCSymbol; -class MachineInstr; -class MachineModuleInfoMachO; -class MachineOperand; -class Mangler; - -/// ARM64MCInstLower - This class is used to lower an MachineInstr -/// into an MCInst. -class LLVM_LIBRARY_VISIBILITY ARM64MCInstLower { - MCContext &Ctx; - AsmPrinter &Printer; - Triple TargetTriple; - -public: - ARM64MCInstLower(MCContext &ctx, Mangler &mang, AsmPrinter &printer); - - bool lowerOperand(const MachineOperand &MO, MCOperand &MCOp) const; - void Lower(const MachineInstr *MI, MCInst &OutMI) const; - - MCOperand lowerSymbolOperandDarwin(const MachineOperand &MO, - MCSymbol *Sym) const; - MCOperand lowerSymbolOperandELF(const MachineOperand &MO, - MCSymbol *Sym) const; - MCOperand LowerSymbolOperand(const MachineOperand &MO, MCSymbol *Sym) const; - - MCSymbol *GetGlobalAddressSymbol(const MachineOperand &MO) const; - MCSymbol *GetExternalSymbolSymbol(const MachineOperand &MO) const; -}; -} - -#endif diff --git a/lib/Target/ARM64/ARM64MachineFunctionInfo.h b/lib/Target/ARM64/ARM64MachineFunctionInfo.h deleted file mode 100644 index 0b6f4f1ec64..00000000000 --- a/lib/Target/ARM64/ARM64MachineFunctionInfo.h +++ /dev/null @@ -1,163 +0,0 @@ -//===- ARM64MachineFuctionInfo.h - ARM64 machine function info --*- C++ -*-===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -// This file declares ARM64-specific per-machine-function information. -// -//===----------------------------------------------------------------------===// - -#ifndef ARM64MACHINEFUNCTIONINFO_H -#define ARM64MACHINEFUNCTIONINFO_H - -#include "llvm/ADT/SmallPtrSet.h" -#include "llvm/ADT/SmallVector.h" -#include "llvm/CodeGen/MachineFunction.h" -#include "llvm/MC/MCLinkerOptimizationHint.h" - -namespace llvm { - -/// ARM64FunctionInfo - This class is derived from MachineFunctionInfo and -/// contains private ARM64-specific information for each MachineFunction. -class ARM64FunctionInfo : public MachineFunctionInfo { - - /// Number of bytes of arguments this function has on the stack. If the callee - /// is expected to restore the argument stack this should be a multiple of 16, - /// all usable during a tail call. - /// - /// The alternative would forbid tail call optimisation in some cases: if we - /// want to transfer control from a function with 8-bytes of stack-argument - /// space to a function with 16-bytes then misalignment of this value would - /// make a stack adjustment necessary, which could not be undone by the - /// callee. - unsigned BytesInStackArgArea; - - /// The number of bytes to restore to deallocate space for incoming - /// arguments. Canonically 0 in the C calling convention, but non-zero when - /// callee is expected to pop the args. - unsigned ArgumentStackToRestore; - - /// HasStackFrame - True if this function has a stack frame. Set by - /// processFunctionBeforeCalleeSavedScan(). - bool HasStackFrame; - - /// \brief Amount of stack frame size, not including callee-saved registers. 
- unsigned LocalStackSize; - - /// \brief Number of TLS accesses using the special (combinable) - /// _TLS_MODULE_BASE_ symbol. - unsigned NumLocalDynamicTLSAccesses; - - /// \brief FrameIndex for start of varargs area for arguments passed on the - /// stack. - int VarArgsStackIndex; - - /// \brief FrameIndex for start of varargs area for arguments passed in - /// general purpose registers. - int VarArgsGPRIndex; - - /// \brief Size of the varargs area for arguments passed in general purpose - /// registers. - unsigned VarArgsGPRSize; - - /// \brief FrameIndex for start of varargs area for arguments passed in - /// floating-point registers. - int VarArgsFPRIndex; - - /// \brief Size of the varargs area for arguments passed in floating-point - /// registers. - unsigned VarArgsFPRSize; - -public: - ARM64FunctionInfo() - : BytesInStackArgArea(0), ArgumentStackToRestore(0), HasStackFrame(false), - NumLocalDynamicTLSAccesses(0), VarArgsStackIndex(0), VarArgsGPRIndex(0), - VarArgsGPRSize(0), VarArgsFPRIndex(0), VarArgsFPRSize(0) {} - - explicit ARM64FunctionInfo(MachineFunction &MF) - : BytesInStackArgArea(0), ArgumentStackToRestore(0), HasStackFrame(false), - NumLocalDynamicTLSAccesses(0), VarArgsStackIndex(0), VarArgsGPRIndex(0), - VarArgsGPRSize(0), VarArgsFPRIndex(0), VarArgsFPRSize(0) { - (void)MF; - } - - unsigned getBytesInStackArgArea() const { return BytesInStackArgArea; } - void setBytesInStackArgArea(unsigned bytes) { BytesInStackArgArea = bytes; } - - unsigned getArgumentStackToRestore() const { return ArgumentStackToRestore; } - void setArgumentStackToRestore(unsigned bytes) { - ArgumentStackToRestore = bytes; - } - - bool hasStackFrame() const { return HasStackFrame; } - void setHasStackFrame(bool s) { HasStackFrame = s; } - - void setLocalStackSize(unsigned Size) { LocalStackSize = Size; } - unsigned getLocalStackSize() const { return LocalStackSize; } - - void incNumLocalDynamicTLSAccesses() { ++NumLocalDynamicTLSAccesses; } - unsigned getNumLocalDynamicTLSAccesses() const { - return NumLocalDynamicTLSAccesses; - } - - int getVarArgsStackIndex() const { return VarArgsStackIndex; } - void setVarArgsStackIndex(int Index) { VarArgsStackIndex = Index; } - - int getVarArgsGPRIndex() const { return VarArgsGPRIndex; } - void setVarArgsGPRIndex(int Index) { VarArgsGPRIndex = Index; } - - unsigned getVarArgsGPRSize() const { return VarArgsGPRSize; } - void setVarArgsGPRSize(unsigned Size) { VarArgsGPRSize = Size; } - - int getVarArgsFPRIndex() const { return VarArgsFPRIndex; } - void setVarArgsFPRIndex(int Index) { VarArgsFPRIndex = Index; } - - unsigned getVarArgsFPRSize() const { return VarArgsFPRSize; } - void setVarArgsFPRSize(unsigned Size) { VarArgsFPRSize = Size; } - - typedef SmallPtrSet SetOfInstructions; - - const SetOfInstructions &getLOHRelated() const { return LOHRelated; } - - // Shortcuts for LOH related types. - class MILOHDirective { - MCLOHType Kind; - - /// Arguments of this directive. Order matters. - SmallVector Args; - - public: - typedef SmallVectorImpl LOHArgs; - - MILOHDirective(MCLOHType Kind, const LOHArgs &Args) - : Kind(Kind), Args(Args.begin(), Args.end()) { - assert(isValidMCLOHType(Kind) && "Invalid LOH directive type!"); - } - - MCLOHType getKind() const { return Kind; } - const LOHArgs &getArgs() const { return Args; } - }; - - typedef MILOHDirective::LOHArgs MILOHArgs; - typedef SmallVector MILOHContainer; - - const MILOHContainer &getLOHContainer() const { return LOHContainerSet; } - - /// Add a LOH directive of this @p Kind and this @p Args. 
- void addLOHDirective(MCLOHType Kind, const MILOHArgs &Args) { - LOHContainerSet.push_back(MILOHDirective(Kind, Args)); - LOHRelated.insert(Args.begin(), Args.end()); - } - -private: - // Hold the lists of LOHs. - MILOHContainer LOHContainerSet; - SetOfInstructions LOHRelated; -}; -} // End llvm namespace - -#endif // ARM64MACHINEFUNCTIONINFO_H diff --git a/lib/Target/ARM64/ARM64PerfectShuffle.h b/lib/Target/ARM64/ARM64PerfectShuffle.h deleted file mode 100644 index 6759236fd14..00000000000 --- a/lib/Target/ARM64/ARM64PerfectShuffle.h +++ /dev/null @@ -1,6586 +0,0 @@ -//===-- ARM64PerfectShuffle.h - AdvSIMD Perfect Shuffle Table -------------===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -// This file, which was autogenerated by llvm-PerfectShuffle, contains data -// for the optimal way to build a perfect shuffle using AdvSIMD instructions. -// -//===----------------------------------------------------------------------===// - -// 31 entries have cost 0 -// 242 entries have cost 1 -// 1447 entries have cost 2 -// 3602 entries have cost 3 -// 1237 entries have cost 4 -// 2 entries have cost 5 - -// This table is 6561*4 = 26244 bytes in size. -static const unsigned PerfectShuffleTable[6561+1] = { - 135053414U, // <0,0,0,0>: Cost 1 vdup0 LHS - 1543503974U, // <0,0,0,1>: Cost 2 vext2 <0,0,0,0>, LHS - 2618572962U, // <0,0,0,2>: Cost 3 vext2 <0,2,0,0>, <0,2,0,0> - 2568054923U, // <0,0,0,3>: Cost 3 vext1 <3,0,0,0>, <3,0,0,0> - 1476398390U, // <0,0,0,4>: Cost 2 vext1 <0,0,0,0>, RHS - 2550140624U, // <0,0,0,5>: Cost 3 vext1 <0,0,0,0>, <5,1,7,3> - 2550141434U, // <0,0,0,6>: Cost 3 vext1 <0,0,0,0>, <6,2,7,3> - 2591945711U, // <0,0,0,7>: Cost 3 vext1 <7,0,0,0>, <7,0,0,0> - 135053414U, // <0,0,0,u>: Cost 1 vdup0 LHS - 2886516736U, // <0,0,1,0>: Cost 3 vzipl LHS, <0,0,0,0> - 1812775014U, // <0,0,1,1>: Cost 2 vzipl LHS, LHS - 1618133094U, // <0,0,1,2>: Cost 2 vext3 <1,2,3,0>, LHS - 2625209292U, // <0,0,1,3>: Cost 3 vext2 <1,3,0,0>, <1,3,0,0> - 2886558034U, // <0,0,1,4>: Cost 3 vzipl LHS, <0,4,1,5> - 2617246864U, // <0,0,1,5>: Cost 3 vext2 <0,0,0,0>, <1,5,3,7> - 3659723031U, // <0,0,1,6>: Cost 4 vext1 <6,0,0,1>, <6,0,0,1> - 2591953904U, // <0,0,1,7>: Cost 3 vext1 <7,0,0,1>, <7,0,0,1> - 1812775581U, // <0,0,1,u>: Cost 2 vzipl LHS, LHS - 3020734464U, // <0,0,2,0>: Cost 3 vtrnl LHS, <0,0,0,0> - 3020734474U, // <0,0,2,1>: Cost 3 vtrnl LHS, <0,0,1,1> - 1946992742U, // <0,0,2,2>: Cost 2 vtrnl LHS, LHS - 2631181989U, // <0,0,2,3>: Cost 3 vext2 <2,3,0,0>, <2,3,0,0> - 3020734668U, // <0,0,2,4>: Cost 3 vtrnl LHS, <0,2,4,6> - 3826550569U, // <0,0,2,5>: Cost 4 vuzpl <0,2,0,2>, <2,4,5,6> - 2617247674U, // <0,0,2,6>: Cost 3 vext2 <0,0,0,0>, <2,6,3,7> - 2591962097U, // <0,0,2,7>: Cost 3 vext1 <7,0,0,2>, <7,0,0,2> - 1946992796U, // <0,0,2,u>: Cost 2 vtrnl LHS, LHS - 2635163787U, // <0,0,3,0>: Cost 3 vext2 <3,0,0,0>, <3,0,0,0> - 2686419196U, // <0,0,3,1>: Cost 3 vext3 <0,3,1,0>, <0,3,1,0> - 2686492933U, // <0,0,3,2>: Cost 3 vext3 <0,3,2,0>, <0,3,2,0> - 2617248156U, // <0,0,3,3>: Cost 3 vext2 <0,0,0,0>, <3,3,3,3> - 2617248258U, // <0,0,3,4>: Cost 3 vext2 <0,0,0,0>, <3,4,5,6> - 3826551298U, // <0,0,3,5>: Cost 4 vuzpl <0,2,0,2>, <3,4,5,6> - 3690990200U, // <0,0,3,6>: Cost 4 vext2 <0,0,0,0>, <3,6,0,7> - 3713551042U, // <0,0,3,7>: Cost 4 vext2 <3,7,0,0>, <3,7,0,0> - 2635163787U, // <0,0,3,u>: Cost 3 vext2 <3,0,0,0>, 
<3,0,0,0> - 2617248658U, // <0,0,4,0>: Cost 3 vext2 <0,0,0,0>, <4,0,5,1> - 2888450150U, // <0,0,4,1>: Cost 3 vzipl <0,4,1,5>, LHS - 3021570150U, // <0,0,4,2>: Cost 3 vtrnl <0,2,4,6>, LHS - 3641829519U, // <0,0,4,3>: Cost 4 vext1 <3,0,0,4>, <3,0,0,4> - 3021570252U, // <0,0,4,4>: Cost 3 vtrnl <0,2,4,6>, <0,2,4,6> - 1543507254U, // <0,0,4,5>: Cost 2 vext2 <0,0,0,0>, RHS - 2752810294U, // <0,0,4,6>: Cost 3 vuzpl <0,2,0,2>, RHS - 3786998152U, // <0,0,4,7>: Cost 4 vext3 <4,7,5,0>, <0,4,7,5> - 1543507497U, // <0,0,4,u>: Cost 2 vext2 <0,0,0,0>, RHS - 2684354972U, // <0,0,5,0>: Cost 3 vext3 <0,0,0,0>, <0,5,0,7> - 2617249488U, // <0,0,5,1>: Cost 3 vext2 <0,0,0,0>, <5,1,7,3> - 3765617070U, // <0,0,5,2>: Cost 4 vext3 <1,2,3,0>, <0,5,2,7> - 3635865780U, // <0,0,5,3>: Cost 4 vext1 <2,0,0,5>, <3,0,4,5> - 2617249734U, // <0,0,5,4>: Cost 3 vext2 <0,0,0,0>, <5,4,7,6> - 2617249796U, // <0,0,5,5>: Cost 3 vext2 <0,0,0,0>, <5,5,5,5> - 2718712274U, // <0,0,5,6>: Cost 3 vext3 <5,6,7,0>, <0,5,6,7> - 2617249960U, // <0,0,5,7>: Cost 3 vext2 <0,0,0,0>, <5,7,5,7> - 2720039396U, // <0,0,5,u>: Cost 3 vext3 <5,u,7,0>, <0,5,u,7> - 2684355053U, // <0,0,6,0>: Cost 3 vext3 <0,0,0,0>, <0,6,0,7> - 3963609190U, // <0,0,6,1>: Cost 4 vzipl <0,6,2,7>, LHS - 2617250298U, // <0,0,6,2>: Cost 3 vext2 <0,0,0,0>, <6,2,7,3> - 3796435464U, // <0,0,6,3>: Cost 4 vext3 <6,3,7,0>, <0,6,3,7> - 3659762998U, // <0,0,6,4>: Cost 4 vext1 <6,0,0,6>, RHS - 3659763810U, // <0,0,6,5>: Cost 4 vext1 <6,0,0,6>, <5,6,7,0> - 2617250616U, // <0,0,6,6>: Cost 3 vext2 <0,0,0,0>, <6,6,6,6> - 2657727309U, // <0,0,6,7>: Cost 3 vext2 <6,7,0,0>, <6,7,0,0> - 2658390942U, // <0,0,6,u>: Cost 3 vext2 <6,u,0,0>, <6,u,0,0> - 2659054575U, // <0,0,7,0>: Cost 3 vext2 <7,0,0,0>, <7,0,0,0> - 3635880854U, // <0,0,7,1>: Cost 4 vext1 <2,0,0,7>, <1,2,3,0> - 3635881401U, // <0,0,7,2>: Cost 4 vext1 <2,0,0,7>, <2,0,0,7> - 3734787298U, // <0,0,7,3>: Cost 4 vext2 <7,3,0,0>, <7,3,0,0> - 2617251174U, // <0,0,7,4>: Cost 3 vext2 <0,0,0,0>, <7,4,5,6> - 3659772002U, // <0,0,7,5>: Cost 4 vext1 <6,0,0,7>, <5,6,7,0> - 3659772189U, // <0,0,7,6>: Cost 4 vext1 <6,0,0,7>, <6,0,0,7> - 2617251436U, // <0,0,7,7>: Cost 3 vext2 <0,0,0,0>, <7,7,7,7> - 2659054575U, // <0,0,7,u>: Cost 3 vext2 <7,0,0,0>, <7,0,0,0> - 135053414U, // <0,0,u,0>: Cost 1 vdup0 LHS - 1817419878U, // <0,0,u,1>: Cost 2 vzipl LHS, LHS - 1947435110U, // <0,0,u,2>: Cost 2 vtrnl LHS, LHS - 2568120467U, // <0,0,u,3>: Cost 3 vext1 <3,0,0,u>, <3,0,0,u> - 1476463926U, // <0,0,u,4>: Cost 2 vext1 <0,0,0,u>, RHS - 1543510170U, // <0,0,u,5>: Cost 2 vext2 <0,0,0,0>, RHS - 2752813210U, // <0,0,u,6>: Cost 3 vuzpl <0,2,0,2>, RHS - 2592011255U, // <0,0,u,7>: Cost 3 vext1 <7,0,0,u>, <7,0,0,u> - 135053414U, // <0,0,u,u>: Cost 1 vdup0 LHS - 2618581002U, // <0,1,0,0>: Cost 3 vext2 <0,2,0,1>, <0,0,1,1> - 1557446758U, // <0,1,0,1>: Cost 2 vext2 <2,3,0,1>, LHS - 2618581155U, // <0,1,0,2>: Cost 3 vext2 <0,2,0,1>, <0,2,0,1> - 2690548468U, // <0,1,0,3>: Cost 3 vext3 <1,0,3,0>, <1,0,3,0> - 2626543954U, // <0,1,0,4>: Cost 3 vext2 <1,5,0,1>, <0,4,1,5> - 4094985216U, // <0,1,0,5>: Cost 4 vtrnl <0,2,0,2>, <1,3,5,7> - 2592019278U, // <0,1,0,6>: Cost 3 vext1 <7,0,1,0>, <6,7,0,1> - 2592019448U, // <0,1,0,7>: Cost 3 vext1 <7,0,1,0>, <7,0,1,0> - 1557447325U, // <0,1,0,u>: Cost 2 vext2 <2,3,0,1>, LHS - 1476476938U, // <0,1,1,0>: Cost 2 vext1 <0,0,1,1>, <0,0,1,1> - 2886517556U, // <0,1,1,1>: Cost 3 vzipl LHS, <1,1,1,1> - 2886517654U, // <0,1,1,2>: Cost 3 vzipl LHS, <1,2,3,0> - 2886517720U, // <0,1,1,3>: Cost 3 vzipl LHS, <1,3,1,3> - 1476480310U, // <0,1,1,4>: Cost 2 
vext1 <0,0,1,1>, RHS - 2886558864U, // <0,1,1,5>: Cost 3 vzipl LHS, <1,5,3,7> - 2550223354U, // <0,1,1,6>: Cost 3 vext1 <0,0,1,1>, <6,2,7,3> - 2550223856U, // <0,1,1,7>: Cost 3 vext1 <0,0,1,1>, <7,0,0,1> - 1476482862U, // <0,1,1,u>: Cost 2 vext1 <0,0,1,1>, LHS - 1494401126U, // <0,1,2,0>: Cost 2 vext1 <3,0,1,2>, LHS - 3020735284U, // <0,1,2,1>: Cost 3 vtrnl LHS, <1,1,1,1> - 2562172349U, // <0,1,2,2>: Cost 3 vext1 <2,0,1,2>, <2,0,1,2> - 835584U, // <0,1,2,3>: Cost 0 copy LHS - 1494404406U, // <0,1,2,4>: Cost 2 vext1 <3,0,1,2>, RHS - 3020735488U, // <0,1,2,5>: Cost 3 vtrnl LHS, <1,3,5,7> - 2631190458U, // <0,1,2,6>: Cost 3 vext2 <2,3,0,1>, <2,6,3,7> - 1518294010U, // <0,1,2,7>: Cost 2 vext1 <7,0,1,2>, <7,0,1,2> - 835584U, // <0,1,2,u>: Cost 0 copy LHS - 2692318156U, // <0,1,3,0>: Cost 3 vext3 <1,3,0,0>, <1,3,0,0> - 2691875800U, // <0,1,3,1>: Cost 3 vext3 <1,2,3,0>, <1,3,1,3> - 2691875806U, // <0,1,3,2>: Cost 3 vext3 <1,2,3,0>, <1,3,2,0> - 2692539367U, // <0,1,3,3>: Cost 3 vext3 <1,3,3,0>, <1,3,3,0> - 2562182454U, // <0,1,3,4>: Cost 3 vext1 <2,0,1,3>, RHS - 2691875840U, // <0,1,3,5>: Cost 3 vext3 <1,2,3,0>, <1,3,5,7> - 2692760578U, // <0,1,3,6>: Cost 3 vext3 <1,3,6,0>, <1,3,6,0> - 2639817411U, // <0,1,3,7>: Cost 3 vext2 <3,7,0,1>, <3,7,0,1> - 2691875863U, // <0,1,3,u>: Cost 3 vext3 <1,2,3,0>, <1,3,u,3> - 2568159334U, // <0,1,4,0>: Cost 3 vext1 <3,0,1,4>, LHS - 4095312692U, // <0,1,4,1>: Cost 4 vtrnl <0,2,4,6>, <1,1,1,1> - 2568160934U, // <0,1,4,2>: Cost 3 vext1 <3,0,1,4>, <2,3,0,1> - 2568161432U, // <0,1,4,3>: Cost 3 vext1 <3,0,1,4>, <3,0,1,4> - 2568162614U, // <0,1,4,4>: Cost 3 vext1 <3,0,1,4>, RHS - 1557450038U, // <0,1,4,5>: Cost 2 vext2 <2,3,0,1>, RHS - 2754235702U, // <0,1,4,6>: Cost 3 vuzpl <0,4,1,5>, RHS - 2592052220U, // <0,1,4,7>: Cost 3 vext1 <7,0,1,4>, <7,0,1,4> - 1557450281U, // <0,1,4,u>: Cost 2 vext2 <2,3,0,1>, RHS - 3765617775U, // <0,1,5,0>: Cost 4 vext3 <1,2,3,0>, <1,5,0,1> - 2647781007U, // <0,1,5,1>: Cost 3 vext2 <5,1,0,1>, <5,1,0,1> - 3704934138U, // <0,1,5,2>: Cost 4 vext2 <2,3,0,1>, <5,2,3,0> - 2691875984U, // <0,1,5,3>: Cost 3 vext3 <1,2,3,0>, <1,5,3,7> - 2657734598U, // <0,1,5,4>: Cost 3 vext2 <6,7,0,1>, <5,4,7,6> - 2650435539U, // <0,1,5,5>: Cost 3 vext2 <5,5,0,1>, <5,5,0,1> - 2651099172U, // <0,1,5,6>: Cost 3 vext2 <5,6,0,1>, <5,6,0,1> - 2651762805U, // <0,1,5,7>: Cost 3 vext2 <5,7,0,1>, <5,7,0,1> - 2691876029U, // <0,1,5,u>: Cost 3 vext3 <1,2,3,0>, <1,5,u,7> - 2592063590U, // <0,1,6,0>: Cost 3 vext1 <7,0,1,6>, LHS - 3765617871U, // <0,1,6,1>: Cost 4 vext3 <1,2,3,0>, <1,6,1,7> - 2654417337U, // <0,1,6,2>: Cost 3 vext2 <6,2,0,1>, <6,2,0,1> - 3765617889U, // <0,1,6,3>: Cost 4 vext3 <1,2,3,0>, <1,6,3,7> - 2592066870U, // <0,1,6,4>: Cost 3 vext1 <7,0,1,6>, RHS - 3765617907U, // <0,1,6,5>: Cost 4 vext3 <1,2,3,0>, <1,6,5,7> - 2657071869U, // <0,1,6,6>: Cost 3 vext2 <6,6,0,1>, <6,6,0,1> - 1583993678U, // <0,1,6,7>: Cost 2 vext2 <6,7,0,1>, <6,7,0,1> - 1584657311U, // <0,1,6,u>: Cost 2 vext2 <6,u,0,1>, <6,u,0,1> - 2657735672U, // <0,1,7,0>: Cost 3 vext2 <6,7,0,1>, <7,0,1,0> - 2657735808U, // <0,1,7,1>: Cost 3 vext2 <6,7,0,1>, <7,1,7,1> - 2631193772U, // <0,1,7,2>: Cost 3 vext2 <2,3,0,1>, <7,2,3,0> - 2661053667U, // <0,1,7,3>: Cost 3 vext2 <7,3,0,1>, <7,3,0,1> - 2657736038U, // <0,1,7,4>: Cost 3 vext2 <6,7,0,1>, <7,4,5,6> - 3721524621U, // <0,1,7,5>: Cost 4 vext2 <5,1,0,1>, <7,5,1,0> - 2657736158U, // <0,1,7,6>: Cost 3 vext2 <6,7,0,1>, <7,6,1,0> - 2657736300U, // <0,1,7,7>: Cost 3 vext2 <6,7,0,1>, <7,7,7,7> - 2657736322U, // <0,1,7,u>: Cost 3 vext2 <6,7,0,1>, <7,u,1,2> - 
1494450278U, // <0,1,u,0>: Cost 2 vext1 <3,0,1,u>, LHS - 1557452590U, // <0,1,u,1>: Cost 2 vext2 <2,3,0,1>, LHS - 2754238254U, // <0,1,u,2>: Cost 3 vuzpl <0,4,1,5>, LHS - 835584U, // <0,1,u,3>: Cost 0 copy LHS - 1494453558U, // <0,1,u,4>: Cost 2 vext1 <3,0,1,u>, RHS - 1557452954U, // <0,1,u,5>: Cost 2 vext2 <2,3,0,1>, RHS - 2754238618U, // <0,1,u,6>: Cost 3 vuzpl <0,4,1,5>, RHS - 1518343168U, // <0,1,u,7>: Cost 2 vext1 <7,0,1,u>, <7,0,1,u> - 835584U, // <0,1,u,u>: Cost 0 copy LHS - 2752299008U, // <0,2,0,0>: Cost 3 vuzpl LHS, <0,0,0,0> - 1544847462U, // <0,2,0,1>: Cost 2 vext2 <0,2,0,2>, LHS - 1678557286U, // <0,2,0,2>: Cost 2 vuzpl LHS, LHS - 2696521165U, // <0,2,0,3>: Cost 3 vext3 <2,0,3,0>, <2,0,3,0> - 2752340172U, // <0,2,0,4>: Cost 3 vuzpl LHS, <0,2,4,6> - 2691876326U, // <0,2,0,5>: Cost 3 vext3 <1,2,3,0>, <2,0,5,7> - 2618589695U, // <0,2,0,6>: Cost 3 vext2 <0,2,0,2>, <0,6,2,7> - 2592093185U, // <0,2,0,7>: Cost 3 vext1 <7,0,2,0>, <7,0,2,0> - 1678557340U, // <0,2,0,u>: Cost 2 vuzpl LHS, LHS - 2618589942U, // <0,2,1,0>: Cost 3 vext2 <0,2,0,2>, <1,0,3,2> - 2752299828U, // <0,2,1,1>: Cost 3 vuzpl LHS, <1,1,1,1> - 2886518376U, // <0,2,1,2>: Cost 3 vzipl LHS, <2,2,2,2> - 2752299766U, // <0,2,1,3>: Cost 3 vuzpl LHS, <1,0,3,2> - 2550295862U, // <0,2,1,4>: Cost 3 vext1 <0,0,2,1>, RHS - 2752340992U, // <0,2,1,5>: Cost 3 vuzpl LHS, <1,3,5,7> - 2886559674U, // <0,2,1,6>: Cost 3 vzipl LHS, <2,6,3,7> - 3934208106U, // <0,2,1,7>: Cost 4 vuzpr <7,0,1,2>, <0,1,2,7> - 2752340771U, // <0,2,1,u>: Cost 3 vuzpl LHS, <1,0,u,2> - 1476558868U, // <0,2,2,0>: Cost 2 vext1 <0,0,2,2>, <0,0,2,2> - 2226628029U, // <0,2,2,1>: Cost 3 vrev <2,0,1,2> - 2752300648U, // <0,2,2,2>: Cost 3 vuzpl LHS, <2,2,2,2> - 3020736114U, // <0,2,2,3>: Cost 3 vtrnl LHS, <2,2,3,3> - 1476562230U, // <0,2,2,4>: Cost 2 vext1 <0,0,2,2>, RHS - 2550304464U, // <0,2,2,5>: Cost 3 vext1 <0,0,2,2>, <5,1,7,3> - 2618591162U, // <0,2,2,6>: Cost 3 vext2 <0,2,0,2>, <2,6,3,7> - 2550305777U, // <0,2,2,7>: Cost 3 vext1 <0,0,2,2>, <7,0,0,2> - 1476564782U, // <0,2,2,u>: Cost 2 vext1 <0,0,2,2>, LHS - 2618591382U, // <0,2,3,0>: Cost 3 vext2 <0,2,0,2>, <3,0,1,2> - 2752301206U, // <0,2,3,1>: Cost 3 vuzpl LHS, <3,0,1,2> - 3826043121U, // <0,2,3,2>: Cost 4 vuzpl LHS, <3,1,2,3> - 2752301468U, // <0,2,3,3>: Cost 3 vuzpl LHS, <3,3,3,3> - 2618591746U, // <0,2,3,4>: Cost 3 vext2 <0,2,0,2>, <3,4,5,6> - 2752301570U, // <0,2,3,5>: Cost 3 vuzpl LHS, <3,4,5,6> - 3830688102U, // <0,2,3,6>: Cost 4 vuzpl LHS, <3,2,6,3> - 2698807012U, // <0,2,3,7>: Cost 3 vext3 <2,3,7,0>, <2,3,7,0> - 2752301269U, // <0,2,3,u>: Cost 3 vuzpl LHS, <3,0,u,2> - 2562261094U, // <0,2,4,0>: Cost 3 vext1 <2,0,2,4>, LHS - 4095313828U, // <0,2,4,1>: Cost 4 vtrnl <0,2,4,6>, <2,6,1,3> - 2226718152U, // <0,2,4,2>: Cost 3 vrev <2,0,2,4> - 2568235169U, // <0,2,4,3>: Cost 3 vext1 <3,0,2,4>, <3,0,2,4> - 2562264374U, // <0,2,4,4>: Cost 3 vext1 <2,0,2,4>, RHS - 1544850742U, // <0,2,4,5>: Cost 2 vext2 <0,2,0,2>, RHS - 1678560566U, // <0,2,4,6>: Cost 2 vuzpl LHS, RHS - 2592125957U, // <0,2,4,7>: Cost 3 vext1 <7,0,2,4>, <7,0,2,4> - 1678560584U, // <0,2,4,u>: Cost 2 vuzpl LHS, RHS - 2691876686U, // <0,2,5,0>: Cost 3 vext3 <1,2,3,0>, <2,5,0,7> - 2618592976U, // <0,2,5,1>: Cost 3 vext2 <0,2,0,2>, <5,1,7,3> - 3765618528U, // <0,2,5,2>: Cost 4 vext3 <1,2,3,0>, <2,5,2,7> - 3765618536U, // <0,2,5,3>: Cost 4 vext3 <1,2,3,0>, <2,5,3,6> - 2618593222U, // <0,2,5,4>: Cost 3 vext2 <0,2,0,2>, <5,4,7,6> - 2752303108U, // <0,2,5,5>: Cost 3 vuzpl LHS, <5,5,5,5> - 2618593378U, // <0,2,5,6>: Cost 3 vext2 <0,2,0,2>, <5,6,7,0> - 
2824785206U, // <0,2,5,7>: Cost 3 vuzpr <1,0,3,2>, RHS - 2824785207U, // <0,2,5,u>: Cost 3 vuzpr <1,0,3,2>, RHS - 2752303950U, // <0,2,6,0>: Cost 3 vuzpl LHS, <6,7,0,1> - 3830690081U, // <0,2,6,1>: Cost 4 vuzpl LHS, <6,0,1,2> - 2618593786U, // <0,2,6,2>: Cost 3 vext2 <0,2,0,2>, <6,2,7,3> - 2691876794U, // <0,2,6,3>: Cost 3 vext3 <1,2,3,0>, <2,6,3,7> - 2752303990U, // <0,2,6,4>: Cost 3 vuzpl LHS, <6,7,4,5> - 3830690445U, // <0,2,6,5>: Cost 4 vuzpl LHS, <6,4,5,6> - 2752303928U, // <0,2,6,6>: Cost 3 vuzpl LHS, <6,6,6,6> - 2657743695U, // <0,2,6,7>: Cost 3 vext2 <6,7,0,2>, <6,7,0,2> - 2691876839U, // <0,2,6,u>: Cost 3 vext3 <1,2,3,0>, <2,6,u,7> - 2659070961U, // <0,2,7,0>: Cost 3 vext2 <7,0,0,2>, <7,0,0,2> - 2659734594U, // <0,2,7,1>: Cost 3 vext2 <7,1,0,2>, <7,1,0,2> - 3734140051U, // <0,2,7,2>: Cost 4 vext2 <7,2,0,2>, <7,2,0,2> - 2701166596U, // <0,2,7,3>: Cost 3 vext3 <2,7,3,0>, <2,7,3,0> - 2662389094U, // <0,2,7,4>: Cost 3 vext2 <7,5,0,2>, <7,4,5,6> - 2662389126U, // <0,2,7,5>: Cost 3 vext2 <7,5,0,2>, <7,5,0,2> - 3736794583U, // <0,2,7,6>: Cost 4 vext2 <7,6,0,2>, <7,6,0,2> - 2752304748U, // <0,2,7,7>: Cost 3 vuzpl LHS, <7,7,7,7> - 2659070961U, // <0,2,7,u>: Cost 3 vext2 <7,0,0,2>, <7,0,0,2> - 1476608026U, // <0,2,u,0>: Cost 2 vext1 <0,0,2,u>, <0,0,2,u> - 1544853294U, // <0,2,u,1>: Cost 2 vext2 <0,2,0,2>, LHS - 1678563118U, // <0,2,u,2>: Cost 2 vuzpl LHS, LHS - 3021178482U, // <0,2,u,3>: Cost 3 vtrnl LHS, <2,2,3,3> - 1476611382U, // <0,2,u,4>: Cost 2 vext1 <0,0,2,u>, RHS - 1544853658U, // <0,2,u,5>: Cost 2 vext2 <0,2,0,2>, RHS - 1678563482U, // <0,2,u,6>: Cost 2 vuzpl LHS, RHS - 2824785449U, // <0,2,u,7>: Cost 3 vuzpr <1,0,3,2>, RHS - 1678563172U, // <0,2,u,u>: Cost 2 vuzpl LHS, LHS - 2556329984U, // <0,3,0,0>: Cost 3 vext1 <1,0,3,0>, <0,0,0,0> - 2686421142U, // <0,3,0,1>: Cost 3 vext3 <0,3,1,0>, <3,0,1,2> - 2562303437U, // <0,3,0,2>: Cost 3 vext1 <2,0,3,0>, <2,0,3,0> - 4094986652U, // <0,3,0,3>: Cost 4 vtrnl <0,2,0,2>, <3,3,3,3> - 2556333366U, // <0,3,0,4>: Cost 3 vext1 <1,0,3,0>, RHS - 4094986754U, // <0,3,0,5>: Cost 4 vtrnl <0,2,0,2>, <3,4,5,6> - 3798796488U, // <0,3,0,6>: Cost 4 vext3 <6,7,3,0>, <3,0,6,7> - 3776530634U, // <0,3,0,7>: Cost 4 vext3 <3,0,7,0>, <3,0,7,0> - 2556335918U, // <0,3,0,u>: Cost 3 vext1 <1,0,3,0>, LHS - 2886518934U, // <0,3,1,0>: Cost 3 vzipl LHS, <3,0,1,2> - 2556338933U, // <0,3,1,1>: Cost 3 vext1 <1,0,3,1>, <1,0,3,1> - 2691877105U, // <0,3,1,2>: Cost 3 vext3 <1,2,3,0>, <3,1,2,3> - 2886519196U, // <0,3,1,3>: Cost 3 vzipl LHS, <3,3,3,3> - 2886519298U, // <0,3,1,4>: Cost 3 vzipl LHS, <3,4,5,6> - 4095740418U, // <0,3,1,5>: Cost 4 vtrnl <0,3,1,4>, <3,4,5,6> - 3659944242U, // <0,3,1,6>: Cost 4 vext1 <6,0,3,1>, <6,0,3,1> - 3769600286U, // <0,3,1,7>: Cost 4 vext3 <1,u,3,0>, <3,1,7,3> - 2886519582U, // <0,3,1,u>: Cost 3 vzipl LHS, <3,u,1,2> - 1482604646U, // <0,3,2,0>: Cost 2 vext1 <1,0,3,2>, LHS - 1482605302U, // <0,3,2,1>: Cost 2 vext1 <1,0,3,2>, <1,0,3,2> - 2556348008U, // <0,3,2,2>: Cost 3 vext1 <1,0,3,2>, <2,2,2,2> - 3020736924U, // <0,3,2,3>: Cost 3 vtrnl LHS, <3,3,3,3> - 1482607926U, // <0,3,2,4>: Cost 2 vext1 <1,0,3,2>, RHS - 3020737026U, // <0,3,2,5>: Cost 3 vtrnl LHS, <3,4,5,6> - 2598154746U, // <0,3,2,6>: Cost 3 vext1 , <6,2,7,3> - 2598155258U, // <0,3,2,7>: Cost 3 vext1 , <7,0,1,2> - 1482610478U, // <0,3,2,u>: Cost 2 vext1 <1,0,3,2>, LHS - 3692341398U, // <0,3,3,0>: Cost 4 vext2 <0,2,0,3>, <3,0,1,2> - 2635851999U, // <0,3,3,1>: Cost 3 vext2 <3,1,0,3>, <3,1,0,3> - 3636069840U, // <0,3,3,2>: Cost 4 vext1 <2,0,3,3>, <2,0,3,3> - 2691877276U, // <0,3,3,3>: Cost 3 
vext3 <1,2,3,0>, <3,3,3,3> - 3961522690U, // <0,3,3,4>: Cost 4 vzipl <0,3,1,4>, <3,4,5,6> - 3826797058U, // <0,3,3,5>: Cost 4 vuzpl <0,2,3,5>, <3,4,5,6> - 3703622282U, // <0,3,3,6>: Cost 4 vext2 <2,1,0,3>, <3,6,2,7> - 3769600452U, // <0,3,3,7>: Cost 4 vext3 <1,u,3,0>, <3,3,7,7> - 2640497430U, // <0,3,3,u>: Cost 3 vext2 <3,u,0,3>, <3,u,0,3> - 3962194070U, // <0,3,4,0>: Cost 4 vzipl <0,4,1,5>, <3,0,1,2> - 2232617112U, // <0,3,4,1>: Cost 3 vrev <3,0,1,4> - 2232690849U, // <0,3,4,2>: Cost 3 vrev <3,0,2,4> - 4095314332U, // <0,3,4,3>: Cost 4 vtrnl <0,2,4,6>, <3,3,3,3> - 3962194434U, // <0,3,4,4>: Cost 4 vzipl <0,4,1,5>, <3,4,5,6> - 2691877378U, // <0,3,4,5>: Cost 3 vext3 <1,2,3,0>, <3,4,5,6> - 3826765110U, // <0,3,4,6>: Cost 4 vuzpl <0,2,3,1>, RHS - 3665941518U, // <0,3,4,7>: Cost 4 vext1 <7,0,3,4>, <7,0,3,4> - 2691877405U, // <0,3,4,u>: Cost 3 vext3 <1,2,3,0>, <3,4,u,6> - 3630112870U, // <0,3,5,0>: Cost 4 vext1 <1,0,3,5>, LHS - 3630113526U, // <0,3,5,1>: Cost 4 vext1 <1,0,3,5>, <1,0,3,2> - 4035199734U, // <0,3,5,2>: Cost 4 vzipr <1,4,0,5>, <1,0,3,2> - 3769600578U, // <0,3,5,3>: Cost 4 vext3 <1,u,3,0>, <3,5,3,7> - 2232846516U, // <0,3,5,4>: Cost 3 vrev <3,0,4,5> - 3779037780U, // <0,3,5,5>: Cost 4 vext3 <3,4,5,0>, <3,5,5,7> - 2718714461U, // <0,3,5,6>: Cost 3 vext3 <5,6,7,0>, <3,5,6,7> - 2706106975U, // <0,3,5,7>: Cost 3 vext3 <3,5,7,0>, <3,5,7,0> - 2233141464U, // <0,3,5,u>: Cost 3 vrev <3,0,u,5> - 2691877496U, // <0,3,6,0>: Cost 3 vext3 <1,2,3,0>, <3,6,0,7> - 3727511914U, // <0,3,6,1>: Cost 4 vext2 <6,1,0,3>, <6,1,0,3> - 3765619338U, // <0,3,6,2>: Cost 4 vext3 <1,2,3,0>, <3,6,2,7> - 3765619347U, // <0,3,6,3>: Cost 4 vext3 <1,2,3,0>, <3,6,3,7> - 3765987996U, // <0,3,6,4>: Cost 4 vext3 <1,2,u,0>, <3,6,4,7> - 3306670270U, // <0,3,6,5>: Cost 4 vrev <3,0,5,6> - 3792456365U, // <0,3,6,6>: Cost 4 vext3 <5,6,7,0>, <3,6,6,6> - 2706770608U, // <0,3,6,7>: Cost 3 vext3 <3,6,7,0>, <3,6,7,0> - 2706844345U, // <0,3,6,u>: Cost 3 vext3 <3,6,u,0>, <3,6,u,0> - 3769600707U, // <0,3,7,0>: Cost 4 vext3 <1,u,3,0>, <3,7,0,1> - 2659742787U, // <0,3,7,1>: Cost 3 vext2 <7,1,0,3>, <7,1,0,3> - 3636102612U, // <0,3,7,2>: Cost 4 vext1 <2,0,3,7>, <2,0,3,7> - 3769600740U, // <0,3,7,3>: Cost 4 vext3 <1,u,3,0>, <3,7,3,7> - 3769600747U, // <0,3,7,4>: Cost 4 vext3 <1,u,3,0>, <3,7,4,5> - 3769600758U, // <0,3,7,5>: Cost 4 vext3 <1,u,3,0>, <3,7,5,7> - 3659993400U, // <0,3,7,6>: Cost 4 vext1 <6,0,3,7>, <6,0,3,7> - 3781176065U, // <0,3,7,7>: Cost 4 vext3 <3,7,7,0>, <3,7,7,0> - 2664388218U, // <0,3,7,u>: Cost 3 vext2 <7,u,0,3>, <7,u,0,3> - 1482653798U, // <0,3,u,0>: Cost 2 vext1 <1,0,3,u>, LHS - 1482654460U, // <0,3,u,1>: Cost 2 vext1 <1,0,3,u>, <1,0,3,u> - 2556397160U, // <0,3,u,2>: Cost 3 vext1 <1,0,3,u>, <2,2,2,2> - 3021179292U, // <0,3,u,3>: Cost 3 vtrnl LHS, <3,3,3,3> - 1482657078U, // <0,3,u,4>: Cost 2 vext1 <1,0,3,u>, RHS - 3021179394U, // <0,3,u,5>: Cost 3 vtrnl LHS, <3,4,5,6> - 2598203898U, // <0,3,u,6>: Cost 3 vext1 , <6,2,7,3> - 2708097874U, // <0,3,u,7>: Cost 3 vext3 <3,u,7,0>, <3,u,7,0> - 1482659630U, // <0,3,u,u>: Cost 2 vext1 <1,0,3,u>, LHS - 2617278468U, // <0,4,0,0>: Cost 3 vext2 <0,0,0,4>, <0,0,0,4> - 2618605670U, // <0,4,0,1>: Cost 3 vext2 <0,2,0,4>, LHS - 2618605734U, // <0,4,0,2>: Cost 3 vext2 <0,2,0,4>, <0,2,0,4> - 3642091695U, // <0,4,0,3>: Cost 4 vext1 <3,0,4,0>, <3,0,4,0> - 2753134796U, // <0,4,0,4>: Cost 3 vuzpl <0,2,4,6>, <0,2,4,6> - 2718714770U, // <0,4,0,5>: Cost 3 vext3 <5,6,7,0>, <4,0,5,1> - 3021245750U, // <0,4,0,6>: Cost 3 vtrnl <0,2,0,2>, RHS - 3665982483U, // <0,4,0,7>: Cost 4 vext1 <7,0,4,0>, 
<7,0,4,0> - 3021245768U, // <0,4,0,u>: Cost 3 vtrnl <0,2,0,2>, RHS - 2568355942U, // <0,4,1,0>: Cost 3 vext1 <3,0,4,1>, LHS - 3692348212U, // <0,4,1,1>: Cost 4 vext2 <0,2,0,4>, <1,1,1,1> - 3692348310U, // <0,4,1,2>: Cost 4 vext2 <0,2,0,4>, <1,2,3,0> - 2568358064U, // <0,4,1,3>: Cost 3 vext1 <3,0,4,1>, <3,0,4,1> - 2568359222U, // <0,4,1,4>: Cost 3 vext1 <3,0,4,1>, RHS - 1812778294U, // <0,4,1,5>: Cost 2 vzipl LHS, RHS - 3022671158U, // <0,4,1,6>: Cost 3 vtrnl <0,4,1,5>, RHS - 2592248852U, // <0,4,1,7>: Cost 3 vext1 <7,0,4,1>, <7,0,4,1> - 1812778537U, // <0,4,1,u>: Cost 2 vzipl LHS, RHS - 2568364134U, // <0,4,2,0>: Cost 3 vext1 <3,0,4,2>, LHS - 2238573423U, // <0,4,2,1>: Cost 3 vrev <4,0,1,2> - 3692349032U, // <0,4,2,2>: Cost 4 vext2 <0,2,0,4>, <2,2,2,2> - 2631214761U, // <0,4,2,3>: Cost 3 vext2 <2,3,0,4>, <2,3,0,4> - 2568367414U, // <0,4,2,4>: Cost 3 vext1 <3,0,4,2>, RHS - 2887028022U, // <0,4,2,5>: Cost 3 vzipl <0,2,0,2>, RHS - 1946996022U, // <0,4,2,6>: Cost 2 vtrnl LHS, RHS - 2592257045U, // <0,4,2,7>: Cost 3 vext1 <7,0,4,2>, <7,0,4,2> - 1946996040U, // <0,4,2,u>: Cost 2 vtrnl LHS, RHS - 3692349590U, // <0,4,3,0>: Cost 4 vext2 <0,2,0,4>, <3,0,1,2> - 3826878614U, // <0,4,3,1>: Cost 4 vuzpl <0,2,4,6>, <3,0,1,2> - 3826878625U, // <0,4,3,2>: Cost 4 vuzpl <0,2,4,6>, <3,0,2,4> - 3692349852U, // <0,4,3,3>: Cost 4 vext2 <0,2,0,4>, <3,3,3,3> - 3692349954U, // <0,4,3,4>: Cost 4 vext2 <0,2,0,4>, <3,4,5,6> - 3826878978U, // <0,4,3,5>: Cost 4 vuzpl <0,2,4,6>, <3,4,5,6> - 4095200566U, // <0,4,3,6>: Cost 4 vtrnl <0,2,3,1>, RHS - 3713583814U, // <0,4,3,7>: Cost 4 vext2 <3,7,0,4>, <3,7,0,4> - 3692350238U, // <0,4,3,u>: Cost 4 vext2 <0,2,0,4>, <3,u,1,2> - 2550464552U, // <0,4,4,0>: Cost 3 vext1 <0,0,4,4>, <0,0,4,4> - 3962194914U, // <0,4,4,1>: Cost 4 vzipl <0,4,1,5>, <4,1,5,0> - 3693677631U, // <0,4,4,2>: Cost 4 vext2 <0,4,0,4>, <4,2,6,3> - 3642124467U, // <0,4,4,3>: Cost 4 vext1 <3,0,4,4>, <3,0,4,4> - 2718715088U, // <0,4,4,4>: Cost 3 vext3 <5,6,7,0>, <4,4,4,4> - 2618608950U, // <0,4,4,5>: Cost 3 vext2 <0,2,0,4>, RHS - 2753137974U, // <0,4,4,6>: Cost 3 vuzpl <0,2,4,6>, RHS - 3666015255U, // <0,4,4,7>: Cost 4 vext1 <7,0,4,4>, <7,0,4,4> - 2618609193U, // <0,4,4,u>: Cost 3 vext2 <0,2,0,4>, RHS - 2568388710U, // <0,4,5,0>: Cost 3 vext1 <3,0,4,5>, LHS - 2568389526U, // <0,4,5,1>: Cost 3 vext1 <3,0,4,5>, <1,2,3,0> - 3636159963U, // <0,4,5,2>: Cost 4 vext1 <2,0,4,5>, <2,0,4,5> - 2568390836U, // <0,4,5,3>: Cost 3 vext1 <3,0,4,5>, <3,0,4,5> - 2568391990U, // <0,4,5,4>: Cost 3 vext1 <3,0,4,5>, RHS - 2718715180U, // <0,4,5,5>: Cost 3 vext3 <5,6,7,0>, <4,5,5,6> - 1618136374U, // <0,4,5,6>: Cost 2 vext3 <1,2,3,0>, RHS - 2592281624U, // <0,4,5,7>: Cost 3 vext1 <7,0,4,5>, <7,0,4,5> - 1618136392U, // <0,4,5,u>: Cost 2 vext3 <1,2,3,0>, RHS - 2550480938U, // <0,4,6,0>: Cost 3 vext1 <0,0,4,6>, <0,0,4,6> - 3826880801U, // <0,4,6,1>: Cost 4 vuzpl <0,2,4,6>, <6,0,1,2> - 2562426332U, // <0,4,6,2>: Cost 3 vext1 <2,0,4,6>, <2,0,4,6> - 3786190181U, // <0,4,6,3>: Cost 4 vext3 <4,6,3,0>, <4,6,3,0> - 2718715252U, // <0,4,6,4>: Cost 3 vext3 <5,6,7,0>, <4,6,4,6> - 3826881165U, // <0,4,6,5>: Cost 4 vuzpl <0,2,4,6>, <6,4,5,6> - 2712669568U, // <0,4,6,6>: Cost 3 vext3 <4,6,6,0>, <4,6,6,0> - 2657760081U, // <0,4,6,7>: Cost 3 vext2 <6,7,0,4>, <6,7,0,4> - 2718715284U, // <0,4,6,u>: Cost 3 vext3 <5,6,7,0>, <4,6,u,2> - 3654090854U, // <0,4,7,0>: Cost 4 vext1 <5,0,4,7>, LHS - 3934229326U, // <0,4,7,1>: Cost 4 vuzpr <7,0,1,4>, <6,7,0,1> - 3734156437U, // <0,4,7,2>: Cost 4 vext2 <7,2,0,4>, <7,2,0,4> - 3734820070U, // <0,4,7,3>: Cost 4 vext2 
<7,3,0,4>, <7,3,0,4> - 3654094134U, // <0,4,7,4>: Cost 4 vext1 <5,0,4,7>, RHS - 2713259464U, // <0,4,7,5>: Cost 3 vext3 <4,7,5,0>, <4,7,5,0> - 2713333201U, // <0,4,7,6>: Cost 3 vext3 <4,7,6,0>, <4,7,6,0> - 3654095866U, // <0,4,7,7>: Cost 4 vext1 <5,0,4,7>, <7,0,1,2> - 2713259464U, // <0,4,7,u>: Cost 3 vext3 <4,7,5,0>, <4,7,5,0> - 2568413286U, // <0,4,u,0>: Cost 3 vext1 <3,0,4,u>, LHS - 2618611502U, // <0,4,u,1>: Cost 3 vext2 <0,2,0,4>, LHS - 2753140526U, // <0,4,u,2>: Cost 3 vuzpl <0,2,4,6>, LHS - 2568415415U, // <0,4,u,3>: Cost 3 vext1 <3,0,4,u>, <3,0,4,u> - 2568416566U, // <0,4,u,4>: Cost 3 vext1 <3,0,4,u>, RHS - 1817423158U, // <0,4,u,5>: Cost 2 vzipl LHS, RHS - 1947438390U, // <0,4,u,6>: Cost 2 vtrnl LHS, RHS - 2592306203U, // <0,4,u,7>: Cost 3 vext1 <7,0,4,u>, <7,0,4,u> - 1947438408U, // <0,4,u,u>: Cost 2 vtrnl LHS, RHS - 3630219264U, // <0,5,0,0>: Cost 4 vext1 <1,0,5,0>, <0,0,0,0> - 2625912934U, // <0,5,0,1>: Cost 3 vext2 <1,4,0,5>, LHS - 3692355748U, // <0,5,0,2>: Cost 4 vext2 <0,2,0,5>, <0,2,0,2> - 3693019384U, // <0,5,0,3>: Cost 4 vext2 <0,3,0,5>, <0,3,0,5> - 3630222646U, // <0,5,0,4>: Cost 4 vext1 <1,0,5,0>, RHS - 3699655062U, // <0,5,0,5>: Cost 4 vext2 <1,4,0,5>, <0,5,0,1> - 2718715508U, // <0,5,0,6>: Cost 3 vext3 <5,6,7,0>, <5,0,6,1> - 3087011126U, // <0,5,0,7>: Cost 3 vtrnr <0,0,0,0>, RHS - 2625913501U, // <0,5,0,u>: Cost 3 vext2 <1,4,0,5>, LHS - 1500659814U, // <0,5,1,0>: Cost 2 vext1 <4,0,5,1>, LHS - 2886520528U, // <0,5,1,1>: Cost 3 vzipl LHS, <5,1,7,3> - 2574403176U, // <0,5,1,2>: Cost 3 vext1 <4,0,5,1>, <2,2,2,2> - 2574403734U, // <0,5,1,3>: Cost 3 vext1 <4,0,5,1>, <3,0,1,2> - 1500662674U, // <0,5,1,4>: Cost 2 vext1 <4,0,5,1>, <4,0,5,1> - 2886520836U, // <0,5,1,5>: Cost 3 vzipl LHS, <5,5,5,5> - 2886520930U, // <0,5,1,6>: Cost 3 vzipl LHS, <5,6,7,0> - 2718715600U, // <0,5,1,7>: Cost 3 vext3 <5,6,7,0>, <5,1,7,3> - 1500665646U, // <0,5,1,u>: Cost 2 vext1 <4,0,5,1>, LHS - 2556493926U, // <0,5,2,0>: Cost 3 vext1 <1,0,5,2>, LHS - 2244546120U, // <0,5,2,1>: Cost 3 vrev <5,0,1,2> - 3692357256U, // <0,5,2,2>: Cost 4 vext2 <0,2,0,5>, <2,2,5,7> - 2568439994U, // <0,5,2,3>: Cost 3 vext1 <3,0,5,2>, <3,0,5,2> - 2556497206U, // <0,5,2,4>: Cost 3 vext1 <1,0,5,2>, RHS - 3020738564U, // <0,5,2,5>: Cost 3 vtrnl LHS, <5,5,5,5> - 4027877161U, // <0,5,2,6>: Cost 4 vzipr <0,2,0,2>, <2,4,5,6> - 3093220662U, // <0,5,2,7>: Cost 3 vtrnr <1,0,3,2>, RHS - 3093220663U, // <0,5,2,u>: Cost 3 vtrnr <1,0,3,2>, RHS - 3699656854U, // <0,5,3,0>: Cost 4 vext2 <1,4,0,5>, <3,0,1,2> - 3699656927U, // <0,5,3,1>: Cost 4 vext2 <1,4,0,5>, <3,1,0,3> - 3699657006U, // <0,5,3,2>: Cost 4 vext2 <1,4,0,5>, <3,2,0,1> - 3699657116U, // <0,5,3,3>: Cost 4 vext2 <1,4,0,5>, <3,3,3,3> - 2637859284U, // <0,5,3,4>: Cost 3 vext2 <3,4,0,5>, <3,4,0,5> - 3790319453U, // <0,5,3,5>: Cost 4 vext3 <5,3,5,0>, <5,3,5,0> - 3699657354U, // <0,5,3,6>: Cost 4 vext2 <1,4,0,5>, <3,6,2,7> - 2716725103U, // <0,5,3,7>: Cost 3 vext3 <5,3,7,0>, <5,3,7,0> - 2716798840U, // <0,5,3,u>: Cost 3 vext3 <5,3,u,0>, <5,3,u,0> - 2661747602U, // <0,5,4,0>: Cost 3 vext2 <7,4,0,5>, <4,0,5,1> - 3630252810U, // <0,5,4,1>: Cost 4 vext1 <1,0,5,4>, <1,0,5,4> - 3636225507U, // <0,5,4,2>: Cost 4 vext1 <2,0,5,4>, <2,0,5,4> - 3716910172U, // <0,5,4,3>: Cost 4 vext2 <4,3,0,5>, <4,3,0,5> - 3962195892U, // <0,5,4,4>: Cost 4 vzipl <0,4,1,5>, <5,4,5,6> - 2625916214U, // <0,5,4,5>: Cost 3 vext2 <1,4,0,5>, RHS - 3718901071U, // <0,5,4,6>: Cost 4 vext2 <4,6,0,5>, <4,6,0,5> - 2718715846U, // <0,5,4,7>: Cost 3 vext3 <5,6,7,0>, <5,4,7,6> - 2625916457U, // <0,5,4,u>: Cost 3 vext2 
<1,4,0,5>, RHS - 3791278034U, // <0,5,5,0>: Cost 4 vext3 <5,5,0,0>, <5,5,0,0> - 3791351771U, // <0,5,5,1>: Cost 4 vext3 <5,5,1,0>, <5,5,1,0> - 3318386260U, // <0,5,5,2>: Cost 4 vrev <5,0,2,5> - 3791499245U, // <0,5,5,3>: Cost 4 vext3 <5,5,3,0>, <5,5,3,0> - 3318533734U, // <0,5,5,4>: Cost 4 vrev <5,0,4,5> - 2718715908U, // <0,5,5,5>: Cost 3 vext3 <5,6,7,0>, <5,5,5,5> - 2657767522U, // <0,5,5,6>: Cost 3 vext2 <6,7,0,5>, <5,6,7,0> - 2718715928U, // <0,5,5,7>: Cost 3 vext3 <5,6,7,0>, <5,5,7,7> - 2718715937U, // <0,5,5,u>: Cost 3 vext3 <5,6,7,0>, <5,5,u,7> - 2592358502U, // <0,5,6,0>: Cost 3 vext1 <7,0,5,6>, LHS - 3792015404U, // <0,5,6,1>: Cost 4 vext3 <5,6,1,0>, <5,6,1,0> - 3731509754U, // <0,5,6,2>: Cost 4 vext2 <6,7,0,5>, <6,2,7,3> - 3785748546U, // <0,5,6,3>: Cost 4 vext3 <4,5,6,0>, <5,6,3,4> - 2592361782U, // <0,5,6,4>: Cost 3 vext1 <7,0,5,6>, RHS - 2592362594U, // <0,5,6,5>: Cost 3 vext1 <7,0,5,6>, <5,6,7,0> - 3785748576U, // <0,5,6,6>: Cost 4 vext3 <4,5,6,0>, <5,6,6,7> - 1644974178U, // <0,5,6,7>: Cost 2 vext3 <5,6,7,0>, <5,6,7,0> - 1645047915U, // <0,5,6,u>: Cost 2 vext3 <5,6,u,0>, <5,6,u,0> - 2562506854U, // <0,5,7,0>: Cost 3 vext1 <2,0,5,7>, LHS - 2562507670U, // <0,5,7,1>: Cost 3 vext1 <2,0,5,7>, <1,2,3,0> - 2562508262U, // <0,5,7,2>: Cost 3 vext1 <2,0,5,7>, <2,0,5,7> - 3636250774U, // <0,5,7,3>: Cost 4 vext1 <2,0,5,7>, <3,0,1,2> - 2562510134U, // <0,5,7,4>: Cost 3 vext1 <2,0,5,7>, RHS - 2718716072U, // <0,5,7,5>: Cost 3 vext3 <5,6,7,0>, <5,7,5,7> - 2718716074U, // <0,5,7,6>: Cost 3 vext3 <5,6,7,0>, <5,7,6,0> - 2719379635U, // <0,5,7,7>: Cost 3 vext3 <5,7,7,0>, <5,7,7,0> - 2562512686U, // <0,5,7,u>: Cost 3 vext1 <2,0,5,7>, LHS - 1500717158U, // <0,5,u,0>: Cost 2 vext1 <4,0,5,u>, LHS - 2625918766U, // <0,5,u,1>: Cost 3 vext2 <1,4,0,5>, LHS - 2719674583U, // <0,5,u,2>: Cost 3 vext3 <5,u,2,0>, <5,u,2,0> - 2568489152U, // <0,5,u,3>: Cost 3 vext1 <3,0,5,u>, <3,0,5,u> - 1500720025U, // <0,5,u,4>: Cost 2 vext1 <4,0,5,u>, <4,0,5,u> - 2625919130U, // <0,5,u,5>: Cost 3 vext2 <1,4,0,5>, RHS - 2586407243U, // <0,5,u,6>: Cost 3 vext1 <6,0,5,u>, <6,0,5,u> - 1646301444U, // <0,5,u,7>: Cost 2 vext3 <5,u,7,0>, <5,u,7,0> - 1646375181U, // <0,5,u,u>: Cost 2 vext3 <5,u,u,0>, <5,u,u,0> - 2586411110U, // <0,6,0,0>: Cost 3 vext1 <6,0,6,0>, LHS - 2619949158U, // <0,6,0,1>: Cost 3 vext2 <0,4,0,6>, LHS - 2619949220U, // <0,6,0,2>: Cost 3 vext2 <0,4,0,6>, <0,2,0,2> - 3785748789U, // <0,6,0,3>: Cost 4 vext3 <4,5,6,0>, <6,0,3,4> - 2619949386U, // <0,6,0,4>: Cost 3 vext2 <0,4,0,6>, <0,4,0,6> - 2586415202U, // <0,6,0,5>: Cost 3 vext1 <6,0,6,0>, <5,6,7,0> - 2586415436U, // <0,6,0,6>: Cost 3 vext1 <6,0,6,0>, <6,0,6,0> - 2952793398U, // <0,6,0,7>: Cost 3 vzipr <0,0,0,0>, RHS - 2619949725U, // <0,6,0,u>: Cost 3 vext2 <0,4,0,6>, LHS - 2562531430U, // <0,6,1,0>: Cost 3 vext1 <2,0,6,1>, LHS - 3693691700U, // <0,6,1,1>: Cost 4 vext2 <0,4,0,6>, <1,1,1,1> - 2886521338U, // <0,6,1,2>: Cost 3 vzipl LHS, <6,2,7,3> - 3693691864U, // <0,6,1,3>: Cost 4 vext2 <0,4,0,6>, <1,3,1,3> - 2562534710U, // <0,6,1,4>: Cost 3 vext1 <2,0,6,1>, RHS - 2580450932U, // <0,6,1,5>: Cost 3 vext1 <5,0,6,1>, <5,0,6,1> - 2886521656U, // <0,6,1,6>: Cost 3 vzipl LHS, <6,6,6,6> - 2966736182U, // <0,6,1,7>: Cost 3 vzipr <2,3,0,1>, RHS - 2966736183U, // <0,6,1,u>: Cost 3 vzipr <2,3,0,1>, RHS - 1500741734U, // <0,6,2,0>: Cost 2 vext1 <4,0,6,2>, LHS - 2250518817U, // <0,6,2,1>: Cost 3 vrev <6,0,1,2> - 2574485096U, // <0,6,2,2>: Cost 3 vext1 <4,0,6,2>, <2,2,2,2> - 2631894694U, // <0,6,2,3>: Cost 3 vext2 <2,4,0,6>, <2,3,0,1> - 1500744604U, // <0,6,2,4>: Cost 2 
vext1 <4,0,6,2>, <4,0,6,2> - 2574487248U, // <0,6,2,5>: Cost 3 vext1 <4,0,6,2>, <5,1,7,3> - 3020739384U, // <0,6,2,6>: Cost 3 vtrnl LHS, <6,6,6,6> - 2954136886U, // <0,6,2,7>: Cost 3 vzipr <0,2,0,2>, RHS - 1500747566U, // <0,6,2,u>: Cost 2 vext1 <4,0,6,2>, LHS - 3693693078U, // <0,6,3,0>: Cost 4 vext2 <0,4,0,6>, <3,0,1,2> - 3705637136U, // <0,6,3,1>: Cost 4 vext2 <2,4,0,6>, <3,1,5,7> - 3705637192U, // <0,6,3,2>: Cost 4 vext2 <2,4,0,6>, <3,2,3,0> - 3693693340U, // <0,6,3,3>: Cost 4 vext2 <0,4,0,6>, <3,3,3,3> - 2637867477U, // <0,6,3,4>: Cost 3 vext2 <3,4,0,6>, <3,4,0,6> - 3705637424U, // <0,6,3,5>: Cost 4 vext2 <2,4,0,6>, <3,5,1,7> - 3666154056U, // <0,6,3,6>: Cost 4 vext1 <7,0,6,3>, <6,3,7,0> - 2722697800U, // <0,6,3,7>: Cost 3 vext3 <6,3,7,0>, <6,3,7,0> - 2722771537U, // <0,6,3,u>: Cost 3 vext3 <6,3,u,0>, <6,3,u,0> - 2562556006U, // <0,6,4,0>: Cost 3 vext1 <2,0,6,4>, LHS - 4095316257U, // <0,6,4,1>: Cost 4 vtrnl <0,2,4,6>, <6,0,1,2> - 2562557420U, // <0,6,4,2>: Cost 3 vext1 <2,0,6,4>, <2,0,6,4> - 3636299926U, // <0,6,4,3>: Cost 4 vext1 <2,0,6,4>, <3,0,1,2> - 2562559286U, // <0,6,4,4>: Cost 3 vext1 <2,0,6,4>, RHS - 2619952438U, // <0,6,4,5>: Cost 3 vext2 <0,4,0,6>, RHS - 2723287696U, // <0,6,4,6>: Cost 3 vext3 <6,4,6,0>, <6,4,6,0> - 4027895094U, // <0,6,4,7>: Cost 4 vzipr <0,2,0,4>, RHS - 2619952681U, // <0,6,4,u>: Cost 3 vext2 <0,4,0,6>, RHS - 2718716594U, // <0,6,5,0>: Cost 3 vext3 <5,6,7,0>, <6,5,0,7> - 3648250774U, // <0,6,5,1>: Cost 4 vext1 <4,0,6,5>, <1,2,3,0> - 3792458436U, // <0,6,5,2>: Cost 4 vext3 <5,6,7,0>, <6,5,2,7> - 3705638767U, // <0,6,5,3>: Cost 5 vext2 <2,4,0,6>, <5,3,7,0> - 3648252831U, // <0,6,5,4>: Cost 4 vext1 <4,0,6,5>, <4,0,6,5> - 3797619416U, // <0,6,5,5>: Cost 4 vext3 <6,5,5,0>, <6,5,5,0> - 3792458472U, // <0,6,5,6>: Cost 4 vext3 <5,6,7,0>, <6,5,6,7> - 4035202358U, // <0,6,5,7>: Cost 4 vzipr <1,4,0,5>, RHS - 2718716594U, // <0,6,5,u>: Cost 3 vext3 <5,6,7,0>, <6,5,0,7> - 3786412796U, // <0,6,6,0>: Cost 4 vext3 <4,6,6,0>, <6,6,0,0> - 3792458504U, // <0,6,6,1>: Cost 4 vext3 <5,6,7,0>, <6,6,1,3> - 3728200126U, // <0,6,6,2>: Cost 4 vext2 <6,2,0,6>, <6,2,0,6> - 3798135575U, // <0,6,6,3>: Cost 4 vext3 <6,6,3,0>, <6,6,3,0> - 3786412836U, // <0,6,6,4>: Cost 4 vext3 <4,6,6,0>, <6,6,4,4> - 3792458543U, // <0,6,6,5>: Cost 4 vext3 <5,6,7,0>, <6,6,5,6> - 2718716728U, // <0,6,6,6>: Cost 3 vext3 <5,6,7,0>, <6,6,6,6> - 2718716738U, // <0,6,6,7>: Cost 3 vext3 <5,6,7,0>, <6,6,7,7> - 2718716747U, // <0,6,6,u>: Cost 3 vext3 <5,6,7,0>, <6,6,u,7> - 2718716750U, // <0,6,7,0>: Cost 3 vext3 <5,6,7,0>, <6,7,0,1> - 2724909910U, // <0,6,7,1>: Cost 3 vext3 <6,7,1,0>, <6,7,1,0> - 3636323823U, // <0,6,7,2>: Cost 4 vext1 <2,0,6,7>, <2,0,6,7> - 2725057384U, // <0,6,7,3>: Cost 3 vext3 <6,7,3,0>, <6,7,3,0> - 2718716790U, // <0,6,7,4>: Cost 3 vext3 <5,6,7,0>, <6,7,4,5> - 2718716800U, // <0,6,7,5>: Cost 3 vext3 <5,6,7,0>, <6,7,5,6> - 3792458629U, // <0,6,7,6>: Cost 4 vext3 <5,6,7,0>, <6,7,6,2> - 2725352332U, // <0,6,7,7>: Cost 3 vext3 <6,7,7,0>, <6,7,7,0> - 2718716822U, // <0,6,7,u>: Cost 3 vext3 <5,6,7,0>, <6,7,u,1> - 1500790886U, // <0,6,u,0>: Cost 2 vext1 <4,0,6,u>, LHS - 2619954990U, // <0,6,u,1>: Cost 3 vext2 <0,4,0,6>, LHS - 2562590192U, // <0,6,u,2>: Cost 3 vext1 <2,0,6,u>, <2,0,6,u> - 2725721017U, // <0,6,u,3>: Cost 3 vext3 <6,u,3,0>, <6,u,3,0> - 1500793762U, // <0,6,u,4>: Cost 2 vext1 <4,0,6,u>, <4,0,6,u> - 2619955354U, // <0,6,u,5>: Cost 3 vext2 <0,4,0,6>, RHS - 2725942228U, // <0,6,u,6>: Cost 3 vext3 <6,u,6,0>, <6,u,6,0> - 2954186038U, // <0,6,u,7>: Cost 3 vzipr <0,2,0,u>, RHS - 1500796718U, 
// <0,6,u,u>: Cost 2 vext1 <4,0,6,u>, LHS - 2256401391U, // <0,7,0,0>: Cost 3 vrev <7,0,0,0> - 2632564838U, // <0,7,0,1>: Cost 3 vext2 <2,5,0,7>, LHS - 2256548865U, // <0,7,0,2>: Cost 3 vrev <7,0,2,0> - 3700998396U, // <0,7,0,3>: Cost 4 vext2 <1,6,0,7>, <0,3,1,0> - 2718716952U, // <0,7,0,4>: Cost 3 vext3 <5,6,7,0>, <7,0,4,5> - 2718716962U, // <0,7,0,5>: Cost 3 vext3 <5,6,7,0>, <7,0,5,6> - 2621284845U, // <0,7,0,6>: Cost 3 vext2 <0,6,0,7>, <0,6,0,7> - 3904685542U, // <0,7,0,7>: Cost 4 vuzpr <2,0,5,7>, <2,0,5,7> - 2632565405U, // <0,7,0,u>: Cost 3 vext2 <2,5,0,7>, LHS - 2256409584U, // <0,7,1,0>: Cost 3 vrev <7,0,0,1> - 3706307380U, // <0,7,1,1>: Cost 4 vext2 <2,5,0,7>, <1,1,1,1> - 2632565654U, // <0,7,1,2>: Cost 3 vext2 <2,5,0,7>, <1,2,3,0> - 3769603168U, // <0,7,1,3>: Cost 4 vext3 <1,u,3,0>, <7,1,3,5> - 2256704532U, // <0,7,1,4>: Cost 3 vrev <7,0,4,1> - 3769603184U, // <0,7,1,5>: Cost 4 vext3 <1,u,3,0>, <7,1,5,3> - 3700999366U, // <0,7,1,6>: Cost 4 vext2 <1,6,0,7>, <1,6,0,7> - 2886522476U, // <0,7,1,7>: Cost 3 vzipl LHS, <7,7,7,7> - 2256999480U, // <0,7,1,u>: Cost 3 vrev <7,0,u,1> - 2586501222U, // <0,7,2,0>: Cost 3 vext1 <6,0,7,2>, LHS - 1182749690U, // <0,7,2,1>: Cost 2 vrev <7,0,1,2> - 3636356595U, // <0,7,2,2>: Cost 4 vext1 <2,0,7,2>, <2,0,7,2> - 2727711916U, // <0,7,2,3>: Cost 3 vext3 <7,2,3,0>, <7,2,3,0> - 2586504502U, // <0,7,2,4>: Cost 3 vext1 <6,0,7,2>, RHS - 2632566606U, // <0,7,2,5>: Cost 3 vext2 <2,5,0,7>, <2,5,0,7> - 2586505559U, // <0,7,2,6>: Cost 3 vext1 <6,0,7,2>, <6,0,7,2> - 3020740204U, // <0,7,2,7>: Cost 3 vtrnl LHS, <7,7,7,7> - 1183265849U, // <0,7,2,u>: Cost 2 vrev <7,0,u,2> - 3701000342U, // <0,7,3,0>: Cost 4 vext2 <1,6,0,7>, <3,0,1,2> - 3706308849U, // <0,7,3,1>: Cost 4 vext2 <2,5,0,7>, <3,1,2,3> - 3330315268U, // <0,7,3,2>: Cost 4 vrev <7,0,2,3> - 3706309020U, // <0,7,3,3>: Cost 4 vext2 <2,5,0,7>, <3,3,3,3> - 3706309122U, // <0,7,3,4>: Cost 4 vext2 <2,5,0,7>, <3,4,5,6> - 3712281127U, // <0,7,3,5>: Cost 4 vext2 <3,5,0,7>, <3,5,0,7> - 2639202936U, // <0,7,3,6>: Cost 3 vext2 <3,6,0,7>, <3,6,0,7> - 3802412321U, // <0,7,3,7>: Cost 4 vext3 <7,3,7,0>, <7,3,7,0> - 2640530202U, // <0,7,3,u>: Cost 3 vext2 <3,u,0,7>, <3,u,0,7> - 3654287462U, // <0,7,4,0>: Cost 4 vext1 <5,0,7,4>, LHS - 2256507900U, // <0,7,4,1>: Cost 3 vrev <7,0,1,4> - 2256581637U, // <0,7,4,2>: Cost 3 vrev <7,0,2,4> - 3660262008U, // <0,7,4,3>: Cost 4 vext1 <6,0,7,4>, <3,6,0,7> - 3786413405U, // <0,7,4,4>: Cost 4 vext3 <4,6,6,0>, <7,4,4,6> - 2632568118U, // <0,7,4,5>: Cost 3 vext2 <2,5,0,7>, RHS - 3718917457U, // <0,7,4,6>: Cost 4 vext2 <4,6,0,7>, <4,6,0,7> - 3787003255U, // <0,7,4,7>: Cost 4 vext3 <4,7,5,0>, <7,4,7,5> - 2632568361U, // <0,7,4,u>: Cost 3 vext2 <2,5,0,7>, RHS - 3706310268U, // <0,7,5,0>: Cost 4 vext2 <2,5,0,7>, <5,0,7,0> - 3792459156U, // <0,7,5,1>: Cost 4 vext3 <5,6,7,0>, <7,5,1,7> - 3330331654U, // <0,7,5,2>: Cost 4 vrev <7,0,2,5> - 3722899255U, // <0,7,5,3>: Cost 4 vext2 <5,3,0,7>, <5,3,0,7> - 2256737304U, // <0,7,5,4>: Cost 3 vrev <7,0,4,5> - 3724226521U, // <0,7,5,5>: Cost 4 vext2 <5,5,0,7>, <5,5,0,7> - 2718717377U, // <0,7,5,6>: Cost 3 vext3 <5,6,7,0>, <7,5,6,7> - 2729997763U, // <0,7,5,7>: Cost 3 vext3 <7,5,7,0>, <7,5,7,0> - 2720044499U, // <0,7,5,u>: Cost 3 vext3 <5,u,7,0>, <7,5,u,7> - 3712946517U, // <0,7,6,0>: Cost 4 vext2 <3,6,0,7>, <6,0,7,0> - 2256524286U, // <0,7,6,1>: Cost 3 vrev <7,0,1,6> - 3792459246U, // <0,7,6,2>: Cost 4 vext3 <5,6,7,0>, <7,6,2,7> - 3796440567U, // <0,7,6,3>: Cost 4 vext3 <6,3,7,0>, <7,6,3,7> - 3654307126U, // <0,7,6,4>: Cost 4 vext1 <5,0,7,6>, RHS - 
2656457394U, // <0,7,6,5>: Cost 3 vext2 <6,5,0,7>, <6,5,0,7> - 3792459281U, // <0,7,6,6>: Cost 4 vext3 <5,6,7,0>, <7,6,6,6> - 2730661396U, // <0,7,6,7>: Cost 3 vext3 <7,6,7,0>, <7,6,7,0> - 2658448293U, // <0,7,6,u>: Cost 3 vext2 <6,u,0,7>, <6,u,0,7> - 3787003431U, // <0,7,7,0>: Cost 4 vext3 <4,7,5,0>, <7,7,0,1> - 3654312854U, // <0,7,7,1>: Cost 4 vext1 <5,0,7,7>, <1,2,3,0> - 3654313446U, // <0,7,7,2>: Cost 4 vext1 <5,0,7,7>, <2,0,5,7> - 3804771905U, // <0,7,7,3>: Cost 4 vext3 <7,7,3,0>, <7,7,3,0> - 3654315318U, // <0,7,7,4>: Cost 4 vext1 <5,0,7,7>, RHS - 3654315651U, // <0,7,7,5>: Cost 4 vext1 <5,0,7,7>, <5,0,7,7> - 3660288348U, // <0,7,7,6>: Cost 4 vext1 <6,0,7,7>, <6,0,7,7> - 2718717548U, // <0,7,7,7>: Cost 3 vext3 <5,6,7,0>, <7,7,7,7> - 2664420990U, // <0,7,7,u>: Cost 3 vext2 <7,u,0,7>, <7,u,0,7> - 2256466935U, // <0,7,u,0>: Cost 3 vrev <7,0,0,u> - 1182798848U, // <0,7,u,1>: Cost 2 vrev <7,0,1,u> - 2256614409U, // <0,7,u,2>: Cost 3 vrev <7,0,2,u> - 2731693714U, // <0,7,u,3>: Cost 3 vext3 <7,u,3,0>, <7,u,3,0> - 2256761883U, // <0,7,u,4>: Cost 3 vrev <7,0,4,u> - 2632571034U, // <0,7,u,5>: Cost 3 vext2 <2,5,0,7>, RHS - 2669066421U, // <0,7,u,6>: Cost 3 vext2 , - 2731988662U, // <0,7,u,7>: Cost 3 vext3 <7,u,7,0>, <7,u,7,0> - 1183315007U, // <0,7,u,u>: Cost 2 vrev <7,0,u,u> - 135053414U, // <0,u,0,0>: Cost 1 vdup0 LHS - 1544896614U, // <0,u,0,1>: Cost 2 vext2 <0,2,0,u>, LHS - 1678999654U, // <0,u,0,2>: Cost 2 vuzpl LHS, LHS - 2691880677U, // <0,u,0,3>: Cost 3 vext3 <1,2,3,0>, - 1476988214U, // <0,u,0,4>: Cost 2 vext1 <0,0,u,0>, RHS - 2718791419U, // <0,u,0,5>: Cost 3 vext3 <5,6,u,0>, - 3021248666U, // <0,u,0,6>: Cost 3 vtrnl <0,2,0,2>, RHS - 2592535607U, // <0,u,0,7>: Cost 3 vext1 <7,0,u,0>, <7,0,u,0> - 135053414U, // <0,u,0,u>: Cost 1 vdup0 LHS - 1476993097U, // <0,u,1,0>: Cost 2 vext1 <0,0,u,1>, <0,0,u,1> - 1812780846U, // <0,u,1,1>: Cost 2 vzipl LHS, LHS - 1618138926U, // <0,u,1,2>: Cost 2 vext3 <1,2,3,0>, LHS - 2752742134U, // <0,u,1,3>: Cost 3 vuzpl LHS, <1,0,3,2> - 1476996406U, // <0,u,1,4>: Cost 2 vext1 <0,0,u,1>, RHS - 1812781210U, // <0,u,1,5>: Cost 2 vzipl LHS, RHS - 2887006416U, // <0,u,1,6>: Cost 3 vzipl LHS, - 2966736200U, // <0,u,1,7>: Cost 3 vzipr <2,3,0,1>, RHS - 1812781413U, // <0,u,1,u>: Cost 2 vzipl LHS, LHS - 1482973286U, // <0,u,2,0>: Cost 2 vext1 <1,0,u,2>, LHS - 1482973987U, // <0,u,2,1>: Cost 2 vext1 <1,0,u,2>, <1,0,u,2> - 1946998574U, // <0,u,2,2>: Cost 2 vtrnl LHS, LHS - 835584U, // <0,u,2,3>: Cost 0 copy LHS - 1482976566U, // <0,u,2,4>: Cost 2 vext1 <1,0,u,2>, RHS - 3020781631U, // <0,u,2,5>: Cost 3 vtrnl LHS, - 1946998938U, // <0,u,2,6>: Cost 2 vtrnl LHS, RHS - 1518810169U, // <0,u,2,7>: Cost 2 vext1 <7,0,u,2>, <7,0,u,2> - 835584U, // <0,u,2,u>: Cost 0 copy LHS - 2618640534U, // <0,u,3,0>: Cost 3 vext2 <0,2,0,u>, <3,0,1,2> - 2752743574U, // <0,u,3,1>: Cost 3 vuzpl LHS, <3,0,1,2> - 2636556597U, // <0,u,3,2>: Cost 3 vext2 <3,2,0,u>, <3,2,0,u> - 2752743836U, // <0,u,3,3>: Cost 3 vuzpl LHS, <3,3,3,3> - 2618640898U, // <0,u,3,4>: Cost 3 vext2 <0,2,0,u>, <3,4,5,6> - 2752743938U, // <0,u,3,5>: Cost 3 vuzpl LHS, <3,4,5,6> - 2639202936U, // <0,u,3,6>: Cost 3 vext2 <3,6,0,7>, <3,6,0,7> - 2639874762U, // <0,u,3,7>: Cost 3 vext2 <3,7,0,u>, <3,7,0,u> - 2752743637U, // <0,u,3,u>: Cost 3 vuzpl LHS, <3,0,u,2> - 2562703462U, // <0,u,4,0>: Cost 3 vext1 <2,0,u,4>, LHS - 2888455982U, // <0,u,4,1>: Cost 3 vzipl <0,4,1,5>, LHS - 3021575982U, // <0,u,4,2>: Cost 3 vtrnl <0,2,4,6>, LHS - 2568677591U, // <0,u,4,3>: Cost 3 vext1 <3,0,u,4>, <3,0,u,4> - 2562706742U, // <0,u,4,4>: Cost 3 vext1 
<2,0,u,4>, RHS - 1544899894U, // <0,u,4,5>: Cost 2 vext2 <0,2,0,u>, RHS - 1679002934U, // <0,u,4,6>: Cost 2 vuzpl LHS, RHS - 2718718033U, // <0,u,4,7>: Cost 3 vext3 <5,6,7,0>, - 1679002952U, // <0,u,4,u>: Cost 2 vuzpl LHS, RHS - 2568683622U, // <0,u,5,0>: Cost 3 vext1 <3,0,u,5>, LHS - 2568684438U, // <0,u,5,1>: Cost 3 vext1 <3,0,u,5>, <1,2,3,0> - 3765622902U, // <0,u,5,2>: Cost 4 vext3 <1,2,3,0>, - 2691881087U, // <0,u,5,3>: Cost 3 vext3 <1,2,3,0>, - 2568686902U, // <0,u,5,4>: Cost 3 vext1 <3,0,u,5>, RHS - 2650492890U, // <0,u,5,5>: Cost 3 vext2 <5,5,0,u>, <5,5,0,u> - 1618139290U, // <0,u,5,6>: Cost 2 vext3 <1,2,3,0>, RHS - 2824834358U, // <0,u,5,7>: Cost 3 vuzpr <1,0,3,u>, RHS - 1618139308U, // <0,u,5,u>: Cost 2 vext3 <1,2,3,0>, RHS - 2592579686U, // <0,u,6,0>: Cost 3 vext1 <7,0,u,6>, LHS - 2262496983U, // <0,u,6,1>: Cost 3 vrev - 2654474688U, // <0,u,6,2>: Cost 3 vext2 <6,2,0,u>, <6,2,0,u> - 2691881168U, // <0,u,6,3>: Cost 3 vext3 <1,2,3,0>, - 2592582966U, // <0,u,6,4>: Cost 3 vext1 <7,0,u,6>, RHS - 2656465587U, // <0,u,6,5>: Cost 3 vext2 <6,5,0,u>, <6,5,0,u> - 2657129220U, // <0,u,6,6>: Cost 3 vext2 <6,6,0,u>, <6,6,0,u> - 1584051029U, // <0,u,6,7>: Cost 2 vext2 <6,7,0,u>, <6,7,0,u> - 1584714662U, // <0,u,6,u>: Cost 2 vext2 <6,u,0,u>, <6,u,0,u> - 2562728038U, // <0,u,7,0>: Cost 3 vext1 <2,0,u,7>, LHS - 2562728854U, // <0,u,7,1>: Cost 3 vext1 <2,0,u,7>, <1,2,3,0> - 2562729473U, // <0,u,7,2>: Cost 3 vext1 <2,0,u,7>, <2,0,u,7> - 2661111018U, // <0,u,7,3>: Cost 3 vext2 <7,3,0,u>, <7,3,0,u> - 2562731318U, // <0,u,7,4>: Cost 3 vext1 <2,0,u,7>, RHS - 2718718258U, // <0,u,7,5>: Cost 3 vext3 <5,6,7,0>, - 2586620261U, // <0,u,7,6>: Cost 3 vext1 <6,0,u,7>, <6,0,u,7> - 2657793644U, // <0,u,7,7>: Cost 3 vext2 <6,7,0,u>, <7,7,7,7> - 2562733870U, // <0,u,7,u>: Cost 3 vext1 <2,0,u,7>, LHS - 135053414U, // <0,u,u,0>: Cost 1 vdup0 LHS - 1544902446U, // <0,u,u,1>: Cost 2 vext2 <0,2,0,u>, LHS - 1679005486U, // <0,u,u,2>: Cost 2 vuzpl LHS, LHS - 835584U, // <0,u,u,3>: Cost 0 copy LHS - 1483025718U, // <0,u,u,4>: Cost 2 vext1 <1,0,u,u>, RHS - 1544902810U, // <0,u,u,5>: Cost 2 vext2 <0,2,0,u>, RHS - 1679005850U, // <0,u,u,6>: Cost 2 vuzpl LHS, RHS - 1518859327U, // <0,u,u,7>: Cost 2 vext1 <7,0,u,u>, <7,0,u,u> - 835584U, // <0,u,u,u>: Cost 0 copy LHS - 2689744896U, // <1,0,0,0>: Cost 3 vext3 <0,u,1,1>, <0,0,0,0> - 1610694666U, // <1,0,0,1>: Cost 2 vext3 <0,0,1,1>, <0,0,1,1> - 2689744916U, // <1,0,0,2>: Cost 3 vext3 <0,u,1,1>, <0,0,2,2> - 2619310332U, // <1,0,0,3>: Cost 3 vext2 <0,3,1,0>, <0,3,1,0> - 2684657701U, // <1,0,0,4>: Cost 3 vext3 <0,0,4,1>, <0,0,4,1> - 2620637598U, // <1,0,0,5>: Cost 3 vext2 <0,5,1,0>, <0,5,1,0> - 3708977654U, // <1,0,0,6>: Cost 4 vext2 <3,0,1,0>, <0,6,1,7> - 3666351168U, // <1,0,0,7>: Cost 4 vext1 <7,1,0,0>, <7,1,0,0> - 1611210825U, // <1,0,0,u>: Cost 2 vext3 <0,0,u,1>, <0,0,u,1> - 2556780646U, // <1,0,1,0>: Cost 3 vext1 <1,1,0,1>, LHS - 2556781355U, // <1,0,1,1>: Cost 3 vext1 <1,1,0,1>, <1,1,0,1> - 1616003174U, // <1,0,1,2>: Cost 2 vext3 <0,u,1,1>, LHS - 3693052888U, // <1,0,1,3>: Cost 4 vext2 <0,3,1,0>, <1,3,1,3> - 2556783926U, // <1,0,1,4>: Cost 3 vext1 <1,1,0,1>, RHS - 2580672143U, // <1,0,1,5>: Cost 3 vext1 <5,1,0,1>, <5,1,0,1> - 2724839566U, // <1,0,1,6>: Cost 3 vext3 <6,7,0,1>, <0,1,6,7> - 3654415354U, // <1,0,1,7>: Cost 4 vext1 <5,1,0,1>, <7,0,1,2> - 1616003228U, // <1,0,1,u>: Cost 2 vext3 <0,u,1,1>, LHS - 2685690019U, // <1,0,2,0>: Cost 3 vext3 <0,2,0,1>, <0,2,0,1> - 2685763756U, // <1,0,2,1>: Cost 3 vext3 <0,2,1,1>, <0,2,1,1> - 2698297524U, // <1,0,2,2>: Cost 3 vext3 
<2,3,0,1>, <0,2,2,0> - 2685911230U, // <1,0,2,3>: Cost 3 vext3 <0,2,3,1>, <0,2,3,1> - 2689745100U, // <1,0,2,4>: Cost 3 vext3 <0,u,1,1>, <0,2,4,6> - 3764814038U, // <1,0,2,5>: Cost 4 vext3 <1,1,1,1>, <0,2,5,7> - 2724839640U, // <1,0,2,6>: Cost 3 vext3 <6,7,0,1>, <0,2,6,0> - 2592625658U, // <1,0,2,7>: Cost 3 vext1 <7,1,0,2>, <7,0,1,2> - 2686279915U, // <1,0,2,u>: Cost 3 vext3 <0,2,u,1>, <0,2,u,1> - 3087843328U, // <1,0,3,0>: Cost 3 vtrnr LHS, <0,0,0,0> - 3087843338U, // <1,0,3,1>: Cost 3 vtrnr LHS, <0,0,1,1> - 67944550U, // <1,0,3,2>: Cost 1 vrev LHS - 2568743135U, // <1,0,3,3>: Cost 3 vext1 <3,1,0,3>, <3,1,0,3> - 2562772278U, // <1,0,3,4>: Cost 3 vext1 <2,1,0,3>, RHS - 4099850454U, // <1,0,3,5>: Cost 4 vtrnl <1,0,3,2>, <0,2,5,7> - 3704998538U, // <1,0,3,6>: Cost 4 vext2 <2,3,1,0>, <3,6,2,7> - 2592633923U, // <1,0,3,7>: Cost 3 vext1 <7,1,0,3>, <7,1,0,3> - 68386972U, // <1,0,3,u>: Cost 1 vrev LHS - 2620640146U, // <1,0,4,0>: Cost 3 vext2 <0,5,1,0>, <4,0,5,1> - 2689745234U, // <1,0,4,1>: Cost 3 vext3 <0,u,1,1>, <0,4,1,5> - 2689745244U, // <1,0,4,2>: Cost 3 vext3 <0,u,1,1>, <0,4,2,6> - 3760980320U, // <1,0,4,3>: Cost 4 vext3 <0,4,3,1>, <0,4,3,1> - 3761054057U, // <1,0,4,4>: Cost 4 vext3 <0,4,4,1>, <0,4,4,1> - 2619313462U, // <1,0,4,5>: Cost 3 vext2 <0,3,1,0>, RHS - 3761201531U, // <1,0,4,6>: Cost 4 vext3 <0,4,6,1>, <0,4,6,1> - 3666383940U, // <1,0,4,7>: Cost 4 vext1 <7,1,0,4>, <7,1,0,4> - 2619313705U, // <1,0,4,u>: Cost 3 vext2 <0,3,1,0>, RHS - 4029300736U, // <1,0,5,0>: Cost 4 vzipr <0,4,1,5>, <0,0,0,0> - 2895249510U, // <1,0,5,1>: Cost 3 vzipl <1,5,3,7>, LHS - 3028287590U, // <1,0,5,2>: Cost 3 vtrnl <1,3,5,7>, LHS - 3642501345U, // <1,0,5,3>: Cost 4 vext1 <3,1,0,5>, <3,1,0,5> - 2215592058U, // <1,0,5,4>: Cost 3 vrev <0,1,4,5> - 3724242907U, // <1,0,5,5>: Cost 4 vext2 <5,5,1,0>, <5,5,1,0> - 3724906540U, // <1,0,5,6>: Cost 4 vext2 <5,6,1,0>, <5,6,1,0> - 3911118134U, // <1,0,5,7>: Cost 4 vuzpr <3,1,3,0>, RHS - 3028287644U, // <1,0,5,u>: Cost 3 vtrnl <1,3,5,7>, LHS - 3762086375U, // <1,0,6,0>: Cost 4 vext3 <0,6,0,1>, <0,6,0,1> - 2698297846U, // <1,0,6,1>: Cost 3 vext3 <2,3,0,1>, <0,6,1,7> - 3760022015U, // <1,0,6,2>: Cost 4 vext3 <0,2,u,1>, <0,6,2,7> - 3642509538U, // <1,0,6,3>: Cost 4 vext1 <3,1,0,6>, <3,1,0,6> - 3762381323U, // <1,0,6,4>: Cost 4 vext3 <0,6,4,1>, <0,6,4,1> - 3730215604U, // <1,0,6,5>: Cost 4 vext2 <6,5,1,0>, <6,5,1,0> - 3730879237U, // <1,0,6,6>: Cost 4 vext2 <6,6,1,0>, <6,6,1,0> - 2657801046U, // <1,0,6,7>: Cost 3 vext2 <6,7,1,0>, <6,7,1,0> - 2658464679U, // <1,0,6,u>: Cost 3 vext2 <6,u,1,0>, <6,u,1,0> - 2659128312U, // <1,0,7,0>: Cost 3 vext2 <7,0,1,0>, <7,0,1,0> - 4047898278U, // <1,0,7,1>: Cost 4 vzipr <3,5,1,7>, <2,3,0,1> - 2215460970U, // <1,0,7,2>: Cost 3 vrev <0,1,2,7> - 3734861035U, // <1,0,7,3>: Cost 4 vext2 <7,3,1,0>, <7,3,1,0> - 3731543398U, // <1,0,7,4>: Cost 4 vext2 <6,7,1,0>, <7,4,5,6> - 3736188301U, // <1,0,7,5>: Cost 4 vext2 <7,5,1,0>, <7,5,1,0> - 2663110110U, // <1,0,7,6>: Cost 3 vext2 <7,6,1,0>, <7,6,1,0> - 3731543660U, // <1,0,7,7>: Cost 4 vext2 <6,7,1,0>, <7,7,7,7> - 2664437376U, // <1,0,7,u>: Cost 3 vext2 <7,u,1,0>, <7,u,1,0> - 3087884288U, // <1,0,u,0>: Cost 3 vtrnr LHS, <0,0,0,0> - 1616003730U, // <1,0,u,1>: Cost 2 vext3 <0,u,1,1>, <0,u,1,1> - 67985515U, // <1,0,u,2>: Cost 1 vrev LHS - 2689893028U, // <1,0,u,3>: Cost 3 vext3 <0,u,3,1>, <0,u,3,1> - 2689745586U, // <1,0,u,4>: Cost 3 vext3 <0,u,1,1>, <0,u,4,6> - 2619316378U, // <1,0,u,5>: Cost 3 vext2 <0,3,1,0>, RHS - 2669082807U, // <1,0,u,6>: Cost 3 vext2 , - 2592674888U, // <1,0,u,7>: Cost 3 vext1 
<7,1,0,u>, <7,1,0,u> - 68427937U, // <1,0,u,u>: Cost 1 vrev LHS - 1543585802U, // <1,1,0,0>: Cost 2 vext2 <0,0,1,1>, <0,0,1,1> - 1548894310U, // <1,1,0,1>: Cost 2 vext2 <0,u,1,1>, LHS - 2618654892U, // <1,1,0,2>: Cost 3 vext2 <0,2,1,1>, <0,2,1,1> - 2689745654U, // <1,1,0,3>: Cost 3 vext3 <0,u,1,1>, <1,0,3,2> - 2622636370U, // <1,1,0,4>: Cost 3 vext2 <0,u,1,1>, <0,4,1,5> - 2620645791U, // <1,1,0,5>: Cost 3 vext2 <0,5,1,1>, <0,5,1,1> - 3696378367U, // <1,1,0,6>: Cost 4 vext2 <0,u,1,1>, <0,6,2,7> - 3666424905U, // <1,1,0,7>: Cost 4 vext1 <7,1,1,0>, <7,1,1,0> - 1548894866U, // <1,1,0,u>: Cost 2 vext2 <0,u,1,1>, <0,u,1,1> - 1483112550U, // <1,1,1,0>: Cost 2 vext1 <1,1,1,1>, LHS - 202162278U, // <1,1,1,1>: Cost 1 vdup1 LHS - 2622636950U, // <1,1,1,2>: Cost 3 vext2 <0,u,1,1>, <1,2,3,0> - 2622637016U, // <1,1,1,3>: Cost 3 vext2 <0,u,1,1>, <1,3,1,3> - 1483115830U, // <1,1,1,4>: Cost 2 vext1 <1,1,1,1>, RHS - 2622637200U, // <1,1,1,5>: Cost 3 vext2 <0,u,1,1>, <1,5,3,7> - 2622637263U, // <1,1,1,6>: Cost 3 vext2 <0,u,1,1>, <1,6,1,7> - 2592691274U, // <1,1,1,7>: Cost 3 vext1 <7,1,1,1>, <7,1,1,1> - 202162278U, // <1,1,1,u>: Cost 1 vdup1 LHS - 2550890588U, // <1,1,2,0>: Cost 3 vext1 <0,1,1,2>, <0,1,1,2> - 2617329183U, // <1,1,2,1>: Cost 3 vext2 <0,0,1,1>, <2,1,3,1> - 2622637672U, // <1,1,2,2>: Cost 3 vext2 <0,u,1,1>, <2,2,2,2> - 2622637734U, // <1,1,2,3>: Cost 3 vext2 <0,u,1,1>, <2,3,0,1> - 2550893878U, // <1,1,2,4>: Cost 3 vext1 <0,1,1,2>, RHS - 3696379744U, // <1,1,2,5>: Cost 4 vext2 <0,u,1,1>, <2,5,2,7> - 2622638010U, // <1,1,2,6>: Cost 3 vext2 <0,u,1,1>, <2,6,3,7> - 3804554170U, // <1,1,2,7>: Cost 4 vext3 <7,7,0,1>, <1,2,7,0> - 2622638139U, // <1,1,2,u>: Cost 3 vext2 <0,u,1,1>, <2,u,0,1> - 2622638230U, // <1,1,3,0>: Cost 3 vext2 <0,u,1,1>, <3,0,1,2> - 3087844148U, // <1,1,3,1>: Cost 3 vtrnr LHS, <1,1,1,1> - 4161585244U, // <1,1,3,2>: Cost 4 vtrnr LHS, <0,1,1,2> - 2014101606U, // <1,1,3,3>: Cost 2 vtrnr LHS, LHS - 2622638594U, // <1,1,3,4>: Cost 3 vext2 <0,u,1,1>, <3,4,5,6> - 2689745920U, // <1,1,3,5>: Cost 3 vext3 <0,u,1,1>, <1,3,5,7> - 3763487753U, // <1,1,3,6>: Cost 4 vext3 <0,u,1,1>, <1,3,6,7> - 2592707660U, // <1,1,3,7>: Cost 3 vext1 <7,1,1,3>, <7,1,1,3> - 2014101611U, // <1,1,3,u>: Cost 2 vtrnr LHS, LHS - 2556878950U, // <1,1,4,0>: Cost 3 vext1 <1,1,1,4>, LHS - 2221335351U, // <1,1,4,1>: Cost 3 vrev <1,1,1,4> - 3696380988U, // <1,1,4,2>: Cost 4 vext2 <0,u,1,1>, <4,2,6,0> - 3763487805U, // <1,1,4,3>: Cost 4 vext3 <0,u,1,1>, <1,4,3,5> - 2556882230U, // <1,1,4,4>: Cost 3 vext1 <1,1,1,4>, RHS - 1548897590U, // <1,1,4,5>: Cost 2 vext2 <0,u,1,1>, RHS - 2758184246U, // <1,1,4,6>: Cost 3 vuzpl <1,1,1,1>, RHS - 3666457677U, // <1,1,4,7>: Cost 4 vext1 <7,1,1,4>, <7,1,1,4> - 1548897833U, // <1,1,4,u>: Cost 2 vext2 <0,u,1,1>, RHS - 2693653615U, // <1,1,5,0>: Cost 3 vext3 <1,5,0,1>, <1,5,0,1> - 2617331408U, // <1,1,5,1>: Cost 3 vext2 <0,0,1,1>, <5,1,7,3> - 4029302934U, // <1,1,5,2>: Cost 4 vzipr <0,4,1,5>, <3,0,1,2> - 2689746064U, // <1,1,5,3>: Cost 3 vext3 <0,u,1,1>, <1,5,3,7> - 2221564755U, // <1,1,5,4>: Cost 3 vrev <1,1,4,5> - 2955559250U, // <1,1,5,5>: Cost 3 vzipr <0,4,1,5>, <0,4,1,5> - 2617331810U, // <1,1,5,6>: Cost 3 vext2 <0,0,1,1>, <5,6,7,0> - 2825293110U, // <1,1,5,7>: Cost 3 vuzpr <1,1,1,1>, RHS - 2689746109U, // <1,1,5,u>: Cost 3 vext3 <0,u,1,1>, <1,5,u,7> - 3696382241U, // <1,1,6,0>: Cost 4 vext2 <0,u,1,1>, <6,0,1,2> - 2689746127U, // <1,1,6,1>: Cost 3 vext3 <0,u,1,1>, <1,6,1,7> - 2617332218U, // <1,1,6,2>: Cost 3 vext2 <0,0,1,1>, <6,2,7,3> - 3763487969U, // <1,1,6,3>: Cost 4 vext3 <0,u,1,1>, 
<1,6,3,7> - 3696382605U, // <1,1,6,4>: Cost 4 vext2 <0,u,1,1>, <6,4,5,6> - 4029309266U, // <1,1,6,5>: Cost 4 vzipr <0,4,1,6>, <0,4,1,5> - 2617332536U, // <1,1,6,6>: Cost 3 vext2 <0,0,1,1>, <6,6,6,6> - 2724840702U, // <1,1,6,7>: Cost 3 vext3 <6,7,0,1>, <1,6,7,0> - 2725504263U, // <1,1,6,u>: Cost 3 vext3 <6,u,0,1>, <1,6,u,0> - 2617332720U, // <1,1,7,0>: Cost 3 vext2 <0,0,1,1>, <7,0,0,1> - 2659800138U, // <1,1,7,1>: Cost 3 vext2 <7,1,1,1>, <7,1,1,1> - 3691074717U, // <1,1,7,2>: Cost 4 vext2 <0,0,1,1>, <7,2,1,3> - 4167811174U, // <1,1,7,3>: Cost 4 vtrnr <1,1,5,7>, LHS - 2617333094U, // <1,1,7,4>: Cost 3 vext2 <0,0,1,1>, <7,4,5,6> - 3295396702U, // <1,1,7,5>: Cost 4 vrev <1,1,5,7> - 3803891014U, // <1,1,7,6>: Cost 4 vext3 <7,6,0,1>, <1,7,6,0> - 2617333356U, // <1,1,7,7>: Cost 3 vext2 <0,0,1,1>, <7,7,7,7> - 2659800138U, // <1,1,7,u>: Cost 3 vext2 <7,1,1,1>, <7,1,1,1> - 1483112550U, // <1,1,u,0>: Cost 2 vext1 <1,1,1,1>, LHS - 202162278U, // <1,1,u,1>: Cost 1 vdup1 LHS - 2622642056U, // <1,1,u,2>: Cost 3 vext2 <0,u,1,1>, - 2014142566U, // <1,1,u,3>: Cost 2 vtrnr LHS, LHS - 1483115830U, // <1,1,u,4>: Cost 2 vext1 <1,1,1,1>, RHS - 1548900506U, // <1,1,u,5>: Cost 2 vext2 <0,u,1,1>, RHS - 2622642384U, // <1,1,u,6>: Cost 3 vext2 <0,u,1,1>, - 2825293353U, // <1,1,u,7>: Cost 3 vuzpr <1,1,1,1>, RHS - 202162278U, // <1,1,u,u>: Cost 1 vdup1 LHS - 2635251712U, // <1,2,0,0>: Cost 3 vext2 <3,0,1,2>, <0,0,0,0> - 1561509990U, // <1,2,0,1>: Cost 2 vext2 <3,0,1,2>, LHS - 2618663085U, // <1,2,0,2>: Cost 3 vext2 <0,2,1,2>, <0,2,1,2> - 2696529358U, // <1,2,0,3>: Cost 3 vext3 <2,0,3,1>, <2,0,3,1> - 2635252050U, // <1,2,0,4>: Cost 3 vext2 <3,0,1,2>, <0,4,1,5> - 3769533926U, // <1,2,0,5>: Cost 4 vext3 <1,u,2,1>, <2,0,5,7> - 2621317617U, // <1,2,0,6>: Cost 3 vext2 <0,6,1,2>, <0,6,1,2> - 2659140170U, // <1,2,0,7>: Cost 3 vext2 <7,0,1,2>, <0,7,2,1> - 1561510557U, // <1,2,0,u>: Cost 2 vext2 <3,0,1,2>, LHS - 2623308516U, // <1,2,1,0>: Cost 3 vext2 <1,0,1,2>, <1,0,1,2> - 2635252532U, // <1,2,1,1>: Cost 3 vext2 <3,0,1,2>, <1,1,1,1> - 2631271318U, // <1,2,1,2>: Cost 3 vext2 <2,3,1,2>, <1,2,3,0> - 2958180454U, // <1,2,1,3>: Cost 3 vzipr <0,u,1,1>, LHS - 2550959414U, // <1,2,1,4>: Cost 3 vext1 <0,1,2,1>, RHS - 2635252880U, // <1,2,1,5>: Cost 3 vext2 <3,0,1,2>, <1,5,3,7> - 2635252952U, // <1,2,1,6>: Cost 3 vext2 <3,0,1,2>, <1,6,2,7> - 3732882731U, // <1,2,1,7>: Cost 4 vext2 <7,0,1,2>, <1,7,3,0> - 2958180459U, // <1,2,1,u>: Cost 3 vzipr <0,u,1,1>, LHS - 2629281213U, // <1,2,2,0>: Cost 3 vext2 <2,0,1,2>, <2,0,1,2> - 2635253280U, // <1,2,2,1>: Cost 3 vext2 <3,0,1,2>, <2,1,3,2> - 2618664552U, // <1,2,2,2>: Cost 3 vext2 <0,2,1,2>, <2,2,2,2> - 2689746546U, // <1,2,2,3>: Cost 3 vext3 <0,u,1,1>, <2,2,3,3> - 3764815485U, // <1,2,2,4>: Cost 4 vext3 <1,1,1,1>, <2,2,4,5> - 3760023176U, // <1,2,2,5>: Cost 4 vext3 <0,2,u,1>, <2,2,5,7> - 2635253690U, // <1,2,2,6>: Cost 3 vext2 <3,0,1,2>, <2,6,3,7> - 2659141610U, // <1,2,2,7>: Cost 3 vext2 <7,0,1,2>, <2,7,0,1> - 2689746591U, // <1,2,2,u>: Cost 3 vext3 <0,u,1,1>, <2,2,u,3> - 403488870U, // <1,2,3,0>: Cost 1 vext1 LHS, LHS - 1477231350U, // <1,2,3,1>: Cost 2 vext1 LHS, <1,0,3,2> - 1477232232U, // <1,2,3,2>: Cost 2 vext1 LHS, <2,2,2,2> - 1477233052U, // <1,2,3,3>: Cost 2 vext1 LHS, <3,3,3,3> - 403492150U, // <1,2,3,4>: Cost 1 vext1 LHS, RHS - 1525010128U, // <1,2,3,5>: Cost 2 vext1 LHS, <5,1,7,3> - 1525010938U, // <1,2,3,6>: Cost 2 vext1 LHS, <6,2,7,3> - 1525011450U, // <1,2,3,7>: Cost 2 vext1 LHS, <7,0,1,2> - 403494702U, // <1,2,3,u>: Cost 1 vext1 LHS, LHS - 2641226607U, // <1,2,4,0>: Cost 3 vext2 
<4,0,1,2>, <4,0,1,2> - 3624723446U, // <1,2,4,1>: Cost 4 vext1 <0,1,2,4>, <1,3,4,6> - 3301123609U, // <1,2,4,2>: Cost 4 vrev <2,1,2,4> - 2598759198U, // <1,2,4,3>: Cost 3 vext1 , <3,u,1,2> - 2659142864U, // <1,2,4,4>: Cost 3 vext2 <7,0,1,2>, <4,4,4,4> - 1561513270U, // <1,2,4,5>: Cost 2 vext2 <3,0,1,2>, RHS - 2659143028U, // <1,2,4,6>: Cost 3 vext2 <7,0,1,2>, <4,6,4,6> - 2659143112U, // <1,2,4,7>: Cost 3 vext2 <7,0,1,2>, <4,7,5,0> - 1561513513U, // <1,2,4,u>: Cost 2 vext2 <3,0,1,2>, RHS - 2550988902U, // <1,2,5,0>: Cost 3 vext1 <0,1,2,5>, LHS - 2550989824U, // <1,2,5,1>: Cost 3 vext1 <0,1,2,5>, <1,3,5,7> - 3624732264U, // <1,2,5,2>: Cost 4 vext1 <0,1,2,5>, <2,2,2,2> - 2955559014U, // <1,2,5,3>: Cost 3 vzipr <0,4,1,5>, LHS - 2550992182U, // <1,2,5,4>: Cost 3 vext1 <0,1,2,5>, RHS - 2659143684U, // <1,2,5,5>: Cost 3 vext2 <7,0,1,2>, <5,5,5,5> - 2659143778U, // <1,2,5,6>: Cost 3 vext2 <7,0,1,2>, <5,6,7,0> - 2659143848U, // <1,2,5,7>: Cost 3 vext2 <7,0,1,2>, <5,7,5,7> - 2550994734U, // <1,2,5,u>: Cost 3 vext1 <0,1,2,5>, LHS - 2700289945U, // <1,2,6,0>: Cost 3 vext3 <2,6,0,1>, <2,6,0,1> - 2635256232U, // <1,2,6,1>: Cost 3 vext2 <3,0,1,2>, <6,1,7,2> - 2659144186U, // <1,2,6,2>: Cost 3 vext2 <7,0,1,2>, <6,2,7,3> - 2689746874U, // <1,2,6,3>: Cost 3 vext3 <0,u,1,1>, <2,6,3,7> - 3763488705U, // <1,2,6,4>: Cost 4 vext3 <0,u,1,1>, <2,6,4,5> - 3763488716U, // <1,2,6,5>: Cost 4 vext3 <0,u,1,1>, <2,6,5,7> - 2659144504U, // <1,2,6,6>: Cost 3 vext2 <7,0,1,2>, <6,6,6,6> - 2657817432U, // <1,2,6,7>: Cost 3 vext2 <6,7,1,2>, <6,7,1,2> - 2689746919U, // <1,2,6,u>: Cost 3 vext3 <0,u,1,1>, <2,6,u,7> - 1585402874U, // <1,2,7,0>: Cost 2 vext2 <7,0,1,2>, <7,0,1,2> - 2659144770U, // <1,2,7,1>: Cost 3 vext2 <7,0,1,2>, <7,1,0,2> - 3708998858U, // <1,2,7,2>: Cost 4 vext2 <3,0,1,2>, <7,2,6,3> - 2635257059U, // <1,2,7,3>: Cost 3 vext2 <3,0,1,2>, <7,3,0,1> - 2659145062U, // <1,2,7,4>: Cost 3 vext2 <7,0,1,2>, <7,4,5,6> - 3732886916U, // <1,2,7,5>: Cost 4 vext2 <7,0,1,2>, <7,5,0,0> - 3732886998U, // <1,2,7,6>: Cost 4 vext2 <7,0,1,2>, <7,6,0,1> - 2659145255U, // <1,2,7,7>: Cost 3 vext2 <7,0,1,2>, <7,7,0,1> - 1590711938U, // <1,2,7,u>: Cost 2 vext2 <7,u,1,2>, <7,u,1,2> - 403529835U, // <1,2,u,0>: Cost 1 vext1 LHS, LHS - 1477272310U, // <1,2,u,1>: Cost 2 vext1 LHS, <1,0,3,2> - 1477273192U, // <1,2,u,2>: Cost 2 vext1 LHS, <2,2,2,2> - 1477273750U, // <1,2,u,3>: Cost 2 vext1 LHS, <3,0,1,2> - 403533110U, // <1,2,u,4>: Cost 1 vext1 LHS, RHS - 1561516186U, // <1,2,u,5>: Cost 2 vext2 <3,0,1,2>, RHS - 1525051898U, // <1,2,u,6>: Cost 2 vext1 LHS, <6,2,7,3> - 1525052410U, // <1,2,u,7>: Cost 2 vext1 LHS, <7,0,1,2> - 403535662U, // <1,2,u,u>: Cost 1 vext1 LHS, LHS - 2819407872U, // <1,3,0,0>: Cost 3 vuzpr LHS, <0,0,0,0> - 1551564902U, // <1,3,0,1>: Cost 2 vext2 <1,3,1,3>, LHS - 2819408630U, // <1,3,0,2>: Cost 3 vuzpr LHS, <1,0,3,2> - 2619334911U, // <1,3,0,3>: Cost 3 vext2 <0,3,1,3>, <0,3,1,3> - 2625306962U, // <1,3,0,4>: Cost 3 vext2 <1,3,1,3>, <0,4,1,5> - 3832725879U, // <1,3,0,5>: Cost 4 vuzpl <1,2,3,0>, <0,4,5,6> - 3699048959U, // <1,3,0,6>: Cost 4 vext2 <1,3,1,3>, <0,6,2,7> - 3776538827U, // <1,3,0,7>: Cost 4 vext3 <3,0,7,1>, <3,0,7,1> - 1551565469U, // <1,3,0,u>: Cost 2 vext2 <1,3,1,3>, LHS - 2618671862U, // <1,3,1,0>: Cost 3 vext2 <0,2,1,3>, <1,0,3,2> - 2819408692U, // <1,3,1,1>: Cost 3 vuzpr LHS, <1,1,1,1> - 2624643975U, // <1,3,1,2>: Cost 3 vext2 <1,2,1,3>, <1,2,1,3> - 1745666150U, // <1,3,1,3>: Cost 2 vuzpr LHS, LHS - 2557005110U, // <1,3,1,4>: Cost 3 vext1 <1,1,3,1>, RHS - 2625307792U, // <1,3,1,5>: Cost 3 vext2 <1,3,1,3>, 
- [generated perfect-shuffle cost table, continued: one entry per shuffle index from <1,3,1,6> through <2,u,1,5>, each of the form "NNNNNNNNNNU, // <i,j,k,l>: Cost N <op> <operands>"]
- 2618123489U, // <2,u,1,6>: Cost 3 vext2 LHS, <1,6,3,7> - 2726254427U, // <2,u,1,7>: Cost 3 vext3 <7,0,1,2>, - 1544381823U, // <2,u,1,u>: Cost 2 vext2 LHS, <1,u,3,3> - 1478328422U, // <2,u,2,0>: Cost 2 vext1 <0,2,u,2>, LHS - 2618123807U, // <2,u,2,1>: Cost 3 vext2 LHS, <2,1,3,1> - 269271142U, // <2,u,2,2>: Cost 1 vdup2 LHS - 1544382118U, // <2,u,2,3>: Cost 2 vext2 LHS, <2,3,0,1> - 1478331702U, // <2,u,2,4>: Cost 2 vext1 <0,2,u,2>, RHS - 2618124136U, // <2,u,2,5>: Cost 3 vext2 LHS, <2,5,3,6> - 1544382394U, // <2,u,2,6>: Cost 2 vext2 LHS, <2,6,3,7> - 3088354857U, // <2,u,2,7>: Cost 3 vtrnr <0,2,0,2>, RHS - 269271142U, // <2,u,2,u>: Cost 1 vdup2 LHS - 1544382614U, // <2,u,3,0>: Cost 2 vext2 LHS, <3,0,1,2> - 2953627374U, // <2,u,3,1>: Cost 3 vzipr LHS, <2,3,u,1> - 1490282143U, // <2,u,3,2>: Cost 2 vext1 <2,2,u,3>, <2,2,u,3> - 1879883932U, // <2,u,3,3>: Cost 2 vzipr LHS, LHS - 1544382978U, // <2,u,3,4>: Cost 2 vext2 LHS, <3,4,5,6> - 2953627378U, // <2,u,3,5>: Cost 3 vzipr LHS, <2,3,u,5> - 1514172931U, // <2,u,3,6>: Cost 2 vext1 <6,2,u,3>, <6,2,u,3> - 1879887176U, // <2,u,3,7>: Cost 2 vzipr LHS, RHS - 1879883937U, // <2,u,3,u>: Cost 2 vzipr LHS, LHS - 1484316774U, // <2,u,4,0>: Cost 2 vext1 <1,2,u,4>, LHS - 1484317639U, // <2,u,4,1>: Cost 2 vext1 <1,2,u,4>, <1,2,u,4> - 2552088270U, // <2,u,4,2>: Cost 3 vext1 <0,2,u,4>, <2,3,4,5> - 1190213513U, // <2,u,4,3>: Cost 2 vrev - 1484320054U, // <2,u,4,4>: Cost 2 vext1 <1,2,u,4>, RHS - 470641974U, // <2,u,4,5>: Cost 1 vext2 LHS, RHS - 1592159604U, // <2,u,4,6>: Cost 2 vext2 LHS, <4,6,4,6> - 3094564393U, // <2,u,4,7>: Cost 3 vtrnr <1,2,3,4>, RHS - 470642217U, // <2,u,4,u>: Cost 1 vext2 LHS, RHS - 2552094959U, // <2,u,5,0>: Cost 3 vext1 <0,2,u,5>, <0,2,u,5> - 1592159952U, // <2,u,5,1>: Cost 2 vext2 LHS, <5,1,7,3> - 2564040353U, // <2,u,5,2>: Cost 3 vext1 <2,2,u,5>, <2,2,u,5> - 2690275455U, // <2,u,5,3>: Cost 3 vext3 <0,u,u,2>, - 1592160198U, // <2,u,5,4>: Cost 2 vext2 LHS, <5,4,7,6> - 1592160260U, // <2,u,5,5>: Cost 2 vext2 LHS, <5,5,5,5> - 1611962522U, // <2,u,5,6>: Cost 2 vext3 <0,2,0,2>, RHS - 1592160424U, // <2,u,5,7>: Cost 2 vext2 LHS, <5,7,5,7> - 1611962540U, // <2,u,5,u>: Cost 2 vext3 <0,2,0,2>, RHS - 1478361190U, // <2,u,6,0>: Cost 2 vext1 <0,2,u,6>, LHS - 2552103670U, // <2,u,6,1>: Cost 3 vext1 <0,2,u,6>, <1,0,3,2> - 1592160762U, // <2,u,6,2>: Cost 2 vext2 LHS, <6,2,7,3> - 2685704400U, // <2,u,6,3>: Cost 3 vext3 <0,2,0,2>, - 1478364470U, // <2,u,6,4>: Cost 2 vext1 <0,2,u,6>, RHS - 2901891226U, // <2,u,6,5>: Cost 3 vzipl <2,6,3,7>, RHS - 1592161080U, // <2,u,6,6>: Cost 2 vext2 LHS, <6,6,6,6> - 1592161102U, // <2,u,6,7>: Cost 2 vext2 LHS, <6,7,0,1> - 1478367022U, // <2,u,6,u>: Cost 2 vext1 <0,2,u,6>, LHS - 1592161274U, // <2,u,7,0>: Cost 2 vext2 LHS, <7,0,1,2> - 2659931226U, // <2,u,7,1>: Cost 3 vext2 <7,1,2,u>, <7,1,2,u> - 2564056739U, // <2,u,7,2>: Cost 3 vext1 <2,2,u,7>, <2,2,u,7> - 2665903331U, // <2,u,7,3>: Cost 3 vext2 LHS, <7,3,0,1> - 1592161638U, // <2,u,7,4>: Cost 2 vext2 LHS, <7,4,5,6> - 2665903494U, // <2,u,7,5>: Cost 3 vext2 LHS, <7,5,0,2> - 2587947527U, // <2,u,7,6>: Cost 3 vext1 <6,2,u,7>, <6,2,u,7> - 1592161900U, // <2,u,7,7>: Cost 2 vext2 LHS, <7,7,7,7> - 1592161922U, // <2,u,7,u>: Cost 2 vext2 LHS, <7,u,1,2> - 1478377574U, // <2,u,u,0>: Cost 2 vext1 <0,2,u,u>, LHS - 470644526U, // <2,u,u,1>: Cost 1 vext2 LHS, LHS - 269271142U, // <2,u,u,2>: Cost 1 vdup2 LHS - 1879924892U, // <2,u,u,3>: Cost 2 vzipr LHS, LHS - 1478380854U, // <2,u,u,4>: Cost 2 vext1 <0,2,u,u>, RHS - 470644890U, // <2,u,u,5>: Cost 1 vext2 LHS, RHS - 1611962765U, // 
<2,u,u,6>: Cost 2 vext3 <0,2,0,2>, RHS - 1879928136U, // <2,u,u,7>: Cost 2 vzipr LHS, RHS - 470645093U, // <2,u,u,u>: Cost 1 vext2 LHS, LHS - 1611448320U, // <3,0,0,0>: Cost 2 vext3 LHS, <0,0,0,0> - 1611890698U, // <3,0,0,1>: Cost 2 vext3 LHS, <0,0,1,1> - 1611890708U, // <3,0,0,2>: Cost 2 vext3 LHS, <0,0,2,2> - 3763576860U, // <3,0,0,3>: Cost 4 vext3 LHS, <0,0,3,1> - 2689835045U, // <3,0,0,4>: Cost 3 vext3 LHS, <0,0,4,1> - 3698508206U, // <3,0,0,5>: Cost 4 vext2 <1,2,3,0>, <0,5,2,7> - 3763576887U, // <3,0,0,6>: Cost 4 vext3 LHS, <0,0,6,1> - 3667678434U, // <3,0,0,7>: Cost 4 vext1 <7,3,0,0>, <7,3,0,0> - 1616093258U, // <3,0,0,u>: Cost 2 vext3 LHS, <0,0,u,2> - 1490337894U, // <3,0,1,0>: Cost 2 vext1 <2,3,0,1>, LHS - 2685632602U, // <3,0,1,1>: Cost 3 vext3 LHS, <0,1,1,0> - 537706598U, // <3,0,1,2>: Cost 1 vext3 LHS, LHS - 2624766936U, // <3,0,1,3>: Cost 3 vext2 <1,2,3,0>, <1,3,1,3> - 1490341174U, // <3,0,1,4>: Cost 2 vext1 <2,3,0,1>, RHS - 2624767120U, // <3,0,1,5>: Cost 3 vext2 <1,2,3,0>, <1,5,3,7> - 2732966030U, // <3,0,1,6>: Cost 3 vext3 LHS, <0,1,6,7> - 2593944803U, // <3,0,1,7>: Cost 3 vext1 <7,3,0,1>, <7,3,0,1> - 537706652U, // <3,0,1,u>: Cost 1 vext3 LHS, LHS - 1611890852U, // <3,0,2,0>: Cost 2 vext3 LHS, <0,2,0,2> - 2685632684U, // <3,0,2,1>: Cost 3 vext3 LHS, <0,2,1,1> - 2685632692U, // <3,0,2,2>: Cost 3 vext3 LHS, <0,2,2,0> - 2685632702U, // <3,0,2,3>: Cost 3 vext3 LHS, <0,2,3,1> - 1611890892U, // <3,0,2,4>: Cost 2 vext3 LHS, <0,2,4,6> - 2732966102U, // <3,0,2,5>: Cost 3 vext3 LHS, <0,2,5,7> - 2624767930U, // <3,0,2,6>: Cost 3 vext2 <1,2,3,0>, <2,6,3,7> - 2685632744U, // <3,0,2,7>: Cost 3 vext3 LHS, <0,2,7,7> - 1611890924U, // <3,0,2,u>: Cost 2 vext3 LHS, <0,2,u,2> - 2624768150U, // <3,0,3,0>: Cost 3 vext2 <1,2,3,0>, <3,0,1,2> - 2685632764U, // <3,0,3,1>: Cost 3 vext3 LHS, <0,3,1,0> - 2685632774U, // <3,0,3,2>: Cost 3 vext3 LHS, <0,3,2,1> - 2624768412U, // <3,0,3,3>: Cost 3 vext2 <1,2,3,0>, <3,3,3,3> - 2624768514U, // <3,0,3,4>: Cost 3 vext2 <1,2,3,0>, <3,4,5,6> - 3702491714U, // <3,0,3,5>: Cost 4 vext2 <1,u,3,0>, <3,5,3,7> - 2624768632U, // <3,0,3,6>: Cost 3 vext2 <1,2,3,0>, <3,6,0,7> - 3702491843U, // <3,0,3,7>: Cost 4 vext2 <1,u,3,0>, <3,7,0,1> - 2686959934U, // <3,0,3,u>: Cost 3 vext3 <0,3,u,3>, <0,3,u,3> - 2689835336U, // <3,0,4,0>: Cost 3 vext3 LHS, <0,4,0,4> - 1611891026U, // <3,0,4,1>: Cost 2 vext3 LHS, <0,4,1,5> - 1611891036U, // <3,0,4,2>: Cost 2 vext3 LHS, <0,4,2,6> - 3763577184U, // <3,0,4,3>: Cost 4 vext3 LHS, <0,4,3,1> - 2689835374U, // <3,0,4,4>: Cost 3 vext3 LHS, <0,4,4,6> - 1551027510U, // <3,0,4,5>: Cost 2 vext2 <1,2,3,0>, RHS - 2666573172U, // <3,0,4,6>: Cost 3 vext2 , <4,6,4,6> - 3667711206U, // <3,0,4,7>: Cost 4 vext1 <7,3,0,4>, <7,3,0,4> - 1616093586U, // <3,0,4,u>: Cost 2 vext3 LHS, <0,4,u,6> - 2685190556U, // <3,0,5,0>: Cost 3 vext3 LHS, <0,5,0,7> - 2666573520U, // <3,0,5,1>: Cost 3 vext2 , <5,1,7,3> - 3040886886U, // <3,0,5,2>: Cost 3 vtrnl <3,4,5,6>, LHS - 3625912834U, // <3,0,5,3>: Cost 4 vext1 <0,3,0,5>, <3,4,5,6> - 2666573766U, // <3,0,5,4>: Cost 3 vext2 , <5,4,7,6> - 2666573828U, // <3,0,5,5>: Cost 3 vext2 , <5,5,5,5> - 2732966354U, // <3,0,5,6>: Cost 3 vext3 LHS, <0,5,6,7> - 2666573992U, // <3,0,5,7>: Cost 3 vext2 , <5,7,5,7> - 3040886940U, // <3,0,5,u>: Cost 3 vtrnl <3,4,5,6>, LHS - 2685190637U, // <3,0,6,0>: Cost 3 vext3 LHS, <0,6,0,7> - 2732966390U, // <3,0,6,1>: Cost 3 vext3 LHS, <0,6,1,7> - 2689835519U, // <3,0,6,2>: Cost 3 vext3 LHS, <0,6,2,7> - 3667724438U, // <3,0,6,3>: Cost 4 vext1 <7,3,0,6>, <3,0,1,2> - 3763577355U, // <3,0,6,4>: Cost 4 vext3 
LHS, <0,6,4,1> - 3806708243U, // <3,0,6,5>: Cost 4 vext3 LHS, <0,6,5,0> - 2666574648U, // <3,0,6,6>: Cost 3 vext2 , <6,6,6,6> - 2657948520U, // <3,0,6,7>: Cost 3 vext2 <6,7,3,0>, <6,7,3,0> - 2689835573U, // <3,0,6,u>: Cost 3 vext3 LHS, <0,6,u,7> - 2666574842U, // <3,0,7,0>: Cost 3 vext2 , <7,0,1,2> - 2685633095U, // <3,0,7,1>: Cost 3 vext3 LHS, <0,7,1,7> - 2660603052U, // <3,0,7,2>: Cost 3 vext2 <7,2,3,0>, <7,2,3,0> - 3643844997U, // <3,0,7,3>: Cost 4 vext1 <3,3,0,7>, <3,3,0,7> - 2666575206U, // <3,0,7,4>: Cost 3 vext2 , <7,4,5,6> - 3655790391U, // <3,0,7,5>: Cost 4 vext1 <5,3,0,7>, <5,3,0,7> - 3731690968U, // <3,0,7,6>: Cost 4 vext2 <6,7,3,0>, <7,6,0,3> - 2666575468U, // <3,0,7,7>: Cost 3 vext2 , <7,7,7,7> - 2664584850U, // <3,0,7,u>: Cost 3 vext2 <7,u,3,0>, <7,u,3,0> - 1616093834U, // <3,0,u,0>: Cost 2 vext3 LHS, <0,u,0,2> - 1611891346U, // <3,0,u,1>: Cost 2 vext3 LHS, <0,u,1,1> - 537707165U, // <3,0,u,2>: Cost 1 vext3 LHS, LHS - 2689835684U, // <3,0,u,3>: Cost 3 vext3 LHS, <0,u,3,1> - 1616093874U, // <3,0,u,4>: Cost 2 vext3 LHS, <0,u,4,6> - 1551030426U, // <3,0,u,5>: Cost 2 vext2 <1,2,3,0>, RHS - 2624772304U, // <3,0,u,6>: Cost 3 vext2 <1,2,3,0>, - 2594002154U, // <3,0,u,7>: Cost 3 vext1 <7,3,0,u>, <7,3,0,u> - 537707219U, // <3,0,u,u>: Cost 1 vext3 LHS, LHS - 2552201318U, // <3,1,0,0>: Cost 3 vext1 <0,3,1,0>, LHS - 2618802278U, // <3,1,0,1>: Cost 3 vext2 <0,2,3,1>, LHS - 2618802366U, // <3,1,0,2>: Cost 3 vext2 <0,2,3,1>, <0,2,3,1> - 1611449078U, // <3,1,0,3>: Cost 2 vext3 LHS, <1,0,3,2> - 2552204598U, // <3,1,0,4>: Cost 3 vext1 <0,3,1,0>, RHS - 2732966663U, // <3,1,0,5>: Cost 3 vext3 LHS, <1,0,5,1> - 3906258396U, // <3,1,0,6>: Cost 4 vuzpr <2,3,0,1>, <2,0,4,6> - 3667752171U, // <3,1,0,7>: Cost 4 vext1 <7,3,1,0>, <7,3,1,0> - 1611891491U, // <3,1,0,u>: Cost 2 vext3 LHS, <1,0,u,2> - 2689835819U, // <3,1,1,0>: Cost 3 vext3 LHS, <1,1,0,1> - 1611449140U, // <3,1,1,1>: Cost 2 vext3 LHS, <1,1,1,1> - 2624775063U, // <3,1,1,2>: Cost 3 vext2 <1,2,3,1>, <1,2,3,1> - 1611891528U, // <3,1,1,3>: Cost 2 vext3 LHS, <1,1,3,3> - 2689835859U, // <3,1,1,4>: Cost 3 vext3 LHS, <1,1,4,5> - 2689835868U, // <3,1,1,5>: Cost 3 vext3 LHS, <1,1,5,5> - 3763577701U, // <3,1,1,6>: Cost 4 vext3 LHS, <1,1,6,5> - 3765273452U, // <3,1,1,7>: Cost 4 vext3 <1,1,7,3>, <1,1,7,3> - 1611891573U, // <3,1,1,u>: Cost 2 vext3 LHS, <1,1,u,3> - 2629420494U, // <3,1,2,0>: Cost 3 vext2 <2,0,3,1>, <2,0,3,1> - 2689835911U, // <3,1,2,1>: Cost 3 vext3 LHS, <1,2,1,3> - 2564163248U, // <3,1,2,2>: Cost 3 vext1 <2,3,1,2>, <2,3,1,2> - 1611449238U, // <3,1,2,3>: Cost 2 vext3 LHS, <1,2,3,0> - 2564164918U, // <3,1,2,4>: Cost 3 vext1 <2,3,1,2>, RHS - 2689835947U, // <3,1,2,5>: Cost 3 vext3 LHS, <1,2,5,3> - 3692545978U, // <3,1,2,6>: Cost 4 vext2 <0,2,3,1>, <2,6,3,7> - 2732966842U, // <3,1,2,7>: Cost 3 vext3 LHS, <1,2,7,0> - 1611891651U, // <3,1,2,u>: Cost 2 vext3 LHS, <1,2,u,0> - 1484456038U, // <3,1,3,0>: Cost 2 vext1 <1,3,1,3>, LHS - 1611891672U, // <3,1,3,1>: Cost 2 vext3 LHS, <1,3,1,3> - 2685633502U, // <3,1,3,2>: Cost 3 vext3 LHS, <1,3,2,0> - 2685633512U, // <3,1,3,3>: Cost 3 vext3 LHS, <1,3,3,1> - 1484459318U, // <3,1,3,4>: Cost 2 vext1 <1,3,1,3>, RHS - 1611891712U, // <3,1,3,5>: Cost 2 vext3 LHS, <1,3,5,7> - 2689836041U, // <3,1,3,6>: Cost 3 vext3 LHS, <1,3,6,7> - 2733409294U, // <3,1,3,7>: Cost 3 vext3 LHS, <1,3,7,3> - 1611891735U, // <3,1,3,u>: Cost 2 vext3 LHS, <1,3,u,3> - 2552234086U, // <3,1,4,0>: Cost 3 vext1 <0,3,1,4>, LHS - 2732966955U, // <3,1,4,1>: Cost 3 vext3 LHS, <1,4,1,5> - 2732966964U, // <3,1,4,2>: Cost 3 vext3 LHS, <1,4,2,5> - 
2685633597U, // <3,1,4,3>: Cost 3 vext3 LHS, <1,4,3,5> - 2552237366U, // <3,1,4,4>: Cost 3 vext1 <0,3,1,4>, RHS - 2618805558U, // <3,1,4,5>: Cost 3 vext2 <0,2,3,1>, RHS - 2769472822U, // <3,1,4,6>: Cost 3 vuzpl <3,0,1,2>, RHS - 3667784943U, // <3,1,4,7>: Cost 4 vext1 <7,3,1,4>, <7,3,1,4> - 2685633642U, // <3,1,4,u>: Cost 3 vext3 LHS, <1,4,u,5> - 2689836143U, // <3,1,5,0>: Cost 3 vext3 LHS, <1,5,0,1> - 2564187280U, // <3,1,5,1>: Cost 3 vext1 <2,3,1,5>, <1,5,3,7> - 2564187827U, // <3,1,5,2>: Cost 3 vext1 <2,3,1,5>, <2,3,1,5> - 1611891856U, // <3,1,5,3>: Cost 2 vext3 LHS, <1,5,3,7> - 2689836183U, // <3,1,5,4>: Cost 3 vext3 LHS, <1,5,4,5> - 3759375522U, // <3,1,5,5>: Cost 4 vext3 LHS, <1,5,5,7> - 3720417378U, // <3,1,5,6>: Cost 4 vext2 <4,u,3,1>, <5,6,7,0> - 2832518454U, // <3,1,5,7>: Cost 3 vuzpr <2,3,0,1>, RHS - 1611891901U, // <3,1,5,u>: Cost 2 vext3 LHS, <1,5,u,7> - 3763578048U, // <3,1,6,0>: Cost 4 vext3 LHS, <1,6,0,1> - 2689836239U, // <3,1,6,1>: Cost 3 vext3 LHS, <1,6,1,7> - 2732967128U, // <3,1,6,2>: Cost 3 vext3 LHS, <1,6,2,7> - 2685633761U, // <3,1,6,3>: Cost 3 vext3 LHS, <1,6,3,7> - 3763578088U, // <3,1,6,4>: Cost 4 vext3 LHS, <1,6,4,5> - 2689836275U, // <3,1,6,5>: Cost 3 vext3 LHS, <1,6,5,7> - 3763578108U, // <3,1,6,6>: Cost 4 vext3 LHS, <1,6,6,7> - 2732967166U, // <3,1,6,7>: Cost 3 vext3 LHS, <1,6,7,0> - 2685633806U, // <3,1,6,u>: Cost 3 vext3 LHS, <1,6,u,7> - 3631972454U, // <3,1,7,0>: Cost 4 vext1 <1,3,1,7>, LHS - 2659947612U, // <3,1,7,1>: Cost 3 vext2 <7,1,3,1>, <7,1,3,1> - 4036102294U, // <3,1,7,2>: Cost 4 vzipr <1,5,3,7>, <3,0,1,2> - 3095396454U, // <3,1,7,3>: Cost 3 vtrnr <1,3,5,7>, LHS - 3631975734U, // <3,1,7,4>: Cost 4 vext1 <1,3,1,7>, RHS - 2222982144U, // <3,1,7,5>: Cost 3 vrev <1,3,5,7> - 3296797705U, // <3,1,7,6>: Cost 4 vrev <1,3,6,7> - 3720418924U, // <3,1,7,7>: Cost 4 vext2 <4,u,3,1>, <7,7,7,7> - 3095396459U, // <3,1,7,u>: Cost 3 vtrnr <1,3,5,7>, LHS - 1484496998U, // <3,1,u,0>: Cost 2 vext1 <1,3,1,u>, LHS - 1611892077U, // <3,1,u,1>: Cost 2 vext3 LHS, <1,u,1,3> - 2685633907U, // <3,1,u,2>: Cost 3 vext3 LHS, <1,u,2,0> - 1611892092U, // <3,1,u,3>: Cost 2 vext3 LHS, <1,u,3,0> - 1484500278U, // <3,1,u,4>: Cost 2 vext1 <1,3,1,u>, RHS - 1611892117U, // <3,1,u,5>: Cost 2 vext3 LHS, <1,u,5,7> - 2685633950U, // <3,1,u,6>: Cost 3 vext3 LHS, <1,u,6,7> - 2832518697U, // <3,1,u,7>: Cost 3 vuzpr <2,3,0,1>, RHS - 1611892140U, // <3,1,u,u>: Cost 2 vext3 LHS, <1,u,u,3> - 2623455232U, // <3,2,0,0>: Cost 3 vext2 <1,0,3,2>, <0,0,0,0> - 1549713510U, // <3,2,0,1>: Cost 2 vext2 <1,0,3,2>, LHS - 2689836484U, // <3,2,0,2>: Cost 3 vext3 LHS, <2,0,2,0> - 2685633997U, // <3,2,0,3>: Cost 3 vext3 LHS, <2,0,3,0> - 2623455570U, // <3,2,0,4>: Cost 3 vext2 <1,0,3,2>, <0,4,1,5> - 2732967398U, // <3,2,0,5>: Cost 3 vext3 LHS, <2,0,5,7> - 2689836524U, // <3,2,0,6>: Cost 3 vext3 LHS, <2,0,6,4> - 2229044964U, // <3,2,0,7>: Cost 3 vrev <2,3,7,0> - 1549714077U, // <3,2,0,u>: Cost 2 vext2 <1,0,3,2>, LHS - 1549714166U, // <3,2,1,0>: Cost 2 vext2 <1,0,3,2>, <1,0,3,2> - 2623456052U, // <3,2,1,1>: Cost 3 vext2 <1,0,3,2>, <1,1,1,1> - 2623456150U, // <3,2,1,2>: Cost 3 vext2 <1,0,3,2>, <1,2,3,0> - 2685634079U, // <3,2,1,3>: Cost 3 vext3 LHS, <2,1,3,1> - 2552286518U, // <3,2,1,4>: Cost 3 vext1 <0,3,2,1>, RHS - 2623456400U, // <3,2,1,5>: Cost 3 vext2 <1,0,3,2>, <1,5,3,7> - 2689836604U, // <3,2,1,6>: Cost 3 vext3 LHS, <2,1,6,3> - 3667834101U, // <3,2,1,7>: Cost 4 vext1 <7,3,2,1>, <7,3,2,1> - 1155385070U, // <3,2,1,u>: Cost 2 vrev <2,3,u,1> - 2689836629U, // <3,2,2,0>: Cost 3 vext3 LHS, <2,2,0,1> - 2689836640U, // 
<3,2,2,1>: Cost 3 vext3 LHS, <2,2,1,3> - 1611449960U, // <3,2,2,2>: Cost 2 vext3 LHS, <2,2,2,2> - 1611892338U, // <3,2,2,3>: Cost 2 vext3 LHS, <2,2,3,3> - 2689836669U, // <3,2,2,4>: Cost 3 vext3 LHS, <2,2,4,5> - 2689836680U, // <3,2,2,5>: Cost 3 vext3 LHS, <2,2,5,7> - 2689836688U, // <3,2,2,6>: Cost 3 vext3 LHS, <2,2,6,6> - 3763578518U, // <3,2,2,7>: Cost 4 vext3 LHS, <2,2,7,3> - 1611892383U, // <3,2,2,u>: Cost 2 vext3 LHS, <2,2,u,3> - 1611450022U, // <3,2,3,0>: Cost 2 vext3 LHS, <2,3,0,1> - 2685191854U, // <3,2,3,1>: Cost 3 vext3 LHS, <2,3,1,0> - 2685191865U, // <3,2,3,2>: Cost 3 vext3 LHS, <2,3,2,2> - 2685191875U, // <3,2,3,3>: Cost 3 vext3 LHS, <2,3,3,3> - 1611450062U, // <3,2,3,4>: Cost 2 vext3 LHS, <2,3,4,5> - 2732967635U, // <3,2,3,5>: Cost 3 vext3 LHS, <2,3,5,1> - 2732967645U, // <3,2,3,6>: Cost 3 vext3 LHS, <2,3,6,2> - 2732967652U, // <3,2,3,7>: Cost 3 vext3 LHS, <2,3,7,0> - 1611450094U, // <3,2,3,u>: Cost 2 vext3 LHS, <2,3,u,1> - 2558279782U, // <3,2,4,0>: Cost 3 vext1 <1,3,2,4>, LHS - 2558280602U, // <3,2,4,1>: Cost 3 vext1 <1,3,2,4>, <1,2,3,4> - 2732967692U, // <3,2,4,2>: Cost 3 vext3 LHS, <2,4,2,4> - 2685634326U, // <3,2,4,3>: Cost 3 vext3 LHS, <2,4,3,5> - 2558283062U, // <3,2,4,4>: Cost 3 vext1 <1,3,2,4>, RHS - 1549716790U, // <3,2,4,5>: Cost 2 vext2 <1,0,3,2>, RHS - 2689836844U, // <3,2,4,6>: Cost 3 vext3 LHS, <2,4,6,0> - 2229077736U, // <3,2,4,7>: Cost 3 vrev <2,3,7,4> - 1549717033U, // <3,2,4,u>: Cost 2 vext2 <1,0,3,2>, RHS - 2552316006U, // <3,2,5,0>: Cost 3 vext1 <0,3,2,5>, LHS - 2228643507U, // <3,2,5,1>: Cost 3 vrev <2,3,1,5> - 2689836896U, // <3,2,5,2>: Cost 3 vext3 LHS, <2,5,2,7> - 2685634408U, // <3,2,5,3>: Cost 3 vext3 LHS, <2,5,3,6> - 1155122894U, // <3,2,5,4>: Cost 2 vrev <2,3,4,5> - 2665263108U, // <3,2,5,5>: Cost 3 vext2 , <5,5,5,5> - 2689836932U, // <3,2,5,6>: Cost 3 vext3 LHS, <2,5,6,7> - 2665263272U, // <3,2,5,7>: Cost 3 vext2 , <5,7,5,7> - 1155417842U, // <3,2,5,u>: Cost 2 vrev <2,3,u,5> - 2689836953U, // <3,2,6,0>: Cost 3 vext3 LHS, <2,6,0,1> - 2689836964U, // <3,2,6,1>: Cost 3 vext3 LHS, <2,6,1,3> - 2689836976U, // <3,2,6,2>: Cost 3 vext3 LHS, <2,6,2,6> - 1611892666U, // <3,2,6,3>: Cost 2 vext3 LHS, <2,6,3,7> - 2689836993U, // <3,2,6,4>: Cost 3 vext3 LHS, <2,6,4,5> - 2689837004U, // <3,2,6,5>: Cost 3 vext3 LHS, <2,6,5,7> - 2689837013U, // <3,2,6,6>: Cost 3 vext3 LHS, <2,6,6,7> - 2665263950U, // <3,2,6,7>: Cost 3 vext2 , <6,7,0,1> - 1611892711U, // <3,2,6,u>: Cost 2 vext3 LHS, <2,6,u,7> - 2665264122U, // <3,2,7,0>: Cost 3 vext2 , <7,0,1,2> - 2623460419U, // <3,2,7,1>: Cost 3 vext2 <1,0,3,2>, <7,1,0,3> - 4169138340U, // <3,2,7,2>: Cost 4 vtrnr <1,3,5,7>, <0,2,0,2> - 2962358374U, // <3,2,7,3>: Cost 3 vzipr <1,5,3,7>, LHS - 2665264486U, // <3,2,7,4>: Cost 3 vext2 , <7,4,5,6> - 2228954841U, // <3,2,7,5>: Cost 3 vrev <2,3,5,7> - 2229028578U, // <3,2,7,6>: Cost 3 vrev <2,3,6,7> - 2665264748U, // <3,2,7,7>: Cost 3 vext2 , <7,7,7,7> - 2962358379U, // <3,2,7,u>: Cost 3 vzipr <1,5,3,7>, LHS - 1611892795U, // <3,2,u,0>: Cost 2 vext3 LHS, <2,u,0,1> - 1549719342U, // <3,2,u,1>: Cost 2 vext2 <1,0,3,2>, LHS - 1611449960U, // <3,2,u,2>: Cost 2 vext3 LHS, <2,2,2,2> - 1611892824U, // <3,2,u,3>: Cost 2 vext3 LHS, <2,u,3,3> - 1611892835U, // <3,2,u,4>: Cost 2 vext3 LHS, <2,u,4,5> - 1549719706U, // <3,2,u,5>: Cost 2 vext2 <1,0,3,2>, RHS - 2689837168U, // <3,2,u,6>: Cost 3 vext3 LHS, <2,u,6,0> - 2665265408U, // <3,2,u,7>: Cost 3 vext2 , - 1611892867U, // <3,2,u,u>: Cost 2 vext3 LHS, <2,u,u,1> - 2685192331U, // <3,3,0,0>: Cost 3 vext3 LHS, <3,0,0,0> - 1611450518U, // <3,3,0,1>: 
Cost 2 vext3 LHS, <3,0,1,2> - 2685634717U, // <3,3,0,2>: Cost 3 vext3 LHS, <3,0,2,0> - 2564294806U, // <3,3,0,3>: Cost 3 vext1 <2,3,3,0>, <3,0,1,2> - 2685634736U, // <3,3,0,4>: Cost 3 vext3 LHS, <3,0,4,1> - 2732968122U, // <3,3,0,5>: Cost 3 vext3 LHS, <3,0,5,2> - 3763579075U, // <3,3,0,6>: Cost 4 vext3 LHS, <3,0,6,2> - 4034053264U, // <3,3,0,7>: Cost 4 vzipr <1,2,3,0>, <1,5,3,7> - 1611450581U, // <3,3,0,u>: Cost 2 vext3 LHS, <3,0,u,2> - 2685192415U, // <3,3,1,0>: Cost 3 vext3 LHS, <3,1,0,3> - 1550385992U, // <3,3,1,1>: Cost 2 vext2 <1,1,3,3>, <1,1,3,3> - 2685192433U, // <3,3,1,2>: Cost 3 vext3 LHS, <3,1,2,3> - 2685634808U, // <3,3,1,3>: Cost 3 vext3 LHS, <3,1,3,1> - 2558332214U, // <3,3,1,4>: Cost 3 vext1 <1,3,3,1>, RHS - 2685634828U, // <3,3,1,5>: Cost 3 vext3 LHS, <3,1,5,3> - 3759376661U, // <3,3,1,6>: Cost 4 vext3 LHS, <3,1,6,3> - 2703477022U, // <3,3,1,7>: Cost 3 vext3 <3,1,7,3>, <3,1,7,3> - 1555031423U, // <3,3,1,u>: Cost 2 vext2 <1,u,3,3>, <1,u,3,3> - 2564309094U, // <3,3,2,0>: Cost 3 vext1 <2,3,3,2>, LHS - 2630100513U, // <3,3,2,1>: Cost 3 vext2 <2,1,3,3>, <2,1,3,3> - 1557022322U, // <3,3,2,2>: Cost 2 vext2 <2,2,3,3>, <2,2,3,3> - 2685192520U, // <3,3,2,3>: Cost 3 vext3 LHS, <3,2,3,0> - 2564312374U, // <3,3,2,4>: Cost 3 vext1 <2,3,3,2>, RHS - 2732968286U, // <3,3,2,5>: Cost 3 vext3 LHS, <3,2,5,4> - 2685634918U, // <3,3,2,6>: Cost 3 vext3 LHS, <3,2,6,3> - 2704140655U, // <3,3,2,7>: Cost 3 vext3 <3,2,7,3>, <3,2,7,3> - 1561004120U, // <3,3,2,u>: Cost 2 vext2 <2,u,3,3>, <2,u,3,3> - 1496547430U, // <3,3,3,0>: Cost 2 vext1 <3,3,3,3>, LHS - 2624129256U, // <3,3,3,1>: Cost 3 vext2 <1,1,3,3>, <3,1,1,3> - 2630764866U, // <3,3,3,2>: Cost 3 vext2 <2,2,3,3>, <3,2,2,3> - 336380006U, // <3,3,3,3>: Cost 1 vdup3 LHS - 1496550710U, // <3,3,3,4>: Cost 2 vext1 <3,3,3,3>, RHS - 2732968368U, // <3,3,3,5>: Cost 3 vext3 LHS, <3,3,5,5> - 2624129683U, // <3,3,3,6>: Cost 3 vext2 <1,1,3,3>, <3,6,3,7> - 2594182400U, // <3,3,3,7>: Cost 3 vext1 <7,3,3,3>, <7,3,3,3> - 336380006U, // <3,3,3,u>: Cost 1 vdup3 LHS - 2558353510U, // <3,3,4,0>: Cost 3 vext1 <1,3,3,4>, LHS - 2558354411U, // <3,3,4,1>: Cost 3 vext1 <1,3,3,4>, <1,3,3,4> - 2564327108U, // <3,3,4,2>: Cost 3 vext1 <2,3,3,4>, <2,3,3,4> - 2564327938U, // <3,3,4,3>: Cost 3 vext1 <2,3,3,4>, <3,4,5,6> - 2960343962U, // <3,3,4,4>: Cost 3 vzipr <1,2,3,4>, <1,2,3,4> - 1611893250U, // <3,3,4,5>: Cost 2 vext3 LHS, <3,4,5,6> - 2771619126U, // <3,3,4,6>: Cost 3 vuzpl <3,3,3,3>, RHS - 4034086032U, // <3,3,4,7>: Cost 4 vzipr <1,2,3,4>, <1,5,3,7> - 1611893277U, // <3,3,4,u>: Cost 2 vext3 LHS, <3,4,u,6> - 2558361702U, // <3,3,5,0>: Cost 3 vext1 <1,3,3,5>, LHS - 2558362604U, // <3,3,5,1>: Cost 3 vext1 <1,3,3,5>, <1,3,3,5> - 2558363342U, // <3,3,5,2>: Cost 3 vext1 <1,3,3,5>, <2,3,4,5> - 2732968512U, // <3,3,5,3>: Cost 3 vext3 LHS, <3,5,3,5> - 2558364982U, // <3,3,5,4>: Cost 3 vext1 <1,3,3,5>, RHS - 3101279950U, // <3,3,5,5>: Cost 3 vtrnr <2,3,4,5>, <2,3,4,5> - 2665934946U, // <3,3,5,6>: Cost 3 vext2 , <5,6,7,0> - 2826636598U, // <3,3,5,7>: Cost 3 vuzpr <1,3,1,3>, RHS - 2826636599U, // <3,3,5,u>: Cost 3 vuzpr <1,3,1,3>, RHS - 2732968568U, // <3,3,6,0>: Cost 3 vext3 LHS, <3,6,0,7> - 3763579521U, // <3,3,6,1>: Cost 4 vext3 LHS, <3,6,1,7> - 2732968586U, // <3,3,6,2>: Cost 3 vext3 LHS, <3,6,2,7> - 2732968595U, // <3,3,6,3>: Cost 3 vext3 LHS, <3,6,3,7> - 2732968604U, // <3,3,6,4>: Cost 3 vext3 LHS, <3,6,4,7> - 3763579557U, // <3,3,6,5>: Cost 4 vext3 LHS, <3,6,5,7> - 2732968621U, // <3,3,6,6>: Cost 3 vext3 LHS, <3,6,6,6> - 2657973099U, // <3,3,6,7>: Cost 3 vext2 <6,7,3,3>, <6,7,3,3> - 
2658636732U, // <3,3,6,u>: Cost 3 vext2 <6,u,3,3>, <6,u,3,3> - 2558378086U, // <3,3,7,0>: Cost 3 vext1 <1,3,3,7>, LHS - 2558378990U, // <3,3,7,1>: Cost 3 vext1 <1,3,3,7>, <1,3,3,7> - 2564351687U, // <3,3,7,2>: Cost 3 vext1 <2,3,3,7>, <2,3,3,7> - 2661291264U, // <3,3,7,3>: Cost 3 vext2 <7,3,3,3>, <7,3,3,3> - 2558381366U, // <3,3,7,4>: Cost 3 vext1 <1,3,3,7>, RHS - 2732968694U, // <3,3,7,5>: Cost 3 vext3 LHS, <3,7,5,7> - 3781126907U, // <3,3,7,6>: Cost 4 vext3 <3,7,6,3>, <3,7,6,3> - 3095397376U, // <3,3,7,7>: Cost 3 vtrnr <1,3,5,7>, <1,3,5,7> - 2558383918U, // <3,3,7,u>: Cost 3 vext1 <1,3,3,7>, LHS - 1496547430U, // <3,3,u,0>: Cost 2 vext1 <3,3,3,3>, LHS - 1611893534U, // <3,3,u,1>: Cost 2 vext3 LHS, <3,u,1,2> - 1592858504U, // <3,3,u,2>: Cost 2 vext2 , - 336380006U, // <3,3,u,3>: Cost 1 vdup3 LHS - 1496550710U, // <3,3,u,4>: Cost 2 vext1 <3,3,3,3>, RHS - 1611893574U, // <3,3,u,5>: Cost 2 vext3 LHS, <3,u,5,6> - 2690280268U, // <3,3,u,6>: Cost 3 vext3 LHS, <3,u,6,3> - 2826636841U, // <3,3,u,7>: Cost 3 vuzpr <1,3,1,3>, RHS - 336380006U, // <3,3,u,u>: Cost 1 vdup3 LHS - 2624798720U, // <3,4,0,0>: Cost 3 vext2 <1,2,3,4>, <0,0,0,0> - 1551056998U, // <3,4,0,1>: Cost 2 vext2 <1,2,3,4>, LHS - 2624798884U, // <3,4,0,2>: Cost 3 vext2 <1,2,3,4>, <0,2,0,2> - 3693232384U, // <3,4,0,3>: Cost 4 vext2 <0,3,3,4>, <0,3,1,4> - 2624799058U, // <3,4,0,4>: Cost 3 vext2 <1,2,3,4>, <0,4,1,5> - 1659227026U, // <3,4,0,5>: Cost 2 vext3 LHS, <4,0,5,1> - 1659227036U, // <3,4,0,6>: Cost 2 vext3 LHS, <4,0,6,2> - 3667973382U, // <3,4,0,7>: Cost 4 vext1 <7,3,4,0>, <7,3,4,0> - 1551057565U, // <3,4,0,u>: Cost 2 vext2 <1,2,3,4>, LHS - 2624799478U, // <3,4,1,0>: Cost 3 vext2 <1,2,3,4>, <1,0,3,2> - 2624799540U, // <3,4,1,1>: Cost 3 vext2 <1,2,3,4>, <1,1,1,1> - 1551057818U, // <3,4,1,2>: Cost 2 vext2 <1,2,3,4>, <1,2,3,4> - 2624799704U, // <3,4,1,3>: Cost 3 vext2 <1,2,3,4>, <1,3,1,3> - 2564377910U, // <3,4,1,4>: Cost 3 vext1 <2,3,4,1>, RHS - 2689838050U, // <3,4,1,5>: Cost 3 vext3 LHS, <4,1,5,0> - 2689838062U, // <3,4,1,6>: Cost 3 vext3 LHS, <4,1,6,3> - 2628117807U, // <3,4,1,7>: Cost 3 vext2 <1,7,3,4>, <1,7,3,4> - 1555039616U, // <3,4,1,u>: Cost 2 vext2 <1,u,3,4>, <1,u,3,4> - 3626180710U, // <3,4,2,0>: Cost 4 vext1 <0,3,4,2>, LHS - 2624800298U, // <3,4,2,1>: Cost 3 vext2 <1,2,3,4>, <2,1,4,3> - 2624800360U, // <3,4,2,2>: Cost 3 vext2 <1,2,3,4>, <2,2,2,2> - 2624800422U, // <3,4,2,3>: Cost 3 vext2 <1,2,3,4>, <2,3,0,1> - 2624800514U, // <3,4,2,4>: Cost 3 vext2 <1,2,3,4>, <2,4,1,3> - 2709965878U, // <3,4,2,5>: Cost 3 vext3 <4,2,5,3>, <4,2,5,3> - 2689838140U, // <3,4,2,6>: Cost 3 vext3 LHS, <4,2,6,0> - 2634090504U, // <3,4,2,7>: Cost 3 vext2 <2,7,3,4>, <2,7,3,4> - 2689838158U, // <3,4,2,u>: Cost 3 vext3 LHS, <4,2,u,0> - 2624800918U, // <3,4,3,0>: Cost 3 vext2 <1,2,3,4>, <3,0,1,2> - 2636081403U, // <3,4,3,1>: Cost 3 vext2 <3,1,3,4>, <3,1,3,4> - 2636745036U, // <3,4,3,2>: Cost 3 vext2 <3,2,3,4>, <3,2,3,4> - 2624801180U, // <3,4,3,3>: Cost 3 vext2 <1,2,3,4>, <3,3,3,3> - 2624801232U, // <3,4,3,4>: Cost 3 vext2 <1,2,3,4>, <3,4,0,1> - 2905836854U, // <3,4,3,5>: Cost 3 vzipl <3,3,3,3>, RHS - 3040054582U, // <3,4,3,6>: Cost 3 vtrnl <3,3,3,3>, RHS - 3702524611U, // <3,4,3,7>: Cost 4 vext2 <1,u,3,4>, <3,7,0,1> - 2624801566U, // <3,4,3,u>: Cost 3 vext2 <1,2,3,4>, <3,u,1,2> - 2564399206U, // <3,4,4,0>: Cost 3 vext1 <2,3,4,4>, LHS - 2564400026U, // <3,4,4,1>: Cost 3 vext1 <2,3,4,4>, <1,2,3,4> - 2564400845U, // <3,4,4,2>: Cost 3 vext1 <2,3,4,4>, <2,3,4,4> - 2570373542U, // <3,4,4,3>: Cost 3 vext1 <3,3,4,4>, <3,3,4,4> - 1659227344U, // <3,4,4,4>: 
Cost 2 vext3 LHS, <4,4,4,4> - 1551060278U, // <3,4,4,5>: Cost 2 vext2 <1,2,3,4>, RHS - 1659227364U, // <3,4,4,6>: Cost 2 vext3 LHS, <4,4,6,6> - 3668006154U, // <3,4,4,7>: Cost 4 vext1 <7,3,4,4>, <7,3,4,4> - 1551060521U, // <3,4,4,u>: Cost 2 vext2 <1,2,3,4>, RHS - 1490665574U, // <3,4,5,0>: Cost 2 vext1 <2,3,4,5>, LHS - 2689838341U, // <3,4,5,1>: Cost 3 vext3 LHS, <4,5,1,3> - 1490667214U, // <3,4,5,2>: Cost 2 vext1 <2,3,4,5>, <2,3,4,5> - 2564409494U, // <3,4,5,3>: Cost 3 vext1 <2,3,4,5>, <3,0,1,2> - 1490668854U, // <3,4,5,4>: Cost 2 vext1 <2,3,4,5>, RHS - 2689838381U, // <3,4,5,5>: Cost 3 vext3 LHS, <4,5,5,7> - 537709878U, // <3,4,5,6>: Cost 1 vext3 LHS, RHS - 2594272523U, // <3,4,5,7>: Cost 3 vext1 <7,3,4,5>, <7,3,4,5> - 537709896U, // <3,4,5,u>: Cost 1 vext3 LHS, RHS - 2689838411U, // <3,4,6,0>: Cost 3 vext3 LHS, <4,6,0,1> - 2558444534U, // <3,4,6,1>: Cost 3 vext1 <1,3,4,6>, <1,3,4,6> - 2666607098U, // <3,4,6,2>: Cost 3 vext2 , <6,2,7,3> - 2558446082U, // <3,4,6,3>: Cost 3 vext1 <1,3,4,6>, <3,4,5,6> - 1659227508U, // <3,4,6,4>: Cost 2 vext3 LHS, <4,6,4,6> - 2689838462U, // <3,4,6,5>: Cost 3 vext3 LHS, <4,6,5,7> - 2689838471U, // <3,4,6,6>: Cost 3 vext3 LHS, <4,6,6,7> - 2657981292U, // <3,4,6,7>: Cost 3 vext2 <6,7,3,4>, <6,7,3,4> - 1659227540U, // <3,4,6,u>: Cost 2 vext3 LHS, <4,6,u,2> - 2666607610U, // <3,4,7,0>: Cost 3 vext2 , <7,0,1,2> - 3702527072U, // <3,4,7,1>: Cost 4 vext2 <1,u,3,4>, <7,1,3,5> - 2660635824U, // <3,4,7,2>: Cost 3 vext2 <7,2,3,4>, <7,2,3,4> - 3644139945U, // <3,4,7,3>: Cost 4 vext1 <3,3,4,7>, <3,3,4,7> - 2666607974U, // <3,4,7,4>: Cost 3 vext2 , <7,4,5,6> - 2732969416U, // <3,4,7,5>: Cost 3 vext3 LHS, <4,7,5,0> - 2732969425U, // <3,4,7,6>: Cost 3 vext3 LHS, <4,7,6,0> - 2666608236U, // <3,4,7,7>: Cost 3 vext2 , <7,7,7,7> - 2664617622U, // <3,4,7,u>: Cost 3 vext2 <7,u,3,4>, <7,u,3,4> - 1490690150U, // <3,4,u,0>: Cost 2 vext1 <2,3,4,u>, LHS - 1551062830U, // <3,4,u,1>: Cost 2 vext2 <1,2,3,4>, LHS - 1490691793U, // <3,4,u,2>: Cost 2 vext1 <2,3,4,u>, <2,3,4,u> - 2624804796U, // <3,4,u,3>: Cost 3 vext2 <1,2,3,4>, - 1490693430U, // <3,4,u,4>: Cost 2 vext1 <2,3,4,u>, RHS - 1551063194U, // <3,4,u,5>: Cost 2 vext2 <1,2,3,4>, RHS - 537710121U, // <3,4,u,6>: Cost 1 vext3 LHS, RHS - 2594297102U, // <3,4,u,7>: Cost 3 vext1 <7,3,4,u>, <7,3,4,u> - 537710139U, // <3,4,u,u>: Cost 1 vext3 LHS, RHS - 3692576768U, // <3,5,0,0>: Cost 4 vext2 <0,2,3,5>, <0,0,0,0> - 2618835046U, // <3,5,0,1>: Cost 3 vext2 <0,2,3,5>, LHS - 2618835138U, // <3,5,0,2>: Cost 3 vext2 <0,2,3,5>, <0,2,3,5> - 3692577024U, // <3,5,0,3>: Cost 4 vext2 <0,2,3,5>, <0,3,1,4> - 2689838690U, // <3,5,0,4>: Cost 3 vext3 LHS, <5,0,4,1> - 2732969579U, // <3,5,0,5>: Cost 3 vext3 LHS, <5,0,5,1> - 2732969588U, // <3,5,0,6>: Cost 3 vext3 LHS, <5,0,6,1> - 2246963055U, // <3,5,0,7>: Cost 3 vrev <5,3,7,0> - 2618835613U, // <3,5,0,u>: Cost 3 vext2 <0,2,3,5>, LHS - 2594308198U, // <3,5,1,0>: Cost 3 vext1 <7,3,5,1>, LHS - 3692577588U, // <3,5,1,1>: Cost 4 vext2 <0,2,3,5>, <1,1,1,1> - 2624807835U, // <3,5,1,2>: Cost 3 vext2 <1,2,3,5>, <1,2,3,5> - 2625471468U, // <3,5,1,3>: Cost 3 vext2 <1,3,3,5>, <1,3,3,5> - 2626135101U, // <3,5,1,4>: Cost 3 vext2 <1,4,3,5>, <1,4,3,5> - 2594311888U, // <3,5,1,5>: Cost 3 vext1 <7,3,5,1>, <5,1,7,3> - 3699877107U, // <3,5,1,6>: Cost 4 vext2 <1,4,3,5>, <1,6,5,7> - 1641680592U, // <3,5,1,7>: Cost 2 vext3 <5,1,7,3>, <5,1,7,3> - 1641754329U, // <3,5,1,u>: Cost 2 vext3 <5,1,u,3>, <5,1,u,3> - 3692578274U, // <3,5,2,0>: Cost 4 vext2 <0,2,3,5>, <2,0,5,3> - 2630116899U, // <3,5,2,1>: Cost 3 vext2 <2,1,3,5>, <2,1,3,5> 
- 3692578408U, // <3,5,2,2>: Cost 4 vext2 <0,2,3,5>, <2,2,2,2> - 2625472206U, // <3,5,2,3>: Cost 3 vext2 <1,3,3,5>, <2,3,4,5> - 2632107798U, // <3,5,2,4>: Cost 3 vext2 <2,4,3,5>, <2,4,3,5> - 2715938575U, // <3,5,2,5>: Cost 3 vext3 <5,2,5,3>, <5,2,5,3> - 3692578746U, // <3,5,2,6>: Cost 4 vext2 <0,2,3,5>, <2,6,3,7> - 2716086049U, // <3,5,2,7>: Cost 3 vext3 <5,2,7,3>, <5,2,7,3> - 2634762330U, // <3,5,2,u>: Cost 3 vext2 <2,u,3,5>, <2,u,3,5> - 3692578966U, // <3,5,3,0>: Cost 4 vext2 <0,2,3,5>, <3,0,1,2> - 2636089596U, // <3,5,3,1>: Cost 3 vext2 <3,1,3,5>, <3,1,3,5> - 3699214668U, // <3,5,3,2>: Cost 4 vext2 <1,3,3,5>, <3,2,3,4> - 2638080412U, // <3,5,3,3>: Cost 3 vext2 <3,4,3,5>, <3,3,3,3> - 2618837506U, // <3,5,3,4>: Cost 3 vext2 <0,2,3,5>, <3,4,5,6> - 2832844494U, // <3,5,3,5>: Cost 3 vuzpr <2,3,4,5>, <2,3,4,5> - 4033415682U, // <3,5,3,6>: Cost 4 vzipr <1,1,3,3>, <3,4,5,6> - 3095072054U, // <3,5,3,7>: Cost 3 vtrnr <1,3,1,3>, RHS - 3095072055U, // <3,5,3,u>: Cost 3 vtrnr <1,3,1,3>, RHS - 2600304742U, // <3,5,4,0>: Cost 3 vext1 , LHS - 3763580815U, // <3,5,4,1>: Cost 4 vext3 LHS, <5,4,1,5> - 2564474582U, // <3,5,4,2>: Cost 3 vext1 <2,3,5,4>, <2,3,5,4> - 3699879044U, // <3,5,4,3>: Cost 4 vext2 <1,4,3,5>, <4,3,5,0> - 2600308022U, // <3,5,4,4>: Cost 3 vext1 , RHS - 2618838326U, // <3,5,4,5>: Cost 3 vext2 <0,2,3,5>, RHS - 2772454710U, // <3,5,4,6>: Cost 3 vuzpl <3,4,5,6>, RHS - 1659228102U, // <3,5,4,7>: Cost 2 vext3 LHS, <5,4,7,6> - 1659228111U, // <3,5,4,u>: Cost 2 vext3 LHS, <5,4,u,6> - 2570453094U, // <3,5,5,0>: Cost 3 vext1 <3,3,5,5>, LHS - 2624810704U, // <3,5,5,1>: Cost 3 vext2 <1,2,3,5>, <5,1,7,3> - 2570454734U, // <3,5,5,2>: Cost 3 vext1 <3,3,5,5>, <2,3,4,5> - 2570455472U, // <3,5,5,3>: Cost 3 vext1 <3,3,5,5>, <3,3,5,5> - 2570456374U, // <3,5,5,4>: Cost 3 vext1 <3,3,5,5>, RHS - 1659228164U, // <3,5,5,5>: Cost 2 vext3 LHS, <5,5,5,5> - 2732969998U, // <3,5,5,6>: Cost 3 vext3 LHS, <5,5,6,6> - 1659228184U, // <3,5,5,7>: Cost 2 vext3 LHS, <5,5,7,7> - 1659228193U, // <3,5,5,u>: Cost 2 vext3 LHS, <5,5,u,7> - 2732970020U, // <3,5,6,0>: Cost 3 vext3 LHS, <5,6,0,1> - 2732970035U, // <3,5,6,1>: Cost 3 vext3 LHS, <5,6,1,7> - 2564490968U, // <3,5,6,2>: Cost 3 vext1 <2,3,5,6>, <2,3,5,6> - 2732970050U, // <3,5,6,3>: Cost 3 vext3 LHS, <5,6,3,4> - 2732970060U, // <3,5,6,4>: Cost 3 vext3 LHS, <5,6,4,5> - 2732970071U, // <3,5,6,5>: Cost 3 vext3 LHS, <5,6,5,7> - 2732970080U, // <3,5,6,6>: Cost 3 vext3 LHS, <5,6,6,7> - 1659228258U, // <3,5,6,7>: Cost 2 vext3 LHS, <5,6,7,0> - 1659228267U, // <3,5,6,u>: Cost 2 vext3 LHS, <5,6,u,0> - 1484783718U, // <3,5,7,0>: Cost 2 vext1 <1,3,5,7>, LHS - 1484784640U, // <3,5,7,1>: Cost 2 vext1 <1,3,5,7>, <1,3,5,7> - 2558527080U, // <3,5,7,2>: Cost 3 vext1 <1,3,5,7>, <2,2,2,2> - 2558527638U, // <3,5,7,3>: Cost 3 vext1 <1,3,5,7>, <3,0,1,2> - 1484786998U, // <3,5,7,4>: Cost 2 vext1 <1,3,5,7>, RHS - 1659228328U, // <3,5,7,5>: Cost 2 vext3 LHS, <5,7,5,7> - 2732970154U, // <3,5,7,6>: Cost 3 vext3 LHS, <5,7,6,0> - 2558531180U, // <3,5,7,7>: Cost 3 vext1 <1,3,5,7>, <7,7,7,7> - 1484789550U, // <3,5,7,u>: Cost 2 vext1 <1,3,5,7>, LHS - 1484791910U, // <3,5,u,0>: Cost 2 vext1 <1,3,5,u>, LHS - 1484792833U, // <3,5,u,1>: Cost 2 vext1 <1,3,5,u>, <1,3,5,u> - 2558535272U, // <3,5,u,2>: Cost 3 vext1 <1,3,5,u>, <2,2,2,2> - 2558535830U, // <3,5,u,3>: Cost 3 vext1 <1,3,5,u>, <3,0,1,2> - 1484795190U, // <3,5,u,4>: Cost 2 vext1 <1,3,5,u>, RHS - 1659228409U, // <3,5,u,5>: Cost 2 vext3 LHS, <5,u,5,7> - 2772457626U, // <3,5,u,6>: Cost 3 vuzpl <3,4,5,6>, RHS - 1646326023U, // <3,5,u,7>: Cost 2 vext3 
<5,u,7,3>, <5,u,7,3> - 1484797742U, // <3,5,u,u>: Cost 2 vext1 <1,3,5,u>, LHS - 2558541926U, // <3,6,0,0>: Cost 3 vext1 <1,3,6,0>, LHS - 2689839393U, // <3,6,0,1>: Cost 3 vext3 LHS, <6,0,1,2> - 2689839404U, // <3,6,0,2>: Cost 3 vext3 LHS, <6,0,2,4> - 3706519808U, // <3,6,0,3>: Cost 4 vext2 <2,5,3,6>, <0,3,1,4> - 2689839420U, // <3,6,0,4>: Cost 3 vext3 LHS, <6,0,4,2> - 2732970314U, // <3,6,0,5>: Cost 3 vext3 LHS, <6,0,5,7> - 2732970316U, // <3,6,0,6>: Cost 3 vext3 LHS, <6,0,6,0> - 2960313654U, // <3,6,0,7>: Cost 3 vzipr <1,2,3,0>, RHS - 2689839456U, // <3,6,0,u>: Cost 3 vext3 LHS, <6,0,u,2> - 3763581290U, // <3,6,1,0>: Cost 4 vext3 LHS, <6,1,0,3> - 3763581297U, // <3,6,1,1>: Cost 4 vext3 LHS, <6,1,1,1> - 2624816028U, // <3,6,1,2>: Cost 3 vext2 <1,2,3,6>, <1,2,3,6> - 3763581315U, // <3,6,1,3>: Cost 4 vext3 LHS, <6,1,3,1> - 2626143294U, // <3,6,1,4>: Cost 3 vext2 <1,4,3,6>, <1,4,3,6> - 3763581335U, // <3,6,1,5>: Cost 4 vext3 LHS, <6,1,5,3> - 2721321376U, // <3,6,1,6>: Cost 3 vext3 <6,1,6,3>, <6,1,6,3> - 2721395113U, // <3,6,1,7>: Cost 3 vext3 <6,1,7,3>, <6,1,7,3> - 2628797826U, // <3,6,1,u>: Cost 3 vext2 <1,u,3,6>, <1,u,3,6> - 2594390118U, // <3,6,2,0>: Cost 3 vext1 <7,3,6,2>, LHS - 2721616324U, // <3,6,2,1>: Cost 3 vext3 <6,2,1,3>, <6,2,1,3> - 2630788725U, // <3,6,2,2>: Cost 3 vext2 <2,2,3,6>, <2,2,3,6> - 3763581395U, // <3,6,2,3>: Cost 4 vext3 LHS, <6,2,3,0> - 2632115991U, // <3,6,2,4>: Cost 3 vext2 <2,4,3,6>, <2,4,3,6> - 2632779624U, // <3,6,2,5>: Cost 3 vext2 <2,5,3,6>, <2,5,3,6> - 2594394618U, // <3,6,2,6>: Cost 3 vext1 <7,3,6,2>, <6,2,7,3> - 1648316922U, // <3,6,2,7>: Cost 2 vext3 <6,2,7,3>, <6,2,7,3> - 1648390659U, // <3,6,2,u>: Cost 2 vext3 <6,2,u,3>, <6,2,u,3> - 3693914262U, // <3,6,3,0>: Cost 4 vext2 <0,4,3,6>, <3,0,1,2> - 3638281176U, // <3,6,3,1>: Cost 4 vext1 <2,3,6,3>, <1,3,1,3> - 3696568678U, // <3,6,3,2>: Cost 4 vext2 <0,u,3,6>, <3,2,6,3> - 2638088604U, // <3,6,3,3>: Cost 3 vext2 <3,4,3,6>, <3,3,3,3> - 2632780290U, // <3,6,3,4>: Cost 3 vext2 <2,5,3,6>, <3,4,5,6> - 3712494145U, // <3,6,3,5>: Cost 4 vext2 <3,5,3,6>, <3,5,3,6> - 3698559612U, // <3,6,3,6>: Cost 4 vext2 <1,2,3,6>, <3,6,1,2> - 2959674678U, // <3,6,3,7>: Cost 3 vzipr <1,1,3,3>, RHS - 2959674679U, // <3,6,3,u>: Cost 3 vzipr <1,1,3,3>, RHS - 3763581536U, // <3,6,4,0>: Cost 4 vext3 LHS, <6,4,0,6> - 2722943590U, // <3,6,4,1>: Cost 3 vext3 <6,4,1,3>, <6,4,1,3> - 2732970609U, // <3,6,4,2>: Cost 3 vext3 LHS, <6,4,2,5> - 3698560147U, // <3,6,4,3>: Cost 4 vext2 <1,2,3,6>, <4,3,6,6> - 2732970628U, // <3,6,4,4>: Cost 3 vext3 LHS, <6,4,4,6> - 2689839757U, // <3,6,4,5>: Cost 3 vext3 LHS, <6,4,5,6> - 2732970640U, // <3,6,4,6>: Cost 3 vext3 LHS, <6,4,6,0> - 2960346422U, // <3,6,4,7>: Cost 3 vzipr <1,2,3,4>, RHS - 2689839784U, // <3,6,4,u>: Cost 3 vext3 LHS, <6,4,u,6> - 2576498790U, // <3,6,5,0>: Cost 3 vext1 <4,3,6,5>, LHS - 3650241270U, // <3,6,5,1>: Cost 4 vext1 <4,3,6,5>, <1,0,3,2> - 2732970692U, // <3,6,5,2>: Cost 3 vext3 LHS, <6,5,2,7> - 2576501250U, // <3,6,5,3>: Cost 3 vext1 <4,3,6,5>, <3,4,5,6> - 2576501906U, // <3,6,5,4>: Cost 3 vext1 <4,3,6,5>, <4,3,6,5> - 3650244622U, // <3,6,5,5>: Cost 4 vext1 <4,3,6,5>, <5,5,6,6> - 4114633528U, // <3,6,5,6>: Cost 4 vtrnl <3,4,5,6>, <6,6,6,6> - 2732970735U, // <3,6,5,7>: Cost 3 vext3 LHS, <6,5,7,5> - 2576504622U, // <3,6,5,u>: Cost 3 vext1 <4,3,6,5>, LHS - 2732970749U, // <3,6,6,0>: Cost 3 vext3 LHS, <6,6,0,1> - 2724270856U, // <3,6,6,1>: Cost 3 vext3 <6,6,1,3>, <6,6,1,3> - 2624819706U, // <3,6,6,2>: Cost 3 vext2 <1,2,3,6>, <6,2,7,3> - 3656223234U, // <3,6,6,3>: Cost 4 vext1 <5,3,6,6>, 
<3,4,5,6> - 2732970788U, // <3,6,6,4>: Cost 3 vext3 LHS, <6,6,4,4> - 2732970800U, // <3,6,6,5>: Cost 3 vext3 LHS, <6,6,5,7> - 1659228984U, // <3,6,6,6>: Cost 2 vext3 LHS, <6,6,6,6> - 1659228994U, // <3,6,6,7>: Cost 2 vext3 LHS, <6,6,7,7> - 1659229003U, // <3,6,6,u>: Cost 2 vext3 LHS, <6,6,u,7> - 1659229006U, // <3,6,7,0>: Cost 2 vext3 LHS, <6,7,0,1> - 2558600201U, // <3,6,7,1>: Cost 3 vext1 <1,3,6,7>, <1,3,6,7> - 2558601146U, // <3,6,7,2>: Cost 3 vext1 <1,3,6,7>, <2,6,3,7> - 2725081963U, // <3,6,7,3>: Cost 3 vext3 <6,7,3,3>, <6,7,3,3> - 1659229046U, // <3,6,7,4>: Cost 2 vext3 LHS, <6,7,4,5> - 2715423611U, // <3,6,7,5>: Cost 3 vext3 <5,1,7,3>, <6,7,5,1> - 2722059141U, // <3,6,7,6>: Cost 3 vext3 <6,2,7,3>, <6,7,6,2> - 2962361654U, // <3,6,7,7>: Cost 3 vzipr <1,5,3,7>, RHS - 1659229078U, // <3,6,7,u>: Cost 2 vext3 LHS, <6,7,u,1> - 1659229087U, // <3,6,u,0>: Cost 2 vext3 LHS, <6,u,0,1> - 2689840041U, // <3,6,u,1>: Cost 3 vext3 LHS, <6,u,1,2> - 2558609339U, // <3,6,u,2>: Cost 3 vext1 <1,3,6,u>, <2,6,3,u> - 2576525853U, // <3,6,u,3>: Cost 3 vext1 <4,3,6,u>, <3,4,u,6> - 1659229127U, // <3,6,u,4>: Cost 2 vext3 LHS, <6,u,4,5> - 2689840081U, // <3,6,u,5>: Cost 3 vext3 LHS, <6,u,5,6> - 1659228984U, // <3,6,u,6>: Cost 2 vext3 LHS, <6,6,6,6> - 1652298720U, // <3,6,u,7>: Cost 2 vext3 <6,u,7,3>, <6,u,7,3> - 1659229159U, // <3,6,u,u>: Cost 2 vext3 LHS, <6,u,u,1> - 2626813952U, // <3,7,0,0>: Cost 3 vext2 <1,5,3,7>, <0,0,0,0> - 1553072230U, // <3,7,0,1>: Cost 2 vext2 <1,5,3,7>, LHS - 2626814116U, // <3,7,0,2>: Cost 3 vext2 <1,5,3,7>, <0,2,0,2> - 3700556028U, // <3,7,0,3>: Cost 4 vext2 <1,5,3,7>, <0,3,1,0> - 2626814290U, // <3,7,0,4>: Cost 3 vext2 <1,5,3,7>, <0,4,1,5> - 2582507375U, // <3,7,0,5>: Cost 3 vext1 <5,3,7,0>, <5,3,7,0> - 2588480072U, // <3,7,0,6>: Cost 3 vext1 <6,3,7,0>, <6,3,7,0> - 2732971055U, // <3,7,0,7>: Cost 3 vext3 LHS, <7,0,7,1> - 1553072797U, // <3,7,0,u>: Cost 2 vext2 <1,5,3,7>, LHS - 2626814710U, // <3,7,1,0>: Cost 3 vext2 <1,5,3,7>, <1,0,3,2> - 2626814772U, // <3,7,1,1>: Cost 3 vext2 <1,5,3,7>, <1,1,1,1> - 2626814870U, // <3,7,1,2>: Cost 3 vext2 <1,5,3,7>, <1,2,3,0> - 2625487854U, // <3,7,1,3>: Cost 3 vext2 <1,3,3,7>, <1,3,3,7> - 2582514998U, // <3,7,1,4>: Cost 3 vext1 <5,3,7,1>, RHS - 1553073296U, // <3,7,1,5>: Cost 2 vext2 <1,5,3,7>, <1,5,3,7> - 2627478753U, // <3,7,1,6>: Cost 3 vext2 <1,6,3,7>, <1,6,3,7> - 2727367810U, // <3,7,1,7>: Cost 3 vext3 <7,1,7,3>, <7,1,7,3> - 1555064195U, // <3,7,1,u>: Cost 2 vext2 <1,u,3,7>, <1,u,3,7> - 2588491878U, // <3,7,2,0>: Cost 3 vext1 <6,3,7,2>, LHS - 3700557318U, // <3,7,2,1>: Cost 4 vext2 <1,5,3,7>, <2,1,0,3> - 2626815592U, // <3,7,2,2>: Cost 3 vext2 <1,5,3,7>, <2,2,2,2> - 2626815654U, // <3,7,2,3>: Cost 3 vext2 <1,5,3,7>, <2,3,0,1> - 2588495158U, // <3,7,2,4>: Cost 3 vext1 <6,3,7,2>, RHS - 2632787817U, // <3,7,2,5>: Cost 3 vext2 <2,5,3,7>, <2,5,3,7> - 1559709626U, // <3,7,2,6>: Cost 2 vext2 <2,6,3,7>, <2,6,3,7> - 2728031443U, // <3,7,2,7>: Cost 3 vext3 <7,2,7,3>, <7,2,7,3> - 1561036892U, // <3,7,2,u>: Cost 2 vext2 <2,u,3,7>, <2,u,3,7> - 2626816150U, // <3,7,3,0>: Cost 3 vext2 <1,5,3,7>, <3,0,1,2> - 2626816268U, // <3,7,3,1>: Cost 3 vext2 <1,5,3,7>, <3,1,5,3> - 2633451878U, // <3,7,3,2>: Cost 3 vext2 <2,6,3,7>, <3,2,6,3> - 2626816412U, // <3,7,3,3>: Cost 3 vext2 <1,5,3,7>, <3,3,3,3> - 2626816514U, // <3,7,3,4>: Cost 3 vext2 <1,5,3,7>, <3,4,5,6> - 2638760514U, // <3,7,3,5>: Cost 3 vext2 <3,5,3,7>, <3,5,3,7> - 2639424147U, // <3,7,3,6>: Cost 3 vext2 <3,6,3,7>, <3,6,3,7> - 2826961920U, // <3,7,3,7>: Cost 3 vuzpr <1,3,5,7>, <1,3,5,7> - 2626816798U, 
// <3,7,3,u>: Cost 3 vext2 <1,5,3,7>, <3,u,1,2> - 2582536294U, // <3,7,4,0>: Cost 3 vext1 <5,3,7,4>, LHS - 2582537360U, // <3,7,4,1>: Cost 3 vext1 <5,3,7,4>, <1,5,3,7> - 2588510138U, // <3,7,4,2>: Cost 3 vext1 <6,3,7,4>, <2,6,3,7> - 3700558996U, // <3,7,4,3>: Cost 4 vext2 <1,5,3,7>, <4,3,6,7> - 2582539574U, // <3,7,4,4>: Cost 3 vext1 <5,3,7,4>, RHS - 1553075510U, // <3,7,4,5>: Cost 2 vext2 <1,5,3,7>, RHS - 2588512844U, // <3,7,4,6>: Cost 3 vext1 <6,3,7,4>, <6,3,7,4> - 2564625766U, // <3,7,4,7>: Cost 3 vext1 <2,3,7,4>, <7,4,5,6> - 1553075753U, // <3,7,4,u>: Cost 2 vext2 <1,5,3,7>, RHS - 2732971398U, // <3,7,5,0>: Cost 3 vext3 LHS, <7,5,0,2> - 2626817744U, // <3,7,5,1>: Cost 3 vext2 <1,5,3,7>, <5,1,7,3> - 3700559649U, // <3,7,5,2>: Cost 4 vext2 <1,5,3,7>, <5,2,7,3> - 2626817903U, // <3,7,5,3>: Cost 3 vext2 <1,5,3,7>, <5,3,7,0> - 2258728203U, // <3,7,5,4>: Cost 3 vrev <7,3,4,5> - 2732971446U, // <3,7,5,5>: Cost 3 vext3 LHS, <7,5,5,5> - 2732971457U, // <3,7,5,6>: Cost 3 vext3 LHS, <7,5,6,7> - 2826964278U, // <3,7,5,7>: Cost 3 vuzpr <1,3,5,7>, RHS - 2826964279U, // <3,7,5,u>: Cost 3 vuzpr <1,3,5,7>, RHS - 2732971478U, // <3,7,6,0>: Cost 3 vext3 LHS, <7,6,0,1> - 2732971486U, // <3,7,6,1>: Cost 3 vext3 LHS, <7,6,1,0> - 2633454074U, // <3,7,6,2>: Cost 3 vext2 <2,6,3,7>, <6,2,7,3> - 2633454152U, // <3,7,6,3>: Cost 3 vext2 <2,6,3,7>, <6,3,7,0> - 2732971518U, // <3,7,6,4>: Cost 3 vext3 LHS, <7,6,4,5> - 2732971526U, // <3,7,6,5>: Cost 3 vext3 LHS, <7,6,5,4> - 2732971537U, // <3,7,6,6>: Cost 3 vext3 LHS, <7,6,6,6> - 2732971540U, // <3,7,6,7>: Cost 3 vext3 LHS, <7,6,7,0> - 2726041124U, // <3,7,6,u>: Cost 3 vext3 <6,u,7,3>, <7,6,u,7> - 2570616934U, // <3,7,7,0>: Cost 3 vext1 <3,3,7,7>, LHS - 2570617856U, // <3,7,7,1>: Cost 3 vext1 <3,3,7,7>, <1,3,5,7> - 2564646635U, // <3,7,7,2>: Cost 3 vext1 <2,3,7,7>, <2,3,7,7> - 2570619332U, // <3,7,7,3>: Cost 3 vext1 <3,3,7,7>, <3,3,7,7> - 2570620214U, // <3,7,7,4>: Cost 3 vext1 <3,3,7,7>, RHS - 2582564726U, // <3,7,7,5>: Cost 3 vext1 <5,3,7,7>, <5,3,7,7> - 2588537423U, // <3,7,7,6>: Cost 3 vext1 <6,3,7,7>, <6,3,7,7> - 1659229804U, // <3,7,7,7>: Cost 2 vext3 LHS, <7,7,7,7> - 1659229804U, // <3,7,7,u>: Cost 2 vext3 LHS, <7,7,7,7> - 2626819795U, // <3,7,u,0>: Cost 3 vext2 <1,5,3,7>, - 1553078062U, // <3,7,u,1>: Cost 2 vext2 <1,5,3,7>, LHS - 2626819973U, // <3,7,u,2>: Cost 3 vext2 <1,5,3,7>, - 2826961565U, // <3,7,u,3>: Cost 3 vuzpr <1,3,5,7>, LHS - 2626820159U, // <3,7,u,4>: Cost 3 vext2 <1,5,3,7>, - 1553078426U, // <3,7,u,5>: Cost 2 vext2 <1,5,3,7>, RHS - 1595545808U, // <3,7,u,6>: Cost 2 vext2 , - 1659229804U, // <3,7,u,7>: Cost 2 vext3 LHS, <7,7,7,7> - 1553078629U, // <3,7,u,u>: Cost 2 vext2 <1,5,3,7>, LHS - 1611448320U, // <3,u,0,0>: Cost 2 vext3 LHS, <0,0,0,0> - 1611896531U, // <3,u,0,1>: Cost 2 vext3 LHS, - 1659672284U, // <3,u,0,2>: Cost 2 vext3 LHS, - 1616099045U, // <3,u,0,3>: Cost 2 vext3 LHS, - 2685638381U, // <3,u,0,4>: Cost 3 vext3 LHS, - 1663874806U, // <3,u,0,5>: Cost 2 vext3 LHS, - 1663874816U, // <3,u,0,6>: Cost 2 vext3 LHS, - 2960313672U, // <3,u,0,7>: Cost 3 vzipr <1,2,3,0>, RHS - 1611896594U, // <3,u,0,u>: Cost 2 vext3 LHS, - 1549763324U, // <3,u,1,0>: Cost 2 vext2 <1,0,3,u>, <1,0,3,u> - 1550426957U, // <3,u,1,1>: Cost 2 vext2 <1,1,3,u>, <1,1,3,u> - 537712430U, // <3,u,1,2>: Cost 1 vext3 LHS, LHS - 1616541495U, // <3,u,1,3>: Cost 2 vext3 LHS, - 1490930998U, // <3,u,1,4>: Cost 2 vext1 <2,3,u,1>, RHS - 1553081489U, // <3,u,1,5>: Cost 2 vext2 <1,5,3,u>, <1,5,3,u> - 2627486946U, // <3,u,1,6>: Cost 3 vext2 <1,6,3,u>, <1,6,3,u> - 1659230043U, // 
<3,u,1,7>: Cost 2 vext3 LHS, - 537712484U, // <3,u,1,u>: Cost 1 vext3 LHS, LHS - 1611890852U, // <3,u,2,0>: Cost 2 vext3 LHS, <0,2,0,2> - 2624833102U, // <3,u,2,1>: Cost 3 vext2 <1,2,3,u>, <2,1,u,3> - 1557063287U, // <3,u,2,2>: Cost 2 vext2 <2,2,3,u>, <2,2,3,u> - 1616099205U, // <3,u,2,3>: Cost 2 vext3 LHS, - 1611890892U, // <3,u,2,4>: Cost 2 vext3 LHS, <0,2,4,6> - 2689841054U, // <3,u,2,5>: Cost 3 vext3 LHS, - 1559717819U, // <3,u,2,6>: Cost 2 vext2 <2,6,3,u>, <2,6,3,u> - 1659230124U, // <3,u,2,7>: Cost 2 vext3 LHS, - 1616541618U, // <3,u,2,u>: Cost 2 vext3 LHS, - 1611896764U, // <3,u,3,0>: Cost 2 vext3 LHS, - 1484973079U, // <3,u,3,1>: Cost 2 vext1 <1,3,u,3>, <1,3,u,3> - 2685638607U, // <3,u,3,2>: Cost 3 vext3 LHS, - 336380006U, // <3,u,3,3>: Cost 1 vdup3 LHS - 1611896804U, // <3,u,3,4>: Cost 2 vext3 LHS, - 1616541679U, // <3,u,3,5>: Cost 2 vext3 LHS, - 2690283512U, // <3,u,3,6>: Cost 3 vext3 LHS, - 2959674696U, // <3,u,3,7>: Cost 3 vzipr <1,1,3,3>, RHS - 336380006U, // <3,u,3,u>: Cost 1 vdup3 LHS - 2558722150U, // <3,u,4,0>: Cost 3 vext1 <1,3,u,4>, LHS - 1659672602U, // <3,u,4,1>: Cost 2 vext3 LHS, - 1659672612U, // <3,u,4,2>: Cost 2 vext3 LHS, - 2689841196U, // <3,u,4,3>: Cost 3 vext3 LHS, - 1659227344U, // <3,u,4,4>: Cost 2 vext3 LHS, <4,4,4,4> - 1611896895U, // <3,u,4,5>: Cost 2 vext3 LHS, - 1663875144U, // <3,u,4,6>: Cost 2 vext3 LHS, - 1659230289U, // <3,u,4,7>: Cost 2 vext3 LHS, - 1611896922U, // <3,u,4,u>: Cost 2 vext3 LHS, - 1490960486U, // <3,u,5,0>: Cost 2 vext1 <2,3,u,5>, LHS - 2689841261U, // <3,u,5,1>: Cost 3 vext3 LHS, - 1490962162U, // <3,u,5,2>: Cost 2 vext1 <2,3,u,5>, <2,3,u,5> - 1616541823U, // <3,u,5,3>: Cost 2 vext3 LHS, - 1490963766U, // <3,u,5,4>: Cost 2 vext1 <2,3,u,5>, RHS - 1659228164U, // <3,u,5,5>: Cost 2 vext3 LHS, <5,5,5,5> - 537712794U, // <3,u,5,6>: Cost 1 vext3 LHS, RHS - 1659230371U, // <3,u,5,7>: Cost 2 vext3 LHS, - 537712812U, // <3,u,5,u>: Cost 1 vext3 LHS, RHS - 2689841327U, // <3,u,6,0>: Cost 3 vext3 LHS, - 2558739482U, // <3,u,6,1>: Cost 3 vext1 <1,3,u,6>, <1,3,u,6> - 2689841351U, // <3,u,6,2>: Cost 3 vext3 LHS, - 1616099536U, // <3,u,6,3>: Cost 2 vext3 LHS, - 1659227508U, // <3,u,6,4>: Cost 2 vext3 LHS, <4,6,4,6> - 2690283746U, // <3,u,6,5>: Cost 3 vext3 LHS, - 1659228984U, // <3,u,6,6>: Cost 2 vext3 LHS, <6,6,6,6> - 1659230445U, // <3,u,6,7>: Cost 2 vext3 LHS, - 1616099581U, // <3,u,6,u>: Cost 2 vext3 LHS, - 1485004902U, // <3,u,7,0>: Cost 2 vext1 <1,3,u,7>, LHS - 1485005851U, // <3,u,7,1>: Cost 2 vext1 <1,3,u,7>, <1,3,u,7> - 2558748264U, // <3,u,7,2>: Cost 3 vext1 <1,3,u,7>, <2,2,2,2> - 3095397021U, // <3,u,7,3>: Cost 3 vtrnr <1,3,5,7>, LHS - 1485008182U, // <3,u,7,4>: Cost 2 vext1 <1,3,u,7>, RHS - 1659228328U, // <3,u,7,5>: Cost 2 vext3 LHS, <5,7,5,7> - 2722060599U, // <3,u,7,6>: Cost 3 vext3 <6,2,7,3>, - 1659229804U, // <3,u,7,7>: Cost 2 vext3 LHS, <7,7,7,7> - 1485010734U, // <3,u,7,u>: Cost 2 vext1 <1,3,u,7>, LHS - 1616099665U, // <3,u,u,0>: Cost 2 vext3 LHS, - 1611897179U, // <3,u,u,1>: Cost 2 vext3 LHS, - 537712997U, // <3,u,u,2>: Cost 1 vext3 LHS, LHS - 336380006U, // <3,u,u,3>: Cost 1 vdup3 LHS - 1616099705U, // <3,u,u,4>: Cost 2 vext3 LHS, - 1611897219U, // <3,u,u,5>: Cost 2 vext3 LHS, - 537713037U, // <3,u,u,6>: Cost 1 vext3 LHS, RHS - 1659230607U, // <3,u,u,7>: Cost 2 vext3 LHS, - 537713051U, // <3,u,u,u>: Cost 1 vext3 LHS, LHS - 2691907584U, // <4,0,0,0>: Cost 3 vext3 <1,2,3,4>, <0,0,0,0> - 2691907594U, // <4,0,0,1>: Cost 3 vext3 <1,2,3,4>, <0,0,1,1> - 2691907604U, // <4,0,0,2>: Cost 3 vext3 <1,2,3,4>, <0,0,2,2> - 3709862144U, // 
<4,0,0,3>: Cost 4 vext2 <3,1,4,0>, <0,3,1,4> - 2684682280U, // <4,0,0,4>: Cost 3 vext3 <0,0,4,4>, <0,0,4,4> - 3694600633U, // <4,0,0,5>: Cost 4 vext2 <0,5,4,0>, <0,5,4,0> - 3291431290U, // <4,0,0,6>: Cost 4 vrev <0,4,6,0> - 3668342067U, // <4,0,0,7>: Cost 4 vext1 <7,4,0,0>, <7,4,0,0> - 2691907657U, // <4,0,0,u>: Cost 3 vext3 <1,2,3,4>, <0,0,u,1> - 2570715238U, // <4,0,1,0>: Cost 3 vext1 <3,4,0,1>, LHS - 2570716058U, // <4,0,1,1>: Cost 3 vext1 <3,4,0,1>, <1,2,3,4> - 1618165862U, // <4,0,1,2>: Cost 2 vext3 <1,2,3,4>, LHS - 2570717648U, // <4,0,1,3>: Cost 3 vext1 <3,4,0,1>, <3,4,0,1> - 2570718518U, // <4,0,1,4>: Cost 3 vext1 <3,4,0,1>, RHS - 2594607206U, // <4,0,1,5>: Cost 3 vext1 <7,4,0,1>, <5,6,7,4> - 3662377563U, // <4,0,1,6>: Cost 4 vext1 <6,4,0,1>, <6,4,0,1> - 2594608436U, // <4,0,1,7>: Cost 3 vext1 <7,4,0,1>, <7,4,0,1> - 1618165916U, // <4,0,1,u>: Cost 2 vext3 <1,2,3,4>, LHS - 2685714598U, // <4,0,2,0>: Cost 3 vext3 <0,2,0,4>, <0,2,0,4> - 3759530159U, // <4,0,2,1>: Cost 4 vext3 <0,2,1,4>, <0,2,1,4> - 2685862072U, // <4,0,2,2>: Cost 3 vext3 <0,2,2,4>, <0,2,2,4> - 2631476937U, // <4,0,2,3>: Cost 3 vext2 <2,3,4,0>, <2,3,4,0> - 2685714636U, // <4,0,2,4>: Cost 3 vext3 <0,2,0,4>, <0,2,4,6> - 3765649622U, // <4,0,2,5>: Cost 4 vext3 <1,2,3,4>, <0,2,5,7> - 2686157020U, // <4,0,2,6>: Cost 3 vext3 <0,2,6,4>, <0,2,6,4> - 3668358453U, // <4,0,2,7>: Cost 4 vext1 <7,4,0,2>, <7,4,0,2> - 2686304494U, // <4,0,2,u>: Cost 3 vext3 <0,2,u,4>, <0,2,u,4> - 3632529510U, // <4,0,3,0>: Cost 4 vext1 <1,4,0,3>, LHS - 2686451968U, // <4,0,3,1>: Cost 3 vext3 <0,3,1,4>, <0,3,1,4> - 2686525705U, // <4,0,3,2>: Cost 3 vext3 <0,3,2,4>, <0,3,2,4> - 3760341266U, // <4,0,3,3>: Cost 4 vext3 <0,3,3,4>, <0,3,3,4> - 3632532790U, // <4,0,3,4>: Cost 4 vext1 <1,4,0,3>, RHS - 3913254606U, // <4,0,3,5>: Cost 4 vuzpr <3,4,5,0>, <2,3,4,5> - 3705219740U, // <4,0,3,6>: Cost 4 vext2 <2,3,4,0>, <3,6,4,7> - 3713845990U, // <4,0,3,7>: Cost 4 vext2 <3,7,4,0>, <3,7,4,0> - 2686451968U, // <4,0,3,u>: Cost 3 vext3 <0,3,1,4>, <0,3,1,4> - 2552823910U, // <4,0,4,0>: Cost 3 vext1 <0,4,0,4>, LHS - 2691907922U, // <4,0,4,1>: Cost 3 vext3 <1,2,3,4>, <0,4,1,5> - 2691907932U, // <4,0,4,2>: Cost 3 vext3 <1,2,3,4>, <0,4,2,6> - 3626567830U, // <4,0,4,3>: Cost 4 vext1 <0,4,0,4>, <3,0,1,2> - 2552827190U, // <4,0,4,4>: Cost 3 vext1 <0,4,0,4>, RHS - 2631478582U, // <4,0,4,5>: Cost 3 vext2 <2,3,4,0>, RHS - 3626570017U, // <4,0,4,6>: Cost 4 vext1 <0,4,0,4>, <6,0,1,2> - 3668374839U, // <4,0,4,7>: Cost 4 vext1 <7,4,0,4>, <7,4,0,4> - 2552829742U, // <4,0,4,u>: Cost 3 vext1 <0,4,0,4>, LHS - 2558804070U, // <4,0,5,0>: Cost 3 vext1 <1,4,0,5>, LHS - 1839644774U, // <4,0,5,1>: Cost 2 vzipl RHS, LHS - 2913386660U, // <4,0,5,2>: Cost 3 vzipl RHS, <0,2,0,2> - 2570750420U, // <4,0,5,3>: Cost 3 vext1 <3,4,0,5>, <3,4,0,5> - 2558807350U, // <4,0,5,4>: Cost 3 vext1 <1,4,0,5>, RHS - 3987128750U, // <4,0,5,5>: Cost 4 vzipl RHS, <0,5,2,7> - 3987128822U, // <4,0,5,6>: Cost 4 vzipl RHS, <0,6,1,7> - 2594641208U, // <4,0,5,7>: Cost 3 vext1 <7,4,0,5>, <7,4,0,5> - 1839645341U, // <4,0,5,u>: Cost 2 vzipl RHS, LHS - 2552840294U, // <4,0,6,0>: Cost 3 vext1 <0,4,0,6>, LHS - 3047604234U, // <4,0,6,1>: Cost 3 vtrnl RHS, <0,0,1,1> - 1973862502U, // <4,0,6,2>: Cost 2 vtrnl RHS, LHS - 2570758613U, // <4,0,6,3>: Cost 3 vext1 <3,4,0,6>, <3,4,0,6> - 2552843574U, // <4,0,6,4>: Cost 3 vext1 <0,4,0,6>, RHS - 2217664887U, // <4,0,6,5>: Cost 3 vrev <0,4,5,6> - 3662418528U, // <4,0,6,6>: Cost 4 vext1 <6,4,0,6>, <6,4,0,6> - 2658022257U, // <4,0,6,7>: Cost 3 vext2 <6,7,4,0>, <6,7,4,0> - 1973862556U, // 
<4,0,6,u>: Cost 2 vtrnl RHS, LHS - 3731764218U, // <4,0,7,0>: Cost 4 vext2 <6,7,4,0>, <7,0,1,2> - 3988324454U, // <4,0,7,1>: Cost 4 vzipl <4,7,5,0>, LHS - 4122034278U, // <4,0,7,2>: Cost 4 vtrnl <4,6,7,1>, LHS - 3735082246U, // <4,0,7,3>: Cost 4 vext2 <7,3,4,0>, <7,3,4,0> - 3731764536U, // <4,0,7,4>: Cost 4 vext2 <6,7,4,0>, <7,4,0,5> - 3937145718U, // <4,0,7,5>: Cost 4 vuzpr <7,4,5,0>, <6,7,4,5> - 3737073145U, // <4,0,7,6>: Cost 4 vext2 <7,6,4,0>, <7,6,4,0> - 3731764844U, // <4,0,7,7>: Cost 4 vext2 <6,7,4,0>, <7,7,7,7> - 4122034332U, // <4,0,7,u>: Cost 4 vtrnl <4,6,7,1>, LHS - 2552856678U, // <4,0,u,0>: Cost 3 vext1 <0,4,0,u>, LHS - 1841635430U, // <4,0,u,1>: Cost 2 vzipl RHS, LHS - 1618166429U, // <4,0,u,2>: Cost 2 vext3 <1,2,3,4>, LHS - 2570774999U, // <4,0,u,3>: Cost 3 vext1 <3,4,0,u>, <3,4,0,u> - 2552859958U, // <4,0,u,4>: Cost 3 vext1 <0,4,0,u>, RHS - 2631481498U, // <4,0,u,5>: Cost 3 vext2 <2,3,4,0>, RHS - 2686157020U, // <4,0,u,6>: Cost 3 vext3 <0,2,6,4>, <0,2,6,4> - 2594665787U, // <4,0,u,7>: Cost 3 vext1 <7,4,0,u>, <7,4,0,u> - 1618166483U, // <4,0,u,u>: Cost 2 vext3 <1,2,3,4>, LHS - 2617548837U, // <4,1,0,0>: Cost 3 vext2 <0,0,4,1>, <0,0,4,1> - 2622857318U, // <4,1,0,1>: Cost 3 vext2 <0,u,4,1>, LHS - 3693281484U, // <4,1,0,2>: Cost 4 vext2 <0,3,4,1>, <0,2,4,6> - 2691908342U, // <4,1,0,3>: Cost 3 vext3 <1,2,3,4>, <1,0,3,2> - 2622857554U, // <4,1,0,4>: Cost 3 vext2 <0,u,4,1>, <0,4,1,5> - 3764470538U, // <4,1,0,5>: Cost 4 vext3 <1,0,5,4>, <1,0,5,4> - 3695272459U, // <4,1,0,6>: Cost 4 vext2 <0,6,4,1>, <0,6,4,1> - 3733094980U, // <4,1,0,7>: Cost 4 vext2 <7,0,4,1>, <0,7,1,4> - 2622857885U, // <4,1,0,u>: Cost 3 vext2 <0,u,4,1>, LHS - 3696599798U, // <4,1,1,0>: Cost 4 vext2 <0,u,4,1>, <1,0,3,2> - 2691097399U, // <4,1,1,1>: Cost 3 vext3 <1,1,1,4>, <1,1,1,4> - 2631484314U, // <4,1,1,2>: Cost 3 vext2 <2,3,4,1>, <1,2,3,4> - 2691908424U, // <4,1,1,3>: Cost 3 vext3 <1,2,3,4>, <1,1,3,3> - 3696600125U, // <4,1,1,4>: Cost 4 vext2 <0,u,4,1>, <1,4,3,5> - 3696600175U, // <4,1,1,5>: Cost 4 vext2 <0,u,4,1>, <1,5,0,1> - 3696600307U, // <4,1,1,6>: Cost 4 vext2 <0,u,4,1>, <1,6,5,7> - 3668423997U, // <4,1,1,7>: Cost 4 vext1 <7,4,1,1>, <7,4,1,1> - 2691908469U, // <4,1,1,u>: Cost 3 vext3 <1,2,3,4>, <1,1,u,3> - 2570797158U, // <4,1,2,0>: Cost 3 vext1 <3,4,1,2>, LHS - 2570797978U, // <4,1,2,1>: Cost 3 vext1 <3,4,1,2>, <1,2,3,4> - 3696600680U, // <4,1,2,2>: Cost 4 vext2 <0,u,4,1>, <2,2,2,2> - 1618166682U, // <4,1,2,3>: Cost 2 vext3 <1,2,3,4>, <1,2,3,4> - 2570800438U, // <4,1,2,4>: Cost 3 vext1 <3,4,1,2>, RHS - 3765650347U, // <4,1,2,5>: Cost 4 vext3 <1,2,3,4>, <1,2,5,3> - 3696601018U, // <4,1,2,6>: Cost 4 vext2 <0,u,4,1>, <2,6,3,7> - 3668432190U, // <4,1,2,7>: Cost 4 vext1 <7,4,1,2>, <7,4,1,2> - 1618535367U, // <4,1,2,u>: Cost 2 vext3 <1,2,u,4>, <1,2,u,4> - 2564833382U, // <4,1,3,0>: Cost 3 vext1 <2,4,1,3>, LHS - 2691908568U, // <4,1,3,1>: Cost 3 vext3 <1,2,3,4>, <1,3,1,3> - 2691908578U, // <4,1,3,2>: Cost 3 vext3 <1,2,3,4>, <1,3,2,4> - 2692572139U, // <4,1,3,3>: Cost 3 vext3 <1,3,3,4>, <1,3,3,4> - 2564836662U, // <4,1,3,4>: Cost 3 vext1 <2,4,1,3>, RHS - 2691908608U, // <4,1,3,5>: Cost 3 vext3 <1,2,3,4>, <1,3,5,7> - 2588725862U, // <4,1,3,6>: Cost 3 vext1 <6,4,1,3>, <6,4,1,3> - 3662468090U, // <4,1,3,7>: Cost 4 vext1 <6,4,1,3>, <7,0,1,2> - 2691908631U, // <4,1,3,u>: Cost 3 vext3 <1,2,3,4>, <1,3,u,3> - 3760194590U, // <4,1,4,0>: Cost 4 vext3 <0,3,1,4>, <1,4,0,1> - 3693947874U, // <4,1,4,1>: Cost 4 vext2 <0,4,4,1>, <4,1,5,0> - 3765650484U, // <4,1,4,2>: Cost 4 vext3 <1,2,3,4>, <1,4,2,5> - 3113877606U, // 
<4,1,4,3>: Cost 3 vtrnr <4,4,4,4>, LHS - 3760194630U, // <4,1,4,4>: Cost 4 vext3 <0,3,1,4>, <1,4,4,5> - 2622860598U, // <4,1,4,5>: Cost 3 vext2 <0,u,4,1>, RHS - 3297436759U, // <4,1,4,6>: Cost 4 vrev <1,4,6,4> - 3800007772U, // <4,1,4,7>: Cost 4 vext3 <7,0,1,4>, <1,4,7,0> - 2622860841U, // <4,1,4,u>: Cost 3 vext2 <0,u,4,1>, RHS - 1479164006U, // <4,1,5,0>: Cost 2 vext1 <0,4,1,5>, LHS - 2552906486U, // <4,1,5,1>: Cost 3 vext1 <0,4,1,5>, <1,0,3,2> - 2552907299U, // <4,1,5,2>: Cost 3 vext1 <0,4,1,5>, <2,1,3,5> - 2552907926U, // <4,1,5,3>: Cost 3 vext1 <0,4,1,5>, <3,0,1,2> - 1479167286U, // <4,1,5,4>: Cost 2 vext1 <0,4,1,5>, RHS - 2913387664U, // <4,1,5,5>: Cost 3 vzipl RHS, <1,5,3,7> - 2600686074U, // <4,1,5,6>: Cost 3 vext1 , <6,2,7,3> - 2600686586U, // <4,1,5,7>: Cost 3 vext1 , <7,0,1,2> - 1479169838U, // <4,1,5,u>: Cost 2 vext1 <0,4,1,5>, LHS - 2552914022U, // <4,1,6,0>: Cost 3 vext1 <0,4,1,6>, LHS - 2558886708U, // <4,1,6,1>: Cost 3 vext1 <1,4,1,6>, <1,1,1,1> - 4028205206U, // <4,1,6,2>: Cost 4 vzipr <0,2,4,6>, <3,0,1,2> - 3089858662U, // <4,1,6,3>: Cost 3 vtrnr <0,4,2,6>, LHS - 2552917302U, // <4,1,6,4>: Cost 3 vext1 <0,4,1,6>, RHS - 2223637584U, // <4,1,6,5>: Cost 3 vrev <1,4,5,6> - 4121347081U, // <4,1,6,6>: Cost 4 vtrnl RHS, <1,3,6,7> - 3721155406U, // <4,1,6,7>: Cost 4 vext2 <5,0,4,1>, <6,7,0,1> - 2552919854U, // <4,1,6,u>: Cost 3 vext1 <0,4,1,6>, LHS - 2659357716U, // <4,1,7,0>: Cost 3 vext2 <7,0,4,1>, <7,0,4,1> - 3733763173U, // <4,1,7,1>: Cost 4 vext2 <7,1,4,1>, <7,1,4,1> - 3734426806U, // <4,1,7,2>: Cost 4 vext2 <7,2,4,1>, <7,2,4,1> - 2695226671U, // <4,1,7,3>: Cost 3 vext3 <1,7,3,4>, <1,7,3,4> - 3721155942U, // <4,1,7,4>: Cost 4 vext2 <5,0,4,1>, <7,4,5,6> - 3721155976U, // <4,1,7,5>: Cost 4 vext2 <5,0,4,1>, <7,5,0,4> - 3662500458U, // <4,1,7,6>: Cost 4 vext1 <6,4,1,7>, <6,4,1,7> - 3721156204U, // <4,1,7,7>: Cost 4 vext2 <5,0,4,1>, <7,7,7,7> - 2659357716U, // <4,1,7,u>: Cost 3 vext2 <7,0,4,1>, <7,0,4,1> - 1479188582U, // <4,1,u,0>: Cost 2 vext1 <0,4,1,u>, LHS - 2552931062U, // <4,1,u,1>: Cost 3 vext1 <0,4,1,u>, <1,0,3,2> - 2552931944U, // <4,1,u,2>: Cost 3 vext1 <0,4,1,u>, <2,2,2,2> - 1622148480U, // <4,1,u,3>: Cost 2 vext3 <1,u,3,4>, <1,u,3,4> - 1479191862U, // <4,1,u,4>: Cost 2 vext1 <0,4,1,u>, RHS - 2622863514U, // <4,1,u,5>: Cost 3 vext2 <0,u,4,1>, RHS - 2588725862U, // <4,1,u,6>: Cost 3 vext1 <6,4,1,3>, <6,4,1,3> - 2600686586U, // <4,1,u,7>: Cost 3 vext1 , <7,0,1,2> - 1479194414U, // <4,1,u,u>: Cost 2 vext1 <0,4,1,u>, LHS - 2617557030U, // <4,2,0,0>: Cost 3 vext2 <0,0,4,2>, <0,0,4,2> - 2622865510U, // <4,2,0,1>: Cost 3 vext2 <0,u,4,2>, LHS - 2622865612U, // <4,2,0,2>: Cost 3 vext2 <0,u,4,2>, <0,2,4,6> - 3693289753U, // <4,2,0,3>: Cost 4 vext2 <0,3,4,2>, <0,3,4,2> - 2635473244U, // <4,2,0,4>: Cost 3 vext2 <3,0,4,2>, <0,4,2,6> - 3765650918U, // <4,2,0,5>: Cost 4 vext3 <1,2,3,4>, <2,0,5,7> - 2696775148U, // <4,2,0,6>: Cost 3 vext3 <2,0,6,4>, <2,0,6,4> - 3695944285U, // <4,2,0,7>: Cost 4 vext2 <0,7,4,2>, <0,7,4,2> - 2622866077U, // <4,2,0,u>: Cost 3 vext2 <0,u,4,2>, LHS - 3696607990U, // <4,2,1,0>: Cost 4 vext2 <0,u,4,2>, <1,0,3,2> - 3696608052U, // <4,2,1,1>: Cost 4 vext2 <0,u,4,2>, <1,1,1,1> - 3696608150U, // <4,2,1,2>: Cost 4 vext2 <0,u,4,2>, <1,2,3,0> - 3895574630U, // <4,2,1,3>: Cost 4 vuzpr <0,4,u,2>, LHS - 2691909162U, // <4,2,1,4>: Cost 3 vext3 <1,2,3,4>, <2,1,4,3> - 3696608400U, // <4,2,1,5>: Cost 4 vext2 <0,u,4,2>, <1,5,3,7> - 3760784956U, // <4,2,1,6>: Cost 4 vext3 <0,4,0,4>, <2,1,6,3> - 3773908549U, // <4,2,1,7>: Cost 5 vext3 <2,5,7,4>, <2,1,7,3> - 2691909162U, // 
<4,2,1,u>: Cost 3 vext3 <1,2,3,4>, <2,1,4,3> - 3696608748U, // <4,2,2,0>: Cost 4 vext2 <0,u,4,2>, <2,0,6,4> - 3696608828U, // <4,2,2,1>: Cost 4 vext2 <0,u,4,2>, <2,1,6,3> - 2691909224U, // <4,2,2,2>: Cost 3 vext3 <1,2,3,4>, <2,2,2,2> - 2691909234U, // <4,2,2,3>: Cost 3 vext3 <1,2,3,4>, <2,2,3,3> - 3759605368U, // <4,2,2,4>: Cost 4 vext3 <0,2,2,4>, <2,2,4,0> - 3696609156U, // <4,2,2,5>: Cost 4 vext2 <0,u,4,2>, <2,5,6,7> - 3760785040U, // <4,2,2,6>: Cost 4 vext3 <0,4,0,4>, <2,2,6,6> - 3668505927U, // <4,2,2,7>: Cost 4 vext1 <7,4,2,2>, <7,4,2,2> - 2691909279U, // <4,2,2,u>: Cost 3 vext3 <1,2,3,4>, <2,2,u,3> - 2691909286U, // <4,2,3,0>: Cost 3 vext3 <1,2,3,4>, <2,3,0,1> - 3764840111U, // <4,2,3,1>: Cost 4 vext3 <1,1,1,4>, <2,3,1,1> - 3765651129U, // <4,2,3,2>: Cost 4 vext3 <1,2,3,4>, <2,3,2,2> - 2698544836U, // <4,2,3,3>: Cost 3 vext3 <2,3,3,4>, <2,3,3,4> - 2685863630U, // <4,2,3,4>: Cost 3 vext3 <0,2,2,4>, <2,3,4,5> - 2698692310U, // <4,2,3,5>: Cost 3 vext3 <2,3,5,4>, <2,3,5,4> - 3772507871U, // <4,2,3,6>: Cost 4 vext3 <2,3,6,4>, <2,3,6,4> - 2698839784U, // <4,2,3,7>: Cost 3 vext3 <2,3,7,4>, <2,3,7,4> - 2691909358U, // <4,2,3,u>: Cost 3 vext3 <1,2,3,4>, <2,3,u,1> - 2564915302U, // <4,2,4,0>: Cost 3 vext1 <2,4,2,4>, LHS - 2564916122U, // <4,2,4,1>: Cost 3 vext1 <2,4,2,4>, <1,2,3,4> - 2564917004U, // <4,2,4,2>: Cost 3 vext1 <2,4,2,4>, <2,4,2,4> - 2699208469U, // <4,2,4,3>: Cost 3 vext3 <2,4,3,4>, <2,4,3,4> - 2564918582U, // <4,2,4,4>: Cost 3 vext1 <2,4,2,4>, RHS - 2622868790U, // <4,2,4,5>: Cost 3 vext2 <0,u,4,2>, RHS - 2229667632U, // <4,2,4,6>: Cost 3 vrev <2,4,6,4> - 3800082229U, // <4,2,4,7>: Cost 4 vext3 <7,0,2,4>, <2,4,7,0> - 2622869033U, // <4,2,4,u>: Cost 3 vext2 <0,u,4,2>, RHS - 2552979558U, // <4,2,5,0>: Cost 3 vext1 <0,4,2,5>, LHS - 2558952342U, // <4,2,5,1>: Cost 3 vext1 <1,4,2,5>, <1,2,3,0> - 2564925032U, // <4,2,5,2>: Cost 3 vext1 <2,4,2,5>, <2,2,2,2> - 2967060582U, // <4,2,5,3>: Cost 3 vzipr <2,3,4,5>, LHS - 2552982838U, // <4,2,5,4>: Cost 3 vext1 <0,4,2,5>, RHS - 3987130190U, // <4,2,5,5>: Cost 4 vzipl RHS, <2,5,0,7> - 2913388474U, // <4,2,5,6>: Cost 3 vzipl RHS, <2,6,3,7> - 3895577910U, // <4,2,5,7>: Cost 4 vuzpr <0,4,u,2>, RHS - 2552985390U, // <4,2,5,u>: Cost 3 vext1 <0,4,2,5>, LHS - 1479245926U, // <4,2,6,0>: Cost 2 vext1 <0,4,2,6>, LHS - 2552988406U, // <4,2,6,1>: Cost 3 vext1 <0,4,2,6>, <1,0,3,2> - 2552989288U, // <4,2,6,2>: Cost 3 vext1 <0,4,2,6>, <2,2,2,2> - 2954461286U, // <4,2,6,3>: Cost 3 vzipr <0,2,4,6>, LHS - 1479249206U, // <4,2,6,4>: Cost 2 vext1 <0,4,2,6>, RHS - 2229610281U, // <4,2,6,5>: Cost 3 vrev <2,4,5,6> - 2600767994U, // <4,2,6,6>: Cost 3 vext1 , <6,2,7,3> - 2600768506U, // <4,2,6,7>: Cost 3 vext1 , <7,0,1,2> - 1479251758U, // <4,2,6,u>: Cost 2 vext1 <0,4,2,6>, LHS - 2659365909U, // <4,2,7,0>: Cost 3 vext2 <7,0,4,2>, <7,0,4,2> - 3733771366U, // <4,2,7,1>: Cost 4 vext2 <7,1,4,2>, <7,1,4,2> - 3734434999U, // <4,2,7,2>: Cost 4 vext2 <7,2,4,2>, <7,2,4,2> - 2701199368U, // <4,2,7,3>: Cost 3 vext3 <2,7,3,4>, <2,7,3,4> - 4175774618U, // <4,2,7,4>: Cost 4 vtrnr <2,4,5,7>, <1,2,3,4> - 3303360298U, // <4,2,7,5>: Cost 4 vrev <2,4,5,7> - 3727136217U, // <4,2,7,6>: Cost 4 vext2 <6,0,4,2>, <7,6,0,4> - 3727136364U, // <4,2,7,7>: Cost 4 vext2 <6,0,4,2>, <7,7,7,7> - 2659365909U, // <4,2,7,u>: Cost 3 vext2 <7,0,4,2>, <7,0,4,2> - 1479262310U, // <4,2,u,0>: Cost 2 vext1 <0,4,2,u>, LHS - 2553004790U, // <4,2,u,1>: Cost 3 vext1 <0,4,2,u>, <1,0,3,2> - 2553005672U, // <4,2,u,2>: Cost 3 vext1 <0,4,2,u>, <2,2,2,2> - 2954477670U, // <4,2,u,3>: Cost 3 vzipr <0,2,4,u>, LHS - 
1479265590U, // <4,2,u,4>: Cost 2 vext1 <0,4,2,u>, RHS - 2622871706U, // <4,2,u,5>: Cost 3 vext2 <0,u,4,2>, RHS - 2229700404U, // <4,2,u,6>: Cost 3 vrev <2,4,6,u> - 2600784890U, // <4,2,u,7>: Cost 3 vext1 , <7,0,1,2> - 1479268142U, // <4,2,u,u>: Cost 2 vext1 <0,4,2,u>, LHS - 3765651595U, // <4,3,0,0>: Cost 4 vext3 <1,2,3,4>, <3,0,0,0> - 2691909782U, // <4,3,0,1>: Cost 3 vext3 <1,2,3,4>, <3,0,1,2> - 2702452897U, // <4,3,0,2>: Cost 3 vext3 <3,0,2,4>, <3,0,2,4> - 3693297946U, // <4,3,0,3>: Cost 4 vext2 <0,3,4,3>, <0,3,4,3> - 3760711856U, // <4,3,0,4>: Cost 4 vext3 <0,3,u,4>, <3,0,4,1> - 2235533820U, // <4,3,0,5>: Cost 3 vrev <3,4,5,0> - 3309349381U, // <4,3,0,6>: Cost 4 vrev <3,4,6,0> - 3668563278U, // <4,3,0,7>: Cost 4 vext1 <7,4,3,0>, <7,4,3,0> - 2691909845U, // <4,3,0,u>: Cost 3 vext3 <1,2,3,4>, <3,0,u,2> - 2235173328U, // <4,3,1,0>: Cost 3 vrev <3,4,0,1> - 3764840678U, // <4,3,1,1>: Cost 4 vext3 <1,1,1,4>, <3,1,1,1> - 2630173594U, // <4,3,1,2>: Cost 3 vext2 <2,1,4,3>, <1,2,3,4> - 2703190267U, // <4,3,1,3>: Cost 3 vext3 <3,1,3,4>, <3,1,3,4> - 3760195840U, // <4,3,1,4>: Cost 4 vext3 <0,3,1,4>, <3,1,4,0> - 3765651724U, // <4,3,1,5>: Cost 4 vext3 <1,2,3,4>, <3,1,5,3> - 3309357574U, // <4,3,1,6>: Cost 4 vrev <3,4,6,1> - 3769633054U, // <4,3,1,7>: Cost 4 vext3 <1,u,3,4>, <3,1,7,3> - 2703558952U, // <4,3,1,u>: Cost 3 vext3 <3,1,u,4>, <3,1,u,4> - 3626770534U, // <4,3,2,0>: Cost 4 vext1 <0,4,3,2>, LHS - 2630174250U, // <4,3,2,1>: Cost 3 vext2 <2,1,4,3>, <2,1,4,3> - 3765651777U, // <4,3,2,2>: Cost 4 vext3 <1,2,3,4>, <3,2,2,2> - 2703853900U, // <4,3,2,3>: Cost 3 vext3 <3,2,3,4>, <3,2,3,4> - 3626773814U, // <4,3,2,4>: Cost 4 vext1 <0,4,3,2>, RHS - 2704001374U, // <4,3,2,5>: Cost 3 vext3 <3,2,5,4>, <3,2,5,4> - 3765651814U, // <4,3,2,6>: Cost 4 vext3 <1,2,3,4>, <3,2,6,3> - 3769633135U, // <4,3,2,7>: Cost 4 vext3 <1,u,3,4>, <3,2,7,3> - 2634819681U, // <4,3,2,u>: Cost 3 vext2 <2,u,4,3>, <2,u,4,3> - 3765651839U, // <4,3,3,0>: Cost 4 vext3 <1,2,3,4>, <3,3,0,1> - 3765651848U, // <4,3,3,1>: Cost 4 vext3 <1,2,3,4>, <3,3,1,1> - 3710552404U, // <4,3,3,2>: Cost 4 vext2 <3,2,4,3>, <3,2,4,3> - 2691910044U, // <4,3,3,3>: Cost 3 vext3 <1,2,3,4>, <3,3,3,3> - 2704591270U, // <4,3,3,4>: Cost 3 vext3 <3,3,4,4>, <3,3,4,4> - 3769633202U, // <4,3,3,5>: Cost 4 vext3 <1,u,3,4>, <3,3,5,7> - 3703917212U, // <4,3,3,6>: Cost 4 vext2 <2,1,4,3>, <3,6,4,7> - 3769633220U, // <4,3,3,7>: Cost 4 vext3 <1,u,3,4>, <3,3,7,7> - 2691910044U, // <4,3,3,u>: Cost 3 vext3 <1,2,3,4>, <3,3,3,3> - 2691910096U, // <4,3,4,0>: Cost 3 vext3 <1,2,3,4>, <3,4,0,1> - 2691910106U, // <4,3,4,1>: Cost 3 vext3 <1,2,3,4>, <3,4,1,2> - 2564990741U, // <4,3,4,2>: Cost 3 vext1 <2,4,3,4>, <2,4,3,4> - 3765651946U, // <4,3,4,3>: Cost 4 vext3 <1,2,3,4>, <3,4,3,0> - 2691910136U, // <4,3,4,4>: Cost 3 vext3 <1,2,3,4>, <3,4,4,5> - 2686454274U, // <4,3,4,5>: Cost 3 vext3 <0,3,1,4>, <3,4,5,6> - 2235640329U, // <4,3,4,6>: Cost 3 vrev <3,4,6,4> - 3801483792U, // <4,3,4,7>: Cost 4 vext3 <7,2,3,4>, <3,4,7,2> - 2691910168U, // <4,3,4,u>: Cost 3 vext3 <1,2,3,4>, <3,4,u,1> - 2559025254U, // <4,3,5,0>: Cost 3 vext1 <1,4,3,5>, LHS - 2559026237U, // <4,3,5,1>: Cost 3 vext1 <1,4,3,5>, <1,4,3,5> - 2564998862U, // <4,3,5,2>: Cost 3 vext1 <2,4,3,5>, <2,3,4,5> - 2570971548U, // <4,3,5,3>: Cost 3 vext1 <3,4,3,5>, <3,3,3,3> - 2559028534U, // <4,3,5,4>: Cost 3 vext1 <1,4,3,5>, RHS - 4163519477U, // <4,3,5,5>: Cost 4 vtrnr <0,4,1,5>, <1,3,4,5> - 3309390346U, // <4,3,5,6>: Cost 4 vrev <3,4,6,5> - 2706139747U, // <4,3,5,7>: Cost 3 vext3 <3,5,7,4>, <3,5,7,4> - 2559031086U, // <4,3,5,u>: Cost 3 
vext1 <1,4,3,5>, LHS - 2559033446U, // <4,3,6,0>: Cost 3 vext1 <1,4,3,6>, LHS - 2559034430U, // <4,3,6,1>: Cost 3 vext1 <1,4,3,6>, <1,4,3,6> - 2565007127U, // <4,3,6,2>: Cost 3 vext1 <2,4,3,6>, <2,4,3,6> - 2570979740U, // <4,3,6,3>: Cost 3 vext1 <3,4,3,6>, <3,3,3,3> - 2559036726U, // <4,3,6,4>: Cost 3 vext1 <1,4,3,6>, RHS - 1161841154U, // <4,3,6,5>: Cost 2 vrev <3,4,5,6> - 4028203932U, // <4,3,6,6>: Cost 4 vzipr <0,2,4,6>, <1,2,3,6> - 2706803380U, // <4,3,6,7>: Cost 3 vext3 <3,6,7,4>, <3,6,7,4> - 1162062365U, // <4,3,6,u>: Cost 2 vrev <3,4,u,6> - 3769633475U, // <4,3,7,0>: Cost 4 vext3 <1,u,3,4>, <3,7,0,1> - 3769633488U, // <4,3,7,1>: Cost 4 vext3 <1,u,3,4>, <3,7,1,5> - 3638757144U, // <4,3,7,2>: Cost 4 vext1 <2,4,3,7>, <2,4,3,7> - 3769633508U, // <4,3,7,3>: Cost 4 vext3 <1,u,3,4>, <3,7,3,7> - 3769633515U, // <4,3,7,4>: Cost 4 vext3 <1,u,3,4>, <3,7,4,5> - 3769633526U, // <4,3,7,5>: Cost 4 vext3 <1,u,3,4>, <3,7,5,7> - 3662647932U, // <4,3,7,6>: Cost 4 vext1 <6,4,3,7>, <6,4,3,7> - 3781208837U, // <4,3,7,7>: Cost 4 vext3 <3,7,7,4>, <3,7,7,4> - 3769633547U, // <4,3,7,u>: Cost 4 vext3 <1,u,3,4>, <3,7,u,1> - 2559049830U, // <4,3,u,0>: Cost 3 vext1 <1,4,3,u>, LHS - 2691910430U, // <4,3,u,1>: Cost 3 vext3 <1,2,3,4>, <3,u,1,2> - 2565023513U, // <4,3,u,2>: Cost 3 vext1 <2,4,3,u>, <2,4,3,u> - 2707835698U, // <4,3,u,3>: Cost 3 vext3 <3,u,3,4>, <3,u,3,4> - 2559053110U, // <4,3,u,4>: Cost 3 vext1 <1,4,3,u>, RHS - 1161857540U, // <4,3,u,5>: Cost 2 vrev <3,4,5,u> - 2235673101U, // <4,3,u,6>: Cost 3 vrev <3,4,6,u> - 2708130646U, // <4,3,u,7>: Cost 3 vext3 <3,u,7,4>, <3,u,7,4> - 1162078751U, // <4,3,u,u>: Cost 2 vrev <3,4,u,u> - 2617573416U, // <4,4,0,0>: Cost 3 vext2 <0,0,4,4>, <0,0,4,4> - 1570373734U, // <4,4,0,1>: Cost 2 vext2 <4,4,4,4>, LHS - 2779676774U, // <4,4,0,2>: Cost 3 vuzpl <4,6,4,6>, LHS - 3760196480U, // <4,4,0,3>: Cost 4 vext3 <0,3,1,4>, <4,0,3,1> - 2576977100U, // <4,4,0,4>: Cost 3 vext1 <4,4,4,0>, <4,4,4,0> - 2718747538U, // <4,4,0,5>: Cost 3 vext3 <5,6,7,4>, <4,0,5,1> - 2718747548U, // <4,4,0,6>: Cost 3 vext3 <5,6,7,4>, <4,0,6,2> - 3668637015U, // <4,4,0,7>: Cost 4 vext1 <7,4,4,0>, <7,4,4,0> - 1570374301U, // <4,4,0,u>: Cost 2 vext2 <4,4,4,4>, LHS - 2644116214U, // <4,4,1,0>: Cost 3 vext2 <4,4,4,4>, <1,0,3,2> - 2644116276U, // <4,4,1,1>: Cost 3 vext2 <4,4,4,4>, <1,1,1,1> - 2691910602U, // <4,4,1,2>: Cost 3 vext3 <1,2,3,4>, <4,1,2,3> - 2644116440U, // <4,4,1,3>: Cost 3 vext2 <4,4,4,4>, <1,3,1,3> - 2711227356U, // <4,4,1,4>: Cost 3 vext3 <4,4,4,4>, <4,1,4,3> - 2709310438U, // <4,4,1,5>: Cost 3 vext3 <4,1,5,4>, <4,1,5,4> - 3765652462U, // <4,4,1,6>: Cost 4 vext3 <1,2,3,4>, <4,1,6,3> - 3768970231U, // <4,4,1,7>: Cost 4 vext3 <1,7,3,4>, <4,1,7,3> - 2695891968U, // <4,4,1,u>: Cost 3 vext3 <1,u,3,4>, <4,1,u,3> - 3703260634U, // <4,4,2,0>: Cost 4 vext2 <2,0,4,4>, <2,0,4,4> - 3765652499U, // <4,4,2,1>: Cost 4 vext3 <1,2,3,4>, <4,2,1,4> - 2644117096U, // <4,4,2,2>: Cost 3 vext2 <4,4,4,4>, <2,2,2,2> - 2631509709U, // <4,4,2,3>: Cost 3 vext2 <2,3,4,4>, <2,3,4,4> - 2644117269U, // <4,4,2,4>: Cost 3 vext2 <4,4,4,4>, <2,4,3,4> - 3705251698U, // <4,4,2,5>: Cost 4 vext2 <2,3,4,4>, <2,5,4,7> - 2710047808U, // <4,4,2,6>: Cost 3 vext3 <4,2,6,4>, <4,2,6,4> - 3783863369U, // <4,4,2,7>: Cost 4 vext3 <4,2,7,4>, <4,2,7,4> - 2634827874U, // <4,4,2,u>: Cost 3 vext2 <2,u,4,4>, <2,u,4,4> - 2644117654U, // <4,4,3,0>: Cost 3 vext2 <4,4,4,4>, <3,0,1,2> - 3638797210U, // <4,4,3,1>: Cost 4 vext1 <2,4,4,3>, <1,2,3,4> - 3638798082U, // <4,4,3,2>: Cost 4 vext1 <2,4,4,3>, <2,4,1,3> - 2637482406U, // <4,4,3,3>: Cost 3 vext2 
<3,3,4,4>, <3,3,4,4> - 2638146039U, // <4,4,3,4>: Cost 3 vext2 <3,4,4,4>, <3,4,4,4> - 3913287374U, // <4,4,3,5>: Cost 4 vuzpr <3,4,5,4>, <2,3,4,5> - 3765652625U, // <4,4,3,6>: Cost 4 vext3 <1,2,3,4>, <4,3,6,4> - 3713878762U, // <4,4,3,7>: Cost 4 vext2 <3,7,4,4>, <3,7,4,4> - 2637482406U, // <4,4,3,u>: Cost 3 vext2 <3,3,4,4>, <3,3,4,4> - 1503264870U, // <4,4,4,0>: Cost 2 vext1 <4,4,4,4>, LHS - 2577007514U, // <4,4,4,1>: Cost 3 vext1 <4,4,4,4>, <1,2,3,4> - 2577008232U, // <4,4,4,2>: Cost 3 vext1 <4,4,4,4>, <2,2,2,2> - 2571037175U, // <4,4,4,3>: Cost 3 vext1 <3,4,4,4>, <3,4,4,4> - 161926454U, // <4,4,4,4>: Cost 1 vdup0 RHS - 1570377014U, // <4,4,4,5>: Cost 2 vext2 <4,4,4,4>, RHS - 2779680054U, // <4,4,4,6>: Cost 3 vuzpl <4,6,4,6>, RHS - 2594927963U, // <4,4,4,7>: Cost 3 vext1 <7,4,4,4>, <7,4,4,4> - 161926454U, // <4,4,4,u>: Cost 1 vdup0 RHS - 2571042918U, // <4,4,5,0>: Cost 3 vext1 <3,4,4,5>, LHS - 2571043738U, // <4,4,5,1>: Cost 3 vext1 <3,4,4,5>, <1,2,3,4> - 3638814495U, // <4,4,5,2>: Cost 4 vext1 <2,4,4,5>, <2,4,4,5> - 2571045368U, // <4,4,5,3>: Cost 3 vext1 <3,4,4,5>, <3,4,4,5> - 2571046198U, // <4,4,5,4>: Cost 3 vext1 <3,4,4,5>, RHS - 1839648054U, // <4,4,5,5>: Cost 2 vzipl RHS, RHS - 1618169142U, // <4,4,5,6>: Cost 2 vext3 <1,2,3,4>, RHS - 2594936156U, // <4,4,5,7>: Cost 3 vext1 <7,4,4,5>, <7,4,4,5> - 1618169160U, // <4,4,5,u>: Cost 2 vext3 <1,2,3,4>, RHS - 2553135206U, // <4,4,6,0>: Cost 3 vext1 <0,4,4,6>, LHS - 3626877686U, // <4,4,6,1>: Cost 4 vext1 <0,4,4,6>, <1,0,3,2> - 2565080782U, // <4,4,6,2>: Cost 3 vext1 <2,4,4,6>, <2,3,4,5> - 2571053561U, // <4,4,6,3>: Cost 3 vext1 <3,4,4,6>, <3,4,4,6> - 2553138486U, // <4,4,6,4>: Cost 3 vext1 <0,4,4,6>, RHS - 2241555675U, // <4,4,6,5>: Cost 3 vrev <4,4,5,6> - 1973865782U, // <4,4,6,6>: Cost 2 vtrnl RHS, RHS - 2658055029U, // <4,4,6,7>: Cost 3 vext2 <6,7,4,4>, <6,7,4,4> - 1973865800U, // <4,4,6,u>: Cost 2 vtrnl RHS, RHS - 2644120570U, // <4,4,7,0>: Cost 3 vext2 <4,4,4,4>, <7,0,1,2> - 3638829978U, // <4,4,7,1>: Cost 4 vext1 <2,4,4,7>, <1,2,3,4> - 3638830881U, // <4,4,7,2>: Cost 4 vext1 <2,4,4,7>, <2,4,4,7> - 3735115018U, // <4,4,7,3>: Cost 4 vext2 <7,3,4,4>, <7,3,4,4> - 2662036827U, // <4,4,7,4>: Cost 3 vext2 <7,4,4,4>, <7,4,4,4> - 2713292236U, // <4,4,7,5>: Cost 3 vext3 <4,7,5,4>, <4,7,5,4> - 2713365973U, // <4,4,7,6>: Cost 3 vext3 <4,7,6,4>, <4,7,6,4> - 2644121196U, // <4,4,7,7>: Cost 3 vext2 <4,4,4,4>, <7,7,7,7> - 2662036827U, // <4,4,7,u>: Cost 3 vext2 <7,4,4,4>, <7,4,4,4> - 1503297638U, // <4,4,u,0>: Cost 2 vext1 <4,4,4,u>, LHS - 1570379566U, // <4,4,u,1>: Cost 2 vext2 <4,4,4,4>, LHS - 2779682606U, // <4,4,u,2>: Cost 3 vuzpl <4,6,4,6>, LHS - 2571069947U, // <4,4,u,3>: Cost 3 vext1 <3,4,4,u>, <3,4,4,u> - 161926454U, // <4,4,u,4>: Cost 1 vdup0 RHS - 1841638710U, // <4,4,u,5>: Cost 2 vzipl RHS, RHS - 1618169385U, // <4,4,u,6>: Cost 2 vext3 <1,2,3,4>, RHS - 2594960735U, // <4,4,u,7>: Cost 3 vext1 <7,4,4,u>, <7,4,4,u> - 161926454U, // <4,4,u,u>: Cost 1 vdup0 RHS - 2631516160U, // <4,5,0,0>: Cost 3 vext2 <2,3,4,5>, <0,0,0,0> - 1557774438U, // <4,5,0,1>: Cost 2 vext2 <2,3,4,5>, LHS - 2618908875U, // <4,5,0,2>: Cost 3 vext2 <0,2,4,5>, <0,2,4,5> - 2571078140U, // <4,5,0,3>: Cost 3 vext1 <3,4,5,0>, <3,4,5,0> - 2626871634U, // <4,5,0,4>: Cost 3 vext2 <1,5,4,5>, <0,4,1,5> - 3705258414U, // <4,5,0,5>: Cost 4 vext2 <2,3,4,5>, <0,5,2,7> - 2594968438U, // <4,5,0,6>: Cost 3 vext1 <7,4,5,0>, <6,7,4,5> - 2594968928U, // <4,5,0,7>: Cost 3 vext1 <7,4,5,0>, <7,4,5,0> - 1557775005U, // <4,5,0,u>: Cost 2 vext2 <2,3,4,5>, LHS - 2631516918U, // <4,5,1,0>: Cost 3 
vext2 <2,3,4,5>, <1,0,3,2> - 2624217939U, // <4,5,1,1>: Cost 3 vext2 <1,1,4,5>, <1,1,4,5> - 2631517078U, // <4,5,1,2>: Cost 3 vext2 <2,3,4,5>, <1,2,3,0> - 2821341286U, // <4,5,1,3>: Cost 3 vuzpr <0,4,1,5>, LHS - 3895086054U, // <4,5,1,4>: Cost 4 vuzpr <0,4,1,5>, <4,1,5,4> - 2626872471U, // <4,5,1,5>: Cost 3 vext2 <1,5,4,5>, <1,5,4,5> - 3895083131U, // <4,5,1,6>: Cost 4 vuzpr <0,4,1,5>, <0,1,4,6> - 2718748368U, // <4,5,1,7>: Cost 3 vext3 <5,6,7,4>, <5,1,7,3> - 2821341291U, // <4,5,1,u>: Cost 3 vuzpr <0,4,1,5>, LHS - 2571092070U, // <4,5,2,0>: Cost 3 vext1 <3,4,5,2>, LHS - 3699287585U, // <4,5,2,1>: Cost 4 vext2 <1,3,4,5>, <2,1,3,3> - 2630854269U, // <4,5,2,2>: Cost 3 vext2 <2,2,4,5>, <2,2,4,5> - 1557776078U, // <4,5,2,3>: Cost 2 vext2 <2,3,4,5>, <2,3,4,5> - 2631517974U, // <4,5,2,4>: Cost 3 vext2 <2,3,4,5>, <2,4,3,5> - 3692652384U, // <4,5,2,5>: Cost 4 vext2 <0,2,4,5>, <2,5,2,7> - 2631518138U, // <4,5,2,6>: Cost 3 vext2 <2,3,4,5>, <2,6,3,7> - 4164013366U, // <4,5,2,7>: Cost 4 vtrnr <0,4,u,2>, RHS - 1561094243U, // <4,5,2,u>: Cost 2 vext2 <2,u,4,5>, <2,u,4,5> - 2631518358U, // <4,5,3,0>: Cost 3 vext2 <2,3,4,5>, <3,0,1,2> - 3895084710U, // <4,5,3,1>: Cost 4 vuzpr <0,4,1,5>, <2,3,0,1> - 2631518540U, // <4,5,3,2>: Cost 3 vext2 <2,3,4,5>, <3,2,3,4> - 2631518620U, // <4,5,3,3>: Cost 3 vext2 <2,3,4,5>, <3,3,3,3> - 2631518716U, // <4,5,3,4>: Cost 3 vext2 <2,3,4,5>, <3,4,5,0> - 2631518784U, // <4,5,3,5>: Cost 3 vext2 <2,3,4,5>, <3,5,3,5> - 2658060980U, // <4,5,3,6>: Cost 3 vext2 <6,7,4,5>, <3,6,7,4> - 2640145131U, // <4,5,3,7>: Cost 3 vext2 <3,7,4,5>, <3,7,4,5> - 2631519006U, // <4,5,3,u>: Cost 3 vext2 <2,3,4,5>, <3,u,1,2> - 2571108454U, // <4,5,4,0>: Cost 3 vext1 <3,4,5,4>, LHS - 3632907342U, // <4,5,4,1>: Cost 4 vext1 <1,4,5,4>, <1,4,5,4> - 2571110094U, // <4,5,4,2>: Cost 3 vext1 <3,4,5,4>, <2,3,4,5> - 2571110912U, // <4,5,4,3>: Cost 3 vext1 <3,4,5,4>, <3,4,5,4> - 2571111734U, // <4,5,4,4>: Cost 3 vext1 <3,4,5,4>, RHS - 1557777718U, // <4,5,4,5>: Cost 2 vext2 <2,3,4,5>, RHS - 2645454195U, // <4,5,4,6>: Cost 3 vext2 <4,6,4,5>, <4,6,4,5> - 2718748614U, // <4,5,4,7>: Cost 3 vext3 <5,6,7,4>, <5,4,7,6> - 1557777961U, // <4,5,4,u>: Cost 2 vext2 <2,3,4,5>, RHS - 1503346790U, // <4,5,5,0>: Cost 2 vext1 <4,4,5,5>, LHS - 2913398480U, // <4,5,5,1>: Cost 3 vzipl RHS, <5,1,7,3> - 2631519998U, // <4,5,5,2>: Cost 3 vext2 <2,3,4,5>, <5,2,3,4> - 2577090710U, // <4,5,5,3>: Cost 3 vext1 <4,4,5,5>, <3,0,1,2> - 1503349978U, // <4,5,5,4>: Cost 2 vext1 <4,4,5,5>, <4,4,5,5> - 2631520260U, // <4,5,5,5>: Cost 3 vext2 <2,3,4,5>, <5,5,5,5> - 2913390690U, // <4,5,5,6>: Cost 3 vzipl RHS, <5,6,7,0> - 2821344566U, // <4,5,5,7>: Cost 3 vuzpr <0,4,1,5>, RHS - 1503352622U, // <4,5,5,u>: Cost 2 vext1 <4,4,5,5>, LHS - 1497383014U, // <4,5,6,0>: Cost 2 vext1 <3,4,5,6>, LHS - 2559181904U, // <4,5,6,1>: Cost 3 vext1 <1,4,5,6>, <1,4,5,6> - 2565154601U, // <4,5,6,2>: Cost 3 vext1 <2,4,5,6>, <2,4,5,6> - 1497385474U, // <4,5,6,3>: Cost 2 vext1 <3,4,5,6>, <3,4,5,6> - 1497386294U, // <4,5,6,4>: Cost 2 vext1 <3,4,5,6>, RHS - 3047608324U, // <4,5,6,5>: Cost 3 vtrnl RHS, <5,5,5,5> - 2571129656U, // <4,5,6,6>: Cost 3 vext1 <3,4,5,6>, <6,6,6,6> - 27705344U, // <4,5,6,7>: Cost 0 copy RHS - 27705344U, // <4,5,6,u>: Cost 0 copy RHS - 2565161062U, // <4,5,7,0>: Cost 3 vext1 <2,4,5,7>, LHS - 2565161882U, // <4,5,7,1>: Cost 3 vext1 <2,4,5,7>, <1,2,3,4> - 2565162794U, // <4,5,7,2>: Cost 3 vext1 <2,4,5,7>, <2,4,5,7> - 2661381387U, // <4,5,7,3>: Cost 3 vext2 <7,3,4,5>, <7,3,4,5> - 2565164342U, // <4,5,7,4>: Cost 3 vext1 <2,4,5,7>, RHS - 2718748840U, // 
<4,5,7,5>: Cost 3 vext3 <5,6,7,4>, <5,7,5,7> - 2718748846U, // <4,5,7,6>: Cost 3 vext3 <5,6,7,4>, <5,7,6,4> - 2719412407U, // <4,5,7,7>: Cost 3 vext3 <5,7,7,4>, <5,7,7,4> - 2565166894U, // <4,5,7,u>: Cost 3 vext1 <2,4,5,7>, LHS - 1497399398U, // <4,5,u,0>: Cost 2 vext1 <3,4,5,u>, LHS - 1557780270U, // <4,5,u,1>: Cost 2 vext2 <2,3,4,5>, LHS - 2631522181U, // <4,5,u,2>: Cost 3 vext2 <2,3,4,5>, - 1497401860U, // <4,5,u,3>: Cost 2 vext1 <3,4,5,u>, <3,4,5,u> - 1497402678U, // <4,5,u,4>: Cost 2 vext1 <3,4,5,u>, RHS - 1557780634U, // <4,5,u,5>: Cost 2 vext2 <2,3,4,5>, RHS - 2631522512U, // <4,5,u,6>: Cost 3 vext2 <2,3,4,5>, - 27705344U, // <4,5,u,7>: Cost 0 copy RHS - 27705344U, // <4,5,u,u>: Cost 0 copy RHS - 2618916864U, // <4,6,0,0>: Cost 3 vext2 <0,2,4,6>, <0,0,0,0> - 1545175142U, // <4,6,0,1>: Cost 2 vext2 <0,2,4,6>, LHS - 1545175244U, // <4,6,0,2>: Cost 2 vext2 <0,2,4,6>, <0,2,4,6> - 3692658940U, // <4,6,0,3>: Cost 4 vext2 <0,2,4,6>, <0,3,1,0> - 2618917202U, // <4,6,0,4>: Cost 3 vext2 <0,2,4,6>, <0,4,1,5> - 3852910806U, // <4,6,0,5>: Cost 4 vuzpl RHS, <0,2,5,7> - 2253525648U, // <4,6,0,6>: Cost 3 vrev <6,4,6,0> - 4040764726U, // <4,6,0,7>: Cost 4 vzipr <2,3,4,0>, RHS - 1545175709U, // <4,6,0,u>: Cost 2 vext2 <0,2,4,6>, LHS - 2618917622U, // <4,6,1,0>: Cost 3 vext2 <0,2,4,6>, <1,0,3,2> - 2618917684U, // <4,6,1,1>: Cost 3 vext2 <0,2,4,6>, <1,1,1,1> - 2618917782U, // <4,6,1,2>: Cost 3 vext2 <0,2,4,6>, <1,2,3,0> - 2618917848U, // <4,6,1,3>: Cost 3 vext2 <0,2,4,6>, <1,3,1,3> - 3692659773U, // <4,6,1,4>: Cost 4 vext2 <0,2,4,6>, <1,4,3,5> - 2618918032U, // <4,6,1,5>: Cost 3 vext2 <0,2,4,6>, <1,5,3,7> - 3692659937U, // <4,6,1,6>: Cost 4 vext2 <0,2,4,6>, <1,6,3,7> - 4032146742U, // <4,6,1,7>: Cost 4 vzipr <0,u,4,1>, RHS - 2618918253U, // <4,6,1,u>: Cost 3 vext2 <0,2,4,6>, <1,u,1,3> - 2618918380U, // <4,6,2,0>: Cost 3 vext2 <0,2,4,6>, <2,0,6,4> - 2618918460U, // <4,6,2,1>: Cost 3 vext2 <0,2,4,6>, <2,1,6,3> - 2618918504U, // <4,6,2,2>: Cost 3 vext2 <0,2,4,6>, <2,2,2,2> - 2618918566U, // <4,6,2,3>: Cost 3 vext2 <0,2,4,6>, <2,3,0,1> - 2618918679U, // <4,6,2,4>: Cost 3 vext2 <0,2,4,6>, <2,4,3,6> - 2618918788U, // <4,6,2,5>: Cost 3 vext2 <0,2,4,6>, <2,5,6,7> - 2618918842U, // <4,6,2,6>: Cost 3 vext2 <0,2,4,6>, <2,6,3,7> - 2718749178U, // <4,6,2,7>: Cost 3 vext3 <5,6,7,4>, <6,2,7,3> - 2618918971U, // <4,6,2,u>: Cost 3 vext2 <0,2,4,6>, <2,u,0,1> - 2618919062U, // <4,6,3,0>: Cost 3 vext2 <0,2,4,6>, <3,0,1,2> - 2636171526U, // <4,6,3,1>: Cost 3 vext2 <3,1,4,6>, <3,1,4,6> - 3692661057U, // <4,6,3,2>: Cost 4 vext2 <0,2,4,6>, <3,2,2,2> - 2618919324U, // <4,6,3,3>: Cost 3 vext2 <0,2,4,6>, <3,3,3,3> - 2618919426U, // <4,6,3,4>: Cost 3 vext2 <0,2,4,6>, <3,4,5,6> - 2638826058U, // <4,6,3,5>: Cost 3 vext2 <3,5,4,6>, <3,5,4,6> - 3913303030U, // <4,6,3,6>: Cost 4 vuzpr <3,4,5,6>, <1,3,4,6> - 2722730572U, // <4,6,3,7>: Cost 3 vext3 <6,3,7,4>, <6,3,7,4> - 2618919710U, // <4,6,3,u>: Cost 3 vext2 <0,2,4,6>, <3,u,1,2> - 2565210214U, // <4,6,4,0>: Cost 3 vext1 <2,4,6,4>, LHS - 2718749286U, // <4,6,4,1>: Cost 3 vext3 <5,6,7,4>, <6,4,1,3> - 2565211952U, // <4,6,4,2>: Cost 3 vext1 <2,4,6,4>, <2,4,6,4> - 2571184649U, // <4,6,4,3>: Cost 3 vext1 <3,4,6,4>, <3,4,6,4> - 2565213494U, // <4,6,4,4>: Cost 3 vext1 <2,4,6,4>, RHS - 1545178422U, // <4,6,4,5>: Cost 2 vext2 <0,2,4,6>, RHS - 1705430326U, // <4,6,4,6>: Cost 2 vuzpl RHS, RHS - 2595075437U, // <4,6,4,7>: Cost 3 vext1 <7,4,6,4>, <7,4,6,4> - 1545178665U, // <4,6,4,u>: Cost 2 vext2 <0,2,4,6>, RHS - 2565218406U, // <4,6,5,0>: Cost 3 vext1 <2,4,6,5>, LHS - 2645462736U, // <4,6,5,1>: 
Cost 3 vext2 <4,6,4,6>, <5,1,7,3> - 2913399290U, // <4,6,5,2>: Cost 3 vzipl RHS, <6,2,7,3> - 3913305394U, // <4,6,5,3>: Cost 4 vuzpr <3,4,5,6>, <4,5,6,3> - 2645462982U, // <4,6,5,4>: Cost 3 vext2 <4,6,4,6>, <5,4,7,6> - 2779172868U, // <4,6,5,5>: Cost 3 vuzpl RHS, <5,5,5,5> - 2913391416U, // <4,6,5,6>: Cost 3 vzipl RHS, <6,6,6,6> - 2821426486U, // <4,6,5,7>: Cost 3 vuzpr <0,4,2,6>, RHS - 2821426487U, // <4,6,5,u>: Cost 3 vuzpr <0,4,2,6>, RHS - 1503428710U, // <4,6,6,0>: Cost 2 vext1 <4,4,6,6>, LHS - 2577171190U, // <4,6,6,1>: Cost 3 vext1 <4,4,6,6>, <1,0,3,2> - 2645463546U, // <4,6,6,2>: Cost 3 vext2 <4,6,4,6>, <6,2,7,3> - 2577172630U, // <4,6,6,3>: Cost 3 vext1 <4,4,6,6>, <3,0,1,2> - 1503431908U, // <4,6,6,4>: Cost 2 vext1 <4,4,6,6>, <4,4,6,6> - 2253501069U, // <4,6,6,5>: Cost 3 vrev <6,4,5,6> - 2618921784U, // <4,6,6,6>: Cost 3 vext2 <0,2,4,6>, <6,6,6,6> - 2954464566U, // <4,6,6,7>: Cost 3 vzipr <0,2,4,6>, RHS - 1503434542U, // <4,6,6,u>: Cost 2 vext1 <4,4,6,6>, LHS - 2645464058U, // <4,6,7,0>: Cost 3 vext2 <4,6,4,6>, <7,0,1,2> - 2779173882U, // <4,6,7,1>: Cost 3 vuzpl RHS, <7,0,1,2> - 3638978355U, // <4,6,7,2>: Cost 4 vext1 <2,4,6,7>, <2,4,6,7> - 2725090156U, // <4,6,7,3>: Cost 3 vext3 <6,7,3,4>, <6,7,3,4> - 2645464422U, // <4,6,7,4>: Cost 3 vext2 <4,6,4,6>, <7,4,5,6> - 2779174246U, // <4,6,7,5>: Cost 3 vuzpl RHS, <7,4,5,6> - 3852915914U, // <4,6,7,6>: Cost 4 vuzpl RHS, <7,2,6,3> - 2779174508U, // <4,6,7,7>: Cost 3 vuzpl RHS, <7,7,7,7> - 2779173945U, // <4,6,7,u>: Cost 3 vuzpl RHS, <7,0,u,2> - 1503445094U, // <4,6,u,0>: Cost 2 vext1 <4,4,6,u>, LHS - 1545180974U, // <4,6,u,1>: Cost 2 vext2 <0,2,4,6>, LHS - 1705432878U, // <4,6,u,2>: Cost 2 vuzpl RHS, LHS - 2618922940U, // <4,6,u,3>: Cost 3 vext2 <0,2,4,6>, - 1503448294U, // <4,6,u,4>: Cost 2 vext1 <4,4,6,u>, <4,4,6,u> - 1545181338U, // <4,6,u,5>: Cost 2 vext2 <0,2,4,6>, RHS - 1705433242U, // <4,6,u,6>: Cost 2 vuzpl RHS, RHS - 2954480950U, // <4,6,u,7>: Cost 3 vzipr <0,2,4,u>, RHS - 1545181541U, // <4,6,u,u>: Cost 2 vext2 <0,2,4,6>, LHS - 3706601472U, // <4,7,0,0>: Cost 4 vext2 <2,5,4,7>, <0,0,0,0> - 2632859750U, // <4,7,0,1>: Cost 3 vext2 <2,5,4,7>, LHS - 2726343685U, // <4,7,0,2>: Cost 3 vext3 <7,0,2,4>, <7,0,2,4> - 3701293312U, // <4,7,0,3>: Cost 4 vext2 <1,6,4,7>, <0,3,1,4> - 3706601810U, // <4,7,0,4>: Cost 4 vext2 <2,5,4,7>, <0,4,1,5> - 2259424608U, // <4,7,0,5>: Cost 3 vrev <7,4,5,0> - 3695321617U, // <4,7,0,6>: Cost 4 vext2 <0,6,4,7>, <0,6,4,7> - 3800454194U, // <4,7,0,7>: Cost 4 vext3 <7,0,7,4>, <7,0,7,4> - 2632860317U, // <4,7,0,u>: Cost 3 vext2 <2,5,4,7>, LHS - 2259064116U, // <4,7,1,0>: Cost 3 vrev <7,4,0,1> - 3700630324U, // <4,7,1,1>: Cost 4 vext2 <1,5,4,7>, <1,1,1,1> - 2632860570U, // <4,7,1,2>: Cost 3 vext2 <2,5,4,7>, <1,2,3,4> - 3769635936U, // <4,7,1,3>: Cost 4 vext3 <1,u,3,4>, <7,1,3,5> - 3656920374U, // <4,7,1,4>: Cost 4 vext1 <5,4,7,1>, RHS - 3700630681U, // <4,7,1,5>: Cost 4 vext2 <1,5,4,7>, <1,5,4,7> - 3701294314U, // <4,7,1,6>: Cost 4 vext2 <1,6,4,7>, <1,6,4,7> - 3793818754U, // <4,7,1,7>: Cost 4 vext3 <5,u,7,4>, <7,1,7,3> - 2259654012U, // <4,7,1,u>: Cost 3 vrev <7,4,u,1> - 3656925286U, // <4,7,2,0>: Cost 4 vext1 <5,4,7,2>, LHS - 3706603050U, // <4,7,2,1>: Cost 4 vext2 <2,5,4,7>, <2,1,4,3> - 3706603112U, // <4,7,2,2>: Cost 4 vext2 <2,5,4,7>, <2,2,2,2> - 2727744688U, // <4,7,2,3>: Cost 3 vext3 <7,2,3,4>, <7,2,3,4> - 3705939745U, // <4,7,2,4>: Cost 4 vext2 <2,4,4,7>, <2,4,4,7> - 2632861554U, // <4,7,2,5>: Cost 3 vext2 <2,5,4,7>, <2,5,4,7> - 3706603450U, // <4,7,2,6>: Cost 4 vext2 <2,5,4,7>, <2,6,3,7> - 3792491731U, // 
<4,7,2,7>: Cost 4 vext3 <5,6,7,4>, <7,2,7,3> - 2634852453U, // <4,7,2,u>: Cost 3 vext2 <2,u,4,7>, <2,u,4,7> - 3706603670U, // <4,7,3,0>: Cost 4 vext2 <2,5,4,7>, <3,0,1,2> - 3662906266U, // <4,7,3,1>: Cost 4 vext1 <6,4,7,3>, <1,2,3,4> - 3725183326U, // <4,7,3,2>: Cost 4 vext2 <5,6,4,7>, <3,2,5,4> - 3706603932U, // <4,7,3,3>: Cost 4 vext2 <2,5,4,7>, <3,3,3,3> - 3701295618U, // <4,7,3,4>: Cost 4 vext2 <1,6,4,7>, <3,4,5,6> - 2638834251U, // <4,7,3,5>: Cost 3 vext2 <3,5,4,7>, <3,5,4,7> - 2639497884U, // <4,7,3,6>: Cost 3 vext2 <3,6,4,7>, <3,6,4,7> - 3802445093U, // <4,7,3,7>: Cost 4 vext3 <7,3,7,4>, <7,3,7,4> - 2640825150U, // <4,7,3,u>: Cost 3 vext2 <3,u,4,7>, <3,u,4,7> - 2718750004U, // <4,7,4,0>: Cost 3 vext3 <5,6,7,4>, <7,4,0,1> - 3706604490U, // <4,7,4,1>: Cost 4 vext2 <2,5,4,7>, <4,1,2,3> - 3656943474U, // <4,7,4,2>: Cost 4 vext1 <5,4,7,4>, <2,5,4,7> - 3779884371U, // <4,7,4,3>: Cost 4 vext3 <3,5,7,4>, <7,4,3,5> - 2259383643U, // <4,7,4,4>: Cost 3 vrev <7,4,4,4> - 2632863030U, // <4,7,4,5>: Cost 3 vext2 <2,5,4,7>, RHS - 2259531117U, // <4,7,4,6>: Cost 3 vrev <7,4,6,4> - 3907340074U, // <4,7,4,7>: Cost 4 vuzpr <2,4,5,7>, <2,4,5,7> - 2632863273U, // <4,7,4,u>: Cost 3 vext2 <2,5,4,7>, RHS - 2913391610U, // <4,7,5,0>: Cost 3 vzipl RHS, <7,0,1,2> - 3645006848U, // <4,7,5,1>: Cost 4 vext1 <3,4,7,5>, <1,3,5,7> - 2589181646U, // <4,7,5,2>: Cost 3 vext1 <6,4,7,5>, <2,3,4,5> - 3645008403U, // <4,7,5,3>: Cost 4 vext1 <3,4,7,5>, <3,4,7,5> - 2913391974U, // <4,7,5,4>: Cost 3 vzipl RHS, <7,4,5,6> - 2583211973U, // <4,7,5,5>: Cost 3 vext1 <5,4,7,5>, <5,4,7,5> - 2589184670U, // <4,7,5,6>: Cost 3 vext1 <6,4,7,5>, <6,4,7,5> - 2913392236U, // <4,7,5,7>: Cost 3 vzipl RHS, <7,7,7,7> - 2913392258U, // <4,7,5,u>: Cost 3 vzipl RHS, <7,u,1,2> - 1509474406U, // <4,7,6,0>: Cost 2 vext1 <5,4,7,6>, LHS - 3047609338U, // <4,7,6,1>: Cost 3 vtrnl RHS, <7,0,1,2> - 2583217768U, // <4,7,6,2>: Cost 3 vext1 <5,4,7,6>, <2,2,2,2> - 2583218326U, // <4,7,6,3>: Cost 3 vext1 <5,4,7,6>, <3,0,1,2> - 1509477686U, // <4,7,6,4>: Cost 2 vext1 <5,4,7,6>, RHS - 1509478342U, // <4,7,6,5>: Cost 2 vext1 <5,4,7,6>, <5,4,7,6> - 2583220730U, // <4,7,6,6>: Cost 3 vext1 <5,4,7,6>, <6,2,7,3> - 3047609964U, // <4,7,6,7>: Cost 3 vtrnl RHS, <7,7,7,7> - 1509480238U, // <4,7,6,u>: Cost 2 vext1 <5,4,7,6>, LHS - 3650994278U, // <4,7,7,0>: Cost 4 vext1 <4,4,7,7>, LHS - 3650995098U, // <4,7,7,1>: Cost 4 vext1 <4,4,7,7>, <1,2,3,4> - 3650996010U, // <4,7,7,2>: Cost 4 vext1 <4,4,7,7>, <2,4,5,7> - 3804804677U, // <4,7,7,3>: Cost 4 vext3 <7,7,3,4>, <7,7,3,4> - 3650997486U, // <4,7,7,4>: Cost 4 vext1 <4,4,7,7>, <4,4,7,7> - 2662725039U, // <4,7,7,5>: Cost 3 vext2 <7,5,4,7>, <7,5,4,7> - 3662942880U, // <4,7,7,6>: Cost 4 vext1 <6,4,7,7>, <6,4,7,7> - 2718750316U, // <4,7,7,7>: Cost 3 vext3 <5,6,7,4>, <7,7,7,7> - 2664715938U, // <4,7,7,u>: Cost 3 vext2 <7,u,4,7>, <7,u,4,7> - 1509490790U, // <4,7,u,0>: Cost 2 vext1 <5,4,7,u>, LHS - 2632865582U, // <4,7,u,1>: Cost 3 vext2 <2,5,4,7>, LHS - 2583234152U, // <4,7,u,2>: Cost 3 vext1 <5,4,7,u>, <2,2,2,2> - 2583234710U, // <4,7,u,3>: Cost 3 vext1 <5,4,7,u>, <3,0,1,2> - 1509494070U, // <4,7,u,4>: Cost 2 vext1 <5,4,7,u>, RHS - 1509494728U, // <4,7,u,5>: Cost 2 vext1 <5,4,7,u>, <5,4,7,u> - 2583237114U, // <4,7,u,6>: Cost 3 vext1 <5,4,7,u>, <6,2,7,3> - 3047757420U, // <4,7,u,7>: Cost 3 vtrnl RHS, <7,7,7,7> - 1509496622U, // <4,7,u,u>: Cost 2 vext1 <5,4,7,u>, LHS - 2618933248U, // <4,u,0,0>: Cost 3 vext2 <0,2,4,u>, <0,0,0,0> - 1545191526U, // <4,u,0,1>: Cost 2 vext2 <0,2,4,u>, LHS - 1545191630U, // <4,u,0,2>: Cost 2 vext2 
<0,2,4,u>, <0,2,4,u> - 2691913445U, // <4,u,0,3>: Cost 3 vext3 <1,2,3,4>, - 2618933586U, // <4,u,0,4>: Cost 3 vext2 <0,2,4,u>, <0,4,1,5> - 2265397305U, // <4,u,0,5>: Cost 3 vrev - 2595189625U, // <4,u,0,6>: Cost 3 vext1 <7,4,u,0>, <6,7,4,u> - 2595190139U, // <4,u,0,7>: Cost 3 vext1 <7,4,u,0>, <7,4,u,0> - 1545192093U, // <4,u,0,u>: Cost 2 vext2 <0,2,4,u>, LHS - 2618934006U, // <4,u,1,0>: Cost 3 vext2 <0,2,4,u>, <1,0,3,2> - 2618934068U, // <4,u,1,1>: Cost 3 vext2 <0,2,4,u>, <1,1,1,1> - 1618171694U, // <4,u,1,2>: Cost 2 vext3 <1,2,3,4>, LHS - 2618934232U, // <4,u,1,3>: Cost 3 vext2 <0,2,4,u>, <1,3,1,3> - 2695894848U, // <4,u,1,4>: Cost 3 vext3 <1,u,3,4>, - 2618934416U, // <4,u,1,5>: Cost 3 vext2 <0,2,4,u>, <1,5,3,7> - 3692676321U, // <4,u,1,6>: Cost 4 vext2 <0,2,4,u>, <1,6,3,7> - 2718750555U, // <4,u,1,7>: Cost 3 vext3 <5,6,7,4>, - 1618171748U, // <4,u,1,u>: Cost 2 vext3 <1,2,3,4>, LHS - 2553397350U, // <4,u,2,0>: Cost 3 vext1 <0,4,u,2>, LHS - 2630215215U, // <4,u,2,1>: Cost 3 vext2 <2,1,4,u>, <2,1,4,u> - 2618934888U, // <4,u,2,2>: Cost 3 vext2 <0,2,4,u>, <2,2,2,2> - 1557800657U, // <4,u,2,3>: Cost 2 vext2 <2,3,4,u>, <2,3,4,u> - 2618935065U, // <4,u,2,4>: Cost 3 vext2 <0,2,4,u>, <2,4,3,u> - 2733864859U, // <4,u,2,5>: Cost 3 vext3 , - 2618935226U, // <4,u,2,6>: Cost 3 vext2 <0,2,4,u>, <2,6,3,7> - 2718750636U, // <4,u,2,7>: Cost 3 vext3 <5,6,7,4>, - 1561118822U, // <4,u,2,u>: Cost 2 vext2 <2,u,4,u>, <2,u,4,u> - 2618935446U, // <4,u,3,0>: Cost 3 vext2 <0,2,4,u>, <3,0,1,2> - 2779318422U, // <4,u,3,1>: Cost 3 vuzpl RHS, <3,0,1,2> - 2636851545U, // <4,u,3,2>: Cost 3 vext2 <3,2,4,u>, <3,2,4,u> - 2618935708U, // <4,u,3,3>: Cost 3 vext2 <0,2,4,u>, <3,3,3,3> - 2618935810U, // <4,u,3,4>: Cost 3 vext2 <0,2,4,u>, <3,4,5,6> - 2691913711U, // <4,u,3,5>: Cost 3 vext3 <1,2,3,4>, - 2588725862U, // <4,u,3,6>: Cost 3 vext1 <6,4,1,3>, <6,4,1,3> - 2640169710U, // <4,u,3,7>: Cost 3 vext2 <3,7,4,u>, <3,7,4,u> - 2618936094U, // <4,u,3,u>: Cost 3 vext2 <0,2,4,u>, <3,u,1,2> - 1503559782U, // <4,u,4,0>: Cost 2 vext1 <4,4,u,4>, LHS - 2692282391U, // <4,u,4,1>: Cost 3 vext3 <1,2,u,4>, - 2565359426U, // <4,u,4,2>: Cost 3 vext1 <2,4,u,4>, <2,4,u,4> - 2571332123U, // <4,u,4,3>: Cost 3 vext1 <3,4,u,4>, <3,4,u,4> - 161926454U, // <4,u,4,4>: Cost 1 vdup0 RHS - 1545194806U, // <4,u,4,5>: Cost 2 vext2 <0,2,4,u>, RHS - 1705577782U, // <4,u,4,6>: Cost 2 vuzpl RHS, RHS - 2718750801U, // <4,u,4,7>: Cost 3 vext3 <5,6,7,4>, - 161926454U, // <4,u,4,u>: Cost 1 vdup0 RHS - 1479164006U, // <4,u,5,0>: Cost 2 vext1 <0,4,1,5>, LHS - 1839650606U, // <4,u,5,1>: Cost 2 vzipl RHS, LHS - 2565367502U, // <4,u,5,2>: Cost 3 vext1 <2,4,u,5>, <2,3,4,5> - 3089777309U, // <4,u,5,3>: Cost 3 vtrnr <0,4,1,5>, LHS - 1479167286U, // <4,u,5,4>: Cost 2 vext1 <0,4,1,5>, RHS - 1839650970U, // <4,u,5,5>: Cost 2 vzipl RHS, RHS - 1618172058U, // <4,u,5,6>: Cost 2 vext3 <1,2,3,4>, RHS - 3089780265U, // <4,u,5,7>: Cost 3 vtrnr <0,4,1,5>, RHS - 1618172076U, // <4,u,5,u>: Cost 2 vext3 <1,2,3,4>, RHS - 1479688294U, // <4,u,6,0>: Cost 2 vext1 <0,4,u,6>, LHS - 2553430774U, // <4,u,6,1>: Cost 3 vext1 <0,4,u,6>, <1,0,3,2> - 1973868334U, // <4,u,6,2>: Cost 2 vtrnl RHS, LHS - 1497606685U, // <4,u,6,3>: Cost 2 vext1 <3,4,u,6>, <3,4,u,6> - 1479691574U, // <4,u,6,4>: Cost 2 vext1 <0,4,u,6>, RHS - 1509552079U, // <4,u,6,5>: Cost 2 vext1 <5,4,u,6>, <5,4,u,6> - 1973868698U, // <4,u,6,6>: Cost 2 vtrnl RHS, RHS - 27705344U, // <4,u,6,7>: Cost 0 copy RHS - 27705344U, // <4,u,6,u>: Cost 0 copy RHS - 2565382246U, // <4,u,7,0>: Cost 3 vext1 <2,4,u,7>, LHS - 2565383066U, // <4,u,7,1>: Cost 
3 vext1 <2,4,u,7>, <1,2,3,4> - 2565384005U, // <4,u,7,2>: Cost 3 vext1 <2,4,u,7>, <2,4,u,7> - 2661405966U, // <4,u,7,3>: Cost 3 vext2 <7,3,4,u>, <7,3,4,u> - 2565385526U, // <4,u,7,4>: Cost 3 vext1 <2,4,u,7>, RHS - 2779321702U, // <4,u,7,5>: Cost 3 vuzpl RHS, <7,4,5,6> - 2589274793U, // <4,u,7,6>: Cost 3 vext1 <6,4,u,7>, <6,4,u,7> - 2779321964U, // <4,u,7,7>: Cost 3 vuzpl RHS, <7,7,7,7> - 2565388078U, // <4,u,7,u>: Cost 3 vext1 <2,4,u,7>, LHS - 1479704678U, // <4,u,u,0>: Cost 2 vext1 <0,4,u,u>, LHS - 1545197358U, // <4,u,u,1>: Cost 2 vext2 <0,2,4,u>, LHS - 1618172261U, // <4,u,u,2>: Cost 2 vext3 <1,2,3,4>, LHS - 1497623071U, // <4,u,u,3>: Cost 2 vext1 <3,4,u,u>, <3,4,u,u> - 161926454U, // <4,u,u,4>: Cost 1 vdup0 RHS - 1545197722U, // <4,u,u,5>: Cost 2 vext2 <0,2,4,u>, RHS - 1618172301U, // <4,u,u,6>: Cost 2 vext3 <1,2,3,4>, RHS - 27705344U, // <4,u,u,7>: Cost 0 copy RHS - 27705344U, // <4,u,u,u>: Cost 0 copy RHS - 2687123456U, // <5,0,0,0>: Cost 3 vext3 <0,4,1,5>, <0,0,0,0> - 2687123466U, // <5,0,0,1>: Cost 3 vext3 <0,4,1,5>, <0,0,1,1> - 2687123476U, // <5,0,0,2>: Cost 3 vext3 <0,4,1,5>, <0,0,2,2> - 3710599434U, // <5,0,0,3>: Cost 4 vext2 <3,2,5,0>, <0,3,2,5> - 2642166098U, // <5,0,0,4>: Cost 3 vext2 <4,1,5,0>, <0,4,1,5> - 3657060306U, // <5,0,0,5>: Cost 4 vext1 <5,5,0,0>, <5,5,0,0> - 3292094923U, // <5,0,0,6>: Cost 4 vrev <0,5,6,0> - 3669005700U, // <5,0,0,7>: Cost 4 vext1 <7,5,0,0>, <7,5,0,0> - 2687123530U, // <5,0,0,u>: Cost 3 vext3 <0,4,1,5>, <0,0,u,2> - 2559434854U, // <5,0,1,0>: Cost 3 vext1 <1,5,0,1>, LHS - 2559435887U, // <5,0,1,1>: Cost 3 vext1 <1,5,0,1>, <1,5,0,1> - 1613381734U, // <5,0,1,2>: Cost 2 vext3 <0,4,1,5>, LHS - 3698656256U, // <5,0,1,3>: Cost 4 vext2 <1,2,5,0>, <1,3,5,7> - 2559438134U, // <5,0,1,4>: Cost 3 vext1 <1,5,0,1>, RHS - 2583326675U, // <5,0,1,5>: Cost 3 vext1 <5,5,0,1>, <5,5,0,1> - 3715908851U, // <5,0,1,6>: Cost 4 vext2 <4,1,5,0>, <1,6,5,7> - 3657069562U, // <5,0,1,7>: Cost 4 vext1 <5,5,0,1>, <7,0,1,2> - 1613381788U, // <5,0,1,u>: Cost 2 vext3 <0,4,1,5>, LHS - 2686017700U, // <5,0,2,0>: Cost 3 vext3 <0,2,4,5>, <0,2,0,2> - 2685796528U, // <5,0,2,1>: Cost 3 vext3 <0,2,1,5>, <0,2,1,5> - 2698625208U, // <5,0,2,2>: Cost 3 vext3 <2,3,4,5>, <0,2,2,4> - 2685944002U, // <5,0,2,3>: Cost 3 vext3 <0,2,3,5>, <0,2,3,5> - 2686017739U, // <5,0,2,4>: Cost 3 vext3 <0,2,4,5>, <0,2,4,5> - 2686091476U, // <5,0,2,5>: Cost 3 vext3 <0,2,5,5>, <0,2,5,5> - 2725167324U, // <5,0,2,6>: Cost 3 vext3 <6,7,4,5>, <0,2,6,4> - 2595280230U, // <5,0,2,7>: Cost 3 vext1 <7,5,0,2>, <7,4,5,6> - 2686312687U, // <5,0,2,u>: Cost 3 vext3 <0,2,u,5>, <0,2,u,5> - 3760128248U, // <5,0,3,0>: Cost 4 vext3 <0,3,0,5>, <0,3,0,5> - 3759685888U, // <5,0,3,1>: Cost 4 vext3 <0,2,3,5>, <0,3,1,4> - 2686533898U, // <5,0,3,2>: Cost 3 vext3 <0,3,2,5>, <0,3,2,5> - 3760349459U, // <5,0,3,3>: Cost 4 vext3 <0,3,3,5>, <0,3,3,5> - 2638187004U, // <5,0,3,4>: Cost 3 vext2 <3,4,5,0>, <3,4,5,0> - 3776348452U, // <5,0,3,5>: Cost 4 vext3 <3,0,4,5>, <0,3,5,4> - 3713256094U, // <5,0,3,6>: Cost 4 vext2 <3,6,5,0>, <3,6,5,0> - 3914064896U, // <5,0,3,7>: Cost 4 vuzpr <3,5,7,0>, <1,3,5,7> - 2686976320U, // <5,0,3,u>: Cost 3 vext3 <0,3,u,5>, <0,3,u,5> - 2559459430U, // <5,0,4,0>: Cost 3 vext1 <1,5,0,4>, LHS - 1613381970U, // <5,0,4,1>: Cost 2 vext3 <0,4,1,5>, <0,4,1,5> - 2687123804U, // <5,0,4,2>: Cost 3 vext3 <0,4,1,5>, <0,4,2,6> - 3761013092U, // <5,0,4,3>: Cost 4 vext3 <0,4,3,5>, <0,4,3,5> - 2559462710U, // <5,0,4,4>: Cost 3 vext1 <1,5,0,4>, RHS - 2638187830U, // <5,0,4,5>: Cost 3 vext2 <3,4,5,0>, RHS - 3761234303U, // <5,0,4,6>: Cost 4 
vext3 <0,4,6,5>, <0,4,6,5> - 2646150600U, // <5,0,4,7>: Cost 3 vext2 <4,7,5,0>, <4,7,5,0> - 1613381970U, // <5,0,4,u>: Cost 2 vext3 <0,4,1,5>, <0,4,1,5> - 3766763926U, // <5,0,5,0>: Cost 4 vext3 <1,4,0,5>, <0,5,0,1> - 2919268454U, // <5,0,5,1>: Cost 3 vzipl <5,5,5,5>, LHS - 3053486182U, // <5,0,5,2>: Cost 3 vtrnl <5,5,5,5>, LHS - 3723210589U, // <5,0,5,3>: Cost 4 vext2 <5,3,5,0>, <5,3,5,0> - 3766763966U, // <5,0,5,4>: Cost 4 vext3 <1,4,0,5>, <0,5,4,5> - 2650796031U, // <5,0,5,5>: Cost 3 vext2 <5,5,5,0>, <5,5,5,0> - 3719893090U, // <5,0,5,6>: Cost 4 vext2 <4,7,5,0>, <5,6,7,0> - 3914067254U, // <5,0,5,7>: Cost 4 vuzpr <3,5,7,0>, RHS - 2919269021U, // <5,0,5,u>: Cost 3 vzipl <5,5,5,5>, LHS - 4047519744U, // <5,0,6,0>: Cost 4 vzipr <3,4,5,6>, <0,0,0,0> - 2920038502U, // <5,0,6,1>: Cost 3 vzipl <5,6,7,0>, LHS - 3759759871U, // <5,0,6,2>: Cost 4 vext3 <0,2,4,5>, <0,6,2,7> - 3645164070U, // <5,0,6,3>: Cost 4 vext1 <3,5,0,6>, <3,5,0,6> - 3762414095U, // <5,0,6,4>: Cost 4 vext3 <0,6,4,5>, <0,6,4,5> - 3993780690U, // <5,0,6,5>: Cost 4 vzipl <5,6,7,0>, <0,5,6,7> - 3719893816U, // <5,0,6,6>: Cost 4 vext2 <4,7,5,0>, <6,6,6,6> - 2662077302U, // <5,0,6,7>: Cost 3 vext2 <7,4,5,0>, <6,7,4,5> - 2920039069U, // <5,0,6,u>: Cost 3 vzipl <5,6,7,0>, LHS - 2565455974U, // <5,0,7,0>: Cost 3 vext1 <2,5,0,7>, LHS - 2565456790U, // <5,0,7,1>: Cost 3 vext1 <2,5,0,7>, <1,2,3,0> - 2565457742U, // <5,0,7,2>: Cost 3 vext1 <2,5,0,7>, <2,5,0,7> - 3639199894U, // <5,0,7,3>: Cost 4 vext1 <2,5,0,7>, <3,0,1,2> - 2565459254U, // <5,0,7,4>: Cost 3 vext1 <2,5,0,7>, RHS - 2589347938U, // <5,0,7,5>: Cost 3 vext1 <6,5,0,7>, <5,6,7,0> - 2589348530U, // <5,0,7,6>: Cost 3 vext1 <6,5,0,7>, <6,5,0,7> - 4188456422U, // <5,0,7,7>: Cost 4 vtrnr RHS, <2,0,5,7> - 2565461806U, // <5,0,7,u>: Cost 3 vext1 <2,5,0,7>, LHS - 2687124106U, // <5,0,u,0>: Cost 3 vext3 <0,4,1,5>, <0,u,0,2> - 1616036502U, // <5,0,u,1>: Cost 2 vext3 <0,u,1,5>, <0,u,1,5> - 1613382301U, // <5,0,u,2>: Cost 2 vext3 <0,4,1,5>, LHS - 2689925800U, // <5,0,u,3>: Cost 3 vext3 <0,u,3,5>, <0,u,3,5> - 2687124146U, // <5,0,u,4>: Cost 3 vext3 <0,4,1,5>, <0,u,4,6> - 2638190746U, // <5,0,u,5>: Cost 3 vext2 <3,4,5,0>, RHS - 2589356723U, // <5,0,u,6>: Cost 3 vext1 <6,5,0,u>, <6,5,0,u> - 2595280230U, // <5,0,u,7>: Cost 3 vext1 <7,5,0,2>, <7,4,5,6> - 1613382355U, // <5,0,u,u>: Cost 2 vext3 <0,4,1,5>, LHS - 2646818816U, // <5,1,0,0>: Cost 3 vext2 <4,u,5,1>, <0,0,0,0> - 1573077094U, // <5,1,0,1>: Cost 2 vext2 <4,u,5,1>, LHS - 2646818980U, // <5,1,0,2>: Cost 3 vext2 <4,u,5,1>, <0,2,0,2> - 2687124214U, // <5,1,0,3>: Cost 3 vext3 <0,4,1,5>, <1,0,3,2> - 2641510738U, // <5,1,0,4>: Cost 3 vext2 <4,0,5,1>, <0,4,1,5> - 2641510814U, // <5,1,0,5>: Cost 3 vext2 <4,0,5,1>, <0,5,1,0> - 3720561142U, // <5,1,0,6>: Cost 4 vext2 <4,u,5,1>, <0,6,1,7> - 3298141357U, // <5,1,0,7>: Cost 4 vrev <1,5,7,0> - 1573077661U, // <5,1,0,u>: Cost 2 vext2 <4,u,5,1>, LHS - 2223891567U, // <5,1,1,0>: Cost 3 vrev <1,5,0,1> - 2687124276U, // <5,1,1,1>: Cost 3 vext3 <0,4,1,5>, <1,1,1,1> - 2646819734U, // <5,1,1,2>: Cost 3 vext2 <4,u,5,1>, <1,2,3,0> - 2687124296U, // <5,1,1,3>: Cost 3 vext3 <0,4,1,5>, <1,1,3,3> - 2691326803U, // <5,1,1,4>: Cost 3 vext3 <1,1,4,5>, <1,1,4,5> - 2691400540U, // <5,1,1,5>: Cost 3 vext3 <1,1,5,5>, <1,1,5,5> - 3765216101U, // <5,1,1,6>: Cost 4 vext3 <1,1,6,5>, <1,1,6,5> - 3765289838U, // <5,1,1,7>: Cost 4 vext3 <1,1,7,5>, <1,1,7,5> - 2687124341U, // <5,1,1,u>: Cost 3 vext3 <0,4,1,5>, <1,1,u,3> - 3297641584U, // <5,1,2,0>: Cost 4 vrev <1,5,0,2> - 3763520391U, // <5,1,2,1>: Cost 4 vext3 <0,u,1,5>, <1,2,1,3> 
- 2646820456U, // <5,1,2,2>: Cost 3 vext2 <4,u,5,1>, <2,2,2,2> - 2687124374U, // <5,1,2,3>: Cost 3 vext3 <0,4,1,5>, <1,2,3,0> - 2691990436U, // <5,1,2,4>: Cost 3 vext3 <1,2,4,5>, <1,2,4,5> - 2687124395U, // <5,1,2,5>: Cost 3 vext3 <0,4,1,5>, <1,2,5,3> - 2646820794U, // <5,1,2,6>: Cost 3 vext2 <4,u,5,1>, <2,6,3,7> - 3808199610U, // <5,1,2,7>: Cost 4 vext3 , <1,2,7,0> - 2687124419U, // <5,1,2,u>: Cost 3 vext3 <0,4,1,5>, <1,2,u,0> - 2577440870U, // <5,1,3,0>: Cost 3 vext1 <4,5,1,3>, LHS - 2687124440U, // <5,1,3,1>: Cost 3 vext3 <0,4,1,5>, <1,3,1,3> - 3759686627U, // <5,1,3,2>: Cost 4 vext3 <0,2,3,5>, <1,3,2,5> - 2692580332U, // <5,1,3,3>: Cost 3 vext3 <1,3,3,5>, <1,3,3,5> - 2687124469U, // <5,1,3,4>: Cost 3 vext3 <0,4,1,5>, <1,3,4,5> - 2685207552U, // <5,1,3,5>: Cost 3 vext3 <0,1,2,5>, <1,3,5,7> - 3760866313U, // <5,1,3,6>: Cost 4 vext3 <0,4,1,5>, <1,3,6,7> - 2692875280U, // <5,1,3,7>: Cost 3 vext3 <1,3,7,5>, <1,3,7,5> - 2687124503U, // <5,1,3,u>: Cost 3 vext3 <0,4,1,5>, <1,3,u,3> - 1567771538U, // <5,1,4,0>: Cost 2 vext2 <4,0,5,1>, <4,0,5,1> - 2693096491U, // <5,1,4,1>: Cost 3 vext3 <1,4,1,5>, <1,4,1,5> - 2693170228U, // <5,1,4,2>: Cost 3 vext3 <1,4,2,5>, <1,4,2,5> - 2687124541U, // <5,1,4,3>: Cost 3 vext3 <0,4,1,5>, <1,4,3,5> - 2646822096U, // <5,1,4,4>: Cost 3 vext2 <4,u,5,1>, <4,4,4,4> - 1573080374U, // <5,1,4,5>: Cost 2 vext2 <4,u,5,1>, RHS - 2646822260U, // <5,1,4,6>: Cost 3 vext2 <4,u,5,1>, <4,6,4,6> - 3298174129U, // <5,1,4,7>: Cost 4 vrev <1,5,7,4> - 1573080602U, // <5,1,4,u>: Cost 2 vext2 <4,u,5,1>, <4,u,5,1> - 2687124591U, // <5,1,5,0>: Cost 3 vext3 <0,4,1,5>, <1,5,0,1> - 2646822543U, // <5,1,5,1>: Cost 3 vext2 <4,u,5,1>, <5,1,0,1> - 3760866433U, // <5,1,5,2>: Cost 4 vext3 <0,4,1,5>, <1,5,2,1> - 2687124624U, // <5,1,5,3>: Cost 3 vext3 <0,4,1,5>, <1,5,3,7> - 2687124631U, // <5,1,5,4>: Cost 3 vext3 <0,4,1,5>, <1,5,4,5> - 2646822916U, // <5,1,5,5>: Cost 3 vext2 <4,u,5,1>, <5,5,5,5> - 2646823010U, // <5,1,5,6>: Cost 3 vext2 <4,u,5,1>, <5,6,7,0> - 2646823080U, // <5,1,5,7>: Cost 3 vext2 <4,u,5,1>, <5,7,5,7> - 2687124663U, // <5,1,5,u>: Cost 3 vext3 <0,4,1,5>, <1,5,u,1> - 2553577574U, // <5,1,6,0>: Cost 3 vext1 <0,5,1,6>, LHS - 3763520719U, // <5,1,6,1>: Cost 4 vext3 <0,u,1,5>, <1,6,1,7> - 2646823418U, // <5,1,6,2>: Cost 3 vext2 <4,u,5,1>, <6,2,7,3> - 3760866529U, // <5,1,6,3>: Cost 4 vext3 <0,4,1,5>, <1,6,3,7> - 2553580854U, // <5,1,6,4>: Cost 3 vext1 <0,5,1,6>, RHS - 2687124723U, // <5,1,6,5>: Cost 3 vext3 <0,4,1,5>, <1,6,5,7> - 2646823736U, // <5,1,6,6>: Cost 3 vext2 <4,u,5,1>, <6,6,6,6> - 2646823758U, // <5,1,6,7>: Cost 3 vext2 <4,u,5,1>, <6,7,0,1> - 2646823839U, // <5,1,6,u>: Cost 3 vext2 <4,u,5,1>, <6,u,0,1> - 2559557734U, // <5,1,7,0>: Cost 3 vext1 <1,5,1,7>, LHS - 2559558452U, // <5,1,7,1>: Cost 3 vext1 <1,5,1,7>, <1,1,1,1> - 2571503270U, // <5,1,7,2>: Cost 3 vext1 <3,5,1,7>, <2,3,0,1> - 2040971366U, // <5,1,7,3>: Cost 2 vtrnr RHS, LHS - 2559561014U, // <5,1,7,4>: Cost 3 vext1 <1,5,1,7>, RHS - 2595393232U, // <5,1,7,5>: Cost 3 vext1 <7,5,1,7>, <5,1,7,3> - 4188455035U, // <5,1,7,6>: Cost 4 vtrnr RHS, <0,1,4,6> - 2646824556U, // <5,1,7,7>: Cost 3 vext2 <4,u,5,1>, <7,7,7,7> - 2040971371U, // <5,1,7,u>: Cost 2 vtrnr RHS, LHS - 1591662326U, // <5,1,u,0>: Cost 2 vext2 , - 1573082926U, // <5,1,u,1>: Cost 2 vext2 <4,u,5,1>, LHS - 2695824760U, // <5,1,u,2>: Cost 3 vext3 <1,u,2,5>, <1,u,2,5> - 2040979558U, // <5,1,u,3>: Cost 2 vtrnr RHS, LHS - 2687124874U, // <5,1,u,4>: Cost 3 vext3 <0,4,1,5>, <1,u,4,5> - 1573083290U, // <5,1,u,5>: Cost 2 vext2 <4,u,5,1>, RHS - 2646825168U, // <5,1,u,6>: 
Cost 3 vext2 <4,u,5,1>, - 2646825216U, // <5,1,u,7>: Cost 3 vext2 <4,u,5,1>, - 2040979563U, // <5,1,u,u>: Cost 2 vtrnr RHS, LHS - 3702652928U, // <5,2,0,0>: Cost 4 vext2 <1,u,5,2>, <0,0,0,0> - 2628911206U, // <5,2,0,1>: Cost 3 vext2 <1,u,5,2>, LHS - 2641518756U, // <5,2,0,2>: Cost 3 vext2 <4,0,5,2>, <0,2,0,2> - 3759760847U, // <5,2,0,3>: Cost 4 vext3 <0,2,4,5>, <2,0,3,2> - 3760866775U, // <5,2,0,4>: Cost 4 vext3 <0,4,1,5>, <2,0,4,1> - 3759539680U, // <5,2,0,5>: Cost 4 vext3 <0,2,1,5>, <2,0,5,1> - 3760866796U, // <5,2,0,6>: Cost 4 vext3 <0,4,1,5>, <2,0,6,4> - 3304114054U, // <5,2,0,7>: Cost 4 vrev <2,5,7,0> - 2628911773U, // <5,2,0,u>: Cost 3 vext2 <1,u,5,2>, LHS - 2623603464U, // <5,2,1,0>: Cost 3 vext2 <1,0,5,2>, <1,0,5,2> - 3698008921U, // <5,2,1,1>: Cost 4 vext2 <1,1,5,2>, <1,1,5,2> - 3633325603U, // <5,2,1,2>: Cost 4 vext1 <1,5,2,1>, <2,1,3,5> - 2687125027U, // <5,2,1,3>: Cost 3 vext3 <0,4,1,5>, <2,1,3,5> - 3633327414U, // <5,2,1,4>: Cost 4 vext1 <1,5,2,1>, RHS - 3759539760U, // <5,2,1,5>: Cost 4 vext3 <0,2,1,5>, <2,1,5,0> - 3760866876U, // <5,2,1,6>: Cost 4 vext3 <0,4,1,5>, <2,1,6,3> - 3304122247U, // <5,2,1,7>: Cost 4 vrev <2,5,7,1> - 2687125072U, // <5,2,1,u>: Cost 3 vext3 <0,4,1,5>, <2,1,u,5> - 3633332326U, // <5,2,2,0>: Cost 4 vext1 <1,5,2,2>, LHS - 3759760992U, // <5,2,2,1>: Cost 4 vext3 <0,2,4,5>, <2,2,1,3> - 2687125096U, // <5,2,2,2>: Cost 3 vext3 <0,4,1,5>, <2,2,2,2> - 2687125106U, // <5,2,2,3>: Cost 3 vext3 <0,4,1,5>, <2,2,3,3> - 2697963133U, // <5,2,2,4>: Cost 3 vext3 <2,2,4,5>, <2,2,4,5> - 3759466120U, // <5,2,2,5>: Cost 4 vext3 <0,2,0,5>, <2,2,5,7> - 3760866960U, // <5,2,2,6>: Cost 4 vext3 <0,4,1,5>, <2,2,6,6> - 3771926168U, // <5,2,2,7>: Cost 4 vext3 <2,2,7,5>, <2,2,7,5> - 2687125151U, // <5,2,2,u>: Cost 3 vext3 <0,4,1,5>, <2,2,u,3> - 2687125158U, // <5,2,3,0>: Cost 3 vext3 <0,4,1,5>, <2,3,0,1> - 2698405555U, // <5,2,3,1>: Cost 3 vext3 <2,3,1,5>, <2,3,1,5> - 2577516238U, // <5,2,3,2>: Cost 3 vext1 <4,5,2,3>, <2,3,4,5> - 3759687365U, // <5,2,3,3>: Cost 4 vext3 <0,2,3,5>, <2,3,3,5> - 1624884942U, // <5,2,3,4>: Cost 2 vext3 <2,3,4,5>, <2,3,4,5> - 2698700503U, // <5,2,3,5>: Cost 3 vext3 <2,3,5,5>, <2,3,5,5> - 3772368608U, // <5,2,3,6>: Cost 4 vext3 <2,3,4,5>, <2,3,6,5> - 3702655716U, // <5,2,3,7>: Cost 4 vext2 <1,u,5,2>, <3,7,3,7> - 1625179890U, // <5,2,3,u>: Cost 2 vext3 <2,3,u,5>, <2,3,u,5> - 2641521555U, // <5,2,4,0>: Cost 3 vext2 <4,0,5,2>, <4,0,5,2> - 3772368642U, // <5,2,4,1>: Cost 4 vext3 <2,3,4,5>, <2,4,1,3> - 2699142925U, // <5,2,4,2>: Cost 3 vext3 <2,4,2,5>, <2,4,2,5> - 2698626838U, // <5,2,4,3>: Cost 3 vext3 <2,3,4,5>, <2,4,3,5> - 2698626848U, // <5,2,4,4>: Cost 3 vext3 <2,3,4,5>, <2,4,4,6> - 2628914486U, // <5,2,4,5>: Cost 3 vext2 <1,u,5,2>, RHS - 2645503353U, // <5,2,4,6>: Cost 3 vext2 <4,6,5,2>, <4,6,5,2> - 3304146826U, // <5,2,4,7>: Cost 4 vrev <2,5,7,4> - 2628914729U, // <5,2,4,u>: Cost 3 vext2 <1,u,5,2>, RHS - 2553643110U, // <5,2,5,0>: Cost 3 vext1 <0,5,2,5>, LHS - 3758950227U, // <5,2,5,1>: Cost 4 vext3 <0,1,2,5>, <2,5,1,3> - 3759761248U, // <5,2,5,2>: Cost 4 vext3 <0,2,4,5>, <2,5,2,7> - 2982396006U, // <5,2,5,3>: Cost 3 vzipr <4,u,5,5>, LHS - 2553646390U, // <5,2,5,4>: Cost 3 vext1 <0,5,2,5>, RHS - 2553647108U, // <5,2,5,5>: Cost 3 vext1 <0,5,2,5>, <5,5,5,5> - 3760867204U, // <5,2,5,6>: Cost 4 vext3 <0,4,1,5>, <2,5,6,7> - 3702657141U, // <5,2,5,7>: Cost 4 vext2 <1,u,5,2>, <5,7,0,1> - 2982396011U, // <5,2,5,u>: Cost 3 vzipr <4,u,5,5>, LHS - 3627393126U, // <5,2,6,0>: Cost 4 vext1 <0,5,2,6>, LHS - 3760867236U, // <5,2,6,1>: Cost 4 vext3 <0,4,1,5>, <2,6,1,3> 
- 2645504506U, // <5,2,6,2>: Cost 3 vext2 <4,6,5,2>, <6,2,7,3> - 2687125434U, // <5,2,6,3>: Cost 3 vext3 <0,4,1,5>, <2,6,3,7> - 2700617665U, // <5,2,6,4>: Cost 3 vext3 <2,6,4,5>, <2,6,4,5> - 3760867276U, // <5,2,6,5>: Cost 4 vext3 <0,4,1,5>, <2,6,5,7> - 3763521493U, // <5,2,6,6>: Cost 4 vext3 <0,u,1,5>, <2,6,6,7> - 3719246670U, // <5,2,6,7>: Cost 4 vext2 <4,6,5,2>, <6,7,0,1> - 2687125479U, // <5,2,6,u>: Cost 3 vext3 <0,4,1,5>, <2,6,u,7> - 2565603430U, // <5,2,7,0>: Cost 3 vext1 <2,5,2,7>, LHS - 2553660150U, // <5,2,7,1>: Cost 3 vext1 <0,5,2,7>, <1,0,3,2> - 2565605216U, // <5,2,7,2>: Cost 3 vext1 <2,5,2,7>, <2,5,2,7> - 2961178726U, // <5,2,7,3>: Cost 3 vzipr <1,3,5,7>, LHS - 2565606710U, // <5,2,7,4>: Cost 3 vext1 <2,5,2,7>, RHS - 4034920552U, // <5,2,7,5>: Cost 4 vzipr <1,3,5,7>, <0,1,2,5> - 3114713292U, // <5,2,7,6>: Cost 3 vtrnr RHS, <0,2,4,6> - 3702658668U, // <5,2,7,7>: Cost 4 vext2 <1,u,5,2>, <7,7,7,7> - 2961178731U, // <5,2,7,u>: Cost 3 vzipr <1,3,5,7>, LHS - 2687125563U, // <5,2,u,0>: Cost 3 vext3 <0,4,1,5>, <2,u,0,1> - 2628917038U, // <5,2,u,1>: Cost 3 vext2 <1,u,5,2>, LHS - 2565613409U, // <5,2,u,2>: Cost 3 vext1 <2,5,2,u>, <2,5,2,u> - 2687125592U, // <5,2,u,3>: Cost 3 vext3 <0,4,1,5>, <2,u,3,3> - 1628203107U, // <5,2,u,4>: Cost 2 vext3 <2,u,4,5>, <2,u,4,5> - 2628917402U, // <5,2,u,5>: Cost 3 vext2 <1,u,5,2>, RHS - 2702092405U, // <5,2,u,6>: Cost 3 vext3 <2,u,6,5>, <2,u,6,5> - 3304179598U, // <5,2,u,7>: Cost 4 vrev <2,5,7,u> - 1628498055U, // <5,2,u,u>: Cost 2 vext3 <2,u,u,5>, <2,u,u,5> - 3760867467U, // <5,3,0,0>: Cost 4 vext3 <0,4,1,5>, <3,0,0,0> - 2687125654U, // <5,3,0,1>: Cost 3 vext3 <0,4,1,5>, <3,0,1,2> - 3759761565U, // <5,3,0,2>: Cost 4 vext3 <0,2,4,5>, <3,0,2,0> - 3633391766U, // <5,3,0,3>: Cost 4 vext1 <1,5,3,0>, <3,0,1,2> - 2687125680U, // <5,3,0,4>: Cost 3 vext3 <0,4,1,5>, <3,0,4,1> - 3760277690U, // <5,3,0,5>: Cost 4 vext3 <0,3,2,5>, <3,0,5,2> - 3310013014U, // <5,3,0,6>: Cost 4 vrev <3,5,6,0> - 2236344927U, // <5,3,0,7>: Cost 3 vrev <3,5,7,0> - 2687125717U, // <5,3,0,u>: Cost 3 vext3 <0,4,1,5>, <3,0,u,2> - 3760867551U, // <5,3,1,0>: Cost 4 vext3 <0,4,1,5>, <3,1,0,3> - 3760867558U, // <5,3,1,1>: Cost 4 vext3 <0,4,1,5>, <3,1,1,1> - 2624938923U, // <5,3,1,2>: Cost 3 vext2 <1,2,5,3>, <1,2,5,3> - 2703198460U, // <5,3,1,3>: Cost 3 vext3 <3,1,3,5>, <3,1,3,5> - 3760867587U, // <5,3,1,4>: Cost 4 vext3 <0,4,1,5>, <3,1,4,3> - 2636219536U, // <5,3,1,5>: Cost 3 vext2 <3,1,5,3>, <1,5,3,7> - 3698681075U, // <5,3,1,6>: Cost 4 vext2 <1,2,5,3>, <1,6,5,7> - 2703493408U, // <5,3,1,7>: Cost 3 vext3 <3,1,7,5>, <3,1,7,5> - 2628920721U, // <5,3,1,u>: Cost 3 vext2 <1,u,5,3>, <1,u,5,3> - 3766765870U, // <5,3,2,0>: Cost 4 vext3 <1,4,0,5>, <3,2,0,1> - 3698681379U, // <5,3,2,1>: Cost 4 vext2 <1,2,5,3>, <2,1,3,5> - 3760867649U, // <5,3,2,2>: Cost 4 vext3 <0,4,1,5>, <3,2,2,2> - 2698627404U, // <5,3,2,3>: Cost 3 vext3 <2,3,4,5>, <3,2,3,4> - 2703935830U, // <5,3,2,4>: Cost 3 vext3 <3,2,4,5>, <3,2,4,5> - 2698627422U, // <5,3,2,5>: Cost 3 vext3 <2,3,4,5>, <3,2,5,4> - 3760867686U, // <5,3,2,6>: Cost 4 vext3 <0,4,1,5>, <3,2,6,3> - 3769788783U, // <5,3,2,7>: Cost 4 vext3 <1,u,5,5>, <3,2,7,3> - 2701945209U, // <5,3,2,u>: Cost 3 vext3 <2,u,4,5>, <3,2,u,4> - 3760867711U, // <5,3,3,0>: Cost 4 vext3 <0,4,1,5>, <3,3,0,1> - 2636220684U, // <5,3,3,1>: Cost 3 vext2 <3,1,5,3>, <3,1,5,3> - 3772369298U, // <5,3,3,2>: Cost 4 vext3 <2,3,4,5>, <3,3,2,2> - 2687125916U, // <5,3,3,3>: Cost 3 vext3 <0,4,1,5>, <3,3,3,3> - 2704599463U, // <5,3,3,4>: Cost 3 vext3 <3,3,4,5>, <3,3,4,5> - 2704673200U, // <5,3,3,5>: Cost 3 vext3 
<3,3,5,5>, <3,3,5,5> - 3709962935U, // <5,3,3,6>: Cost 4 vext2 <3,1,5,3>, <3,6,7,7> - 3772369346U, // <5,3,3,7>: Cost 4 vext3 <2,3,4,5>, <3,3,7,5> - 2704894411U, // <5,3,3,u>: Cost 3 vext3 <3,3,u,5>, <3,3,u,5> - 2704968148U, // <5,3,4,0>: Cost 3 vext3 <3,4,0,5>, <3,4,0,5> - 3698682850U, // <5,3,4,1>: Cost 4 vext2 <1,2,5,3>, <4,1,5,0> - 2642857014U, // <5,3,4,2>: Cost 3 vext2 <4,2,5,3>, <4,2,5,3> - 2705189359U, // <5,3,4,3>: Cost 3 vext3 <3,4,3,5>, <3,4,3,5> - 2705263096U, // <5,3,4,4>: Cost 3 vext3 <3,4,4,5>, <3,4,4,5> - 2685946370U, // <5,3,4,5>: Cost 3 vext3 <0,2,3,5>, <3,4,5,6> - 3779152394U, // <5,3,4,6>: Cost 4 vext3 <3,4,6,5>, <3,4,6,5> - 2236377699U, // <5,3,4,7>: Cost 3 vrev <3,5,7,4> - 2687126045U, // <5,3,4,u>: Cost 3 vext3 <0,4,1,5>, <3,4,u,6> - 2571632742U, // <5,3,5,0>: Cost 3 vext1 <3,5,3,5>, LHS - 2559689870U, // <5,3,5,1>: Cost 3 vext1 <1,5,3,5>, <1,5,3,5> - 2571634382U, // <5,3,5,2>: Cost 3 vext1 <3,5,3,5>, <2,3,4,5> - 2571635264U, // <5,3,5,3>: Cost 3 vext1 <3,5,3,5>, <3,5,3,5> - 2571636022U, // <5,3,5,4>: Cost 3 vext1 <3,5,3,5>, RHS - 2559692804U, // <5,3,5,5>: Cost 3 vext1 <1,5,3,5>, <5,5,5,5> - 3720581218U, // <5,3,5,6>: Cost 4 vext2 <4,u,5,3>, <5,6,7,0> - 2236385892U, // <5,3,5,7>: Cost 3 vrev <3,5,7,5> - 2571638574U, // <5,3,5,u>: Cost 3 vext1 <3,5,3,5>, LHS - 2565668966U, // <5,3,6,0>: Cost 3 vext1 <2,5,3,6>, LHS - 3633439887U, // <5,3,6,1>: Cost 4 vext1 <1,5,3,6>, <1,5,3,6> - 2565670760U, // <5,3,6,2>: Cost 3 vext1 <2,5,3,6>, <2,5,3,6> - 2565671426U, // <5,3,6,3>: Cost 3 vext1 <2,5,3,6>, <3,4,5,6> - 2565672246U, // <5,3,6,4>: Cost 3 vext1 <2,5,3,6>, RHS - 3639414630U, // <5,3,6,5>: Cost 4 vext1 <2,5,3,6>, <5,3,6,0> - 4047521640U, // <5,3,6,6>: Cost 4 vzipr <3,4,5,6>, <2,5,3,6> - 2725169844U, // <5,3,6,7>: Cost 3 vext3 <6,7,4,5>, <3,6,7,4> - 2565674798U, // <5,3,6,u>: Cost 3 vext1 <2,5,3,6>, LHS - 1485963366U, // <5,3,7,0>: Cost 2 vext1 <1,5,3,7>, LHS - 1485964432U, // <5,3,7,1>: Cost 2 vext1 <1,5,3,7>, <1,5,3,7> - 2559706728U, // <5,3,7,2>: Cost 3 vext1 <1,5,3,7>, <2,2,2,2> - 2559707286U, // <5,3,7,3>: Cost 3 vext1 <1,5,3,7>, <3,0,1,2> - 1485966646U, // <5,3,7,4>: Cost 2 vext1 <1,5,3,7>, RHS - 2559708880U, // <5,3,7,5>: Cost 3 vext1 <1,5,3,7>, <5,1,7,3> - 2601513466U, // <5,3,7,6>: Cost 3 vext1 , <6,2,7,3> - 3114714112U, // <5,3,7,7>: Cost 3 vtrnr RHS, <1,3,5,7> - 1485969198U, // <5,3,7,u>: Cost 2 vext1 <1,5,3,7>, LHS - 1485971558U, // <5,3,u,0>: Cost 2 vext1 <1,5,3,u>, LHS - 1485972625U, // <5,3,u,1>: Cost 2 vext1 <1,5,3,u>, <1,5,3,u> - 2559714920U, // <5,3,u,2>: Cost 3 vext1 <1,5,3,u>, <2,2,2,2> - 2559715478U, // <5,3,u,3>: Cost 3 vext1 <1,5,3,u>, <3,0,1,2> - 1485974838U, // <5,3,u,4>: Cost 2 vext1 <1,5,3,u>, RHS - 2687126342U, // <5,3,u,5>: Cost 3 vext3 <0,4,1,5>, <3,u,5,6> - 2601521658U, // <5,3,u,6>: Cost 3 vext1 , <6,2,7,3> - 2236410471U, // <5,3,u,7>: Cost 3 vrev <3,5,7,u> - 1485977390U, // <5,3,u,u>: Cost 2 vext1 <1,5,3,u>, LHS - 3627491430U, // <5,4,0,0>: Cost 4 vext1 <0,5,4,0>, LHS - 2636890214U, // <5,4,0,1>: Cost 3 vext2 <3,2,5,4>, LHS - 3703333028U, // <5,4,0,2>: Cost 4 vext2 <2,0,5,4>, <0,2,0,2> - 3782249348U, // <5,4,0,3>: Cost 4 vext3 <4,0,3,5>, <4,0,3,5> - 2642198866U, // <5,4,0,4>: Cost 3 vext2 <4,1,5,4>, <0,4,1,5> - 2687126418U, // <5,4,0,5>: Cost 3 vext3 <0,4,1,5>, <4,0,5,1> - 2242243887U, // <5,4,0,6>: Cost 3 vrev <4,5,6,0> - 3316059448U, // <5,4,0,7>: Cost 4 vrev <4,5,7,0> - 2636890781U, // <5,4,0,u>: Cost 3 vext2 <3,2,5,4>, LHS - 2241809658U, // <5,4,1,0>: Cost 3 vrev <4,5,0,1> - 3698025307U, // <5,4,1,1>: Cost 4 vext2 <1,1,5,4>, <1,1,5,4> - 
3698688940U, // <5,4,1,2>: Cost 4 vext2 <1,2,5,4>, <1,2,5,4> - 3698689024U, // <5,4,1,3>: Cost 4 vext2 <1,2,5,4>, <1,3,5,7> - 3700016206U, // <5,4,1,4>: Cost 4 vext2 <1,4,5,4>, <1,4,5,4> - 2687126498U, // <5,4,1,5>: Cost 3 vext3 <0,4,1,5>, <4,1,5,0> - 3760868336U, // <5,4,1,6>: Cost 4 vext3 <0,4,1,5>, <4,1,6,5> - 3316067641U, // <5,4,1,7>: Cost 4 vrev <4,5,7,1> - 2242399554U, // <5,4,1,u>: Cost 3 vrev <4,5,u,1> - 3703334371U, // <5,4,2,0>: Cost 4 vext2 <2,0,5,4>, <2,0,5,4> - 3703998004U, // <5,4,2,1>: Cost 4 vext2 <2,1,5,4>, <2,1,5,4> - 3704661637U, // <5,4,2,2>: Cost 4 vext2 <2,2,5,4>, <2,2,5,4> - 2636891854U, // <5,4,2,3>: Cost 3 vext2 <3,2,5,4>, <2,3,4,5> - 3705988903U, // <5,4,2,4>: Cost 4 vext2 <2,4,5,4>, <2,4,5,4> - 2698628150U, // <5,4,2,5>: Cost 3 vext3 <2,3,4,5>, <4,2,5,3> - 3760868415U, // <5,4,2,6>: Cost 4 vext3 <0,4,1,5>, <4,2,6,3> - 3783871562U, // <5,4,2,7>: Cost 4 vext3 <4,2,7,5>, <4,2,7,5> - 2666752099U, // <5,4,2,u>: Cost 3 vext2 , <2,u,4,5> - 3639459942U, // <5,4,3,0>: Cost 4 vext1 <2,5,4,3>, LHS - 3709970701U, // <5,4,3,1>: Cost 4 vext2 <3,1,5,4>, <3,1,5,4> - 2636892510U, // <5,4,3,2>: Cost 3 vext2 <3,2,5,4>, <3,2,5,4> - 3710634396U, // <5,4,3,3>: Cost 4 vext2 <3,2,5,4>, <3,3,3,3> - 2638219776U, // <5,4,3,4>: Cost 3 vext2 <3,4,5,4>, <3,4,5,4> - 3766987908U, // <5,4,3,5>: Cost 4 vext3 <1,4,3,5>, <4,3,5,0> - 2710719634U, // <5,4,3,6>: Cost 3 vext3 <4,3,6,5>, <4,3,6,5> - 3914097664U, // <5,4,3,7>: Cost 4 vuzpr <3,5,7,4>, <1,3,5,7> - 2640874308U, // <5,4,3,u>: Cost 3 vext2 <3,u,5,4>, <3,u,5,4> - 2583642214U, // <5,4,4,0>: Cost 3 vext1 <5,5,4,4>, LHS - 2642201574U, // <5,4,4,1>: Cost 3 vext2 <4,1,5,4>, <4,1,5,4> - 3710635062U, // <5,4,4,2>: Cost 4 vext2 <3,2,5,4>, <4,2,5,3> - 3717270664U, // <5,4,4,3>: Cost 4 vext2 <4,3,5,4>, <4,3,5,4> - 2713963728U, // <5,4,4,4>: Cost 3 vext3 <4,u,5,5>, <4,4,4,4> - 1637567706U, // <5,4,4,5>: Cost 2 vext3 <4,4,5,5>, <4,4,5,5> - 2242276659U, // <5,4,4,6>: Cost 3 vrev <4,5,6,4> - 2646183372U, // <5,4,4,7>: Cost 3 vext2 <4,7,5,4>, <4,7,5,4> - 1637788917U, // <5,4,4,u>: Cost 2 vext3 <4,4,u,5>, <4,4,u,5> - 2559762534U, // <5,4,5,0>: Cost 3 vext1 <1,5,4,5>, LHS - 2559763607U, // <5,4,5,1>: Cost 3 vext1 <1,5,4,5>, <1,5,4,5> - 2698628366U, // <5,4,5,2>: Cost 3 vext3 <2,3,4,5>, <4,5,2,3> - 3633506454U, // <5,4,5,3>: Cost 4 vext1 <1,5,4,5>, <3,0,1,2> - 2559765814U, // <5,4,5,4>: Cost 3 vext1 <1,5,4,5>, RHS - 2583654395U, // <5,4,5,5>: Cost 3 vext1 <5,5,4,5>, <5,5,4,5> - 1613385014U, // <5,4,5,6>: Cost 2 vext3 <0,4,1,5>, RHS - 3901639990U, // <5,4,5,7>: Cost 4 vuzpr <1,5,0,4>, RHS - 1613385032U, // <5,4,5,u>: Cost 2 vext3 <0,4,1,5>, RHS - 2559770726U, // <5,4,6,0>: Cost 3 vext1 <1,5,4,6>, LHS - 2559771648U, // <5,4,6,1>: Cost 3 vext1 <1,5,4,6>, <1,3,5,7> - 3633514088U, // <5,4,6,2>: Cost 4 vext1 <1,5,4,6>, <2,2,2,2> - 2571717122U, // <5,4,6,3>: Cost 3 vext1 <3,5,4,6>, <3,4,5,6> - 2559774006U, // <5,4,6,4>: Cost 3 vext1 <1,5,4,6>, RHS - 2712636796U, // <5,4,6,5>: Cost 3 vext3 <4,6,5,5>, <4,6,5,5> - 3760868743U, // <5,4,6,6>: Cost 4 vext3 <0,4,1,5>, <4,6,6,7> - 2712784270U, // <5,4,6,7>: Cost 3 vext3 <4,6,7,5>, <4,6,7,5> - 2559776558U, // <5,4,6,u>: Cost 3 vext1 <1,5,4,6>, LHS - 2565750886U, // <5,4,7,0>: Cost 3 vext1 <2,5,4,7>, LHS - 2565751706U, // <5,4,7,1>: Cost 3 vext1 <2,5,4,7>, <1,2,3,4> - 2565752690U, // <5,4,7,2>: Cost 3 vext1 <2,5,4,7>, <2,5,4,7> - 2571725387U, // <5,4,7,3>: Cost 3 vext1 <3,5,4,7>, <3,5,4,7> - 2565754166U, // <5,4,7,4>: Cost 3 vext1 <2,5,4,7>, RHS - 3114713426U, // <5,4,7,5>: Cost 3 vtrnr RHS, <0,4,1,5> - 94817590U, // 
<5,4,7,6>: Cost 1 vrev RHS - 2595616175U, // <5,4,7,7>: Cost 3 vext1 <7,5,4,7>, <7,5,4,7> - 94965064U, // <5,4,7,u>: Cost 1 vrev RHS - 2559787110U, // <5,4,u,0>: Cost 3 vext1 <1,5,4,u>, LHS - 2559788186U, // <5,4,u,1>: Cost 3 vext1 <1,5,4,u>, <1,5,4,u> - 2242014483U, // <5,4,u,2>: Cost 3 vrev <4,5,2,u> - 2667419628U, // <5,4,u,3>: Cost 3 vext2 , - 2559790390U, // <5,4,u,4>: Cost 3 vext1 <1,5,4,u>, RHS - 1640222238U, // <5,4,u,5>: Cost 2 vext3 <4,u,5,5>, <4,u,5,5> - 94825783U, // <5,4,u,6>: Cost 1 vrev RHS - 2714111536U, // <5,4,u,7>: Cost 3 vext3 <4,u,7,5>, <4,u,7,5> - 94973257U, // <5,4,u,u>: Cost 1 vrev RHS - 2646851584U, // <5,5,0,0>: Cost 3 vext2 <4,u,5,5>, <0,0,0,0> - 1573109862U, // <5,5,0,1>: Cost 2 vext2 <4,u,5,5>, LHS - 2646851748U, // <5,5,0,2>: Cost 3 vext2 <4,u,5,5>, <0,2,0,2> - 3760279130U, // <5,5,0,3>: Cost 4 vext3 <0,3,2,5>, <5,0,3,2> - 2687127138U, // <5,5,0,4>: Cost 3 vext3 <0,4,1,5>, <5,0,4,1> - 2248142847U, // <5,5,0,5>: Cost 3 vrev <5,5,5,0> - 3720593910U, // <5,5,0,6>: Cost 4 vext2 <4,u,5,5>, <0,6,1,7> - 4182502710U, // <5,5,0,7>: Cost 4 vtrnr <3,5,7,0>, RHS - 1573110429U, // <5,5,0,u>: Cost 2 vext2 <4,u,5,5>, LHS - 2646852342U, // <5,5,1,0>: Cost 3 vext2 <4,u,5,5>, <1,0,3,2> - 2624291676U, // <5,5,1,1>: Cost 3 vext2 <1,1,5,5>, <1,1,5,5> - 2646852502U, // <5,5,1,2>: Cost 3 vext2 <4,u,5,5>, <1,2,3,0> - 2646852568U, // <5,5,1,3>: Cost 3 vext2 <4,u,5,5>, <1,3,1,3> - 2715217591U, // <5,5,1,4>: Cost 3 vext3 <5,1,4,5>, <5,1,4,5> - 2628936848U, // <5,5,1,5>: Cost 3 vext2 <1,u,5,5>, <1,5,3,7> - 3698033907U, // <5,5,1,6>: Cost 4 vext2 <1,1,5,5>, <1,6,5,7> - 2713964240U, // <5,5,1,7>: Cost 3 vext3 <4,u,5,5>, <5,1,7,3> - 2628937107U, // <5,5,1,u>: Cost 3 vext2 <1,u,5,5>, <1,u,5,5> - 3645497446U, // <5,5,2,0>: Cost 4 vext1 <3,5,5,2>, LHS - 3760869099U, // <5,5,2,1>: Cost 4 vext3 <0,4,1,5>, <5,2,1,3> - 2646853224U, // <5,5,2,2>: Cost 3 vext2 <4,u,5,5>, <2,2,2,2> - 2698628862U, // <5,5,2,3>: Cost 3 vext3 <2,3,4,5>, <5,2,3,4> - 3772370694U, // <5,5,2,4>: Cost 4 vext3 <2,3,4,5>, <5,2,4,3> - 2713964303U, // <5,5,2,5>: Cost 3 vext3 <4,u,5,5>, <5,2,5,3> - 2646853562U, // <5,5,2,6>: Cost 3 vext2 <4,u,5,5>, <2,6,3,7> - 4038198272U, // <5,5,2,7>: Cost 4 vzipr <1,u,5,2>, <1,3,5,7> - 2701946667U, // <5,5,2,u>: Cost 3 vext3 <2,u,4,5>, <5,2,u,4> - 2646853782U, // <5,5,3,0>: Cost 3 vext2 <4,u,5,5>, <3,0,1,2> - 3698034922U, // <5,5,3,1>: Cost 4 vext2 <1,1,5,5>, <3,1,1,5> - 3702679919U, // <5,5,3,2>: Cost 4 vext2 <1,u,5,5>, <3,2,7,3> - 2637564336U, // <5,5,3,3>: Cost 3 vext2 <3,3,5,5>, <3,3,5,5> - 2646854146U, // <5,5,3,4>: Cost 3 vext2 <4,u,5,5>, <3,4,5,6> - 2638891602U, // <5,5,3,5>: Cost 3 vext2 <3,5,5,5>, <3,5,5,5> - 3702680247U, // <5,5,3,6>: Cost 4 vext2 <1,u,5,5>, <3,6,7,7> - 3702680259U, // <5,5,3,7>: Cost 4 vext2 <1,u,5,5>, <3,7,0,1> - 2646854430U, // <5,5,3,u>: Cost 3 vext2 <4,u,5,5>, <3,u,1,2> - 2646854546U, // <5,5,4,0>: Cost 3 vext2 <4,u,5,5>, <4,0,5,1> - 2642209767U, // <5,5,4,1>: Cost 3 vext2 <4,1,5,5>, <4,1,5,5> - 3711306806U, // <5,5,4,2>: Cost 4 vext2 <3,3,5,5>, <4,2,5,3> - 3645516369U, // <5,5,4,3>: Cost 4 vext1 <3,5,5,4>, <3,5,5,4> - 1570458842U, // <5,5,4,4>: Cost 2 vext2 <4,4,5,5>, <4,4,5,5> - 1573113142U, // <5,5,4,5>: Cost 2 vext2 <4,u,5,5>, RHS - 2645527932U, // <5,5,4,6>: Cost 3 vext2 <4,6,5,5>, <4,6,5,5> - 2713964486U, // <5,5,4,7>: Cost 3 vext3 <4,u,5,5>, <5,4,7,6> - 1573113374U, // <5,5,4,u>: Cost 2 vext2 <4,u,5,5>, <4,u,5,5> - 1509982310U, // <5,5,5,0>: Cost 2 vext1 <5,5,5,5>, LHS - 2646855376U, // <5,5,5,1>: Cost 3 vext2 <4,u,5,5>, <5,1,7,3> - 2583725672U, // 
<5,5,5,2>: Cost 3 vext1 <5,5,5,5>, <2,2,2,2> - 2583726230U, // <5,5,5,3>: Cost 3 vext1 <5,5,5,5>, <3,0,1,2> - 1509985590U, // <5,5,5,4>: Cost 2 vext1 <5,5,5,5>, RHS - 229035318U, // <5,5,5,5>: Cost 1 vdup1 RHS - 2646855778U, // <5,5,5,6>: Cost 3 vext2 <4,u,5,5>, <5,6,7,0> - 2646855848U, // <5,5,5,7>: Cost 3 vext2 <4,u,5,5>, <5,7,5,7> - 229035318U, // <5,5,5,u>: Cost 1 vdup1 RHS - 2577760358U, // <5,5,6,0>: Cost 3 vext1 <4,5,5,6>, LHS - 3633587361U, // <5,5,6,1>: Cost 4 vext1 <1,5,5,6>, <1,5,5,6> - 2646856186U, // <5,5,6,2>: Cost 3 vext2 <4,u,5,5>, <6,2,7,3> - 3633588738U, // <5,5,6,3>: Cost 4 vext1 <1,5,5,6>, <3,4,5,6> - 2718535756U, // <5,5,6,4>: Cost 3 vext3 <5,6,4,5>, <5,6,4,5> - 2644202223U, // <5,5,6,5>: Cost 3 vext2 <4,4,5,5>, <6,5,7,5> - 2973780482U, // <5,5,6,6>: Cost 3 vzipr <3,4,5,6>, <3,4,5,6> - 2646856526U, // <5,5,6,7>: Cost 3 vext2 <4,u,5,5>, <6,7,0,1> - 2646856607U, // <5,5,6,u>: Cost 3 vext2 <4,u,5,5>, <6,u,0,1> - 2571796582U, // <5,5,7,0>: Cost 3 vext1 <3,5,5,7>, LHS - 3633595392U, // <5,5,7,1>: Cost 4 vext1 <1,5,5,7>, <1,3,5,7> - 2571798222U, // <5,5,7,2>: Cost 3 vext1 <3,5,5,7>, <2,3,4,5> - 2571799124U, // <5,5,7,3>: Cost 3 vext1 <3,5,5,7>, <3,5,5,7> - 2571799862U, // <5,5,7,4>: Cost 3 vext1 <3,5,5,7>, RHS - 3114717188U, // <5,5,7,5>: Cost 3 vtrnr RHS, <5,5,5,5> - 4034923010U, // <5,5,7,6>: Cost 4 vzipr <1,3,5,7>, <3,4,5,6> - 2040974646U, // <5,5,7,7>: Cost 2 vtrnr RHS, RHS - 2040974647U, // <5,5,7,u>: Cost 2 vtrnr RHS, RHS - 1509982310U, // <5,5,u,0>: Cost 2 vext1 <5,5,5,5>, LHS - 1573115694U, // <5,5,u,1>: Cost 2 vext2 <4,u,5,5>, LHS - 2571806414U, // <5,5,u,2>: Cost 3 vext1 <3,5,5,u>, <2,3,4,5> - 2571807317U, // <5,5,u,3>: Cost 3 vext1 <3,5,5,u>, <3,5,5,u> - 1509985590U, // <5,5,u,4>: Cost 2 vext1 <5,5,5,5>, RHS - 229035318U, // <5,5,u,5>: Cost 1 vdup1 RHS - 2646857936U, // <5,5,u,6>: Cost 3 vext2 <4,u,5,5>, - 2040982838U, // <5,5,u,7>: Cost 2 vtrnr RHS, RHS - 229035318U, // <5,5,u,u>: Cost 1 vdup1 RHS - 2638233600U, // <5,6,0,0>: Cost 3 vext2 <3,4,5,6>, <0,0,0,0> - 1564491878U, // <5,6,0,1>: Cost 2 vext2 <3,4,5,6>, LHS - 2632261796U, // <5,6,0,2>: Cost 3 vext2 <2,4,5,6>, <0,2,0,2> - 2638233856U, // <5,6,0,3>: Cost 3 vext2 <3,4,5,6>, <0,3,1,4> - 2638233938U, // <5,6,0,4>: Cost 3 vext2 <3,4,5,6>, <0,4,1,5> - 3706003885U, // <5,6,0,5>: Cost 4 vext2 <2,4,5,6>, <0,5,2,6> - 3706003967U, // <5,6,0,6>: Cost 4 vext2 <2,4,5,6>, <0,6,2,7> - 4047473974U, // <5,6,0,7>: Cost 4 vzipr <3,4,5,0>, RHS - 1564492445U, // <5,6,0,u>: Cost 2 vext2 <3,4,5,6>, LHS - 2638234358U, // <5,6,1,0>: Cost 3 vext2 <3,4,5,6>, <1,0,3,2> - 2638234420U, // <5,6,1,1>: Cost 3 vext2 <3,4,5,6>, <1,1,1,1> - 2638234518U, // <5,6,1,2>: Cost 3 vext2 <3,4,5,6>, <1,2,3,0> - 2638234584U, // <5,6,1,3>: Cost 3 vext2 <3,4,5,6>, <1,3,1,3> - 2626290768U, // <5,6,1,4>: Cost 3 vext2 <1,4,5,6>, <1,4,5,6> - 2638234768U, // <5,6,1,5>: Cost 3 vext2 <3,4,5,6>, <1,5,3,7> - 3700032719U, // <5,6,1,6>: Cost 4 vext2 <1,4,5,6>, <1,6,1,7> - 2982366518U, // <5,6,1,7>: Cost 3 vzipr <4,u,5,1>, RHS - 2628945300U, // <5,6,1,u>: Cost 3 vext2 <1,u,5,6>, <1,u,5,6> - 3706004925U, // <5,6,2,0>: Cost 4 vext2 <2,4,5,6>, <2,0,1,2> - 3711976966U, // <5,6,2,1>: Cost 4 vext2 <3,4,5,6>, <2,1,0,3> - 2638235240U, // <5,6,2,2>: Cost 3 vext2 <3,4,5,6>, <2,2,2,2> - 2638235302U, // <5,6,2,3>: Cost 3 vext2 <3,4,5,6>, <2,3,0,1> - 2632263465U, // <5,6,2,4>: Cost 3 vext2 <2,4,5,6>, <2,4,5,6> - 2638235496U, // <5,6,2,5>: Cost 3 vext2 <3,4,5,6>, <2,5,3,6> - 2638235578U, // <5,6,2,6>: Cost 3 vext2 <3,4,5,6>, <2,6,3,7> - 2713965050U, // <5,6,2,7>: Cost 3 vext3 
<4,u,5,5>, <6,2,7,3> - 2634917997U, // <5,6,2,u>: Cost 3 vext2 <2,u,5,6>, <2,u,5,6> - 2638235798U, // <5,6,3,0>: Cost 3 vext2 <3,4,5,6>, <3,0,1,2> - 3711977695U, // <5,6,3,1>: Cost 4 vext2 <3,4,5,6>, <3,1,0,3> - 3710650720U, // <5,6,3,2>: Cost 4 vext2 <3,2,5,6>, <3,2,5,6> - 2638236060U, // <5,6,3,3>: Cost 3 vext2 <3,4,5,6>, <3,3,3,3> - 1564494338U, // <5,6,3,4>: Cost 2 vext2 <3,4,5,6>, <3,4,5,6> - 2638236234U, // <5,6,3,5>: Cost 3 vext2 <3,4,5,6>, <3,5,4,6> - 3711978104U, // <5,6,3,6>: Cost 4 vext2 <3,4,5,6>, <3,6,0,7> - 4034227510U, // <5,6,3,7>: Cost 4 vzipr <1,2,5,3>, RHS - 1567148870U, // <5,6,3,u>: Cost 2 vext2 <3,u,5,6>, <3,u,5,6> - 2577817702U, // <5,6,4,0>: Cost 3 vext1 <4,5,6,4>, LHS - 3700034544U, // <5,6,4,1>: Cost 4 vext2 <1,4,5,6>, <4,1,6,5> - 2723033713U, // <5,6,4,2>: Cost 3 vext3 <6,4,2,5>, <6,4,2,5> - 2638236818U, // <5,6,4,3>: Cost 3 vext2 <3,4,5,6>, <4,3,6,5> - 2644208859U, // <5,6,4,4>: Cost 3 vext2 <4,4,5,6>, <4,4,5,6> - 1564495158U, // <5,6,4,5>: Cost 2 vext2 <3,4,5,6>, RHS - 2645536125U, // <5,6,4,6>: Cost 3 vext2 <4,6,5,6>, <4,6,5,6> - 2723402398U, // <5,6,4,7>: Cost 3 vext3 <6,4,7,5>, <6,4,7,5> - 1564495401U, // <5,6,4,u>: Cost 2 vext2 <3,4,5,6>, RHS - 2577825894U, // <5,6,5,0>: Cost 3 vext1 <4,5,6,5>, LHS - 2662125264U, // <5,6,5,1>: Cost 3 vext2 <7,4,5,6>, <5,1,7,3> - 3775836867U, // <5,6,5,2>: Cost 4 vext3 <2,u,6,5>, <6,5,2,6> - 3711979343U, // <5,6,5,3>: Cost 4 vext2 <3,4,5,6>, <5,3,3,4> - 2650181556U, // <5,6,5,4>: Cost 3 vext2 <5,4,5,6>, <5,4,5,6> - 2662125572U, // <5,6,5,5>: Cost 3 vext2 <7,4,5,6>, <5,5,5,5> - 2638237732U, // <5,6,5,6>: Cost 3 vext2 <3,4,5,6>, <5,6,0,1> - 2982399286U, // <5,6,5,7>: Cost 3 vzipr <4,u,5,5>, RHS - 2982399287U, // <5,6,5,u>: Cost 3 vzipr <4,u,5,5>, RHS - 2583806054U, // <5,6,6,0>: Cost 3 vext1 <5,5,6,6>, LHS - 3711979910U, // <5,6,6,1>: Cost 4 vext2 <3,4,5,6>, <6,1,3,4> - 2662126074U, // <5,6,6,2>: Cost 3 vext2 <7,4,5,6>, <6,2,7,3> - 2583808514U, // <5,6,6,3>: Cost 3 vext1 <5,5,6,6>, <3,4,5,6> - 2583809334U, // <5,6,6,4>: Cost 3 vext1 <5,5,6,6>, RHS - 2583810062U, // <5,6,6,5>: Cost 3 vext1 <5,5,6,6>, <5,5,6,6> - 2638238520U, // <5,6,6,6>: Cost 3 vext2 <3,4,5,6>, <6,6,6,6> - 2973781302U, // <5,6,6,7>: Cost 3 vzipr <3,4,5,6>, RHS - 2973781303U, // <5,6,6,u>: Cost 3 vzipr <3,4,5,6>, RHS - 430358630U, // <5,6,7,0>: Cost 1 vext1 RHS, LHS - 1504101110U, // <5,6,7,1>: Cost 2 vext1 RHS, <1,0,3,2> - 1504101992U, // <5,6,7,2>: Cost 2 vext1 RHS, <2,2,2,2> - 1504102550U, // <5,6,7,3>: Cost 2 vext1 RHS, <3,0,1,2> - 430361910U, // <5,6,7,4>: Cost 1 vext1 RHS, RHS - 1504104390U, // <5,6,7,5>: Cost 2 vext1 RHS, <5,4,7,6> - 1504105272U, // <5,6,7,6>: Cost 2 vext1 RHS, <6,6,6,6> - 1504106092U, // <5,6,7,7>: Cost 2 vext1 RHS, <7,7,7,7> - 430364462U, // <5,6,7,u>: Cost 1 vext1 RHS, LHS - 430366822U, // <5,6,u,0>: Cost 1 vext1 RHS, LHS - 1564497710U, // <5,6,u,1>: Cost 2 vext2 <3,4,5,6>, LHS - 1504110184U, // <5,6,u,2>: Cost 2 vext1 RHS, <2,2,2,2> - 1504110742U, // <5,6,u,3>: Cost 2 vext1 RHS, <3,0,1,2> - 430370103U, // <5,6,u,4>: Cost 1 vext1 RHS, RHS - 1564498074U, // <5,6,u,5>: Cost 2 vext2 <3,4,5,6>, RHS - 1504113146U, // <5,6,u,6>: Cost 2 vext1 RHS, <6,2,7,3> - 1504113658U, // <5,6,u,7>: Cost 2 vext1 RHS, <7,0,1,2> - 430372654U, // <5,6,u,u>: Cost 1 vext1 RHS, LHS - 2625634304U, // <5,7,0,0>: Cost 3 vext2 <1,3,5,7>, <0,0,0,0> - 1551892582U, // <5,7,0,1>: Cost 2 vext2 <1,3,5,7>, LHS - 2625634468U, // <5,7,0,2>: Cost 3 vext2 <1,3,5,7>, <0,2,0,2> - 2571889247U, // <5,7,0,3>: Cost 3 vext1 <3,5,7,0>, <3,5,7,0> - 2625634642U, // <5,7,0,4>: Cost 3 
vext2 <1,3,5,7>, <0,4,1,5> - 2595778728U, // <5,7,0,5>: Cost 3 vext1 <7,5,7,0>, <5,7,5,7> - 3699376639U, // <5,7,0,6>: Cost 4 vext2 <1,3,5,7>, <0,6,2,7> - 2260235715U, // <5,7,0,7>: Cost 3 vrev <7,5,7,0> - 1551893149U, // <5,7,0,u>: Cost 2 vext2 <1,3,5,7>, LHS - 2625635062U, // <5,7,1,0>: Cost 3 vext2 <1,3,5,7>, <1,0,3,2> - 2624308020U, // <5,7,1,1>: Cost 3 vext2 <1,1,5,7>, <1,1,1,1> - 2625635222U, // <5,7,1,2>: Cost 3 vext2 <1,3,5,7>, <1,2,3,0> - 1551893504U, // <5,7,1,3>: Cost 2 vext2 <1,3,5,7>, <1,3,5,7> - 2571898166U, // <5,7,1,4>: Cost 3 vext1 <3,5,7,1>, RHS - 2625635472U, // <5,7,1,5>: Cost 3 vext2 <1,3,5,7>, <1,5,3,7> - 2627626227U, // <5,7,1,6>: Cost 3 vext2 <1,6,5,7>, <1,6,5,7> - 3702031684U, // <5,7,1,7>: Cost 4 vext2 <1,7,5,7>, <1,7,5,7> - 1555211669U, // <5,7,1,u>: Cost 2 vext2 <1,u,5,7>, <1,u,5,7> - 2629617126U, // <5,7,2,0>: Cost 3 vext2 <2,0,5,7>, <2,0,5,7> - 3699377670U, // <5,7,2,1>: Cost 4 vext2 <1,3,5,7>, <2,1,0,3> - 2625635944U, // <5,7,2,2>: Cost 3 vext2 <1,3,5,7>, <2,2,2,2> - 2625636006U, // <5,7,2,3>: Cost 3 vext2 <1,3,5,7>, <2,3,0,1> - 2632271658U, // <5,7,2,4>: Cost 3 vext2 <2,4,5,7>, <2,4,5,7> - 2625636201U, // <5,7,2,5>: Cost 3 vext2 <1,3,5,7>, <2,5,3,7> - 2625636282U, // <5,7,2,6>: Cost 3 vext2 <1,3,5,7>, <2,6,3,7> - 3708004381U, // <5,7,2,7>: Cost 4 vext2 <2,7,5,7>, <2,7,5,7> - 2625636411U, // <5,7,2,u>: Cost 3 vext2 <1,3,5,7>, <2,u,0,1> - 2625636502U, // <5,7,3,0>: Cost 3 vext2 <1,3,5,7>, <3,0,1,2> - 2625636604U, // <5,7,3,1>: Cost 3 vext2 <1,3,5,7>, <3,1,3,5> - 3699378478U, // <5,7,3,2>: Cost 4 vext2 <1,3,5,7>, <3,2,0,1> - 2625636764U, // <5,7,3,3>: Cost 3 vext2 <1,3,5,7>, <3,3,3,3> - 2625636866U, // <5,7,3,4>: Cost 3 vext2 <1,3,5,7>, <3,4,5,6> - 2625636959U, // <5,7,3,5>: Cost 3 vext2 <1,3,5,7>, <3,5,7,0> - 3699378808U, // <5,7,3,6>: Cost 4 vext2 <1,3,5,7>, <3,6,0,7> - 2640235254U, // <5,7,3,7>: Cost 3 vext2 <3,7,5,7>, <3,7,5,7> - 2625637150U, // <5,7,3,u>: Cost 3 vext2 <1,3,5,7>, <3,u,1,2> - 2571919462U, // <5,7,4,0>: Cost 3 vext1 <3,5,7,4>, LHS - 2571920384U, // <5,7,4,1>: Cost 3 vext1 <3,5,7,4>, <1,3,5,7> - 3699379260U, // <5,7,4,2>: Cost 4 vext2 <1,3,5,7>, <4,2,6,0> - 2571922019U, // <5,7,4,3>: Cost 3 vext1 <3,5,7,4>, <3,5,7,4> - 2571922742U, // <5,7,4,4>: Cost 3 vext1 <3,5,7,4>, RHS - 1551895862U, // <5,7,4,5>: Cost 2 vext2 <1,3,5,7>, RHS - 2846277980U, // <5,7,4,6>: Cost 3 vuzpr RHS, <0,4,2,6> - 2646207951U, // <5,7,4,7>: Cost 3 vext2 <4,7,5,7>, <4,7,5,7> - 1551896105U, // <5,7,4,u>: Cost 2 vext2 <1,3,5,7>, RHS - 2583871590U, // <5,7,5,0>: Cost 3 vext1 <5,5,7,5>, LHS - 2652180176U, // <5,7,5,1>: Cost 3 vext2 <5,7,5,7>, <5,1,7,3> - 2625638177U, // <5,7,5,2>: Cost 3 vext2 <1,3,5,7>, <5,2,7,3> - 2625638262U, // <5,7,5,3>: Cost 3 vext2 <1,3,5,7>, <5,3,7,7> - 2583874870U, // <5,7,5,4>: Cost 3 vext1 <5,5,7,5>, RHS - 2846281732U, // <5,7,5,5>: Cost 3 vuzpr RHS, <5,5,5,5> - 2651517015U, // <5,7,5,6>: Cost 3 vext2 <5,6,5,7>, <5,6,5,7> - 1772539190U, // <5,7,5,7>: Cost 2 vuzpr RHS, RHS - 1772539191U, // <5,7,5,u>: Cost 2 vuzpr RHS, RHS - 2846281826U, // <5,7,6,0>: Cost 3 vuzpr RHS, <5,6,7,0> - 3699380615U, // <5,7,6,1>: Cost 4 vext2 <1,3,5,7>, <6,1,3,5> - 2846281108U, // <5,7,6,2>: Cost 3 vuzpr RHS, <4,6,u,2> - 2589854210U, // <5,7,6,3>: Cost 3 vext1 <6,5,7,6>, <3,4,5,6> - 2846281830U, // <5,7,6,4>: Cost 3 vuzpr RHS, <5,6,7,4> - 2725467658U, // <5,7,6,5>: Cost 3 vext3 <6,7,u,5>, <7,6,5,u> - 2846281076U, // <5,7,6,6>: Cost 3 vuzpr RHS, <4,6,4,6> - 2846279610U, // <5,7,6,7>: Cost 3 vuzpr RHS, <2,6,3,7> - 2846279611U, // <5,7,6,u>: Cost 3 vuzpr RHS, <2,6,3,u> - 
1510146150U, // <5,7,7,0>: Cost 2 vext1 <5,5,7,7>, LHS - 2846282574U, // <5,7,7,1>: Cost 3 vuzpr RHS, <6,7,0,1> - 2583889512U, // <5,7,7,2>: Cost 3 vext1 <5,5,7,7>, <2,2,2,2> - 2846281919U, // <5,7,7,3>: Cost 3 vuzpr RHS, <5,7,u,3> - 1510149430U, // <5,7,7,4>: Cost 2 vext1 <5,5,7,7>, RHS - 1510150168U, // <5,7,7,5>: Cost 2 vext1 <5,5,7,7>, <5,5,7,7> - 2583892474U, // <5,7,7,6>: Cost 3 vext1 <5,5,7,7>, <6,2,7,3> - 2625640044U, // <5,7,7,7>: Cost 3 vext2 <1,3,5,7>, <7,7,7,7> - 1510151982U, // <5,7,7,u>: Cost 2 vext1 <5,5,7,7>, LHS - 1510154342U, // <5,7,u,0>: Cost 2 vext1 <5,5,7,u>, LHS - 1551898414U, // <5,7,u,1>: Cost 2 vext2 <1,3,5,7>, LHS - 2625640325U, // <5,7,u,2>: Cost 3 vext2 <1,3,5,7>, - 1772536477U, // <5,7,u,3>: Cost 2 vuzpr RHS, LHS - 1510157622U, // <5,7,u,4>: Cost 2 vext1 <5,5,7,u>, RHS - 1551898778U, // <5,7,u,5>: Cost 2 vext2 <1,3,5,7>, RHS - 2625640656U, // <5,7,u,6>: Cost 3 vext2 <1,3,5,7>, - 1772539433U, // <5,7,u,7>: Cost 2 vuzpr RHS, RHS - 1551898981U, // <5,7,u,u>: Cost 2 vext2 <1,3,5,7>, LHS - 2625642496U, // <5,u,0,0>: Cost 3 vext2 <1,3,5,u>, <0,0,0,0> - 1551900774U, // <5,u,0,1>: Cost 2 vext2 <1,3,5,u>, LHS - 2625642660U, // <5,u,0,2>: Cost 3 vext2 <1,3,5,u>, <0,2,0,2> - 2698630885U, // <5,u,0,3>: Cost 3 vext3 <2,3,4,5>, - 2687129325U, // <5,u,0,4>: Cost 3 vext3 <0,4,1,5>, - 2689783542U, // <5,u,0,5>: Cost 3 vext3 <0,u,1,5>, - 2266134675U, // <5,u,0,6>: Cost 3 vrev - 2595853772U, // <5,u,0,7>: Cost 3 vext1 <7,5,u,0>, <7,5,u,0> - 1551901341U, // <5,u,0,u>: Cost 2 vext2 <1,3,5,u>, LHS - 2625643254U, // <5,u,1,0>: Cost 3 vext2 <1,3,5,u>, <1,0,3,2> - 2625643316U, // <5,u,1,1>: Cost 3 vext2 <1,3,5,u>, <1,1,1,1> - 1613387566U, // <5,u,1,2>: Cost 2 vext3 <0,4,1,5>, LHS - 1551901697U, // <5,u,1,3>: Cost 2 vext2 <1,3,5,u>, <1,3,5,u> - 2626307154U, // <5,u,1,4>: Cost 3 vext2 <1,4,5,u>, <1,4,5,u> - 2689783622U, // <5,u,1,5>: Cost 3 vext3 <0,u,1,5>, - 2627634420U, // <5,u,1,6>: Cost 3 vext2 <1,6,5,u>, <1,6,5,u> - 2982366536U, // <5,u,1,7>: Cost 3 vzipr <4,u,5,1>, RHS - 1613387620U, // <5,u,1,u>: Cost 2 vext3 <0,4,1,5>, LHS - 2846286742U, // <5,u,2,0>: Cost 3 vuzpr RHS, <1,2,3,0> - 2685796528U, // <5,u,2,1>: Cost 3 vext3 <0,2,1,5>, <0,2,1,5> - 2625644136U, // <5,u,2,2>: Cost 3 vext2 <1,3,5,u>, <2,2,2,2> - 2687129480U, // <5,u,2,3>: Cost 3 vext3 <0,4,1,5>, - 2632279851U, // <5,u,2,4>: Cost 3 vext2 <2,4,5,u>, <2,4,5,u> - 2625644394U, // <5,u,2,5>: Cost 3 vext2 <1,3,5,u>, <2,5,3,u> - 2625644474U, // <5,u,2,6>: Cost 3 vext2 <1,3,5,u>, <2,6,3,7> - 2713966508U, // <5,u,2,7>: Cost 3 vext3 <4,u,5,5>, - 2625644603U, // <5,u,2,u>: Cost 3 vext2 <1,3,5,u>, <2,u,0,1> - 2687129532U, // <5,u,3,0>: Cost 3 vext3 <0,4,1,5>, - 2636261649U, // <5,u,3,1>: Cost 3 vext2 <3,1,5,u>, <3,1,5,u> - 2636925282U, // <5,u,3,2>: Cost 3 vext2 <3,2,5,u>, <3,2,5,u> - 2625644956U, // <5,u,3,3>: Cost 3 vext2 <1,3,5,u>, <3,3,3,3> - 1564510724U, // <5,u,3,4>: Cost 2 vext2 <3,4,5,u>, <3,4,5,u> - 2625645160U, // <5,u,3,5>: Cost 3 vext2 <1,3,5,u>, <3,5,u,0> - 2734610422U, // <5,u,3,6>: Cost 3 vext3 , - 2640243447U, // <5,u,3,7>: Cost 3 vext2 <3,7,5,u>, <3,7,5,u> - 1567165256U, // <5,u,3,u>: Cost 2 vext2 <3,u,5,u>, <3,u,5,u> - 1567828889U, // <5,u,4,0>: Cost 2 vext2 <4,0,5,u>, <4,0,5,u> - 1661163546U, // <5,u,4,1>: Cost 2 vext3 , - 2734463012U, // <5,u,4,2>: Cost 3 vext3 , - 2698631212U, // <5,u,4,3>: Cost 3 vext3 <2,3,4,5>, - 1570458842U, // <5,u,4,4>: Cost 2 vext2 <4,4,5,5>, <4,4,5,5> - 1551904054U, // <5,u,4,5>: Cost 2 vext2 <1,3,5,u>, RHS - 2846286172U, // <5,u,4,6>: Cost 3 vuzpr RHS, <0,4,2,6> - 2646216144U, // 
<5,u,4,7>: Cost 3 vext2 <4,7,5,u>, <4,7,5,u> - 1551904297U, // <5,u,4,u>: Cost 2 vext2 <1,3,5,u>, RHS - 1509982310U, // <5,u,5,0>: Cost 2 vext1 <5,5,5,5>, LHS - 2560058555U, // <5,u,5,1>: Cost 3 vext1 <1,5,u,5>, <1,5,u,5> - 2698926194U, // <5,u,5,2>: Cost 3 vext3 <2,3,u,5>, - 2698631295U, // <5,u,5,3>: Cost 3 vext3 <2,3,4,5>, - 1509985590U, // <5,u,5,4>: Cost 2 vext1 <5,5,5,5>, RHS - 229035318U, // <5,u,5,5>: Cost 1 vdup1 RHS - 1613387930U, // <5,u,5,6>: Cost 2 vext3 <0,4,1,5>, RHS - 1772547382U, // <5,u,5,7>: Cost 2 vuzpr RHS, RHS - 229035318U, // <5,u,5,u>: Cost 1 vdup1 RHS - 2566037606U, // <5,u,6,0>: Cost 3 vext1 <2,5,u,6>, LHS - 2920044334U, // <5,u,6,1>: Cost 3 vzipl <5,6,7,0>, LHS - 2566039445U, // <5,u,6,2>: Cost 3 vext1 <2,5,u,6>, <2,5,u,6> - 2687129808U, // <5,u,6,3>: Cost 3 vext3 <0,4,1,5>, - 2566040886U, // <5,u,6,4>: Cost 3 vext1 <2,5,u,6>, RHS - 2920044698U, // <5,u,6,5>: Cost 3 vzipl <5,6,7,0>, RHS - 2846289268U, // <5,u,6,6>: Cost 3 vuzpr RHS, <4,6,4,6> - 2973781320U, // <5,u,6,7>: Cost 3 vzipr <3,4,5,6>, RHS - 2687129853U, // <5,u,6,u>: Cost 3 vext3 <0,4,1,5>, - 430506086U, // <5,u,7,0>: Cost 1 vext1 RHS, LHS - 1486333117U, // <5,u,7,1>: Cost 2 vext1 <1,5,u,7>, <1,5,u,7> - 1504249448U, // <5,u,7,2>: Cost 2 vext1 RHS, <2,2,2,2> - 2040971933U, // <5,u,7,3>: Cost 2 vtrnr RHS, LHS - 430509384U, // <5,u,7,4>: Cost 1 vext1 RHS, RHS - 1504251600U, // <5,u,7,5>: Cost 2 vext1 RHS, <5,1,7,3> - 118708378U, // <5,u,7,6>: Cost 1 vrev RHS - 2040974889U, // <5,u,7,7>: Cost 2 vtrnr RHS, RHS - 430511918U, // <5,u,7,u>: Cost 1 vext1 RHS, LHS - 430514278U, // <5,u,u,0>: Cost 1 vext1 RHS, LHS - 1551906606U, // <5,u,u,1>: Cost 2 vext2 <1,3,5,u>, LHS - 1613388133U, // <5,u,u,2>: Cost 2 vext3 <0,4,1,5>, LHS - 1772544669U, // <5,u,u,3>: Cost 2 vuzpr RHS, LHS - 430517577U, // <5,u,u,4>: Cost 1 vext1 RHS, RHS - 229035318U, // <5,u,u,5>: Cost 1 vdup1 RHS - 118716571U, // <5,u,u,6>: Cost 1 vrev RHS - 1772547625U, // <5,u,u,7>: Cost 2 vuzpr RHS, RHS - 430520110U, // <5,u,u,u>: Cost 1 vext1 RHS, LHS - 2686025728U, // <6,0,0,0>: Cost 3 vext3 <0,2,4,6>, <0,0,0,0> - 2686025738U, // <6,0,0,1>: Cost 3 vext3 <0,2,4,6>, <0,0,1,1> - 2686025748U, // <6,0,0,2>: Cost 3 vext3 <0,2,4,6>, <0,0,2,2> - 3779084320U, // <6,0,0,3>: Cost 4 vext3 <3,4,5,6>, <0,0,3,5> - 2642903388U, // <6,0,0,4>: Cost 3 vext2 <4,2,6,0>, <0,4,2,6> - 3657723939U, // <6,0,0,5>: Cost 4 vext1 <5,6,0,0>, <5,6,0,0> - 3926676514U, // <6,0,0,6>: Cost 4 vuzpr <5,6,7,0>, <7,0,5,6> - 3926675786U, // <6,0,0,7>: Cost 4 vuzpr <5,6,7,0>, <6,0,5,7> - 2686025802U, // <6,0,0,u>: Cost 3 vext3 <0,2,4,6>, <0,0,u,2> - 2566070374U, // <6,0,1,0>: Cost 3 vext1 <2,6,0,1>, LHS - 3759767642U, // <6,0,1,1>: Cost 4 vext3 <0,2,4,6>, <0,1,1,0> - 1612284006U, // <6,0,1,2>: Cost 2 vext3 <0,2,4,6>, LHS - 2583988738U, // <6,0,1,3>: Cost 3 vext1 <5,6,0,1>, <3,4,5,6> - 2566073654U, // <6,0,1,4>: Cost 3 vext1 <2,6,0,1>, RHS - 2583990308U, // <6,0,1,5>: Cost 3 vext1 <5,6,0,1>, <5,6,0,1> - 2589963005U, // <6,0,1,6>: Cost 3 vext1 <6,6,0,1>, <6,6,0,1> - 2595935702U, // <6,0,1,7>: Cost 3 vext1 <7,6,0,1>, <7,6,0,1> - 1612284060U, // <6,0,1,u>: Cost 2 vext3 <0,2,4,6>, LHS - 2686025892U, // <6,0,2,0>: Cost 3 vext3 <0,2,4,6>, <0,2,0,2> - 2685804721U, // <6,0,2,1>: Cost 3 vext3 <0,2,1,6>, <0,2,1,6> - 3759620282U, // <6,0,2,2>: Cost 4 vext3 <0,2,2,6>, <0,2,2,6> - 2705342658U, // <6,0,2,3>: Cost 3 vext3 <3,4,5,6>, <0,2,3,5> - 1612284108U, // <6,0,2,4>: Cost 2 vext3 <0,2,4,6>, <0,2,4,6> - 3706029956U, // <6,0,2,5>: Cost 4 vext2 <2,4,6,0>, <2,5,6,7> - 2686173406U, // <6,0,2,6>: Cost 3 vext3 
<0,2,6,6>, <0,2,6,6> - 3651769338U, // <6,0,2,7>: Cost 4 vext1 <4,6,0,2>, <7,0,1,2> - 1612579056U, // <6,0,2,u>: Cost 2 vext3 <0,2,u,6>, <0,2,u,6> - 3706030230U, // <6,0,3,0>: Cost 4 vext2 <2,4,6,0>, <3,0,1,2> - 2705342720U, // <6,0,3,1>: Cost 3 vext3 <3,4,5,6>, <0,3,1,4> - 2705342730U, // <6,0,3,2>: Cost 3 vext3 <3,4,5,6>, <0,3,2,5> - 3706030492U, // <6,0,3,3>: Cost 4 vext2 <2,4,6,0>, <3,3,3,3> - 2644896258U, // <6,0,3,4>: Cost 3 vext2 <4,5,6,0>, <3,4,5,6> - 3718638154U, // <6,0,3,5>: Cost 4 vext2 <4,5,6,0>, <3,5,4,6> - 3729918619U, // <6,0,3,6>: Cost 4 vext2 <6,4,6,0>, <3,6,4,6> - 3926672384U, // <6,0,3,7>: Cost 4 vuzpr <5,6,7,0>, <1,3,5,7> - 2705342784U, // <6,0,3,u>: Cost 3 vext3 <3,4,5,6>, <0,3,u,5> - 2687058250U, // <6,0,4,0>: Cost 3 vext3 <0,4,0,6>, <0,4,0,6> - 2686026066U, // <6,0,4,1>: Cost 3 vext3 <0,2,4,6>, <0,4,1,5> - 1613463900U, // <6,0,4,2>: Cost 2 vext3 <0,4,2,6>, <0,4,2,6> - 3761021285U, // <6,0,4,3>: Cost 4 vext3 <0,4,3,6>, <0,4,3,6> - 2687353198U, // <6,0,4,4>: Cost 3 vext3 <0,4,4,6>, <0,4,4,6> - 2632289590U, // <6,0,4,5>: Cost 3 vext2 <2,4,6,0>, RHS - 2645560704U, // <6,0,4,6>: Cost 3 vext2 <4,6,6,0>, <4,6,6,0> - 2646224337U, // <6,0,4,7>: Cost 3 vext2 <4,7,6,0>, <4,7,6,0> - 1613906322U, // <6,0,4,u>: Cost 2 vext3 <0,4,u,6>, <0,4,u,6> - 3651788902U, // <6,0,5,0>: Cost 4 vext1 <4,6,0,5>, LHS - 2687795620U, // <6,0,5,1>: Cost 3 vext3 <0,5,1,6>, <0,5,1,6> - 3761611181U, // <6,0,5,2>: Cost 4 vext3 <0,5,2,6>, <0,5,2,6> - 3723284326U, // <6,0,5,3>: Cost 4 vext2 <5,3,6,0>, <5,3,6,0> - 2646224838U, // <6,0,5,4>: Cost 3 vext2 <4,7,6,0>, <5,4,7,6> - 3718639630U, // <6,0,5,5>: Cost 4 vext2 <4,5,6,0>, <5,5,6,6> - 2652196962U, // <6,0,5,6>: Cost 3 vext2 <5,7,6,0>, <5,6,7,0> - 2852932918U, // <6,0,5,7>: Cost 3 vuzpr <5,6,7,0>, RHS - 2852932919U, // <6,0,5,u>: Cost 3 vuzpr <5,6,7,0>, RHS - 2852933730U, // <6,0,6,0>: Cost 3 vuzpr <5,6,7,0>, <5,6,7,0> - 2925985894U, // <6,0,6,1>: Cost 3 vzipl <6,6,6,6>, LHS - 3060203622U, // <6,0,6,2>: Cost 3 vtrnl <6,6,6,6>, LHS - 3718640178U, // <6,0,6,3>: Cost 4 vext2 <4,5,6,0>, <6,3,4,5> - 2656178832U, // <6,0,6,4>: Cost 3 vext2 <6,4,6,0>, <6,4,6,0> - 3725939378U, // <6,0,6,5>: Cost 4 vext2 <5,7,6,0>, <6,5,0,7> - 2657506098U, // <6,0,6,6>: Cost 3 vext2 <6,6,6,0>, <6,6,6,0> - 2619020110U, // <6,0,6,7>: Cost 3 vext2 <0,2,6,0>, <6,7,0,1> - 2925986461U, // <6,0,6,u>: Cost 3 vzipl <6,6,6,6>, LHS - 2572091494U, // <6,0,7,0>: Cost 3 vext1 <3,6,0,7>, LHS - 2572092310U, // <6,0,7,1>: Cost 3 vext1 <3,6,0,7>, <1,2,3,0> - 2980495524U, // <6,0,7,2>: Cost 3 vzipr RHS, <0,2,0,2> - 2572094072U, // <6,0,7,3>: Cost 3 vext1 <3,6,0,7>, <3,6,0,7> - 2572094774U, // <6,0,7,4>: Cost 3 vext1 <3,6,0,7>, RHS - 4054238242U, // <6,0,7,5>: Cost 4 vzipr RHS, <1,4,0,5> - 3645837653U, // <6,0,7,6>: Cost 4 vext1 <3,6,0,7>, <6,0,7,0> - 4054239054U, // <6,0,7,7>: Cost 4 vzipr RHS, <2,5,0,7> - 2572097326U, // <6,0,7,u>: Cost 3 vext1 <3,6,0,7>, LHS - 2686026378U, // <6,0,u,0>: Cost 3 vext3 <0,2,4,6>, <0,u,0,2> - 2686026386U, // <6,0,u,1>: Cost 3 vext3 <0,2,4,6>, <0,u,1,1> - 1612284573U, // <6,0,u,2>: Cost 2 vext3 <0,2,4,6>, LHS - 2705343144U, // <6,0,u,3>: Cost 3 vext3 <3,4,5,6>, <0,u,3,5> - 1616265906U, // <6,0,u,4>: Cost 2 vext3 <0,u,4,6>, <0,u,4,6> - 2632292506U, // <6,0,u,5>: Cost 3 vext2 <2,4,6,0>, RHS - 2590020356U, // <6,0,u,6>: Cost 3 vext1 <6,6,0,u>, <6,6,0,u> - 2852933161U, // <6,0,u,7>: Cost 3 vuzpr <5,6,7,0>, RHS - 1612284627U, // <6,0,u,u>: Cost 2 vext3 <0,2,4,6>, LHS - 2595995750U, // <6,1,0,0>: Cost 3 vext1 <7,6,1,0>, LHS - 2646229094U, // <6,1,0,1>: Cost 3 vext2 
<4,7,6,1>, LHS - 3694092492U, // <6,1,0,2>: Cost 4 vext2 <0,4,6,1>, <0,2,4,6> - 2686026486U, // <6,1,0,3>: Cost 3 vext3 <0,2,4,6>, <1,0,3,2> - 2595999030U, // <6,1,0,4>: Cost 3 vext1 <7,6,1,0>, RHS - 3767730952U, // <6,1,0,5>: Cost 4 vext3 <1,5,4,6>, <1,0,5,2> - 2596000590U, // <6,1,0,6>: Cost 3 vext1 <7,6,1,0>, <6,7,0,1> - 2596001246U, // <6,1,0,7>: Cost 3 vext1 <7,6,1,0>, <7,6,1,0> - 2686026531U, // <6,1,0,u>: Cost 3 vext3 <0,2,4,6>, <1,0,u,2> - 3763602219U, // <6,1,1,0>: Cost 4 vext3 <0,u,2,6>, <1,1,0,1> - 2686026548U, // <6,1,1,1>: Cost 3 vext3 <0,2,4,6>, <1,1,1,1> - 3764929346U, // <6,1,1,2>: Cost 4 vext3 <1,1,2,6>, <1,1,2,6> - 2686026568U, // <6,1,1,3>: Cost 3 vext3 <0,2,4,6>, <1,1,3,3> - 2691334996U, // <6,1,1,4>: Cost 3 vext3 <1,1,4,6>, <1,1,4,6> - 3760874332U, // <6,1,1,5>: Cost 4 vext3 <0,4,1,6>, <1,1,5,5> - 3765224294U, // <6,1,1,6>: Cost 4 vext3 <1,1,6,6>, <1,1,6,6> - 3669751263U, // <6,1,1,7>: Cost 4 vext1 <7,6,1,1>, <7,6,1,1> - 2686026613U, // <6,1,1,u>: Cost 3 vext3 <0,2,4,6>, <1,1,u,3> - 2554208358U, // <6,1,2,0>: Cost 3 vext1 <0,6,1,2>, LHS - 3763602311U, // <6,1,2,1>: Cost 4 vext3 <0,u,2,6>, <1,2,1,3> - 3639895971U, // <6,1,2,2>: Cost 4 vext1 <2,6,1,2>, <2,6,1,2> - 2686026646U, // <6,1,2,3>: Cost 3 vext3 <0,2,4,6>, <1,2,3,0> - 2554211638U, // <6,1,2,4>: Cost 3 vext1 <0,6,1,2>, RHS - 3760874411U, // <6,1,2,5>: Cost 4 vext3 <0,4,1,6>, <1,2,5,3> - 2554212858U, // <6,1,2,6>: Cost 3 vext1 <0,6,1,2>, <6,2,7,3> - 3802973114U, // <6,1,2,7>: Cost 4 vext3 <7,4,5,6>, <1,2,7,0> - 2686026691U, // <6,1,2,u>: Cost 3 vext3 <0,2,4,6>, <1,2,u,0> - 2566160486U, // <6,1,3,0>: Cost 3 vext1 <2,6,1,3>, LHS - 2686026712U, // <6,1,3,1>: Cost 3 vext3 <0,2,4,6>, <1,3,1,3> - 2686026724U, // <6,1,3,2>: Cost 3 vext3 <0,2,4,6>, <1,3,2,6> - 3759768552U, // <6,1,3,3>: Cost 4 vext3 <0,2,4,6>, <1,3,3,1> - 2692662262U, // <6,1,3,4>: Cost 3 vext3 <1,3,4,6>, <1,3,4,6> - 2686026752U, // <6,1,3,5>: Cost 3 vext3 <0,2,4,6>, <1,3,5,7> - 2590053128U, // <6,1,3,6>: Cost 3 vext1 <6,6,1,3>, <6,6,1,3> - 3663795194U, // <6,1,3,7>: Cost 4 vext1 <6,6,1,3>, <7,0,1,2> - 2686026775U, // <6,1,3,u>: Cost 3 vext3 <0,2,4,6>, <1,3,u,3> - 2641587099U, // <6,1,4,0>: Cost 3 vext2 <4,0,6,1>, <4,0,6,1> - 2693104684U, // <6,1,4,1>: Cost 3 vext3 <1,4,1,6>, <1,4,1,6> - 3639912357U, // <6,1,4,2>: Cost 4 vext1 <2,6,1,4>, <2,6,1,4> - 2687206462U, // <6,1,4,3>: Cost 3 vext3 <0,4,2,6>, <1,4,3,6> - 3633941814U, // <6,1,4,4>: Cost 4 vext1 <1,6,1,4>, RHS - 2693399632U, // <6,1,4,5>: Cost 3 vext3 <1,4,5,6>, <1,4,5,6> - 3765077075U, // <6,1,4,6>: Cost 4 vext3 <1,1,4,6>, <1,4,6,0> - 2646232530U, // <6,1,4,7>: Cost 3 vext2 <4,7,6,1>, <4,7,6,1> - 2687206507U, // <6,1,4,u>: Cost 3 vext3 <0,4,2,6>, <1,4,u,6> - 2647559796U, // <6,1,5,0>: Cost 3 vext2 <5,0,6,1>, <5,0,6,1> - 3765077118U, // <6,1,5,1>: Cost 4 vext3 <1,1,4,6>, <1,5,1,7> - 3767583878U, // <6,1,5,2>: Cost 4 vext3 <1,5,2,6>, <1,5,2,6> - 2686026896U, // <6,1,5,3>: Cost 3 vext3 <0,2,4,6>, <1,5,3,7> - 2693989528U, // <6,1,5,4>: Cost 3 vext3 <1,5,4,6>, <1,5,4,6> - 3767805089U, // <6,1,5,5>: Cost 4 vext3 <1,5,5,6>, <1,5,5,6> - 2652868706U, // <6,1,5,6>: Cost 3 vext2 <5,u,6,1>, <5,6,7,0> - 3908250934U, // <6,1,5,7>: Cost 4 vuzpr <2,6,0,1>, RHS - 2686026941U, // <6,1,5,u>: Cost 3 vext3 <0,2,4,6>, <1,5,u,7> - 2554241126U, // <6,1,6,0>: Cost 3 vext1 <0,6,1,6>, LHS - 3763602639U, // <6,1,6,1>: Cost 4 vext3 <0,u,2,6>, <1,6,1,7> - 3759547607U, // <6,1,6,2>: Cost 4 vext3 <0,2,1,6>, <1,6,2,6> - 3115221094U, // <6,1,6,3>: Cost 3 vtrnr <4,6,4,6>, LHS - 2554244406U, // <6,1,6,4>: Cost 3 vext1 <0,6,1,6>, RHS - 
3760874739U, // <6,1,6,5>: Cost 4 vext3 <0,4,1,6>, <1,6,5,7> - 2554245944U, // <6,1,6,6>: Cost 3 vext1 <0,6,1,6>, <6,6,6,6> - 3719975758U, // <6,1,6,7>: Cost 4 vext2 <4,7,6,1>, <6,7,0,1> - 3115221099U, // <6,1,6,u>: Cost 3 vtrnr <4,6,4,6>, LHS - 2560221286U, // <6,1,7,0>: Cost 3 vext1 <1,6,1,7>, LHS - 2560222415U, // <6,1,7,1>: Cost 3 vext1 <1,6,1,7>, <1,6,1,7> - 2980497558U, // <6,1,7,2>: Cost 3 vzipr RHS, <3,0,1,2> - 3103211622U, // <6,1,7,3>: Cost 3 vtrnr <2,6,3,7>, LHS - 2560224566U, // <6,1,7,4>: Cost 3 vext1 <1,6,1,7>, RHS - 2980495698U, // <6,1,7,5>: Cost 3 vzipr RHS, <0,4,1,5> - 3633967526U, // <6,1,7,6>: Cost 4 vext1 <1,6,1,7>, <6,1,7,0> - 4054237686U, // <6,1,7,7>: Cost 4 vzipr RHS, <0,6,1,7> - 2560227118U, // <6,1,7,u>: Cost 3 vext1 <1,6,1,7>, LHS - 2560229478U, // <6,1,u,0>: Cost 3 vext1 <1,6,1,u>, LHS - 2686027117U, // <6,1,u,1>: Cost 3 vext3 <0,2,4,6>, <1,u,1,3> - 2686027129U, // <6,1,u,2>: Cost 3 vext3 <0,2,4,6>, <1,u,2,6> - 2686027132U, // <6,1,u,3>: Cost 3 vext3 <0,2,4,6>, <1,u,3,0> - 2687206795U, // <6,1,u,4>: Cost 3 vext3 <0,4,2,6>, <1,u,4,6> - 2686027157U, // <6,1,u,5>: Cost 3 vext3 <0,2,4,6>, <1,u,5,7> - 2590094093U, // <6,1,u,6>: Cost 3 vext1 <6,6,1,u>, <6,6,1,u> - 2596066790U, // <6,1,u,7>: Cost 3 vext1 <7,6,1,u>, <7,6,1,u> - 2686027177U, // <6,1,u,u>: Cost 3 vext3 <0,2,4,6>, <1,u,u,0> - 2646900736U, // <6,2,0,0>: Cost 3 vext2 <4,u,6,2>, <0,0,0,0> - 1573159014U, // <6,2,0,1>: Cost 2 vext2 <4,u,6,2>, LHS - 2646900900U, // <6,2,0,2>: Cost 3 vext2 <4,u,6,2>, <0,2,0,2> - 3759769037U, // <6,2,0,3>: Cost 4 vext3 <0,2,4,6>, <2,0,3,0> - 2641592668U, // <6,2,0,4>: Cost 3 vext2 <4,0,6,2>, <0,4,2,6> - 3779085794U, // <6,2,0,5>: Cost 4 vext3 <3,4,5,6>, <2,0,5,3> - 2686027244U, // <6,2,0,6>: Cost 3 vext3 <0,2,4,6>, <2,0,6,4> - 3669816807U, // <6,2,0,7>: Cost 4 vext1 <7,6,2,0>, <7,6,2,0> - 1573159581U, // <6,2,0,u>: Cost 2 vext2 <4,u,6,2>, LHS - 2230527897U, // <6,2,1,0>: Cost 3 vrev <2,6,0,1> - 2646901556U, // <6,2,1,1>: Cost 3 vext2 <4,u,6,2>, <1,1,1,1> - 2646901654U, // <6,2,1,2>: Cost 3 vext2 <4,u,6,2>, <1,2,3,0> - 2847047782U, // <6,2,1,3>: Cost 3 vuzpr <4,6,u,2>, LHS - 3771049517U, // <6,2,1,4>: Cost 4 vext3 <2,1,4,6>, <2,1,4,6> - 2646901904U, // <6,2,1,5>: Cost 3 vext2 <4,u,6,2>, <1,5,3,7> - 2686027324U, // <6,2,1,6>: Cost 3 vext3 <0,2,4,6>, <2,1,6,3> - 3669825000U, // <6,2,1,7>: Cost 4 vext1 <7,6,2,1>, <7,6,2,1> - 2231117793U, // <6,2,1,u>: Cost 3 vrev <2,6,u,1> - 3763603029U, // <6,2,2,0>: Cost 4 vext3 <0,u,2,6>, <2,2,0,1> - 3759769184U, // <6,2,2,1>: Cost 4 vext3 <0,2,4,6>, <2,2,1,3> - 2686027368U, // <6,2,2,2>: Cost 3 vext3 <0,2,4,6>, <2,2,2,2> - 2686027378U, // <6,2,2,3>: Cost 3 vext3 <0,2,4,6>, <2,2,3,3> - 2697971326U, // <6,2,2,4>: Cost 3 vext3 <2,2,4,6>, <2,2,4,6> - 3759769224U, // <6,2,2,5>: Cost 4 vext3 <0,2,4,6>, <2,2,5,7> - 2698118800U, // <6,2,2,6>: Cost 3 vext3 <2,2,6,6>, <2,2,6,6> - 3920794092U, // <6,2,2,7>: Cost 4 vuzpr <4,6,u,2>, <6,2,5,7> - 2686027423U, // <6,2,2,u>: Cost 3 vext3 <0,2,4,6>, <2,2,u,3> - 2686027430U, // <6,2,3,0>: Cost 3 vext3 <0,2,4,6>, <2,3,0,1> - 3759769262U, // <6,2,3,1>: Cost 4 vext3 <0,2,4,6>, <2,3,1,0> - 2698487485U, // <6,2,3,2>: Cost 3 vext3 <2,3,2,6>, <2,3,2,6> - 2705344196U, // <6,2,3,3>: Cost 3 vext3 <3,4,5,6>, <2,3,3,4> - 2686027470U, // <6,2,3,4>: Cost 3 vext3 <0,2,4,6>, <2,3,4,5> - 2698708696U, // <6,2,3,5>: Cost 3 vext3 <2,3,5,6>, <2,3,5,6> - 2724660961U, // <6,2,3,6>: Cost 3 vext3 <6,6,6,6>, <2,3,6,6> - 2729232104U, // <6,2,3,7>: Cost 3 vext3 <7,4,5,6>, <2,3,7,4> - 2686027502U, // <6,2,3,u>: Cost 3 vext3 <0,2,4,6>, 
<2,3,u,1> - 1567853468U, // <6,2,4,0>: Cost 2 vext2 <4,0,6,2>, <4,0,6,2> - 3759769351U, // <6,2,4,1>: Cost 4 vext3 <0,2,4,6>, <2,4,1,u> - 2699151118U, // <6,2,4,2>: Cost 3 vext3 <2,4,2,6>, <2,4,2,6> - 2686027543U, // <6,2,4,3>: Cost 3 vext3 <0,2,4,6>, <2,4,3,6> - 2699298592U, // <6,2,4,4>: Cost 3 vext3 <2,4,4,6>, <2,4,4,6> - 1573162294U, // <6,2,4,5>: Cost 2 vext2 <4,u,6,2>, RHS - 2686027564U, // <6,2,4,6>: Cost 3 vext3 <0,2,4,6>, <2,4,6,0> - 3719982547U, // <6,2,4,7>: Cost 4 vext2 <4,7,6,2>, <4,7,6,2> - 1573162532U, // <6,2,4,u>: Cost 2 vext2 <4,u,6,2>, <4,u,6,2> - 3779086154U, // <6,2,5,0>: Cost 4 vext3 <3,4,5,6>, <2,5,0,3> - 2646904528U, // <6,2,5,1>: Cost 3 vext2 <4,u,6,2>, <5,1,7,3> - 3759769440U, // <6,2,5,2>: Cost 4 vext3 <0,2,4,6>, <2,5,2,7> - 2699888488U, // <6,2,5,3>: Cost 3 vext3 <2,5,3,6>, <2,5,3,6> - 2230855617U, // <6,2,5,4>: Cost 3 vrev <2,6,4,5> - 2646904836U, // <6,2,5,5>: Cost 3 vext2 <4,u,6,2>, <5,5,5,5> - 2646904930U, // <6,2,5,6>: Cost 3 vext2 <4,u,6,2>, <5,6,7,0> - 2847051062U, // <6,2,5,7>: Cost 3 vuzpr <4,6,u,2>, RHS - 2700257173U, // <6,2,5,u>: Cost 3 vext3 <2,5,u,6>, <2,5,u,6> - 2687207321U, // <6,2,6,0>: Cost 3 vext3 <0,4,2,6>, <2,6,0,1> - 2686027684U, // <6,2,6,1>: Cost 3 vext3 <0,2,4,6>, <2,6,1,3> - 2566260656U, // <6,2,6,2>: Cost 3 vext1 <2,6,2,6>, <2,6,2,6> - 2685806522U, // <6,2,6,3>: Cost 3 vext3 <0,2,1,6>, <2,6,3,7> - 2687207361U, // <6,2,6,4>: Cost 3 vext3 <0,4,2,6>, <2,6,4,5> - 2686027724U, // <6,2,6,5>: Cost 3 vext3 <0,2,4,6>, <2,6,5,7> - 2646905656U, // <6,2,6,6>: Cost 3 vext2 <4,u,6,2>, <6,6,6,6> - 2646905678U, // <6,2,6,7>: Cost 3 vext2 <4,u,6,2>, <6,7,0,1> - 2686027751U, // <6,2,6,u>: Cost 3 vext3 <0,2,4,6>, <2,6,u,7> - 2554323046U, // <6,2,7,0>: Cost 3 vext1 <0,6,2,7>, LHS - 2572239606U, // <6,2,7,1>: Cost 3 vext1 <3,6,2,7>, <1,0,3,2> - 2566268849U, // <6,2,7,2>: Cost 3 vext1 <2,6,2,7>, <2,6,2,7> - 1906753638U, // <6,2,7,3>: Cost 2 vzipr RHS, LHS - 2554326326U, // <6,2,7,4>: Cost 3 vext1 <0,6,2,7>, RHS - 3304687564U, // <6,2,7,5>: Cost 4 vrev <2,6,5,7> - 2980495708U, // <6,2,7,6>: Cost 3 vzipr RHS, <0,4,2,6> - 2646906476U, // <6,2,7,7>: Cost 3 vext2 <4,u,6,2>, <7,7,7,7> - 1906753643U, // <6,2,7,u>: Cost 2 vzipr RHS, LHS - 1591744256U, // <6,2,u,0>: Cost 2 vext2 , - 1573164846U, // <6,2,u,1>: Cost 2 vext2 <4,u,6,2>, LHS - 2701805650U, // <6,2,u,2>: Cost 3 vext3 <2,u,2,6>, <2,u,2,6> - 1906761830U, // <6,2,u,3>: Cost 2 vzipr RHS, LHS - 2686027875U, // <6,2,u,4>: Cost 3 vext3 <0,2,4,6>, <2,u,4,5> - 1573165210U, // <6,2,u,5>: Cost 2 vext2 <4,u,6,2>, RHS - 2686322800U, // <6,2,u,6>: Cost 3 vext3 <0,2,u,6>, <2,u,6,0> - 2847051305U, // <6,2,u,7>: Cost 3 vuzpr <4,6,u,2>, RHS - 1906761835U, // <6,2,u,u>: Cost 2 vzipr RHS, LHS - 3759769739U, // <6,3,0,0>: Cost 4 vext3 <0,2,4,6>, <3,0,0,0> - 2686027926U, // <6,3,0,1>: Cost 3 vext3 <0,2,4,6>, <3,0,1,2> - 2686027937U, // <6,3,0,2>: Cost 3 vext3 <0,2,4,6>, <3,0,2,4> - 3640027286U, // <6,3,0,3>: Cost 4 vext1 <2,6,3,0>, <3,0,1,2> - 2687207601U, // <6,3,0,4>: Cost 3 vext3 <0,4,2,6>, <3,0,4,2> - 2705344698U, // <6,3,0,5>: Cost 3 vext3 <3,4,5,6>, <3,0,5,2> - 3663917847U, // <6,3,0,6>: Cost 4 vext1 <6,6,3,0>, <6,6,3,0> - 2237008560U, // <6,3,0,7>: Cost 3 vrev <3,6,7,0> - 2686027989U, // <6,3,0,u>: Cost 3 vext3 <0,2,4,6>, <3,0,u,2> - 3759769823U, // <6,3,1,0>: Cost 4 vext3 <0,2,4,6>, <3,1,0,3> - 3759769830U, // <6,3,1,1>: Cost 4 vext3 <0,2,4,6>, <3,1,1,1> - 3759769841U, // <6,3,1,2>: Cost 4 vext3 <0,2,4,6>, <3,1,2,3> - 3759769848U, // <6,3,1,3>: Cost 4 vext3 <0,2,4,6>, <3,1,3,1> - 2703280390U, // <6,3,1,4>: Cost 3 vext3 
<3,1,4,6>, <3,1,4,6> - 3759769868U, // <6,3,1,5>: Cost 4 vext3 <0,2,4,6>, <3,1,5,3> - 3704063194U, // <6,3,1,6>: Cost 4 vext2 <2,1,6,3>, <1,6,3,0> - 3767732510U, // <6,3,1,7>: Cost 4 vext3 <1,5,4,6>, <3,1,7,3> - 2703280390U, // <6,3,1,u>: Cost 3 vext3 <3,1,4,6>, <3,1,4,6> - 3704063468U, // <6,3,2,0>: Cost 4 vext2 <2,1,6,3>, <2,0,6,4> - 2630321724U, // <6,3,2,1>: Cost 3 vext2 <2,1,6,3>, <2,1,6,3> - 3759769921U, // <6,3,2,2>: Cost 4 vext3 <0,2,4,6>, <3,2,2,2> - 3759769928U, // <6,3,2,3>: Cost 4 vext3 <0,2,4,6>, <3,2,3,0> - 3704063767U, // <6,3,2,4>: Cost 4 vext2 <2,1,6,3>, <2,4,3,6> - 3704063876U, // <6,3,2,5>: Cost 4 vext2 <2,1,6,3>, <2,5,6,7> - 2636957626U, // <6,3,2,6>: Cost 3 vext2 <3,2,6,3>, <2,6,3,7> - 3777907058U, // <6,3,2,7>: Cost 4 vext3 <3,2,7,6>, <3,2,7,6> - 2630321724U, // <6,3,2,u>: Cost 3 vext2 <2,1,6,3>, <2,1,6,3> - 3759769983U, // <6,3,3,0>: Cost 4 vext3 <0,2,4,6>, <3,3,0,1> - 3710036245U, // <6,3,3,1>: Cost 4 vext2 <3,1,6,3>, <3,1,6,3> - 2636958054U, // <6,3,3,2>: Cost 3 vext2 <3,2,6,3>, <3,2,6,3> - 2686028188U, // <6,3,3,3>: Cost 3 vext3 <0,2,4,6>, <3,3,3,3> - 2704607656U, // <6,3,3,4>: Cost 3 vext3 <3,3,4,6>, <3,3,4,6> - 3773041072U, // <6,3,3,5>: Cost 4 vext3 <2,4,4,6>, <3,3,5,5> - 3711363731U, // <6,3,3,6>: Cost 4 vext2 <3,3,6,3>, <3,6,3,7> - 3767732676U, // <6,3,3,7>: Cost 4 vext3 <1,5,4,6>, <3,3,7,7> - 2707999179U, // <6,3,3,u>: Cost 3 vext3 <3,u,5,6>, <3,3,u,5> - 2584232038U, // <6,3,4,0>: Cost 3 vext1 <5,6,3,4>, LHS - 2642267118U, // <6,3,4,1>: Cost 3 vext2 <4,1,6,3>, <4,1,6,3> - 2642930751U, // <6,3,4,2>: Cost 3 vext2 <4,2,6,3>, <4,2,6,3> - 2705197552U, // <6,3,4,3>: Cost 3 vext3 <3,4,3,6>, <3,4,3,6> - 2584235318U, // <6,3,4,4>: Cost 3 vext1 <5,6,3,4>, RHS - 1631603202U, // <6,3,4,5>: Cost 2 vext3 <3,4,5,6>, <3,4,5,6> - 2654211444U, // <6,3,4,6>: Cost 3 vext2 <6,1,6,3>, <4,6,4,6> - 2237041332U, // <6,3,4,7>: Cost 3 vrev <3,6,7,4> - 1631824413U, // <6,3,4,u>: Cost 2 vext3 <3,4,u,6>, <3,4,u,6> - 3640066150U, // <6,3,5,0>: Cost 4 vext1 <2,6,3,5>, LHS - 3772746288U, // <6,3,5,1>: Cost 4 vext3 <2,4,0,6>, <3,5,1,7> - 3640067790U, // <6,3,5,2>: Cost 4 vext1 <2,6,3,5>, <2,3,4,5> - 3773041216U, // <6,3,5,3>: Cost 4 vext3 <2,4,4,6>, <3,5,3,5> - 2705934922U, // <6,3,5,4>: Cost 3 vext3 <3,5,4,6>, <3,5,4,6> - 3773041236U, // <6,3,5,5>: Cost 4 vext3 <2,4,4,6>, <3,5,5,7> - 3779086940U, // <6,3,5,6>: Cost 4 vext3 <3,4,5,6>, <3,5,6,6> - 3767732831U, // <6,3,5,7>: Cost 4 vext3 <1,5,4,6>, <3,5,7,0> - 2706229870U, // <6,3,5,u>: Cost 3 vext3 <3,5,u,6>, <3,5,u,6> - 2602164326U, // <6,3,6,0>: Cost 3 vext1 , LHS - 2654212512U, // <6,3,6,1>: Cost 3 vext2 <6,1,6,3>, <6,1,6,3> - 2566334393U, // <6,3,6,2>: Cost 3 vext1 <2,6,3,6>, <2,6,3,6> - 3704066588U, // <6,3,6,3>: Cost 4 vext2 <2,1,6,3>, <6,3,2,1> - 2602167524U, // <6,3,6,4>: Cost 3 vext1 , <4,4,6,6> - 3710702321U, // <6,3,6,5>: Cost 4 vext2 <3,2,6,3>, <6,5,7,7> - 2724661933U, // <6,3,6,6>: Cost 3 vext3 <6,6,6,6>, <3,6,6,6> - 3710702465U, // <6,3,6,7>: Cost 4 vext2 <3,2,6,3>, <6,7,5,7> - 2602170158U, // <6,3,6,u>: Cost 3 vext1 , LHS - 1492598886U, // <6,3,7,0>: Cost 2 vext1 <2,6,3,7>, LHS - 2560369889U, // <6,3,7,1>: Cost 3 vext1 <1,6,3,7>, <1,6,3,7> - 1492600762U, // <6,3,7,2>: Cost 2 vext1 <2,6,3,7>, <2,6,3,7> - 2566342806U, // <6,3,7,3>: Cost 3 vext1 <2,6,3,7>, <3,0,1,2> - 1492602166U, // <6,3,7,4>: Cost 2 vext1 <2,6,3,7>, RHS - 2602176208U, // <6,3,7,5>: Cost 3 vext1 , <5,1,7,3> - 2566345210U, // <6,3,7,6>: Cost 3 vext1 <2,6,3,7>, <6,2,7,3> - 2980496528U, // <6,3,7,7>: Cost 3 vzipr RHS, <1,5,3,7> - 1492604718U, // <6,3,7,u>: Cost 2 
vext1 <2,6,3,7>, LHS - 1492607078U, // <6,3,u,0>: Cost 2 vext1 <2,6,3,u>, LHS - 2686028574U, // <6,3,u,1>: Cost 3 vext3 <0,2,4,6>, <3,u,1,2> - 1492608955U, // <6,3,u,2>: Cost 2 vext1 <2,6,3,u>, <2,6,3,u> - 2566350998U, // <6,3,u,3>: Cost 3 vext1 <2,6,3,u>, <3,0,1,2> - 1492610358U, // <6,3,u,4>: Cost 2 vext1 <2,6,3,u>, RHS - 1634257734U, // <6,3,u,5>: Cost 2 vext3 <3,u,5,6>, <3,u,5,6> - 2566353489U, // <6,3,u,6>: Cost 3 vext1 <2,6,3,u>, <6,3,u,0> - 2980504720U, // <6,3,u,7>: Cost 3 vzipr RHS, <1,5,3,7> - 1492612910U, // <6,3,u,u>: Cost 2 vext1 <2,6,3,u>, LHS - 3703406592U, // <6,4,0,0>: Cost 4 vext2 <2,0,6,4>, <0,0,0,0> - 2629664870U, // <6,4,0,1>: Cost 3 vext2 <2,0,6,4>, LHS - 2629664972U, // <6,4,0,2>: Cost 3 vext2 <2,0,6,4>, <0,2,4,6> - 3779087232U, // <6,4,0,3>: Cost 4 vext3 <3,4,5,6>, <4,0,3,1> - 2642936156U, // <6,4,0,4>: Cost 3 vext2 <4,2,6,4>, <0,4,2,6> - 2712570770U, // <6,4,0,5>: Cost 3 vext3 <4,6,4,6>, <4,0,5,1> - 2687208348U, // <6,4,0,6>: Cost 3 vext3 <0,4,2,6>, <4,0,6,2> - 3316723081U, // <6,4,0,7>: Cost 4 vrev <4,6,7,0> - 2629665437U, // <6,4,0,u>: Cost 3 vext2 <2,0,6,4>, LHS - 2242473291U, // <6,4,1,0>: Cost 3 vrev <4,6,0,1> - 3700089652U, // <6,4,1,1>: Cost 4 vext2 <1,4,6,4>, <1,1,1,1> - 3703407510U, // <6,4,1,2>: Cost 4 vext2 <2,0,6,4>, <1,2,3,0> - 2852962406U, // <6,4,1,3>: Cost 3 vuzpr <5,6,7,4>, LHS - 3628166454U, // <6,4,1,4>: Cost 4 vext1 <0,6,4,1>, RHS - 3760876514U, // <6,4,1,5>: Cost 4 vext3 <0,4,1,6>, <4,1,5,0> - 2687208430U, // <6,4,1,6>: Cost 3 vext3 <0,4,2,6>, <4,1,6,3> - 3316731274U, // <6,4,1,7>: Cost 4 vrev <4,6,7,1> - 2243063187U, // <6,4,1,u>: Cost 3 vrev <4,6,u,1> - 2629666284U, // <6,4,2,0>: Cost 3 vext2 <2,0,6,4>, <2,0,6,4> - 3703408188U, // <6,4,2,1>: Cost 4 vext2 <2,0,6,4>, <2,1,6,3> - 3703408232U, // <6,4,2,2>: Cost 4 vext2 <2,0,6,4>, <2,2,2,2> - 3703408294U, // <6,4,2,3>: Cost 4 vext2 <2,0,6,4>, <2,3,0,1> - 2632320816U, // <6,4,2,4>: Cost 3 vext2 <2,4,6,4>, <2,4,6,4> - 2923384118U, // <6,4,2,5>: Cost 3 vzipl <6,2,7,3>, RHS - 2687208508U, // <6,4,2,6>: Cost 3 vext3 <0,4,2,6>, <4,2,6,0> - 3760950341U, // <6,4,2,7>: Cost 4 vext3 <0,4,2,6>, <4,2,7,0> - 2634975348U, // <6,4,2,u>: Cost 3 vext2 <2,u,6,4>, <2,u,6,4> - 3703408790U, // <6,4,3,0>: Cost 4 vext2 <2,0,6,4>, <3,0,1,2> - 3316305238U, // <6,4,3,1>: Cost 4 vrev <4,6,1,3> - 3703408947U, // <6,4,3,2>: Cost 4 vext2 <2,0,6,4>, <3,2,0,6> - 3703409052U, // <6,4,3,3>: Cost 4 vext2 <2,0,6,4>, <3,3,3,3> - 2644929026U, // <6,4,3,4>: Cost 3 vext2 <4,5,6,4>, <3,4,5,6> - 3718670922U, // <6,4,3,5>: Cost 4 vext2 <4,5,6,4>, <3,5,4,6> - 2705345682U, // <6,4,3,6>: Cost 3 vext3 <3,4,5,6>, <4,3,6,5> - 3926705152U, // <6,4,3,7>: Cost 4 vuzpr <5,6,7,4>, <1,3,5,7> - 2668817222U, // <6,4,3,u>: Cost 3 vext2 , <3,u,5,6> - 2590277734U, // <6,4,4,0>: Cost 3 vext1 <6,6,4,4>, LHS - 3716017135U, // <6,4,4,1>: Cost 4 vext2 <4,1,6,4>, <4,1,6,4> - 2642938944U, // <6,4,4,2>: Cost 3 vext2 <4,2,6,4>, <4,2,6,4> - 3717344401U, // <6,4,4,3>: Cost 4 vext2 <4,3,6,4>, <4,3,6,4> - 2712571088U, // <6,4,4,4>: Cost 3 vext3 <4,6,4,6>, <4,4,4,4> - 2629668150U, // <6,4,4,5>: Cost 3 vext2 <2,0,6,4>, RHS - 1637649636U, // <6,4,4,6>: Cost 2 vext3 <4,4,6,6>, <4,4,6,6> - 2646257109U, // <6,4,4,7>: Cost 3 vext2 <4,7,6,4>, <4,7,6,4> - 1637649636U, // <6,4,4,u>: Cost 2 vext3 <4,4,6,6>, <4,4,6,6> - 2566398054U, // <6,4,5,0>: Cost 3 vext1 <2,6,4,5>, LHS - 3760876805U, // <6,4,5,1>: Cost 4 vext3 <0,4,1,6>, <4,5,1,3> - 2566399937U, // <6,4,5,2>: Cost 3 vext1 <2,6,4,5>, <2,6,4,5> - 2584316418U, // <6,4,5,3>: Cost 3 vext1 <5,6,4,5>, <3,4,5,6> - 2566401334U, // 
<6,4,5,4>: Cost 3 vext1 <2,6,4,5>, RHS - 2584318028U, // <6,4,5,5>: Cost 3 vext1 <5,6,4,5>, <5,6,4,5> - 1612287286U, // <6,4,5,6>: Cost 2 vext3 <0,2,4,6>, RHS - 2852965686U, // <6,4,5,7>: Cost 3 vuzpr <5,6,7,4>, RHS - 1612287304U, // <6,4,5,u>: Cost 2 vext3 <0,2,4,6>, RHS - 1504608358U, // <6,4,6,0>: Cost 2 vext1 <4,6,4,6>, LHS - 2578350838U, // <6,4,6,1>: Cost 3 vext1 <4,6,4,6>, <1,0,3,2> - 2578351720U, // <6,4,6,2>: Cost 3 vext1 <4,6,4,6>, <2,2,2,2> - 2578352278U, // <6,4,6,3>: Cost 3 vext1 <4,6,4,6>, <3,0,1,2> - 1504611638U, // <6,4,6,4>: Cost 2 vext1 <4,6,4,6>, RHS - 2578353872U, // <6,4,6,5>: Cost 3 vext1 <4,6,4,6>, <5,1,7,3> - 2578354682U, // <6,4,6,6>: Cost 3 vext1 <4,6,4,6>, <6,2,7,3> - 2578355194U, // <6,4,6,7>: Cost 3 vext1 <4,6,4,6>, <7,0,1,2> - 1504614190U, // <6,4,6,u>: Cost 2 vext1 <4,6,4,6>, LHS - 2572386406U, // <6,4,7,0>: Cost 3 vext1 <3,6,4,7>, LHS - 2572387226U, // <6,4,7,1>: Cost 3 vext1 <3,6,4,7>, <1,2,3,4> - 3640157902U, // <6,4,7,2>: Cost 4 vext1 <2,6,4,7>, <2,3,4,5> - 2572389020U, // <6,4,7,3>: Cost 3 vext1 <3,6,4,7>, <3,6,4,7> - 2572389686U, // <6,4,7,4>: Cost 3 vext1 <3,6,4,7>, RHS - 2980497102U, // <6,4,7,5>: Cost 3 vzipr RHS, <2,3,4,5> - 2980495564U, // <6,4,7,6>: Cost 3 vzipr RHS, <0,2,4,6> - 4054239090U, // <6,4,7,7>: Cost 4 vzipr RHS, <2,5,4,7> - 2572392238U, // <6,4,7,u>: Cost 3 vext1 <3,6,4,7>, LHS - 1504608358U, // <6,4,u,0>: Cost 2 vext1 <4,6,4,6>, LHS - 2629670702U, // <6,4,u,1>: Cost 3 vext2 <2,0,6,4>, LHS - 2566424516U, // <6,4,u,2>: Cost 3 vext1 <2,6,4,u>, <2,6,4,u> - 2584340994U, // <6,4,u,3>: Cost 3 vext1 <5,6,4,u>, <3,4,5,6> - 1640156694U, // <6,4,u,4>: Cost 2 vext3 <4,u,4,6>, <4,u,4,6> - 2629671066U, // <6,4,u,5>: Cost 3 vext2 <2,0,6,4>, RHS - 1612287529U, // <6,4,u,6>: Cost 2 vext3 <0,2,4,6>, RHS - 2852965929U, // <6,4,u,7>: Cost 3 vuzpr <5,6,7,4>, RHS - 1612287547U, // <6,4,u,u>: Cost 2 vext3 <0,2,4,6>, RHS - 3708723200U, // <6,5,0,0>: Cost 4 vext2 <2,u,6,5>, <0,0,0,0> - 2634981478U, // <6,5,0,1>: Cost 3 vext2 <2,u,6,5>, LHS - 3694125260U, // <6,5,0,2>: Cost 4 vext2 <0,4,6,5>, <0,2,4,6> - 3779087962U, // <6,5,0,3>: Cost 4 vext3 <3,4,5,6>, <5,0,3,2> - 3760877154U, // <6,5,0,4>: Cost 4 vext3 <0,4,1,6>, <5,0,4,1> - 4195110916U, // <6,5,0,5>: Cost 4 vtrnr <5,6,7,0>, <5,5,5,5> - 3696779775U, // <6,5,0,6>: Cost 4 vext2 <0,u,6,5>, <0,6,2,7> - 1175212130U, // <6,5,0,7>: Cost 2 vrev <5,6,7,0> - 1175285867U, // <6,5,0,u>: Cost 2 vrev <5,6,u,0> - 2248445988U, // <6,5,1,0>: Cost 3 vrev <5,6,0,1> - 3698107237U, // <6,5,1,1>: Cost 4 vext2 <1,1,6,5>, <1,1,6,5> - 3708724118U, // <6,5,1,2>: Cost 4 vext2 <2,u,6,5>, <1,2,3,0> - 3908575334U, // <6,5,1,3>: Cost 4 vuzpr <2,6,4,5>, LHS - 3716023376U, // <6,5,1,4>: Cost 4 vext2 <4,1,6,5>, <1,4,5,6> - 3708724368U, // <6,5,1,5>: Cost 4 vext2 <2,u,6,5>, <1,5,3,7> - 3767733960U, // <6,5,1,6>: Cost 4 vext3 <1,5,4,6>, <5,1,6,4> - 2712571600U, // <6,5,1,7>: Cost 3 vext3 <4,6,4,6>, <5,1,7,3> - 2712571609U, // <6,5,1,u>: Cost 3 vext3 <4,6,4,6>, <5,1,u,3> - 2578391142U, // <6,5,2,0>: Cost 3 vext1 <4,6,5,2>, LHS - 3704079934U, // <6,5,2,1>: Cost 4 vext2 <2,1,6,5>, <2,1,6,5> - 3708724840U, // <6,5,2,2>: Cost 4 vext2 <2,u,6,5>, <2,2,2,2> - 3705407182U, // <6,5,2,3>: Cost 4 vext2 <2,3,6,5>, <2,3,4,5> - 2578394422U, // <6,5,2,4>: Cost 3 vext1 <4,6,5,2>, RHS - 3717351272U, // <6,5,2,5>: Cost 4 vext2 <4,3,6,5>, <2,5,3,6> - 2634983354U, // <6,5,2,6>: Cost 3 vext2 <2,u,6,5>, <2,6,3,7> - 3115486518U, // <6,5,2,7>: Cost 3 vtrnr <4,6,u,2>, RHS - 2634983541U, // <6,5,2,u>: Cost 3 vext2 <2,u,6,5>, <2,u,6,5> - 3708725398U, // <6,5,3,0>: Cost 4 
vext2 <2,u,6,5>, <3,0,1,2> - 3710052631U, // <6,5,3,1>: Cost 4 vext2 <3,1,6,5>, <3,1,6,5> - 3708725606U, // <6,5,3,2>: Cost 4 vext2 <2,u,6,5>, <3,2,6,3> - 3708725660U, // <6,5,3,3>: Cost 4 vext2 <2,u,6,5>, <3,3,3,3> - 2643610114U, // <6,5,3,4>: Cost 3 vext2 <4,3,6,5>, <3,4,5,6> - 3717352010U, // <6,5,3,5>: Cost 4 vext2 <4,3,6,5>, <3,5,4,6> - 3773632358U, // <6,5,3,6>: Cost 4 vext3 <2,5,3,6>, <5,3,6,0> - 2248978533U, // <6,5,3,7>: Cost 3 vrev <5,6,7,3> - 2249052270U, // <6,5,3,u>: Cost 3 vrev <5,6,u,3> - 2596323430U, // <6,5,4,0>: Cost 3 vext1 <7,6,5,4>, LHS - 3716025328U, // <6,5,4,1>: Cost 4 vext2 <4,1,6,5>, <4,1,6,5> - 3716688961U, // <6,5,4,2>: Cost 4 vext2 <4,2,6,5>, <4,2,6,5> - 2643610770U, // <6,5,4,3>: Cost 3 vext2 <4,3,6,5>, <4,3,6,5> - 2596326710U, // <6,5,4,4>: Cost 3 vext1 <7,6,5,4>, RHS - 2634984758U, // <6,5,4,5>: Cost 3 vext2 <2,u,6,5>, RHS - 3767734199U, // <6,5,4,6>: Cost 4 vext3 <1,5,4,6>, <5,4,6,0> - 1643696070U, // <6,5,4,7>: Cost 2 vext3 <5,4,7,6>, <5,4,7,6> - 1643769807U, // <6,5,4,u>: Cost 2 vext3 <5,4,u,6>, <5,4,u,6> - 2578415718U, // <6,5,5,0>: Cost 3 vext1 <4,6,5,5>, LHS - 3652158198U, // <6,5,5,1>: Cost 4 vext1 <4,6,5,5>, <1,0,3,2> - 3652159080U, // <6,5,5,2>: Cost 4 vext1 <4,6,5,5>, <2,2,2,2> - 3652159638U, // <6,5,5,3>: Cost 4 vext1 <4,6,5,5>, <3,0,1,2> - 2578418998U, // <6,5,5,4>: Cost 3 vext1 <4,6,5,5>, RHS - 2712571908U, // <6,5,5,5>: Cost 3 vext3 <4,6,4,6>, <5,5,5,5> - 2718027790U, // <6,5,5,6>: Cost 3 vext3 <5,5,6,6>, <5,5,6,6> - 2712571928U, // <6,5,5,7>: Cost 3 vext3 <4,6,4,6>, <5,5,7,7> - 2712571937U, // <6,5,5,u>: Cost 3 vext3 <4,6,4,6>, <5,5,u,7> - 2705346596U, // <6,5,6,0>: Cost 3 vext3 <3,4,5,6>, <5,6,0,1> - 3767144496U, // <6,5,6,1>: Cost 4 vext3 <1,4,5,6>, <5,6,1,4> - 3773116473U, // <6,5,6,2>: Cost 4 vext3 <2,4,5,6>, <5,6,2,4> - 2705346626U, // <6,5,6,3>: Cost 3 vext3 <3,4,5,6>, <5,6,3,4> - 2705346636U, // <6,5,6,4>: Cost 3 vext3 <3,4,5,6>, <5,6,4,5> - 3908577217U, // <6,5,6,5>: Cost 4 vuzpr <2,6,4,5>, <2,6,4,5> - 2578428728U, // <6,5,6,6>: Cost 3 vext1 <4,6,5,6>, <6,6,6,6> - 2712572002U, // <6,5,6,7>: Cost 3 vext3 <4,6,4,6>, <5,6,7,0> - 2705346668U, // <6,5,6,u>: Cost 3 vext3 <3,4,5,6>, <5,6,u,1> - 2560516198U, // <6,5,7,0>: Cost 3 vext1 <1,6,5,7>, LHS - 2560517363U, // <6,5,7,1>: Cost 3 vext1 <1,6,5,7>, <1,6,5,7> - 2566490060U, // <6,5,7,2>: Cost 3 vext1 <2,6,5,7>, <2,6,5,7> - 3634260118U, // <6,5,7,3>: Cost 4 vext1 <1,6,5,7>, <3,0,1,2> - 2560519478U, // <6,5,7,4>: Cost 3 vext1 <1,6,5,7>, RHS - 2980498650U, // <6,5,7,5>: Cost 3 vzipr RHS, <4,4,5,5> - 2980497922U, // <6,5,7,6>: Cost 3 vzipr RHS, <3,4,5,6> - 3103214902U, // <6,5,7,7>: Cost 3 vtrnr <2,6,3,7>, RHS - 2560522030U, // <6,5,7,u>: Cost 3 vext1 <1,6,5,7>, LHS - 2560524390U, // <6,5,u,0>: Cost 3 vext1 <1,6,5,u>, LHS - 2560525556U, // <6,5,u,1>: Cost 3 vext1 <1,6,5,u>, <1,6,5,u> - 2566498253U, // <6,5,u,2>: Cost 3 vext1 <2,6,5,u>, <2,6,5,u> - 2646931439U, // <6,5,u,3>: Cost 3 vext2 <4,u,6,5>, - 2560527670U, // <6,5,u,4>: Cost 3 vext1 <1,6,5,u>, RHS - 2634987674U, // <6,5,u,5>: Cost 3 vext2 <2,u,6,5>, RHS - 2980506114U, // <6,5,u,6>: Cost 3 vzipr RHS, <3,4,5,6> - 1175277674U, // <6,5,u,7>: Cost 2 vrev <5,6,7,u> - 1175351411U, // <6,5,u,u>: Cost 2 vrev <5,6,u,u> - 2578448486U, // <6,6,0,0>: Cost 3 vext1 <4,6,6,0>, LHS - 1573191782U, // <6,6,0,1>: Cost 2 vext2 <4,u,6,6>, LHS - 2686030124U, // <6,6,0,2>: Cost 3 vext3 <0,2,4,6>, <6,0,2,4> - 3779088690U, // <6,6,0,3>: Cost 4 vext3 <3,4,5,6>, <6,0,3,1> - 2687209788U, // <6,6,0,4>: Cost 3 vext3 <0,4,2,6>, <6,0,4,2> - 3652194000U, // <6,6,0,5>: Cost 
4 vext1 <4,6,6,0>, <5,1,7,3> - 2254852914U, // <6,6,0,6>: Cost 3 vrev <6,6,6,0> - 4041575734U, // <6,6,0,7>: Cost 4 vzipr <2,4,6,0>, RHS - 1573192349U, // <6,6,0,u>: Cost 2 vext2 <4,u,6,6>, LHS - 2646934262U, // <6,6,1,0>: Cost 3 vext2 <4,u,6,6>, <1,0,3,2> - 2646934324U, // <6,6,1,1>: Cost 3 vext2 <4,u,6,6>, <1,1,1,1> - 2646934422U, // <6,6,1,2>: Cost 3 vext2 <4,u,6,6>, <1,2,3,0> - 2846785638U, // <6,6,1,3>: Cost 3 vuzpr <4,6,4,6>, LHS - 3760951694U, // <6,6,1,4>: Cost 4 vext3 <0,4,2,6>, <6,1,4,3> - 2646934672U, // <6,6,1,5>: Cost 3 vext2 <4,u,6,6>, <1,5,3,7> - 2712572320U, // <6,6,1,6>: Cost 3 vext3 <4,6,4,6>, <6,1,6,3> - 3775549865U, // <6,6,1,7>: Cost 4 vext3 <2,u,2,6>, <6,1,7,3> - 2846785643U, // <6,6,1,u>: Cost 3 vuzpr <4,6,4,6>, LHS - 3759772094U, // <6,6,2,0>: Cost 4 vext3 <0,2,4,6>, <6,2,0,6> - 3704751676U, // <6,6,2,1>: Cost 4 vext2 <2,2,6,6>, <2,1,6,3> - 2631009936U, // <6,6,2,2>: Cost 3 vext2 <2,2,6,6>, <2,2,6,6> - 2646935206U, // <6,6,2,3>: Cost 3 vext2 <4,u,6,6>, <2,3,0,1> - 3759772127U, // <6,6,2,4>: Cost 4 vext3 <0,2,4,6>, <6,2,4,3> - 3704752004U, // <6,6,2,5>: Cost 4 vext2 <2,2,6,6>, <2,5,6,7> - 2646935482U, // <6,6,2,6>: Cost 3 vext2 <4,u,6,6>, <2,6,3,7> - 2712572410U, // <6,6,2,7>: Cost 3 vext3 <4,6,4,6>, <6,2,7,3> - 2712572419U, // <6,6,2,u>: Cost 3 vext3 <4,6,4,6>, <6,2,u,3> - 2646935702U, // <6,6,3,0>: Cost 3 vext2 <4,u,6,6>, <3,0,1,2> - 3777024534U, // <6,6,3,1>: Cost 4 vext3 <3,1,4,6>, <6,3,1,4> - 3704752453U, // <6,6,3,2>: Cost 4 vext2 <2,2,6,6>, <3,2,2,6> - 2646935964U, // <6,6,3,3>: Cost 3 vext2 <4,u,6,6>, <3,3,3,3> - 2705347122U, // <6,6,3,4>: Cost 3 vext3 <3,4,5,6>, <6,3,4,5> - 3779678778U, // <6,6,3,5>: Cost 4 vext3 <3,5,4,6>, <6,3,5,4> - 2657553069U, // <6,6,3,6>: Cost 3 vext2 <6,6,6,6>, <3,6,6,6> - 4039609654U, // <6,6,3,7>: Cost 4 vzipr <2,1,6,3>, RHS - 2708001366U, // <6,6,3,u>: Cost 3 vext3 <3,u,5,6>, <6,3,u,5> - 2578481254U, // <6,6,4,0>: Cost 3 vext1 <4,6,6,4>, LHS - 3652223734U, // <6,6,4,1>: Cost 4 vext1 <4,6,6,4>, <1,0,3,2> - 3760951922U, // <6,6,4,2>: Cost 4 vext3 <0,4,2,6>, <6,4,2,6> - 3779089019U, // <6,6,4,3>: Cost 4 vext3 <3,4,5,6>, <6,4,3,6> - 1570540772U, // <6,6,4,4>: Cost 2 vext2 <4,4,6,6>, <4,4,6,6> - 1573195062U, // <6,6,4,5>: Cost 2 vext2 <4,u,6,6>, RHS - 2712572560U, // <6,6,4,6>: Cost 3 vext3 <4,6,4,6>, <6,4,6,0> - 2723410591U, // <6,6,4,7>: Cost 3 vext3 <6,4,7,6>, <6,4,7,6> - 1573195304U, // <6,6,4,u>: Cost 2 vext2 <4,u,6,6>, <4,u,6,6> - 3640287334U, // <6,6,5,0>: Cost 4 vext1 <2,6,6,5>, LHS - 2646937296U, // <6,6,5,1>: Cost 3 vext2 <4,u,6,6>, <5,1,7,3> - 3640289235U, // <6,6,5,2>: Cost 4 vext1 <2,6,6,5>, <2,6,6,5> - 3720679279U, // <6,6,5,3>: Cost 4 vext2 <4,u,6,6>, <5,3,7,0> - 2646937542U, // <6,6,5,4>: Cost 3 vext2 <4,u,6,6>, <5,4,7,6> - 2646937604U, // <6,6,5,5>: Cost 3 vext2 <4,u,6,6>, <5,5,5,5> - 2646937698U, // <6,6,5,6>: Cost 3 vext2 <4,u,6,6>, <5,6,7,0> - 2846788918U, // <6,6,5,7>: Cost 3 vuzpr <4,6,4,6>, RHS - 2846788919U, // <6,6,5,u>: Cost 3 vuzpr <4,6,4,6>, RHS - 1516699750U, // <6,6,6,0>: Cost 2 vext1 <6,6,6,6>, LHS - 2590442230U, // <6,6,6,1>: Cost 3 vext1 <6,6,6,6>, <1,0,3,2> - 2646938106U, // <6,6,6,2>: Cost 3 vext2 <4,u,6,6>, <6,2,7,3> - 2590443670U, // <6,6,6,3>: Cost 3 vext1 <6,6,6,6>, <3,0,1,2> - 1516703030U, // <6,6,6,4>: Cost 2 vext1 <6,6,6,6>, RHS - 2590445264U, // <6,6,6,5>: Cost 3 vext1 <6,6,6,6>, <5,1,7,3> - 296144182U, // <6,6,6,6>: Cost 1 vdup2 RHS - 2712572738U, // <6,6,6,7>: Cost 3 vext3 <4,6,4,6>, <6,6,7,7> - 296144182U, // <6,6,6,u>: Cost 1 vdup2 RHS - 2566561894U, // <6,6,7,0>: Cost 3 vext1 <2,6,6,7>, 
LHS - 3634332924U, // <6,6,7,1>: Cost 4 vext1 <1,6,6,7>, <1,6,6,7> - 2566563797U, // <6,6,7,2>: Cost 3 vext1 <2,6,6,7>, <2,6,6,7> - 2584480258U, // <6,6,7,3>: Cost 3 vext1 <5,6,6,7>, <3,4,5,6> - 2566565174U, // <6,6,7,4>: Cost 3 vext1 <2,6,6,7>, RHS - 2717438846U, // <6,6,7,5>: Cost 3 vext3 <5,4,7,6>, <6,7,5,4> - 2980500280U, // <6,6,7,6>: Cost 3 vzipr RHS, <6,6,6,6> - 1906756918U, // <6,6,7,7>: Cost 2 vzipr RHS, RHS - 1906756919U, // <6,6,7,u>: Cost 2 vzipr RHS, RHS - 1516699750U, // <6,6,u,0>: Cost 2 vext1 <6,6,6,6>, LHS - 1573197614U, // <6,6,u,1>: Cost 2 vext2 <4,u,6,6>, LHS - 2566571990U, // <6,6,u,2>: Cost 3 vext1 <2,6,6,u>, <2,6,6,u> - 2846786205U, // <6,6,u,3>: Cost 3 vuzpr <4,6,4,6>, LHS - 1516703030U, // <6,6,u,4>: Cost 2 vext1 <6,6,6,6>, RHS - 1573197978U, // <6,6,u,5>: Cost 2 vext2 <4,u,6,6>, RHS - 296144182U, // <6,6,u,6>: Cost 1 vdup2 RHS - 1906765110U, // <6,6,u,7>: Cost 2 vzipr RHS, RHS - 296144182U, // <6,6,u,u>: Cost 1 vdup2 RHS - 1571209216U, // <6,7,0,0>: Cost 2 vext2 RHS, <0,0,0,0> - 497467494U, // <6,7,0,1>: Cost 1 vext2 RHS, LHS - 1571209380U, // <6,7,0,2>: Cost 2 vext2 RHS, <0,2,0,2> - 2644951292U, // <6,7,0,3>: Cost 3 vext2 RHS, <0,3,1,0> - 1571209554U, // <6,7,0,4>: Cost 2 vext2 RHS, <0,4,1,5> - 1510756450U, // <6,7,0,5>: Cost 2 vext1 <5,6,7,0>, <5,6,7,0> - 2644951542U, // <6,7,0,6>: Cost 3 vext2 RHS, <0,6,1,7> - 2584499194U, // <6,7,0,7>: Cost 3 vext1 <5,6,7,0>, <7,0,1,2> - 497468061U, // <6,7,0,u>: Cost 1 vext2 RHS, LHS - 1571209974U, // <6,7,1,0>: Cost 2 vext2 RHS, <1,0,3,2> - 1571210036U, // <6,7,1,1>: Cost 2 vext2 RHS, <1,1,1,1> - 1571210134U, // <6,7,1,2>: Cost 2 vext2 RHS, <1,2,3,0> - 1571210200U, // <6,7,1,3>: Cost 2 vext2 RHS, <1,3,1,3> - 2644952098U, // <6,7,1,4>: Cost 3 vext2 RHS, <1,4,0,5> - 1571210384U, // <6,7,1,5>: Cost 2 vext2 RHS, <1,5,3,7> - 2644952271U, // <6,7,1,6>: Cost 3 vext2 RHS, <1,6,1,7> - 2578535418U, // <6,7,1,7>: Cost 3 vext1 <4,6,7,1>, <7,0,1,2> - 1571210605U, // <6,7,1,u>: Cost 2 vext2 RHS, <1,u,1,3> - 2644952509U, // <6,7,2,0>: Cost 3 vext2 RHS, <2,0,1,2> - 2644952582U, // <6,7,2,1>: Cost 3 vext2 RHS, <2,1,0,3> - 1571210856U, // <6,7,2,2>: Cost 2 vext2 RHS, <2,2,2,2> - 1571210918U, // <6,7,2,3>: Cost 2 vext2 RHS, <2,3,0,1> - 2644952828U, // <6,7,2,4>: Cost 3 vext2 RHS, <2,4,0,6> - 2633009028U, // <6,7,2,5>: Cost 3 vext2 <2,5,6,7>, <2,5,6,7> - 1571211194U, // <6,7,2,6>: Cost 2 vext2 RHS, <2,6,3,7> - 2668840938U, // <6,7,2,7>: Cost 3 vext2 RHS, <2,7,0,1> - 1571211323U, // <6,7,2,u>: Cost 2 vext2 RHS, <2,u,0,1> - 1571211414U, // <6,7,3,0>: Cost 2 vext2 RHS, <3,0,1,2> - 2644953311U, // <6,7,3,1>: Cost 3 vext2 RHS, <3,1,0,3> - 2644953390U, // <6,7,3,2>: Cost 3 vext2 RHS, <3,2,0,1> - 1571211676U, // <6,7,3,3>: Cost 2 vext2 RHS, <3,3,3,3> - 1571211778U, // <6,7,3,4>: Cost 2 vext2 RHS, <3,4,5,6> - 2644953648U, // <6,7,3,5>: Cost 3 vext2 RHS, <3,5,1,7> - 2644953720U, // <6,7,3,6>: Cost 3 vext2 RHS, <3,6,0,7> - 2644953795U, // <6,7,3,7>: Cost 3 vext2 RHS, <3,7,0,1> - 1571212062U, // <6,7,3,u>: Cost 2 vext2 RHS, <3,u,1,2> - 1573202834U, // <6,7,4,0>: Cost 2 vext2 RHS, <4,0,5,1> - 2644954058U, // <6,7,4,1>: Cost 3 vext2 RHS, <4,1,2,3> - 2644954166U, // <6,7,4,2>: Cost 3 vext2 RHS, <4,2,5,3> - 2644954258U, // <6,7,4,3>: Cost 3 vext2 RHS, <4,3,6,5> - 1571212496U, // <6,7,4,4>: Cost 2 vext2 RHS, <4,4,4,4> - 497470774U, // <6,7,4,5>: Cost 1 vext2 RHS, RHS - 1573203316U, // <6,7,4,6>: Cost 2 vext2 RHS, <4,6,4,6> - 2646281688U, // <6,7,4,7>: Cost 3 vext2 <4,7,6,7>, <4,7,6,7> - 497471017U, // <6,7,4,u>: Cost 1 vext2 RHS, RHS - 2644954696U, // 
[Elided: a long run of consecutive removed ("-") lines from the auto-generated perfect-shuffle cost table, 32-bit entries of the form

-  1573203664U, // <6,7,5,1>: Cost 2 vext2 RHS, <5,1,7,3>

running from the <6,7,5,0> entry through the <7,...> block and into the <u,...> block. The extracted page had collapsed these entries onto a few lines and dropped the descriptors that begin with "<u", so the table body is not reproduced here.]
: Cost 2 vext2 <4,6,u,6>, <4,6,u,6> - 2960715062U, // : Cost 3 vzipr <1,2,u,4>, RHS - 1545473577U, // : Cost 2 vext2 <0,2,u,6>, RHS - 2567872614U, // : Cost 3 vext1 <2,u,6,5>, LHS - 2645757648U, // : Cost 3 vext2 <4,6,u,6>, <5,1,7,3> - 2567874490U, // : Cost 3 vext1 <2,u,6,5>, <2,6,3,7> - 2576501250U, // : Cost 3 vext1 <4,3,6,5>, <3,4,5,6> - 1576660943U, // : Cost 2 vext2 <5,4,u,6>, <5,4,u,6> - 2645757956U, // : Cost 3 vext2 <4,6,u,6>, <5,5,5,5> - 2645758050U, // : Cost 3 vext2 <4,6,u,6>, <5,6,7,0> - 2824080694U, // : Cost 3 vuzpr <0,u,2,6>, RHS - 1182626795U, // : Cost 2 vrev <6,u,u,5> - 1506082918U, // : Cost 2 vext1 <4,u,6,6>, LHS - 2579825398U, // : Cost 3 vext1 <4,u,6,6>, <1,0,3,2> - 2645758458U, // : Cost 3 vext2 <4,6,u,6>, <6,2,7,3> - 2579826838U, // : Cost 3 vext1 <4,u,6,6>, <3,0,1,2> - 1506086198U, // : Cost 2 vext1 <4,u,6,6>, RHS - 2579828432U, // : Cost 3 vext1 <4,u,6,6>, <5,1,7,3> - 296144182U, // : Cost 1 vdup2 RHS - 1638331202U, // : Cost 2 vext3 RHS, <6,6,7,7> - 296144182U, // : Cost 1 vdup2 RHS - 432349286U, // : Cost 1 vext1 RHS, LHS - 1506091766U, // : Cost 2 vext1 RHS, <1,0,3,2> - 1506092648U, // : Cost 2 vext1 RHS, <2,2,2,2> - 1506093206U, // : Cost 2 vext1 RHS, <3,0,1,2> - 432352809U, // : Cost 1 vext1 RHS, RHS - 1506094800U, // : Cost 2 vext1 RHS, <5,1,7,3> - 1506095610U, // : Cost 2 vext1 RHS, <6,2,7,3> - 1906904374U, // : Cost 2 vzipr RHS, RHS - 432355118U, // : Cost 1 vext1 RHS, LHS - 432357478U, // : Cost 1 vext1 RHS, LHS - 1545475886U, // : Cost 2 vext2 <0,2,u,6>, LHS - 1506100840U, // : Cost 2 vext1 RHS, <2,2,2,2> - 1506101398U, // : Cost 2 vext1 RHS, <3,0,1,2> - 432361002U, // : Cost 1 vext1 RHS, RHS - 1545476250U, // : Cost 2 vext2 <0,2,u,6>, RHS - 296144182U, // : Cost 1 vdup2 RHS - 1880370486U, // : Cost 2 vzipr LHS, RHS - 432363310U, // : Cost 1 vext1 RHS, LHS - 1571356672U, // : Cost 2 vext2 RHS, <0,0,0,0> - 497614950U, // : Cost 1 vext2 RHS, LHS - 1571356836U, // : Cost 2 vext2 RHS, <0,2,0,2> - 2573880146U, // : Cost 3 vext1 <3,u,7,0>, <3,u,7,0> - 1571357010U, // : Cost 2 vext2 RHS, <0,4,1,5> - 1512083716U, // : Cost 2 vext1 <5,u,7,0>, <5,u,7,0> - 2621874741U, // : Cost 3 vext2 <0,6,u,7>, <0,6,u,7> - 2585826298U, // : Cost 3 vext1 <5,u,7,0>, <7,0,1,2> - 497615517U, // : Cost 1 vext2 RHS, LHS - 1571357430U, // : Cost 2 vext2 RHS, <1,0,3,2> - 1571357492U, // : Cost 2 vext2 RHS, <1,1,1,1> - 1571357590U, // : Cost 2 vext2 RHS, <1,2,3,0> - 1552114715U, // : Cost 2 vext2 <1,3,u,7>, <1,3,u,7> - 2573888822U, // : Cost 3 vext1 <3,u,7,1>, RHS - 1553441981U, // : Cost 2 vext2 <1,5,u,7>, <1,5,u,7> - 2627847438U, // : Cost 3 vext2 <1,6,u,7>, <1,6,u,7> - 2727408775U, // : Cost 3 vext3 <7,1,7,u>, <7,1,7,u> - 1555432880U, // : Cost 2 vext2 <1,u,u,7>, <1,u,u,7> - 2629838337U, // : Cost 3 vext2 <2,0,u,7>, <2,0,u,7> - 1188058754U, // : Cost 2 vrev <7,u,1,2> - 1571358312U, // : Cost 2 vext2 RHS, <2,2,2,2> - 1571358374U, // : Cost 2 vext2 RHS, <2,3,0,1> - 2632492869U, // : Cost 3 vext2 <2,4,u,7>, <2,4,u,7> - 2633156502U, // : Cost 3 vext2 <2,5,u,7>, <2,5,u,7> - 1560078311U, // : Cost 2 vext2 <2,6,u,7>, <2,6,u,7> - 2728072408U, // : Cost 3 vext3 <7,2,7,u>, <7,2,7,u> - 1561405577U, // : Cost 2 vext2 <2,u,u,7>, <2,u,u,7> - 1571358870U, // : Cost 2 vext2 RHS, <3,0,1,2> - 2627184913U, // : Cost 3 vext2 <1,5,u,7>, <3,1,5,u> - 2633820523U, // : Cost 3 vext2 <2,6,u,7>, <3,2,6,u> - 1571359132U, // : Cost 2 vext2 RHS, <3,3,3,3> - 1571359234U, // : Cost 2 vext2 RHS, <3,4,5,6> - 1512108295U, // : Cost 2 vext1 <5,u,7,3>, <5,u,7,3> - 1518080992U, // : Cost 2 vext1 <6,u,7,3>, <6,u,7,3> - 
2640456465U, // : Cost 3 vext2 <3,7,u,7>, <3,7,u,7> - 1571359518U, // : Cost 2 vext2 RHS, <3,u,1,2> - 1571359634U, // : Cost 2 vext2 RHS, <4,0,5,1> - 2573911067U, // : Cost 3 vext1 <3,u,7,4>, <1,3,u,7> - 2645101622U, // : Cost 3 vext2 RHS, <4,2,5,3> - 2573912918U, // : Cost 3 vext1 <3,u,7,4>, <3,u,7,4> - 1571359952U, // : Cost 2 vext2 RHS, <4,4,4,4> - 497618248U, // : Cost 1 vext2 RHS, RHS - 1571360116U, // : Cost 2 vext2 RHS, <4,6,4,6> - 2645102024U, // : Cost 3 vext2 RHS, <4,7,5,0> - 497618473U, // : Cost 1 vext2 RHS, RHS - 2645102152U, // : Cost 3 vext2 RHS, <5,0,1,2> - 1571360464U, // : Cost 2 vext2 RHS, <5,1,7,3> - 2645102334U, // : Cost 3 vext2 RHS, <5,2,3,4> - 2645102447U, // : Cost 3 vext2 RHS, <5,3,7,0> - 1571360710U, // : Cost 2 vext2 RHS, <5,4,7,6> - 1571360772U, // : Cost 2 vext2 RHS, <5,5,5,5> - 1571360866U, // : Cost 2 vext2 RHS, <5,6,7,0> - 1571360936U, // : Cost 2 vext2 RHS, <5,7,5,7> - 1571361017U, // : Cost 2 vext2 RHS, <5,u,5,7> - 1530044518U, // : Cost 2 vext1 , LHS - 2645103016U, // : Cost 3 vext2 RHS, <6,1,7,2> - 1571361274U, // : Cost 2 vext2 RHS, <6,2,7,3> - 2645103154U, // : Cost 3 vext2 RHS, <6,3,4,5> - 1530047798U, // : Cost 2 vext1 , RHS - 1188386474U, // : Cost 2 vrev <7,u,5,6> - 1571361592U, // : Cost 2 vext2 RHS, <6,6,6,6> - 1571361614U, // : Cost 2 vext2 RHS, <6,7,0,1> - 1571361695U, // : Cost 2 vext2 RHS, <6,u,0,1> - 1571361786U, // : Cost 2 vext2 RHS, <7,0,1,2> - 2573935616U, // : Cost 3 vext1 <3,u,7,7>, <1,3,5,7> - 2645103781U, // : Cost 3 vext2 RHS, <7,2,2,2> - 2573937497U, // : Cost 3 vext1 <3,u,7,7>, <3,u,7,7> - 1571362150U, // : Cost 2 vext2 RHS, <7,4,5,6> - 1512141067U, // : Cost 2 vext1 <5,u,7,7>, <5,u,7,7> - 1518113764U, // : Cost 2 vext1 <6,u,7,7>, <6,u,7,7> - 363253046U, // : Cost 1 vdup3 RHS - 363253046U, // : Cost 1 vdup3 RHS - 1571362515U, // : Cost 2 vext2 RHS, - 497620782U, // : Cost 1 vext2 RHS, LHS - 1571362693U, // : Cost 2 vext2 RHS, - 1571362748U, // : Cost 2 vext2 RHS, - 1571362879U, // : Cost 2 vext2 RHS, - 497621146U, // : Cost 1 vext2 RHS, RHS - 1571363024U, // : Cost 2 vext2 RHS, - 363253046U, // : Cost 1 vdup3 RHS - 497621349U, // : Cost 1 vext2 RHS, LHS - 135053414U, // : Cost 1 vdup0 LHS - 471081121U, // : Cost 1 vext2 LHS, LHS - 1544822948U, // : Cost 2 vext2 LHS, <0,2,0,2> - 1616140005U, // : Cost 2 vext3 LHS, - 1544823122U, // : Cost 2 vext2 LHS, <0,4,1,5> - 1512157453U, // : Cost 2 vext1 <5,u,u,0>, <5,u,u,0> - 1662220032U, // : Cost 2 vext3 RHS, - 1194457487U, // : Cost 2 vrev - 471081629U, // : Cost 1 vext2 LHS, LHS - 1544823542U, // : Cost 2 vext2 LHS, <1,0,3,2> - 202162278U, // : Cost 1 vdup1 LHS - 537753390U, // : Cost 1 vext3 LHS, LHS - 1544823768U, // : Cost 2 vext2 LHS, <1,3,1,3> - 1494248758U, // : Cost 2 vext1 <2,u,u,1>, RHS - 1544823952U, // : Cost 2 vext2 LHS, <1,5,3,7> - 1518138343U, // : Cost 2 vext1 <6,u,u,1>, <6,u,u,1> - 1640322907U, // : Cost 2 vext3 RHS, - 537753444U, // : Cost 1 vext3 LHS, LHS - 1482309734U, // : Cost 2 vext1 <0,u,u,2>, LHS - 1194031451U, // : Cost 2 vrev - 269271142U, // : Cost 1 vdup2 LHS - 835584U, // : Cost 0 copy LHS - 1482313014U, // : Cost 2 vext1 <0,u,u,2>, RHS - 2618566504U, // : Cost 3 vext2 LHS, <2,5,3,6> - 1544824762U, // : Cost 2 vext2 LHS, <2,6,3,7> - 1638479788U, // : Cost 2 vext3 RHS, - 835584U, // : Cost 0 copy LHS - 408576723U, // : Cost 1 vext1 LHS, LHS - 1482318582U, // : Cost 2 vext1 LHS, <1,0,3,2> - 120371557U, // : Cost 1 vrev LHS - 336380006U, // : Cost 1 vdup3 LHS - 408579382U, // : Cost 1 vext1 LHS, RHS - 1616140271U, // : Cost 2 vext3 LHS, - 1530098170U, // : Cost 
2 vext1 LHS, <6,2,7,3> - 1880329544U, // : Cost 2 vzipr LHS, RHS - 408581934U, // : Cost 1 vext1 LHS, LHS - 1488298086U, // : Cost 2 vext1 <1,u,u,4>, LHS - 1488299437U, // : Cost 2 vext1 <1,u,u,4>, <1,u,u,4> - 1659271204U, // : Cost 2 vext3 LHS, - 1194195311U, // : Cost 2 vrev - 161926454U, // : Cost 1 vdup0 RHS - 471084342U, // : Cost 1 vext2 LHS, RHS - 1571368308U, // : Cost 2 vext2 RHS, <4,6,4,6> - 1640323153U, // : Cost 2 vext3 RHS, - 471084585U, // : Cost 1 vext2 LHS, RHS - 1494278246U, // : Cost 2 vext1 <2,u,u,5>, LHS - 1571368656U, // : Cost 2 vext2 RHS, <5,1,7,3> - 1494280327U, // : Cost 2 vext1 <2,u,u,5>, <2,u,u,5> - 1616140415U, // : Cost 2 vext3 LHS, - 1494281526U, // : Cost 2 vext1 <2,u,u,5>, RHS - 229035318U, // : Cost 1 vdup1 RHS - 537753754U, // : Cost 1 vext3 LHS, RHS - 1750355254U, // : Cost 2 vuzpr LHS, RHS - 537753772U, // : Cost 1 vext3 LHS, RHS - 1482342502U, // : Cost 2 vext1 <0,u,u,6>, LHS - 2556084982U, // : Cost 3 vext1 <0,u,u,6>, <1,0,3,2> - 1571369466U, // : Cost 2 vext2 RHS, <6,2,7,3> - 1611938000U, // : Cost 2 vext3 LHS, - 1482345782U, // : Cost 2 vext1 <0,u,u,6>, RHS - 1194359171U, // : Cost 2 vrev - 296144182U, // : Cost 1 vdup2 RHS - 27705344U, // : Cost 0 copy RHS - 27705344U, // : Cost 0 copy RHS - 432496742U, // : Cost 1 vext1 RHS, LHS - 1488324016U, // : Cost 2 vext1 <1,u,u,7>, <1,u,u,7> - 1494296713U, // : Cost 2 vext1 <2,u,u,7>, <2,u,u,7> - 1906901148U, // : Cost 2 vzipr RHS, LHS - 432500283U, // : Cost 1 vext1 RHS, RHS - 1506242256U, // : Cost 2 vext1 RHS, <5,1,7,3> - 120699277U, // : Cost 1 vrev RHS - 363253046U, // : Cost 1 vdup3 RHS - 432502574U, // : Cost 1 vext1 RHS, LHS - 408617688U, // : Cost 1 vext1 LHS, LHS - 471086894U, // : Cost 1 vext2 LHS, LHS - 537753957U, // : Cost 1 vext3 LHS, LHS - 835584U, // : Cost 0 copy LHS - 408620342U, // : Cost 1 vext1 LHS, RHS - 471087258U, // : Cost 1 vext2 LHS, RHS - 537753997U, // : Cost 1 vext3 LHS, RHS - 27705344U, // : Cost 0 copy RHS - 835584U, // : Cost 0 copy LHS - 0 -}; diff --git a/lib/Target/ARM64/ARM64PromoteConstant.cpp b/lib/Target/ARM64/ARM64PromoteConstant.cpp deleted file mode 100644 index e61a62262d3..00000000000 --- a/lib/Target/ARM64/ARM64PromoteConstant.cpp +++ /dev/null @@ -1,580 +0,0 @@ - -//===-- ARM64PromoteConstant.cpp --- Promote constant to global for ARM64 -===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -// This file implements the ARM64PromoteConstant pass which promotes constants -// to global variables when this is likely to be more efficient. Currently only -// types related to constant vector (i.e., constant vector, array of constant -// vectors, constant structure with a constant vector field, etc.) are promoted -// to global variables. Constant vectors are likely to be lowered in target -// constant pool during instruction selection already; therefore, the access -// will remain the same (memory load), but the structure types are not split -// into different constant pool accesses for each field. A bonus side effect is -// that created globals may be merged by the global merge pass. -// -// FIXME: This pass may be useful for other targets too. 
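For context, a minimal sketch (not part of this patch) of how a target pass pipeline would typically schedule the pass implemented in this file. The hook name and the option gating are assumptions; only createARM64PromoteConstantPass() is taken from the file itself.

// Hypothetical scheduling sketch -- not from this patch.
bool ARM64PassConfig::addPreISel() {
  if (TM->getOptLevel() != CodeGenOpt::None)
    addPass(createARM64PromoteConstantPass()); // promote vector constants to globals
  return false;
}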
-//===----------------------------------------------------------------------===// - -#include "ARM64.h" -#include "llvm/ADT/Statistic.h" -#include "llvm/ADT/DenseMap.h" -#include "llvm/ADT/SmallSet.h" -#include "llvm/ADT/SmallVector.h" -#include "llvm/IR/Constants.h" -#include "llvm/IR/Dominators.h" -#include "llvm/IR/Function.h" -#include "llvm/IR/GlobalVariable.h" -#include "llvm/IR/InlineAsm.h" -#include "llvm/IR/Instructions.h" -#include "llvm/IR/IntrinsicInst.h" -#include "llvm/IR/IRBuilder.h" -#include "llvm/IR/Module.h" -#include "llvm/Pass.h" -#include "llvm/Support/CommandLine.h" -#include "llvm/Support/Debug.h" - -using namespace llvm; - -#define DEBUG_TYPE "arm64-promote-const" - -// Stress testing mode - disable heuristics. -static cl::opt Stress("arm64-stress-promote-const", cl::Hidden, - cl::desc("Promote all vector constants")); - -STATISTIC(NumPromoted, "Number of promoted constants"); -STATISTIC(NumPromotedUses, "Number of promoted constants uses"); - -//===----------------------------------------------------------------------===// -// ARM64PromoteConstant -//===----------------------------------------------------------------------===// - -namespace { -/// Promotes interesting constant into global variables. -/// The motivating example is: -/// static const uint16_t TableA[32] = { -/// 41944, 40330, 38837, 37450, 36158, 34953, 33826, 32768, -/// 31776, 30841, 29960, 29128, 28340, 27595, 26887, 26215, -/// 25576, 24967, 24386, 23832, 23302, 22796, 22311, 21846, -/// 21400, 20972, 20561, 20165, 19785, 19419, 19066, 18725, -/// }; -/// -/// uint8x16x4_t LoadStatic(void) { -/// uint8x16x4_t ret; -/// ret.val[0] = vld1q_u16(TableA + 0); -/// ret.val[1] = vld1q_u16(TableA + 8); -/// ret.val[2] = vld1q_u16(TableA + 16); -/// ret.val[3] = vld1q_u16(TableA + 24); -/// return ret; -/// } -/// -/// The constants in this example are folded into the uses. Thus, 4 different -/// constants are created. -/// -/// As their type is vector the cheapest way to create them is to load them -/// for the memory. -/// -/// Therefore the final assembly final has 4 different loads. With this pass -/// enabled, only one load is issued for the constants. -class ARM64PromoteConstant : public ModulePass { - -public: - static char ID; - ARM64PromoteConstant() : ModulePass(ID) {} - - const char *getPassName() const override { return "ARM64 Promote Constant"; } - - /// Iterate over the functions and promote the interesting constants into - /// global variables with module scope. - bool runOnModule(Module &M) override { - DEBUG(dbgs() << getPassName() << '\n'); - bool Changed = false; - for (auto &MF : M) { - Changed |= runOnFunction(MF); - } - return Changed; - } - -private: - /// Look for interesting constants used within the given function. - /// Promote them into global variables, load these global variables within - /// the related function, so that the number of inserted load is minimal. - bool runOnFunction(Function &F); - - // This transformation requires dominator info - void getAnalysisUsage(AnalysisUsage &AU) const override { - AU.setPreservesCFG(); - AU.addRequired(); - AU.addPreserved(); - } - - /// Type to store a list of User. - typedef SmallVector Users; - /// Map an insertion point to all the uses it dominates. - typedef DenseMap InsertionPoints; - /// Map a function to the required insertion point of load for a - /// global variable. - typedef DenseMap InsertionPointsPerFunc; - - /// Find the closest point that dominates the given Use. 
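The angle-bracketed template arguments in several declarations above (the cl::opt, the SmallVector/DenseMap typedefs, and the getAnalysisUsage calls) are missing from this rendering; a likely reconstruction follows. The element types are inferred from how the containers are used later in the file, and the inline SmallVector size is an assumption.

// Likely original forms (reconstruction; the inline size of 4 is an assumption).
static cl::opt<bool> Stress("arm64-stress-promote-const", cl::Hidden,
                            cl::desc("Promote all vector constants"));
typedef SmallVector<Value::user_iterator, 4> Users;
typedef DenseMap<Instruction *, Users> InsertionPoints;
typedef DenseMap<Function *, InsertionPoints> InsertionPointsPerFunc;
// In getAnalysisUsage():
//   AU.addRequired<DominatorTreeWrapperPass>();
//   AU.addPreserved<DominatorTreeWrapperPass>();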
- Instruction *findInsertionPoint(Value::user_iterator &Use); - - /// Check if the given insertion point is dominated by an existing - /// insertion point. - /// If true, the given use is added to the list of dominated uses for - /// the related existing point. - /// \param NewPt the insertion point to be checked - /// \param UseIt the use to be added into the list of dominated uses - /// \param InsertPts existing insertion points - /// \pre NewPt and all instruction in InsertPts belong to the same function - /// \return true if one of the insertion point in InsertPts dominates NewPt, - /// false otherwise - bool isDominated(Instruction *NewPt, Value::user_iterator &UseIt, - InsertionPoints &InsertPts); - - /// Check if the given insertion point can be merged with an existing - /// insertion point in a common dominator. - /// If true, the given use is added to the list of the created insertion - /// point. - /// \param NewPt the insertion point to be checked - /// \param UseIt the use to be added into the list of dominated uses - /// \param InsertPts existing insertion points - /// \pre NewPt and all instruction in InsertPts belong to the same function - /// \pre isDominated returns false for the exact same parameters. - /// \return true if it exists an insertion point in InsertPts that could - /// have been merged with NewPt in a common dominator, - /// false otherwise - bool tryAndMerge(Instruction *NewPt, Value::user_iterator &UseIt, - InsertionPoints &InsertPts); - - /// Compute the minimal insertion points to dominates all the interesting - /// uses of value. - /// Insertion points are group per function and each insertion point - /// contains a list of all the uses it dominates within the related function - /// \param Val constant to be examined - /// \param[out] InsPtsPerFunc output storage of the analysis - void computeInsertionPoints(Constant *Val, - InsertionPointsPerFunc &InsPtsPerFunc); - - /// Insert a definition of a new global variable at each point contained in - /// InsPtsPerFunc and update the related uses (also contained in - /// InsPtsPerFunc). - bool insertDefinitions(Constant *Cst, InsertionPointsPerFunc &InsPtsPerFunc); - - /// Compute the minimal insertion points to dominate all the interesting - /// uses of Val and insert a definition of a new global variable - /// at these points. - /// Also update the uses of Val accordingly. - /// Currently a use of Val is considered interesting if: - /// - Val is not UndefValue - /// - Val is not zeroinitialized - /// - Replacing Val per a load of a global variable is valid. - /// \see shouldConvert for more details - bool computeAndInsertDefinitions(Constant *Val); - - /// Promote the given constant into a global variable if it is expected to - /// be profitable. - /// \return true if Cst has been promoted - bool promoteConstant(Constant *Cst); - - /// Transfer the list of dominated uses of IPI to NewPt in InsertPts. - /// Append UseIt to this list and delete the entry of IPI in InsertPts. - static void appendAndTransferDominatedUses(Instruction *NewPt, - Value::user_iterator &UseIt, - InsertionPoints::iterator &IPI, - InsertionPoints &InsertPts) { - // Record the dominated use. - IPI->second.push_back(UseIt); - // Transfer the dominated uses of IPI to NewPt - // Inserting into the DenseMap may invalidate existing iterator. - // Keep a copy of the key to find the iterator to erase. - Instruction *OldInstr = IPI->first; - InsertPts.insert(InsertionPoints::value_type(NewPt, IPI->second)); - // Erase IPI. 
- IPI = InsertPts.find(OldInstr); - InsertPts.erase(IPI); - } -}; -} // end anonymous namespace - -char ARM64PromoteConstant::ID = 0; - -namespace llvm { -void initializeARM64PromoteConstantPass(PassRegistry &); -} - -INITIALIZE_PASS_BEGIN(ARM64PromoteConstant, "arm64-promote-const", - "ARM64 Promote Constant Pass", false, false) -INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) -INITIALIZE_PASS_END(ARM64PromoteConstant, "arm64-promote-const", - "ARM64 Promote Constant Pass", false, false) - -ModulePass *llvm::createARM64PromoteConstantPass() { - return new ARM64PromoteConstant(); -} - -/// Check if the given type uses a vector type. -static bool isConstantUsingVectorTy(const Type *CstTy) { - if (CstTy->isVectorTy()) - return true; - if (CstTy->isStructTy()) { - for (unsigned EltIdx = 0, EndEltIdx = CstTy->getStructNumElements(); - EltIdx < EndEltIdx; ++EltIdx) - if (isConstantUsingVectorTy(CstTy->getStructElementType(EltIdx))) - return true; - } else if (CstTy->isArrayTy()) - return isConstantUsingVectorTy(CstTy->getArrayElementType()); - return false; -} - -/// Check if the given use (Instruction + OpIdx) of Cst should be converted into -/// a load of a global variable initialized with Cst. -/// A use should be converted if it is legal to do so. -/// For instance, it is not legal to turn the mask operand of a shuffle vector -/// into a load of a global variable. -static bool shouldConvertUse(const Constant *Cst, const Instruction *Instr, - unsigned OpIdx) { - // shufflevector instruction expects a const for the mask argument, i.e., the - // third argument. Do not promote this use in that case. - if (isa(Instr) && OpIdx == 2) - return false; - - // extractvalue instruction expects a const idx. - if (isa(Instr) && OpIdx > 0) - return false; - - // extractvalue instruction expects a const idx. - if (isa(Instr) && OpIdx > 1) - return false; - - if (isa(Instr) && OpIdx > 0) - return false; - - // Alignment argument must be constant. - if (isa(Instr) && OpIdx > 0) - return false; - - // Alignment argument must be constant. - if (isa(Instr) && OpIdx > 1) - return false; - - // Index must be constant. - if (isa(Instr) && OpIdx > 0) - return false; - - // Personality function and filters must be constant. - // Give up on that instruction. - if (isa(Instr)) - return false; - - // Switch instruction expects constants to compare to. - if (isa(Instr)) - return false; - - // Expected address must be a constant. - if (isa(Instr)) - return false; - - // Do not mess with intrinsics. - if (isa(Instr)) - return false; - - // Do not mess with inline asm. - const CallInst *CI = dyn_cast(Instr); - if (CI && isa(CI->getCalledValue())) - return false; - - return true; -} - -/// Check if the given Cst should be converted into -/// a load of a global variable initialized with Cst. -/// A constant should be converted if it is likely that the materialization of -/// the constant will be tricky. Thus, we give up on zero or undef values. -/// -/// \todo Currently, accept only vector related types. -/// Also we give up on all simple vector type to keep the existing -/// behavior. Otherwise, we should push here all the check of the lowering of -/// BUILD_VECTOR. By giving up, we lose the potential benefit of merging -/// constant via global merge and the fact that the same constant is stored -/// only once with this method (versus, as many function that uses the constant -/// for the regular approach, even for float). 
-/// Again, the simplest solution would be to promote every -/// constant and rematerialize them when they are actually cheap to create. -static bool shouldConvert(const Constant *Cst) { - if (isa(Cst)) - return false; - - // FIXME: In some cases, it may be interesting to promote in memory - // a zero initialized constant. - // E.g., when the type of Cst require more instructions than the - // adrp/add/load sequence or when this sequence can be shared by several - // instances of Cst. - // Ideally, we could promote this into a global and rematerialize the constant - // when it was a bad idea. - if (Cst->isZeroValue()) - return false; - - if (Stress) - return true; - - // FIXME: see function \todo - if (Cst->getType()->isVectorTy()) - return false; - return isConstantUsingVectorTy(Cst->getType()); -} - -Instruction * -ARM64PromoteConstant::findInsertionPoint(Value::user_iterator &Use) { - // If this user is a phi, the insertion point is in the related - // incoming basic block. - PHINode *PhiInst = dyn_cast(*Use); - Instruction *InsertionPoint; - if (PhiInst) - InsertionPoint = - PhiInst->getIncomingBlock(Use.getOperandNo())->getTerminator(); - else - InsertionPoint = dyn_cast(*Use); - assert(InsertionPoint && "User is not an instruction!"); - return InsertionPoint; -} - -bool ARM64PromoteConstant::isDominated(Instruction *NewPt, - Value::user_iterator &UseIt, - InsertionPoints &InsertPts) { - - DominatorTree &DT = getAnalysis( - *NewPt->getParent()->getParent()).getDomTree(); - - // Traverse all the existing insertion points and check if one is dominating - // NewPt. If it is, remember that. - for (auto &IPI : InsertPts) { - if (NewPt == IPI.first || DT.dominates(IPI.first, NewPt) || - // When IPI.first is a terminator instruction, DT may think that - // the result is defined on the edge. - // Here we are testing the insertion point, not the definition. - (IPI.first->getParent() != NewPt->getParent() && - DT.dominates(IPI.first->getParent(), NewPt->getParent()))) { - // No need to insert this point. Just record the dominated use. - DEBUG(dbgs() << "Insertion point dominated by:\n"); - DEBUG(IPI.first->print(dbgs())); - DEBUG(dbgs() << '\n'); - IPI.second.push_back(UseIt); - return true; - } - } - return false; -} - -bool ARM64PromoteConstant::tryAndMerge(Instruction *NewPt, - Value::user_iterator &UseIt, - InsertionPoints &InsertPts) { - DominatorTree &DT = getAnalysis( - *NewPt->getParent()->getParent()).getDomTree(); - BasicBlock *NewBB = NewPt->getParent(); - - // Traverse all the existing insertion point and check if one is dominated by - // NewPt and thus useless or can be combined with NewPt into a common - // dominator. - for (InsertionPoints::iterator IPI = InsertPts.begin(), - EndIPI = InsertPts.end(); - IPI != EndIPI; ++IPI) { - BasicBlock *CurBB = IPI->first->getParent(); - if (NewBB == CurBB) { - // Instructions are in the same block. - // By construction, NewPt is dominating the other. - // Indeed, isDominated returned false with the exact same arguments. - DEBUG(dbgs() << "Merge insertion point with:\n"); - DEBUG(IPI->first->print(dbgs())); - DEBUG(dbgs() << "\nat considered insertion point.\n"); - appendAndTransferDominatedUses(NewPt, UseIt, IPI, InsertPts); - return true; - } - - // Look for a common dominator - BasicBlock *CommonDominator = DT.findNearestCommonDominator(NewBB, CurBB); - // If none exists, we cannot merge these two points. - if (!CommonDominator) - continue; - - if (CommonDominator != NewBB) { - // By construction, the CommonDominator cannot be CurBB. 
- assert(CommonDominator != CurBB && - "Instruction has not been rejected during isDominated check!"); - // Take the last instruction of the CommonDominator as insertion point - NewPt = CommonDominator->getTerminator(); - } - // else, CommonDominator is the block of NewBB, hence NewBB is the last - // possible insertion point in that block. - DEBUG(dbgs() << "Merge insertion point with:\n"); - DEBUG(IPI->first->print(dbgs())); - DEBUG(dbgs() << '\n'); - DEBUG(NewPt->print(dbgs())); - DEBUG(dbgs() << '\n'); - appendAndTransferDominatedUses(NewPt, UseIt, IPI, InsertPts); - return true; - } - return false; -} - -void ARM64PromoteConstant::computeInsertionPoints( - Constant *Val, InsertionPointsPerFunc &InsPtsPerFunc) { - DEBUG(dbgs() << "** Compute insertion points **\n"); - for (Value::user_iterator UseIt = Val->user_begin(), - EndUseIt = Val->user_end(); - UseIt != EndUseIt; ++UseIt) { - // If the user is not an Instruction, we cannot modify it. - if (!isa(*UseIt)) - continue; - - // Filter out uses that should not be converted. - if (!shouldConvertUse(Val, cast(*UseIt), UseIt.getOperandNo())) - continue; - - DEBUG(dbgs() << "Considered use, opidx " << UseIt.getOperandNo() << ":\n"); - DEBUG((*UseIt)->print(dbgs())); - DEBUG(dbgs() << '\n'); - - Instruction *InsertionPoint = findInsertionPoint(UseIt); - - DEBUG(dbgs() << "Considered insertion point:\n"); - DEBUG(InsertionPoint->print(dbgs())); - DEBUG(dbgs() << '\n'); - - // Check if the current insertion point is useless, i.e., it is dominated - // by another one. - InsertionPoints &InsertPts = - InsPtsPerFunc[InsertionPoint->getParent()->getParent()]; - if (isDominated(InsertionPoint, UseIt, InsertPts)) - continue; - // This insertion point is useful, check if we can merge some insertion - // point in a common dominator or if NewPt dominates an existing one. - if (tryAndMerge(InsertionPoint, UseIt, InsertPts)) - continue; - - DEBUG(dbgs() << "Keep considered insertion point\n"); - - // It is definitely useful by its own - InsertPts[InsertionPoint].push_back(UseIt); - } -} - -bool -ARM64PromoteConstant::insertDefinitions(Constant *Cst, - InsertionPointsPerFunc &InsPtsPerFunc) { - // We will create one global variable per Module. - DenseMap ModuleToMergedGV; - bool HasChanged = false; - - // Traverse all insertion points in all the function. - for (InsertionPointsPerFunc::iterator FctToInstPtsIt = InsPtsPerFunc.begin(), - EndIt = InsPtsPerFunc.end(); - FctToInstPtsIt != EndIt; ++FctToInstPtsIt) { - InsertionPoints &InsertPts = FctToInstPtsIt->second; -// Do more checking for debug purposes. -#ifndef NDEBUG - DominatorTree &DT = getAnalysis( - *FctToInstPtsIt->first).getDomTree(); -#endif - GlobalVariable *PromotedGV; - assert(!InsertPts.empty() && "Empty uses does not need a definition"); - - Module *M = FctToInstPtsIt->first->getParent(); - DenseMap::iterator MapIt = - ModuleToMergedGV.find(M); - if (MapIt == ModuleToMergedGV.end()) { - PromotedGV = new GlobalVariable( - *M, Cst->getType(), true, GlobalValue::InternalLinkage, nullptr, - "_PromotedConst", nullptr, GlobalVariable::NotThreadLocal); - PromotedGV->setInitializer(Cst); - ModuleToMergedGV[M] = PromotedGV; - DEBUG(dbgs() << "Global replacement: "); - DEBUG(PromotedGV->print(dbgs())); - DEBUG(dbgs() << '\n'); - ++NumPromoted; - HasChanged = true; - } else { - PromotedGV = MapIt->second; - } - - for (InsertionPoints::iterator IPI = InsertPts.begin(), - EndIPI = InsertPts.end(); - IPI != EndIPI; ++IPI) { - // Create the load of the global variable. 
- IRBuilder<> Builder(IPI->first->getParent(), IPI->first); - LoadInst *LoadedCst = Builder.CreateLoad(PromotedGV); - DEBUG(dbgs() << "**********\n"); - DEBUG(dbgs() << "New def: "); - DEBUG(LoadedCst->print(dbgs())); - DEBUG(dbgs() << '\n'); - - // Update the dominated uses. - Users &DominatedUsers = IPI->second; - for (Value::user_iterator Use : DominatedUsers) { -#ifndef NDEBUG - assert((DT.dominates(LoadedCst, cast(*Use)) || - (isa(*Use) && - DT.dominates(LoadedCst, findInsertionPoint(Use)))) && - "Inserted definition does not dominate all its uses!"); -#endif - DEBUG(dbgs() << "Use to update " << Use.getOperandNo() << ":"); - DEBUG(Use->print(dbgs())); - DEBUG(dbgs() << '\n'); - Use->setOperand(Use.getOperandNo(), LoadedCst); - ++NumPromotedUses; - } - } - } - return HasChanged; -} - -bool ARM64PromoteConstant::computeAndInsertDefinitions(Constant *Val) { - InsertionPointsPerFunc InsertPtsPerFunc; - computeInsertionPoints(Val, InsertPtsPerFunc); - return insertDefinitions(Val, InsertPtsPerFunc); -} - -bool ARM64PromoteConstant::promoteConstant(Constant *Cst) { - assert(Cst && "Given variable is not a valid constant."); - - if (!shouldConvert(Cst)) - return false; - - DEBUG(dbgs() << "******************************\n"); - DEBUG(dbgs() << "Candidate constant: "); - DEBUG(Cst->print(dbgs())); - DEBUG(dbgs() << '\n'); - - return computeAndInsertDefinitions(Cst); -} - -bool ARM64PromoteConstant::runOnFunction(Function &F) { - // Look for instructions using constant vector. Promote that constant to a - // global variable. Create as few loads of this variable as possible and - // update the uses accordingly. - bool LocalChange = false; - SmallSet AlreadyChecked; - - for (auto &MBB : F) { - for (auto &MI : MBB) { - // Traverse the operand, looking for constant vectors. Replace them by a - // load of a global variable of constant vector type. - for (unsigned OpIdx = 0, EndOpIdx = MI.getNumOperands(); - OpIdx != EndOpIdx; ++OpIdx) { - Constant *Cst = dyn_cast(MI.getOperand(OpIdx)); - // There is no point in promoting global values as they are already - // global. Do not promote constant expressions either, as they may - // require some code expansion. - if (Cst && !isa(Cst) && !isa(Cst) && - AlreadyChecked.insert(Cst)) - LocalChange |= promoteConstant(Cst); - } - } - } - return LocalChange; -} diff --git a/lib/Target/ARM64/ARM64RegisterInfo.cpp b/lib/Target/ARM64/ARM64RegisterInfo.cpp deleted file mode 100644 index d3c647bd90b..00000000000 --- a/lib/Target/ARM64/ARM64RegisterInfo.cpp +++ /dev/null @@ -1,400 +0,0 @@ -//===- ARM64RegisterInfo.cpp - ARM64 Register Information -----------------===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -// This file contains the ARM64 implementation of the TargetRegisterInfo class. 
-// -//===----------------------------------------------------------------------===// - -#include "ARM64RegisterInfo.h" -#include "ARM64FrameLowering.h" -#include "ARM64InstrInfo.h" -#include "ARM64Subtarget.h" -#include "MCTargetDesc/ARM64AddressingModes.h" -#include "llvm/ADT/BitVector.h" -#include "llvm/CodeGen/MachineFrameInfo.h" -#include "llvm/CodeGen/MachineInstrBuilder.h" -#include "llvm/CodeGen/MachineRegisterInfo.h" -#include "llvm/CodeGen/RegisterScavenging.h" -#include "llvm/IR/Function.h" -#include "llvm/Support/CommandLine.h" -#include "llvm/Support/raw_ostream.h" -#include "llvm/Target/TargetFrameLowering.h" -#include "llvm/Target/TargetOptions.h" - -using namespace llvm; - -#define GET_REGINFO_TARGET_DESC -#include "ARM64GenRegisterInfo.inc" - -ARM64RegisterInfo::ARM64RegisterInfo(const ARM64InstrInfo *tii, - const ARM64Subtarget *sti) - : ARM64GenRegisterInfo(ARM64::LR), TII(tii), STI(sti) {} - -const MCPhysReg * -ARM64RegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) const { - assert(MF && "Invalid MachineFunction pointer."); - if (MF->getFunction()->getCallingConv() == CallingConv::AnyReg) - return CSR_ARM64_AllRegs_SaveList; - else - return CSR_ARM64_AAPCS_SaveList; -} - -const uint32_t * -ARM64RegisterInfo::getCallPreservedMask(CallingConv::ID CC) const { - if (CC == CallingConv::AnyReg) - return CSR_ARM64_AllRegs_RegMask; - else - return CSR_ARM64_AAPCS_RegMask; -} - -const uint32_t *ARM64RegisterInfo::getTLSCallPreservedMask() const { - if (STI->isTargetDarwin()) - return CSR_ARM64_TLS_Darwin_RegMask; - - assert(STI->isTargetELF() && "only expect Darwin or ELF TLS"); - return CSR_ARM64_TLS_ELF_RegMask; -} - -const uint32_t * -ARM64RegisterInfo::getThisReturnPreservedMask(CallingConv::ID) const { - // This should return a register mask that is the same as that returned by - // getCallPreservedMask but that additionally preserves the register used for - // the first i64 argument (which must also be the register used to return a - // single i64 return value) - // - // In case that the calling convention does not use the same register for - // both, the function should return NULL (does not currently apply) - return CSR_ARM64_AAPCS_ThisReturn_RegMask; -} - -BitVector ARM64RegisterInfo::getReservedRegs(const MachineFunction &MF) const { - const TargetFrameLowering *TFI = MF.getTarget().getFrameLowering(); - - // FIXME: avoid re-calculating this every time. 
- BitVector Reserved(getNumRegs()); - Reserved.set(ARM64::SP); - Reserved.set(ARM64::XZR); - Reserved.set(ARM64::WSP); - Reserved.set(ARM64::WZR); - - if (TFI->hasFP(MF) || STI->isTargetDarwin()) { - Reserved.set(ARM64::FP); - Reserved.set(ARM64::W29); - } - - if (STI->isTargetDarwin()) { - Reserved.set(ARM64::X18); // Platform register - Reserved.set(ARM64::W18); - } - - if (hasBasePointer(MF)) { - Reserved.set(ARM64::X19); - Reserved.set(ARM64::W19); - } - - return Reserved; -} - -bool ARM64RegisterInfo::isReservedReg(const MachineFunction &MF, - unsigned Reg) const { - const TargetFrameLowering *TFI = MF.getTarget().getFrameLowering(); - - switch (Reg) { - default: - break; - case ARM64::SP: - case ARM64::XZR: - case ARM64::WSP: - case ARM64::WZR: - return true; - case ARM64::X18: - case ARM64::W18: - return STI->isTargetDarwin(); - case ARM64::FP: - case ARM64::W29: - return TFI->hasFP(MF) || STI->isTargetDarwin(); - case ARM64::W19: - case ARM64::X19: - return hasBasePointer(MF); - } - - return false; -} - -const TargetRegisterClass * -ARM64RegisterInfo::getPointerRegClass(const MachineFunction &MF, - unsigned Kind) const { - return &ARM64::GPR64RegClass; -} - -const TargetRegisterClass * -ARM64RegisterInfo::getCrossCopyRegClass(const TargetRegisterClass *RC) const { - if (RC == &ARM64::CCRRegClass) - return nullptr; // Can't copy NZCV. - return RC; -} - -unsigned ARM64RegisterInfo::getBaseRegister() const { return ARM64::X19; } - -bool ARM64RegisterInfo::hasBasePointer(const MachineFunction &MF) const { - const MachineFrameInfo *MFI = MF.getFrameInfo(); - - // In the presence of variable sized objects, if the fixed stack size is - // large enough that referencing from the FP won't result in things being - // in range relatively often, we can use a base pointer to allow access - // from the other direction like the SP normally works. - if (MFI->hasVarSizedObjects()) { - // Conservatively estimate whether the negative offset from the frame - // pointer will be sufficient to reach. If a function has a smallish - // frame, it's less likely to have lots of spills and callee saved - // space, so it's all more likely to be within range of the frame pointer. - // If it's wrong, we'll materialize the constant and still get to the - // object; it's just suboptimal. Negative offsets use the unscaled - // load/store instructions, which have a 9-bit signed immediate. - if (MFI->getLocalFrameSize() < 256) - return false; - return true; - } - - return false; -} - -unsigned ARM64RegisterInfo::getFrameRegister(const MachineFunction &MF) const { - const TargetFrameLowering *TFI = MF.getTarget().getFrameLowering(); - - return TFI->hasFP(MF) ? ARM64::FP : ARM64::SP; -} - -bool -ARM64RegisterInfo::requiresRegisterScavenging(const MachineFunction &MF) const { - return true; -} - -bool ARM64RegisterInfo::requiresVirtualBaseRegisters(const MachineFunction &MF) - const { - return true; -} - -bool -ARM64RegisterInfo::useFPForScavengingIndex(const MachineFunction &MF) const { - const MachineFrameInfo *MFI = MF.getFrameInfo(); - // ARM64FrameLowering::resolveFrameIndexReference() can always fall back - // to the stack pointer, so only put the emergency spill slot next to the - // FP when there's no better way to access it (SP or base pointer). 
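As an aside on the 256-byte check in hasBasePointer() above, a small worked sketch (not from the patch) of where that number comes from: negative frame-pointer offsets fall back to the unscaled LDUR/STUR forms, whose immediate field is 9 bits, signed.

// Illustrative only: a 9-bit signed immediate spans [-256, +255] bytes, so a
// local frame under 256 bytes stays reachable from the FP without a base pointer.
constexpr int UnscaledImmBits = 9;
constexpr int MinOff = -(1 << (UnscaledImmBits - 1));     // -256
constexpr int MaxOff = (1 << (UnscaledImmBits - 1)) - 1;  // +255
static_assert(MinOff == -256 && MaxOff == 255, "unscaled load/store range");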
- return MFI->hasVarSizedObjects() && !hasBasePointer(MF); -} - -bool ARM64RegisterInfo::requiresFrameIndexScavenging(const MachineFunction &MF) - const { - return true; -} - -bool ARM64RegisterInfo::cannotEliminateFrame(const MachineFunction &MF) const { - const MachineFrameInfo *MFI = MF.getFrameInfo(); - // Only consider eliminating leaf frames. - if (MFI->hasCalls() || (MF.getTarget().Options.DisableFramePointerElim(MF) && - MFI->adjustsStack())) - return true; - return MFI->hasVarSizedObjects() || MFI->isFrameAddressTaken(); -} - -/// needsFrameBaseReg - Returns true if the instruction's frame index -/// reference would be better served by a base register other than FP -/// or SP. Used by LocalStackFrameAllocation to determine which frame index -/// references it should create new base registers for. -bool ARM64RegisterInfo::needsFrameBaseReg(MachineInstr *MI, - int64_t Offset) const { - for (unsigned i = 0; !MI->getOperand(i).isFI(); ++i) - assert(i < MI->getNumOperands() && - "Instr doesn't have FrameIndex operand!"); - - // It's the load/store FI references that cause issues, as it can be difficult - // to materialize the offset if it won't fit in the literal field. Estimate - // based on the size of the local frame and some conservative assumptions - // about the rest of the stack frame (note, this is pre-regalloc, so - // we don't know everything for certain yet) whether this offset is likely - // to be out of range of the immediate. Return true if so. - - // We only generate virtual base registers for loads and stores, so - // return false for everything else. - if (!MI->mayLoad() && !MI->mayStore()) - return false; - - // Without a virtual base register, if the function has variable sized - // objects, all fixed-size local references will be via the frame pointer, - // Approximate the offset and see if it's legal for the instruction. - // Note that the incoming offset is based on the SP value at function entry, - // so it'll be negative. - MachineFunction &MF = *MI->getParent()->getParent(); - const TargetFrameLowering *TFI = MF.getTarget().getFrameLowering(); - MachineFrameInfo *MFI = MF.getFrameInfo(); - - // Estimate an offset from the frame pointer. - // Conservatively assume all GPR callee-saved registers get pushed. - // FP, LR, X19-X28, D8-D15. 64-bits each. - int64_t FPOffset = Offset - 16 * 20; - // Estimate an offset from the stack pointer. - // The incoming offset is relating to the SP at the start of the function, - // but when we access the local it'll be relative to the SP after local - // allocation, so adjust our SP-relative offset by that allocation size. - Offset += MFI->getLocalFrameSize(); - // Assume that we'll have at least some spill slots allocated. - // FIXME: This is a total SWAG number. We should run some statistics - // and pick a real one. - Offset += 128; // 128 bytes of spill slots - - // If there is a frame pointer, try using it. - // The FP is only available if there is no dynamic realignment. We - // don't know for sure yet whether we'll need that, so we guess based - // on whether there are any local variables that would trigger it. - if (TFI->hasFP(MF) && isFrameOffsetLegal(MI, FPOffset)) - return false; - - // If we can reference via the stack pointer or base pointer, try that. - // FIXME: This (and the code that resolves the references) can be improved - // to only disallow SP relative references in the live range of - // the VLA(s). In practice, it's unclear how much difference that - // would make, but it may be worth doing. 
- if (isFrameOffsetLegal(MI, Offset)) - return false; - - // The offset likely isn't legal; we want to allocate a virtual base register. - return true; -} - -bool ARM64RegisterInfo::isFrameOffsetLegal(const MachineInstr *MI, - int64_t Offset) const { - assert(Offset <= INT_MAX && "Offset too big to fit in int."); - assert(MI && "Unable to get the legal offset for nil instruction."); - int SaveOffset = Offset; - return isARM64FrameOffsetLegal(*MI, SaveOffset) & ARM64FrameOffsetIsLegal; -} - -/// Insert defining instruction(s) for BaseReg to be a pointer to FrameIdx -/// at the beginning of the basic block. -void ARM64RegisterInfo::materializeFrameBaseRegister(MachineBasicBlock *MBB, - unsigned BaseReg, - int FrameIdx, - int64_t Offset) const { - MachineBasicBlock::iterator Ins = MBB->begin(); - DebugLoc DL; // Defaults to "unknown" - if (Ins != MBB->end()) - DL = Ins->getDebugLoc(); - - const MCInstrDesc &MCID = TII->get(ARM64::ADDXri); - MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo(); - const MachineFunction &MF = *MBB->getParent(); - MRI.constrainRegClass(BaseReg, TII->getRegClass(MCID, 0, this, MF)); - unsigned Shifter = ARM64_AM::getShifterImm(ARM64_AM::LSL, 0); - - BuildMI(*MBB, Ins, DL, MCID, BaseReg) - .addFrameIndex(FrameIdx) - .addImm(Offset) - .addImm(Shifter); -} - -void ARM64RegisterInfo::resolveFrameIndex(MachineInstr &MI, unsigned BaseReg, - int64_t Offset) const { - int Off = Offset; // ARM doesn't need the general 64-bit offsets - unsigned i = 0; - - while (!MI.getOperand(i).isFI()) { - ++i; - assert(i < MI.getNumOperands() && "Instr doesn't have FrameIndex operand!"); - } - bool Done = rewriteARM64FrameIndex(MI, i, BaseReg, Off, TII); - assert(Done && "Unable to resolve frame index!"); - (void)Done; -} - -void ARM64RegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II, - int SPAdj, unsigned FIOperandNum, - RegScavenger *RS) const { - assert(SPAdj == 0 && "Unexpected"); - - MachineInstr &MI = *II; - MachineBasicBlock &MBB = *MI.getParent(); - MachineFunction &MF = *MBB.getParent(); - const ARM64FrameLowering *TFI = static_cast( - MF.getTarget().getFrameLowering()); - - int FrameIndex = MI.getOperand(FIOperandNum).getIndex(); - unsigned FrameReg; - int Offset; - - // Special handling of dbg_value, stackmap and patchpoint instructions. - if (MI.isDebugValue() || MI.getOpcode() == TargetOpcode::STACKMAP || - MI.getOpcode() == TargetOpcode::PATCHPOINT) { - Offset = TFI->resolveFrameIndexReference(MF, FrameIndex, FrameReg, - /*PreferFP=*/true); - Offset += MI.getOperand(FIOperandNum + 1).getImm(); - MI.getOperand(FIOperandNum).ChangeToRegister(FrameReg, false /*isDef*/); - MI.getOperand(FIOperandNum + 1).ChangeToImmediate(Offset); - return; - } - - // Modify MI as necessary to handle as much of 'Offset' as possible - Offset = TFI->resolveFrameIndexReference(MF, FrameIndex, FrameReg); - if (rewriteARM64FrameIndex(MI, FIOperandNum, FrameReg, Offset, TII)) - return; - - assert((!RS || !RS->isScavengingFrameIndex(FrameIndex)) && - "Emergency spill slot is out of reach"); - - // If we get here, the immediate doesn't fit into the instruction. We folded - // as much as possible above. Handle the rest, providing a register that is - // SP+LargeImm. 
- unsigned ScratchReg = - MF.getRegInfo().createVirtualRegister(&ARM64::GPR64RegClass); - emitFrameOffset(MBB, II, MI.getDebugLoc(), ScratchReg, FrameReg, Offset, TII); - MI.getOperand(FIOperandNum).ChangeToRegister(ScratchReg, false, false, true); -} - -namespace llvm { - -unsigned ARM64RegisterInfo::getRegPressureLimit(const TargetRegisterClass *RC, - MachineFunction &MF) const { - const TargetFrameLowering *TFI = MF.getTarget().getFrameLowering(); - - switch (RC->getID()) { - default: - return 0; - case ARM64::GPR32RegClassID: - case ARM64::GPR32spRegClassID: - case ARM64::GPR32allRegClassID: - case ARM64::GPR64spRegClassID: - case ARM64::GPR64allRegClassID: - case ARM64::GPR64RegClassID: - case ARM64::GPR32commonRegClassID: - case ARM64::GPR64commonRegClassID: - return 32 - 1 // XZR/SP - - (TFI->hasFP(MF) || STI->isTargetDarwin()) // FP - - STI->isTargetDarwin() // X18 reserved as platform register - - hasBasePointer(MF); // X19 - case ARM64::FPR8RegClassID: - case ARM64::FPR16RegClassID: - case ARM64::FPR32RegClassID: - case ARM64::FPR64RegClassID: - case ARM64::FPR128RegClassID: - return 32; - - case ARM64::DDRegClassID: - case ARM64::DDDRegClassID: - case ARM64::DDDDRegClassID: - case ARM64::QQRegClassID: - case ARM64::QQQRegClassID: - case ARM64::QQQQRegClassID: - return 32; - - case ARM64::FPR128_loRegClassID: - return 16; - } -} - -} // namespace llvm diff --git a/lib/Target/ARM64/ARM64RegisterInfo.h b/lib/Target/ARM64/ARM64RegisterInfo.h deleted file mode 100644 index 7691fadbcc8..00000000000 --- a/lib/Target/ARM64/ARM64RegisterInfo.h +++ /dev/null @@ -1,101 +0,0 @@ -//===- ARM64RegisterInfo.h - ARM64 Register Information Impl ----*- C++ -*-===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -// This file contains the ARM64 implementation of the MRegisterInfo class. -// -//===----------------------------------------------------------------------===// - -#ifndef LLVM_TARGET_ARM64REGISTERINFO_H -#define LLVM_TARGET_ARM64REGISTERINFO_H - -#define GET_REGINFO_HEADER -#include "ARM64GenRegisterInfo.inc" - -namespace llvm { - -class ARM64InstrInfo; -class ARM64Subtarget; -class MachineFunction; -class RegScavenger; -class TargetRegisterClass; - -struct ARM64RegisterInfo : public ARM64GenRegisterInfo { -private: - const ARM64InstrInfo *TII; - const ARM64Subtarget *STI; - -public: - ARM64RegisterInfo(const ARM64InstrInfo *tii, const ARM64Subtarget *sti); - - bool isReservedReg(const MachineFunction &MF, unsigned Reg) const; - - /// Code Generation virtual methods... - const MCPhysReg * - getCalleeSavedRegs(const MachineFunction *MF = nullptr) const override; - const uint32_t *getCallPreservedMask(CallingConv::ID) const override; - - unsigned getCSRFirstUseCost() const override { - // The cost will be compared against BlockFrequency where entry has the - // value of 1 << 14. A value of 5 will choose to spill or split really - // cold path instead of using a callee-saved register. - return 5; - } - - // Calls involved in thread-local variable lookup save more registers than - // normal calls, so they need a different mask to represent this. 
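A worked instance (not in the patch) of the GPR pressure-limit arithmetic in getRegPressureLimit() above, evaluated for a Darwin subtarget with a frame pointer and no base pointer:

// 32 GPRs, minus XZR/SP sharing encoding 31, minus the FP, minus the reserved
// Darwin platform register X18; X19 is only subtracted when a base pointer is needed.
unsigned Limit = 32 - 1 /*XZR or SP*/
                    - 1 /*FP reserved*/
                    - 1 /*X18 platform register*/
                    - 0 /*no base pointer*/;  // == 29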
- const uint32_t *getTLSCallPreservedMask() const; - - /// getThisReturnPreservedMask - Returns a call preserved mask specific to the - /// case that 'returned' is on an i64 first argument if the calling convention - /// is one that can (partially) model this attribute with a preserved mask - /// (i.e. it is a calling convention that uses the same register for the first - /// i64 argument and an i64 return value) - /// - /// Should return NULL in the case that the calling convention does not have - /// this property - const uint32_t *getThisReturnPreservedMask(CallingConv::ID) const; - - BitVector getReservedRegs(const MachineFunction &MF) const override; - const TargetRegisterClass * - getPointerRegClass(const MachineFunction &MF, - unsigned Kind = 0) const override; - const TargetRegisterClass * - getCrossCopyRegClass(const TargetRegisterClass *RC) const override; - - bool requiresRegisterScavenging(const MachineFunction &MF) const override; - bool useFPForScavengingIndex(const MachineFunction &MF) const override; - bool requiresFrameIndexScavenging(const MachineFunction &MF) const override; - - bool needsFrameBaseReg(MachineInstr *MI, int64_t Offset) const override; - bool isFrameOffsetLegal(const MachineInstr *MI, - int64_t Offset) const override; - void materializeFrameBaseRegister(MachineBasicBlock *MBB, unsigned BaseReg, - int FrameIdx, - int64_t Offset) const override; - void resolveFrameIndex(MachineInstr &MI, unsigned BaseReg, - int64_t Offset) const override; - void eliminateFrameIndex(MachineBasicBlock::iterator II, int SPAdj, - unsigned FIOperandNum, - RegScavenger *RS = nullptr) const override; - bool cannotEliminateFrame(const MachineFunction &MF) const; - - bool requiresVirtualBaseRegisters(const MachineFunction &MF) const override; - bool hasBasePointer(const MachineFunction &MF) const; - unsigned getBaseRegister() const; - - // Debug information queries. - unsigned getFrameRegister(const MachineFunction &MF) const override; - - unsigned getRegPressureLimit(const TargetRegisterClass *RC, - MachineFunction &MF) const override; -}; - -} // end namespace llvm - -#endif // LLVM_TARGET_ARM64REGISTERINFO_H diff --git a/lib/Target/ARM64/ARM64RegisterInfo.td b/lib/Target/ARM64/ARM64RegisterInfo.td deleted file mode 100644 index 28d01809739..00000000000 --- a/lib/Target/ARM64/ARM64RegisterInfo.td +++ /dev/null @@ -1,593 +0,0 @@ -//===- ARM64RegisterInfo.td - Describe the ARM64 Regisers --*- tablegen -*-===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. 
-// -//===----------------------------------------------------------------------===// -// -// -//===----------------------------------------------------------------------===// - - -class ARM64Reg enc, string n, list subregs = [], - list altNames = []> - : Register { - let HWEncoding = enc; - let Namespace = "ARM64"; - let SubRegs = subregs; -} - -let Namespace = "ARM64" in { - def sub_32 : SubRegIndex<32>; - - def bsub : SubRegIndex<8>; - def hsub : SubRegIndex<16>; - def ssub : SubRegIndex<32>; - def dsub : SubRegIndex<32>; - def qhisub : SubRegIndex<64>; - def qsub : SubRegIndex<64>; - // Note: Code depends on these having consecutive numbers - def dsub0 : SubRegIndex<64>; - def dsub1 : SubRegIndex<64>; - def dsub2 : SubRegIndex<64>; - def dsub3 : SubRegIndex<64>; - // Note: Code depends on these having consecutive numbers - def qsub0 : SubRegIndex<128>; - def qsub1 : SubRegIndex<128>; - def qsub2 : SubRegIndex<128>; - def qsub3 : SubRegIndex<128>; -} - -let Namespace = "ARM64" in { - def vreg : RegAltNameIndex; - def vlist1 : RegAltNameIndex; -} - -//===----------------------------------------------------------------------===// -// Registers -//===----------------------------------------------------------------------===// -def W0 : ARM64Reg<0, "w0" >, DwarfRegNum<[0]>; -def W1 : ARM64Reg<1, "w1" >, DwarfRegNum<[1]>; -def W2 : ARM64Reg<2, "w2" >, DwarfRegNum<[2]>; -def W3 : ARM64Reg<3, "w3" >, DwarfRegNum<[3]>; -def W4 : ARM64Reg<4, "w4" >, DwarfRegNum<[4]>; -def W5 : ARM64Reg<5, "w5" >, DwarfRegNum<[5]>; -def W6 : ARM64Reg<6, "w6" >, DwarfRegNum<[6]>; -def W7 : ARM64Reg<7, "w7" >, DwarfRegNum<[7]>; -def W8 : ARM64Reg<8, "w8" >, DwarfRegNum<[8]>; -def W9 : ARM64Reg<9, "w9" >, DwarfRegNum<[9]>; -def W10 : ARM64Reg<10, "w10">, DwarfRegNum<[10]>; -def W11 : ARM64Reg<11, "w11">, DwarfRegNum<[11]>; -def W12 : ARM64Reg<12, "w12">, DwarfRegNum<[12]>; -def W13 : ARM64Reg<13, "w13">, DwarfRegNum<[13]>; -def W14 : ARM64Reg<14, "w14">, DwarfRegNum<[14]>; -def W15 : ARM64Reg<15, "w15">, DwarfRegNum<[15]>; -def W16 : ARM64Reg<16, "w16">, DwarfRegNum<[16]>; -def W17 : ARM64Reg<17, "w17">, DwarfRegNum<[17]>; -def W18 : ARM64Reg<18, "w18">, DwarfRegNum<[18]>; -def W19 : ARM64Reg<19, "w19">, DwarfRegNum<[19]>; -def W20 : ARM64Reg<20, "w20">, DwarfRegNum<[20]>; -def W21 : ARM64Reg<21, "w21">, DwarfRegNum<[21]>; -def W22 : ARM64Reg<22, "w22">, DwarfRegNum<[22]>; -def W23 : ARM64Reg<23, "w23">, DwarfRegNum<[23]>; -def W24 : ARM64Reg<24, "w24">, DwarfRegNum<[24]>; -def W25 : ARM64Reg<25, "w25">, DwarfRegNum<[25]>; -def W26 : ARM64Reg<26, "w26">, DwarfRegNum<[26]>; -def W27 : ARM64Reg<27, "w27">, DwarfRegNum<[27]>; -def W28 : ARM64Reg<28, "w28">, DwarfRegNum<[28]>; -def W29 : ARM64Reg<29, "w29">, DwarfRegNum<[29]>; -def W30 : ARM64Reg<30, "w30">, DwarfRegNum<[30]>; -def WSP : ARM64Reg<31, "wsp">, DwarfRegNum<[31]>; -def WZR : ARM64Reg<31, "wzr">, DwarfRegAlias; - -let SubRegIndices = [sub_32] in { -def X0 : ARM64Reg<0, "x0", [W0]>, DwarfRegAlias; -def X1 : ARM64Reg<1, "x1", [W1]>, DwarfRegAlias; -def X2 : ARM64Reg<2, "x2", [W2]>, DwarfRegAlias; -def X3 : ARM64Reg<3, "x3", [W3]>, DwarfRegAlias; -def X4 : ARM64Reg<4, "x4", [W4]>, DwarfRegAlias; -def X5 : ARM64Reg<5, "x5", [W5]>, DwarfRegAlias; -def X6 : ARM64Reg<6, "x6", [W6]>, DwarfRegAlias; -def X7 : ARM64Reg<7, "x7", [W7]>, DwarfRegAlias; -def X8 : ARM64Reg<8, "x8", [W8]>, DwarfRegAlias; -def X9 : ARM64Reg<9, "x9", [W9]>, DwarfRegAlias; -def X10 : ARM64Reg<10, "x10", [W10]>, DwarfRegAlias; -def X11 : ARM64Reg<11, "x11", [W11]>, DwarfRegAlias; -def X12 : 
ARM64Reg<12, "x12", [W12]>, DwarfRegAlias; -def X13 : ARM64Reg<13, "x13", [W13]>, DwarfRegAlias; -def X14 : ARM64Reg<14, "x14", [W14]>, DwarfRegAlias; -def X15 : ARM64Reg<15, "x15", [W15]>, DwarfRegAlias; -def X16 : ARM64Reg<16, "x16", [W16]>, DwarfRegAlias; -def X17 : ARM64Reg<17, "x17", [W17]>, DwarfRegAlias; -def X18 : ARM64Reg<18, "x18", [W18]>, DwarfRegAlias; -def X19 : ARM64Reg<19, "x19", [W19]>, DwarfRegAlias; -def X20 : ARM64Reg<20, "x20", [W20]>, DwarfRegAlias; -def X21 : ARM64Reg<21, "x21", [W21]>, DwarfRegAlias; -def X22 : ARM64Reg<22, "x22", [W22]>, DwarfRegAlias; -def X23 : ARM64Reg<23, "x23", [W23]>, DwarfRegAlias; -def X24 : ARM64Reg<24, "x24", [W24]>, DwarfRegAlias; -def X25 : ARM64Reg<25, "x25", [W25]>, DwarfRegAlias; -def X26 : ARM64Reg<26, "x26", [W26]>, DwarfRegAlias; -def X27 : ARM64Reg<27, "x27", [W27]>, DwarfRegAlias; -def X28 : ARM64Reg<28, "x28", [W28]>, DwarfRegAlias; -def FP : ARM64Reg<29, "x29", [W29]>, DwarfRegAlias; -def LR : ARM64Reg<30, "x30", [W30]>, DwarfRegAlias; -def SP : ARM64Reg<31, "sp", [WSP]>, DwarfRegAlias; -def XZR : ARM64Reg<31, "xzr", [WZR]>, DwarfRegAlias; -} - -// Condition code register. -def NZCV : ARM64Reg<0, "nzcv">; - -// GPR register classes with the intersections of GPR32/GPR32sp and -// GPR64/GPR64sp for use by the coalescer. -def GPR32common : RegisterClass<"ARM64", [i32], 32, (sequence "W%u", 0, 30)> { - let AltOrders = [(rotl GPR32common, 8)]; - let AltOrderSelect = [{ return 1; }]; -} -def GPR64common : RegisterClass<"ARM64", [i64], 64, - (add (sequence "X%u", 0, 28), FP, LR)> { - let AltOrders = [(rotl GPR64common, 8)]; - let AltOrderSelect = [{ return 1; }]; -} -// GPR register classes which exclude SP/WSP. -def GPR32 : RegisterClass<"ARM64", [i32], 32, (add GPR32common, WZR)> { - let AltOrders = [(rotl GPR32, 8)]; - let AltOrderSelect = [{ return 1; }]; -} -def GPR64 : RegisterClass<"ARM64", [i64], 64, (add GPR64common, XZR)> { - let AltOrders = [(rotl GPR64, 8)]; - let AltOrderSelect = [{ return 1; }]; -} - -// GPR register classes which include SP/WSP. -def GPR32sp : RegisterClass<"ARM64", [i32], 32, (add GPR32common, WSP)> { - let AltOrders = [(rotl GPR32sp, 8)]; - let AltOrderSelect = [{ return 1; }]; -} -def GPR64sp : RegisterClass<"ARM64", [i64], 64, (add GPR64common, SP)> { - let AltOrders = [(rotl GPR64sp, 8)]; - let AltOrderSelect = [{ return 1; }]; -} - -def GPR32sponly : RegisterClass<"ARM64", [i32], 32, (add WSP)>; -def GPR64sponly : RegisterClass<"ARM64", [i64], 64, (add SP)>; - -def GPR64spPlus0Operand : AsmOperandClass { - let Name = "GPR64sp0"; - let RenderMethod = "addRegOperands"; - let ParserMethod = "tryParseGPR64sp0Operand"; -} - -def GPR64sp0 : RegisterOperand { - let ParserMatchClass = GPR64spPlus0Operand; -} - -// GPR register classes which include WZR/XZR AND SP/WSP. This is not a -// constraint used by any instructions, it is used as a common super-class. -def GPR32all : RegisterClass<"ARM64", [i32], 32, (add GPR32common, WZR, WSP)>; -def GPR64all : RegisterClass<"ARM64", [i64], 64, (add GPR64common, XZR, SP)>; - -// For tail calls, we can't use callee-saved registers, as they are restored -// to the saved value before the tail call, which would clobber a call address. -// This is for indirect tail calls to store the address of the destination. 
-def tcGPR64 : RegisterClass<"ARM64", [i64], 64, (sub GPR64common, X19, X20, X21, - X22, X23, X24, X25, X26, - X27, X28)>; - -// GPR register classes for post increment amount of vector load/store that -// has alternate printing when Rm=31 and prints a constant immediate value -// equal to the total number of bytes transferred. - -// FIXME: TableGen *should* be able to do these itself now. There appears to be -// a bug in counting how many operands a Post-indexed MCInst should have which -// means the aliases don't trigger. -def GPR64pi1 : RegisterOperand">; -def GPR64pi2 : RegisterOperand">; -def GPR64pi3 : RegisterOperand">; -def GPR64pi4 : RegisterOperand">; -def GPR64pi6 : RegisterOperand">; -def GPR64pi8 : RegisterOperand">; -def GPR64pi12 : RegisterOperand">; -def GPR64pi16 : RegisterOperand">; -def GPR64pi24 : RegisterOperand">; -def GPR64pi32 : RegisterOperand">; -def GPR64pi48 : RegisterOperand">; -def GPR64pi64 : RegisterOperand">; - -// Condition code regclass. -def CCR : RegisterClass<"ARM64", [i32], 32, (add NZCV)> { - let CopyCost = -1; // Don't allow copying of status registers. - - // CCR is not allocatable. - let isAllocatable = 0; -} - -//===----------------------------------------------------------------------===// -// Floating Point Scalar Registers -//===----------------------------------------------------------------------===// - -def B0 : ARM64Reg<0, "b0">, DwarfRegNum<[64]>; -def B1 : ARM64Reg<1, "b1">, DwarfRegNum<[65]>; -def B2 : ARM64Reg<2, "b2">, DwarfRegNum<[66]>; -def B3 : ARM64Reg<3, "b3">, DwarfRegNum<[67]>; -def B4 : ARM64Reg<4, "b4">, DwarfRegNum<[68]>; -def B5 : ARM64Reg<5, "b5">, DwarfRegNum<[69]>; -def B6 : ARM64Reg<6, "b6">, DwarfRegNum<[70]>; -def B7 : ARM64Reg<7, "b7">, DwarfRegNum<[71]>; -def B8 : ARM64Reg<8, "b8">, DwarfRegNum<[72]>; -def B9 : ARM64Reg<9, "b9">, DwarfRegNum<[73]>; -def B10 : ARM64Reg<10, "b10">, DwarfRegNum<[74]>; -def B11 : ARM64Reg<11, "b11">, DwarfRegNum<[75]>; -def B12 : ARM64Reg<12, "b12">, DwarfRegNum<[76]>; -def B13 : ARM64Reg<13, "b13">, DwarfRegNum<[77]>; -def B14 : ARM64Reg<14, "b14">, DwarfRegNum<[78]>; -def B15 : ARM64Reg<15, "b15">, DwarfRegNum<[79]>; -def B16 : ARM64Reg<16, "b16">, DwarfRegNum<[80]>; -def B17 : ARM64Reg<17, "b17">, DwarfRegNum<[81]>; -def B18 : ARM64Reg<18, "b18">, DwarfRegNum<[82]>; -def B19 : ARM64Reg<19, "b19">, DwarfRegNum<[83]>; -def B20 : ARM64Reg<20, "b20">, DwarfRegNum<[84]>; -def B21 : ARM64Reg<21, "b21">, DwarfRegNum<[85]>; -def B22 : ARM64Reg<22, "b22">, DwarfRegNum<[86]>; -def B23 : ARM64Reg<23, "b23">, DwarfRegNum<[87]>; -def B24 : ARM64Reg<24, "b24">, DwarfRegNum<[88]>; -def B25 : ARM64Reg<25, "b25">, DwarfRegNum<[89]>; -def B26 : ARM64Reg<26, "b26">, DwarfRegNum<[90]>; -def B27 : ARM64Reg<27, "b27">, DwarfRegNum<[91]>; -def B28 : ARM64Reg<28, "b28">, DwarfRegNum<[92]>; -def B29 : ARM64Reg<29, "b29">, DwarfRegNum<[93]>; -def B30 : ARM64Reg<30, "b30">, DwarfRegNum<[94]>; -def B31 : ARM64Reg<31, "b31">, DwarfRegNum<[95]>; - -let SubRegIndices = [bsub] in { -def H0 : ARM64Reg<0, "h0", [B0]>, DwarfRegAlias; -def H1 : ARM64Reg<1, "h1", [B1]>, DwarfRegAlias; -def H2 : ARM64Reg<2, "h2", [B2]>, DwarfRegAlias; -def H3 : ARM64Reg<3, "h3", [B3]>, DwarfRegAlias; -def H4 : ARM64Reg<4, "h4", [B4]>, DwarfRegAlias; -def H5 : ARM64Reg<5, "h5", [B5]>, DwarfRegAlias; -def H6 : ARM64Reg<6, "h6", [B6]>, DwarfRegAlias; -def H7 : ARM64Reg<7, "h7", [B7]>, DwarfRegAlias; -def H8 : ARM64Reg<8, "h8", [B8]>, DwarfRegAlias; -def H9 : ARM64Reg<9, "h9", [B9]>, DwarfRegAlias; -def H10 : ARM64Reg<10, "h10", [B10]>, 
DwarfRegAlias; -def H11 : ARM64Reg<11, "h11", [B11]>, DwarfRegAlias; -def H12 : ARM64Reg<12, "h12", [B12]>, DwarfRegAlias; -def H13 : ARM64Reg<13, "h13", [B13]>, DwarfRegAlias; -def H14 : ARM64Reg<14, "h14", [B14]>, DwarfRegAlias; -def H15 : ARM64Reg<15, "h15", [B15]>, DwarfRegAlias; -def H16 : ARM64Reg<16, "h16", [B16]>, DwarfRegAlias; -def H17 : ARM64Reg<17, "h17", [B17]>, DwarfRegAlias; -def H18 : ARM64Reg<18, "h18", [B18]>, DwarfRegAlias; -def H19 : ARM64Reg<19, "h19", [B19]>, DwarfRegAlias; -def H20 : ARM64Reg<20, "h20", [B20]>, DwarfRegAlias; -def H21 : ARM64Reg<21, "h21", [B21]>, DwarfRegAlias; -def H22 : ARM64Reg<22, "h22", [B22]>, DwarfRegAlias; -def H23 : ARM64Reg<23, "h23", [B23]>, DwarfRegAlias; -def H24 : ARM64Reg<24, "h24", [B24]>, DwarfRegAlias; -def H25 : ARM64Reg<25, "h25", [B25]>, DwarfRegAlias; -def H26 : ARM64Reg<26, "h26", [B26]>, DwarfRegAlias; -def H27 : ARM64Reg<27, "h27", [B27]>, DwarfRegAlias; -def H28 : ARM64Reg<28, "h28", [B28]>, DwarfRegAlias; -def H29 : ARM64Reg<29, "h29", [B29]>, DwarfRegAlias; -def H30 : ARM64Reg<30, "h30", [B30]>, DwarfRegAlias; -def H31 : ARM64Reg<31, "h31", [B31]>, DwarfRegAlias; -} - -let SubRegIndices = [hsub] in { -def S0 : ARM64Reg<0, "s0", [H0]>, DwarfRegAlias; -def S1 : ARM64Reg<1, "s1", [H1]>, DwarfRegAlias; -def S2 : ARM64Reg<2, "s2", [H2]>, DwarfRegAlias; -def S3 : ARM64Reg<3, "s3", [H3]>, DwarfRegAlias; -def S4 : ARM64Reg<4, "s4", [H4]>, DwarfRegAlias; -def S5 : ARM64Reg<5, "s5", [H5]>, DwarfRegAlias; -def S6 : ARM64Reg<6, "s6", [H6]>, DwarfRegAlias; -def S7 : ARM64Reg<7, "s7", [H7]>, DwarfRegAlias; -def S8 : ARM64Reg<8, "s8", [H8]>, DwarfRegAlias; -def S9 : ARM64Reg<9, "s9", [H9]>, DwarfRegAlias; -def S10 : ARM64Reg<10, "s10", [H10]>, DwarfRegAlias; -def S11 : ARM64Reg<11, "s11", [H11]>, DwarfRegAlias; -def S12 : ARM64Reg<12, "s12", [H12]>, DwarfRegAlias; -def S13 : ARM64Reg<13, "s13", [H13]>, DwarfRegAlias; -def S14 : ARM64Reg<14, "s14", [H14]>, DwarfRegAlias; -def S15 : ARM64Reg<15, "s15", [H15]>, DwarfRegAlias; -def S16 : ARM64Reg<16, "s16", [H16]>, DwarfRegAlias; -def S17 : ARM64Reg<17, "s17", [H17]>, DwarfRegAlias; -def S18 : ARM64Reg<18, "s18", [H18]>, DwarfRegAlias; -def S19 : ARM64Reg<19, "s19", [H19]>, DwarfRegAlias; -def S20 : ARM64Reg<20, "s20", [H20]>, DwarfRegAlias; -def S21 : ARM64Reg<21, "s21", [H21]>, DwarfRegAlias; -def S22 : ARM64Reg<22, "s22", [H22]>, DwarfRegAlias; -def S23 : ARM64Reg<23, "s23", [H23]>, DwarfRegAlias; -def S24 : ARM64Reg<24, "s24", [H24]>, DwarfRegAlias; -def S25 : ARM64Reg<25, "s25", [H25]>, DwarfRegAlias; -def S26 : ARM64Reg<26, "s26", [H26]>, DwarfRegAlias; -def S27 : ARM64Reg<27, "s27", [H27]>, DwarfRegAlias; -def S28 : ARM64Reg<28, "s28", [H28]>, DwarfRegAlias; -def S29 : ARM64Reg<29, "s29", [H29]>, DwarfRegAlias; -def S30 : ARM64Reg<30, "s30", [H30]>, DwarfRegAlias; -def S31 : ARM64Reg<31, "s31", [H31]>, DwarfRegAlias; -} - -let SubRegIndices = [ssub], RegAltNameIndices = [vreg, vlist1] in { -def D0 : ARM64Reg<0, "d0", [S0], ["v0", ""]>, DwarfRegAlias; -def D1 : ARM64Reg<1, "d1", [S1], ["v1", ""]>, DwarfRegAlias; -def D2 : ARM64Reg<2, "d2", [S2], ["v2", ""]>, DwarfRegAlias; -def D3 : ARM64Reg<3, "d3", [S3], ["v3", ""]>, DwarfRegAlias; -def D4 : ARM64Reg<4, "d4", [S4], ["v4", ""]>, DwarfRegAlias; -def D5 : ARM64Reg<5, "d5", [S5], ["v5", ""]>, DwarfRegAlias; -def D6 : ARM64Reg<6, "d6", [S6], ["v6", ""]>, DwarfRegAlias; -def D7 : ARM64Reg<7, "d7", [S7], ["v7", ""]>, DwarfRegAlias; -def D8 : ARM64Reg<8, "d8", [S8], ["v8", ""]>, DwarfRegAlias; -def D9 : ARM64Reg<9, "d9", [S9], ["v9", ""]>, 
DwarfRegAlias; -def D10 : ARM64Reg<10, "d10", [S10], ["v10", ""]>, DwarfRegAlias; -def D11 : ARM64Reg<11, "d11", [S11], ["v11", ""]>, DwarfRegAlias; -def D12 : ARM64Reg<12, "d12", [S12], ["v12", ""]>, DwarfRegAlias; -def D13 : ARM64Reg<13, "d13", [S13], ["v13", ""]>, DwarfRegAlias; -def D14 : ARM64Reg<14, "d14", [S14], ["v14", ""]>, DwarfRegAlias; -def D15 : ARM64Reg<15, "d15", [S15], ["v15", ""]>, DwarfRegAlias; -def D16 : ARM64Reg<16, "d16", [S16], ["v16", ""]>, DwarfRegAlias; -def D17 : ARM64Reg<17, "d17", [S17], ["v17", ""]>, DwarfRegAlias; -def D18 : ARM64Reg<18, "d18", [S18], ["v18", ""]>, DwarfRegAlias; -def D19 : ARM64Reg<19, "d19", [S19], ["v19", ""]>, DwarfRegAlias; -def D20 : ARM64Reg<20, "d20", [S20], ["v20", ""]>, DwarfRegAlias; -def D21 : ARM64Reg<21, "d21", [S21], ["v21", ""]>, DwarfRegAlias; -def D22 : ARM64Reg<22, "d22", [S22], ["v22", ""]>, DwarfRegAlias; -def D23 : ARM64Reg<23, "d23", [S23], ["v23", ""]>, DwarfRegAlias; -def D24 : ARM64Reg<24, "d24", [S24], ["v24", ""]>, DwarfRegAlias; -def D25 : ARM64Reg<25, "d25", [S25], ["v25", ""]>, DwarfRegAlias; -def D26 : ARM64Reg<26, "d26", [S26], ["v26", ""]>, DwarfRegAlias; -def D27 : ARM64Reg<27, "d27", [S27], ["v27", ""]>, DwarfRegAlias; -def D28 : ARM64Reg<28, "d28", [S28], ["v28", ""]>, DwarfRegAlias; -def D29 : ARM64Reg<29, "d29", [S29], ["v29", ""]>, DwarfRegAlias; -def D30 : ARM64Reg<30, "d30", [S30], ["v30", ""]>, DwarfRegAlias; -def D31 : ARM64Reg<31, "d31", [S31], ["v31", ""]>, DwarfRegAlias; -} - -let SubRegIndices = [dsub], RegAltNameIndices = [vreg, vlist1] in { -def Q0 : ARM64Reg<0, "q0", [D0], ["v0", ""]>, DwarfRegAlias; -def Q1 : ARM64Reg<1, "q1", [D1], ["v1", ""]>, DwarfRegAlias; -def Q2 : ARM64Reg<2, "q2", [D2], ["v2", ""]>, DwarfRegAlias; -def Q3 : ARM64Reg<3, "q3", [D3], ["v3", ""]>, DwarfRegAlias; -def Q4 : ARM64Reg<4, "q4", [D4], ["v4", ""]>, DwarfRegAlias; -def Q5 : ARM64Reg<5, "q5", [D5], ["v5", ""]>, DwarfRegAlias; -def Q6 : ARM64Reg<6, "q6", [D6], ["v6", ""]>, DwarfRegAlias; -def Q7 : ARM64Reg<7, "q7", [D7], ["v7", ""]>, DwarfRegAlias; -def Q8 : ARM64Reg<8, "q8", [D8], ["v8", ""]>, DwarfRegAlias; -def Q9 : ARM64Reg<9, "q9", [D9], ["v9", ""]>, DwarfRegAlias; -def Q10 : ARM64Reg<10, "q10", [D10], ["v10", ""]>, DwarfRegAlias; -def Q11 : ARM64Reg<11, "q11", [D11], ["v11", ""]>, DwarfRegAlias; -def Q12 : ARM64Reg<12, "q12", [D12], ["v12", ""]>, DwarfRegAlias; -def Q13 : ARM64Reg<13, "q13", [D13], ["v13", ""]>, DwarfRegAlias; -def Q14 : ARM64Reg<14, "q14", [D14], ["v14", ""]>, DwarfRegAlias; -def Q15 : ARM64Reg<15, "q15", [D15], ["v15", ""]>, DwarfRegAlias; -def Q16 : ARM64Reg<16, "q16", [D16], ["v16", ""]>, DwarfRegAlias; -def Q17 : ARM64Reg<17, "q17", [D17], ["v17", ""]>, DwarfRegAlias; -def Q18 : ARM64Reg<18, "q18", [D18], ["v18", ""]>, DwarfRegAlias; -def Q19 : ARM64Reg<19, "q19", [D19], ["v19", ""]>, DwarfRegAlias; -def Q20 : ARM64Reg<20, "q20", [D20], ["v20", ""]>, DwarfRegAlias; -def Q21 : ARM64Reg<21, "q21", [D21], ["v21", ""]>, DwarfRegAlias; -def Q22 : ARM64Reg<22, "q22", [D22], ["v22", ""]>, DwarfRegAlias; -def Q23 : ARM64Reg<23, "q23", [D23], ["v23", ""]>, DwarfRegAlias; -def Q24 : ARM64Reg<24, "q24", [D24], ["v24", ""]>, DwarfRegAlias; -def Q25 : ARM64Reg<25, "q25", [D25], ["v25", ""]>, DwarfRegAlias; -def Q26 : ARM64Reg<26, "q26", [D26], ["v26", ""]>, DwarfRegAlias; -def Q27 : ARM64Reg<27, "q27", [D27], ["v27", ""]>, DwarfRegAlias; -def Q28 : ARM64Reg<28, "q28", [D28], ["v28", ""]>, DwarfRegAlias; -def Q29 : ARM64Reg<29, "q29", [D29], ["v29", ""]>, DwarfRegAlias; -def Q30 : ARM64Reg<30, "q30", 
[D30], ["v30", ""]>, DwarfRegAlias; -def Q31 : ARM64Reg<31, "q31", [D31], ["v31", ""]>, DwarfRegAlias; -} - -def FPR8 : RegisterClass<"ARM64", [untyped], 8, (sequence "B%u", 0, 31)> { - let Size = 8; -} -def FPR16 : RegisterClass<"ARM64", [f16], 16, (sequence "H%u", 0, 31)> { - let Size = 16; -} -def FPR32 : RegisterClass<"ARM64", [f32, i32], 32,(sequence "S%u", 0, 31)>; -def FPR64 : RegisterClass<"ARM64", [f64, i64, v2f32, v1f64, v8i8, v4i16, v2i32, - v1i64], - 64, (sequence "D%u", 0, 31)>; -// We don't (yet) have an f128 legal type, so don't use that here. We -// normalize 128-bit vectors to v2f64 for arg passing and such, so use -// that here. -def FPR128 : RegisterClass<"ARM64", - [v16i8, v8i16, v4i32, v2i64, v4f32, v2f64, f128], - 128, (sequence "Q%u", 0, 31)>; - -// The lower 16 vector registers. Some instructions can only take registers -// in this range. -def FPR128_lo : RegisterClass<"ARM64", - [v16i8, v8i16, v4i32, v2i64, v4f32, v2f64], - 128, (trunc FPR128, 16)>; - -// Pairs, triples, and quads of 64-bit vector registers. -def DSeqPairs : RegisterTuples<[dsub0, dsub1], [(rotl FPR64, 0), (rotl FPR64, 1)]>; -def DSeqTriples : RegisterTuples<[dsub0, dsub1, dsub2], - [(rotl FPR64, 0), (rotl FPR64, 1), - (rotl FPR64, 2)]>; -def DSeqQuads : RegisterTuples<[dsub0, dsub1, dsub2, dsub3], - [(rotl FPR64, 0), (rotl FPR64, 1), - (rotl FPR64, 2), (rotl FPR64, 3)]>; -def DD : RegisterClass<"ARM64", [untyped], 64, (add DSeqPairs)> { - let Size = 128; -} -def DDD : RegisterClass<"ARM64", [untyped], 64, (add DSeqTriples)> { - let Size = 196; -} -def DDDD : RegisterClass<"ARM64", [untyped], 64, (add DSeqQuads)> { - let Size = 256; -} - -// Pairs, triples, and quads of 128-bit vector registers. -def QSeqPairs : RegisterTuples<[qsub0, qsub1], [(rotl FPR128, 0), (rotl FPR128, 1)]>; -def QSeqTriples : RegisterTuples<[qsub0, qsub1, qsub2], - [(rotl FPR128, 0), (rotl FPR128, 1), - (rotl FPR128, 2)]>; -def QSeqQuads : RegisterTuples<[qsub0, qsub1, qsub2, qsub3], - [(rotl FPR128, 0), (rotl FPR128, 1), - (rotl FPR128, 2), (rotl FPR128, 3)]>; -def QQ : RegisterClass<"ARM64", [untyped], 128, (add QSeqPairs)> { - let Size = 256; -} -def QQQ : RegisterClass<"ARM64", [untyped], 128, (add QSeqTriples)> { - let Size = 384; -} -def QQQQ : RegisterClass<"ARM64", [untyped], 128, (add QSeqQuads)> { - let Size = 512; -} - - -// Vector operand versions of the FP registers. Alternate name printing and -// assmebler matching. -def VectorReg64AsmOperand : AsmOperandClass { - let Name = "VectorReg64"; - let PredicateMethod = "isVectorReg"; -} -def VectorReg128AsmOperand : AsmOperandClass { - let Name = "VectorReg128"; - let PredicateMethod = "isVectorReg"; -} - -def V64 : RegisterOperand { - let ParserMatchClass = VectorReg64AsmOperand; -} - -def V128 : RegisterOperand { - let ParserMatchClass = VectorReg128AsmOperand; -} - -def VectorRegLoAsmOperand : AsmOperandClass { let Name = "VectorRegLo"; } -def V128_lo : RegisterOperand { - let ParserMatchClass = VectorRegLoAsmOperand; -} - -class TypedVecListAsmOperand - : AsmOperandClass { - let Name = "TypedVectorList" # count # "_" # lanes # kind; - - let PredicateMethod - = "isTypedVectorList<" # count # ", " # lanes # ", '" # kind # "'>"; - let RenderMethod = "addVectorList" # regsize # "Operands<" # count # ">"; -} - -class TypedVecListRegOperand - : RegisterOperand">; - -multiclass VectorList { - // With implicit types (probably on instruction instead). E.g. 
{ v0, v1 } - def _64AsmOperand : AsmOperandClass { - let Name = NAME # "64"; - let PredicateMethod = "isImplicitlyTypedVectorList<" # count # ">"; - let RenderMethod = "addVectorList64Operands<" # count # ">"; - } - - def "64" : RegisterOperand { - let ParserMatchClass = !cast(NAME # "_64AsmOperand"); - } - - def _128AsmOperand : AsmOperandClass { - let Name = NAME # "128"; - let PredicateMethod = "isImplicitlyTypedVectorList<" # count # ">"; - let RenderMethod = "addVectorList128Operands<" # count # ">"; - } - - def "128" : RegisterOperand { - let ParserMatchClass = !cast(NAME # "_128AsmOperand"); - } - - // 64-bit register lists with explicit type. - - // { v0.8b, v1.8b } - def _8bAsmOperand : TypedVecListAsmOperand; - def "8b" : TypedVecListRegOperand { - let ParserMatchClass = !cast(NAME # "_8bAsmOperand"); - } - - // { v0.4h, v1.4h } - def _4hAsmOperand : TypedVecListAsmOperand; - def "4h" : TypedVecListRegOperand { - let ParserMatchClass = !cast(NAME # "_4hAsmOperand"); - } - - // { v0.2s, v1.2s } - def _2sAsmOperand : TypedVecListAsmOperand; - def "2s" : TypedVecListRegOperand { - let ParserMatchClass = !cast(NAME # "_2sAsmOperand"); - } - - // { v0.1d, v1.1d } - def _1dAsmOperand : TypedVecListAsmOperand; - def "1d" : TypedVecListRegOperand { - let ParserMatchClass = !cast(NAME # "_1dAsmOperand"); - } - - // 128-bit register lists with explicit type - - // { v0.16b, v1.16b } - def _16bAsmOperand : TypedVecListAsmOperand; - def "16b" : TypedVecListRegOperand { - let ParserMatchClass = !cast(NAME # "_16bAsmOperand"); - } - - // { v0.8h, v1.8h } - def _8hAsmOperand : TypedVecListAsmOperand; - def "8h" : TypedVecListRegOperand { - let ParserMatchClass = !cast(NAME # "_8hAsmOperand"); - } - - // { v0.4s, v1.4s } - def _4sAsmOperand : TypedVecListAsmOperand; - def "4s" : TypedVecListRegOperand { - let ParserMatchClass = !cast(NAME # "_4sAsmOperand"); - } - - // { v0.2d, v1.2d } - def _2dAsmOperand : TypedVecListAsmOperand; - def "2d" : TypedVecListRegOperand { - let ParserMatchClass = !cast(NAME # "_2dAsmOperand"); - } - - // { v0.b, v1.b } - def _bAsmOperand : TypedVecListAsmOperand; - def "b" : TypedVecListRegOperand { - let ParserMatchClass = !cast(NAME # "_bAsmOperand"); - } - - // { v0.h, v1.h } - def _hAsmOperand : TypedVecListAsmOperand; - def "h" : TypedVecListRegOperand { - let ParserMatchClass = !cast(NAME # "_hAsmOperand"); - } - - // { v0.s, v1.s } - def _sAsmOperand : TypedVecListAsmOperand; - def "s" : TypedVecListRegOperand { - let ParserMatchClass = !cast(NAME # "_sAsmOperand"); - } - - // { v0.d, v1.d } - def _dAsmOperand : TypedVecListAsmOperand; - def "d" : TypedVecListRegOperand { - let ParserMatchClass = !cast(NAME # "_dAsmOperand"); - } - - -} - -defm VecListOne : VectorList<1, FPR64, FPR128>; -defm VecListTwo : VectorList<2, DD, QQ>; -defm VecListThree : VectorList<3, DDD, QQQ>; -defm VecListFour : VectorList<4, DDDD, QQQQ>; - - -// Register operand versions of the scalar FP registers. -def FPR16Op : RegisterOperand; -def FPR32Op : RegisterOperand; -def FPR64Op : RegisterOperand; -def FPR128Op : RegisterOperand; diff --git a/lib/Target/ARM64/ARM64SchedA53.td b/lib/Target/ARM64/ARM64SchedA53.td deleted file mode 100644 index cf1a8202764..00000000000 --- a/lib/Target/ARM64/ARM64SchedA53.td +++ /dev/null @@ -1,291 +0,0 @@ -//=- ARM64SchedA53.td - ARM Cortex-A53 Scheduling Definitions -*- tablegen -*-=// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. 
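The register definitions removed above all follow one compact pattern: a thin Register wrapper that carries the hardware encoding and the target namespace, the 32-bit registers defined first, and the 64-bit registers layered on top as aliases through a sub-register index. A condensed sketch of that pattern, assuming the usual llvm/Target/Target.td context (parameter lists are abbreviated here, not copied verbatim from the deleted file):

    // Thin wrapper: hardware encoding plus the "ARM64" namespace.
    class ARM64Reg<bits<16> enc, string n, list<Register> subregs = []>
        : Register<n> {
      let HWEncoding = enc;
      let Namespace  = "ARM64";
      let SubRegs    = subregs;
    }

    let Namespace = "ARM64" in {
      def sub_32 : SubRegIndex<32>;   // the 32-bit half of a 64-bit GPR
    }

    def W0 : ARM64Reg<0, "w0">, DwarfRegNum<[0]>;
    let SubRegIndices = [sub_32] in
    def X0 : ARM64Reg<0, "x0", [W0]>, DwarfRegAlias<W0>;  // X0 aliases W0

    // Register classes are then just selections over these defs.
    def GPR32common : RegisterClass<"ARM64", [i32], 32, (sequence "W%u", 0, 30)>;

The FPR hierarchy (B/H/S/D/Q) and the D/Q tuple classes stack up the same way, one sub-register index per widening step.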
-// -//===----------------------------------------------------------------------===// -// -// This file defines the itinerary class data for the ARM Cortex A53 processors. -// -//===----------------------------------------------------------------------===// - -// ===---------------------------------------------------------------------===// -// The following definitions describe the simpler per-operand machine model. -// This works with MachineScheduler. See MCSchedModel.h for details. - -// Cortex-A53 machine model for scheduling and other instruction cost heuristics. -def CortexA53Model : SchedMachineModel { - let MicroOpBufferSize = 0; // Explicitly set to zero since A53 is in-order. - let IssueWidth = 2; // 2 micro-ops are dispatched per cycle. - let MinLatency = 1 ; // OperandCycles are interpreted as MinLatency. - let LoadLatency = 3; // Optimistic load latency assuming bypass. - // This is overriden by OperandCycles if the - // Itineraries are queried instead. - let MispredictPenalty = 9; // Based on "Cortex-A53 Software Optimisation - // Specification - Instruction Timings" - // v 1.0 Spreadsheet -} - - -//===----------------------------------------------------------------------===// -// Define each kind of processor resource and number available. - -// Modeling each pipeline as a ProcResource using the BufferSize = 0 since -// Cortex-A53 is in-order. - -def A53UnitALU : ProcResource<2> { let BufferSize = 0; } // Int ALU -def A53UnitMAC : ProcResource<1> { let BufferSize = 0; } // Int MAC -def A53UnitDiv : ProcResource<1> { let BufferSize = 0; } // Int Division -def A53UnitLdSt : ProcResource<1> { let BufferSize = 0; } // Load/Store -def A53UnitB : ProcResource<1> { let BufferSize = 0; } // Branch -def A53UnitFPALU : ProcResource<1> { let BufferSize = 0; } // FP ALU -def A53UnitFPMDS : ProcResource<1> { let BufferSize = 0; } // FP Mult/Div/Sqrt - - -//===----------------------------------------------------------------------===// -// Subtarget-specific SchedWrite types which both map the ProcResources and -// set the latency. - -let SchedModel = CortexA53Model in { - -// ALU - Despite having a full latency of 4, most of the ALU instructions can -// forward a cycle earlier and then two cycles earlier in the case of a -// shift-only instruction. These latencies will be incorrect when the -// result cannot be forwarded, but modeling isn't rocket surgery. -def : WriteRes { let Latency = 3; } -def : WriteRes { let Latency = 3; } -def : WriteRes { let Latency = 3; } -def : WriteRes { let Latency = 3; } -def : WriteRes { let Latency = 2; } -def : WriteRes { let Latency = 3; } - -// MAC -def : WriteRes { let Latency = 4; } -def : WriteRes { let Latency = 4; } - -// Div -def : WriteRes { let Latency = 4; } -def : WriteRes { let Latency = 4; } - -// Load -def : WriteRes { let Latency = 4; } -def : WriteRes { let Latency = 4; } -def : WriteRes { let Latency = 4; } - -// Vector Load - Vector loads take 1-5 cycles to issue. For the WriteVecLd -// below, choosing the median of 3 which makes the latency 6. -// May model this more carefully in the future. The remaining -// A53WriteVLD# types represent the 1-5 cycle issues explicitly. 
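In outline, the A53 model binds the generic write types to its units with WriteRes, and uses ResourceCycles to model multi-cycle issue on the in-order pipeline. The unit assignments below are an illustrative reading of the comments above, not a verbatim excerpt:

    let SchedModel = CortexA53Model in {

    // Plain ALU op: one A53UnitALU slot, result forwarded after 3 cycles.
    def : WriteRes<WriteI, [A53UnitALU]> { let Latency = 3; }

    // Integer load: one load/store slot, 4-cycle latency.
    def : WriteRes<WriteLD, [A53UnitLdSt]> { let Latency = 4; }

    // "Median" vector load: holds the load/store pipe for 3 issue cycles.
    def : WriteRes<WriteVLD, [A53UnitLdSt]> {
      let Latency = 6;
      let ResourceCycles = [3];
    }

    } // SchedModel = CortexA53Model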
-def : WriteRes { let Latency = 6; - let ResourceCycles = [3]; } -def A53WriteVLD1 : SchedWriteRes<[A53UnitLdSt]> { let Latency = 4; } -def A53WriteVLD2 : SchedWriteRes<[A53UnitLdSt]> { let Latency = 5; - let ResourceCycles = [2]; } -def A53WriteVLD3 : SchedWriteRes<[A53UnitLdSt]> { let Latency = 6; - let ResourceCycles = [3]; } -def A53WriteVLD4 : SchedWriteRes<[A53UnitLdSt]> { let Latency = 7; - let ResourceCycles = [4]; } -def A53WriteVLD5 : SchedWriteRes<[A53UnitLdSt]> { let Latency = 8; - let ResourceCycles = [5]; } - -// Pre/Post Indexing - Performed as part of address generation which is already -// accounted for in the WriteST* latencies below -def : WriteRes { let Latency = 0; } - -// Store -def : WriteRes { let Latency = 4; } -def : WriteRes { let Latency = 4; } -def : WriteRes { let Latency = 4; } -def : WriteRes { let Latency = 4; } - -// Vector Store - Similar to vector loads, can take 1-3 cycles to issue. -def : WriteRes { let Latency = 5; - let ResourceCycles = [2];} -def A53WriteVST1 : SchedWriteRes<[A53UnitLdSt]> { let Latency = 4; } -def A53WriteVST2 : SchedWriteRes<[A53UnitLdSt]> { let Latency = 5; - let ResourceCycles = [2]; } -def A53WriteVST3 : SchedWriteRes<[A53UnitLdSt]> { let Latency = 6; - let ResourceCycles = [3]; } - -// Branch -def : WriteRes; -def : WriteRes; -def : WriteRes; -def : WriteRes; -def : WriteRes; - -// FP ALU -def : WriteRes { let Latency = 6; } -def : WriteRes { let Latency = 6; } -def : WriteRes { let Latency = 6; } -def : WriteRes { let Latency = 6; } -def : WriteRes { let Latency = 6; } -def : WriteRes { let Latency = 6; } - -// FP Mul, Div, Sqrt -def : WriteRes { let Latency = 6; } -def : WriteRes { let Latency = 33; - let ResourceCycles = [29]; } -def A53WriteFMAC : SchedWriteRes<[A53UnitFPMDS]> { let Latency = 10; } -def A53WriteFDivSP : SchedWriteRes<[A53UnitFPMDS]> { let Latency = 18; - let ResourceCycles = [14]; } -def A53WriteFDivDP : SchedWriteRes<[A53UnitFPMDS]> { let Latency = 33; - let ResourceCycles = [29]; } -def A53WriteFSqrtSP : SchedWriteRes<[A53UnitFPMDS]> { let Latency = 17; - let ResourceCycles = [13]; } -def A53WriteFSqrtDP : SchedWriteRes<[A53UnitFPMDS]> { let Latency = 32; - let ResourceCycles = [28]; } - -//===----------------------------------------------------------------------===// -// Subtarget-specific SchedRead types. - -// No forwarding for these reads. -def : ReadAdvance; -def : ReadAdvance; -def : ReadAdvance; - -// ALU - Most operands in the ALU pipes are not needed for two cycles. Shiftable -// operands are needed one cycle later if and only if they are to be -// shifted. Otherwise, they too are needed two cycle later. This same -// ReadAdvance applies to Extended registers as well, even though there is -// a seperate SchedPredicate for them. -def : ReadAdvance; -def A53ReadShifted : SchedReadAdvance<1, [WriteImm,WriteI, - WriteISReg, WriteIEReg,WriteIS, - WriteID32,WriteID64, - WriteIM32,WriteIM64]>; -def A53ReadNotShifted : SchedReadAdvance<2, [WriteImm,WriteI, - WriteISReg, WriteIEReg,WriteIS, - WriteID32,WriteID64, - WriteIM32,WriteIM64]>; -def A53ReadISReg : SchedReadVariant<[ - SchedVar, - SchedVar]>; -def : SchedAlias; - -def A53ReadIEReg : SchedReadVariant<[ - SchedVar, - SchedVar]>; -def : SchedAlias; - -// MAC - Operands are generally needed one cycle later in the MAC pipe. -// Accumulator operands are needed two cycles later. 
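A SchedReadVariant then picks between those two advances per instruction, keyed on whether the shiftable operand is actually shifted, and is bound to the generic ReadISReg type. A sketch of that wiring, consistent with the comments above (the exact predicate/read pairing is reconstructed, not quoted):

    def A53ReadISReg : SchedReadVariant<[
      SchedVar<RegShiftedPred, [A53ReadShifted]>,    // operand will be shifted
      SchedVar<NoSchedPred,    [A53ReadNotShifted]>  // plain register operand
    ]>;
    def : SchedAlias<ReadISReg, A53ReadISReg>;       // bind the generic read type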
-def : ReadAdvance; -def : ReadAdvance; - -// Div -def : ReadAdvance; - -//===----------------------------------------------------------------------===// -// Subtarget-specific InstRWs. - -//--- -// Miscellaneous -//--- -def : InstRW<[WriteI], (instrs COPY)>; - -//--- -// Vector Loads -//--- -def : InstRW<[A53WriteVLD1], (instregex "LD1i(8|16|32|64)$")>; -def : InstRW<[A53WriteVLD1], (instregex "LD1Rv(8b|4h|2s|1d|16b|8h|4s|2d)$")>; -def : InstRW<[A53WriteVLD1], (instregex "LD1Onev(8b|4h|2s|1d|16b|8h|4s|2d)$")>; -def : InstRW<[A53WriteVLD2], (instregex "LD1Twov(8b|4h|2s|1d|16b|8h|4s|2d)$")>; -def : InstRW<[A53WriteVLD3], (instregex "LD1Threev(8b|4h|2s|1d|16b|8h|4s|2d)$")>; -def : InstRW<[A53WriteVLD4], (instregex "LD1Fourv(8b|4h|2s|1d|16b|8h|4s|2d)$")>; -def : InstRW<[A53WriteVLD1, WriteAdr], (instregex "LD1i(8|16|32|64)_POST$")>; -def : InstRW<[A53WriteVLD1, WriteAdr], (instregex "LD1Rv(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>; -def : InstRW<[A53WriteVLD1, WriteAdr], (instregex "LD1Onev(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>; -def : InstRW<[A53WriteVLD2, WriteAdr], (instregex "LD1Twov(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>; -def : InstRW<[A53WriteVLD3, WriteAdr], (instregex "LD1Threev(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>; -def : InstRW<[A53WriteVLD4, WriteAdr], (instregex "LD1Fourv(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>; - -def : InstRW<[A53WriteVLD1], (instregex "LD2i(8|16|32|64)$")>; -def : InstRW<[A53WriteVLD1], (instregex "LD2Rv(8b|4h|2s|1d|16b|8h|4s|2d)$")>; -def : InstRW<[A53WriteVLD2], (instregex "LD2Twov(8b|4h|2s)$")>; -def : InstRW<[A53WriteVLD4], (instregex "LD2Twov(16b|8h|4s|2d)$")>; -def : InstRW<[A53WriteVLD1, WriteAdr], (instregex "LD2i(8|16|32|64)(_POST)?$")>; -def : InstRW<[A53WriteVLD1, WriteAdr], (instregex "LD2Rv(8b|4h|2s|1d|16b|8h|4s|2d)(_POST)?$")>; -def : InstRW<[A53WriteVLD2, WriteAdr], (instregex "LD2Twov(8b|4h|2s)(_POST)?$")>; -def : InstRW<[A53WriteVLD4, WriteAdr], (instregex "LD2Twov(16b|8h|4s|2d)(_POST)?$")>; - -def : InstRW<[A53WriteVLD2], (instregex "LD3i(8|16|32|64)$")>; -def : InstRW<[A53WriteVLD2], (instregex "LD3Rv(8b|4h|2s|1d|16b|8h|4s|2d)$")>; -def : InstRW<[A53WriteVLD4], (instregex "LD3Threev(8b|4h|2s|1d|16b|8h|4s)$")>; -def : InstRW<[A53WriteVLD3], (instregex "LD3Threev(2d)$")>; -def : InstRW<[A53WriteVLD2, WriteAdr], (instregex "LD3i(8|16|32|64)_POST$")>; -def : InstRW<[A53WriteVLD2, WriteAdr], (instregex "LD3Rv(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>; -def : InstRW<[A53WriteVLD4, WriteAdr], (instregex "LD3Threev(8b|4h|2s|1d|16b|8h|4s)_POST$")>; -def : InstRW<[A53WriteVLD3, WriteAdr], (instregex "LD3Threev(2d)_POST$")>; - -def : InstRW<[A53WriteVLD2], (instregex "LD4i(8|16|32|64)$")>; -def : InstRW<[A53WriteVLD2], (instregex "LD4Rv(8b|4h|2s|1d|16b|8h|4s|2d)$")>; -def : InstRW<[A53WriteVLD5], (instregex "LD4Fourv(8b|4h|2s|1d|16b|8h|4s)$")>; -def : InstRW<[A53WriteVLD4], (instregex "LD4Fourv(2d)$")>; -def : InstRW<[A53WriteVLD2, WriteAdr], (instregex "LD4i(8|16|32|64)_POST$")>; -def : InstRW<[A53WriteVLD2, WriteAdr], (instregex "LD4Rv(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>; -def : InstRW<[A53WriteVLD5, WriteAdr], (instregex "LD4Fourv(8b|4h|2s|1d|16b|8h|4s)_POST$")>; -def : InstRW<[A53WriteVLD4, WriteAdr], (instregex "LD4Fourv(2d)_POST$")>; - -//--- -// Vector Stores -//--- -def : InstRW<[A53WriteVST1], (instregex "ST1i(8|16|32|64)$")>; -def : InstRW<[A53WriteVST1], (instregex "ST1Onev(8b|4h|2s|1d|16b|8h|4s|2d)$")>; -def : InstRW<[A53WriteVST1], (instregex "ST1Twov(8b|4h|2s|1d|16b|8h|4s|2d)$")>; -def : InstRW<[A53WriteVST2], (instregex "ST1Threev(8b|4h|2s|1d|16b|8h|4s|2d)$")>; -def 
: InstRW<[A53WriteVST2], (instregex "ST1Fourv(8b|4h|2s|1d|16b|8h|4s|2d)$")>; -def : InstRW<[A53WriteVST1, WriteAdr], (instregex "ST1i(8|16|32|64)_POST$")>; -def : InstRW<[A53WriteVST1, WriteAdr], (instregex "ST1Onev(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>; -def : InstRW<[A53WriteVST1, WriteAdr], (instregex "ST1Twov(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>; -def : InstRW<[A53WriteVST2, WriteAdr], (instregex "ST1Threev(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>; -def : InstRW<[A53WriteVST2, WriteAdr], (instregex "ST1Fourv(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>; - -def : InstRW<[A53WriteVST1], (instregex "ST2i(8|16|32|64)$")>; -def : InstRW<[A53WriteVST1], (instregex "ST2Twov(8b|4h|2s)$")>; -def : InstRW<[A53WriteVST2], (instregex "ST2Twov(16b|8h|4s|2d)$")>; -def : InstRW<[A53WriteVST1, WriteAdr], (instregex "ST2i(8|16|32|64)_POST$")>; -def : InstRW<[A53WriteVST1, WriteAdr], (instregex "ST2Twov(8b|4h|2s)_POST$")>; -def : InstRW<[A53WriteVST2, WriteAdr], (instregex "ST2Twov(16b|8h|4s|2d)_POST$")>; - -def : InstRW<[A53WriteVST2], (instregex "ST3i(8|16|32|64)$")>; -def : InstRW<[A53WriteVST3], (instregex "ST3Threev(8b|4h|2s|1d|16b|8h|4s)$")>; -def : InstRW<[A53WriteVST2], (instregex "ST3Threev(2d)$")>; -def : InstRW<[A53WriteVST2, WriteAdr], (instregex "ST3i(8|16|32|64)_POST$")>; -def : InstRW<[A53WriteVST3, WriteAdr], (instregex "ST3Threev(8b|4h|2s|1d|16b|8h|4s)_POST$")>; -def : InstRW<[A53WriteVST2, WriteAdr], (instregex "ST3Threev(2d)_POST$")>; - -def : InstRW<[A53WriteVST2], (instregex "ST4i(8|16|32|64)$")>; -def : InstRW<[A53WriteVST3], (instregex "ST4Fourv(8b|4h|2s|1d|16b|8h|4s)$")>; -def : InstRW<[A53WriteVST2], (instregex "ST4Fourv(2d)$")>; -def : InstRW<[A53WriteVST2, WriteAdr], (instregex "ST4i(8|16|32|64)_POST$")>; -def : InstRW<[A53WriteVST3, WriteAdr], (instregex "ST4Fourv(8b|4h|2s|1d|16b|8h|4s)_POST$")>; -def : InstRW<[A53WriteVST2, WriteAdr], (instregex "ST4Fourv(2d)_POST$")>; - -//--- -// Floating Point MAC, DIV, SQRT -//--- -def : InstRW<[A53WriteFMAC], (instregex "^FN?M(ADD|SUB).*")>; -def : InstRW<[A53WriteFMAC], (instregex "^FML(A|S).*")>; -def : InstRW<[A53WriteFDivSP], (instrs FDIVSrr)>; -def : InstRW<[A53WriteFDivDP], (instrs FDIVDrr)>; -def : InstRW<[A53WriteFDivSP], (instregex "^FDIVv.*32$")>; -def : InstRW<[A53WriteFDivDP], (instregex "^FDIVv.*64$")>; -def : InstRW<[A53WriteFSqrtSP], (instregex "^.*SQRT.*32$")>; -def : InstRW<[A53WriteFSqrtDP], (instregex "^.*SQRT.*64$")>; - -} diff --git a/lib/Target/ARM64/ARM64SchedCyclone.td b/lib/Target/ARM64/ARM64SchedCyclone.td deleted file mode 100644 index c04a7bb8baf..00000000000 --- a/lib/Target/ARM64/ARM64SchedCyclone.td +++ /dev/null @@ -1,865 +0,0 @@ -//=- ARMSchedCyclone.td - ARM64 Cyclone Scheduling Defs ------*- tablegen -*-=// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -// This file defines the machine model for ARM64 Cyclone to support -// instruction scheduling and other instruction cost heuristics. -// -//===----------------------------------------------------------------------===// - -def CycloneModel : SchedMachineModel { - let IssueWidth = 6; // 6 micro-ops are dispatched per cycle. - let MicroOpBufferSize = 192; // Based on the reorder buffer. - let LoadLatency = 4; // Optimistic load latency. - let MispredictPenalty = 16; // 14-19 cycles are typical. 
-} - -//===----------------------------------------------------------------------===// -// Define each kind of processor resource and number available on Cyclone. - -// 4 integer pipes -def CyUnitI : ProcResource<4> { - let BufferSize = 48; -} - -// 2 branch units: I[0..1] -def CyUnitB : ProcResource<2> { - let Super = CyUnitI; - let BufferSize = 24; -} - -// 1 indirect-branch unit: I[0] -def CyUnitBR : ProcResource<1> { - let Super = CyUnitB; -} - -// 2 shifter pipes: I[2..3] -// When an instruction consumes a CyUnitIS, it also consumes a CyUnitI -def CyUnitIS : ProcResource<2> { - let Super = CyUnitI; - let BufferSize = 24; -} - -// 1 mul pipe: I[0] -def CyUnitIM : ProcResource<1> { - let Super = CyUnitBR; - let BufferSize = 32; -} - -// 1 div pipe: I[1] -def CyUnitID : ProcResource<1> { - let Super = CyUnitB; - let BufferSize = 16; -} - -// 1 integer division unit. This is driven by the ID pipe, but only -// consumes the pipe for one cycle at issue and another cycle at writeback. -def CyUnitIntDiv : ProcResource<1>; - -// 2 ld/st pipes. -def CyUnitLS : ProcResource<2> { - let BufferSize = 28; -} - -// 3 fp/vector pipes. -def CyUnitV : ProcResource<3> { - let BufferSize = 48; -} -// 2 fp/vector arithmetic and multiply pipes: V[0-1] -def CyUnitVM : ProcResource<2> { - let Super = CyUnitV; - let BufferSize = 32; -} -// 1 fp/vector division/sqrt pipe: V[2] -def CyUnitVD : ProcResource<1> { - let Super = CyUnitV; - let BufferSize = 16; -} -// 1 fp compare pipe: V[0] -def CyUnitVC : ProcResource<1> { - let Super = CyUnitVM; - let BufferSize = 16; -} - -// 2 fp division/square-root units. These are driven by the VD pipe, -// but only consume the pipe for one cycle at issue and a cycle at writeback. -def CyUnitFloatDiv : ProcResource<2>; - -//===----------------------------------------------------------------------===// -// Define scheduler read/write resources and latency on Cyclone. -// This mirrors sections 7.7-7.9 of the Tuning Guide v1.0.1. - -let SchedModel = CycloneModel in { - -//--- -// 7.8.1. Moves -//--- - -// A single nop micro-op (uX). -def WriteX : SchedWriteRes<[]> { let Latency = 0; } - -// Move zero is a register rename (to machine register zero). -// The move is replaced by a single nop micro-op. -// MOVZ Rd, #0 -// AND Rd, Rzr, #imm -def WriteZPred : SchedPredicate<[{TII->isGPRZero(MI)}]>; -def WriteImmZ : SchedWriteVariant<[ - SchedVar, - SchedVar]>; -def : InstRW<[WriteImmZ], (instrs MOVZWi,MOVZXi,ANDWri,ANDXri)>; - -// Move GPR is a register rename and single nop micro-op. -// ORR Xd, XZR, Xm -// ADD Xd, Xn, #0 -def WriteIMovPred : SchedPredicate<[{TII->isGPRCopy(MI)}]>; -def WriteVMovPred : SchedPredicate<[{TII->isFPRCopy(MI)}]>; -def WriteMov : SchedWriteVariant<[ - SchedVar, - SchedVar, - SchedVar]>; -def : InstRW<[WriteMov], (instrs COPY,ORRXrr,ADDXrr)>; - -// Move non-zero immediate is an integer ALU op. -// MOVN,MOVZ,MOVK -def : WriteRes; - -//--- -// 7.8.2-7.8.5. Arithmetic and Logical, Comparison, Conditional, -// Shifts and Bitfield Operations -//--- - -// ADR,ADRP -// ADD(S)ri,SUB(S)ri,AND(S)ri,EORri,ORRri -// ADD(S)rr,SUB(S)rr,AND(S)rr,BIC(S)rr,EONrr,EORrr,ORNrr,ORRrr -// ADC(S),SBC(S) -// Aliases: CMN, CMP, TST -// -// Conditional operations. -// CCMNi,CCMPi,CCMNr,CCMPr, -// CSEL,CSINC,CSINV,CSNEG -// -// Bit counting and reversal operations. -// CLS,CLZ,RBIT,REV,REV16,REV32 -def : WriteRes; - -// ADD with shifted register operand is a single micro-op that -// consumes a shift pipeline for two cycles. 
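Costed out, that is a WriteRes on one of the shifter pipes with a two-cycle occupancy; the unit binding here is an assumption based on the pipe descriptions above rather than a quotation:

    // ADD/SUB/logical ops with a shifted-register operand: one micro-op that
    // occupies a CyUnitIS shifter pipe for two cycles.
    def : WriteRes<WriteISReg, [CyUnitIS]> {
      let Latency = 2;
      let ResourceCycles = [2];
    }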
-// ADD(S)rs,SUB(S)rs,AND(S)rs,BIC(S)rs,EONrs,EORrs,ORNrs,ORRrs -// EXAMPLE: ADDrs Xn, Xm LSL #imm -def : WriteRes { - let Latency = 2; - let ResourceCycles = [2]; -} - -// ADD with extended register operand is the same as shifted reg operand. -// ADD(S)re,SUB(S)re -// EXAMPLE: ADDXre Xn, Xm, UXTB #1 -def : WriteRes { - let Latency = 2; - let ResourceCycles = [2]; -} - -// Variable shift and bitfield operations. -// ASRV,LSLV,LSRV,RORV,BFM,SBFM,UBFM -def : WriteRes; - -// EXTR Shifts a pair of registers and requires two micro-ops. -// The second micro-op is delayed, as modeled by ReadExtrHi. -// EXTR Xn, Xm, #imm -def : WriteRes { - let Latency = 2; - let NumMicroOps = 2; -} - -// EXTR's first register read is delayed by one cycle, effectively -// shortening its writer's latency. -// EXTR Xn, Xm, #imm -def : ReadAdvance; - -//--- -// 7.8.6. Multiplies -//--- - -// MUL/MNEG are aliases for MADD/MSUB. -// MADDW,MSUBW,SMADDL,SMSUBL,UMADDL,UMSUBL -def : WriteRes { - let Latency = 4; -} -// MADDX,MSUBX,SMULH,UMULH -def : WriteRes { - let Latency = 5; -} - -//--- -// 7.8.7. Divide -//--- - -// 32-bit divide takes 7-13 cycles. 10 cycles covers a 20-bit quotient. -// The ID pipe is consumed for 2 cycles: issue and writeback. -// SDIVW,UDIVW -def : WriteRes { - let Latency = 10; - let ResourceCycles = [2, 10]; -} -// 64-bit divide takes 7-21 cycles. 13 cycles covers a 32-bit quotient. -// The ID pipe is consumed for 2 cycles: issue and writeback. -// SDIVX,UDIVX -def : WriteRes { - let Latency = 13; - let ResourceCycles = [2, 13]; -} - -//--- -// 7.8.8,7.8.10. Load/Store, single element -//--- - -// Integer loads take 4 cycles and use one LS unit for one cycle. -def : WriteRes { - let Latency = 4; -} - -// Store-load forwarding is 4 cycles. -// -// Note: The store-exclusive sequence incorporates this -// latency. However, general heuristics should not model the -// dependence between a store and subsequent may-alias load because -// hardware speculation works. -def : WriteRes { - let Latency = 4; -} - -// Load from base address plus an optionally scaled register offset. -// Rt latency is latency WriteIS + WriteLD. -// EXAMPLE: LDR Xn, Xm [, lsl 3] -def CyWriteLDIdx : SchedWriteVariant<[ - SchedVar, // Load from scaled register. - SchedVar]>; // Load from register offset. -def : SchedAlias; // Map ARM64->Cyclone type. - -// EXAMPLE: STR Xn, Xm [, lsl 3] -def CyWriteSTIdx : SchedWriteVariant<[ - SchedVar, // Store to scaled register. - SchedVar]>; // Store to register offset. -def : SchedAlias; // Map ARM64->Cyclone type. - -// Read the (unshifted) base register Xn in the second micro-op one cycle later. -// EXAMPLE: LDR Xn, Xm [, lsl 3] -def ReadBaseRS : SchedReadAdvance<1>; -def CyReadAdrBase : SchedReadVariant<[ - SchedVar, // Read base reg after shifting offset. - SchedVar]>; // Read base reg with no shift. -def : SchedAlias; // Map ARM64->Cyclone type. - -//--- -// 7.8.9,7.8.11. Load/Store, paired -//--- - -// Address pre/post increment is a simple ALU op with one cycle latency. -def : WriteRes; - -// LDP high register write is fused with the load, but a nop micro-op remains. -def : WriteRes { - let Latency = 4; -} - -// STP is a vector op and store, except for QQ, which is just two stores. -def : SchedAlias; -def : InstRW<[WriteST, WriteST], (instrs STPQi)>; - -//--- -// 7.8.13. Branches -//--- - -// Branches take a single micro-op. -// The misprediction penalty is defined as a SchedMachineModel property. -def : WriteRes {let Latency = 0;} -def : WriteRes {let Latency = 0;} - -//--- -// 7.8.14. 
Never-issued Instructions, Barrier and Hint Operations -//--- - -// NOP,SEV,SEVL,WFE,WFI,YIELD -def : WriteRes {let Latency = 0;} -// ISB -def : InstRW<[WriteI], (instrs ISB)>; -// SLREX,DMB,DSB -def : WriteRes; - -// System instructions get an invalid latency because the latency of -// other operations across them is meaningless. -def : WriteRes {let Latency = -1;} - -//===----------------------------------------------------------------------===// -// 7.9 Vector Unit Instructions - -// Simple vector operations take 2 cycles. -def : WriteRes {let Latency = 2;} - -// Define some longer latency vector op types for Cyclone. -def CyWriteV3 : SchedWriteRes<[CyUnitV]> {let Latency = 3;} -def CyWriteV4 : SchedWriteRes<[CyUnitV]> {let Latency = 4;} -def CyWriteV5 : SchedWriteRes<[CyUnitV]> {let Latency = 5;} -def CyWriteV6 : SchedWriteRes<[CyUnitV]> {let Latency = 6;} - -// Simple floating-point operations take 2 cycles. -def : WriteRes {let Latency = 2;} - -//--- -// 7.9.1 Vector Moves -//--- - -// TODO: Add Cyclone-specific zero-cycle zeros. LLVM currently -// generates expensive int-float conversion instead: -// FMOVDi Dd, #0.0 -// FMOVv2f64ns Vd.2d, #0.0 - -// FMOVSi,FMOVDi -def : WriteRes {let Latency = 2;} - -// MOVI,MVNI are WriteV -// FMOVv2f32ns,FMOVv2f64ns,FMOVv4f32ns are WriteV - -// Move FPR is a register rename and single nop micro-op. -// ORR.16b Vd,Vn,Vn -// COPY is handled above in the WriteMov Variant. -def WriteVMov : SchedWriteVariant<[ - SchedVar, - SchedVar]>; -def : InstRW<[WriteVMov], (instrs ORRv16i8)>; - -// FMOVSr,FMOVDr are WriteF. - -// MOV V,V is a WriteV. - -// CPY D,V[x] is a WriteV - -// INS V[x],V[y] is a WriteV. - -// FMOVWSr,FMOVXDr,FMOVXDHighr -def : WriteRes { - let Latency = 5; -} - -// FMOVSWr,FMOVDXr -def : InstRW<[WriteLD], (instrs FMOVSWr,FMOVDXr,FMOVDXHighr)>; - -// INS V[x],R -def CyWriteCopyToFPR : WriteSequence<[WriteVLD, WriteV]>; -def : InstRW<[CyWriteCopyToFPR], (instregex "INSv")>; - -// SMOV,UMOV R,V[x] -def CyWriteCopyToGPR : WriteSequence<[WriteLD, WriteI]>; -def : InstRW<[CyWriteCopyToGPR], (instregex "SMOVv","UMOVv")>; - -// DUP V,R -def : InstRW<[CyWriteCopyToFPR], (instregex "DUPv")>; - -// DUP V,V[x] is a WriteV. - -//--- -// 7.9.2 Integer Arithmetic, Logical, and Comparisons -//--- - -// BIC,ORR V,#imm are WriteV - -def : InstRW<[CyWriteV3], (instregex "ABSv")>; - -// MVN,NEG,NOT are WriteV - -def : InstRW<[CyWriteV3], (instregex "SQABSv","SQNEGv")>; - -// ADDP is a WriteV. -def CyWriteVADDLP : SchedWriteRes<[CyUnitV]> {let Latency = 2;} -def : InstRW<[CyWriteVADDLP], (instregex "SADDLPv","UADDLPv")>; - -def : InstRW<[CyWriteV3], - (instregex "ADDVv","SMAXVv","UMAXVv","SMINVv","UMINVv")>; - -def : InstRW<[CyWriteV3], (instregex "SADDLV","UADDLV")>; - -// ADD,SUB are WriteV - -// Forward declare. -def CyWriteVABD : SchedWriteRes<[CyUnitV]> {let Latency = 3;} - -// Add/Diff and accumulate uses the vector multiply unit. 
-def CyWriteVAccum : SchedWriteRes<[CyUnitVM]> {let Latency = 3;} -def CyReadVAccum : SchedReadAdvance<1, - [CyWriteVAccum, CyWriteVADDLP, CyWriteVABD]>; - -def : InstRW<[CyWriteVAccum, CyReadVAccum], - (instregex "SADALP","UADALP")>; - -def : InstRW<[CyWriteVAccum, CyReadVAccum], - (instregex "SABAv","UABAv","SABALv","UABALv")>; - -def : InstRW<[CyWriteV3], (instregex "SQADDv","SQSUBv","UQADDv","UQSUBv")>; - -def : InstRW<[CyWriteV3], (instregex "SUQADDv","USQADDv")>; - -def : InstRW<[CyWriteV4], (instregex "ADDHNv","RADDHNv", "RSUBHNv", "SUBHNv")>; - -// WriteV includes: -// AND,BIC,CMTST,EOR,ORN,ORR -// ADDP -// SHADD,SHSUB,SRHADD,UHADD,UHSUB,URHADD -// SADDL,SSUBL,UADDL,USUBL -// SADDW,SSUBW,UADDW,USUBW - -def : InstRW<[CyWriteV3], (instregex "CMEQv","CMGEv","CMGTv", - "CMLEv","CMLTv", - "CMHIv","CMHSv")>; - -def : InstRW<[CyWriteV3], (instregex "SMAXv","SMINv","UMAXv","UMINv", - "SMAXPv","SMINPv","UMAXPv","UMINPv")>; - -def : InstRW<[CyWriteVABD], (instregex "SABDv","UABDv", - "SABDLv","UABDLv")>; - -//--- -// 7.9.3 Floating Point Arithmetic and Comparisons -//--- - -// FABS,FNEG are WriteF - -def : InstRW<[CyWriteV4], (instrs FADDPv2i32p)>; -def : InstRW<[CyWriteV5], (instrs FADDPv2i64p)>; - -def : InstRW<[CyWriteV3], (instregex "FMAXPv2i","FMAXNMPv2i", - "FMINPv2i","FMINNMPv2i")>; - -def : InstRW<[CyWriteV4], (instregex "FMAXVv","FMAXNMVv","FMINVv","FMINNMVv")>; - -def : InstRW<[CyWriteV4], (instrs FADDSrr,FADDv2f32,FADDv4f32, - FSUBSrr,FSUBv2f32,FSUBv4f32, - FADDPv2f32,FADDPv4f32, - FABD32,FABDv2f32,FABDv4f32)>; -def : InstRW<[CyWriteV5], (instrs FADDDrr,FADDv2f64, - FSUBDrr,FSUBv2f64, - FADDPv2f64, - FABD64,FABDv2f64)>; - -def : InstRW<[CyWriteV3], (instregex "FCMEQ","FCMGT","FCMLE","FCMLT")>; - -def : InstRW<[CyWriteV3], (instregex "FACGE","FACGT", - "FMAXS","FMAXD","FMAXv", - "FMINS","FMIND","FMINv", - "FMAXNMS","FMAXNMD","FMAXNMv", - "FMINNMS","FMINNMD","FMINNMv", - "FMAXPv2f","FMAXPv4f", - "FMINPv2f","FMINPv4f", - "FMAXNMPv2f","FMAXNMPv4f", - "FMINNMPv2f","FMINNMPv4f")>; - -// FCMP,FCMPE,FCCMP,FCCMPE -def : WriteRes {let Latency = 4;} - -// FCSEL is a WriteF. - -//--- -// 7.9.4 Shifts and Bitfield Operations -//--- - -// SHL is a WriteV - -def CyWriteVSHR : SchedWriteRes<[CyUnitV]> {let Latency = 2;} -def : InstRW<[CyWriteVSHR], (instregex "SSHRv","USHRv")>; - -def CyWriteVSRSHR : SchedWriteRes<[CyUnitV]> {let Latency = 3;} -def : InstRW<[CyWriteVSRSHR], (instregex "SRSHRv","URSHRv")>; - -// Shift and accumulate uses the vector multiply unit. -def CyWriteVShiftAcc : SchedWriteRes<[CyUnitVM]> {let Latency = 3;} -def CyReadVShiftAcc : SchedReadAdvance<1, - [CyWriteVShiftAcc, CyWriteVSHR, CyWriteVSRSHR]>; -def : InstRW<[CyWriteVShiftAcc, CyReadVShiftAcc], - (instregex "SRSRAv","SSRAv","URSRAv","USRAv")>; - -// SSHL,USHL are WriteV. - -def : InstRW<[CyWriteV3], (instregex "SRSHLv","URSHLv")>; - -// SQSHL,SQSHLU,UQSHL are WriteV. - -def : InstRW<[CyWriteV3], (instregex "SQRSHLv","UQRSHLv")>; - -// WriteV includes: -// SHLL,SSHLL,USHLL -// SLI,SRI -// BIF,BIT,BSL -// EXT -// CLS,CLZ,CNT,RBIT,REV16,REV32,REV64,XTN -// XTN2 - -def : InstRW<[CyWriteV4], - (instregex "RSHRNv","SHRNv", - "SQRSHRNv","SQRSHRUNv","SQSHRNv","SQSHRUNv", - "UQRSHRNv","UQSHRNv","SQXTNv","SQXTUNv","UQXTNv")>; - -//--- -// 7.9.5 Multiplication -//--- - -def CyWriteVMul : SchedWriteRes<[CyUnitVM]> { let Latency = 4;} -def : InstRW<[CyWriteVMul], (instregex "MULv","SMULLv","UMULLv", - "SQDMULLv","SQDMULHv","SQRDMULHv")>; - -// FMUL,FMULX,FNMUL default to WriteFMul. 
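Those multiplies land on a VM pipe with a 4-cycle latency, and the accumulator operand of the multiply-accumulates is read one cycle later than the other operands, so a dependent accumulate chain sees one cycle less of that latency. A sketch of the combination (the WriteFMul unit binding is inferred, and the instregex list is shortened):

    def : WriteRes<WriteFMul, [CyUnitVM]> { let Latency = 4; }
    // Accumulator forwarding: MLA/MLS need their addend one cycle later.
    def CyReadVMulAcc : SchedReadAdvance<1, [CyWriteVMul, CyWriteV64Mul]>;
    def : InstRW<[CyWriteVMul, CyReadVMulAcc], (instregex "MLA", "MLS")>;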
-def : WriteRes { let Latency = 4;} - -def CyWriteV64Mul : SchedWriteRes<[CyUnitVM]> { let Latency = 5;} -def : InstRW<[CyWriteV64Mul], (instrs FMULDrr,FMULv2f64,FMULv2i64_indexed, - FNMULDrr,FMULX64,FMULXv2f64,FMULXv2i64_indexed)>; - -def CyReadVMulAcc : SchedReadAdvance<1, [CyWriteVMul, CyWriteV64Mul]>; -def : InstRW<[CyWriteVMul, CyReadVMulAcc], - (instregex "MLA","MLS","SMLAL","SMLSL","UMLAL","UMLSL", - "SQDMLAL","SQDMLSL")>; - -def CyWriteSMul : SchedWriteRes<[CyUnitVM]> { let Latency = 8;} -def CyWriteDMul : SchedWriteRes<[CyUnitVM]> { let Latency = 10;} -def CyReadSMul : SchedReadAdvance<4, [CyWriteSMul]>; -def CyReadDMul : SchedReadAdvance<5, [CyWriteDMul]>; - -def : InstRW<[CyWriteSMul, CyReadSMul], - (instrs FMADDSrrr,FMSUBSrrr,FNMADDSrrr,FNMSUBSrrr, - FMLAv2f32,FMLAv4f32, - FMLAv1i32_indexed,FMLAv1i64_indexed,FMLAv2i32_indexed)>; -def : InstRW<[CyWriteDMul, CyReadDMul], - (instrs FMADDDrrr,FMSUBDrrr,FNMADDDrrr,FNMSUBDrrr, - FMLAv2f64,FMLAv2i64_indexed, - FMLSv2f64,FMLSv2i64_indexed)>; - -def CyWritePMUL : SchedWriteRes<[CyUnitVD]> { let Latency = 3; } -def : InstRW<[CyWritePMUL], (instregex "PMULv", "PMULLv")>; - -//--- -// 7.9.6 Divide and Square Root -//--- - -// FDIV,FSQRT -// TODO: Add 64-bit variant with 19 cycle latency. -// TODO: Specialize FSQRT for longer latency. -def : WriteRes { - let Latency = 17; - let ResourceCycles = [2, 17]; -} - -def : InstRW<[CyWriteV4], (instregex "FRECPEv","FRECPXv","URECPEv","URSQRTEv")>; - -def WriteFRSQRTE : SchedWriteRes<[CyUnitVM]> { let Latency = 4; } -def : InstRW<[WriteFRSQRTE], (instregex "FRSQRTEv")>; - -def WriteFRECPS : SchedWriteRes<[CyUnitVM]> { let Latency = 8; } -def WriteFRSQRTS : SchedWriteRes<[CyUnitVM]> { let Latency = 10; } -def : InstRW<[WriteFRECPS], (instregex "FRECPSv")>; -def : InstRW<[WriteFRSQRTS], (instregex "FRSQRTSv")>; - -//--- -// 7.9.7 Integer-FP Conversions -//--- - -// FCVT lengthen f16/s32 -def : InstRW<[WriteV], (instrs FCVTSHr,FCVTDHr,FCVTDSr)>; - -// FCVT,FCVTN,FCVTXN -// SCVTF,UCVTF V,V -// FRINT(AIMNPXZ) V,V -def : WriteRes {let Latency = 4;} - -// SCVT/UCVT S/D, Rd = VLD5+V4: 9 cycles. -def CyWriteCvtToFPR : WriteSequence<[WriteVLD, CyWriteV4]>; -def : InstRW<[CyWriteCopyToFPR], (instregex "FCVT[AMNPZ][SU][SU][WX][SD]r")>; - -// FCVT Rd, S/D = V6+LD4: 10 cycles -def CyWriteCvtToGPR : WriteSequence<[CyWriteV6, WriteLD]>; -def : InstRW<[CyWriteCvtToGPR], (instregex "[SU]CVTF[SU][WX][SD]r")>; - -// FCVTL is a WriteV - -//--- -// 7.9.8-7.9.10 Cryptography, Data Transposition, Table Lookup -//--- - -def CyWriteCrypto2 : SchedWriteRes<[CyUnitVD]> {let Latency = 2;} -def : InstRW<[CyWriteCrypto2], (instrs AESIMCrr, AESMCrr, SHA1Hrr, - AESDrr, AESErr, SHA1SU1rr, SHA256SU0rr, - SHA1SU0rrr)>; - -def CyWriteCrypto3 : SchedWriteRes<[CyUnitVD]> {let Latency = 3;} -def : InstRW<[CyWriteCrypto3], (instrs SHA256SU1rrr)>; - -def CyWriteCrypto6 : SchedWriteRes<[CyUnitVD]> {let Latency = 6;} -def : InstRW<[CyWriteCrypto6], (instrs SHA1Crrr, SHA1Mrrr, SHA1Prrr, - SHA256Hrrr,SHA256H2rrr)>; - -// TRN,UZP,ZUP are WriteV. - -// TBL,TBX are WriteV. - -//--- -// 7.9.11-7.9.14 Load/Store, single element and paired -//--- - -// Loading into the vector unit takes 5 cycles vs 4 for integer loads. -def : WriteRes { - let Latency = 5; -} - -// Store-load forwarding is 4 cycles. -def : WriteRes { - let Latency = 4; -} - -// WriteVLDPair/VSTPair sequences are expanded by the target description. - -//--- -// 7.9.15 Load, element operations -//--- - -// Only the first WriteVLD and WriteAdr for writeback matches def operands. 
-// Subsequent WriteVLDs consume resources. Since all loaded values have the -// same latency, this is acceptable. - -// Vd is read 5 cycles after issuing the vector load. -def : ReadAdvance; - -def : InstRW<[WriteVLD], - (instregex "LD1Onev(8b|4h|2s|1d|16b|8h|4s|2d)$")>; -def : InstRW<[WriteVLD, WriteAdr], - (instregex "LD1Onev(8b|4h|2s|1d|16b|8h|4s|2d)_POST")>; - -// Register writes from the load's high half are fused micro-ops. -def : InstRW<[WriteVLD], - (instregex "LD1Twov(8b|4h|2s|1d)$")>; -def : InstRW<[WriteVLD, WriteAdr], - (instregex "LD1Twov(8b|4h|2s|1d)_POST")>; -def : InstRW<[WriteVLD, WriteVLD], - (instregex "LD1Twov(16b|8h|4s|2d)$")>; -def : InstRW<[WriteVLD, WriteAdr, WriteVLD], - (instregex "LD1Twov(16b|8h|4s|2d)_POST")>; - -def : InstRW<[WriteVLD, WriteVLD], - (instregex "LD1Threev(8b|4h|2s|1d)$")>; -def : InstRW<[WriteVLD, WriteAdr, WriteVLD], - (instregex "LD1Threev(8b|4h|2s|1d)_POST")>; -def : InstRW<[WriteVLD, WriteVLD, WriteVLD], - (instregex "LD1Threev(16b|8h|4s|2d)$")>; -def : InstRW<[WriteVLD, WriteAdr, WriteVLD, WriteVLD], - (instregex "LD1Threev(16b|8h|4s|2d)_POST")>; - -def : InstRW<[WriteVLD, WriteVLD], - (instregex "LD1Fourv(8b|4h|2s|1d)$")>; -def : InstRW<[WriteVLD, WriteAdr, WriteVLD], - (instregex "LD1Fourv(8b|4h|2s|1d)_POST")>; -def : InstRW<[WriteVLD, WriteVLD, WriteVLD, WriteVLD], - (instregex "LD1Fourv(16b|8h|4s|2d)$")>; -def : InstRW<[WriteVLD, WriteAdr, WriteVLD, WriteVLD, WriteVLD], - (instregex "LD1Fourv(16b|8h|4s|2d)_POST")>; - -def : InstRW<[WriteVLDShuffle, ReadVLD], - (instregex "LD1i(8|16|32)$")>; -def : InstRW<[WriteVLDShuffle, ReadVLD, WriteAdr], - (instregex "LD1i(8|16|32)_POST")>; - -def : InstRW<[WriteVLDShuffle, ReadVLD], (instrs LD1i64)>; -def : InstRW<[WriteVLDShuffle, ReadVLD, WriteAdr],(instrs LD1i64_POST)>; - -def : InstRW<[WriteVLDShuffle], - (instregex "LD1Rv(8b|4h|2s|1d|16b|8h|4s|2d)$")>; -def : InstRW<[WriteVLDShuffle, WriteAdr], - (instregex "LD1Rv(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>; - -def : InstRW<[WriteVLDShuffle, WriteV], - (instregex "LD2Twov(8b|4h|2s)$")>; -def : InstRW<[WriteVLDShuffle, WriteAdr, WriteV], - (instregex "LD2Twov(8b|4h|2s)_POST$")>; -def : InstRW<[WriteVLDShuffle, WriteVLDShuffle], - (instregex "LD2Twov(16b|8h|4s|2d)$")>; -def : InstRW<[WriteVLDShuffle, WriteAdr, WriteVLDShuffle], - (instregex "LD2Twov(16b|8h|4s|2d)_POST")>; - -def : InstRW<[WriteVLDShuffle, ReadVLD, WriteV], - (instregex "LD2i(8|16|32)$")>; -def : InstRW<[WriteVLDShuffle, ReadVLD, WriteAdr, WriteV], - (instregex "LD2i(8|16|32)_POST")>; -def : InstRW<[WriteVLDShuffle, ReadVLD, WriteV], - (instregex "LD2i64$")>; -def : InstRW<[WriteVLDShuffle, ReadVLD, WriteAdr, WriteV], - (instregex "LD2i64_POST")>; - -def : InstRW<[WriteVLDShuffle, WriteV], - (instregex "LD2Rv(8b|4h|2s|1d|16b|8h|4s|2d)$")>; -def : InstRW<[WriteVLDShuffle, WriteAdr, WriteV], - (instregex "LD2Rv(8b|4h|2s|1d|16b|8h|4s|2d)_POST")>; - -def : InstRW<[WriteVLDShuffle, WriteVLDShuffle, WriteV], - (instregex "LD3Threev(8b|4h|2s)$")>; -def : InstRW<[WriteVLDShuffle, WriteAdr, WriteVLDShuffle, WriteV], - (instregex "LD3Threev(8b|4h|2s)_POST")>; -def : InstRW<[WriteVLDShuffle, WriteVLDShuffle, WriteVLDShuffle], - (instregex "LD3Threev(16b|8h|4s|2d)$")>; -def : InstRW<[WriteVLDShuffle, WriteAdr, WriteVLDShuffle, WriteVLDShuffle], - (instregex "LD3Threev(16b|8h|4s|2d)_POST")>; - -def : InstRW<[WriteVLDShuffle, ReadVLD, WriteV, WriteV], - (instregex "LD3i(8|16|32)$")>; -def : InstRW<[WriteVLDShuffle, ReadVLD, WriteAdr, WriteV, WriteV], - (instregex "LD3i(8|16|32)_POST")>; - -def : 
InstRW<[WriteVLDShuffle, ReadVLD, WriteVLDShuffle, WriteV], - (instregex "LD3i64$")>; -def : InstRW<[WriteVLDShuffle, ReadVLD, WriteAdr, WriteVLDShuffle, WriteV], - (instregex "LD3i64_POST")>; - -def : InstRW<[WriteVLDShuffle, WriteV, WriteV], - (instregex "LD3Rv(8b|4h|2s|16b|8h|4s)$")>; -def : InstRW<[WriteVLDShuffle, WriteAdr, WriteV, WriteV], - (instregex "LD3Rv(8b|4h|2s|16b|8h|4s)_POST")>; - -def : InstRW<[WriteVLDShuffle, WriteVLDShuffle, WriteV], - (instrs LD3Rv1d,LD3Rv2d)>; -def : InstRW<[WriteVLDShuffle, WriteAdr, WriteVLDShuffle, WriteV], - (instrs LD3Rv2d_POST,LD3Rv2d_POST)>; - -def : InstRW<[WriteVLDShuffle, WriteVLDShuffle, WriteV, WriteV], - (instregex "LD4Fourv(8b|4h|2s)$")>; -def : InstRW<[WriteVLDShuffle, WriteAdr, WriteVLDShuffle, WriteV, WriteV], - (instregex "LD4Fourv(8b|4h|2s)_POST")>; -def : InstRW<[WriteVLDPairShuffle, WriteVLDPairShuffle, - WriteVLDPairShuffle, WriteVLDPairShuffle], - (instregex "LD4Fourv(16b|8h|4s|2d)$")>; -def : InstRW<[WriteVLDPairShuffle, WriteAdr, WriteVLDPairShuffle, - WriteVLDPairShuffle, WriteVLDPairShuffle], - (instregex "LD4Fourv(16b|8h|4s|2d)_POST")>; - -def : InstRW<[WriteVLDShuffle, ReadVLD, WriteV, WriteV, WriteV], - (instregex "LD4i(8|16|32)$")>; -def : InstRW<[WriteVLDShuffle, ReadVLD, WriteAdr, WriteV, WriteV, WriteV], - (instregex "LD4i(8|16|32)_POST")>; - - -def : InstRW<[WriteVLDShuffle, ReadVLD, WriteVLDShuffle, WriteV, WriteV], - (instrs LD4i64)>; -def : InstRW<[WriteVLDShuffle, ReadVLD, WriteAdr, WriteVLDShuffle, WriteV], - (instrs LD4i64_POST)>; - -def : InstRW<[WriteVLDShuffle, WriteV, WriteV, WriteV], - (instregex "LD4Rv(8b|4h|2s|16b|8h|4s)$")>; -def : InstRW<[WriteVLDShuffle, WriteAdr, WriteV, WriteV, WriteV], - (instregex "LD4Rv(8b|4h|2s|16b|8h|4s)_POST")>; - -def : InstRW<[WriteVLDShuffle, WriteVLDShuffle, WriteV, WriteV], - (instrs LD4Rv1d,LD4Rv2d)>; -def : InstRW<[WriteVLDShuffle, WriteAdr, WriteVLDShuffle, WriteV, WriteV], - (instrs LD4Rv1d_POST,LD4Rv2d_POST)>; - -//--- -// 7.9.16 Store, element operations -//--- - -// Only the WriteAdr for writeback matches a def operands. -// Subsequent WriteVLDs only consume resources. 
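Condensed, the writeback convention in this model is: a post-indexed store defines only the updated base register, so WriteAdr is listed first and the remaining writes just consume resources, while a post-indexed load lists its loaded-value writes first and WriteAdr after them. For example (regex lists shortened for illustration):

    def : InstRW<[WriteAdr, WriteVST], (instregex "ST1Onev(8b|4h|2s|1d)_POST")>;
    def : InstRW<[WriteVLD, WriteAdr], (instregex "LD1Onev(8b|4h|2s|1d)_POST")>;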
- -def : InstRW<[WriteVST], - (instregex "ST1Onev(8b|4h|2s|1d|16b|8h|4s|2d)$")>; -def : InstRW<[WriteAdr, WriteVST], - (instregex "ST1Onev(8b|4h|2s|1d|16b|8h|4s|2d)_POST")>; - -def : InstRW<[WriteVSTShuffle], - (instregex "ST1Twov(8b|4h|2s|1d)$")>; -def : InstRW<[WriteAdr, WriteVSTShuffle], - (instregex "ST1Twov(8b|4h|2s|1d)_POST")>; -def : InstRW<[WriteVST, WriteVST], - (instregex "ST1Twov(16b|8h|4s|2d)$")>; -def : InstRW<[WriteAdr, WriteVST, WriteVST], - (instregex "ST1Twov(16b|8h|4s|2d)_POST")>; - -def : InstRW<[WriteVSTShuffle, WriteVST], - (instregex "ST1Threev(8b|4h|2s|1d)$")>; -def : InstRW<[WriteAdr, WriteVSTShuffle, WriteVST], - (instregex "ST1Threev(8b|4h|2s|1d)_POST")>; -def : InstRW<[WriteVST, WriteVST, WriteVST], - (instregex "ST1Threev(16b|8h|4s|2d)$")>; -def : InstRW<[WriteAdr, WriteVST, WriteVST, WriteVST], - (instregex "ST1Threev(16b|8h|4s|2d)_POST")>; - -def : InstRW<[WriteVSTShuffle, WriteVSTShuffle], - (instregex "ST1Fourv(8b|4h|2s|1d)$")>; -def : InstRW<[WriteAdr, WriteVSTShuffle, WriteVSTShuffle], - (instregex "ST1Fourv(8b|4h|2s|1d)_POST")>; -def : InstRW<[WriteVST, WriteVST, WriteVST, WriteVST], - (instregex "ST1Fourv(16b|8h|4s|2d)$")>; -def : InstRW<[WriteAdr, WriteVST, WriteVST, WriteVST, WriteVST], - (instregex "ST1Fourv(16b|8h|4s|2d)_POST")>; - -def : InstRW<[WriteVSTShuffle], (instregex "ST1i(8|16|32)$")>; -def : InstRW<[WriteAdr, WriteVSTShuffle], (instregex "ST1i(8|16|32)_POST")>; - -def : InstRW<[WriteVSTShuffle], (instrs ST1i64)>; -def : InstRW<[WriteAdr, WriteVSTShuffle], (instrs ST1i64_POST)>; - -def : InstRW<[WriteVSTShuffle], - (instregex "ST2Twov(8b|4h|2s)$")>; -def : InstRW<[WriteAdr, WriteVSTShuffle], - (instregex "ST2Twov(8b|4h|2s)_POST")>; -def : InstRW<[WriteVSTShuffle, WriteVSTShuffle], - (instregex "ST2Twov(16b|8h|4s|2d)$")>; -def : InstRW<[WriteAdr, WriteVSTShuffle, WriteVSTShuffle], - (instregex "ST2Twov(16b|8h|4s|2d)_POST")>; - -def : InstRW<[WriteVSTShuffle], (instregex "ST2i(8|16|32)$")>; -def : InstRW<[WriteAdr, WriteVSTShuffle], (instregex "ST2i(8|16|32)_POST")>; -def : InstRW<[WriteVSTShuffle], (instrs ST2i64)>; -def : InstRW<[WriteAdr, WriteVSTShuffle], (instrs ST2i64_POST)>; - -def : InstRW<[WriteVSTShuffle, WriteVSTShuffle], - (instregex "ST3Threev(8b|4h|2s)$")>; -def : InstRW<[WriteAdr, WriteVSTShuffle, WriteVSTShuffle], - (instregex "ST3Threev(8b|4h|2s)_POST")>; -def : InstRW<[WriteVSTShuffle, WriteVSTShuffle, WriteVSTShuffle], - (instregex "ST3Threev(16b|8h|4s|2d)$")>; -def : InstRW<[WriteAdr, WriteVSTShuffle, WriteVSTShuffle, WriteVSTShuffle], - (instregex "ST3Threev(16b|8h|4s|2d)_POST")>; - -def : InstRW<[WriteVSTShuffle], (instregex "ST3i(8|16|32)$")>; -def : InstRW<[WriteAdr, WriteVSTShuffle], (instregex "ST3i(8|16|32)_POST")>; - -def :InstRW<[WriteVSTShuffle, WriteVSTShuffle], (instrs ST3i64)>; -def :InstRW<[WriteAdr, WriteVSTShuffle, WriteVSTShuffle], (instrs ST3i64_POST)>; - -def : InstRW<[WriteVSTPairShuffle, WriteVSTPairShuffle], - (instregex "ST4Fourv(8b|4h|2s|1d)$")>; -def : InstRW<[WriteAdr, WriteVSTPairShuffle, WriteVSTPairShuffle], - (instregex "ST4Fourv(8b|4h|2s|1d)_POST")>; -def : InstRW<[WriteVSTPairShuffle, WriteVSTPairShuffle, - WriteVSTPairShuffle, WriteVSTPairShuffle], - (instregex "ST4Fourv(16b|8h|4s|2d)$")>; -def : InstRW<[WriteAdr, WriteVSTPairShuffle, WriteVSTPairShuffle, - WriteVSTPairShuffle, WriteVSTPairShuffle], - (instregex "ST4Fourv(16b|8h|4s|2d)_POST")>; - -def : InstRW<[WriteVSTPairShuffle], (instregex "ST4i(8|16|32)$")>; -def : InstRW<[WriteAdr, WriteVSTPairShuffle], (instregex 
"ST4i(8|16|32)_POST")>; - -def : InstRW<[WriteVSTShuffle, WriteVSTShuffle], (instrs ST4i64)>; -def : InstRW<[WriteAdr, WriteVSTShuffle, WriteVSTShuffle],(instrs ST4i64_POST)>; - -//--- -// Unused SchedRead types -//--- - -def : ReadAdvance; -def : ReadAdvance; -def : ReadAdvance; -def : ReadAdvance; -def : ReadAdvance; -def : ReadAdvance; - -} // SchedModel = CycloneModel diff --git a/lib/Target/ARM64/ARM64Schedule.td b/lib/Target/ARM64/ARM64Schedule.td deleted file mode 100644 index 3a4194173a8..00000000000 --- a/lib/Target/ARM64/ARM64Schedule.td +++ /dev/null @@ -1,104 +0,0 @@ -//===-- ARMSchedule.td - ARM Scheduling Definitions --------*- tablegen -*-===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// - -// Define TII for use in SchedVariant Predicates. -// const MachineInstr *MI and const TargetSchedModel *SchedModel -// are defined by default. -def : PredicateProlog<[{ - const ARM64InstrInfo *TII = - static_cast(SchedModel->getInstrInfo()); - (void)TII; -}]>; - -// ARM64 Scheduler Definitions - -def WriteImm : SchedWrite; // MOVN, MOVZ -// TODO: Provide variants for MOV32/64imm Pseudos that dynamically -// select the correct sequence of WriteImms. - -def WriteI : SchedWrite; // ALU -def WriteISReg : SchedWrite; // ALU of Shifted-Reg -def WriteIEReg : SchedWrite; // ALU of Extended-Reg -def ReadI : SchedRead; // ALU -def ReadISReg : SchedRead; // ALU of Shifted-Reg -def ReadIEReg : SchedRead; // ALU of Extended-Reg -def WriteExtr : SchedWrite; // EXTR shifts a reg pair -def ReadExtrHi : SchedRead; // Read the high reg of the EXTR pair -def WriteIS : SchedWrite; // Shift/Scale -def WriteID32 : SchedWrite; // 32-bit Divide -def WriteID64 : SchedWrite; // 64-bit Divide -def ReadID : SchedRead; // 32/64-bit Divide -def WriteIM32 : SchedWrite; // 32-bit Multiply -def WriteIM64 : SchedWrite; // 64-bit Multiply -def ReadIM : SchedRead; // 32/64-bit Multiply -def ReadIMA : SchedRead; // 32/64-bit Multiply Accumulate -def WriteBr : SchedWrite; // Branch -def WriteBrReg : SchedWrite; // Indirect Branch - -def WriteLD : SchedWrite; // Load from base addr plus immediate offset -def WriteST : SchedWrite; // Store to base addr plus immediate offset -def WriteSTP : SchedWrite; // Store a register pair. -def WriteAdr : SchedWrite; // Address pre/post increment. - -def WriteLDIdx : SchedWrite; // Load from a register index (maybe scaled). -def WriteSTIdx : SchedWrite; // Store to a register index (maybe scaled). -def ReadAdrBase : SchedRead; // Read the base resister of a reg-offset LD/ST. - -// Predicate for determining when a shiftable register is shifted. -def RegShiftedPred : SchedPredicate<[{TII->hasShiftedReg(MI)}]>; - -// Predicate for determining when a extendedable register is extended. -def RegExtendedPred : SchedPredicate<[{TII->hasExtendedReg(MI)}]>; - -// ScaledIdxPred is true if a WriteLDIdx operand will be -// scaled. Subtargets can use this to dynamically select resources and -// latency for WriteLDIdx and ReadAdrBase. -def ScaledIdxPred : SchedPredicate<[{TII->isScaledAddr(MI)}]>; - -// Serialized two-level address load. -// EXAMPLE: LOADGot -def WriteLDAdr : WriteSequence<[WriteAdr, WriteLD]>; - -// Serialized two-level address lookup. -// EXAMPLE: MOVaddr... -def WriteAdrAdr : WriteSequence<[WriteAdr, WriteAdr]>; - -// The second register of a load-pair. 
-// LDP,LDPSW,LDNP,LDXP,LDAXP -def WriteLDHi : SchedWrite; - -// Store-exclusive is a store followed by a dependent load. -def WriteSTX : WriteSequence<[WriteST, WriteLD]>; - -def WriteSys : SchedWrite; // Long, variable latency system ops. -def WriteBarrier : SchedWrite; // Memory barrier. -def WriteHint : SchedWrite; // Hint instruction. - -def WriteF : SchedWrite; // General floating-point ops. -def WriteFCmp : SchedWrite; // Floating-point compare. -def WriteFCvt : SchedWrite; // Float conversion. -def WriteFCopy : SchedWrite; // Float-int register copy. -def WriteFImm : SchedWrite; // Floating-point immediate. -def WriteFMul : SchedWrite; // Floating-point multiply. -def WriteFDiv : SchedWrite; // Floating-point division. - -def WriteV : SchedWrite; // Vector ops. -def WriteVLD : SchedWrite; // Vector loads. -def WriteVST : SchedWrite; // Vector stores. - -// Read the unwritten lanes of the VLD's destination registers. -def ReadVLD : SchedRead; - -// Sequential vector load and shuffle. -def WriteVLDShuffle : WriteSequence<[WriteVLD, WriteV]>; -def WriteVLDPairShuffle : WriteSequence<[WriteVLD, WriteV, WriteV]>; - -// Store a shuffled vector. -def WriteVSTShuffle : WriteSequence<[WriteV, WriteVST]>; -def WriteVSTPairShuffle : WriteSequence<[WriteV, WriteV, WriteVST]>; diff --git a/lib/Target/ARM64/ARM64SelectionDAGInfo.cpp b/lib/Target/ARM64/ARM64SelectionDAGInfo.cpp deleted file mode 100644 index f8a2527616c..00000000000 --- a/lib/Target/ARM64/ARM64SelectionDAGInfo.cpp +++ /dev/null @@ -1,58 +0,0 @@ -//===-- ARM64SelectionDAGInfo.cpp - ARM64 SelectionDAG Info ---------------===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -// This file implements the ARM64SelectionDAGInfo class. -// -//===----------------------------------------------------------------------===// - -#include "ARM64TargetMachine.h" -using namespace llvm; - -#define DEBUG_TYPE "arm64-selectiondag-info" - -ARM64SelectionDAGInfo::ARM64SelectionDAGInfo(const TargetMachine &TM) - : TargetSelectionDAGInfo(TM), - Subtarget(&TM.getSubtarget()) {} - -ARM64SelectionDAGInfo::~ARM64SelectionDAGInfo() {} - -SDValue ARM64SelectionDAGInfo::EmitTargetCodeForMemset( - SelectionDAG &DAG, SDLoc dl, SDValue Chain, SDValue Dst, SDValue Src, - SDValue Size, unsigned Align, bool isVolatile, - MachinePointerInfo DstPtrInfo) const { - // Check to see if there is a specialized entry-point for memory zeroing. - ConstantSDNode *V = dyn_cast(Src); - ConstantSDNode *SizeValue = dyn_cast(Size); - const char *bzeroEntry = - (V && V->isNullValue()) ? Subtarget->getBZeroEntry() : nullptr; - // For small size (< 256), it is not beneficial to use bzero - // instead of memset. 
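// A minimal standalone sketch, assuming the size threshold is the only
// decision being made, of the check used just below: memset-to-zero is
// routed to the platform bzero entry point only when the length is not a
// compile-time constant or exceeds 256 bytes; small constant sizes stay
// with the ordinary memset expansion.
enum class ZeroFillLowering { CallBZero, InlineMemset };

// KnownSize < 0 encodes "size not known at compile time".
ZeroFillLowering chooseZeroFill(bool HaveBZeroEntry, long long KnownSize) {
  if (HaveBZeroEntry && (KnownSize < 0 || KnownSize > 256))
    return ZeroFillLowering::CallBZero;
  return ZeroFillLowering::InlineMemset;
}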
- if (bzeroEntry && (!SizeValue || SizeValue->getZExtValue() > 256)) { - const ARM64TargetLowering &TLI = *static_cast( - DAG.getTarget().getTargetLowering()); - - EVT IntPtr = TLI.getPointerTy(); - Type *IntPtrTy = getDataLayout()->getIntPtrType(*DAG.getContext()); - TargetLowering::ArgListTy Args; - TargetLowering::ArgListEntry Entry; - Entry.Node = Dst; - Entry.Ty = IntPtrTy; - Args.push_back(Entry); - Entry.Node = Size; - Args.push_back(Entry); - TargetLowering::CallLoweringInfo CLI(DAG); - CLI.setDebugLoc(dl).setChain(Chain) - .setCallee(CallingConv::C, Type::getVoidTy(*DAG.getContext()), - DAG.getExternalSymbol(bzeroEntry, IntPtr), &Args, 0) - .setDiscardResult(); - std::pair CallResult = TLI.LowerCallTo(CLI); - return CallResult.second; - } - return SDValue(); -} diff --git a/lib/Target/ARM64/ARM64SelectionDAGInfo.h b/lib/Target/ARM64/ARM64SelectionDAGInfo.h deleted file mode 100644 index 770775fc02d..00000000000 --- a/lib/Target/ARM64/ARM64SelectionDAGInfo.h +++ /dev/null @@ -1,37 +0,0 @@ -//===-- ARM64SelectionDAGInfo.h - ARM64 SelectionDAG Info -------*- C++ -*-===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -// This file defines the ARM64 subclass for TargetSelectionDAGInfo. -// -//===----------------------------------------------------------------------===// - -#ifndef ARM64SELECTIONDAGINFO_H -#define ARM64SELECTIONDAGINFO_H - -#include "llvm/Target/TargetSelectionDAGInfo.h" - -namespace llvm { - -class ARM64SelectionDAGInfo : public TargetSelectionDAGInfo { - /// Subtarget - Keep a pointer to the ARMSubtarget around so that we can - /// make the right decision when generating code for different targets. - const ARM64Subtarget *Subtarget; - -public: - explicit ARM64SelectionDAGInfo(const TargetMachine &TM); - ~ARM64SelectionDAGInfo(); - - SDValue EmitTargetCodeForMemset(SelectionDAG &DAG, SDLoc dl, SDValue Chain, - SDValue Dst, SDValue Src, SDValue Size, - unsigned Align, bool isVolatile, - MachinePointerInfo DstPtrInfo) const override; -}; -} - -#endif diff --git a/lib/Target/ARM64/ARM64StorePairSuppress.cpp b/lib/Target/ARM64/ARM64StorePairSuppress.cpp deleted file mode 100644 index a9501ed9217..00000000000 --- a/lib/Target/ARM64/ARM64StorePairSuppress.cpp +++ /dev/null @@ -1,168 +0,0 @@ -//===---- ARM64StorePairSuppress.cpp --- Suppress store pair formation ----===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -// This pass identifies floating point stores that should not be combined into -// store pairs. Later we may do the same for floating point loads. 
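// A standalone restatement (an assumed simplification, not the pass itself)
// of the heuristic shouldAddSTPToBlock applies below: a store pair is only
// worth forming when the extra paired-store micro-ops do not raise the
// block's critical resource height, i.e. the load/store units must not
// become the new bottleneck.
bool worthFormingSTP(unsigned ResLenWithoutSTP, unsigned ResLenWithSTP) {
  // Suppress pairing when adding the STP lengthens the resource-limited
  // schedule of the block; otherwise pairing is assumed profitable.
  return ResLenWithSTP <= ResLenWithoutSTP;
}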
-// ===---------------------------------------------------------------------===// - -#include "ARM64InstrInfo.h" -#include "llvm/CodeGen/MachineFunction.h" -#include "llvm/CodeGen/MachineFunctionPass.h" -#include "llvm/CodeGen/MachineInstr.h" -#include "llvm/CodeGen/MachineTraceMetrics.h" -#include "llvm/CodeGen/TargetSchedule.h" -#include "llvm/Support/Debug.h" -#include "llvm/Support/raw_ostream.h" -#include "llvm/Target/TargetInstrInfo.h" - -using namespace llvm; - -#define DEBUG_TYPE "arm64-stp-suppress" - -namespace { -class ARM64StorePairSuppress : public MachineFunctionPass { - const ARM64InstrInfo *TII; - const TargetRegisterInfo *TRI; - const MachineRegisterInfo *MRI; - MachineFunction *MF; - TargetSchedModel SchedModel; - MachineTraceMetrics *Traces; - MachineTraceMetrics::Ensemble *MinInstr; - -public: - static char ID; - ARM64StorePairSuppress() : MachineFunctionPass(ID) {} - - virtual const char *getPassName() const override { - return "ARM64 Store Pair Suppression"; - } - - bool runOnMachineFunction(MachineFunction &F) override; - -private: - bool shouldAddSTPToBlock(const MachineBasicBlock *BB); - - bool isNarrowFPStore(const MachineInstr &MI); - - virtual void getAnalysisUsage(AnalysisUsage &AU) const override { - AU.setPreservesCFG(); - AU.addRequired(); - AU.addPreserved(); - MachineFunctionPass::getAnalysisUsage(AU); - } -}; -char ARM64StorePairSuppress::ID = 0; -} // anonymous - -FunctionPass *llvm::createARM64StorePairSuppressPass() { - return new ARM64StorePairSuppress(); -} - -/// Return true if an STP can be added to this block without increasing the -/// critical resource height. STP is good to form in Ld/St limited blocks and -/// bad to form in float-point limited blocks. This is true independent of the -/// critical path. If the critical path is longer than the resource height, the -/// extra vector ops can limit physreg renaming. Otherwise, it could simply -/// oversaturate the vector units. -bool ARM64StorePairSuppress::shouldAddSTPToBlock(const MachineBasicBlock *BB) { - if (!MinInstr) - MinInstr = Traces->getEnsemble(MachineTraceMetrics::TS_MinInstrCount); - - MachineTraceMetrics::Trace BBTrace = MinInstr->getTrace(BB); - unsigned ResLength = BBTrace.getResourceLength(); - - // Get the machine model's scheduling class for STPQi. - // Bypass TargetSchedule's SchedClass resolution since we only have an opcode. - unsigned SCIdx = TII->get(ARM64::STPDi).getSchedClass(); - const MCSchedClassDesc *SCDesc = - SchedModel.getMCSchedModel()->getSchedClassDesc(SCIdx); - - // If a subtarget does not define resources for STPQi, bail here. - if (SCDesc->isValid() && !SCDesc->isVariant()) { - unsigned ResLenWithSTP = BBTrace.getResourceLength( - ArrayRef(), SCDesc); - if (ResLenWithSTP > ResLength) { - DEBUG(dbgs() << " Suppress STP in BB: " << BB->getNumber() - << " resources " << ResLength << " -> " << ResLenWithSTP - << "\n"); - return false; - } - } - return true; -} - -/// Return true if this is a floating-point store smaller than the V reg. On -/// cyclone, these require a vector shuffle before storing a pair. -/// Ideally we would call getMatchingPairOpcode() and have the machine model -/// tell us if it's profitable with no cpu knowledge here. -/// -/// FIXME: We plan to develop a decent Target abstraction for simple loads and -/// stores. Until then use a nasty switch similar to ARM64LoadStoreOptimizer. 
-bool ARM64StorePairSuppress::isNarrowFPStore(const MachineInstr &MI) { - switch (MI.getOpcode()) { - default: - return false; - case ARM64::STRSui: - case ARM64::STRDui: - case ARM64::STURSi: - case ARM64::STURDi: - return true; - } -} - -bool ARM64StorePairSuppress::runOnMachineFunction(MachineFunction &mf) { - MF = &mf; - TII = static_cast(MF->getTarget().getInstrInfo()); - TRI = MF->getTarget().getRegisterInfo(); - MRI = &MF->getRegInfo(); - const TargetSubtargetInfo &ST = - MF->getTarget().getSubtarget(); - SchedModel.init(*ST.getSchedModel(), &ST, TII); - - Traces = &getAnalysis(); - MinInstr = nullptr; - - DEBUG(dbgs() << "*** " << getPassName() << ": " << MF->getName() << '\n'); - - if (!SchedModel.hasInstrSchedModel()) { - DEBUG(dbgs() << " Skipping pass: no machine model present.\n"); - return false; - } - - // Check for a sequence of stores to the same base address. We don't need to - // precisely determine whether a store pair can be formed. But we do want to - // filter out most situations where we can't form store pairs to avoid - // computing trace metrics in those cases. - for (auto &MBB : *MF) { - bool SuppressSTP = false; - unsigned PrevBaseReg = 0; - for (auto &MI : MBB) { - if (!isNarrowFPStore(MI)) - continue; - unsigned BaseReg; - unsigned Offset; - if (TII->getLdStBaseRegImmOfs(&MI, BaseReg, Offset, TRI)) { - if (PrevBaseReg == BaseReg) { - // If this block can take STPs, skip ahead to the next block. - if (!SuppressSTP && shouldAddSTPToBlock(MI.getParent())) - break; - // Otherwise, continue unpairing the stores in this block. - DEBUG(dbgs() << "Unpairing store " << MI << "\n"); - SuppressSTP = true; - TII->suppressLdStPair(&MI); - } - PrevBaseReg = BaseReg; - } else - PrevBaseReg = 0; - } - } - // This pass just sets some internal MachineMemOperand flags. It can't really - // invalidate anything. - return false; -} diff --git a/lib/Target/ARM64/ARM64Subtarget.cpp b/lib/Target/ARM64/ARM64Subtarget.cpp deleted file mode 100644 index 624e47483ff..00000000000 --- a/lib/Target/ARM64/ARM64Subtarget.cpp +++ /dev/null @@ -1,115 +0,0 @@ -//===-- ARM64Subtarget.cpp - ARM64 Subtarget Information --------*- C++ -*-===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -// This file implements the ARM64 specific subclass of TargetSubtarget. 
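// An assumed, much-condensed summary of the decision ClassifyGlobalReference
// makes further down in this file; the flag names are illustrative inputs,
// not the real API. Whether a global is reached through the GOT depends on
// the code model, the object format, and whether the definition is known to
// stay local to this translation unit.
bool needsGOTAccess(bool LargeCodeModelMachO, bool SmallModelWeakDeclaration,
                    bool PICDefaultVisibility, bool DefinitelyDefinedLocally) {
  if (LargeCodeModelMachO)          // large MachO model: one absolute reloc via GOT
    return true;
  if (SmallModelWeakDeclaration)    // ADRP cannot materialize address 0 for weak decls
    return true;
  if (PICDefaultVisibility)         // default-visibility symbols may be preempted
    return !DefinitelyDefinedLocally;
  return false;
}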
-// -//===----------------------------------------------------------------------===// - -#include "ARM64InstrInfo.h" -#include "ARM64Subtarget.h" -#include "llvm/ADT/SmallVector.h" -#include "llvm/CodeGen/MachineScheduler.h" -#include "llvm/IR/GlobalValue.h" -#include "llvm/Support/TargetRegistry.h" - -using namespace llvm; - -#define DEBUG_TYPE "arm64-subtarget" - -#define GET_SUBTARGETINFO_CTOR -#define GET_SUBTARGETINFO_TARGET_DESC -#include "ARM64GenSubtargetInfo.inc" - -static cl::opt -EnableEarlyIfConvert("arm64-early-ifcvt", cl::desc("Enable the early if " - "converter pass"), cl::init(true), cl::Hidden); - -ARM64Subtarget::ARM64Subtarget(const std::string &TT, const std::string &CPU, - const std::string &FS, bool LittleEndian) - : ARM64GenSubtargetInfo(TT, CPU, FS), ARMProcFamily(Others), - HasFPARMv8(false), HasNEON(false), HasCrypto(false), HasCRC(false), - HasZeroCycleRegMove(false), HasZeroCycleZeroing(false), - CPUString(CPU), TargetTriple(TT), IsLittleEndian(LittleEndian) { - // Determine default and user-specified characteristics - - if (CPUString.empty()) - CPUString = "generic"; - - ParseSubtargetFeatures(CPUString, FS); -} - -/// ClassifyGlobalReference - Find the target operand flags that describe -/// how a global value should be referenced for the current subtarget. -unsigned char -ARM64Subtarget::ClassifyGlobalReference(const GlobalValue *GV, - const TargetMachine &TM) const { - - // Determine whether this is a reference to a definition or a declaration. - // Materializable GVs (in JIT lazy compilation mode) do not require an extra - // load from stub. - bool isDecl = GV->hasAvailableExternallyLinkage(); - if (GV->isDeclaration() && !GV->isMaterializable()) - isDecl = true; - - // MachO large model always goes via a GOT, simply to get a single 8-byte - // absolute relocation on all global addresses. - if (TM.getCodeModel() == CodeModel::Large && isTargetMachO()) - return ARM64II::MO_GOT; - - // The small code mode's direct accesses use ADRP, which cannot necessarily - // produce the value 0 (if the code is above 4GB). Therefore they must use the - // GOT. - if (TM.getCodeModel() == CodeModel::Small && GV->isWeakForLinker() && isDecl) - return ARM64II::MO_GOT; - - // If symbol visibility is hidden, the extra load is not needed if - // the symbol is definitely defined in the current translation unit. - - // The handling of non-hidden symbols in PIC mode is rather target-dependent: - // + On MachO, if the symbol is defined in this module the GOT can be - // skipped. - // + On ELF, the R_AARCH64_COPY relocation means that even symbols actually - // defined could end up in unexpected places. Use a GOT. - if (TM.getRelocationModel() != Reloc::Static && GV->hasDefaultVisibility()) { - if (isTargetMachO()) - return (isDecl || GV->isWeakForLinker()) ? ARM64II::MO_GOT - : ARM64II::MO_NO_FLAG; - else - // No need to go through the GOT for local symbols on ELF. - return GV->hasLocalLinkage() ? ARM64II::MO_NO_FLAG : ARM64II::MO_GOT; - } - - return ARM64II::MO_NO_FLAG; -} - -/// This function returns the name of a function which has an interface -/// like the non-standard bzero function, if such a function exists on -/// the current subtarget and it is considered prefereable over -/// memset with zero passed as the second argument. Otherwise it -/// returns null. -const char *ARM64Subtarget::getBZeroEntry() const { - // Prefer bzero on Darwin only. 
- if(isTargetDarwin()) - return "bzero"; - - return nullptr; -} - -void ARM64Subtarget::overrideSchedPolicy(MachineSchedPolicy &Policy, - MachineInstr *begin, MachineInstr *end, - unsigned NumRegionInstrs) const { - // LNT run (at least on Cyclone) showed reasonably significant gains for - // bi-directional scheduling. 253.perlbmk. - Policy.OnlyTopDown = false; - Policy.OnlyBottomUp = false; -} - -bool ARM64Subtarget::enableEarlyIfConversion() const { - return EnableEarlyIfConvert; -} diff --git a/lib/Target/ARM64/ARM64Subtarget.h b/lib/Target/ARM64/ARM64Subtarget.h deleted file mode 100644 index 9cea3c387d6..00000000000 --- a/lib/Target/ARM64/ARM64Subtarget.h +++ /dev/null @@ -1,110 +0,0 @@ -//=====---- ARM64Subtarget.h - Define Subtarget for the ARM64 -*- C++ -*--====// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -// This file declares the ARM64 specific subclass of TargetSubtarget. -// -//===----------------------------------------------------------------------===// - -#ifndef ARM64SUBTARGET_H -#define ARM64SUBTARGET_H - -#include "llvm/Target/TargetSubtargetInfo.h" -#include "ARM64RegisterInfo.h" -#include - -#define GET_SUBTARGETINFO_HEADER -#include "ARM64GenSubtargetInfo.inc" - -namespace llvm { -class GlobalValue; -class StringRef; - -class ARM64Subtarget : public ARM64GenSubtargetInfo { -protected: - enum ARMProcFamilyEnum {Others, CortexA53, CortexA57, Cyclone}; - - /// ARMProcFamily - ARM processor family: Cortex-A53, Cortex-A57, and others. - ARMProcFamilyEnum ARMProcFamily; - - bool HasFPARMv8; - bool HasNEON; - bool HasCrypto; - bool HasCRC; - - // HasZeroCycleRegMove - Has zero-cycle register mov instructions. - bool HasZeroCycleRegMove; - - // HasZeroCycleZeroing - Has zero-cycle zeroing instructions. - bool HasZeroCycleZeroing; - - /// CPUString - String name of used CPU. - std::string CPUString; - - /// TargetTriple - What processor and OS we're targeting. - Triple TargetTriple; - - /// IsLittleEndian - Is the target little endian? - bool IsLittleEndian; - -public: - /// This constructor initializes the data members to match that - /// of the specified triple. - ARM64Subtarget(const std::string &TT, const std::string &CPU, - const std::string &FS, bool LittleEndian); - - bool enableMachineScheduler() const override { return true; } - - bool hasZeroCycleRegMove() const { return HasZeroCycleRegMove; } - - bool hasZeroCycleZeroing() const { return HasZeroCycleZeroing; } - - bool hasFPARMv8() const { return HasFPARMv8; } - bool hasNEON() const { return HasNEON; } - bool hasCrypto() const { return HasCrypto; } - bool hasCRC() const { return HasCRC; } - - bool isLittleEndian() const { return IsLittleEndian; } - - bool isTargetDarwin() const { return TargetTriple.isOSDarwin(); } - - bool isTargetELF() const { return TargetTriple.isOSBinFormatELF(); } - - bool isTargetMachO() const { return TargetTriple.isOSBinFormatMachO(); } - - bool isCyclone() const { return CPUString == "cyclone"; } - - /// getMaxInlineSizeThreshold - Returns the maximum memset / memcpy size - /// that still makes it profitable to inline the call. - unsigned getMaxInlineSizeThreshold() const { return 64; } - - /// ParseSubtargetFeatures - Parses features string setting specified - /// subtarget options. Definition of function is auto generated by tblgen. 
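// A standalone sketch, far simpler than the tblgen-generated
// ParseSubtargetFeatures, of what "parsing a feature string" means for the
// boolean flags declared above (the "+neon,+crc,+crypto" spelling is the
// usual LLVM convention).
#include <sstream>
#include <string>

struct FeatureFlags {
  bool HasNEON = false, HasCRC = false, HasCrypto = false;
};

FeatureFlags parseFeatureString(const std::string &FS) {
  FeatureFlags F;
  std::stringstream SS(FS);
  std::string Tok;
  while (std::getline(SS, Tok, ',')) {
    if (Tok == "+neon")   F.HasNEON = true;
    if (Tok == "+crc")    F.HasCRC = true;
    if (Tok == "+crypto") F.HasCrypto = true;
  }
  return F;
}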
- void ParseSubtargetFeatures(StringRef CPU, StringRef FS); - - /// ClassifyGlobalReference - Find the target operand flags that describe - /// how a global value should be referenced for the current subtarget. - unsigned char ClassifyGlobalReference(const GlobalValue *GV, - const TargetMachine &TM) const; - - /// This function returns the name of a function which has an interface - /// like the non-standard bzero function, if such a function exists on - /// the current subtarget and it is considered prefereable over - /// memset with zero passed as the second argument. Otherwise it - /// returns null. - const char *getBZeroEntry() const; - - void overrideSchedPolicy(MachineSchedPolicy &Policy, MachineInstr *begin, - MachineInstr *end, - unsigned NumRegionInstrs) const override; - - bool enableEarlyIfConversion() const override; -}; -} // End llvm namespace - -#endif // ARM64SUBTARGET_H diff --git a/lib/Target/ARM64/ARM64TargetMachine.cpp b/lib/Target/ARM64/ARM64TargetMachine.cpp deleted file mode 100644 index fc73145be3f..00000000000 --- a/lib/Target/ARM64/ARM64TargetMachine.cpp +++ /dev/null @@ -1,208 +0,0 @@ -//===-- ARM64TargetMachine.cpp - Define TargetMachine for ARM64 -----------===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -// -//===----------------------------------------------------------------------===// - -#include "ARM64.h" -#include "ARM64TargetMachine.h" -#include "llvm/PassManager.h" -#include "llvm/CodeGen/Passes.h" -#include "llvm/Support/CommandLine.h" -#include "llvm/Support/TargetRegistry.h" -#include "llvm/Target/TargetOptions.h" -#include "llvm/Transforms/Scalar.h" -using namespace llvm; - -static cl::opt -EnableCCMP("arm64-ccmp", cl::desc("Enable the CCMP formation pass"), - cl::init(true), cl::Hidden); - -static cl::opt -EnableStPairSuppress("arm64-stp-suppress", cl::desc("Suppress STP for ARM64"), - cl::init(true), cl::Hidden); - -static cl::opt -EnableAdvSIMDScalar("arm64-simd-scalar", cl::desc("Enable use of AdvSIMD scalar" - " integer instructions"), cl::init(false), cl::Hidden); - -static cl::opt -EnablePromoteConstant("arm64-promote-const", cl::desc("Enable the promote " - "constant pass"), cl::init(true), cl::Hidden); - -static cl::opt -EnableCollectLOH("arm64-collect-loh", cl::desc("Enable the pass that emits the" - " linker optimization hints (LOH)"), cl::init(true), - cl::Hidden); - -static cl::opt -EnableDeadRegisterElimination("arm64-dead-def-elimination", cl::Hidden, - cl::desc("Enable the pass that removes dead" - " definitons and replaces stores to" - " them with stores to the zero" - " register"), - cl::init(true)); - -static cl::opt -EnableLoadStoreOpt("arm64-load-store-opt", cl::desc("Enable the load/store pair" - " optimization pass"), cl::init(true), cl::Hidden); - -extern "C" void LLVMInitializeARM64Target() { - // Register the target. - RegisterTargetMachine X(TheARM64leTarget); - RegisterTargetMachine Y(TheARM64beTarget); - - RegisterTargetMachine Z(TheAArch64leTarget); - RegisterTargetMachine W(TheAArch64beTarget); -} - -/// TargetMachine ctor - Create an ARM64 architecture model. 
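// The constructor below selects one of three data-layout strings with a
// nested ternary; the same choice written out as a small helper (a sketch,
// not the code that was removed) makes the cases easier to read: MachO
// mangling, ELF little-endian, and ELF big-endian.
#include <string>

std::string pickDataLayout(bool IsMachO, bool LittleEndian) {
  if (IsMachO)
    return "e-m:o-i64:64-i128:128-n32:64-S128";
  if (LittleEndian)
    return "e-m:e-i64:64-i128:128-n32:64-S128";
  return "E-m:e-i64:64-i128:128-n32:64-S128";
}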
-/// -ARM64TargetMachine::ARM64TargetMachine(const Target &T, StringRef TT, - StringRef CPU, StringRef FS, - const TargetOptions &Options, - Reloc::Model RM, CodeModel::Model CM, - CodeGenOpt::Level OL, - bool LittleEndian) - : LLVMTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL), - Subtarget(TT, CPU, FS, LittleEndian), - // This nested ternary is horrible, but DL needs to be properly initialized - // before TLInfo is constructed. - DL(Subtarget.isTargetMachO() ? - "e-m:o-i64:64-i128:128-n32:64-S128" : - (LittleEndian ? - "e-m:e-i64:64-i128:128-n32:64-S128" : - "E-m:e-i64:64-i128:128-n32:64-S128")), - InstrInfo(Subtarget), TLInfo(*this), FrameLowering(*this, Subtarget), - TSInfo(*this) { - initAsmInfo(); -} - -void ARM64leTargetMachine::anchor() { } - -ARM64leTargetMachine:: -ARM64leTargetMachine(const Target &T, StringRef TT, - StringRef CPU, StringRef FS, const TargetOptions &Options, - Reloc::Model RM, CodeModel::Model CM, - CodeGenOpt::Level OL) - : ARM64TargetMachine(T, TT, CPU, FS, Options, RM, CM, OL, true) {} - -void ARM64beTargetMachine::anchor() { } - -ARM64beTargetMachine:: -ARM64beTargetMachine(const Target &T, StringRef TT, - StringRef CPU, StringRef FS, const TargetOptions &Options, - Reloc::Model RM, CodeModel::Model CM, - CodeGenOpt::Level OL) - : ARM64TargetMachine(T, TT, CPU, FS, Options, RM, CM, OL, false) {} - -namespace { -/// ARM64 Code Generator Pass Configuration Options. -class ARM64PassConfig : public TargetPassConfig { -public: - ARM64PassConfig(ARM64TargetMachine *TM, PassManagerBase &PM) - : TargetPassConfig(TM, PM) {} - - ARM64TargetMachine &getARM64TargetMachine() const { - return getTM(); - } - - bool addPreISel() override; - bool addInstSelector() override; - bool addILPOpts() override; - bool addPreRegAlloc() override; - bool addPostRegAlloc() override; - bool addPreSched2() override; - bool addPreEmitPass() override; -}; -} // namespace - -void ARM64TargetMachine::addAnalysisPasses(PassManagerBase &PM) { - // Add first the target-independent BasicTTI pass, then our ARM64 pass. This - // allows the ARM64 pass to delegate to the target independent layer when - // appropriate. - PM.add(createBasicTargetTransformInfoPass(this)); - PM.add(createARM64TargetTransformInfoPass(this)); -} - -TargetPassConfig *ARM64TargetMachine::createPassConfig(PassManagerBase &PM) { - return new ARM64PassConfig(this, PM); -} - -// Pass Pipeline Configuration -bool ARM64PassConfig::addPreISel() { - // Run promote constant before global merge, so that the promoted constants - // get a chance to be merged - if (TM->getOptLevel() != CodeGenOpt::None && EnablePromoteConstant) - addPass(createARM64PromoteConstantPass()); - if (TM->getOptLevel() != CodeGenOpt::None) - addPass(createGlobalMergePass(TM)); - if (TM->getOptLevel() != CodeGenOpt::None) - addPass(createARM64AddressTypePromotionPass()); - - // Always expand atomic operations, we don't deal with atomicrmw or cmpxchg - // ourselves. - addPass(createAtomicExpandLoadLinkedPass(TM)); - - return false; -} - -bool ARM64PassConfig::addInstSelector() { - addPass(createARM64ISelDag(getARM64TargetMachine(), getOptLevel())); - - // For ELF, cleanup any local-dynamic TLS accesses (i.e. combine as many - // references to _TLS_MODULE_BASE_ as possible. 
- if (TM->getSubtarget().isTargetELF() && - getOptLevel() != CodeGenOpt::None) - addPass(createARM64CleanupLocalDynamicTLSPass()); - - return false; -} - -bool ARM64PassConfig::addILPOpts() { - if (EnableCCMP) - addPass(createARM64ConditionalCompares()); - addPass(&EarlyIfConverterID); - if (EnableStPairSuppress) - addPass(createARM64StorePairSuppressPass()); - return true; -} - -bool ARM64PassConfig::addPreRegAlloc() { - // Use AdvSIMD scalar instructions whenever profitable. - if (TM->getOptLevel() != CodeGenOpt::None && EnableAdvSIMDScalar) - addPass(createARM64AdvSIMDScalar()); - return true; -} - -bool ARM64PassConfig::addPostRegAlloc() { - // Change dead register definitions to refer to the zero register. - if (TM->getOptLevel() != CodeGenOpt::None && EnableDeadRegisterElimination) - addPass(createARM64DeadRegisterDefinitions()); - return true; -} - -bool ARM64PassConfig::addPreSched2() { - // Expand some pseudo instructions to allow proper scheduling. - addPass(createARM64ExpandPseudoPass()); - // Use load/store pair instructions when possible. - if (TM->getOptLevel() != CodeGenOpt::None && EnableLoadStoreOpt) - addPass(createARM64LoadStoreOptimizationPass()); - return true; -} - -bool ARM64PassConfig::addPreEmitPass() { - // Relax conditional branch instructions if they're otherwise out of - // range of their destination. - addPass(createARM64BranchRelaxation()); - if (TM->getOptLevel() != CodeGenOpt::None && EnableCollectLOH && - TM->getSubtarget().isTargetMachO()) - addPass(createARM64CollectLOHPass()); - return true; -} diff --git a/lib/Target/ARM64/ARM64TargetMachine.h b/lib/Target/ARM64/ARM64TargetMachine.h deleted file mode 100644 index 730ffcaaf6d..00000000000 --- a/lib/Target/ARM64/ARM64TargetMachine.h +++ /dev/null @@ -1,92 +0,0 @@ -//===-- ARM64TargetMachine.h - Define TargetMachine for ARM64 ---*- C++ -*-===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -// This file declares the ARM64 specific subclass of TargetMachine. 
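// An assumed, self-contained restatement of how the ARM64PassConfig hooks
// above assemble the ILP portion of the pipeline: optional passes are
// appended only while their cl::opt switch stays enabled (the strings here
// are illustrative labels, not registered pass names).
#include <string>
#include <vector>

std::vector<std::string> buildILPOptPasses(bool EnableCCMP,
                                           bool EnableStPairSuppress) {
  std::vector<std::string> Passes;
  if (EnableCCMP)
    Passes.push_back("conditional-compares");
  Passes.push_back("early-if-conversion");
  if (EnableStPairSuppress)
    Passes.push_back("store-pair-suppress");
  return Passes;
}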
-// -//===----------------------------------------------------------------------===// - -#ifndef ARM64TARGETMACHINE_H -#define ARM64TARGETMACHINE_H - -#include "ARM64InstrInfo.h" -#include "ARM64ISelLowering.h" -#include "ARM64Subtarget.h" -#include "ARM64FrameLowering.h" -#include "ARM64SelectionDAGInfo.h" -#include "llvm/IR/DataLayout.h" -#include "llvm/Target/TargetMachine.h" -#include "llvm/MC/MCStreamer.h" - -namespace llvm { - -class ARM64TargetMachine : public LLVMTargetMachine { -protected: - ARM64Subtarget Subtarget; - -private: - const DataLayout DL; - ARM64InstrInfo InstrInfo; - ARM64TargetLowering TLInfo; - ARM64FrameLowering FrameLowering; - ARM64SelectionDAGInfo TSInfo; - -public: - ARM64TargetMachine(const Target &T, StringRef TT, StringRef CPU, StringRef FS, - const TargetOptions &Options, Reloc::Model RM, - CodeModel::Model CM, CodeGenOpt::Level OL, - bool IsLittleEndian); - - const ARM64Subtarget *getSubtargetImpl() const override { return &Subtarget; } - const ARM64TargetLowering *getTargetLowering() const override { - return &TLInfo; - } - const DataLayout *getDataLayout() const override { return &DL; } - const ARM64FrameLowering *getFrameLowering() const override { - return &FrameLowering; - } - const ARM64InstrInfo *getInstrInfo() const override { return &InstrInfo; } - const ARM64RegisterInfo *getRegisterInfo() const override { - return &InstrInfo.getRegisterInfo(); - } - const ARM64SelectionDAGInfo *getSelectionDAGInfo() const override { - return &TSInfo; - } - - // Pass Pipeline Configuration - TargetPassConfig *createPassConfig(PassManagerBase &PM) override; - - /// \brief Register ARM64 analysis passes with a pass manager. - void addAnalysisPasses(PassManagerBase &PM) override; -}; - -// ARM64leTargetMachine - ARM64 little endian target machine. -// -class ARM64leTargetMachine : public ARM64TargetMachine { - virtual void anchor(); -public: - ARM64leTargetMachine(const Target &T, StringRef TT, - StringRef CPU, StringRef FS, const TargetOptions &Options, - Reloc::Model RM, CodeModel::Model CM, - CodeGenOpt::Level OL); -}; - -// ARM64beTargetMachine - ARM64 big endian target machine. -// -class ARM64beTargetMachine : public ARM64TargetMachine { - virtual void anchor(); -public: - ARM64beTargetMachine(const Target &T, StringRef TT, - StringRef CPU, StringRef FS, const TargetOptions &Options, - Reloc::Model RM, CodeModel::Model CM, - CodeGenOpt::Level OL); -}; - -} // end namespace llvm - -#endif diff --git a/lib/Target/ARM64/ARM64TargetObjectFile.cpp b/lib/Target/ARM64/ARM64TargetObjectFile.cpp deleted file mode 100644 index cde01e515dc..00000000000 --- a/lib/Target/ARM64/ARM64TargetObjectFile.cpp +++ /dev/null @@ -1,52 +0,0 @@ -//===-- ARM64TargetObjectFile.cpp - ARM64 Object Info ---------------------===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. 
-// -//===----------------------------------------------------------------------===// - -#include "ARM64TargetObjectFile.h" -#include "ARM64TargetMachine.h" -#include "llvm/IR/Mangler.h" -#include "llvm/MC/MCContext.h" -#include "llvm/MC/MCExpr.h" -#include "llvm/MC/MCStreamer.h" -#include "llvm/Support/Dwarf.h" -using namespace llvm; -using namespace dwarf; - -void ARM64_ELFTargetObjectFile::Initialize(MCContext &Ctx, - const TargetMachine &TM) { - TargetLoweringObjectFileELF::Initialize(Ctx, TM); - InitializeELF(TM.Options.UseInitArray); -} - -const MCExpr *ARM64_MachoTargetObjectFile::getTTypeGlobalReference( - const GlobalValue *GV, unsigned Encoding, Mangler &Mang, - const TargetMachine &TM, MachineModuleInfo *MMI, - MCStreamer &Streamer) const { - // On Darwin, we can reference dwarf symbols with foo@GOT-., which - // is an indirect pc-relative reference. The default implementation - // won't reference using the GOT, so we need this target-specific - // version. - if (Encoding & (DW_EH_PE_indirect | DW_EH_PE_pcrel)) { - const MCSymbol *Sym = TM.getSymbol(GV, Mang); - const MCExpr *Res = - MCSymbolRefExpr::Create(Sym, MCSymbolRefExpr::VK_GOT, getContext()); - MCSymbol *PCSym = getContext().CreateTempSymbol(); - Streamer.EmitLabel(PCSym); - const MCExpr *PC = MCSymbolRefExpr::Create(PCSym, getContext()); - return MCBinaryExpr::CreateSub(Res, PC, getContext()); - } - - return TargetLoweringObjectFileMachO::getTTypeGlobalReference( - GV, Encoding, Mang, TM, MMI, Streamer); -} - -MCSymbol *ARM64_MachoTargetObjectFile::getCFIPersonalitySymbol( - const GlobalValue *GV, Mangler &Mang, const TargetMachine &TM, - MachineModuleInfo *MMI) const { - return TM.getSymbol(GV, Mang); -} diff --git a/lib/Target/ARM64/ARM64TargetObjectFile.h b/lib/Target/ARM64/ARM64TargetObjectFile.h deleted file mode 100644 index 62446f94f17..00000000000 --- a/lib/Target/ARM64/ARM64TargetObjectFile.h +++ /dev/null @@ -1,40 +0,0 @@ -//===-- ARM64TargetObjectFile.h - ARM64 Object Info -*- C++ -------------*-===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// - -#ifndef LLVM_TARGET_ARM64_TARGETOBJECTFILE_H -#define LLVM_TARGET_ARM64_TARGETOBJECTFILE_H - -#include "llvm/CodeGen/TargetLoweringObjectFileImpl.h" -#include "llvm/Target/TargetLoweringObjectFile.h" - -namespace llvm { -class ARM64TargetMachine; - -/// This implementation is used for AArch64 ELF targets (Linux in particular). -class ARM64_ELFTargetObjectFile : public TargetLoweringObjectFileELF { - void Initialize(MCContext &Ctx, const TargetMachine &TM) override; -}; - -/// ARM64_MachoTargetObjectFile - This TLOF implementation is used for Darwin. 
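// An assumed illustration of the indirect pc-relative encoding produced by
// getTTypeGlobalReference above: the EH table stores (GOT slot of the
// personality symbol) minus (address of the table entry), so the runtime
// recovers the slot with a single add.
#include <cstdint>

uint64_t recoverGOTSlot(uint64_t EntryAddress, int64_t StoredDelta) {
  return EntryAddress + StoredDelta; // StoredDelta was emitted as Sym@GOT - Ltmp
}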
-class ARM64_MachoTargetObjectFile : public TargetLoweringObjectFileMachO { -public: - const MCExpr *getTTypeGlobalReference(const GlobalValue *GV, - unsigned Encoding, Mangler &Mang, - const TargetMachine &TM, - MachineModuleInfo *MMI, - MCStreamer &Streamer) const override; - - MCSymbol *getCFIPersonalitySymbol(const GlobalValue *GV, Mangler &Mang, - const TargetMachine &TM, - MachineModuleInfo *MMI) const override; -}; - -} // end namespace llvm - -#endif diff --git a/lib/Target/ARM64/ARM64TargetTransformInfo.cpp b/lib/Target/ARM64/ARM64TargetTransformInfo.cpp deleted file mode 100644 index cc4cdff62b5..00000000000 --- a/lib/Target/ARM64/ARM64TargetTransformInfo.cpp +++ /dev/null @@ -1,463 +0,0 @@ -//===-- ARM64TargetTransformInfo.cpp - ARM64 specific TTI pass ------------===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -/// \file -/// This file implements a TargetTransformInfo analysis pass specific to the -/// ARM64 target machine. It uses the target's detailed information to provide -/// more precise answers to certain TTI queries, while letting the target -/// independent and default TTI implementations handle the rest. -/// -//===----------------------------------------------------------------------===// - -#include "ARM64.h" -#include "ARM64TargetMachine.h" -#include "MCTargetDesc/ARM64AddressingModes.h" -#include "llvm/Analysis/TargetTransformInfo.h" -#include "llvm/Support/Debug.h" -#include "llvm/Target/CostTable.h" -#include "llvm/Target/TargetLowering.h" -#include -using namespace llvm; - -#define DEBUG_TYPE "arm64tti" - -// Declare the pass initialization routine locally as target-specific passes -// don't have a target-wide initialization entry point, and so we rely on the -// pass constructor initialization. -namespace llvm { -void initializeARM64TTIPass(PassRegistry &); -} - -namespace { - -class ARM64TTI final : public ImmutablePass, public TargetTransformInfo { - const ARM64TargetMachine *TM; - const ARM64Subtarget *ST; - const ARM64TargetLowering *TLI; - - /// Estimate the overhead of scalarizing an instruction. Insert and Extract - /// are set if the result needs to be inserted and/or extracted from vectors. - unsigned getScalarizationOverhead(Type *Ty, bool Insert, bool Extract) const; - -public: - ARM64TTI() : ImmutablePass(ID), TM(nullptr), ST(nullptr), TLI(nullptr) { - llvm_unreachable("This pass cannot be directly constructed"); - } - - ARM64TTI(const ARM64TargetMachine *TM) - : ImmutablePass(ID), TM(TM), ST(TM->getSubtargetImpl()), - TLI(TM->getTargetLowering()) { - initializeARM64TTIPass(*PassRegistry::getPassRegistry()); - } - - void initializePass() override { pushTTIStack(this); } - - void getAnalysisUsage(AnalysisUsage &AU) const override { - TargetTransformInfo::getAnalysisUsage(AU); - } - - /// Pass identification. - static char ID; - - /// Provide necessary pointer adjustments for the two base classes. 
- void *getAdjustedAnalysisPointer(const void *ID) override { - if (ID == &TargetTransformInfo::ID) - return (TargetTransformInfo *)this; - return this; - } - - /// \name Scalar TTI Implementations - /// @{ - unsigned getIntImmCost(int64_t Val) const; - unsigned getIntImmCost(const APInt &Imm, Type *Ty) const override; - unsigned getIntImmCost(unsigned Opcode, unsigned Idx, const APInt &Imm, - Type *Ty) const override; - unsigned getIntImmCost(Intrinsic::ID IID, unsigned Idx, const APInt &Imm, - Type *Ty) const override; - PopcntSupportKind getPopcntSupport(unsigned TyWidth) const override; - - /// @} - - /// \name Vector TTI Implementations - /// @{ - - unsigned getNumberOfRegisters(bool Vector) const override { - if (Vector) { - if (ST->hasNEON()) - return 32; - return 0; - } - return 31; - } - - unsigned getRegisterBitWidth(bool Vector) const override { - if (Vector) { - if (ST->hasNEON()) - return 128; - return 0; - } - return 64; - } - - unsigned getMaximumUnrollFactor() const override { return 2; } - - unsigned getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src) const - override; - - unsigned getVectorInstrCost(unsigned Opcode, Type *Val, unsigned Index) const - override; - - unsigned getArithmeticInstrCost(unsigned Opcode, Type *Ty, - OperandValueKind Opd1Info = OK_AnyValue, - OperandValueKind Opd2Info = OK_AnyValue) const - override; - - unsigned getAddressComputationCost(Type *Ty, bool IsComplex) const override; - - unsigned getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy) const - override; - - unsigned getMemoryOpCost(unsigned Opcode, Type *Src, unsigned Alignment, - unsigned AddressSpace) const override; - /// @} -}; - -} // end anonymous namespace - -INITIALIZE_AG_PASS(ARM64TTI, TargetTransformInfo, "arm64tti", - "ARM64 Target Transform Info", true, true, false) -char ARM64TTI::ID = 0; - -ImmutablePass * -llvm::createARM64TargetTransformInfoPass(const ARM64TargetMachine *TM) { - return new ARM64TTI(TM); -} - -/// \brief Calculate the cost of materializing a 64-bit value. This helper -/// method might only calculate a fraction of a larger immediate. Therefore it -/// is valid to return a cost of ZERO. -unsigned ARM64TTI::getIntImmCost(int64_t Val) const { - // Check if the immediate can be encoded within an instruction. - if (Val == 0 || ARM64_AM::isLogicalImmediate(Val, 64)) - return 0; - - if (Val < 0) - Val = ~Val; - - // Calculate how many moves we will need to materialize this constant. - unsigned LZ = countLeadingZeros((uint64_t)Val); - return (64 - LZ + 15) / 16; -} - -/// \brief Calculate the cost of materializing the given constant. -unsigned ARM64TTI::getIntImmCost(const APInt &Imm, Type *Ty) const { - assert(Ty->isIntegerTy()); - - unsigned BitSize = Ty->getPrimitiveSizeInBits(); - if (BitSize == 0) - return ~0U; - - // Sign-extend all constants to a multiple of 64-bit. - APInt ImmVal = Imm; - if (BitSize & 0x3f) - ImmVal = Imm.sext((BitSize + 63) & ~0x3fU); - - // Split the constant into 64-bit chunks and calculate the cost for each - // chunk. - unsigned Cost = 0; - for (unsigned ShiftVal = 0; ShiftVal < BitSize; ShiftVal += 64) { - APInt Tmp = ImmVal.ashr(ShiftVal).sextOrTrunc(64); - int64_t Val = Tmp.getSExtValue(); - Cost += getIntImmCost(Val); - } - // We need at least one instruction to materialze the constant. 
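// A worked, standalone version (an assumed simplification that omits the
// logical-immediate check) of the per-chunk cost above: one MOVZ/MOVN plus
// MOVK per non-trivial 16-bit chunk of the, possibly inverted, value.
#include <cstdint>

unsigned movMaterializeCost(int64_t Val) {
  if (Val == 0)
    return 0;
  uint64_t V = Val < 0 ? ~static_cast<uint64_t>(Val)
                       : static_cast<uint64_t>(Val);
  unsigned LZ = 0; // count leading zeros without relying on compiler builtins
  for (uint64_t Bit = 1ULL << 63; Bit && !(V & Bit); Bit >>= 1)
    ++LZ;
  return (64 - LZ + 15) / 16; // e.g. 0x1234 -> 1 instruction, 0x12345678 -> 2
}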
- return std::max(1U, Cost); -} - -unsigned ARM64TTI::getIntImmCost(unsigned Opcode, unsigned Idx, - const APInt &Imm, Type *Ty) const { - assert(Ty->isIntegerTy()); - - unsigned BitSize = Ty->getPrimitiveSizeInBits(); - // There is no cost model for constants with a bit size of 0. Return TCC_Free - // here, so that constant hoisting will ignore this constant. - if (BitSize == 0) - return TCC_Free; - - unsigned ImmIdx = ~0U; - switch (Opcode) { - default: - return TCC_Free; - case Instruction::GetElementPtr: - // Always hoist the base address of a GetElementPtr. - if (Idx == 0) - return 2 * TCC_Basic; - return TCC_Free; - case Instruction::Store: - ImmIdx = 0; - break; - case Instruction::Add: - case Instruction::Sub: - case Instruction::Mul: - case Instruction::UDiv: - case Instruction::SDiv: - case Instruction::URem: - case Instruction::SRem: - case Instruction::And: - case Instruction::Or: - case Instruction::Xor: - case Instruction::ICmp: - ImmIdx = 1; - break; - // Always return TCC_Free for the shift value of a shift instruction. - case Instruction::Shl: - case Instruction::LShr: - case Instruction::AShr: - if (Idx == 1) - return TCC_Free; - break; - case Instruction::Trunc: - case Instruction::ZExt: - case Instruction::SExt: - case Instruction::IntToPtr: - case Instruction::PtrToInt: - case Instruction::BitCast: - case Instruction::PHI: - case Instruction::Call: - case Instruction::Select: - case Instruction::Ret: - case Instruction::Load: - break; - } - - if (Idx == ImmIdx) { - unsigned NumConstants = (BitSize + 63) / 64; - unsigned Cost = ARM64TTI::getIntImmCost(Imm, Ty); - return (Cost <= NumConstants * TCC_Basic) - ? static_cast(TCC_Free) : Cost; - } - return ARM64TTI::getIntImmCost(Imm, Ty); -} - -unsigned ARM64TTI::getIntImmCost(Intrinsic::ID IID, unsigned Idx, - const APInt &Imm, Type *Ty) const { - assert(Ty->isIntegerTy()); - - unsigned BitSize = Ty->getPrimitiveSizeInBits(); - // There is no cost model for constants with a bit size of 0. Return TCC_Free - // here, so that constant hoisting will ignore this constant. - if (BitSize == 0) - return TCC_Free; - - switch (IID) { - default: - return TCC_Free; - case Intrinsic::sadd_with_overflow: - case Intrinsic::uadd_with_overflow: - case Intrinsic::ssub_with_overflow: - case Intrinsic::usub_with_overflow: - case Intrinsic::smul_with_overflow: - case Intrinsic::umul_with_overflow: - if (Idx == 1) { - unsigned NumConstants = (BitSize + 63) / 64; - unsigned Cost = ARM64TTI::getIntImmCost(Imm, Ty); - return (Cost <= NumConstants * TCC_Basic) - ? static_cast(TCC_Free) : Cost; - } - break; - case Intrinsic::experimental_stackmap: - if ((Idx < 2) || (Imm.getBitWidth() <= 64 && isInt<64>(Imm.getSExtValue()))) - return TCC_Free; - break; - case Intrinsic::experimental_patchpoint_void: - case Intrinsic::experimental_patchpoint_i64: - if ((Idx < 4) || (Imm.getBitWidth() <= 64 && isInt<64>(Imm.getSExtValue()))) - return TCC_Free; - break; - } - return ARM64TTI::getIntImmCost(Imm, Ty); -} - -ARM64TTI::PopcntSupportKind ARM64TTI::getPopcntSupport(unsigned TyWidth) const { - assert(isPowerOf2_32(TyWidth) && "Ty width must be power of 2"); - if (TyWidth == 32 || TyWidth == 64) - return PSK_FastHardware; - // TODO: ARM64TargetLowering::LowerCTPOP() supports 128bit popcount. 
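// A worked restatement (assumed; TCC_Basic and TCC_Free carry their usual
// LLVM values of 1 and 0) of the hoisting decision in getIntImmCost above:
// an immediate operand is reported as free while its materialization cost
// stays within one basic instruction per 64-bit chunk, otherwise the real
// cost is returned so constant hoisting will pull it out of the hot code.
unsigned immCostForHoisting(unsigned BitSize, unsigned MaterializeCost) {
  const unsigned TCC_Free = 0, TCC_Basic = 1;
  unsigned NumChunks = (BitSize + 63) / 64;
  return MaterializeCost <= NumChunks * TCC_Basic ? TCC_Free : MaterializeCost;
}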
- return PSK_Software; -} - -unsigned ARM64TTI::getCastInstrCost(unsigned Opcode, Type *Dst, - Type *Src) const { - int ISD = TLI->InstructionOpcodeToISD(Opcode); - assert(ISD && "Invalid opcode"); - - EVT SrcTy = TLI->getValueType(Src); - EVT DstTy = TLI->getValueType(Dst); - - if (!SrcTy.isSimple() || !DstTy.isSimple()) - return TargetTransformInfo::getCastInstrCost(Opcode, Dst, Src); - - static const TypeConversionCostTblEntry ConversionTbl[] = { - // LowerVectorINT_TO_FP: - { ISD::SINT_TO_FP, MVT::v2f32, MVT::v2i32, 1 }, - { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i8, 1 }, - { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i16, 1 }, - { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i32, 1 }, - { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i64, 1 }, - { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i32, 1 }, - { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i8, 1 }, - { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i16, 1 }, - { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i32, 1 }, - { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i64, 1 }, - // LowerVectorFP_TO_INT - { ISD::FP_TO_SINT, MVT::v4i32, MVT::v4f32, 1 }, - { ISD::FP_TO_SINT, MVT::v2i64, MVT::v2f64, 1 }, - { ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f32, 1 }, - { ISD::FP_TO_UINT, MVT::v2i64, MVT::v2f64, 1 }, - { ISD::FP_TO_UINT, MVT::v2i32, MVT::v2f64, 1 }, - { ISD::FP_TO_SINT, MVT::v2i32, MVT::v2f64, 1 }, - { ISD::FP_TO_UINT, MVT::v2i64, MVT::v2f32, 4 }, - { ISD::FP_TO_SINT, MVT::v2i64, MVT::v2f32, 4 }, - { ISD::FP_TO_UINT, MVT::v4i16, MVT::v4f32, 4 }, - { ISD::FP_TO_SINT, MVT::v4i16, MVT::v4f32, 4 }, - { ISD::FP_TO_UINT, MVT::v2i64, MVT::v2f64, 4 }, - { ISD::FP_TO_SINT, MVT::v2i64, MVT::v2f64, 4 }, - }; - - int Idx = ConvertCostTableLookup( - ConversionTbl, array_lengthof(ConversionTbl), ISD, DstTy.getSimpleVT(), - SrcTy.getSimpleVT()); - if (Idx != -1) - return ConversionTbl[Idx].Cost; - - return TargetTransformInfo::getCastInstrCost(Opcode, Dst, Src); -} - -unsigned ARM64TTI::getVectorInstrCost(unsigned Opcode, Type *Val, - unsigned Index) const { - assert(Val->isVectorTy() && "This must be a vector type"); - - if (Index != -1U) { - // Legalize the type. - std::pair LT = TLI->getTypeLegalizationCost(Val); - - // This type is legalized to a scalar type. - if (!LT.second.isVector()) - return 0; - - // The type may be split. Normalize the index to the new type. - unsigned Width = LT.second.getVectorNumElements(); - Index = Index % Width; - - // The element at index zero is already inside the vector. - if (Index == 0) - return 0; - } - - // All other insert/extracts cost this much. - return 2; -} - -unsigned ARM64TTI::getArithmeticInstrCost(unsigned Opcode, Type *Ty, - OperandValueKind Opd1Info, - OperandValueKind Opd2Info) const { - // Legalize the type. - std::pair LT = TLI->getTypeLegalizationCost(Ty); - - int ISD = TLI->InstructionOpcodeToISD(Opcode); - - switch (ISD) { - default: - return TargetTransformInfo::getArithmeticInstrCost(Opcode, Ty, Opd1Info, - Opd2Info); - case ISD::ADD: - case ISD::MUL: - case ISD::XOR: - case ISD::OR: - case ISD::AND: - // These nodes are marked as 'custom' for combining purposes only. - // We know that they are legal. See LowerAdd in ISelLowering. - return 1 * LT.first; - } -} - -unsigned ARM64TTI::getAddressComputationCost(Type *Ty, bool IsComplex) const { - // Address computations in vectorized code with non-consecutive addresses will - // likely result in more instructions compared to scalar code where the - // computation can more often be merged into the index mode. The resulting - // extra micro-ops can significantly decrease throughput. 
- unsigned NumVectorInstToHideOverhead = 10; - - if (Ty->isVectorTy() && IsComplex) - return NumVectorInstToHideOverhead; - - // In many cases the address computation is not merged into the instruction - // addressing mode. - return 1; -} - -unsigned ARM64TTI::getCmpSelInstrCost(unsigned Opcode, Type *ValTy, - Type *CondTy) const { - - int ISD = TLI->InstructionOpcodeToISD(Opcode); - // We don't lower vector selects well that are wider than the register width. - if (ValTy->isVectorTy() && ISD == ISD::SELECT) { - // We would need this many instructions to hide the scalarization happening. - unsigned AmortizationCost = 20; - static const TypeConversionCostTblEntry - VectorSelectTbl[] = { - { ISD::SELECT, MVT::v16i1, MVT::v16i16, 16 * AmortizationCost }, - { ISD::SELECT, MVT::v8i1, MVT::v8i32, 8 * AmortizationCost }, - { ISD::SELECT, MVT::v16i1, MVT::v16i32, 16 * AmortizationCost }, - { ISD::SELECT, MVT::v4i1, MVT::v4i64, 4 * AmortizationCost }, - { ISD::SELECT, MVT::v8i1, MVT::v8i64, 8 * AmortizationCost }, - { ISD::SELECT, MVT::v16i1, MVT::v16i64, 16 * AmortizationCost } - }; - - EVT SelCondTy = TLI->getValueType(CondTy); - EVT SelValTy = TLI->getValueType(ValTy); - if (SelCondTy.isSimple() && SelValTy.isSimple()) { - int Idx = - ConvertCostTableLookup(VectorSelectTbl, ISD, SelCondTy.getSimpleVT(), - SelValTy.getSimpleVT()); - if (Idx != -1) - return VectorSelectTbl[Idx].Cost; - } - } - return TargetTransformInfo::getCmpSelInstrCost(Opcode, ValTy, CondTy); -} - -unsigned ARM64TTI::getMemoryOpCost(unsigned Opcode, Type *Src, - unsigned Alignment, - unsigned AddressSpace) const { - std::pair LT = TLI->getTypeLegalizationCost(Src); - - if (Opcode == Instruction::Store && Src->isVectorTy() && Alignment != 16 && - Src->getVectorElementType()->isIntegerTy(64)) { - // Unaligned stores are extremely inefficient. We don't split - // unaligned v2i64 stores because the negative impact that has shown in - // practice on inlined memcpy code. - // We make v2i64 stores expensive so that we will only vectorize if there - // are 6 other instructions getting vectorized. - unsigned AmortizationCost = 6; - - return LT.first * 2 * AmortizationCost; - } - - if (Src->isVectorTy() && Src->getVectorElementType()->isIntegerTy(8) && - Src->getVectorNumElements() < 8) { - // We scalarize the loads/stores because there is not v.4b register and we - // have to promote the elements to v.4h. - unsigned NumVecElts = Src->getVectorNumElements(); - unsigned NumVectorizableInstsToAmortize = NumVecElts * 2; - // We generate 2 instructions per vector element. - return NumVectorizableInstsToAmortize * NumVecElts * 2; - } - - return LT.first; -} diff --git a/lib/Target/ARM64/AsmParser/ARM64AsmParser.cpp b/lib/Target/ARM64/AsmParser/ARM64AsmParser.cpp deleted file mode 100644 index 4d710db1d93..00000000000 --- a/lib/Target/ARM64/AsmParser/ARM64AsmParser.cpp +++ /dev/null @@ -1,4030 +0,0 @@ -//===-- ARM64AsmParser.cpp - Parse ARM64 assembly to MCInst instructions --===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. 
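// A standalone restatement (assumed) of the store-cost special cases in
// getMemoryOpCost above: unaligned v2i64 stores and sub-8-element i8 vectors
// are priced far above a normal legalized store so the vectorizers only use
// them when the surrounding code amortizes the penalty.
unsigned storeCost(unsigned LegalizedParts, bool UnalignedV2i64Store,
                   unsigned NumI8Elts) {
  if (UnalignedV2i64Store)
    return LegalizedParts * 2 * 6;            // 6-instruction amortization factor
  if (NumI8Elts > 0 && NumI8Elts < 8)
    return (NumI8Elts * 2) * NumI8Elts * 2;   // scalarized via promotion to v4h
  return LegalizedParts;                      // the common, already-legal case
}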
-// -//===----------------------------------------------------------------------===// - -#include "MCTargetDesc/ARM64AddressingModes.h" -#include "MCTargetDesc/ARM64MCExpr.h" -#include "Utils/ARM64BaseInfo.h" -#include "llvm/MC/MCParser/MCAsmLexer.h" -#include "llvm/MC/MCParser/MCAsmParser.h" -#include "llvm/MC/MCParser/MCParsedAsmOperand.h" -#include "llvm/MC/MCContext.h" -#include "llvm/MC/MCExpr.h" -#include "llvm/MC/MCInst.h" -#include "llvm/MC/MCRegisterInfo.h" -#include "llvm/MC/MCStreamer.h" -#include "llvm/MC/MCSubtargetInfo.h" -#include "llvm/MC/MCSymbol.h" -#include "llvm/MC/MCTargetAsmParser.h" -#include "llvm/Support/SourceMgr.h" -#include "llvm/Support/TargetRegistry.h" -#include "llvm/Support/ErrorHandling.h" -#include "llvm/Support/raw_ostream.h" -#include "llvm/ADT/SmallString.h" -#include "llvm/ADT/SmallVector.h" -#include "llvm/ADT/STLExtras.h" -#include "llvm/ADT/StringSwitch.h" -#include "llvm/ADT/Twine.h" -#include -using namespace llvm; - -namespace { - -class ARM64Operand; - -class ARM64AsmParser : public MCTargetAsmParser { -public: - typedef SmallVectorImpl OperandVector; - -private: - StringRef Mnemonic; ///< Instruction mnemonic. - MCSubtargetInfo &STI; - MCAsmParser &Parser; - - MCAsmParser &getParser() const { return Parser; } - MCAsmLexer &getLexer() const { return Parser.getLexer(); } - - SMLoc getLoc() const { return Parser.getTok().getLoc(); } - - bool parseSysAlias(StringRef Name, SMLoc NameLoc, OperandVector &Operands); - ARM64CC::CondCode parseCondCodeString(StringRef Cond); - bool parseCondCode(OperandVector &Operands, bool invertCondCode); - int tryParseRegister(); - int tryMatchVectorRegister(StringRef &Kind, bool expected); - bool parseRegister(OperandVector &Operands); - bool parseSymbolicImmVal(const MCExpr *&ImmVal); - bool parseVectorList(OperandVector &Operands); - bool parseOperand(OperandVector &Operands, bool isCondCode, - bool invertCondCode); - - void Warning(SMLoc L, const Twine &Msg) { Parser.Warning(L, Msg); } - bool Error(SMLoc L, const Twine &Msg) { return Parser.Error(L, Msg); } - bool showMatchError(SMLoc Loc, unsigned ErrCode); - - bool parseDirectiveWord(unsigned Size, SMLoc L); - bool parseDirectiveTLSDescCall(SMLoc L); - - bool parseDirectiveLOH(StringRef LOH, SMLoc L); - - bool validateInstruction(MCInst &Inst, SmallVectorImpl &Loc); - bool MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode, - OperandVector &Operands, MCStreamer &Out, - unsigned &ErrorInfo, - bool MatchingInlineAsm) override; -/// @name Auto-generated Match Functions -/// { - -#define GET_ASSEMBLER_HEADER -#include "ARM64GenAsmMatcher.inc" - - /// } - - OperandMatchResultTy tryParseOptionalShiftExtend(OperandVector &Operands); - OperandMatchResultTy tryParseBarrierOperand(OperandVector &Operands); - OperandMatchResultTy tryParseMRSSystemRegister(OperandVector &Operands); - OperandMatchResultTy tryParseSysReg(OperandVector &Operands); - OperandMatchResultTy tryParseSysCROperand(OperandVector &Operands); - OperandMatchResultTy tryParsePrefetch(OperandVector &Operands); - OperandMatchResultTy tryParseAdrpLabel(OperandVector &Operands); - OperandMatchResultTy tryParseAdrLabel(OperandVector &Operands); - OperandMatchResultTy tryParseFPImm(OperandVector &Operands); - OperandMatchResultTy tryParseAddSubImm(OperandVector &Operands); - OperandMatchResultTy tryParseGPR64sp0Operand(OperandVector &Operands); - bool tryParseVectorRegister(OperandVector &Operands); - -public: - enum ARM64MatchResultTy { - Match_InvalidSuffix = FIRST_TARGET_MATCH_RESULT_TY, -#define 
GET_OPERAND_DIAGNOSTIC_TYPES -#include "ARM64GenAsmMatcher.inc" - }; - ARM64AsmParser(MCSubtargetInfo &_STI, MCAsmParser &_Parser, - const MCInstrInfo &MII, - const MCTargetOptions &Options) - : MCTargetAsmParser(), STI(_STI), Parser(_Parser) { - MCAsmParserExtension::Initialize(_Parser); - - // Initialize the set of available features. - setAvailableFeatures(ComputeAvailableFeatures(STI.getFeatureBits())); - } - - bool ParseInstruction(ParseInstructionInfo &Info, StringRef Name, - SMLoc NameLoc, OperandVector &Operands) override; - bool ParseRegister(unsigned &RegNo, SMLoc &StartLoc, SMLoc &EndLoc) override; - bool ParseDirective(AsmToken DirectiveID) override; - unsigned validateTargetOperandClass(MCParsedAsmOperand *Op, - unsigned Kind) override; - - static bool classifySymbolRef(const MCExpr *Expr, - ARM64MCExpr::VariantKind &ELFRefKind, - MCSymbolRefExpr::VariantKind &DarwinRefKind, - int64_t &Addend); -}; -} // end anonymous namespace - -namespace { - -/// ARM64Operand - Instances of this class represent a parsed ARM64 machine -/// instruction. -class ARM64Operand : public MCParsedAsmOperand { -private: - enum KindTy { - k_Immediate, - k_ShiftedImm, - k_CondCode, - k_Register, - k_VectorList, - k_VectorIndex, - k_Token, - k_SysReg, - k_SysCR, - k_Prefetch, - k_ShiftExtend, - k_FPImm, - k_Barrier - } Kind; - - SMLoc StartLoc, EndLoc; - - struct TokOp { - const char *Data; - unsigned Length; - bool IsSuffix; // Is the operand actually a suffix on the mnemonic. - }; - - struct RegOp { - unsigned RegNum; - bool isVector; - }; - - struct VectorListOp { - unsigned RegNum; - unsigned Count; - unsigned NumElements; - unsigned ElementKind; - }; - - struct VectorIndexOp { - unsigned Val; - }; - - struct ImmOp { - const MCExpr *Val; - }; - - struct ShiftedImmOp { - const MCExpr *Val; - unsigned ShiftAmount; - }; - - struct CondCodeOp { - ARM64CC::CondCode Code; - }; - - struct FPImmOp { - unsigned Val; // Encoded 8-bit representation. - }; - - struct BarrierOp { - unsigned Val; // Not the enum since not all values have names. - }; - - struct SysRegOp { - const char *Data; - unsigned Length; - uint64_t FeatureBits; // We need to pass through information about which - // core we are compiling for so that the SysReg - // Mappers can appropriately conditionalize. - }; - - struct SysCRImmOp { - unsigned Val; - }; - - struct PrefetchOp { - unsigned Val; - }; - - struct ShiftExtendOp { - ARM64_AM::ShiftExtendType Type; - unsigned Amount; - bool HasExplicitAmount; - }; - - struct ExtendOp { - unsigned Val; - }; - - union { - struct TokOp Tok; - struct RegOp Reg; - struct VectorListOp VectorList; - struct VectorIndexOp VectorIndex; - struct ImmOp Imm; - struct ShiftedImmOp ShiftedImm; - struct CondCodeOp CondCode; - struct FPImmOp FPImm; - struct BarrierOp Barrier; - struct SysRegOp SysReg; - struct SysCRImmOp SysCRImm; - struct PrefetchOp Prefetch; - struct ShiftExtendOp ShiftExtend; - }; - - // Keep the MCContext around as the MCExprs may need manipulated during - // the add<>Operands() calls. 
- MCContext &Ctx; - - ARM64Operand(KindTy K, MCContext &_Ctx) - : MCParsedAsmOperand(), Kind(K), Ctx(_Ctx) {} - -public: - ARM64Operand(const ARM64Operand &o) : MCParsedAsmOperand(), Ctx(o.Ctx) { - Kind = o.Kind; - StartLoc = o.StartLoc; - EndLoc = o.EndLoc; - switch (Kind) { - case k_Token: - Tok = o.Tok; - break; - case k_Immediate: - Imm = o.Imm; - break; - case k_ShiftedImm: - ShiftedImm = o.ShiftedImm; - break; - case k_CondCode: - CondCode = o.CondCode; - break; - case k_FPImm: - FPImm = o.FPImm; - break; - case k_Barrier: - Barrier = o.Barrier; - break; - case k_Register: - Reg = o.Reg; - break; - case k_VectorList: - VectorList = o.VectorList; - break; - case k_VectorIndex: - VectorIndex = o.VectorIndex; - break; - case k_SysReg: - SysReg = o.SysReg; - break; - case k_SysCR: - SysCRImm = o.SysCRImm; - break; - case k_Prefetch: - Prefetch = o.Prefetch; - break; - case k_ShiftExtend: - ShiftExtend = o.ShiftExtend; - break; - } - } - - /// getStartLoc - Get the location of the first token of this operand. - SMLoc getStartLoc() const override { return StartLoc; } - /// getEndLoc - Get the location of the last token of this operand. - SMLoc getEndLoc() const override { return EndLoc; } - - StringRef getToken() const { - assert(Kind == k_Token && "Invalid access!"); - return StringRef(Tok.Data, Tok.Length); - } - - bool isTokenSuffix() const { - assert(Kind == k_Token && "Invalid access!"); - return Tok.IsSuffix; - } - - const MCExpr *getImm() const { - assert(Kind == k_Immediate && "Invalid access!"); - return Imm.Val; - } - - const MCExpr *getShiftedImmVal() const { - assert(Kind == k_ShiftedImm && "Invalid access!"); - return ShiftedImm.Val; - } - - unsigned getShiftedImmShift() const { - assert(Kind == k_ShiftedImm && "Invalid access!"); - return ShiftedImm.ShiftAmount; - } - - ARM64CC::CondCode getCondCode() const { - assert(Kind == k_CondCode && "Invalid access!"); - return CondCode.Code; - } - - unsigned getFPImm() const { - assert(Kind == k_FPImm && "Invalid access!"); - return FPImm.Val; - } - - unsigned getBarrier() const { - assert(Kind == k_Barrier && "Invalid access!"); - return Barrier.Val; - } - - unsigned getReg() const override { - assert(Kind == k_Register && "Invalid access!"); - return Reg.RegNum; - } - - unsigned getVectorListStart() const { - assert(Kind == k_VectorList && "Invalid access!"); - return VectorList.RegNum; - } - - unsigned getVectorListCount() const { - assert(Kind == k_VectorList && "Invalid access!"); - return VectorList.Count; - } - - unsigned getVectorIndex() const { - assert(Kind == k_VectorIndex && "Invalid access!"); - return VectorIndex.Val; - } - - StringRef getSysReg() const { - assert(Kind == k_SysReg && "Invalid access!"); - return StringRef(SysReg.Data, SysReg.Length); - } - - uint64_t getSysRegFeatureBits() const { - assert(Kind == k_SysReg && "Invalid access!"); - return SysReg.FeatureBits; - } - - unsigned getSysCR() const { - assert(Kind == k_SysCR && "Invalid access!"); - return SysCRImm.Val; - } - - unsigned getPrefetch() const { - assert(Kind == k_Prefetch && "Invalid access!"); - return Prefetch.Val; - } - - ARM64_AM::ShiftExtendType getShiftExtendType() const { - assert(Kind == k_ShiftExtend && "Invalid access!"); - return ShiftExtend.Type; - } - - unsigned getShiftExtendAmount() const { - assert(Kind == k_ShiftExtend && "Invalid access!"); - return ShiftExtend.Amount; - } - - bool hasShiftExtendAmount() const { - assert(Kind == k_ShiftExtend && "Invalid access!"); - return ShiftExtend.HasExplicitAmount; - } - - bool isImm() const 
override { return Kind == k_Immediate; } - bool isMem() const override { return false; } - bool isSImm9() const { - if (!isImm()) - return false; - const MCConstantExpr *MCE = dyn_cast(getImm()); - if (!MCE) - return false; - int64_t Val = MCE->getValue(); - return (Val >= -256 && Val < 256); - } - bool isSImm7s4() const { - if (!isImm()) - return false; - const MCConstantExpr *MCE = dyn_cast(getImm()); - if (!MCE) - return false; - int64_t Val = MCE->getValue(); - return (Val >= -256 && Val <= 252 && (Val & 3) == 0); - } - bool isSImm7s8() const { - if (!isImm()) - return false; - const MCConstantExpr *MCE = dyn_cast(getImm()); - if (!MCE) - return false; - int64_t Val = MCE->getValue(); - return (Val >= -512 && Val <= 504 && (Val & 7) == 0); - } - bool isSImm7s16() const { - if (!isImm()) - return false; - const MCConstantExpr *MCE = dyn_cast(getImm()); - if (!MCE) - return false; - int64_t Val = MCE->getValue(); - return (Val >= -1024 && Val <= 1008 && (Val & 15) == 0); - } - - bool isSymbolicUImm12Offset(const MCExpr *Expr, unsigned Scale) const { - ARM64MCExpr::VariantKind ELFRefKind; - MCSymbolRefExpr::VariantKind DarwinRefKind; - int64_t Addend; - if (!ARM64AsmParser::classifySymbolRef(Expr, ELFRefKind, DarwinRefKind, - Addend)) { - // If we don't understand the expression, assume the best and - // let the fixup and relocation code deal with it. - return true; - } - - if (DarwinRefKind == MCSymbolRefExpr::VK_PAGEOFF || - ELFRefKind == ARM64MCExpr::VK_LO12 || - ELFRefKind == ARM64MCExpr::VK_GOT_LO12 || - ELFRefKind == ARM64MCExpr::VK_DTPREL_LO12 || - ELFRefKind == ARM64MCExpr::VK_DTPREL_LO12_NC || - ELFRefKind == ARM64MCExpr::VK_TPREL_LO12 || - ELFRefKind == ARM64MCExpr::VK_TPREL_LO12_NC || - ELFRefKind == ARM64MCExpr::VK_GOTTPREL_LO12_NC || - ELFRefKind == ARM64MCExpr::VK_TLSDESC_LO12) { - // Note that we don't range-check the addend. It's adjusted modulo page - // size when converted, so there is no "out of range" condition when using - // @pageoff. - return Addend >= 0 && (Addend % Scale) == 0; - } else if (DarwinRefKind == MCSymbolRefExpr::VK_GOTPAGEOFF || - DarwinRefKind == MCSymbolRefExpr::VK_TLVPPAGEOFF) { - // @gotpageoff/@tlvppageoff can only be used directly, not with an addend. 
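The immediate predicates above all follow the same pattern: a signed, scaled 7-bit offset for the load/store-pair forms (isSImm7s4/s8/s16) and an unsigned, scaled 12-bit offset for plain loads and stores (isUImm12Offset). A standalone sketch of those two range checks follows; the helper names are invented for illustration and are not taken from the patch.

#include <cstdint>
#include <cstdio>

// LDP/STP-style offset: a multiple of Scale in [-64*Scale, 63*Scale],
// e.g. [-512, 504] in steps of 8 for the 8-byte form, as in isSImm7s8 above.
static bool fitsSImm7Scaled(int64_t Val, int64_t Scale) {
  return Val % Scale == 0 && Val >= -64 * Scale && Val <= 63 * Scale;
}

// LDR/STR unsigned offset: a multiple of Scale with Val / Scale in [0, 4095],
// as in the isUImm12Offset template above.
static bool fitsUImm12Scaled(int64_t Val, int64_t Scale) {
  return Val % Scale == 0 && Val >= 0 && Val / Scale < 0x1000;
}

int main() {
  std::printf("%d %d\n", fitsSImm7Scaled(504, 8), fitsSImm7Scaled(512, 8));   // 1 0
  std::printf("%d %d\n", fitsUImm12Scaled(32760, 8), fitsUImm12Scaled(4, 8)); // 1 0
}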
- return Addend == 0; - } - - return false; - } - - template bool isUImm12Offset() const { - if (!isImm()) - return false; - - const MCConstantExpr *MCE = dyn_cast(getImm()); - if (!MCE) - return isSymbolicUImm12Offset(getImm(), Scale); - - int64_t Val = MCE->getValue(); - return (Val % Scale) == 0 && Val >= 0 && (Val / Scale) < 0x1000; - } - - bool isImm0_7() const { - if (!isImm()) - return false; - const MCConstantExpr *MCE = dyn_cast(getImm()); - if (!MCE) - return false; - int64_t Val = MCE->getValue(); - return (Val >= 0 && Val < 8); - } - bool isImm1_8() const { - if (!isImm()) - return false; - const MCConstantExpr *MCE = dyn_cast(getImm()); - if (!MCE) - return false; - int64_t Val = MCE->getValue(); - return (Val > 0 && Val < 9); - } - bool isImm0_15() const { - if (!isImm()) - return false; - const MCConstantExpr *MCE = dyn_cast(getImm()); - if (!MCE) - return false; - int64_t Val = MCE->getValue(); - return (Val >= 0 && Val < 16); - } - bool isImm1_16() const { - if (!isImm()) - return false; - const MCConstantExpr *MCE = dyn_cast(getImm()); - if (!MCE) - return false; - int64_t Val = MCE->getValue(); - return (Val > 0 && Val < 17); - } - bool isImm0_31() const { - if (!isImm()) - return false; - const MCConstantExpr *MCE = dyn_cast(getImm()); - if (!MCE) - return false; - int64_t Val = MCE->getValue(); - return (Val >= 0 && Val < 32); - } - bool isImm1_31() const { - if (!isImm()) - return false; - const MCConstantExpr *MCE = dyn_cast(getImm()); - if (!MCE) - return false; - int64_t Val = MCE->getValue(); - return (Val >= 1 && Val < 32); - } - bool isImm1_32() const { - if (!isImm()) - return false; - const MCConstantExpr *MCE = dyn_cast(getImm()); - if (!MCE) - return false; - int64_t Val = MCE->getValue(); - return (Val >= 1 && Val < 33); - } - bool isImm0_63() const { - if (!isImm()) - return false; - const MCConstantExpr *MCE = dyn_cast(getImm()); - if (!MCE) - return false; - int64_t Val = MCE->getValue(); - return (Val >= 0 && Val < 64); - } - bool isImm1_63() const { - if (!isImm()) - return false; - const MCConstantExpr *MCE = dyn_cast(getImm()); - if (!MCE) - return false; - int64_t Val = MCE->getValue(); - return (Val >= 1 && Val < 64); - } - bool isImm1_64() const { - if (!isImm()) - return false; - const MCConstantExpr *MCE = dyn_cast(getImm()); - if (!MCE) - return false; - int64_t Val = MCE->getValue(); - return (Val >= 1 && Val < 65); - } - bool isImm0_127() const { - if (!isImm()) - return false; - const MCConstantExpr *MCE = dyn_cast(getImm()); - if (!MCE) - return false; - int64_t Val = MCE->getValue(); - return (Val >= 0 && Val < 128); - } - bool isImm0_255() const { - if (!isImm()) - return false; - const MCConstantExpr *MCE = dyn_cast(getImm()); - if (!MCE) - return false; - int64_t Val = MCE->getValue(); - return (Val >= 0 && Val < 256); - } - bool isImm0_65535() const { - if (!isImm()) - return false; - const MCConstantExpr *MCE = dyn_cast(getImm()); - if (!MCE) - return false; - int64_t Val = MCE->getValue(); - return (Val >= 0 && Val < 65536); - } - bool isImm32_63() const { - if (!isImm()) - return false; - const MCConstantExpr *MCE = dyn_cast(getImm()); - if (!MCE) - return false; - int64_t Val = MCE->getValue(); - return (Val >= 32 && Val < 64); - } - bool isLogicalImm32() const { - if (!isImm()) - return false; - const MCConstantExpr *MCE = dyn_cast(getImm()); - if (!MCE) - return false; - return ARM64_AM::isLogicalImmediate(MCE->getValue(), 32); - } - bool isLogicalImm64() const { - if (!isImm()) - return false; - const MCConstantExpr *MCE = 
dyn_cast(getImm()); - if (!MCE) - return false; - return ARM64_AM::isLogicalImmediate(MCE->getValue(), 64); - } - bool isShiftedImm() const { return Kind == k_ShiftedImm; } - bool isAddSubImm() const { - if (!isShiftedImm() && !isImm()) - return false; - - const MCExpr *Expr; - - // An ADD/SUB shifter is either 'lsl #0' or 'lsl #12'. - if (isShiftedImm()) { - unsigned Shift = ShiftedImm.ShiftAmount; - Expr = ShiftedImm.Val; - if (Shift != 0 && Shift != 12) - return false; - } else { - Expr = getImm(); - } - - ARM64MCExpr::VariantKind ELFRefKind; - MCSymbolRefExpr::VariantKind DarwinRefKind; - int64_t Addend; - if (ARM64AsmParser::classifySymbolRef(Expr, ELFRefKind, - DarwinRefKind, Addend)) { - return DarwinRefKind == MCSymbolRefExpr::VK_PAGEOFF - || DarwinRefKind == MCSymbolRefExpr::VK_TLVPPAGEOFF - || (DarwinRefKind == MCSymbolRefExpr::VK_GOTPAGEOFF && Addend == 0) - || ELFRefKind == ARM64MCExpr::VK_LO12 - || ELFRefKind == ARM64MCExpr::VK_DTPREL_HI12 - || ELFRefKind == ARM64MCExpr::VK_DTPREL_LO12 - || ELFRefKind == ARM64MCExpr::VK_DTPREL_LO12_NC - || ELFRefKind == ARM64MCExpr::VK_TPREL_HI12 - || ELFRefKind == ARM64MCExpr::VK_TPREL_LO12 - || ELFRefKind == ARM64MCExpr::VK_TPREL_LO12_NC - || ELFRefKind == ARM64MCExpr::VK_TLSDESC_LO12; - } - - // Otherwise it should be a real immediate in range: - const MCConstantExpr *CE = cast(Expr); - return CE->getValue() >= 0 && CE->getValue() <= 0xfff; - } - bool isCondCode() const { return Kind == k_CondCode; } - bool isSIMDImmType10() const { - if (!isImm()) - return false; - const MCConstantExpr *MCE = dyn_cast(getImm()); - if (!MCE) - return false; - return ARM64_AM::isAdvSIMDModImmType10(MCE->getValue()); - } - bool isBranchTarget26() const { - if (!isImm()) - return false; - const MCConstantExpr *MCE = dyn_cast(getImm()); - if (!MCE) - return true; - int64_t Val = MCE->getValue(); - if (Val & 0x3) - return false; - return (Val >= -(0x2000000 << 2) && Val <= (0x1ffffff << 2)); - } - bool isPCRelLabel19() const { - if (!isImm()) - return false; - const MCConstantExpr *MCE = dyn_cast(getImm()); - if (!MCE) - return true; - int64_t Val = MCE->getValue(); - if (Val & 0x3) - return false; - return (Val >= -(0x40000 << 2) && Val <= (0x3ffff << 2)); - } - bool isBranchTarget14() const { - if (!isImm()) - return false; - const MCConstantExpr *MCE = dyn_cast(getImm()); - if (!MCE) - return true; - int64_t Val = MCE->getValue(); - if (Val & 0x3) - return false; - return (Val >= -(0x2000 << 2) && Val <= (0x1fff << 2)); - } - - bool isMovWSymbol(ArrayRef AllowedModifiers) const { - if (!isImm()) - return false; - - ARM64MCExpr::VariantKind ELFRefKind; - MCSymbolRefExpr::VariantKind DarwinRefKind; - int64_t Addend; - if (!ARM64AsmParser::classifySymbolRef(getImm(), ELFRefKind, DarwinRefKind, - Addend)) { - return false; - } - if (DarwinRefKind != MCSymbolRefExpr::VK_None) - return false; - - for (unsigned i = 0; i != AllowedModifiers.size(); ++i) { - if (ELFRefKind == AllowedModifiers[i]) - return Addend == 0; - } - - return false; - } - - bool isMovZSymbolG3() const { - static ARM64MCExpr::VariantKind Variants[] = { ARM64MCExpr::VK_ABS_G3 }; - return isMovWSymbol(Variants); - } - - bool isMovZSymbolG2() const { - static ARM64MCExpr::VariantKind Variants[] = { ARM64MCExpr::VK_ABS_G2, - ARM64MCExpr::VK_ABS_G2_S, - ARM64MCExpr::VK_TPREL_G2, - ARM64MCExpr::VK_DTPREL_G2 }; - return isMovWSymbol(Variants); - } - - bool isMovZSymbolG1() const { - static ARM64MCExpr::VariantKind Variants[] = { ARM64MCExpr::VK_ABS_G1, - ARM64MCExpr::VK_ABS_G1_S, - 
ARM64MCExpr::VK_GOTTPREL_G1, - ARM64MCExpr::VK_TPREL_G1, - ARM64MCExpr::VK_DTPREL_G1, }; - return isMovWSymbol(Variants); - } - - bool isMovZSymbolG0() const { - static ARM64MCExpr::VariantKind Variants[] = { ARM64MCExpr::VK_ABS_G0, - ARM64MCExpr::VK_ABS_G0_S, - ARM64MCExpr::VK_TPREL_G0, - ARM64MCExpr::VK_DTPREL_G0 }; - return isMovWSymbol(Variants); - } - - bool isMovKSymbolG3() const { - static ARM64MCExpr::VariantKind Variants[] = { ARM64MCExpr::VK_ABS_G3 }; - return isMovWSymbol(Variants); - } - - bool isMovKSymbolG2() const { - static ARM64MCExpr::VariantKind Variants[] = { ARM64MCExpr::VK_ABS_G2_NC }; - return isMovWSymbol(Variants); - } - - bool isMovKSymbolG1() const { - static ARM64MCExpr::VariantKind Variants[] = { - ARM64MCExpr::VK_ABS_G1_NC, ARM64MCExpr::VK_TPREL_G1_NC, - ARM64MCExpr::VK_DTPREL_G1_NC - }; - return isMovWSymbol(Variants); - } - - bool isMovKSymbolG0() const { - static ARM64MCExpr::VariantKind Variants[] = { - ARM64MCExpr::VK_ABS_G0_NC, ARM64MCExpr::VK_GOTTPREL_G0_NC, - ARM64MCExpr::VK_TPREL_G0_NC, ARM64MCExpr::VK_DTPREL_G0_NC - }; - return isMovWSymbol(Variants); - } - - template - bool isMOVZMovAlias() const { - if (!isImm()) return false; - - const MCConstantExpr *CE = dyn_cast(getImm()); - if (!CE) return false; - uint64_t Value = CE->getValue(); - - if (RegWidth == 32) - Value &= 0xffffffffULL; - - // "lsl #0" takes precedence: in practice this only affects "#0, lsl #0". - if (Value == 0 && Shift != 0) - return false; - - return (Value & ~(0xffffULL << Shift)) == 0; - } - - template - bool isMOVNMovAlias() const { - if (!isImm()) return false; - - const MCConstantExpr *CE = dyn_cast(getImm()); - if (!CE) return false; - uint64_t Value = CE->getValue(); - - // MOVZ takes precedence over MOVN. - for (int MOVZShift = 0; MOVZShift <= 48; MOVZShift += 16) - if ((Value & ~(0xffffULL << MOVZShift)) == 0) - return false; - - Value = ~Value; - if (RegWidth == 32) - Value &= 0xffffffffULL; - - return (Value & ~(0xffffULL << Shift)) == 0; - } - - bool isFPImm() const { return Kind == k_FPImm; } - bool isBarrier() const { return Kind == k_Barrier; } - bool isSysReg() const { return Kind == k_SysReg; } - bool isMRSSystemRegister() const { - if (!isSysReg()) return false; - - bool IsKnownRegister; - auto Mapper = ARM64SysReg::MRSMapper(getSysRegFeatureBits()); - Mapper.fromString(getSysReg(), IsKnownRegister); - - return IsKnownRegister; - } - bool isMSRSystemRegister() const { - if (!isSysReg()) return false; - - bool IsKnownRegister; - auto Mapper = ARM64SysReg::MSRMapper(getSysRegFeatureBits()); - Mapper.fromString(getSysReg(), IsKnownRegister); - - return IsKnownRegister; - } - bool isSystemPStateField() const { - if (!isSysReg()) return false; - - bool IsKnownRegister; - ARM64PState::PStateMapper().fromString(getSysReg(), IsKnownRegister); - - return IsKnownRegister; - } - bool isReg() const override { return Kind == k_Register && !Reg.isVector; } - bool isVectorReg() const { return Kind == k_Register && Reg.isVector; } - bool isVectorRegLo() const { - return Kind == k_Register && Reg.isVector && - ARM64MCRegisterClasses[ARM64::FPR128_loRegClassID].contains(Reg.RegNum); - } - bool isGPR32as64() const { - return Kind == k_Register && !Reg.isVector && - ARM64MCRegisterClasses[ARM64::GPR64RegClassID].contains(Reg.RegNum); - } - - bool isGPR64sp0() const { - return Kind == k_Register && !Reg.isVector && - ARM64MCRegisterClasses[ARM64::GPR64spRegClassID].contains(Reg.RegNum); - } - - /// Is this a vector list with the type implicit (presumably attached to the - /// 
instruction itself)? - template bool isImplicitlyTypedVectorList() const { - return Kind == k_VectorList && VectorList.Count == NumRegs && - !VectorList.ElementKind; - } - - template - bool isTypedVectorList() const { - if (Kind != k_VectorList) - return false; - if (VectorList.Count != NumRegs) - return false; - if (VectorList.ElementKind != ElementKind) - return false; - return VectorList.NumElements == NumElements; - } - - bool isVectorIndex1() const { - return Kind == k_VectorIndex && VectorIndex.Val == 1; - } - bool isVectorIndexB() const { - return Kind == k_VectorIndex && VectorIndex.Val < 16; - } - bool isVectorIndexH() const { - return Kind == k_VectorIndex && VectorIndex.Val < 8; - } - bool isVectorIndexS() const { - return Kind == k_VectorIndex && VectorIndex.Val < 4; - } - bool isVectorIndexD() const { - return Kind == k_VectorIndex && VectorIndex.Val < 2; - } - bool isToken() const override { return Kind == k_Token; } - bool isTokenEqual(StringRef Str) const { - return Kind == k_Token && getToken() == Str; - } - bool isSysCR() const { return Kind == k_SysCR; } - bool isPrefetch() const { return Kind == k_Prefetch; } - bool isShiftExtend() const { return Kind == k_ShiftExtend; } - bool isShifter() const { - if (!isShiftExtend()) - return false; - - ARM64_AM::ShiftExtendType ST = getShiftExtendType(); - return (ST == ARM64_AM::LSL || ST == ARM64_AM::LSR || ST == ARM64_AM::ASR || - ST == ARM64_AM::ROR || ST == ARM64_AM::MSL); - } - bool isExtend() const { - if (!isShiftExtend()) - return false; - - ARM64_AM::ShiftExtendType ET = getShiftExtendType(); - return (ET == ARM64_AM::UXTB || ET == ARM64_AM::SXTB || - ET == ARM64_AM::UXTH || ET == ARM64_AM::SXTH || - ET == ARM64_AM::UXTW || ET == ARM64_AM::SXTW || - ET == ARM64_AM::UXTX || ET == ARM64_AM::SXTX || - ET == ARM64_AM::LSL) && - getShiftExtendAmount() <= 4; - } - - bool isExtend64() const { - if (!isExtend()) - return false; - // UXTX and SXTX require a 64-bit source register (the ExtendLSL64 class). - ARM64_AM::ShiftExtendType ET = getShiftExtendType(); - return ET != ARM64_AM::UXTX && ET != ARM64_AM::SXTX; - } - bool isExtendLSL64() const { - if (!isExtend()) - return false; - ARM64_AM::ShiftExtendType ET = getShiftExtendType(); - return (ET == ARM64_AM::UXTX || ET == ARM64_AM::SXTX || ET == ARM64_AM::LSL) && - getShiftExtendAmount() <= 4; - } - - template bool isMemXExtend() const { - if (!isExtend()) - return false; - ARM64_AM::ShiftExtendType ET = getShiftExtendType(); - return (ET == ARM64_AM::LSL || ET == ARM64_AM::SXTX) && - (getShiftExtendAmount() == Log2_32(Width / 8) || - getShiftExtendAmount() == 0); - } - - template bool isMemWExtend() const { - if (!isExtend()) - return false; - ARM64_AM::ShiftExtendType ET = getShiftExtendType(); - return (ET == ARM64_AM::UXTW || ET == ARM64_AM::SXTW) && - (getShiftExtendAmount() == Log2_32(Width / 8) || - getShiftExtendAmount() == 0); - } - - template - bool isArithmeticShifter() const { - if (!isShifter()) - return false; - - // An arithmetic shifter is LSL, LSR, or ASR. - ARM64_AM::ShiftExtendType ST = getShiftExtendType(); - return (ST == ARM64_AM::LSL || ST == ARM64_AM::LSR || - ST == ARM64_AM::ASR) && getShiftExtendAmount() < width; - } - - template - bool isLogicalShifter() const { - if (!isShifter()) - return false; - - // A logical shifter is LSL, LSR, ASR or ROR. 
- ARM64_AM::ShiftExtendType ST = getShiftExtendType(); - return (ST == ARM64_AM::LSL || ST == ARM64_AM::LSR || ST == ARM64_AM::ASR || - ST == ARM64_AM::ROR) && - getShiftExtendAmount() < width; - } - - bool isMovImm32Shifter() const { - if (!isShifter()) - return false; - - // A MOVi shifter is LSL of 0, 16, 32, or 48. - ARM64_AM::ShiftExtendType ST = getShiftExtendType(); - if (ST != ARM64_AM::LSL) - return false; - uint64_t Val = getShiftExtendAmount(); - return (Val == 0 || Val == 16); - } - - bool isMovImm64Shifter() const { - if (!isShifter()) - return false; - - // A MOVi shifter is LSL of 0 or 16. - ARM64_AM::ShiftExtendType ST = getShiftExtendType(); - if (ST != ARM64_AM::LSL) - return false; - uint64_t Val = getShiftExtendAmount(); - return (Val == 0 || Val == 16 || Val == 32 || Val == 48); - } - - bool isLogicalVecShifter() const { - if (!isShifter()) - return false; - - // A logical vector shifter is a left shift by 0, 8, 16, or 24. - unsigned Shift = getShiftExtendAmount(); - return getShiftExtendType() == ARM64_AM::LSL && - (Shift == 0 || Shift == 8 || Shift == 16 || Shift == 24); - } - - bool isLogicalVecHalfWordShifter() const { - if (!isLogicalVecShifter()) - return false; - - // A logical vector shifter is a left shift by 0 or 8. - unsigned Shift = getShiftExtendAmount(); - return getShiftExtendType() == ARM64_AM::LSL && (Shift == 0 || Shift == 8); - } - - bool isMoveVecShifter() const { - if (!isShiftExtend()) - return false; - - // A logical vector shifter is a left shift by 8 or 16. - unsigned Shift = getShiftExtendAmount(); - return getShiftExtendType() == ARM64_AM::MSL && (Shift == 8 || Shift == 16); - } - - // Fallback unscaled operands are for aliases of LDR/STR that fall back - // to LDUR/STUR when the offset is not legal for the former but is for - // the latter. As such, in addition to checking for being a legal unscaled - // address, also check that it is not a legal scaled address. This avoids - // ambiguity in the matcher. - template - bool isSImm9OffsetFB() const { - return isSImm9() && !isUImm12Offset(); - } - - bool isAdrpLabel() const { - // Validation was handled during parsing, so we just sanity check that - // something didn't go haywire. - if (!isImm()) - return false; - - if (const MCConstantExpr *CE = dyn_cast(Imm.Val)) { - int64_t Val = CE->getValue(); - int64_t Min = - (4096 * (1LL << (21 - 1))); - int64_t Max = 4096 * ((1LL << (21 - 1)) - 1); - return (Val % 4096) == 0 && Val >= Min && Val <= Max; - } - - return true; - } - - bool isAdrLabel() const { - // Validation was handled during parsing, so we just sanity check that - // something didn't go haywire. - if (!isImm()) - return false; - - if (const MCConstantExpr *CE = dyn_cast(Imm.Val)) { - int64_t Val = CE->getValue(); - int64_t Min = - (1LL << (21 - 1)); - int64_t Max = ((1LL << (21 - 1)) - 1); - return Val >= Min && Val <= Max; - } - - return true; - } - - void addExpr(MCInst &Inst, const MCExpr *Expr) const { - // Add as immediates when possible. Null MCExpr = 0. 
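The isAdrpLabel and isAdrLabel checks above encode the reachable ranges of the two instructions: ADRP carries a signed 21-bit count of 4 KB pages, ADR a signed 21-bit byte offset. A standalone sketch of the same arithmetic, with invented helper names that are not part of the patch:

#include <cstdint>
#include <cstdio>

// ADRP: page-aligned value within roughly +/- 4 GiB of the instruction.
static bool fitsAdrp(int64_t Val) {
  const int64_t Min = -(4096 * (1LL << 20));     // -4 GiB
  const int64_t Max = 4096 * ((1LL << 20) - 1);  // +4 GiB minus one page
  return (Val % 4096) == 0 && Val >= Min && Val <= Max;
}

// ADR: plain byte offset within +/- 1 MiB.
static bool fitsAdr(int64_t Val) {
  return Val >= -(1LL << 20) && Val <= (1LL << 20) - 1;
}

int main() {
  std::printf("%d %d\n", fitsAdrp(4096), fitsAdrp(4095));            // 1 0 (must be page-aligned)
  std::printf("%d %d\n", fitsAdr(1 << 20), fitsAdr((1 << 20) - 1));  // 0 1 (one past the limit fails)
}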
- if (!Expr) - Inst.addOperand(MCOperand::CreateImm(0)); - else if (const MCConstantExpr *CE = dyn_cast(Expr)) - Inst.addOperand(MCOperand::CreateImm(CE->getValue())); - else - Inst.addOperand(MCOperand::CreateExpr(Expr)); - } - - void addRegOperands(MCInst &Inst, unsigned N) const { - assert(N == 1 && "Invalid number of operands!"); - Inst.addOperand(MCOperand::CreateReg(getReg())); - } - - void addGPR32as64Operands(MCInst &Inst, unsigned N) const { - assert(N == 1 && "Invalid number of operands!"); - assert(ARM64MCRegisterClasses[ARM64::GPR64RegClassID].contains(getReg())); - - const MCRegisterInfo *RI = Ctx.getRegisterInfo(); - uint32_t Reg = RI->getRegClass(ARM64::GPR32RegClassID).getRegister( - RI->getEncodingValue(getReg())); - - Inst.addOperand(MCOperand::CreateReg(Reg)); - } - - void addVectorReg64Operands(MCInst &Inst, unsigned N) const { - assert(N == 1 && "Invalid number of operands!"); - assert(ARM64MCRegisterClasses[ARM64::FPR128RegClassID].contains(getReg())); - Inst.addOperand(MCOperand::CreateReg(ARM64::D0 + getReg() - ARM64::Q0)); - } - - void addVectorReg128Operands(MCInst &Inst, unsigned N) const { - assert(N == 1 && "Invalid number of operands!"); - assert(ARM64MCRegisterClasses[ARM64::FPR128RegClassID].contains(getReg())); - Inst.addOperand(MCOperand::CreateReg(getReg())); - } - - void addVectorRegLoOperands(MCInst &Inst, unsigned N) const { - assert(N == 1 && "Invalid number of operands!"); - Inst.addOperand(MCOperand::CreateReg(getReg())); - } - - template - void addVectorList64Operands(MCInst &Inst, unsigned N) const { - assert(N == 1 && "Invalid number of operands!"); - static unsigned FirstRegs[] = { ARM64::D0, ARM64::D0_D1, - ARM64::D0_D1_D2, ARM64::D0_D1_D2_D3 }; - unsigned FirstReg = FirstRegs[NumRegs - 1]; - - Inst.addOperand( - MCOperand::CreateReg(FirstReg + getVectorListStart() - ARM64::Q0)); - } - - template - void addVectorList128Operands(MCInst &Inst, unsigned N) const { - assert(N == 1 && "Invalid number of operands!"); - static unsigned FirstRegs[] = { ARM64::Q0, ARM64::Q0_Q1, - ARM64::Q0_Q1_Q2, ARM64::Q0_Q1_Q2_Q3 }; - unsigned FirstReg = FirstRegs[NumRegs - 1]; - - Inst.addOperand( - MCOperand::CreateReg(FirstReg + getVectorListStart() - ARM64::Q0)); - } - - void addVectorIndex1Operands(MCInst &Inst, unsigned N) const { - assert(N == 1 && "Invalid number of operands!"); - Inst.addOperand(MCOperand::CreateImm(getVectorIndex())); - } - - void addVectorIndexBOperands(MCInst &Inst, unsigned N) const { - assert(N == 1 && "Invalid number of operands!"); - Inst.addOperand(MCOperand::CreateImm(getVectorIndex())); - } - - void addVectorIndexHOperands(MCInst &Inst, unsigned N) const { - assert(N == 1 && "Invalid number of operands!"); - Inst.addOperand(MCOperand::CreateImm(getVectorIndex())); - } - - void addVectorIndexSOperands(MCInst &Inst, unsigned N) const { - assert(N == 1 && "Invalid number of operands!"); - Inst.addOperand(MCOperand::CreateImm(getVectorIndex())); - } - - void addVectorIndexDOperands(MCInst &Inst, unsigned N) const { - assert(N == 1 && "Invalid number of operands!"); - Inst.addOperand(MCOperand::CreateImm(getVectorIndex())); - } - - void addImmOperands(MCInst &Inst, unsigned N) const { - assert(N == 1 && "Invalid number of operands!"); - // If this is a pageoff symrefexpr with an addend, adjust the addend - // to be only the page-offset portion. Otherwise, just add the expr - // as-is. 
- addExpr(Inst, getImm()); - } - - void addAddSubImmOperands(MCInst &Inst, unsigned N) const { - assert(N == 2 && "Invalid number of operands!"); - if (isShiftedImm()) { - addExpr(Inst, getShiftedImmVal()); - Inst.addOperand(MCOperand::CreateImm(getShiftedImmShift())); - } else { - addExpr(Inst, getImm()); - Inst.addOperand(MCOperand::CreateImm(0)); - } - } - - void addCondCodeOperands(MCInst &Inst, unsigned N) const { - assert(N == 1 && "Invalid number of operands!"); - Inst.addOperand(MCOperand::CreateImm(getCondCode())); - } - - void addAdrpLabelOperands(MCInst &Inst, unsigned N) const { - assert(N == 1 && "Invalid number of operands!"); - const MCConstantExpr *MCE = dyn_cast(getImm()); - if (!MCE) - addExpr(Inst, getImm()); - else - Inst.addOperand(MCOperand::CreateImm(MCE->getValue() >> 12)); - } - - void addAdrLabelOperands(MCInst &Inst, unsigned N) const { - addImmOperands(Inst, N); - } - - template - void addUImm12OffsetOperands(MCInst &Inst, unsigned N) const { - assert(N == 1 && "Invalid number of operands!"); - const MCConstantExpr *MCE = dyn_cast(getImm()); - - if (!MCE) { - Inst.addOperand(MCOperand::CreateExpr(getImm())); - return; - } - Inst.addOperand(MCOperand::CreateImm(MCE->getValue() / Scale)); - } - - void addSImm9Operands(MCInst &Inst, unsigned N) const { - assert(N == 1 && "Invalid number of operands!"); - const MCConstantExpr *MCE = dyn_cast(getImm()); - assert(MCE && "Invalid constant immediate operand!"); - Inst.addOperand(MCOperand::CreateImm(MCE->getValue())); - } - - void addSImm7s4Operands(MCInst &Inst, unsigned N) const { - assert(N == 1 && "Invalid number of operands!"); - const MCConstantExpr *MCE = dyn_cast(getImm()); - assert(MCE && "Invalid constant immediate operand!"); - Inst.addOperand(MCOperand::CreateImm(MCE->getValue() / 4)); - } - - void addSImm7s8Operands(MCInst &Inst, unsigned N) const { - assert(N == 1 && "Invalid number of operands!"); - const MCConstantExpr *MCE = dyn_cast(getImm()); - assert(MCE && "Invalid constant immediate operand!"); - Inst.addOperand(MCOperand::CreateImm(MCE->getValue() / 8)); - } - - void addSImm7s16Operands(MCInst &Inst, unsigned N) const { - assert(N == 1 && "Invalid number of operands!"); - const MCConstantExpr *MCE = dyn_cast(getImm()); - assert(MCE && "Invalid constant immediate operand!"); - Inst.addOperand(MCOperand::CreateImm(MCE->getValue() / 16)); - } - - void addImm0_7Operands(MCInst &Inst, unsigned N) const { - assert(N == 1 && "Invalid number of operands!"); - const MCConstantExpr *MCE = dyn_cast(getImm()); - assert(MCE && "Invalid constant immediate operand!"); - Inst.addOperand(MCOperand::CreateImm(MCE->getValue())); - } - - void addImm1_8Operands(MCInst &Inst, unsigned N) const { - assert(N == 1 && "Invalid number of operands!"); - const MCConstantExpr *MCE = dyn_cast(getImm()); - assert(MCE && "Invalid constant immediate operand!"); - Inst.addOperand(MCOperand::CreateImm(MCE->getValue())); - } - - void addImm0_15Operands(MCInst &Inst, unsigned N) const { - assert(N == 1 && "Invalid number of operands!"); - const MCConstantExpr *MCE = dyn_cast(getImm()); - assert(MCE && "Invalid constant immediate operand!"); - Inst.addOperand(MCOperand::CreateImm(MCE->getValue())); - } - - void addImm1_16Operands(MCInst &Inst, unsigned N) const { - assert(N == 1 && "Invalid number of operands!"); - const MCConstantExpr *MCE = dyn_cast(getImm()); - assert(MCE && "Invalid constant immediate operand!"); - Inst.addOperand(MCOperand::CreateImm(MCE->getValue())); - } - - void addImm0_31Operands(MCInst &Inst, unsigned N) 
const { - assert(N == 1 && "Invalid number of operands!"); - const MCConstantExpr *MCE = dyn_cast(getImm()); - assert(MCE && "Invalid constant immediate operand!"); - Inst.addOperand(MCOperand::CreateImm(MCE->getValue())); - } - - void addImm1_31Operands(MCInst &Inst, unsigned N) const { - assert(N == 1 && "Invalid number of operands!"); - const MCConstantExpr *MCE = dyn_cast(getImm()); - assert(MCE && "Invalid constant immediate operand!"); - Inst.addOperand(MCOperand::CreateImm(MCE->getValue())); - } - - void addImm1_32Operands(MCInst &Inst, unsigned N) const { - assert(N == 1 && "Invalid number of operands!"); - const MCConstantExpr *MCE = dyn_cast(getImm()); - assert(MCE && "Invalid constant immediate operand!"); - Inst.addOperand(MCOperand::CreateImm(MCE->getValue())); - } - - void addImm0_63Operands(MCInst &Inst, unsigned N) const { - assert(N == 1 && "Invalid number of operands!"); - const MCConstantExpr *MCE = dyn_cast(getImm()); - assert(MCE && "Invalid constant immediate operand!"); - Inst.addOperand(MCOperand::CreateImm(MCE->getValue())); - } - - void addImm1_63Operands(MCInst &Inst, unsigned N) const { - assert(N == 1 && "Invalid number of operands!"); - const MCConstantExpr *MCE = dyn_cast(getImm()); - assert(MCE && "Invalid constant immediate operand!"); - Inst.addOperand(MCOperand::CreateImm(MCE->getValue())); - } - - void addImm1_64Operands(MCInst &Inst, unsigned N) const { - assert(N == 1 && "Invalid number of operands!"); - const MCConstantExpr *MCE = dyn_cast(getImm()); - assert(MCE && "Invalid constant immediate operand!"); - Inst.addOperand(MCOperand::CreateImm(MCE->getValue())); - } - - void addImm0_127Operands(MCInst &Inst, unsigned N) const { - assert(N == 1 && "Invalid number of operands!"); - const MCConstantExpr *MCE = dyn_cast(getImm()); - assert(MCE && "Invalid constant immediate operand!"); - Inst.addOperand(MCOperand::CreateImm(MCE->getValue())); - } - - void addImm0_255Operands(MCInst &Inst, unsigned N) const { - assert(N == 1 && "Invalid number of operands!"); - const MCConstantExpr *MCE = dyn_cast(getImm()); - assert(MCE && "Invalid constant immediate operand!"); - Inst.addOperand(MCOperand::CreateImm(MCE->getValue())); - } - - void addImm0_65535Operands(MCInst &Inst, unsigned N) const { - assert(N == 1 && "Invalid number of operands!"); - const MCConstantExpr *MCE = dyn_cast(getImm()); - assert(MCE && "Invalid constant immediate operand!"); - Inst.addOperand(MCOperand::CreateImm(MCE->getValue())); - } - - void addImm32_63Operands(MCInst &Inst, unsigned N) const { - assert(N == 1 && "Invalid number of operands!"); - const MCConstantExpr *MCE = dyn_cast(getImm()); - assert(MCE && "Invalid constant immediate operand!"); - Inst.addOperand(MCOperand::CreateImm(MCE->getValue())); - } - - void addLogicalImm32Operands(MCInst &Inst, unsigned N) const { - assert(N == 1 && "Invalid number of operands!"); - const MCConstantExpr *MCE = dyn_cast(getImm()); - assert(MCE && "Invalid logical immediate operand!"); - uint64_t encoding = ARM64_AM::encodeLogicalImmediate(MCE->getValue(), 32); - Inst.addOperand(MCOperand::CreateImm(encoding)); - } - - void addLogicalImm64Operands(MCInst &Inst, unsigned N) const { - assert(N == 1 && "Invalid number of operands!"); - const MCConstantExpr *MCE = dyn_cast(getImm()); - assert(MCE && "Invalid logical immediate operand!"); - uint64_t encoding = ARM64_AM::encodeLogicalImmediate(MCE->getValue(), 64); - Inst.addOperand(MCOperand::CreateImm(encoding)); - } - - void addSIMDImmType10Operands(MCInst &Inst, unsigned N) const { - assert(N == 
1 && "Invalid number of operands!"); - const MCConstantExpr *MCE = dyn_cast(getImm()); - assert(MCE && "Invalid immediate operand!"); - uint64_t encoding = ARM64_AM::encodeAdvSIMDModImmType10(MCE->getValue()); - Inst.addOperand(MCOperand::CreateImm(encoding)); - } - - void addBranchTarget26Operands(MCInst &Inst, unsigned N) const { - // Branch operands don't encode the low bits, so shift them off - // here. If it's a label, however, just put it on directly as there's - // not enough information now to do anything. - assert(N == 1 && "Invalid number of operands!"); - const MCConstantExpr *MCE = dyn_cast(getImm()); - if (!MCE) { - addExpr(Inst, getImm()); - return; - } - assert(MCE && "Invalid constant immediate operand!"); - Inst.addOperand(MCOperand::CreateImm(MCE->getValue() >> 2)); - } - - void addPCRelLabel19Operands(MCInst &Inst, unsigned N) const { - // Branch operands don't encode the low bits, so shift them off - // here. If it's a label, however, just put it on directly as there's - // not enough information now to do anything. - assert(N == 1 && "Invalid number of operands!"); - const MCConstantExpr *MCE = dyn_cast(getImm()); - if (!MCE) { - addExpr(Inst, getImm()); - return; - } - assert(MCE && "Invalid constant immediate operand!"); - Inst.addOperand(MCOperand::CreateImm(MCE->getValue() >> 2)); - } - - void addBranchTarget14Operands(MCInst &Inst, unsigned N) const { - // Branch operands don't encode the low bits, so shift them off - // here. If it's a label, however, just put it on directly as there's - // not enough information now to do anything. - assert(N == 1 && "Invalid number of operands!"); - const MCConstantExpr *MCE = dyn_cast(getImm()); - if (!MCE) { - addExpr(Inst, getImm()); - return; - } - assert(MCE && "Invalid constant immediate operand!"); - Inst.addOperand(MCOperand::CreateImm(MCE->getValue() >> 2)); - } - - void addFPImmOperands(MCInst &Inst, unsigned N) const { - assert(N == 1 && "Invalid number of operands!"); - Inst.addOperand(MCOperand::CreateImm(getFPImm())); - } - - void addBarrierOperands(MCInst &Inst, unsigned N) const { - assert(N == 1 && "Invalid number of operands!"); - Inst.addOperand(MCOperand::CreateImm(getBarrier())); - } - - void addMRSSystemRegisterOperands(MCInst &Inst, unsigned N) const { - assert(N == 1 && "Invalid number of operands!"); - - bool Valid; - auto Mapper = ARM64SysReg::MRSMapper(getSysRegFeatureBits()); - uint32_t Bits = Mapper.fromString(getSysReg(), Valid); - - Inst.addOperand(MCOperand::CreateImm(Bits)); - } - - void addMSRSystemRegisterOperands(MCInst &Inst, unsigned N) const { - assert(N == 1 && "Invalid number of operands!"); - - bool Valid; - auto Mapper = ARM64SysReg::MSRMapper(getSysRegFeatureBits()); - uint32_t Bits = Mapper.fromString(getSysReg(), Valid); - - Inst.addOperand(MCOperand::CreateImm(Bits)); - } - - void addSystemPStateFieldOperands(MCInst &Inst, unsigned N) const { - assert(N == 1 && "Invalid number of operands!"); - - bool Valid; - uint32_t Bits = ARM64PState::PStateMapper().fromString(getSysReg(), Valid); - - Inst.addOperand(MCOperand::CreateImm(Bits)); - } - - void addSysCROperands(MCInst &Inst, unsigned N) const { - assert(N == 1 && "Invalid number of operands!"); - Inst.addOperand(MCOperand::CreateImm(getSysCR())); - } - - void addPrefetchOperands(MCInst &Inst, unsigned N) const { - assert(N == 1 && "Invalid number of operands!"); - Inst.addOperand(MCOperand::CreateImm(getPrefetch())); - } - - void addShifterOperands(MCInst &Inst, unsigned N) const { - assert(N == 1 && "Invalid number of 
operands!"); - unsigned Imm = - ARM64_AM::getShifterImm(getShiftExtendType(), getShiftExtendAmount()); - Inst.addOperand(MCOperand::CreateImm(Imm)); - } - - void addExtendOperands(MCInst &Inst, unsigned N) const { - assert(N == 1 && "Invalid number of operands!"); - ARM64_AM::ShiftExtendType ET = getShiftExtendType(); - if (ET == ARM64_AM::LSL) ET = ARM64_AM::UXTW; - unsigned Imm = ARM64_AM::getArithExtendImm(ET, getShiftExtendAmount()); - Inst.addOperand(MCOperand::CreateImm(Imm)); - } - - void addExtend64Operands(MCInst &Inst, unsigned N) const { - assert(N == 1 && "Invalid number of operands!"); - ARM64_AM::ShiftExtendType ET = getShiftExtendType(); - if (ET == ARM64_AM::LSL) ET = ARM64_AM::UXTX; - unsigned Imm = ARM64_AM::getArithExtendImm(ET, getShiftExtendAmount()); - Inst.addOperand(MCOperand::CreateImm(Imm)); - } - - void addMemExtendOperands(MCInst &Inst, unsigned N) const { - assert(N == 2 && "Invalid number of operands!"); - ARM64_AM::ShiftExtendType ET = getShiftExtendType(); - bool IsSigned = ET == ARM64_AM::SXTW || ET == ARM64_AM::SXTX; - Inst.addOperand(MCOperand::CreateImm(IsSigned)); - Inst.addOperand(MCOperand::CreateImm(getShiftExtendAmount() != 0)); - } - - // For 8-bit load/store instructions with a register offset, both the - // "DoShift" and "NoShift" variants have a shift of 0. Because of this, - // they're disambiguated by whether the shift was explicit or implicit rather - // than its size. - void addMemExtend8Operands(MCInst &Inst, unsigned N) const { - assert(N == 2 && "Invalid number of operands!"); - ARM64_AM::ShiftExtendType ET = getShiftExtendType(); - bool IsSigned = ET == ARM64_AM::SXTW || ET == ARM64_AM::SXTX; - Inst.addOperand(MCOperand::CreateImm(IsSigned)); - Inst.addOperand(MCOperand::CreateImm(hasShiftExtendAmount())); - } - - template - void addMOVZMovAliasOperands(MCInst &Inst, unsigned N) const { - assert(N == 1 && "Invalid number of operands!"); - - const MCConstantExpr *CE = cast(getImm()); - uint64_t Value = CE->getValue(); - Inst.addOperand(MCOperand::CreateImm((Value >> Shift) & 0xffff)); - } - - template - void addMOVNMovAliasOperands(MCInst &Inst, unsigned N) const { - assert(N == 1 && "Invalid number of operands!"); - - const MCConstantExpr *CE = cast(getImm()); - uint64_t Value = CE->getValue(); - Inst.addOperand(MCOperand::CreateImm((~Value >> Shift) & 0xffff)); - } - - void print(raw_ostream &OS) const override; - - static ARM64Operand *CreateToken(StringRef Str, bool IsSuffix, SMLoc S, - MCContext &Ctx) { - ARM64Operand *Op = new ARM64Operand(k_Token, Ctx); - Op->Tok.Data = Str.data(); - Op->Tok.Length = Str.size(); - Op->Tok.IsSuffix = IsSuffix; - Op->StartLoc = S; - Op->EndLoc = S; - return Op; - } - - static ARM64Operand *CreateReg(unsigned RegNum, bool isVector, SMLoc S, - SMLoc E, MCContext &Ctx) { - ARM64Operand *Op = new ARM64Operand(k_Register, Ctx); - Op->Reg.RegNum = RegNum; - Op->Reg.isVector = isVector; - Op->StartLoc = S; - Op->EndLoc = E; - return Op; - } - - static ARM64Operand *CreateVectorList(unsigned RegNum, unsigned Count, - unsigned NumElements, char ElementKind, - SMLoc S, SMLoc E, MCContext &Ctx) { - ARM64Operand *Op = new ARM64Operand(k_VectorList, Ctx); - Op->VectorList.RegNum = RegNum; - Op->VectorList.Count = Count; - Op->VectorList.NumElements = NumElements; - Op->VectorList.ElementKind = ElementKind; - Op->StartLoc = S; - Op->EndLoc = E; - return Op; - } - - static ARM64Operand *CreateVectorIndex(unsigned Idx, SMLoc S, SMLoc E, - MCContext &Ctx) { - ARM64Operand *Op = new ARM64Operand(k_VectorIndex, Ctx); - 
Op->VectorIndex.Val = Idx; - Op->StartLoc = S; - Op->EndLoc = E; - return Op; - } - - static ARM64Operand *CreateImm(const MCExpr *Val, SMLoc S, SMLoc E, - MCContext &Ctx) { - ARM64Operand *Op = new ARM64Operand(k_Immediate, Ctx); - Op->Imm.Val = Val; - Op->StartLoc = S; - Op->EndLoc = E; - return Op; - } - - static ARM64Operand *CreateShiftedImm(const MCExpr *Val, unsigned ShiftAmount, - SMLoc S, SMLoc E, MCContext &Ctx) { - ARM64Operand *Op = new ARM64Operand(k_ShiftedImm, Ctx); - Op->ShiftedImm .Val = Val; - Op->ShiftedImm.ShiftAmount = ShiftAmount; - Op->StartLoc = S; - Op->EndLoc = E; - return Op; - } - - static ARM64Operand *CreateCondCode(ARM64CC::CondCode Code, SMLoc S, SMLoc E, - MCContext &Ctx) { - ARM64Operand *Op = new ARM64Operand(k_CondCode, Ctx); - Op->CondCode.Code = Code; - Op->StartLoc = S; - Op->EndLoc = E; - return Op; - } - - static ARM64Operand *CreateFPImm(unsigned Val, SMLoc S, MCContext &Ctx) { - ARM64Operand *Op = new ARM64Operand(k_FPImm, Ctx); - Op->FPImm.Val = Val; - Op->StartLoc = S; - Op->EndLoc = S; - return Op; - } - - static ARM64Operand *CreateBarrier(unsigned Val, SMLoc S, MCContext &Ctx) { - ARM64Operand *Op = new ARM64Operand(k_Barrier, Ctx); - Op->Barrier.Val = Val; - Op->StartLoc = S; - Op->EndLoc = S; - return Op; - } - - static ARM64Operand *CreateSysReg(StringRef Str, SMLoc S, - uint64_t FeatureBits, MCContext &Ctx) { - ARM64Operand *Op = new ARM64Operand(k_SysReg, Ctx); - Op->SysReg.Data = Str.data(); - Op->SysReg.Length = Str.size(); - Op->SysReg.FeatureBits = FeatureBits; - Op->StartLoc = S; - Op->EndLoc = S; - return Op; - } - - static ARM64Operand *CreateSysCR(unsigned Val, SMLoc S, SMLoc E, - MCContext &Ctx) { - ARM64Operand *Op = new ARM64Operand(k_SysCR, Ctx); - Op->SysCRImm.Val = Val; - Op->StartLoc = S; - Op->EndLoc = E; - return Op; - } - - static ARM64Operand *CreatePrefetch(unsigned Val, SMLoc S, MCContext &Ctx) { - ARM64Operand *Op = new ARM64Operand(k_Prefetch, Ctx); - Op->Prefetch.Val = Val; - Op->StartLoc = S; - Op->EndLoc = S; - return Op; - } - - static ARM64Operand *CreateShiftExtend(ARM64_AM::ShiftExtendType ShOp, - unsigned Val, bool HasExplicitAmount, - SMLoc S, SMLoc E, MCContext &Ctx) { - ARM64Operand *Op = new ARM64Operand(k_ShiftExtend, Ctx); - Op->ShiftExtend.Type = ShOp; - Op->ShiftExtend.Amount = Val; - Op->ShiftExtend.HasExplicitAmount = HasExplicitAmount; - Op->StartLoc = S; - Op->EndLoc = E; - return Op; - } -}; - -} // end anonymous namespace. 
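The isMOVZMovAlias/isMOVNMovAlias predicates and their add*Operands counterparts above decide when a "mov reg, #imm" can be handled as a single MOVZ or MOVN carrying one 16-bit chunk at a given shift. A standalone sketch of the MOVZ-side mask test; the helper name is invented for illustration and is not part of the patch.

#include <cstdint>
#include <cstdio>

// The alias is legal when every set bit of the value lies inside the 16-bit
// field at position Shift; zero is reserved for the "lsl #0" form.
static bool movzAliasOK(uint64_t Value, unsigned Shift, unsigned RegWidth) {
  if (RegWidth == 32)
    Value &= 0xffffffffULL;
  if (Value == 0 && Shift != 0)
    return false;
  return (Value & ~(0xffffULL << Shift)) == 0;
}

int main() {
  // mov x0, #0x12340000  ->  movz x0, #0x1234, lsl #16
  std::printf("%d\n", movzAliasOK(0x12340000ULL, 16, 64)); // 1
  // 0x12340001 spans two 16-bit chunks, so no single MOVZ alias exists.
  std::printf("%d\n", movzAliasOK(0x12340001ULL, 16, 64)); // 0
}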
- -void ARM64Operand::print(raw_ostream &OS) const { - switch (Kind) { - case k_FPImm: - OS << ""; - break; - case k_Barrier: { - bool Valid; - StringRef Name = ARM64DB::DBarrierMapper().toString(getBarrier(), Valid); - if (Valid) - OS << ""; - else - OS << ""; - break; - } - case k_Immediate: - getImm()->print(OS); - break; - case k_ShiftedImm: { - unsigned Shift = getShiftedImmShift(); - OS << "print(OS); - OS << ", lsl #" << ARM64_AM::getShiftValue(Shift) << ">"; - break; - } - case k_CondCode: - OS << ""; - break; - case k_Register: - OS << ""; - break; - case k_VectorList: { - OS << ""; - break; - } - case k_VectorIndex: - OS << ""; - break; - case k_SysReg: - OS << "'; - break; - case k_Token: - OS << "'" << getToken() << "'"; - break; - case k_SysCR: - OS << "c" << getSysCR(); - break; - case k_Prefetch: { - bool Valid; - StringRef Name = ARM64PRFM::PRFMMapper().toString(getPrefetch(), Valid); - if (Valid) - OS << ""; - else - OS << ""; - break; - } - case k_ShiftExtend: { - OS << "<" << ARM64_AM::getShiftExtendName(getShiftExtendType()) << " #" - << getShiftExtendAmount(); - if (!hasShiftExtendAmount()) - OS << ""; - OS << '>'; - break; - } - } -} - -/// @name Auto-generated Match Functions -/// { - -static unsigned MatchRegisterName(StringRef Name); - -/// } - -static unsigned matchVectorRegName(StringRef Name) { - return StringSwitch(Name) - .Case("v0", ARM64::Q0) - .Case("v1", ARM64::Q1) - .Case("v2", ARM64::Q2) - .Case("v3", ARM64::Q3) - .Case("v4", ARM64::Q4) - .Case("v5", ARM64::Q5) - .Case("v6", ARM64::Q6) - .Case("v7", ARM64::Q7) - .Case("v8", ARM64::Q8) - .Case("v9", ARM64::Q9) - .Case("v10", ARM64::Q10) - .Case("v11", ARM64::Q11) - .Case("v12", ARM64::Q12) - .Case("v13", ARM64::Q13) - .Case("v14", ARM64::Q14) - .Case("v15", ARM64::Q15) - .Case("v16", ARM64::Q16) - .Case("v17", ARM64::Q17) - .Case("v18", ARM64::Q18) - .Case("v19", ARM64::Q19) - .Case("v20", ARM64::Q20) - .Case("v21", ARM64::Q21) - .Case("v22", ARM64::Q22) - .Case("v23", ARM64::Q23) - .Case("v24", ARM64::Q24) - .Case("v25", ARM64::Q25) - .Case("v26", ARM64::Q26) - .Case("v27", ARM64::Q27) - .Case("v28", ARM64::Q28) - .Case("v29", ARM64::Q29) - .Case("v30", ARM64::Q30) - .Case("v31", ARM64::Q31) - .Default(0); -} - -static bool isValidVectorKind(StringRef Name) { - return StringSwitch(Name.lower()) - .Case(".8b", true) - .Case(".16b", true) - .Case(".4h", true) - .Case(".8h", true) - .Case(".2s", true) - .Case(".4s", true) - .Case(".1d", true) - .Case(".2d", true) - .Case(".1q", true) - // Accept the width neutral ones, too, for verbose syntax. If those - // aren't used in the right places, the token operand won't match so - // all will work out. - .Case(".b", true) - .Case(".h", true) - .Case(".s", true) - .Case(".d", true) - .Default(false); -} - -static void parseValidVectorKind(StringRef Name, unsigned &NumElements, - char &ElementKind) { - assert(isValidVectorKind(Name)); - - ElementKind = Name.lower()[Name.size() - 1]; - NumElements = 0; - - if (Name.size() == 2) - return; - - // Parse the lane count - Name = Name.drop_front(); - while (isdigit(Name.front())) { - NumElements = 10 * NumElements + (Name.front() - '0'); - Name = Name.drop_front(); - } -} - -bool ARM64AsmParser::ParseRegister(unsigned &RegNo, SMLoc &StartLoc, - SMLoc &EndLoc) { - StartLoc = getLoc(); - RegNo = tryParseRegister(); - EndLoc = SMLoc::getFromPointer(getLoc().getPointer() - 1); - return (RegNo == (unsigned)-1); -} - -/// tryParseRegister - Try to parse a register name. 
The token must be an -/// Identifier when called, and if it is a register name the token is eaten and -/// the register is added to the operand list. -int ARM64AsmParser::tryParseRegister() { - const AsmToken &Tok = Parser.getTok(); - assert(Tok.is(AsmToken::Identifier) && "Token is not an Identifier"); - - std::string lowerCase = Tok.getString().lower(); - unsigned RegNum = MatchRegisterName(lowerCase); - // Also handle a few aliases of registers. - if (RegNum == 0) - RegNum = StringSwitch(lowerCase) - .Case("fp", ARM64::FP) - .Case("lr", ARM64::LR) - .Case("x31", ARM64::XZR) - .Case("w31", ARM64::WZR) - .Default(0); - - if (RegNum == 0) - return -1; - - Parser.Lex(); // Eat identifier token. - return RegNum; -} - -/// tryMatchVectorRegister - Try to parse a vector register name with optional -/// kind specifier. If it is a register specifier, eat the token and return it. -int ARM64AsmParser::tryMatchVectorRegister(StringRef &Kind, bool expected) { - if (Parser.getTok().isNot(AsmToken::Identifier)) { - TokError("vector register expected"); - return -1; - } - - StringRef Name = Parser.getTok().getString(); - // If there is a kind specifier, it's separated from the register name by - // a '.'. - size_t Start = 0, Next = Name.find('.'); - StringRef Head = Name.slice(Start, Next); - unsigned RegNum = matchVectorRegName(Head); - if (RegNum) { - if (Next != StringRef::npos) { - Kind = Name.slice(Next, StringRef::npos); - if (!isValidVectorKind(Kind)) { - TokError("invalid vector kind qualifier"); - return -1; - } - } - Parser.Lex(); // Eat the register token. - return RegNum; - } - - if (expected) - TokError("vector register expected"); - return -1; -} - -/// tryParseSysCROperand - Try to parse a system instruction CR operand name. -ARM64AsmParser::OperandMatchResultTy -ARM64AsmParser::tryParseSysCROperand(OperandVector &Operands) { - SMLoc S = getLoc(); - - if (Parser.getTok().isNot(AsmToken::Identifier)) { - Error(S, "Expected cN operand where 0 <= N <= 15"); - return MatchOperand_ParseFail; - } - - StringRef Tok = Parser.getTok().getIdentifier(); - if (Tok[0] != 'c' && Tok[0] != 'C') { - Error(S, "Expected cN operand where 0 <= N <= 15"); - return MatchOperand_ParseFail; - } - - uint32_t CRNum; - bool BadNum = Tok.drop_front().getAsInteger(10, CRNum); - if (BadNum || CRNum > 15) { - Error(S, "Expected cN operand where 0 <= N <= 15"); - return MatchOperand_ParseFail; - } - - Parser.Lex(); // Eat identifier token. - Operands.push_back(ARM64Operand::CreateSysCR(CRNum, S, getLoc(), getContext())); - return MatchOperand_Success; -} - -/// tryParsePrefetch - Try to parse a prefetch operand. -ARM64AsmParser::OperandMatchResultTy -ARM64AsmParser::tryParsePrefetch(OperandVector &Operands) { - SMLoc S = getLoc(); - const AsmToken &Tok = Parser.getTok(); - // Either an identifier for named values or a 5-bit immediate. - bool Hash = Tok.is(AsmToken::Hash); - if (Hash || Tok.is(AsmToken::Integer)) { - if (Hash) - Parser.Lex(); // Eat hash token. 
- const MCExpr *ImmVal; - if (getParser().parseExpression(ImmVal)) - return MatchOperand_ParseFail; - - const MCConstantExpr *MCE = dyn_cast(ImmVal); - if (!MCE) { - TokError("immediate value expected for prefetch operand"); - return MatchOperand_ParseFail; - } - unsigned prfop = MCE->getValue(); - if (prfop > 31) { - TokError("prefetch operand out of range, [0,31] expected"); - return MatchOperand_ParseFail; - } - - Operands.push_back(ARM64Operand::CreatePrefetch(prfop, S, getContext())); - return MatchOperand_Success; - } - - if (Tok.isNot(AsmToken::Identifier)) { - TokError("pre-fetch hint expected"); - return MatchOperand_ParseFail; - } - - bool Valid; - unsigned prfop = ARM64PRFM::PRFMMapper().fromString(Tok.getString(), Valid); - if (!Valid) { - TokError("pre-fetch hint expected"); - return MatchOperand_ParseFail; - } - - Parser.Lex(); // Eat identifier token. - Operands.push_back(ARM64Operand::CreatePrefetch(prfop, S, getContext())); - return MatchOperand_Success; -} - -/// tryParseAdrpLabel - Parse and validate a source label for the ADRP -/// instruction. -ARM64AsmParser::OperandMatchResultTy -ARM64AsmParser::tryParseAdrpLabel(OperandVector &Operands) { - SMLoc S = getLoc(); - const MCExpr *Expr; - - if (Parser.getTok().is(AsmToken::Hash)) { - Parser.Lex(); // Eat hash token. - } - - if (parseSymbolicImmVal(Expr)) - return MatchOperand_ParseFail; - - ARM64MCExpr::VariantKind ELFRefKind; - MCSymbolRefExpr::VariantKind DarwinRefKind; - int64_t Addend; - if (classifySymbolRef(Expr, ELFRefKind, DarwinRefKind, Addend)) { - if (DarwinRefKind == MCSymbolRefExpr::VK_None && - ELFRefKind == ARM64MCExpr::VK_INVALID) { - // No modifier was specified at all; this is the syntax for an ELF basic - // ADRP relocation (unfortunately). - Expr = ARM64MCExpr::Create(Expr, ARM64MCExpr::VK_ABS_PAGE, getContext()); - } else if ((DarwinRefKind == MCSymbolRefExpr::VK_GOTPAGE || - DarwinRefKind == MCSymbolRefExpr::VK_TLVPPAGE) && - Addend != 0) { - Error(S, "gotpage label reference not allowed an addend"); - return MatchOperand_ParseFail; - } else if (DarwinRefKind != MCSymbolRefExpr::VK_PAGE && - DarwinRefKind != MCSymbolRefExpr::VK_GOTPAGE && - DarwinRefKind != MCSymbolRefExpr::VK_TLVPPAGE && - ELFRefKind != ARM64MCExpr::VK_GOT_PAGE && - ELFRefKind != ARM64MCExpr::VK_GOTTPREL_PAGE && - ELFRefKind != ARM64MCExpr::VK_TLSDESC_PAGE) { - // The operand must be an @page or @gotpage qualified symbolref. - Error(S, "page or gotpage label reference expected"); - return MatchOperand_ParseFail; - } - } - - // We have either a label reference possibly with addend or an immediate. The - // addend is a raw value here. The linker will adjust it to only reference the - // page. - SMLoc E = SMLoc::getFromPointer(getLoc().getPointer() - 1); - Operands.push_back(ARM64Operand::CreateImm(Expr, S, E, getContext())); - - return MatchOperand_Success; -} - -/// tryParseAdrLabel - Parse and validate a source label for the ADR -/// instruction. -ARM64AsmParser::OperandMatchResultTy -ARM64AsmParser::tryParseAdrLabel(OperandVector &Operands) { - SMLoc S = getLoc(); - const MCExpr *Expr; - - if (Parser.getTok().is(AsmToken::Hash)) { - Parser.Lex(); // Eat hash token. - } - - if (getParser().parseExpression(Expr)) - return MatchOperand_ParseFail; - - SMLoc E = SMLoc::getFromPointer(getLoc().getPointer() - 1); - Operands.push_back(ARM64Operand::CreateImm(Expr, S, E, getContext())); - - return MatchOperand_Success; -} - -/// tryParseFPImm - A floating point immediate expression operand. 
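tryParseAdrpLabel above deliberately leaves the addend unchecked because, as the operand predicates note, it is reduced modulo the page size later: ADRP only materialises the 4 KB page of its target, and the low 12 bits travel in a separate :lo12:/@PAGEOFF operand. A standalone sketch of that page arithmetic, not part of the patch:

#include <cstdint>
#include <cstdio>

// Difference between the 4 KB page of the target and the page holding the
// ADRP itself; this is what the relocation ultimately encodes.
static int64_t adrpPageDelta(uint64_t PC, uint64_t Target) {
  return (int64_t)((Target & ~0xfffULL) - (PC & ~0xfffULL));
}

int main() {
  // Symbol at 0x401234 referenced from 0x400080: ADRP supplies one page
  // (0x1000); the remaining 0x234 comes from the :lo12:/@PAGEOFF half.
  std::printf("%#llx %#llx\n",
              (unsigned long long)adrpPageDelta(0x400080, 0x401234),
              0x401234ULL & 0xfffULL);
}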
-ARM64AsmParser::OperandMatchResultTy -ARM64AsmParser::tryParseFPImm(OperandVector &Operands) { - SMLoc S = getLoc(); - - bool Hash = false; - if (Parser.getTok().is(AsmToken::Hash)) { - Parser.Lex(); // Eat '#' - Hash = true; - } - - // Handle negation, as that still comes through as a separate token. - bool isNegative = false; - if (Parser.getTok().is(AsmToken::Minus)) { - isNegative = true; - Parser.Lex(); - } - const AsmToken &Tok = Parser.getTok(); - if (Tok.is(AsmToken::Real)) { - APFloat RealVal(APFloat::IEEEdouble, Tok.getString()); - uint64_t IntVal = RealVal.bitcastToAPInt().getZExtValue(); - // If we had a '-' in front, toggle the sign bit. - IntVal ^= (uint64_t)isNegative << 63; - int Val = ARM64_AM::getFP64Imm(APInt(64, IntVal)); - Parser.Lex(); // Eat the token. - // Check for out of range values. As an exception, we let Zero through, - // as we handle that special case in post-processing before matching in - // order to use the zero register for it. - if (Val == -1 && !RealVal.isZero()) { - TokError("expected compatible register or floating-point constant"); - return MatchOperand_ParseFail; - } - Operands.push_back(ARM64Operand::CreateFPImm(Val, S, getContext())); - return MatchOperand_Success; - } - if (Tok.is(AsmToken::Integer)) { - int64_t Val; - if (!isNegative && Tok.getString().startswith("0x")) { - Val = Tok.getIntVal(); - if (Val > 255 || Val < 0) { - TokError("encoded floating point value out of range"); - return MatchOperand_ParseFail; - } - } else { - APFloat RealVal(APFloat::IEEEdouble, Tok.getString()); - uint64_t IntVal = RealVal.bitcastToAPInt().getZExtValue(); - // If we had a '-' in front, toggle the sign bit. - IntVal ^= (uint64_t)isNegative << 63; - Val = ARM64_AM::getFP64Imm(APInt(64, IntVal)); - } - Parser.Lex(); // Eat the token. - Operands.push_back(ARM64Operand::CreateFPImm(Val, S, getContext())); - return MatchOperand_Success; - } - - if (!Hash) - return MatchOperand_NoMatch; - - TokError("invalid floating point immediate"); - return MatchOperand_ParseFail; -} - -/// tryParseAddSubImm - Parse ADD/SUB shifted immediate operand -ARM64AsmParser::OperandMatchResultTy -ARM64AsmParser::tryParseAddSubImm(OperandVector &Operands) { - SMLoc S = getLoc(); - - if (Parser.getTok().is(AsmToken::Hash)) - Parser.Lex(); // Eat '#' - else if (Parser.getTok().isNot(AsmToken::Integer)) - // Operand should start from # or should be integer, emit error otherwise. - return MatchOperand_NoMatch; - - const MCExpr *Imm; - if (parseSymbolicImmVal(Imm)) - return MatchOperand_ParseFail; - else if (Parser.getTok().isNot(AsmToken::Comma)) { - uint64_t ShiftAmount = 0; - const MCConstantExpr *MCE = dyn_cast(Imm); - if (MCE) { - int64_t Val = MCE->getValue(); - if (Val > 0xfff && (Val & 0xfff) == 0) { - Imm = MCConstantExpr::Create(Val >> 12, getContext()); - ShiftAmount = 12; - } - } - SMLoc E = Parser.getTok().getLoc(); - Operands.push_back(ARM64Operand::CreateShiftedImm(Imm, ShiftAmount, S, E, - getContext())); - return MatchOperand_Success; - } - - // Eat ',' - Parser.Lex(); - - // The optional operand must be "lsl #N" where N is non-negative. 
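tryParseAddSubImm above silently converts a plain immediate that is a multiple of 0x1000 and too large for 12 bits into the shifted form, so an input like "add x0, x1, #0x456000" is accepted as "#0x456, lsl #12". A standalone sketch of that rewrite; the struct and helper names are illustrative only.

#include <cstdint>
#include <cstdio>

struct AddSubImm { uint64_t Imm; unsigned Shift; };

// Mirrors the check above: values over 0xfff with a clear low 12 bits are
// re-encoded as (value >> 12) with an implicit "lsl #12".
static AddSubImm splitAddSubImm(uint64_t Val) {
  if (Val > 0xfff && (Val & 0xfff) == 0)
    return {Val >> 12, 12};
  return {Val, 0};
}

int main() {
  AddSubImm A = splitAddSubImm(0x456000);
  std::printf("#%#llx, lsl #%u\n", (unsigned long long)A.Imm, A.Shift); // #0x456, lsl #12
}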
- if (!Parser.getTok().is(AsmToken::Identifier) || - !Parser.getTok().getIdentifier().equals_lower("lsl")) { - Error(Parser.getTok().getLoc(), "only 'lsl #+N' valid after immediate"); - return MatchOperand_ParseFail; - } - - // Eat 'lsl' - Parser.Lex(); - - if (Parser.getTok().is(AsmToken::Hash)) { - Parser.Lex(); - } - - if (Parser.getTok().isNot(AsmToken::Integer)) { - Error(Parser.getTok().getLoc(), "only 'lsl #+N' valid after immediate"); - return MatchOperand_ParseFail; - } - - int64_t ShiftAmount = Parser.getTok().getIntVal(); - - if (ShiftAmount < 0) { - Error(Parser.getTok().getLoc(), "positive shift amount required"); - return MatchOperand_ParseFail; - } - Parser.Lex(); // Eat the number - - SMLoc E = Parser.getTok().getLoc(); - Operands.push_back(ARM64Operand::CreateShiftedImm(Imm, ShiftAmount, - S, E, getContext())); - return MatchOperand_Success; -} - -/// parseCondCodeString - Parse a Condition Code string. -ARM64CC::CondCode ARM64AsmParser::parseCondCodeString(StringRef Cond) { - ARM64CC::CondCode CC = StringSwitch(Cond.lower()) - .Case("eq", ARM64CC::EQ) - .Case("ne", ARM64CC::NE) - .Case("cs", ARM64CC::HS) - .Case("hs", ARM64CC::HS) - .Case("cc", ARM64CC::LO) - .Case("lo", ARM64CC::LO) - .Case("mi", ARM64CC::MI) - .Case("pl", ARM64CC::PL) - .Case("vs", ARM64CC::VS) - .Case("vc", ARM64CC::VC) - .Case("hi", ARM64CC::HI) - .Case("ls", ARM64CC::LS) - .Case("ge", ARM64CC::GE) - .Case("lt", ARM64CC::LT) - .Case("gt", ARM64CC::GT) - .Case("le", ARM64CC::LE) - .Case("al", ARM64CC::AL) - .Case("nv", ARM64CC::NV) - .Default(ARM64CC::Invalid); - return CC; -} - -/// parseCondCode - Parse a Condition Code operand. -bool ARM64AsmParser::parseCondCode(OperandVector &Operands, - bool invertCondCode) { - SMLoc S = getLoc(); - const AsmToken &Tok = Parser.getTok(); - assert(Tok.is(AsmToken::Identifier) && "Token is not an Identifier"); - - StringRef Cond = Tok.getString(); - ARM64CC::CondCode CC = parseCondCodeString(Cond); - if (CC == ARM64CC::Invalid) - return TokError("invalid condition code"); - Parser.Lex(); // Eat identifier token. - - if (invertCondCode) - CC = ARM64CC::getInvertedCondCode(ARM64CC::CondCode(CC)); - - Operands.push_back( - ARM64Operand::CreateCondCode(CC, S, getLoc(), getContext())); - return false; -} - -/// tryParseOptionalShift - Some operands take an optional shift argument. Parse -/// them if present. -ARM64AsmParser::OperandMatchResultTy -ARM64AsmParser::tryParseOptionalShiftExtend(OperandVector &Operands) { - const AsmToken &Tok = Parser.getTok(); - std::string LowerID = Tok.getString().lower(); - ARM64_AM::ShiftExtendType ShOp = - StringSwitch(LowerID) - .Case("lsl", ARM64_AM::LSL) - .Case("lsr", ARM64_AM::LSR) - .Case("asr", ARM64_AM::ASR) - .Case("ror", ARM64_AM::ROR) - .Case("msl", ARM64_AM::MSL) - .Case("uxtb", ARM64_AM::UXTB) - .Case("uxth", ARM64_AM::UXTH) - .Case("uxtw", ARM64_AM::UXTW) - .Case("uxtx", ARM64_AM::UXTX) - .Case("sxtb", ARM64_AM::SXTB) - .Case("sxth", ARM64_AM::SXTH) - .Case("sxtw", ARM64_AM::SXTW) - .Case("sxtx", ARM64_AM::SXTX) - .Default(ARM64_AM::InvalidShiftExtend); - - if (ShOp == ARM64_AM::InvalidShiftExtend) - return MatchOperand_NoMatch; - - SMLoc S = Tok.getLoc(); - Parser.Lex(); - - bool Hash = getLexer().is(AsmToken::Hash); - if (!Hash && getLexer().isNot(AsmToken::Integer)) { - if (ShOp == ARM64_AM::LSL || ShOp == ARM64_AM::LSR || - ShOp == ARM64_AM::ASR || ShOp == ARM64_AM::ROR || - ShOp == ARM64_AM::MSL) { - // We expect a number here. 
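parseCondCode above relies on AArch64 condition codes being arranged in complementary pairs, so inverting a condition (needed for the cset/cinc-style aliases later on) is a single bit flip. A sketch under that assumption, with AL/NV excluded as in the real helper:

  // EQ (0b0000) <-> NE (0b0001), HS <-> LO, GE <-> LT, and so on.
  enum SketchCondCode { EQ = 0x0, NE = 0x1, HS = 0x2, LO = 0x3, GE = 0xa, LT = 0xb };
  static SketchCondCode invertCond(SketchCondCode CC) {
    return SketchCondCode(unsigned(CC) ^ 0x1);  // flip the low bit
  }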
- TokError("expected #imm after shift specifier"); - return MatchOperand_ParseFail; - } - - // "extend" type operatoins don't need an immediate, #0 is implicit. - SMLoc E = SMLoc::getFromPointer(getLoc().getPointer() - 1); - Operands.push_back( - ARM64Operand::CreateShiftExtend(ShOp, 0, false, S, E, getContext())); - return MatchOperand_Success; - } - - if (Hash) - Parser.Lex(); // Eat the '#'. - - // Make sure we do actually have a number - if (!Parser.getTok().is(AsmToken::Integer)) { - Error(Parser.getTok().getLoc(), - "expected integer shift amount"); - return MatchOperand_ParseFail; - } - - const MCExpr *ImmVal; - if (getParser().parseExpression(ImmVal)) - return MatchOperand_ParseFail; - - const MCConstantExpr *MCE = dyn_cast(ImmVal); - if (!MCE) { - TokError("expected #imm after shift specifier"); - return MatchOperand_ParseFail; - } - - SMLoc E = SMLoc::getFromPointer(getLoc().getPointer() - 1); - Operands.push_back(ARM64Operand::CreateShiftExtend(ShOp, MCE->getValue(), - true, S, E, getContext())); - return MatchOperand_Success; -} - -/// parseSysAlias - The IC, DC, AT, and TLBI instructions are simple aliases for -/// the SYS instruction. Parse them specially so that we create a SYS MCInst. -bool ARM64AsmParser::parseSysAlias(StringRef Name, SMLoc NameLoc, - OperandVector &Operands) { - if (Name.find('.') != StringRef::npos) - return TokError("invalid operand"); - - Mnemonic = Name; - Operands.push_back( - ARM64Operand::CreateToken("sys", false, NameLoc, getContext())); - - const AsmToken &Tok = Parser.getTok(); - StringRef Op = Tok.getString(); - SMLoc S = Tok.getLoc(); - - const MCExpr *Expr = nullptr; - -#define SYS_ALIAS(op1, Cn, Cm, op2) \ - do { \ - Expr = MCConstantExpr::Create(op1, getContext()); \ - Operands.push_back( \ - ARM64Operand::CreateImm(Expr, S, getLoc(), getContext())); \ - Operands.push_back( \ - ARM64Operand::CreateSysCR(Cn, S, getLoc(), getContext())); \ - Operands.push_back( \ - ARM64Operand::CreateSysCR(Cm, S, getLoc(), getContext())); \ - Expr = MCConstantExpr::Create(op2, getContext()); \ - Operands.push_back( \ - ARM64Operand::CreateImm(Expr, S, getLoc(), getContext())); \ - } while (0) - - if (Mnemonic == "ic") { - if (!Op.compare_lower("ialluis")) { - // SYS #0, C7, C1, #0 - SYS_ALIAS(0, 7, 1, 0); - } else if (!Op.compare_lower("iallu")) { - // SYS #0, C7, C5, #0 - SYS_ALIAS(0, 7, 5, 0); - } else if (!Op.compare_lower("ivau")) { - // SYS #3, C7, C5, #1 - SYS_ALIAS(3, 7, 5, 1); - } else { - return TokError("invalid operand for IC instruction"); - } - } else if (Mnemonic == "dc") { - if (!Op.compare_lower("zva")) { - // SYS #3, C7, C4, #1 - SYS_ALIAS(3, 7, 4, 1); - } else if (!Op.compare_lower("ivac")) { - // SYS #3, C7, C6, #1 - SYS_ALIAS(0, 7, 6, 1); - } else if (!Op.compare_lower("isw")) { - // SYS #0, C7, C6, #2 - SYS_ALIAS(0, 7, 6, 2); - } else if (!Op.compare_lower("cvac")) { - // SYS #3, C7, C10, #1 - SYS_ALIAS(3, 7, 10, 1); - } else if (!Op.compare_lower("csw")) { - // SYS #0, C7, C10, #2 - SYS_ALIAS(0, 7, 10, 2); - } else if (!Op.compare_lower("cvau")) { - // SYS #3, C7, C11, #1 - SYS_ALIAS(3, 7, 11, 1); - } else if (!Op.compare_lower("civac")) { - // SYS #3, C7, C14, #1 - SYS_ALIAS(3, 7, 14, 1); - } else if (!Op.compare_lower("cisw")) { - // SYS #0, C7, C14, #2 - SYS_ALIAS(0, 7, 14, 2); - } else { - return TokError("invalid operand for DC instruction"); - } - } else if (Mnemonic == "at") { - if (!Op.compare_lower("s1e1r")) { - // SYS #0, C7, C8, #0 - SYS_ALIAS(0, 7, 8, 0); - } else if (!Op.compare_lower("s1e2r")) { - // SYS #4, C7, C8, #0 - 
SYS_ALIAS(4, 7, 8, 0); - } else if (!Op.compare_lower("s1e3r")) { - // SYS #6, C7, C8, #0 - SYS_ALIAS(6, 7, 8, 0); - } else if (!Op.compare_lower("s1e1w")) { - // SYS #0, C7, C8, #1 - SYS_ALIAS(0, 7, 8, 1); - } else if (!Op.compare_lower("s1e2w")) { - // SYS #4, C7, C8, #1 - SYS_ALIAS(4, 7, 8, 1); - } else if (!Op.compare_lower("s1e3w")) { - // SYS #6, C7, C8, #1 - SYS_ALIAS(6, 7, 8, 1); - } else if (!Op.compare_lower("s1e0r")) { - // SYS #0, C7, C8, #3 - SYS_ALIAS(0, 7, 8, 2); - } else if (!Op.compare_lower("s1e0w")) { - // SYS #0, C7, C8, #3 - SYS_ALIAS(0, 7, 8, 3); - } else if (!Op.compare_lower("s12e1r")) { - // SYS #4, C7, C8, #4 - SYS_ALIAS(4, 7, 8, 4); - } else if (!Op.compare_lower("s12e1w")) { - // SYS #4, C7, C8, #5 - SYS_ALIAS(4, 7, 8, 5); - } else if (!Op.compare_lower("s12e0r")) { - // SYS #4, C7, C8, #6 - SYS_ALIAS(4, 7, 8, 6); - } else if (!Op.compare_lower("s12e0w")) { - // SYS #4, C7, C8, #7 - SYS_ALIAS(4, 7, 8, 7); - } else { - return TokError("invalid operand for AT instruction"); - } - } else if (Mnemonic == "tlbi") { - if (!Op.compare_lower("vmalle1is")) { - // SYS #0, C8, C3, #0 - SYS_ALIAS(0, 8, 3, 0); - } else if (!Op.compare_lower("alle2is")) { - // SYS #4, C8, C3, #0 - SYS_ALIAS(4, 8, 3, 0); - } else if (!Op.compare_lower("alle3is")) { - // SYS #6, C8, C3, #0 - SYS_ALIAS(6, 8, 3, 0); - } else if (!Op.compare_lower("vae1is")) { - // SYS #0, C8, C3, #1 - SYS_ALIAS(0, 8, 3, 1); - } else if (!Op.compare_lower("vae2is")) { - // SYS #4, C8, C3, #1 - SYS_ALIAS(4, 8, 3, 1); - } else if (!Op.compare_lower("vae3is")) { - // SYS #6, C8, C3, #1 - SYS_ALIAS(6, 8, 3, 1); - } else if (!Op.compare_lower("aside1is")) { - // SYS #0, C8, C3, #2 - SYS_ALIAS(0, 8, 3, 2); - } else if (!Op.compare_lower("vaae1is")) { - // SYS #0, C8, C3, #3 - SYS_ALIAS(0, 8, 3, 3); - } else if (!Op.compare_lower("alle1is")) { - // SYS #4, C8, C3, #4 - SYS_ALIAS(4, 8, 3, 4); - } else if (!Op.compare_lower("vale1is")) { - // SYS #0, C8, C3, #5 - SYS_ALIAS(0, 8, 3, 5); - } else if (!Op.compare_lower("vaale1is")) { - // SYS #0, C8, C3, #7 - SYS_ALIAS(0, 8, 3, 7); - } else if (!Op.compare_lower("vmalle1")) { - // SYS #0, C8, C7, #0 - SYS_ALIAS(0, 8, 7, 0); - } else if (!Op.compare_lower("alle2")) { - // SYS #4, C8, C7, #0 - SYS_ALIAS(4, 8, 7, 0); - } else if (!Op.compare_lower("vale2is")) { - // SYS #4, C8, C3, #5 - SYS_ALIAS(4, 8, 3, 5); - } else if (!Op.compare_lower("vale3is")) { - // SYS #6, C8, C3, #5 - SYS_ALIAS(6, 8, 3, 5); - } else if (!Op.compare_lower("alle3")) { - // SYS #6, C8, C7, #0 - SYS_ALIAS(6, 8, 7, 0); - } else if (!Op.compare_lower("vae1")) { - // SYS #0, C8, C7, #1 - SYS_ALIAS(0, 8, 7, 1); - } else if (!Op.compare_lower("vae2")) { - // SYS #4, C8, C7, #1 - SYS_ALIAS(4, 8, 7, 1); - } else if (!Op.compare_lower("vae3")) { - // SYS #6, C8, C7, #1 - SYS_ALIAS(6, 8, 7, 1); - } else if (!Op.compare_lower("aside1")) { - // SYS #0, C8, C7, #2 - SYS_ALIAS(0, 8, 7, 2); - } else if (!Op.compare_lower("vaae1")) { - // SYS #0, C8, C7, #3 - SYS_ALIAS(0, 8, 7, 3); - } else if (!Op.compare_lower("alle1")) { - // SYS #4, C8, C7, #4 - SYS_ALIAS(4, 8, 7, 4); - } else if (!Op.compare_lower("vale1")) { - // SYS #0, C8, C7, #5 - SYS_ALIAS(0, 8, 7, 5); - } else if (!Op.compare_lower("vale2")) { - // SYS #4, C8, C7, #5 - SYS_ALIAS(4, 8, 7, 5); - } else if (!Op.compare_lower("vale3")) { - // SYS #6, C8, C7, #5 - SYS_ALIAS(6, 8, 7, 5); - } else if (!Op.compare_lower("vaale1")) { - // SYS #0, C8, C7, #7 - SYS_ALIAS(0, 8, 7, 7); - } else if (!Op.compare_lower("ipas2e1")) { - // SYS #4, C8, C4, #1 - SYS_ALIAS(4, 8, 
4, 1); - } else if (!Op.compare_lower("ipas2le1")) { - // SYS #4, C8, C4, #5 - SYS_ALIAS(4, 8, 4, 5); - } else if (!Op.compare_lower("ipas2e1is")) { - // SYS #4, C8, C4, #1 - SYS_ALIAS(4, 8, 0, 1); - } else if (!Op.compare_lower("ipas2le1is")) { - // SYS #4, C8, C4, #5 - SYS_ALIAS(4, 8, 0, 5); - } else if (!Op.compare_lower("vmalls12e1")) { - // SYS #4, C8, C7, #6 - SYS_ALIAS(4, 8, 7, 6); - } else if (!Op.compare_lower("vmalls12e1is")) { - // SYS #4, C8, C3, #6 - SYS_ALIAS(4, 8, 3, 6); - } else { - return TokError("invalid operand for TLBI instruction"); - } - } - -#undef SYS_ALIAS - - Parser.Lex(); // Eat operand. - - bool ExpectRegister = (Op.lower().find("all") == StringRef::npos); - bool HasRegister = false; - - // Check for the optional register operand. - if (getLexer().is(AsmToken::Comma)) { - Parser.Lex(); // Eat comma. - - if (Tok.isNot(AsmToken::Identifier) || parseRegister(Operands)) - return TokError("expected register operand"); - - HasRegister = true; - } - - if (getLexer().isNot(AsmToken::EndOfStatement)) { - Parser.eatToEndOfStatement(); - return TokError("unexpected token in argument list"); - } - - if (ExpectRegister && !HasRegister) { - return TokError("specified " + Mnemonic + " op requires a register"); - } - else if (!ExpectRegister && HasRegister) { - return TokError("specified " + Mnemonic + " op does not use a register"); - } - - Parser.Lex(); // Consume the EndOfStatement - return false; -} - -ARM64AsmParser::OperandMatchResultTy -ARM64AsmParser::tryParseBarrierOperand(OperandVector &Operands) { - const AsmToken &Tok = Parser.getTok(); - - // Can be either a #imm style literal or an option name - bool Hash = Tok.is(AsmToken::Hash); - if (Hash || Tok.is(AsmToken::Integer)) { - // Immediate operand. - if (Hash) - Parser.Lex(); // Eat the '#' - const MCExpr *ImmVal; - SMLoc ExprLoc = getLoc(); - if (getParser().parseExpression(ImmVal)) - return MatchOperand_ParseFail; - const MCConstantExpr *MCE = dyn_cast(ImmVal); - if (!MCE) { - Error(ExprLoc, "immediate value expected for barrier operand"); - return MatchOperand_ParseFail; - } - if (MCE->getValue() < 0 || MCE->getValue() > 15) { - Error(ExprLoc, "barrier operand out of range"); - return MatchOperand_ParseFail; - } - Operands.push_back( - ARM64Operand::CreateBarrier(MCE->getValue(), ExprLoc, getContext())); - return MatchOperand_Success; - } - - if (Tok.isNot(AsmToken::Identifier)) { - TokError("invalid operand for instruction"); - return MatchOperand_ParseFail; - } - - bool Valid; - unsigned Opt = ARM64DB::DBarrierMapper().fromString(Tok.getString(), Valid); - if (!Valid) { - TokError("invalid barrier option name"); - return MatchOperand_ParseFail; - } - - // The only valid named option for ISB is 'sy' - if (Mnemonic == "isb" && Opt != ARM64DB::SY) { - TokError("'sy' or #imm operand expected"); - return MatchOperand_ParseFail; - } - - Operands.push_back(ARM64Operand::CreateBarrier(Opt, getLoc(), getContext())); - Parser.Lex(); // Consume the option - - return MatchOperand_Success; -} - -ARM64AsmParser::OperandMatchResultTy -ARM64AsmParser::tryParseSysReg(OperandVector &Operands) { - const AsmToken &Tok = Parser.getTok(); - - if (Tok.isNot(AsmToken::Identifier)) - return MatchOperand_NoMatch; - - Operands.push_back(ARM64Operand::CreateSysReg(Tok.getString(), getLoc(), - STI.getFeatureBits(), getContext())); - Parser.Lex(); // Eat identifier - - return MatchOperand_Success; -} - -/// tryParseVectorRegister - Parse a vector register operand. 
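The SYS_ALIAS table above spells out how IC, DC, AT and TLBI are fixed operand bundles for the generic "SYS #op1, Cn, Cm, #op2 [, Xt]" instruction; for example "ic ivau, x0" is "sys #3, c7, c5, #1, x0". A hypothetical lookup mirroring three of the IC entries (illustrative only, not the in-tree code):

  #include <cstring>
  struct SysOp { unsigned Op1, CRn, CRm, Op2; bool NeedsXt; };
  static bool lookupICAlias(const char *Name, SysOp &Out) {
    if (!std::strcmp(Name, "ialluis")) { Out = {0, 7, 1, 0, false}; return true; }
    if (!std::strcmp(Name, "iallu"))   { Out = {0, 7, 5, 0, false}; return true; }
    if (!std::strcmp(Name, "ivau"))    { Out = {3, 7, 5, 1, true};  return true; }
    return false;  // unknown IC operand
  }

As in the parser, only the operands whose name contains "all" omit the trailing register.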
-bool ARM64AsmParser::tryParseVectorRegister(OperandVector &Operands) { - if (Parser.getTok().isNot(AsmToken::Identifier)) - return true; - - SMLoc S = getLoc(); - // Check for a vector register specifier first. - StringRef Kind; - int64_t Reg = tryMatchVectorRegister(Kind, false); - if (Reg == -1) - return true; - Operands.push_back( - ARM64Operand::CreateReg(Reg, true, S, getLoc(), getContext())); - // If there was an explicit qualifier, that goes on as a literal text - // operand. - if (!Kind.empty()) - Operands.push_back(ARM64Operand::CreateToken(Kind, false, S, getContext())); - - // If there is an index specifier following the register, parse that too. - if (Parser.getTok().is(AsmToken::LBrac)) { - SMLoc SIdx = getLoc(); - Parser.Lex(); // Eat left bracket token. - - const MCExpr *ImmVal; - if (getParser().parseExpression(ImmVal)) - return false; - const MCConstantExpr *MCE = dyn_cast(ImmVal); - if (!MCE) { - TokError("immediate value expected for vector index"); - return false; - } - - SMLoc E = getLoc(); - if (Parser.getTok().isNot(AsmToken::RBrac)) { - Error(E, "']' expected"); - return false; - } - - Parser.Lex(); // Eat right bracket token. - - Operands.push_back(ARM64Operand::CreateVectorIndex(MCE->getValue(), SIdx, E, - getContext())); - } - - return false; -} - -/// parseRegister - Parse a non-vector register operand. -bool ARM64AsmParser::parseRegister(OperandVector &Operands) { - SMLoc S = getLoc(); - // Try for a vector register. - if (!tryParseVectorRegister(Operands)) - return false; - - // Try for a scalar register. - int64_t Reg = tryParseRegister(); - if (Reg == -1) - return true; - Operands.push_back( - ARM64Operand::CreateReg(Reg, false, S, getLoc(), getContext())); - - // A small number of instructions (FMOVXDhighr, for example) have "[1]" - // as a string token in the instruction itself. 
- if (getLexer().getKind() == AsmToken::LBrac) { - SMLoc LBracS = getLoc(); - Parser.Lex(); - const AsmToken &Tok = Parser.getTok(); - if (Tok.is(AsmToken::Integer)) { - SMLoc IntS = getLoc(); - int64_t Val = Tok.getIntVal(); - if (Val == 1) { - Parser.Lex(); - if (getLexer().getKind() == AsmToken::RBrac) { - SMLoc RBracS = getLoc(); - Parser.Lex(); - Operands.push_back( - ARM64Operand::CreateToken("[", false, LBracS, getContext())); - Operands.push_back( - ARM64Operand::CreateToken("1", false, IntS, getContext())); - Operands.push_back( - ARM64Operand::CreateToken("]", false, RBracS, getContext())); - return false; - } - } - } - } - - return false; -} - -bool ARM64AsmParser::parseSymbolicImmVal(const MCExpr *&ImmVal) { - bool HasELFModifier = false; - ARM64MCExpr::VariantKind RefKind; - - if (Parser.getTok().is(AsmToken::Colon)) { - Parser.Lex(); // Eat ':" - HasELFModifier = true; - - if (Parser.getTok().isNot(AsmToken::Identifier)) { - Error(Parser.getTok().getLoc(), - "expect relocation specifier in operand after ':'"); - return true; - } - - std::string LowerCase = Parser.getTok().getIdentifier().lower(); - RefKind = StringSwitch(LowerCase) - .Case("lo12", ARM64MCExpr::VK_LO12) - .Case("abs_g3", ARM64MCExpr::VK_ABS_G3) - .Case("abs_g2", ARM64MCExpr::VK_ABS_G2) - .Case("abs_g2_s", ARM64MCExpr::VK_ABS_G2_S) - .Case("abs_g2_nc", ARM64MCExpr::VK_ABS_G2_NC) - .Case("abs_g1", ARM64MCExpr::VK_ABS_G1) - .Case("abs_g1_s", ARM64MCExpr::VK_ABS_G1_S) - .Case("abs_g1_nc", ARM64MCExpr::VK_ABS_G1_NC) - .Case("abs_g0", ARM64MCExpr::VK_ABS_G0) - .Case("abs_g0_s", ARM64MCExpr::VK_ABS_G0_S) - .Case("abs_g0_nc", ARM64MCExpr::VK_ABS_G0_NC) - .Case("dtprel_g2", ARM64MCExpr::VK_DTPREL_G2) - .Case("dtprel_g1", ARM64MCExpr::VK_DTPREL_G1) - .Case("dtprel_g1_nc", ARM64MCExpr::VK_DTPREL_G1_NC) - .Case("dtprel_g0", ARM64MCExpr::VK_DTPREL_G0) - .Case("dtprel_g0_nc", ARM64MCExpr::VK_DTPREL_G0_NC) - .Case("dtprel_hi12", ARM64MCExpr::VK_DTPREL_HI12) - .Case("dtprel_lo12", ARM64MCExpr::VK_DTPREL_LO12) - .Case("dtprel_lo12_nc", ARM64MCExpr::VK_DTPREL_LO12_NC) - .Case("tprel_g2", ARM64MCExpr::VK_TPREL_G2) - .Case("tprel_g1", ARM64MCExpr::VK_TPREL_G1) - .Case("tprel_g1_nc", ARM64MCExpr::VK_TPREL_G1_NC) - .Case("tprel_g0", ARM64MCExpr::VK_TPREL_G0) - .Case("tprel_g0_nc", ARM64MCExpr::VK_TPREL_G0_NC) - .Case("tprel_hi12", ARM64MCExpr::VK_TPREL_HI12) - .Case("tprel_lo12", ARM64MCExpr::VK_TPREL_LO12) - .Case("tprel_lo12_nc", ARM64MCExpr::VK_TPREL_LO12_NC) - .Case("tlsdesc_lo12", ARM64MCExpr::VK_TLSDESC_LO12) - .Case("got", ARM64MCExpr::VK_GOT_PAGE) - .Case("got_lo12", ARM64MCExpr::VK_GOT_LO12) - .Case("gottprel", ARM64MCExpr::VK_GOTTPREL_PAGE) - .Case("gottprel_lo12", ARM64MCExpr::VK_GOTTPREL_LO12_NC) - .Case("gottprel_g1", ARM64MCExpr::VK_GOTTPREL_G1) - .Case("gottprel_g0_nc", ARM64MCExpr::VK_GOTTPREL_G0_NC) - .Case("tlsdesc", ARM64MCExpr::VK_TLSDESC_PAGE) - .Default(ARM64MCExpr::VK_INVALID); - - if (RefKind == ARM64MCExpr::VK_INVALID) { - Error(Parser.getTok().getLoc(), - "expect relocation specifier in operand after ':'"); - return true; - } - - Parser.Lex(); // Eat identifier - - if (Parser.getTok().isNot(AsmToken::Colon)) { - Error(Parser.getTok().getLoc(), "expect ':' after relocation specifier"); - return true; - } - Parser.Lex(); // Eat ':' - } - - if (getParser().parseExpression(ImmVal)) - return true; - - if (HasELFModifier) - ImmVal = ARM64MCExpr::Create(ImmVal, RefKind, getContext()); - - return false; -} - -/// parseVectorList - Parse a vector list operand for AdvSIMD instructions. 
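The :abs_g0: .. :abs_g3: modifiers recognised above select 16-bit slices of a 64-bit value, which is what a movz/movk sequence consumes. A one-line sketch of the slice selection (illustrative only):

  #include <cstdint>
  // Slices consumed by a movz/movk sequence, e.g.
  //   movz x0, #:abs_g3:sym
  //   movk x0, #:abs_g2_nc:sym
  //   movk x0, #:abs_g1_nc:sym
  //   movk x0, #:abs_g0_nc:sym
  static uint16_t absGChunk(uint64_t Value, unsigned G) {  // G in [0, 3]
    return uint16_t(Value >> (16 * G));
  }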
-bool ARM64AsmParser::parseVectorList(OperandVector &Operands) { - assert(Parser.getTok().is(AsmToken::LCurly) && "Token is not a Left Bracket"); - SMLoc S = getLoc(); - Parser.Lex(); // Eat left bracket token. - StringRef Kind; - int64_t FirstReg = tryMatchVectorRegister(Kind, true); - if (FirstReg == -1) - return true; - int64_t PrevReg = FirstReg; - unsigned Count = 1; - - if (Parser.getTok().is(AsmToken::Minus)) { - Parser.Lex(); // Eat the minus. - - SMLoc Loc = getLoc(); - StringRef NextKind; - int64_t Reg = tryMatchVectorRegister(NextKind, true); - if (Reg == -1) - return true; - // Any Kind suffices must match on all regs in the list. - if (Kind != NextKind) - return Error(Loc, "mismatched register size suffix"); - - unsigned Space = (PrevReg < Reg) ? (Reg - PrevReg) : (Reg + 32 - PrevReg); - - if (Space == 0 || Space > 3) { - return Error(Loc, "invalid number of vectors"); - } - - Count += Space; - } - else { - while (Parser.getTok().is(AsmToken::Comma)) { - Parser.Lex(); // Eat the comma token. - - SMLoc Loc = getLoc(); - StringRef NextKind; - int64_t Reg = tryMatchVectorRegister(NextKind, true); - if (Reg == -1) - return true; - // Any Kind suffices must match on all regs in the list. - if (Kind != NextKind) - return Error(Loc, "mismatched register size suffix"); - - // Registers must be incremental (with wraparound at 31) - if (getContext().getRegisterInfo()->getEncodingValue(Reg) != - (getContext().getRegisterInfo()->getEncodingValue(PrevReg) + 1) % 32) - return Error(Loc, "registers must be sequential"); - - PrevReg = Reg; - ++Count; - } - } - - if (Parser.getTok().isNot(AsmToken::RCurly)) - return Error(getLoc(), "'}' expected"); - Parser.Lex(); // Eat the '}' token. - - if (Count > 4) - return Error(S, "invalid number of vectors"); - - unsigned NumElements = 0; - char ElementKind = 0; - if (!Kind.empty()) - parseValidVectorKind(Kind, NumElements, ElementKind); - - Operands.push_back(ARM64Operand::CreateVectorList( - FirstReg, Count, NumElements, ElementKind, S, getLoc(), getContext())); - - // If there is an index specifier following the list, parse that too. - if (Parser.getTok().is(AsmToken::LBrac)) { - SMLoc SIdx = getLoc(); - Parser.Lex(); // Eat left bracket token. - - const MCExpr *ImmVal; - if (getParser().parseExpression(ImmVal)) - return false; - const MCConstantExpr *MCE = dyn_cast(ImmVal); - if (!MCE) { - TokError("immediate value expected for vector index"); - return false; - } - - SMLoc E = getLoc(); - if (Parser.getTok().isNot(AsmToken::RBrac)) { - Error(E, "']' expected"); - return false; - } - - Parser.Lex(); // Eat right bracket token. - - Operands.push_back(ARM64Operand::CreateVectorIndex(MCE->getValue(), SIdx, E, - getContext())); - } - return false; -} - -ARM64AsmParser::OperandMatchResultTy -ARM64AsmParser::tryParseGPR64sp0Operand(OperandVector &Operands) { - const AsmToken &Tok = Parser.getTok(); - if (!Tok.is(AsmToken::Identifier)) - return MatchOperand_NoMatch; - - unsigned RegNum = MatchRegisterName(Tok.getString().lower()); - - MCContext &Ctx = getContext(); - const MCRegisterInfo *RI = Ctx.getRegisterInfo(); - if (!RI->getRegClass(ARM64::GPR64spRegClassID).contains(RegNum)) - return MatchOperand_NoMatch; - - SMLoc S = getLoc(); - Parser.Lex(); // Eat register - - if (Parser.getTok().isNot(AsmToken::Comma)) { - Operands.push_back(ARM64Operand::CreateReg(RegNum, false, S, getLoc(), Ctx)); - return MatchOperand_Success; - } - Parser.Lex(); // Eat comma. 
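Vector list ranges above may wrap past V31, which is why the register count is computed modulo 32: "{v30.4s-v1.4s}" still names four consecutive registers (V30, V31, V0, V1), and anything outside two to four registers is rejected. A sketch of that count over register encodings (hypothetical helper):

  // Counts registers in an inclusive range of encodings 0..31, wrapping at 31;
  // the parser rejects any result outside [2, 4].
  static unsigned vectorRangeCount(unsigned First, unsigned Last) {
    return (First < Last ? Last - First : Last + 32 - First) + 1;
  }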
- - if (Parser.getTok().is(AsmToken::Hash)) - Parser.Lex(); // Eat hash - - if (Parser.getTok().isNot(AsmToken::Integer)) { - Error(getLoc(), "index must be absent or #0"); - return MatchOperand_ParseFail; - } - - const MCExpr *ImmVal; - if (Parser.parseExpression(ImmVal) || !isa(ImmVal) || - cast(ImmVal)->getValue() != 0) { - Error(getLoc(), "index must be absent or #0"); - return MatchOperand_ParseFail; - } - - Operands.push_back(ARM64Operand::CreateReg(RegNum, false, S, getLoc(), Ctx)); - return MatchOperand_Success; -} - -/// parseOperand - Parse a arm instruction operand. For now this parses the -/// operand regardless of the mnemonic. -bool ARM64AsmParser::parseOperand(OperandVector &Operands, bool isCondCode, - bool invertCondCode) { - // Check if the current operand has a custom associated parser, if so, try to - // custom parse the operand, or fallback to the general approach. - OperandMatchResultTy ResTy = MatchOperandParserImpl(Operands, Mnemonic); - if (ResTy == MatchOperand_Success) - return false; - // If there wasn't a custom match, try the generic matcher below. Otherwise, - // there was a match, but an error occurred, in which case, just return that - // the operand parsing failed. - if (ResTy == MatchOperand_ParseFail) - return true; - - // Nothing custom, so do general case parsing. - SMLoc S, E; - switch (getLexer().getKind()) { - default: { - SMLoc S = getLoc(); - const MCExpr *Expr; - if (parseSymbolicImmVal(Expr)) - return Error(S, "invalid operand"); - - SMLoc E = SMLoc::getFromPointer(getLoc().getPointer() - 1); - Operands.push_back(ARM64Operand::CreateImm(Expr, S, E, getContext())); - return false; - } - case AsmToken::LBrac: { - SMLoc Loc = Parser.getTok().getLoc(); - Operands.push_back(ARM64Operand::CreateToken("[", false, Loc, - getContext())); - Parser.Lex(); // Eat '[' - - // There's no comma after a '[', so we can parse the next operand - // immediately. - return parseOperand(Operands, false, false); - } - case AsmToken::LCurly: - return parseVectorList(Operands); - case AsmToken::Identifier: { - // If we're expecting a Condition Code operand, then just parse that. - if (isCondCode) - return parseCondCode(Operands, invertCondCode); - - // If it's a register name, parse it. - if (!parseRegister(Operands)) - return false; - - // This could be an optional "shift" or "extend" operand. - OperandMatchResultTy GotShift = tryParseOptionalShiftExtend(Operands); - // We can only continue if no tokens were eaten. - if (GotShift != MatchOperand_NoMatch) - return GotShift; - - // This was not a register so parse other operands that start with an - // identifier (like labels) as expressions and create them as immediates. - const MCExpr *IdVal; - S = getLoc(); - if (getParser().parseExpression(IdVal)) - return true; - - E = SMLoc::getFromPointer(getLoc().getPointer() - 1); - Operands.push_back(ARM64Operand::CreateImm(IdVal, S, E, getContext())); - return false; - } - case AsmToken::Integer: - case AsmToken::Real: - case AsmToken::Hash: { - // #42 -> immediate. - S = getLoc(); - if (getLexer().is(AsmToken::Hash)) - Parser.Lex(); - - // Parse a negative sign - bool isNegative = false; - if (Parser.getTok().is(AsmToken::Minus)) { - isNegative = true; - // We need to consume this token only when we have a Real, otherwise - // we let parseSymbolicImmVal take care of it - if (Parser.getLexer().peekTok().is(AsmToken::Real)) - Parser.Lex(); - } - - // The only Real that should come through here is a literal #0.0 for - // the fcmp[e] r, #0.0 instructions. 
They expect raw token operands, - // so convert the value. - const AsmToken &Tok = Parser.getTok(); - if (Tok.is(AsmToken::Real)) { - APFloat RealVal(APFloat::IEEEdouble, Tok.getString()); - uint64_t IntVal = RealVal.bitcastToAPInt().getZExtValue(); - if (Mnemonic != "fcmp" && Mnemonic != "fcmpe" && Mnemonic != "fcmeq" && - Mnemonic != "fcmge" && Mnemonic != "fcmgt" && Mnemonic != "fcmle" && - Mnemonic != "fcmlt") - return TokError("unexpected floating point literal"); - else if (IntVal != 0 || isNegative) - return TokError("expected floating-point constant #0.0"); - Parser.Lex(); // Eat the token. - - Operands.push_back( - ARM64Operand::CreateToken("#0", false, S, getContext())); - Operands.push_back( - ARM64Operand::CreateToken(".0", false, S, getContext())); - return false; - } - - const MCExpr *ImmVal; - if (parseSymbolicImmVal(ImmVal)) - return true; - - E = SMLoc::getFromPointer(getLoc().getPointer() - 1); - Operands.push_back(ARM64Operand::CreateImm(ImmVal, S, E, getContext())); - return false; - } - } -} - -/// ParseInstruction - Parse an ARM64 instruction mnemonic followed by its -/// operands. -bool ARM64AsmParser::ParseInstruction(ParseInstructionInfo &Info, - StringRef Name, SMLoc NameLoc, - OperandVector &Operands) { - Name = StringSwitch(Name.lower()) - .Case("beq", "b.eq") - .Case("bne", "b.ne") - .Case("bhs", "b.hs") - .Case("bcs", "b.cs") - .Case("blo", "b.lo") - .Case("bcc", "b.cc") - .Case("bmi", "b.mi") - .Case("bpl", "b.pl") - .Case("bvs", "b.vs") - .Case("bvc", "b.vc") - .Case("bhi", "b.hi") - .Case("bls", "b.ls") - .Case("bge", "b.ge") - .Case("blt", "b.lt") - .Case("bgt", "b.gt") - .Case("ble", "b.le") - .Case("bal", "b.al") - .Case("bnv", "b.nv") - .Default(Name); - - // Create the leading tokens for the mnemonic, split by '.' characters. - size_t Start = 0, Next = Name.find('.'); - StringRef Head = Name.slice(Start, Next); - - // IC, DC, AT, and TLBI instructions are aliases for the SYS instruction. - if (Head == "ic" || Head == "dc" || Head == "at" || Head == "tlbi") { - bool IsError = parseSysAlias(Head, NameLoc, Operands); - if (IsError && getLexer().isNot(AsmToken::EndOfStatement)) - Parser.eatToEndOfStatement(); - return IsError; - } - - Operands.push_back( - ARM64Operand::CreateToken(Head, false, NameLoc, getContext())); - Mnemonic = Head; - - // Handle condition codes for a branch mnemonic - if (Head == "b" && Next != StringRef::npos) { - Start = Next; - Next = Name.find('.', Start + 1); - Head = Name.slice(Start + 1, Next); - - SMLoc SuffixLoc = SMLoc::getFromPointer(NameLoc.getPointer() + - (Head.data() - Name.data())); - ARM64CC::CondCode CC = parseCondCodeString(Head); - if (CC == ARM64CC::Invalid) - return Error(SuffixLoc, "invalid condition code"); - Operands.push_back( - ARM64Operand::CreateToken(".", true, SuffixLoc, getContext())); - Operands.push_back( - ARM64Operand::CreateCondCode(CC, NameLoc, NameLoc, getContext())); - } - - // Add the remaining tokens in the mnemonic. - while (Next != StringRef::npos) { - Start = Next; - Next = Name.find('.', Start + 1); - Head = Name.slice(Start, Next); - SMLoc SuffixLoc = SMLoc::getFromPointer(NameLoc.getPointer() + - (Head.data() - Name.data()) + 1); - Operands.push_back( - ARM64Operand::CreateToken(Head, true, SuffixLoc, getContext())); - } - - // Conditional compare instructions have a Condition Code operand, which needs - // to be parsed and an immediate operand created. 
- bool condCodeFourthOperand = - (Head == "ccmp" || Head == "ccmn" || Head == "fccmp" || - Head == "fccmpe" || Head == "fcsel" || Head == "csel" || - Head == "csinc" || Head == "csinv" || Head == "csneg"); - - // These instructions are aliases to some of the conditional select - // instructions. However, the condition code is inverted in the aliased - // instruction. - // - // FIXME: Is this the correct way to handle these? Or should the parser - // generate the aliased instructions directly? - bool condCodeSecondOperand = (Head == "cset" || Head == "csetm"); - bool condCodeThirdOperand = - (Head == "cinc" || Head == "cinv" || Head == "cneg"); - - // Read the remaining operands. - if (getLexer().isNot(AsmToken::EndOfStatement)) { - // Read the first operand. - if (parseOperand(Operands, false, false)) { - Parser.eatToEndOfStatement(); - return true; - } - - unsigned N = 2; - while (getLexer().is(AsmToken::Comma)) { - Parser.Lex(); // Eat the comma. - - // Parse and remember the operand. - if (parseOperand(Operands, (N == 4 && condCodeFourthOperand) || - (N == 3 && condCodeThirdOperand) || - (N == 2 && condCodeSecondOperand), - condCodeSecondOperand || condCodeThirdOperand)) { - Parser.eatToEndOfStatement(); - return true; - } - - // After successfully parsing some operands there are two special cases to - // consider (i.e. notional operands not separated by commas). Both are due - // to memory specifiers: - // + An RBrac will end an address for load/store/prefetch - // + An '!' will indicate a pre-indexed operation. - // - // It's someone else's responsibility to make sure these tokens are sane - // in the given context! - if (Parser.getTok().is(AsmToken::RBrac)) { - SMLoc Loc = Parser.getTok().getLoc(); - Operands.push_back(ARM64Operand::CreateToken("]", false, Loc, - getContext())); - Parser.Lex(); - } - - if (Parser.getTok().is(AsmToken::Exclaim)) { - SMLoc Loc = Parser.getTok().getLoc(); - Operands.push_back(ARM64Operand::CreateToken("!", false, Loc, - getContext())); - Parser.Lex(); - } - - ++N; - } - } - - if (getLexer().isNot(AsmToken::EndOfStatement)) { - SMLoc Loc = Parser.getTok().getLoc(); - Parser.eatToEndOfStatement(); - return Error(Loc, "unexpected token in argument list"); - } - - Parser.Lex(); // Consume the EndOfStatement - return false; -} - -// FIXME: This entire function is a giant hack to provide us with decent -// operand range validation/diagnostics until TableGen/MC can be extended -// to support autogeneration of this kind of validation. -bool ARM64AsmParser::validateInstruction(MCInst &Inst, - SmallVectorImpl &Loc) { - const MCRegisterInfo *RI = getContext().getRegisterInfo(); - // Check for indexed addressing modes w/ the base register being the - // same as a destination/source register or pair load where - // the Rt == Rt2. All of those are undefined behaviour. 
- switch (Inst.getOpcode()) { - case ARM64::LDPSWpre: - case ARM64::LDPWpost: - case ARM64::LDPWpre: - case ARM64::LDPXpost: - case ARM64::LDPXpre: { - unsigned Rt = Inst.getOperand(1).getReg(); - unsigned Rt2 = Inst.getOperand(2).getReg(); - unsigned Rn = Inst.getOperand(3).getReg(); - if (RI->isSubRegisterEq(Rn, Rt)) - return Error(Loc[0], "unpredictable LDP instruction, writeback base " - "is also a destination"); - if (RI->isSubRegisterEq(Rn, Rt2)) - return Error(Loc[1], "unpredictable LDP instruction, writeback base " - "is also a destination"); - // FALLTHROUGH - } - case ARM64::LDPDi: - case ARM64::LDPQi: - case ARM64::LDPSi: - case ARM64::LDPSWi: - case ARM64::LDPWi: - case ARM64::LDPXi: { - unsigned Rt = Inst.getOperand(0).getReg(); - unsigned Rt2 = Inst.getOperand(1).getReg(); - if (Rt == Rt2) - return Error(Loc[1], "unpredictable LDP instruction, Rt2==Rt"); - break; - } - case ARM64::LDPDpost: - case ARM64::LDPDpre: - case ARM64::LDPQpost: - case ARM64::LDPQpre: - case ARM64::LDPSpost: - case ARM64::LDPSpre: - case ARM64::LDPSWpost: { - unsigned Rt = Inst.getOperand(1).getReg(); - unsigned Rt2 = Inst.getOperand(2).getReg(); - if (Rt == Rt2) - return Error(Loc[1], "unpredictable LDP instruction, Rt2==Rt"); - break; - } - case ARM64::STPDpost: - case ARM64::STPDpre: - case ARM64::STPQpost: - case ARM64::STPQpre: - case ARM64::STPSpost: - case ARM64::STPSpre: - case ARM64::STPWpost: - case ARM64::STPWpre: - case ARM64::STPXpost: - case ARM64::STPXpre: { - unsigned Rt = Inst.getOperand(1).getReg(); - unsigned Rt2 = Inst.getOperand(2).getReg(); - unsigned Rn = Inst.getOperand(3).getReg(); - if (RI->isSubRegisterEq(Rn, Rt)) - return Error(Loc[0], "unpredictable STP instruction, writeback base " - "is also a source"); - if (RI->isSubRegisterEq(Rn, Rt2)) - return Error(Loc[1], "unpredictable STP instruction, writeback base " - "is also a source"); - break; - } - case ARM64::LDRBBpre: - case ARM64::LDRBpre: - case ARM64::LDRHHpre: - case ARM64::LDRHpre: - case ARM64::LDRSBWpre: - case ARM64::LDRSBXpre: - case ARM64::LDRSHWpre: - case ARM64::LDRSHXpre: - case ARM64::LDRSWpre: - case ARM64::LDRWpre: - case ARM64::LDRXpre: - case ARM64::LDRBBpost: - case ARM64::LDRBpost: - case ARM64::LDRHHpost: - case ARM64::LDRHpost: - case ARM64::LDRSBWpost: - case ARM64::LDRSBXpost: - case ARM64::LDRSHWpost: - case ARM64::LDRSHXpost: - case ARM64::LDRSWpost: - case ARM64::LDRWpost: - case ARM64::LDRXpost: { - unsigned Rt = Inst.getOperand(1).getReg(); - unsigned Rn = Inst.getOperand(2).getReg(); - if (RI->isSubRegisterEq(Rn, Rt)) - return Error(Loc[0], "unpredictable LDR instruction, writeback base " - "is also a source"); - break; - } - case ARM64::STRBBpost: - case ARM64::STRBpost: - case ARM64::STRHHpost: - case ARM64::STRHpost: - case ARM64::STRWpost: - case ARM64::STRXpost: - case ARM64::STRBBpre: - case ARM64::STRBpre: - case ARM64::STRHHpre: - case ARM64::STRHpre: - case ARM64::STRWpre: - case ARM64::STRXpre: { - unsigned Rt = Inst.getOperand(1).getReg(); - unsigned Rn = Inst.getOperand(2).getReg(); - if (RI->isSubRegisterEq(Rn, Rt)) - return Error(Loc[0], "unpredictable STR instruction, writeback base " - "is also a source"); - break; - } - } - - // Now check immediate ranges. Separate from the above as there is overlap - // in the instructions being checked and this keeps the nested conditionals - // to a minimum. 
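The writeback checks above reject the architecturally unpredictable forms, e.g. "ldp x0, x1, [x0], #16" (base is also a destination) or "ldp x2, x2, [x3]" (Rt == Rt2). A simplified predicate for the paired-load case, ignoring the sub-register aliasing that the real code handles through isSubRegisterEq (sketch only):

  static bool isUnpredictableLDPWriteback(unsigned Rn, unsigned Rt, unsigned Rt2) {
    return Rn == Rt || Rn == Rt2 || Rt == Rt2;
  }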
- switch (Inst.getOpcode()) { - case ARM64::ADDSWri: - case ARM64::ADDSXri: - case ARM64::ADDWri: - case ARM64::ADDXri: - case ARM64::SUBSWri: - case ARM64::SUBSXri: - case ARM64::SUBWri: - case ARM64::SUBXri: { - // Annoyingly we can't do this in the isAddSubImm predicate, so there is - // some slight duplication here. - if (Inst.getOperand(2).isExpr()) { - const MCExpr *Expr = Inst.getOperand(2).getExpr(); - ARM64MCExpr::VariantKind ELFRefKind; - MCSymbolRefExpr::VariantKind DarwinRefKind; - int64_t Addend; - if (!classifySymbolRef(Expr, ELFRefKind, DarwinRefKind, Addend)) { - return Error(Loc[2], "invalid immediate expression"); - } - - // Only allow these with ADDXri. - if ((DarwinRefKind == MCSymbolRefExpr::VK_PAGEOFF || - DarwinRefKind == MCSymbolRefExpr::VK_TLVPPAGEOFF) && - Inst.getOpcode() == ARM64::ADDXri) - return false; - - // Only allow these with ADDXri/ADDWri - if ((ELFRefKind == ARM64MCExpr::VK_LO12 || - ELFRefKind == ARM64MCExpr::VK_DTPREL_HI12 || - ELFRefKind == ARM64MCExpr::VK_DTPREL_LO12 || - ELFRefKind == ARM64MCExpr::VK_DTPREL_LO12_NC || - ELFRefKind == ARM64MCExpr::VK_TPREL_HI12 || - ELFRefKind == ARM64MCExpr::VK_TPREL_LO12 || - ELFRefKind == ARM64MCExpr::VK_TPREL_LO12_NC || - ELFRefKind == ARM64MCExpr::VK_TLSDESC_LO12) && - (Inst.getOpcode() == ARM64::ADDXri || - Inst.getOpcode() == ARM64::ADDWri)) - return false; - - // Don't allow expressions in the immediate field otherwise - return Error(Loc[2], "invalid immediate expression"); - } - return false; - } - default: - return false; - } -} - -bool ARM64AsmParser::showMatchError(SMLoc Loc, unsigned ErrCode) { - switch (ErrCode) { - case Match_MissingFeature: - return Error(Loc, - "instruction requires a CPU feature not currently enabled"); - case Match_InvalidOperand: - return Error(Loc, "invalid operand for instruction"); - case Match_InvalidSuffix: - return Error(Loc, "invalid type suffix for instruction"); - case Match_InvalidCondCode: - return Error(Loc, "expected AArch64 condition code"); - case Match_AddSubRegExtendSmall: - return Error(Loc, - "expected '[su]xt[bhw]' or 'lsl' with optional integer in range [0, 4]"); - case Match_AddSubRegExtendLarge: - return Error(Loc, - "expected 'sxtx' 'uxtx' or 'lsl' with optional integer in range [0, 4]"); - case Match_AddSubSecondSource: - return Error(Loc, - "expected compatible register, symbol or integer in range [0, 4095]"); - case Match_LogicalSecondSource: - return Error(Loc, "expected compatible register or logical immediate"); - case Match_InvalidMovImm32Shift: - return Error(Loc, "expected 'lsl' with optional integer 0 or 16"); - case Match_InvalidMovImm64Shift: - return Error(Loc, "expected 'lsl' with optional integer 0, 16, 32 or 48"); - case Match_AddSubRegShift32: - return Error(Loc, - "expected 'lsl', 'lsr' or 'asr' with optional integer in range [0, 31]"); - case Match_AddSubRegShift64: - return Error(Loc, - "expected 'lsl', 'lsr' or 'asr' with optional integer in range [0, 63]"); - case Match_InvalidFPImm: - return Error(Loc, - "expected compatible register or floating-point constant"); - case Match_InvalidMemoryIndexedSImm9: - return Error(Loc, "index must be an integer in range [-256, 255]."); - case Match_InvalidMemoryIndexed4SImm7: - return Error(Loc, "index must be a multiple of 4 in range [-256, 252]."); - case Match_InvalidMemoryIndexed8SImm7: - return Error(Loc, "index must be a multiple of 8 in range [-512, 504]."); - case Match_InvalidMemoryIndexed16SImm7: - return Error(Loc, "index must be a multiple of 16 in range [-1024, 1008]."); - case 
Match_InvalidMemoryWExtend8: - return Error(Loc, - "expected 'uxtw' or 'sxtw' with optional shift of #0"); - case Match_InvalidMemoryWExtend16: - return Error(Loc, - "expected 'uxtw' or 'sxtw' with optional shift of #0 or #1"); - case Match_InvalidMemoryWExtend32: - return Error(Loc, - "expected 'uxtw' or 'sxtw' with optional shift of #0 or #2"); - case Match_InvalidMemoryWExtend64: - return Error(Loc, - "expected 'uxtw' or 'sxtw' with optional shift of #0 or #3"); - case Match_InvalidMemoryWExtend128: - return Error(Loc, - "expected 'uxtw' or 'sxtw' with optional shift of #0 or #4"); - case Match_InvalidMemoryXExtend8: - return Error(Loc, - "expected 'lsl' or 'sxtx' with optional shift of #0"); - case Match_InvalidMemoryXExtend16: - return Error(Loc, - "expected 'lsl' or 'sxtx' with optional shift of #0 or #1"); - case Match_InvalidMemoryXExtend32: - return Error(Loc, - "expected 'lsl' or 'sxtx' with optional shift of #0 or #2"); - case Match_InvalidMemoryXExtend64: - return Error(Loc, - "expected 'lsl' or 'sxtx' with optional shift of #0 or #3"); - case Match_InvalidMemoryXExtend128: - return Error(Loc, - "expected 'lsl' or 'sxtx' with optional shift of #0 or #4"); - case Match_InvalidMemoryIndexed1: - return Error(Loc, "index must be an integer in range [0, 4095]."); - case Match_InvalidMemoryIndexed2: - return Error(Loc, "index must be a multiple of 2 in range [0, 8190]."); - case Match_InvalidMemoryIndexed4: - return Error(Loc, "index must be a multiple of 4 in range [0, 16380]."); - case Match_InvalidMemoryIndexed8: - return Error(Loc, "index must be a multiple of 8 in range [0, 32760]."); - case Match_InvalidMemoryIndexed16: - return Error(Loc, "index must be a multiple of 16 in range [0, 65520]."); - case Match_InvalidImm0_7: - return Error(Loc, "immediate must be an integer in range [0, 7]."); - case Match_InvalidImm0_15: - return Error(Loc, "immediate must be an integer in range [0, 15]."); - case Match_InvalidImm0_31: - return Error(Loc, "immediate must be an integer in range [0, 31]."); - case Match_InvalidImm0_63: - return Error(Loc, "immediate must be an integer in range [0, 63]."); - case Match_InvalidImm0_127: - return Error(Loc, "immediate must be an integer in range [0, 127]."); - case Match_InvalidImm0_65535: - return Error(Loc, "immediate must be an integer in range [0, 65535]."); - case Match_InvalidImm1_8: - return Error(Loc, "immediate must be an integer in range [1, 8]."); - case Match_InvalidImm1_16: - return Error(Loc, "immediate must be an integer in range [1, 16]."); - case Match_InvalidImm1_32: - return Error(Loc, "immediate must be an integer in range [1, 32]."); - case Match_InvalidImm1_64: - return Error(Loc, "immediate must be an integer in range [1, 64]."); - case Match_InvalidIndex1: - return Error(Loc, "expected lane specifier '[1]'"); - case Match_InvalidIndexB: - return Error(Loc, "vector lane must be an integer in range [0, 15]."); - case Match_InvalidIndexH: - return Error(Loc, "vector lane must be an integer in range [0, 7]."); - case Match_InvalidIndexS: - return Error(Loc, "vector lane must be an integer in range [0, 3]."); - case Match_InvalidIndexD: - return Error(Loc, "vector lane must be an integer in range [0, 1]."); - case Match_InvalidLabel: - return Error(Loc, "expected label or encodable integer pc offset"); - case Match_MRS: - return Error(Loc, "expected readable system register"); - case Match_MSR: - return Error(Loc, "expected writable system register or pstate"); - case Match_MnemonicFail: - return Error(Loc, "unrecognized instruction 
mnemonic"); - default: - assert(0 && "unexpected error code!"); - return Error(Loc, "invalid instruction format"); - } -} - -static const char *getSubtargetFeatureName(unsigned Val); - -bool ARM64AsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode, - OperandVector &Operands, - MCStreamer &Out, - unsigned &ErrorInfo, - bool MatchingInlineAsm) { - assert(!Operands.empty() && "Unexpect empty operand list!"); - ARM64Operand *Op = static_cast(Operands[0]); - assert(Op->isToken() && "Leading operand should always be a mnemonic!"); - - StringRef Tok = Op->getToken(); - unsigned NumOperands = Operands.size(); - - if (NumOperands == 4 && Tok == "lsl") { - ARM64Operand *Op2 = static_cast(Operands[2]); - ARM64Operand *Op3 = static_cast(Operands[3]); - if (Op2->isReg() && Op3->isImm()) { - const MCConstantExpr *Op3CE = dyn_cast(Op3->getImm()); - if (Op3CE) { - uint64_t Op3Val = Op3CE->getValue(); - uint64_t NewOp3Val = 0; - uint64_t NewOp4Val = 0; - if (ARM64MCRegisterClasses[ARM64::GPR32allRegClassID].contains( - Op2->getReg())) { - NewOp3Val = (32 - Op3Val) & 0x1f; - NewOp4Val = 31 - Op3Val; - } else { - NewOp3Val = (64 - Op3Val) & 0x3f; - NewOp4Val = 63 - Op3Val; - } - - const MCExpr *NewOp3 = MCConstantExpr::Create(NewOp3Val, getContext()); - const MCExpr *NewOp4 = MCConstantExpr::Create(NewOp4Val, getContext()); - - Operands[0] = ARM64Operand::CreateToken( - "ubfm", false, Op->getStartLoc(), getContext()); - Operands[3] = ARM64Operand::CreateImm(NewOp3, Op3->getStartLoc(), - Op3->getEndLoc(), getContext()); - Operands.push_back(ARM64Operand::CreateImm( - NewOp4, Op3->getStartLoc(), Op3->getEndLoc(), getContext())); - delete Op3; - delete Op; - } - } - } else if (NumOperands == 5) { - // FIXME: Horrible hack to handle the BFI -> BFM, SBFIZ->SBFM, and - // UBFIZ -> UBFM aliases. 
- if (Tok == "bfi" || Tok == "sbfiz" || Tok == "ubfiz") { - ARM64Operand *Op1 = static_cast(Operands[1]); - ARM64Operand *Op3 = static_cast(Operands[3]); - ARM64Operand *Op4 = static_cast(Operands[4]); - - if (Op1->isReg() && Op3->isImm() && Op4->isImm()) { - const MCConstantExpr *Op3CE = dyn_cast(Op3->getImm()); - const MCConstantExpr *Op4CE = dyn_cast(Op4->getImm()); - - if (Op3CE && Op4CE) { - uint64_t Op3Val = Op3CE->getValue(); - uint64_t Op4Val = Op4CE->getValue(); - - uint64_t RegWidth = 0; - if (ARM64MCRegisterClasses[ARM64::GPR64allRegClassID].contains( - Op1->getReg())) - RegWidth = 64; - else - RegWidth = 32; - - if (Op3Val >= RegWidth) - return Error(Op3->getStartLoc(), - "expected integer in range [0, 31]"); - if (Op4Val < 1 || Op4Val > RegWidth) - return Error(Op4->getStartLoc(), - "expected integer in range [1, 32]"); - - uint64_t NewOp3Val = 0; - if (ARM64MCRegisterClasses[ARM64::GPR32allRegClassID].contains( - Op1->getReg())) - NewOp3Val = (32 - Op3Val) & 0x1f; - else - NewOp3Val = (64 - Op3Val) & 0x3f; - - uint64_t NewOp4Val = Op4Val - 1; - - if (NewOp3Val != 0 && NewOp4Val >= NewOp3Val) - return Error(Op4->getStartLoc(), - "requested insert overflows register"); - - const MCExpr *NewOp3 = - MCConstantExpr::Create(NewOp3Val, getContext()); - const MCExpr *NewOp4 = - MCConstantExpr::Create(NewOp4Val, getContext()); - Operands[3] = ARM64Operand::CreateImm(NewOp3, Op3->getStartLoc(), - Op3->getEndLoc(), getContext()); - Operands[4] = ARM64Operand::CreateImm(NewOp4, Op4->getStartLoc(), - Op4->getEndLoc(), getContext()); - if (Tok == "bfi") - Operands[0] = ARM64Operand::CreateToken( - "bfm", false, Op->getStartLoc(), getContext()); - else if (Tok == "sbfiz") - Operands[0] = ARM64Operand::CreateToken( - "sbfm", false, Op->getStartLoc(), getContext()); - else if (Tok == "ubfiz") - Operands[0] = ARM64Operand::CreateToken( - "ubfm", false, Op->getStartLoc(), getContext()); - else - llvm_unreachable("No valid mnemonic for alias?"); - - delete Op; - delete Op3; - delete Op4; - } - } - - // FIXME: Horrible hack to handle the BFXIL->BFM, SBFX->SBFM, and - // UBFX -> UBFM aliases. 
- } else if (NumOperands == 5 && - (Tok == "bfxil" || Tok == "sbfx" || Tok == "ubfx")) { - ARM64Operand *Op1 = static_cast(Operands[1]); - ARM64Operand *Op3 = static_cast(Operands[3]); - ARM64Operand *Op4 = static_cast(Operands[4]); - - if (Op1->isReg() && Op3->isImm() && Op4->isImm()) { - const MCConstantExpr *Op3CE = dyn_cast(Op3->getImm()); - const MCConstantExpr *Op4CE = dyn_cast(Op4->getImm()); - - if (Op3CE && Op4CE) { - uint64_t Op3Val = Op3CE->getValue(); - uint64_t Op4Val = Op4CE->getValue(); - - uint64_t RegWidth = 0; - if (ARM64MCRegisterClasses[ARM64::GPR64allRegClassID].contains( - Op1->getReg())) - RegWidth = 64; - else - RegWidth = 32; - - if (Op3Val >= RegWidth) - return Error(Op3->getStartLoc(), - "expected integer in range [0, 31]"); - if (Op4Val < 1 || Op4Val > RegWidth) - return Error(Op4->getStartLoc(), - "expected integer in range [1, 32]"); - - uint64_t NewOp4Val = Op3Val + Op4Val - 1; - - if (NewOp4Val >= RegWidth || NewOp4Val < Op3Val) - return Error(Op4->getStartLoc(), - "requested extract overflows register"); - - const MCExpr *NewOp4 = - MCConstantExpr::Create(NewOp4Val, getContext()); - Operands[4] = ARM64Operand::CreateImm( - NewOp4, Op4->getStartLoc(), Op4->getEndLoc(), getContext()); - if (Tok == "bfxil") - Operands[0] = ARM64Operand::CreateToken( - "bfm", false, Op->getStartLoc(), getContext()); - else if (Tok == "sbfx") - Operands[0] = ARM64Operand::CreateToken( - "sbfm", false, Op->getStartLoc(), getContext()); - else if (Tok == "ubfx") - Operands[0] = ARM64Operand::CreateToken( - "ubfm", false, Op->getStartLoc(), getContext()); - else - llvm_unreachable("No valid mnemonic for alias?"); - - delete Op; - delete Op4; - } - } - } - } - // FIXME: Horrible hack for sxtw and uxtw with Wn src and Xd dst operands. - // InstAlias can't quite handle this since the reg classes aren't - // subclasses. - if (NumOperands == 3 && (Tok == "sxtw" || Tok == "uxtw")) { - // The source register can be Wn here, but the matcher expects a - // GPR64. Twiddle it here if necessary. - ARM64Operand *Op = static_cast(Operands[2]); - if (Op->isReg()) { - unsigned Reg = getXRegFromWReg(Op->getReg()); - Operands[2] = ARM64Operand::CreateReg(Reg, false, Op->getStartLoc(), - Op->getEndLoc(), getContext()); - delete Op; - } - } - // FIXME: Likewise for sxt[bh] with a Xd dst operand - else if (NumOperands == 3 && (Tok == "sxtb" || Tok == "sxth")) { - ARM64Operand *Op = static_cast(Operands[1]); - if (Op->isReg() && - ARM64MCRegisterClasses[ARM64::GPR64allRegClassID].contains( - Op->getReg())) { - // The source register can be Wn here, but the matcher expects a - // GPR64. Twiddle it here if necessary. - ARM64Operand *Op = static_cast(Operands[2]); - if (Op->isReg()) { - unsigned Reg = getXRegFromWReg(Op->getReg()); - Operands[2] = ARM64Operand::CreateReg(Reg, false, Op->getStartLoc(), - Op->getEndLoc(), getContext()); - delete Op; - } - } - } - // FIXME: Likewise for uxt[bh] with a Xd dst operand - else if (NumOperands == 3 && (Tok == "uxtb" || Tok == "uxth")) { - ARM64Operand *Op = static_cast(Operands[1]); - if (Op->isReg() && - ARM64MCRegisterClasses[ARM64::GPR64allRegClassID].contains( - Op->getReg())) { - // The source register can be Wn here, but the matcher expects a - // GPR32. Twiddle it here if necessary. 
- ARM64Operand *Op = static_cast(Operands[1]); - if (Op->isReg()) { - unsigned Reg = getWRegFromXReg(Op->getReg()); - Operands[1] = ARM64Operand::CreateReg(Reg, false, Op->getStartLoc(), - Op->getEndLoc(), getContext()); - delete Op; - } - } - } - - // Yet another horrible hack to handle FMOV Rd, #0.0 using [WX]ZR. - if (NumOperands == 3 && Tok == "fmov") { - ARM64Operand *RegOp = static_cast(Operands[1]); - ARM64Operand *ImmOp = static_cast(Operands[2]); - if (RegOp->isReg() && ImmOp->isFPImm() && - ImmOp->getFPImm() == (unsigned)-1) { - unsigned zreg = ARM64MCRegisterClasses[ARM64::FPR32RegClassID].contains( - RegOp->getReg()) - ? ARM64::WZR - : ARM64::XZR; - Operands[2] = ARM64Operand::CreateReg(zreg, false, Op->getStartLoc(), - Op->getEndLoc(), getContext()); - delete ImmOp; - } - } - - MCInst Inst; - // First try to match against the secondary set of tables containing the - // short-form NEON instructions (e.g. "fadd.2s v0, v1, v2"). - unsigned MatchResult = - MatchInstructionImpl(Operands, Inst, ErrorInfo, MatchingInlineAsm, 1); - - // If that fails, try against the alternate table containing long-form NEON: - // "fadd v0.2s, v1.2s, v2.2s" - if (MatchResult != Match_Success) - MatchResult = - MatchInstructionImpl(Operands, Inst, ErrorInfo, MatchingInlineAsm, 0); - - switch (MatchResult) { - case Match_Success: { - // Perform range checking and other semantic validations - SmallVector OperandLocs; - NumOperands = Operands.size(); - for (unsigned i = 1; i < NumOperands; ++i) - OperandLocs.push_back(Operands[i]->getStartLoc()); - if (validateInstruction(Inst, OperandLocs)) - return true; - - Inst.setLoc(IDLoc); - Out.EmitInstruction(Inst, STI); - return false; - } - case Match_MissingFeature: { - assert(ErrorInfo && "Unknown missing feature!"); - // Special case the error message for the very common case where only - // a single subtarget feature is missing (neon, e.g.). - std::string Msg = "instruction requires:"; - unsigned Mask = 1; - for (unsigned i = 0; i < (sizeof(ErrorInfo)*8-1); ++i) { - if (ErrorInfo & Mask) { - Msg += " "; - Msg += getSubtargetFeatureName(ErrorInfo & Mask); - } - Mask <<= 1; - } - return Error(IDLoc, Msg); - } - case Match_MnemonicFail: - return showMatchError(IDLoc, MatchResult); - case Match_InvalidOperand: { - SMLoc ErrorLoc = IDLoc; - if (ErrorInfo != ~0U) { - if (ErrorInfo >= Operands.size()) - return Error(IDLoc, "too few operands for instruction"); - - ErrorLoc = ((ARM64Operand *)Operands[ErrorInfo])->getStartLoc(); - if (ErrorLoc == SMLoc()) - ErrorLoc = IDLoc; - } - // If the match failed on a suffix token operand, tweak the diagnostic - // accordingly. 
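The Match_MissingFeature path above treats ErrorInfo as a bit mask of required-but-disabled subtarget features and names each one in the diagnostic. A self-contained sketch of that walk, with the name callback passed in as a stand-in for the generated getSubtargetFeatureName (illustrative only):

  #include <cstdint>
  #include <string>
  static std::string listMissingFeatures(uint64_t ErrorInfo,
                                         const char *(*FeatureName)(uint64_t)) {
    std::string Msg = "instruction requires:";
    for (uint64_t Mask = 1; Mask; Mask <<= 1)
      if (ErrorInfo & Mask) {
        Msg += " ";
        Msg += FeatureName(ErrorInfo & Mask);
      }
    return Msg;
  }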
- if (((ARM64Operand *)Operands[ErrorInfo])->isToken() && - ((ARM64Operand *)Operands[ErrorInfo])->isTokenSuffix()) - MatchResult = Match_InvalidSuffix; - - return showMatchError(ErrorLoc, MatchResult); - } - case Match_InvalidMemoryIndexed1: - case Match_InvalidMemoryIndexed2: - case Match_InvalidMemoryIndexed4: - case Match_InvalidMemoryIndexed8: - case Match_InvalidMemoryIndexed16: - case Match_InvalidCondCode: - case Match_AddSubRegExtendSmall: - case Match_AddSubRegExtendLarge: - case Match_AddSubSecondSource: - case Match_LogicalSecondSource: - case Match_AddSubRegShift32: - case Match_AddSubRegShift64: - case Match_InvalidMovImm32Shift: - case Match_InvalidMovImm64Shift: - case Match_InvalidFPImm: - case Match_InvalidMemoryWExtend8: - case Match_InvalidMemoryWExtend16: - case Match_InvalidMemoryWExtend32: - case Match_InvalidMemoryWExtend64: - case Match_InvalidMemoryWExtend128: - case Match_InvalidMemoryXExtend8: - case Match_InvalidMemoryXExtend16: - case Match_InvalidMemoryXExtend32: - case Match_InvalidMemoryXExtend64: - case Match_InvalidMemoryXExtend128: - case Match_InvalidMemoryIndexed4SImm7: - case Match_InvalidMemoryIndexed8SImm7: - case Match_InvalidMemoryIndexed16SImm7: - case Match_InvalidMemoryIndexedSImm9: - case Match_InvalidImm0_7: - case Match_InvalidImm0_15: - case Match_InvalidImm0_31: - case Match_InvalidImm0_63: - case Match_InvalidImm0_127: - case Match_InvalidImm0_65535: - case Match_InvalidImm1_8: - case Match_InvalidImm1_16: - case Match_InvalidImm1_32: - case Match_InvalidImm1_64: - case Match_InvalidIndex1: - case Match_InvalidIndexB: - case Match_InvalidIndexH: - case Match_InvalidIndexS: - case Match_InvalidIndexD: - case Match_InvalidLabel: - case Match_MSR: - case Match_MRS: { - // Any time we get here, there's nothing fancy to do. Just get the - // operand SMLoc and display the diagnostic. - SMLoc ErrorLoc = ((ARM64Operand *)Operands[ErrorInfo])->getStartLoc(); - if (ErrorLoc == SMLoc()) - ErrorLoc = IDLoc; - return showMatchError(ErrorLoc, MatchResult); - } - } - - llvm_unreachable("Implement any new match types added!"); - return true; -} - -/// ParseDirective parses the arm specific directives -bool ARM64AsmParser::ParseDirective(AsmToken DirectiveID) { - StringRef IDVal = DirectiveID.getIdentifier(); - SMLoc Loc = DirectiveID.getLoc(); - if (IDVal == ".hword") - return parseDirectiveWord(2, Loc); - if (IDVal == ".word") - return parseDirectiveWord(4, Loc); - if (IDVal == ".xword") - return parseDirectiveWord(8, Loc); - if (IDVal == ".tlsdesccall") - return parseDirectiveTLSDescCall(Loc); - - return parseDirectiveLOH(IDVal, Loc); -} - -/// parseDirectiveWord -/// ::= .word [ expression (, expression)* ] -bool ARM64AsmParser::parseDirectiveWord(unsigned Size, SMLoc L) { - if (getLexer().isNot(AsmToken::EndOfStatement)) { - for (;;) { - const MCExpr *Value; - if (getParser().parseExpression(Value)) - return true; - - getParser().getStreamer().EmitValue(Value, Size); - - if (getLexer().is(AsmToken::EndOfStatement)) - break; - - // FIXME: Improve diagnostic. 
- if (getLexer().isNot(AsmToken::Comma)) - return Error(L, "unexpected token in directive"); - Parser.Lex(); - } - } - - Parser.Lex(); - return false; -} - -// parseDirectiveTLSDescCall: -// ::= .tlsdesccall symbol -bool ARM64AsmParser::parseDirectiveTLSDescCall(SMLoc L) { - StringRef Name; - if (getParser().parseIdentifier(Name)) - return Error(L, "expected symbol after directive"); - - MCSymbol *Sym = getContext().GetOrCreateSymbol(Name); - const MCExpr *Expr = MCSymbolRefExpr::Create(Sym, getContext()); - Expr = ARM64MCExpr::Create(Expr, ARM64MCExpr::VK_TLSDESC, getContext()); - - MCInst Inst; - Inst.setOpcode(ARM64::TLSDESCCALL); - Inst.addOperand(MCOperand::CreateExpr(Expr)); - - getParser().getStreamer().EmitInstruction(Inst, STI); - return false; -} - -/// ::= .loh label1, ..., labelN -/// The number of arguments depends on the loh identifier. -bool ARM64AsmParser::parseDirectiveLOH(StringRef IDVal, SMLoc Loc) { - if (IDVal != MCLOHDirectiveName()) - return true; - MCLOHType Kind; - if (getParser().getTok().isNot(AsmToken::Identifier)) { - if (getParser().getTok().isNot(AsmToken::Integer)) - return TokError("expected an identifier or a number in directive"); - // We successfully get a numeric value for the identifier. - // Check if it is valid. - int64_t Id = getParser().getTok().getIntVal(); - Kind = (MCLOHType)Id; - // Check that Id does not overflow MCLOHType. - if (!isValidMCLOHType(Kind) || Id != Kind) - return TokError("invalid numeric identifier in directive"); - } else { - StringRef Name = getTok().getIdentifier(); - // We successfully parse an identifier. - // Check if it is a recognized one. - int Id = MCLOHNameToId(Name); - - if (Id == -1) - return TokError("invalid identifier in directive"); - Kind = (MCLOHType)Id; - } - // Consume the identifier. - Lex(); - // Get the number of arguments of this LOH. - int NbArgs = MCLOHIdToNbArgs(Kind); - - assert(NbArgs != -1 && "Invalid number of arguments"); - - SmallVector Args; - for (int Idx = 0; Idx < NbArgs; ++Idx) { - StringRef Name; - if (getParser().parseIdentifier(Name)) - return TokError("expected identifier in directive"); - Args.push_back(getContext().GetOrCreateSymbol(Name)); - - if (Idx + 1 == NbArgs) - break; - if (getLexer().isNot(AsmToken::Comma)) - return TokError("unexpected token in '" + Twine(IDVal) + "' directive"); - Lex(); - } - if (getLexer().isNot(AsmToken::EndOfStatement)) - return TokError("unexpected token in '" + Twine(IDVal) + "' directive"); - - getStreamer().EmitLOHDirective((MCLOHType)Kind, Args); - return false; -} - -bool -ARM64AsmParser::classifySymbolRef(const MCExpr *Expr, - ARM64MCExpr::VariantKind &ELFRefKind, - MCSymbolRefExpr::VariantKind &DarwinRefKind, - int64_t &Addend) { - ELFRefKind = ARM64MCExpr::VK_INVALID; - DarwinRefKind = MCSymbolRefExpr::VK_None; - Addend = 0; - - if (const ARM64MCExpr *AE = dyn_cast(Expr)) { - ELFRefKind = AE->getKind(); - Expr = AE->getSubExpr(); - } - - const MCSymbolRefExpr *SE = dyn_cast(Expr); - if (SE) { - // It's a simple symbol reference with no addend. - DarwinRefKind = SE->getKind(); - return true; - } - - const MCBinaryExpr *BE = dyn_cast(Expr); - if (!BE) - return false; - - SE = dyn_cast(BE->getLHS()); - if (!SE) - return false; - DarwinRefKind = SE->getKind(); - - if (BE->getOpcode() != MCBinaryExpr::Add && - BE->getOpcode() != MCBinaryExpr::Sub) - return false; - - // See if the addend is is a constant, otherwise there's more going - // on here than we can deal with. 
- auto AddendExpr = dyn_cast(BE->getRHS()); - if (!AddendExpr) - return false; - - Addend = AddendExpr->getValue(); - if (BE->getOpcode() == MCBinaryExpr::Sub) - Addend = -Addend; - - // It's some symbol reference + a constant addend, but really - // shouldn't use both Darwin and ELF syntax. - return ELFRefKind == ARM64MCExpr::VK_INVALID || - DarwinRefKind == MCSymbolRefExpr::VK_None; -} - -/// Force static initialization. -extern "C" void LLVMInitializeARM64AsmParser() { - RegisterMCAsmParser X(TheARM64leTarget); - RegisterMCAsmParser Y(TheARM64beTarget); - - RegisterMCAsmParser Z(TheAArch64leTarget); - RegisterMCAsmParser W(TheAArch64beTarget); -} - -#define GET_REGISTER_MATCHER -#define GET_SUBTARGET_FEATURE_NAME -#define GET_MATCHER_IMPLEMENTATION -#include "ARM64GenAsmMatcher.inc" - -// Define this matcher function after the auto-generated include so we -// have the match class enum definitions. -unsigned ARM64AsmParser::validateTargetOperandClass(MCParsedAsmOperand *AsmOp, - unsigned Kind) { - ARM64Operand *Op = static_cast(AsmOp); - // If the kind is a token for a literal immediate, check if our asm - // operand matches. This is for InstAliases which have a fixed-value - // immediate in the syntax. - int64_t ExpectedVal; - switch (Kind) { - default: - return Match_InvalidOperand; - case MCK__35_0: - ExpectedVal = 0; - break; - case MCK__35_1: - ExpectedVal = 1; - break; - case MCK__35_12: - ExpectedVal = 12; - break; - case MCK__35_16: - ExpectedVal = 16; - break; - case MCK__35_2: - ExpectedVal = 2; - break; - case MCK__35_24: - ExpectedVal = 24; - break; - case MCK__35_3: - ExpectedVal = 3; - break; - case MCK__35_32: - ExpectedVal = 32; - break; - case MCK__35_4: - ExpectedVal = 4; - break; - case MCK__35_48: - ExpectedVal = 48; - break; - case MCK__35_6: - ExpectedVal = 6; - break; - case MCK__35_64: - ExpectedVal = 64; - break; - case MCK__35_8: - ExpectedVal = 8; - break; - } - if (!Op->isImm()) - return Match_InvalidOperand; - const MCConstantExpr *CE = dyn_cast(Op->getImm()); - if (!CE) - return Match_InvalidOperand; - if (CE->getValue() == ExpectedVal) - return Match_Success; - return Match_InvalidOperand; -} diff --git a/lib/Target/ARM64/AsmParser/CMakeLists.txt b/lib/Target/ARM64/AsmParser/CMakeLists.txt deleted file mode 100644 index 826158b1ed1..00000000000 --- a/lib/Target/ARM64/AsmParser/CMakeLists.txt +++ /dev/null @@ -1,6 +0,0 @@ -include_directories( ${CMAKE_CURRENT_BINARY_DIR}/.. ${CMAKE_CURRENT_SOURCE_DIR}/.. ) - -add_llvm_library(LLVMARM64AsmParser - ARM64AsmParser.cpp - ) - diff --git a/lib/Target/ARM64/AsmParser/LLVMBuild.txt b/lib/Target/ARM64/AsmParser/LLVMBuild.txt deleted file mode 100644 index 9045283e919..00000000000 --- a/lib/Target/ARM64/AsmParser/LLVMBuild.txt +++ /dev/null @@ -1,23 +0,0 @@ -;===- ./lib/Target/ARM64/AsmParser/LLVMBuild.txt ---------------*- Conf -*--===; -; -; The LLVM Compiler Infrastructure -; -; This file is distributed under the University of Illinois Open Source -; License. See LICENSE.TXT for details. -; -;===------------------------------------------------------------------------===; -; -; This is an LLVMBuild description file for the components in this subdirectory. 
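// (Illustrative aside, not part of the patch.) validateTargetOperandClass above
// handles InstAliases whose syntax contains a fixed literal immediate (the
// MCK__35_<N> classes, i.e. "#0", "#8", "#16", ...): it maps the class to its
// expected value and accepts only a constant operand equal to it. A minimal
// standalone sketch of that idea, using invented names (MatchResult,
// LiteralImm, validateFixedImm) rather than the LLVM types:

#include <cstdint>
#include <optional>

enum class MatchResult { Success, InvalidOperand };

// A parsed operand that may or may not be a plain constant immediate.
struct LiteralImm {
  std::optional<int64_t> Value; // empty when the operand is not a constant
};

// Accept the operand only if it is a constant equal to Expected.
static MatchResult validateFixedImm(const LiteralImm &Op, int64_t Expected) {
  if (!Op.Value)
    return MatchResult::InvalidOperand;
  return *Op.Value == Expected ? MatchResult::Success
                               : MatchResult::InvalidOperand;
}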
-; -; For more information on the LLVMBuild system, please see: -; -; http://llvm.org/docs/LLVMBuild.html -; -;===------------------------------------------------------------------------===; - -[component_0] -type = Library -name = ARM64AsmParser -parent = ARM64 -required_libraries = ARM64Desc ARM64Info ARM64Utils MC MCParser Support -add_to_library_groups = ARM64 diff --git a/lib/Target/ARM64/AsmParser/Makefile b/lib/Target/ARM64/AsmParser/Makefile deleted file mode 100644 index d25c47f9af9..00000000000 --- a/lib/Target/ARM64/AsmParser/Makefile +++ /dev/null @@ -1,15 +0,0 @@ -##===- lib/Target/ARM/AsmParser/Makefile -------------------*- Makefile -*-===## -# -# The LLVM Compiler Infrastructure -# -# This file is distributed under the University of Illinois Open Source -# License. See LICENSE.TXT for details. -# -##===----------------------------------------------------------------------===## -LEVEL = ../../../.. -LIBRARYNAME = LLVMARM64AsmParser - -# Hack: we need to include 'main' ARM target directory to grab private headers -CPP.Flags += -I$(PROJ_OBJ_DIR)/.. -I$(PROJ_SRC_DIR)/.. - -include $(LEVEL)/Makefile.common diff --git a/lib/Target/ARM64/CMakeLists.txt b/lib/Target/ARM64/CMakeLists.txt deleted file mode 100644 index 56ba3b73294..00000000000 --- a/lib/Target/ARM64/CMakeLists.txt +++ /dev/null @@ -1,51 +0,0 @@ -set(LLVM_TARGET_DEFINITIONS ARM64.td) - -tablegen(LLVM ARM64GenRegisterInfo.inc -gen-register-info) -tablegen(LLVM ARM64GenInstrInfo.inc -gen-instr-info) -tablegen(LLVM ARM64GenMCCodeEmitter.inc -gen-emitter -mc-emitter) -tablegen(LLVM ARM64GenMCPseudoLowering.inc -gen-pseudo-lowering) -tablegen(LLVM ARM64GenAsmWriter.inc -gen-asm-writer) -tablegen(LLVM ARM64GenAsmWriter1.inc -gen-asm-writer -asmwriternum=1) -tablegen(LLVM ARM64GenAsmMatcher.inc -gen-asm-matcher) -tablegen(LLVM ARM64GenDAGISel.inc -gen-dag-isel) -tablegen(LLVM ARM64GenFastISel.inc -gen-fast-isel) -tablegen(LLVM ARM64GenCallingConv.inc -gen-callingconv) -tablegen(LLVM ARM64GenSubtargetInfo.inc -gen-subtarget) -tablegen(LLVM ARM64GenDisassemblerTables.inc -gen-disassembler) -add_public_tablegen_target(ARM64CommonTableGen) - -add_llvm_target(ARM64CodeGen - ARM64AddressTypePromotion.cpp - ARM64AdvSIMDScalarPass.cpp - ARM64AsmPrinter.cpp - ARM64BranchRelaxation.cpp - ARM64CleanupLocalDynamicTLSPass.cpp - ARM64CollectLOH.cpp - ARM64ConditionalCompares.cpp - ARM64DeadRegisterDefinitionsPass.cpp - ARM64ExpandPseudoInsts.cpp - ARM64FastISel.cpp - ARM64FrameLowering.cpp - ARM64ISelDAGToDAG.cpp - ARM64ISelLowering.cpp - ARM64InstrInfo.cpp - ARM64LoadStoreOptimizer.cpp - ARM64MCInstLower.cpp - ARM64PromoteConstant.cpp - ARM64RegisterInfo.cpp - ARM64SelectionDAGInfo.cpp - ARM64StorePairSuppress.cpp - ARM64Subtarget.cpp - ARM64TargetMachine.cpp - ARM64TargetObjectFile.cpp - ARM64TargetTransformInfo.cpp -) - -add_dependencies(LLVMARM64CodeGen intrinsics_gen) - -add_subdirectory(TargetInfo) -add_subdirectory(AsmParser) -add_subdirectory(Disassembler) -add_subdirectory(InstPrinter) -add_subdirectory(MCTargetDesc) -add_subdirectory(Utils) diff --git a/lib/Target/ARM64/Disassembler/ARM64Disassembler.cpp b/lib/Target/ARM64/Disassembler/ARM64Disassembler.cpp deleted file mode 100644 index bb47b3a0982..00000000000 --- a/lib/Target/ARM64/Disassembler/ARM64Disassembler.cpp +++ /dev/null @@ -1,1548 +0,0 @@ -//===- ARM64Disassembler.cpp - Disassembler for ARM64 -----------*- C++ -*-===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. 
See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -// -//===----------------------------------------------------------------------===// - -#include "ARM64Disassembler.h" -#include "ARM64ExternalSymbolizer.h" -#include "ARM64Subtarget.h" -#include "MCTargetDesc/ARM64AddressingModes.h" -#include "Utils/ARM64BaseInfo.h" -#include "llvm/MC/MCInst.h" -#include "llvm/MC/MCFixedLenDisassembler.h" -#include "llvm/Support/Debug.h" -#include "llvm/Support/MemoryObject.h" -#include "llvm/Support/TargetRegistry.h" -#include "llvm/Support/ErrorHandling.h" - -using namespace llvm; - -#define DEBUG_TYPE "arm64-disassembler" - -// Pull DecodeStatus and its enum values into the global namespace. -typedef llvm::MCDisassembler::DecodeStatus DecodeStatus; - -// Forward declare these because the autogenerated code will reference them. -// Definitions are further down. -static DecodeStatus DecodeFPR128RegisterClass(llvm::MCInst &Inst, - unsigned RegNo, uint64_t Address, - const void *Decoder); -static DecodeStatus DecodeFPR128_loRegisterClass(llvm::MCInst &Inst, - unsigned RegNo, - uint64_t Address, - const void *Decoder); -static DecodeStatus DecodeFPR64RegisterClass(llvm::MCInst &Inst, unsigned RegNo, - uint64_t Address, - const void *Decoder); -static DecodeStatus DecodeFPR32RegisterClass(llvm::MCInst &Inst, unsigned RegNo, - uint64_t Address, - const void *Decoder); -static DecodeStatus DecodeFPR16RegisterClass(llvm::MCInst &Inst, unsigned RegNo, - uint64_t Address, - const void *Decoder); -static DecodeStatus DecodeFPR8RegisterClass(llvm::MCInst &Inst, unsigned RegNo, - uint64_t Address, - const void *Decoder); -static DecodeStatus DecodeGPR64RegisterClass(llvm::MCInst &Inst, unsigned RegNo, - uint64_t Address, - const void *Decoder); -static DecodeStatus DecodeGPR64spRegisterClass(llvm::MCInst &Inst, - unsigned RegNo, uint64_t Address, - const void *Decoder); -static DecodeStatus DecodeGPR32RegisterClass(llvm::MCInst &Inst, unsigned RegNo, - uint64_t Address, - const void *Decoder); -static DecodeStatus DecodeGPR32spRegisterClass(llvm::MCInst &Inst, - unsigned RegNo, uint64_t Address, - const void *Decoder); -static DecodeStatus DecodeQQRegisterClass(llvm::MCInst &Inst, unsigned RegNo, - uint64_t Address, - const void *Decoder); -static DecodeStatus DecodeQQQRegisterClass(llvm::MCInst &Inst, unsigned RegNo, - uint64_t Address, - const void *Decoder); -static DecodeStatus DecodeQQQQRegisterClass(llvm::MCInst &Inst, unsigned RegNo, - uint64_t Address, - const void *Decoder); -static DecodeStatus DecodeDDRegisterClass(llvm::MCInst &Inst, unsigned RegNo, - uint64_t Address, - const void *Decoder); -static DecodeStatus DecodeDDDRegisterClass(llvm::MCInst &Inst, unsigned RegNo, - uint64_t Address, - const void *Decoder); -static DecodeStatus DecodeDDDDRegisterClass(llvm::MCInst &Inst, unsigned RegNo, - uint64_t Address, - const void *Decoder); - -static DecodeStatus DecodeFixedPointScaleImm32(llvm::MCInst &Inst, unsigned Imm, - uint64_t Address, - const void *Decoder); -static DecodeStatus DecodeFixedPointScaleImm64(llvm::MCInst &Inst, unsigned Imm, - uint64_t Address, - const void *Decoder); -static DecodeStatus DecodePCRelLabel19(llvm::MCInst &Inst, unsigned Imm, - uint64_t Address, const void *Decoder); -static DecodeStatus DecodeMemExtend(llvm::MCInst &Inst, unsigned Imm, - uint64_t Address, const void *Decoder); -static DecodeStatus DecodeMRSSystemRegister(llvm::MCInst &Inst, unsigned Imm, - uint64_t Address, const void *Decoder); -static 
DecodeStatus DecodeMSRSystemRegister(llvm::MCInst &Inst, unsigned Imm, - uint64_t Address, const void *Decoder); -static DecodeStatus DecodeThreeAddrSRegInstruction(llvm::MCInst &Inst, - uint32_t insn, - uint64_t Address, - const void *Decoder); -static DecodeStatus DecodeMoveImmInstruction(llvm::MCInst &Inst, uint32_t insn, - uint64_t Address, - const void *Decoder); -static DecodeStatus DecodeUnsignedLdStInstruction(llvm::MCInst &Inst, - uint32_t insn, - uint64_t Address, - const void *Decoder); -static DecodeStatus DecodeSignedLdStInstruction(llvm::MCInst &Inst, - uint32_t insn, uint64_t Address, - const void *Decoder); -static DecodeStatus DecodeExclusiveLdStInstruction(llvm::MCInst &Inst, - uint32_t insn, - uint64_t Address, - const void *Decoder); -static DecodeStatus DecodePairLdStInstruction(llvm::MCInst &Inst, uint32_t insn, - uint64_t Address, - const void *Decoder); -static DecodeStatus DecodeAddSubERegInstruction(llvm::MCInst &Inst, - uint32_t insn, uint64_t Address, - const void *Decoder); -static DecodeStatus DecodeLogicalImmInstruction(llvm::MCInst &Inst, - uint32_t insn, uint64_t Address, - const void *Decoder); -static DecodeStatus DecodeModImmInstruction(llvm::MCInst &Inst, uint32_t insn, - uint64_t Address, - const void *Decoder); -static DecodeStatus DecodeModImmTiedInstruction(llvm::MCInst &Inst, - uint32_t insn, uint64_t Address, - const void *Decoder); -static DecodeStatus DecodeAdrInstruction(llvm::MCInst &Inst, uint32_t insn, - uint64_t Address, const void *Decoder); -static DecodeStatus DecodeBaseAddSubImm(llvm::MCInst &Inst, uint32_t insn, - uint64_t Address, const void *Decoder); -static DecodeStatus DecodeUnconditionalBranch(llvm::MCInst &Inst, uint32_t insn, - uint64_t Address, - const void *Decoder); -static DecodeStatus DecodeSystemPStateInstruction(llvm::MCInst &Inst, - uint32_t insn, - uint64_t Address, - const void *Decoder); -static DecodeStatus DecodeTestAndBranch(llvm::MCInst &Inst, uint32_t insn, - uint64_t Address, const void *Decoder); - -static DecodeStatus DecodeFMOVLaneInstruction(llvm::MCInst &Inst, unsigned Insn, - uint64_t Address, - const void *Decoder); -static DecodeStatus DecodeVecShiftR64Imm(llvm::MCInst &Inst, unsigned Imm, - uint64_t Addr, const void *Decoder); -static DecodeStatus DecodeVecShiftR64ImmNarrow(llvm::MCInst &Inst, unsigned Imm, - uint64_t Addr, - const void *Decoder); -static DecodeStatus DecodeVecShiftR32Imm(llvm::MCInst &Inst, unsigned Imm, - uint64_t Addr, const void *Decoder); -static DecodeStatus DecodeVecShiftR32ImmNarrow(llvm::MCInst &Inst, unsigned Imm, - uint64_t Addr, - const void *Decoder); -static DecodeStatus DecodeVecShiftR16Imm(llvm::MCInst &Inst, unsigned Imm, - uint64_t Addr, const void *Decoder); -static DecodeStatus DecodeVecShiftR16ImmNarrow(llvm::MCInst &Inst, unsigned Imm, - uint64_t Addr, - const void *Decoder); -static DecodeStatus DecodeVecShiftR8Imm(llvm::MCInst &Inst, unsigned Imm, - uint64_t Addr, const void *Decoder); -static DecodeStatus DecodeVecShiftL64Imm(llvm::MCInst &Inst, unsigned Imm, - uint64_t Addr, const void *Decoder); -static DecodeStatus DecodeVecShiftL32Imm(llvm::MCInst &Inst, unsigned Imm, - uint64_t Addr, const void *Decoder); -static DecodeStatus DecodeVecShiftL16Imm(llvm::MCInst &Inst, unsigned Imm, - uint64_t Addr, const void *Decoder); -static DecodeStatus DecodeVecShiftL8Imm(llvm::MCInst &Inst, unsigned Imm, - uint64_t Addr, const void *Decoder); - -static bool Check(DecodeStatus &Out, DecodeStatus In) { - switch (In) { - case MCDisassembler::Success: - // Out stays the 
same. - return true; - case MCDisassembler::SoftFail: - Out = In; - return true; - case MCDisassembler::Fail: - Out = In; - return false; - } - llvm_unreachable("Invalid DecodeStatus!"); -} - -#include "ARM64GenDisassemblerTables.inc" -#include "ARM64GenInstrInfo.inc" - -#define Success llvm::MCDisassembler::Success -#define Fail llvm::MCDisassembler::Fail -#define SoftFail llvm::MCDisassembler::SoftFail - -static MCDisassembler *createARM64Disassembler(const Target &T, - const MCSubtargetInfo &STI, - MCContext &Ctx) { - return new ARM64Disassembler(STI, Ctx); -} - -DecodeStatus ARM64Disassembler::getInstruction(MCInst &MI, uint64_t &Size, - const MemoryObject &Region, - uint64_t Address, - raw_ostream &os, - raw_ostream &cs) const { - CommentStream = &cs; - - uint8_t bytes[4]; - - Size = 0; - // We want to read exactly 4 bytes of data. - if (Region.readBytes(Address, 4, (uint8_t *)bytes) == -1) - return Fail; - Size = 4; - - // Encoded as a small-endian 32-bit word in the stream. - uint32_t insn = - (bytes[3] << 24) | (bytes[2] << 16) | (bytes[1] << 8) | (bytes[0] << 0); - - // Calling the auto-generated decoder function. - return decodeInstruction(DecoderTable32, MI, insn, Address, this, STI); -} - -static MCSymbolizer * -createARM64ExternalSymbolizer(StringRef TT, LLVMOpInfoCallback GetOpInfo, - LLVMSymbolLookupCallback SymbolLookUp, - void *DisInfo, MCContext *Ctx, - MCRelocationInfo *RelInfo) { - return new llvm::ARM64ExternalSymbolizer( - *Ctx, - std::unique_ptr(RelInfo), - GetOpInfo, SymbolLookUp, DisInfo); -} - -extern "C" void LLVMInitializeARM64Disassembler() { - TargetRegistry::RegisterMCDisassembler(TheARM64leTarget, - createARM64Disassembler); - TargetRegistry::RegisterMCDisassembler(TheARM64beTarget, - createARM64Disassembler); - TargetRegistry::RegisterMCSymbolizer(TheARM64leTarget, - createARM64ExternalSymbolizer); - TargetRegistry::RegisterMCSymbolizer(TheARM64beTarget, - createARM64ExternalSymbolizer); - - TargetRegistry::RegisterMCDisassembler(TheAArch64leTarget, - createARM64Disassembler); - TargetRegistry::RegisterMCDisassembler(TheAArch64beTarget, - createARM64Disassembler); - TargetRegistry::RegisterMCSymbolizer(TheAArch64leTarget, - createARM64ExternalSymbolizer); - TargetRegistry::RegisterMCSymbolizer(TheAArch64beTarget, - createARM64ExternalSymbolizer); -} - -static const unsigned FPR128DecoderTable[] = { - ARM64::Q0, ARM64::Q1, ARM64::Q2, ARM64::Q3, ARM64::Q4, ARM64::Q5, - ARM64::Q6, ARM64::Q7, ARM64::Q8, ARM64::Q9, ARM64::Q10, ARM64::Q11, - ARM64::Q12, ARM64::Q13, ARM64::Q14, ARM64::Q15, ARM64::Q16, ARM64::Q17, - ARM64::Q18, ARM64::Q19, ARM64::Q20, ARM64::Q21, ARM64::Q22, ARM64::Q23, - ARM64::Q24, ARM64::Q25, ARM64::Q26, ARM64::Q27, ARM64::Q28, ARM64::Q29, - ARM64::Q30, ARM64::Q31 -}; - -static DecodeStatus DecodeFPR128RegisterClass(MCInst &Inst, unsigned RegNo, - uint64_t Addr, - const void *Decoder) { - if (RegNo > 31) - return Fail; - - unsigned Register = FPR128DecoderTable[RegNo]; - Inst.addOperand(MCOperand::CreateReg(Register)); - return Success; -} - -static DecodeStatus DecodeFPR128_loRegisterClass(MCInst &Inst, unsigned RegNo, - uint64_t Addr, - const void *Decoder) { - if (RegNo > 15) - return Fail; - return DecodeFPR128RegisterClass(Inst, RegNo, Addr, Decoder); -} - -static const unsigned FPR64DecoderTable[] = { - ARM64::D0, ARM64::D1, ARM64::D2, ARM64::D3, ARM64::D4, ARM64::D5, - ARM64::D6, ARM64::D7, ARM64::D8, ARM64::D9, ARM64::D10, ARM64::D11, - ARM64::D12, ARM64::D13, ARM64::D14, ARM64::D15, ARM64::D16, ARM64::D17, - ARM64::D18, ARM64::D19, 
ARM64::D20, ARM64::D21, ARM64::D22, ARM64::D23, - ARM64::D24, ARM64::D25, ARM64::D26, ARM64::D27, ARM64::D28, ARM64::D29, - ARM64::D30, ARM64::D31 -}; - -static DecodeStatus DecodeFPR64RegisterClass(MCInst &Inst, unsigned RegNo, - uint64_t Addr, - const void *Decoder) { - if (RegNo > 31) - return Fail; - - unsigned Register = FPR64DecoderTable[RegNo]; - Inst.addOperand(MCOperand::CreateReg(Register)); - return Success; -} - -static const unsigned FPR32DecoderTable[] = { - ARM64::S0, ARM64::S1, ARM64::S2, ARM64::S3, ARM64::S4, ARM64::S5, - ARM64::S6, ARM64::S7, ARM64::S8, ARM64::S9, ARM64::S10, ARM64::S11, - ARM64::S12, ARM64::S13, ARM64::S14, ARM64::S15, ARM64::S16, ARM64::S17, - ARM64::S18, ARM64::S19, ARM64::S20, ARM64::S21, ARM64::S22, ARM64::S23, - ARM64::S24, ARM64::S25, ARM64::S26, ARM64::S27, ARM64::S28, ARM64::S29, - ARM64::S30, ARM64::S31 -}; - -static DecodeStatus DecodeFPR32RegisterClass(MCInst &Inst, unsigned RegNo, - uint64_t Addr, - const void *Decoder) { - if (RegNo > 31) - return Fail; - - unsigned Register = FPR32DecoderTable[RegNo]; - Inst.addOperand(MCOperand::CreateReg(Register)); - return Success; -} - -static const unsigned FPR16DecoderTable[] = { - ARM64::H0, ARM64::H1, ARM64::H2, ARM64::H3, ARM64::H4, ARM64::H5, - ARM64::H6, ARM64::H7, ARM64::H8, ARM64::H9, ARM64::H10, ARM64::H11, - ARM64::H12, ARM64::H13, ARM64::H14, ARM64::H15, ARM64::H16, ARM64::H17, - ARM64::H18, ARM64::H19, ARM64::H20, ARM64::H21, ARM64::H22, ARM64::H23, - ARM64::H24, ARM64::H25, ARM64::H26, ARM64::H27, ARM64::H28, ARM64::H29, - ARM64::H30, ARM64::H31 -}; - -static DecodeStatus DecodeFPR16RegisterClass(MCInst &Inst, unsigned RegNo, - uint64_t Addr, - const void *Decoder) { - if (RegNo > 31) - return Fail; - - unsigned Register = FPR16DecoderTable[RegNo]; - Inst.addOperand(MCOperand::CreateReg(Register)); - return Success; -} - -static const unsigned FPR8DecoderTable[] = { - ARM64::B0, ARM64::B1, ARM64::B2, ARM64::B3, ARM64::B4, ARM64::B5, - ARM64::B6, ARM64::B7, ARM64::B8, ARM64::B9, ARM64::B10, ARM64::B11, - ARM64::B12, ARM64::B13, ARM64::B14, ARM64::B15, ARM64::B16, ARM64::B17, - ARM64::B18, ARM64::B19, ARM64::B20, ARM64::B21, ARM64::B22, ARM64::B23, - ARM64::B24, ARM64::B25, ARM64::B26, ARM64::B27, ARM64::B28, ARM64::B29, - ARM64::B30, ARM64::B31 -}; - -static DecodeStatus DecodeFPR8RegisterClass(MCInst &Inst, unsigned RegNo, - uint64_t Addr, - const void *Decoder) { - if (RegNo > 31) - return Fail; - - unsigned Register = FPR8DecoderTable[RegNo]; - Inst.addOperand(MCOperand::CreateReg(Register)); - return Success; -} - -static const unsigned GPR64DecoderTable[] = { - ARM64::X0, ARM64::X1, ARM64::X2, ARM64::X3, ARM64::X4, ARM64::X5, - ARM64::X6, ARM64::X7, ARM64::X8, ARM64::X9, ARM64::X10, ARM64::X11, - ARM64::X12, ARM64::X13, ARM64::X14, ARM64::X15, ARM64::X16, ARM64::X17, - ARM64::X18, ARM64::X19, ARM64::X20, ARM64::X21, ARM64::X22, ARM64::X23, - ARM64::X24, ARM64::X25, ARM64::X26, ARM64::X27, ARM64::X28, ARM64::FP, - ARM64::LR, ARM64::XZR -}; - -static DecodeStatus DecodeGPR64RegisterClass(MCInst &Inst, unsigned RegNo, - uint64_t Addr, - const void *Decoder) { - if (RegNo > 31) - return Fail; - - unsigned Register = GPR64DecoderTable[RegNo]; - Inst.addOperand(MCOperand::CreateReg(Register)); - return Success; -} - -static DecodeStatus DecodeGPR64spRegisterClass(MCInst &Inst, unsigned RegNo, - uint64_t Addr, - const void *Decoder) { - if (RegNo > 31) - return Fail; - unsigned Register = GPR64DecoderTable[RegNo]; - if (Register == ARM64::XZR) - Register = ARM64::SP; - 
Inst.addOperand(MCOperand::CreateReg(Register)); - return Success; -} - -static const unsigned GPR32DecoderTable[] = { - ARM64::W0, ARM64::W1, ARM64::W2, ARM64::W3, ARM64::W4, ARM64::W5, - ARM64::W6, ARM64::W7, ARM64::W8, ARM64::W9, ARM64::W10, ARM64::W11, - ARM64::W12, ARM64::W13, ARM64::W14, ARM64::W15, ARM64::W16, ARM64::W17, - ARM64::W18, ARM64::W19, ARM64::W20, ARM64::W21, ARM64::W22, ARM64::W23, - ARM64::W24, ARM64::W25, ARM64::W26, ARM64::W27, ARM64::W28, ARM64::W29, - ARM64::W30, ARM64::WZR -}; - -static DecodeStatus DecodeGPR32RegisterClass(MCInst &Inst, unsigned RegNo, - uint64_t Addr, - const void *Decoder) { - if (RegNo > 31) - return Fail; - - unsigned Register = GPR32DecoderTable[RegNo]; - Inst.addOperand(MCOperand::CreateReg(Register)); - return Success; -} - -static DecodeStatus DecodeGPR32spRegisterClass(MCInst &Inst, unsigned RegNo, - uint64_t Addr, - const void *Decoder) { - if (RegNo > 31) - return Fail; - - unsigned Register = GPR32DecoderTable[RegNo]; - if (Register == ARM64::WZR) - Register = ARM64::WSP; - Inst.addOperand(MCOperand::CreateReg(Register)); - return Success; -} - -static const unsigned VectorDecoderTable[] = { - ARM64::Q0, ARM64::Q1, ARM64::Q2, ARM64::Q3, ARM64::Q4, ARM64::Q5, - ARM64::Q6, ARM64::Q7, ARM64::Q8, ARM64::Q9, ARM64::Q10, ARM64::Q11, - ARM64::Q12, ARM64::Q13, ARM64::Q14, ARM64::Q15, ARM64::Q16, ARM64::Q17, - ARM64::Q18, ARM64::Q19, ARM64::Q20, ARM64::Q21, ARM64::Q22, ARM64::Q23, - ARM64::Q24, ARM64::Q25, ARM64::Q26, ARM64::Q27, ARM64::Q28, ARM64::Q29, - ARM64::Q30, ARM64::Q31 -}; - -static DecodeStatus DecodeVectorRegisterClass(MCInst &Inst, unsigned RegNo, - uint64_t Addr, - const void *Decoder) { - if (RegNo > 31) - return Fail; - - unsigned Register = VectorDecoderTable[RegNo]; - Inst.addOperand(MCOperand::CreateReg(Register)); - return Success; -} - -static const unsigned QQDecoderTable[] = { - ARM64::Q0_Q1, ARM64::Q1_Q2, ARM64::Q2_Q3, ARM64::Q3_Q4, - ARM64::Q4_Q5, ARM64::Q5_Q6, ARM64::Q6_Q7, ARM64::Q7_Q8, - ARM64::Q8_Q9, ARM64::Q9_Q10, ARM64::Q10_Q11, ARM64::Q11_Q12, - ARM64::Q12_Q13, ARM64::Q13_Q14, ARM64::Q14_Q15, ARM64::Q15_Q16, - ARM64::Q16_Q17, ARM64::Q17_Q18, ARM64::Q18_Q19, ARM64::Q19_Q20, - ARM64::Q20_Q21, ARM64::Q21_Q22, ARM64::Q22_Q23, ARM64::Q23_Q24, - ARM64::Q24_Q25, ARM64::Q25_Q26, ARM64::Q26_Q27, ARM64::Q27_Q28, - ARM64::Q28_Q29, ARM64::Q29_Q30, ARM64::Q30_Q31, ARM64::Q31_Q0 -}; - -static DecodeStatus DecodeQQRegisterClass(MCInst &Inst, unsigned RegNo, - uint64_t Addr, const void *Decoder) { - if (RegNo > 31) - return Fail; - unsigned Register = QQDecoderTable[RegNo]; - Inst.addOperand(MCOperand::CreateReg(Register)); - return Success; -} - -static const unsigned QQQDecoderTable[] = { - ARM64::Q0_Q1_Q2, ARM64::Q1_Q2_Q3, ARM64::Q2_Q3_Q4, - ARM64::Q3_Q4_Q5, ARM64::Q4_Q5_Q6, ARM64::Q5_Q6_Q7, - ARM64::Q6_Q7_Q8, ARM64::Q7_Q8_Q9, ARM64::Q8_Q9_Q10, - ARM64::Q9_Q10_Q11, ARM64::Q10_Q11_Q12, ARM64::Q11_Q12_Q13, - ARM64::Q12_Q13_Q14, ARM64::Q13_Q14_Q15, ARM64::Q14_Q15_Q16, - ARM64::Q15_Q16_Q17, ARM64::Q16_Q17_Q18, ARM64::Q17_Q18_Q19, - ARM64::Q18_Q19_Q20, ARM64::Q19_Q20_Q21, ARM64::Q20_Q21_Q22, - ARM64::Q21_Q22_Q23, ARM64::Q22_Q23_Q24, ARM64::Q23_Q24_Q25, - ARM64::Q24_Q25_Q26, ARM64::Q25_Q26_Q27, ARM64::Q26_Q27_Q28, - ARM64::Q27_Q28_Q29, ARM64::Q28_Q29_Q30, ARM64::Q29_Q30_Q31, - ARM64::Q30_Q31_Q0, ARM64::Q31_Q0_Q1 -}; - -static DecodeStatus DecodeQQQRegisterClass(MCInst &Inst, unsigned RegNo, - uint64_t Addr, const void *Decoder) { - if (RegNo > 31) - return Fail; - unsigned Register = QQQDecoderTable[RegNo]; - 
Inst.addOperand(MCOperand::CreateReg(Register)); - return Success; -} - -static const unsigned QQQQDecoderTable[] = { - ARM64::Q0_Q1_Q2_Q3, ARM64::Q1_Q2_Q3_Q4, ARM64::Q2_Q3_Q4_Q5, - ARM64::Q3_Q4_Q5_Q6, ARM64::Q4_Q5_Q6_Q7, ARM64::Q5_Q6_Q7_Q8, - ARM64::Q6_Q7_Q8_Q9, ARM64::Q7_Q8_Q9_Q10, ARM64::Q8_Q9_Q10_Q11, - ARM64::Q9_Q10_Q11_Q12, ARM64::Q10_Q11_Q12_Q13, ARM64::Q11_Q12_Q13_Q14, - ARM64::Q12_Q13_Q14_Q15, ARM64::Q13_Q14_Q15_Q16, ARM64::Q14_Q15_Q16_Q17, - ARM64::Q15_Q16_Q17_Q18, ARM64::Q16_Q17_Q18_Q19, ARM64::Q17_Q18_Q19_Q20, - ARM64::Q18_Q19_Q20_Q21, ARM64::Q19_Q20_Q21_Q22, ARM64::Q20_Q21_Q22_Q23, - ARM64::Q21_Q22_Q23_Q24, ARM64::Q22_Q23_Q24_Q25, ARM64::Q23_Q24_Q25_Q26, - ARM64::Q24_Q25_Q26_Q27, ARM64::Q25_Q26_Q27_Q28, ARM64::Q26_Q27_Q28_Q29, - ARM64::Q27_Q28_Q29_Q30, ARM64::Q28_Q29_Q30_Q31, ARM64::Q29_Q30_Q31_Q0, - ARM64::Q30_Q31_Q0_Q1, ARM64::Q31_Q0_Q1_Q2 -}; - -static DecodeStatus DecodeQQQQRegisterClass(MCInst &Inst, unsigned RegNo, - uint64_t Addr, - const void *Decoder) { - if (RegNo > 31) - return Fail; - unsigned Register = QQQQDecoderTable[RegNo]; - Inst.addOperand(MCOperand::CreateReg(Register)); - return Success; -} - -static const unsigned DDDecoderTable[] = { - ARM64::D0_D1, ARM64::D1_D2, ARM64::D2_D3, ARM64::D3_D4, - ARM64::D4_D5, ARM64::D5_D6, ARM64::D6_D7, ARM64::D7_D8, - ARM64::D8_D9, ARM64::D9_D10, ARM64::D10_D11, ARM64::D11_D12, - ARM64::D12_D13, ARM64::D13_D14, ARM64::D14_D15, ARM64::D15_D16, - ARM64::D16_D17, ARM64::D17_D18, ARM64::D18_D19, ARM64::D19_D20, - ARM64::D20_D21, ARM64::D21_D22, ARM64::D22_D23, ARM64::D23_D24, - ARM64::D24_D25, ARM64::D25_D26, ARM64::D26_D27, ARM64::D27_D28, - ARM64::D28_D29, ARM64::D29_D30, ARM64::D30_D31, ARM64::D31_D0 -}; - -static DecodeStatus DecodeDDRegisterClass(MCInst &Inst, unsigned RegNo, - uint64_t Addr, const void *Decoder) { - if (RegNo > 31) - return Fail; - unsigned Register = DDDecoderTable[RegNo]; - Inst.addOperand(MCOperand::CreateReg(Register)); - return Success; -} - -static const unsigned DDDDecoderTable[] = { - ARM64::D0_D1_D2, ARM64::D1_D2_D3, ARM64::D2_D3_D4, - ARM64::D3_D4_D5, ARM64::D4_D5_D6, ARM64::D5_D6_D7, - ARM64::D6_D7_D8, ARM64::D7_D8_D9, ARM64::D8_D9_D10, - ARM64::D9_D10_D11, ARM64::D10_D11_D12, ARM64::D11_D12_D13, - ARM64::D12_D13_D14, ARM64::D13_D14_D15, ARM64::D14_D15_D16, - ARM64::D15_D16_D17, ARM64::D16_D17_D18, ARM64::D17_D18_D19, - ARM64::D18_D19_D20, ARM64::D19_D20_D21, ARM64::D20_D21_D22, - ARM64::D21_D22_D23, ARM64::D22_D23_D24, ARM64::D23_D24_D25, - ARM64::D24_D25_D26, ARM64::D25_D26_D27, ARM64::D26_D27_D28, - ARM64::D27_D28_D29, ARM64::D28_D29_D30, ARM64::D29_D30_D31, - ARM64::D30_D31_D0, ARM64::D31_D0_D1 -}; - -static DecodeStatus DecodeDDDRegisterClass(MCInst &Inst, unsigned RegNo, - uint64_t Addr, const void *Decoder) { - if (RegNo > 31) - return Fail; - unsigned Register = DDDDecoderTable[RegNo]; - Inst.addOperand(MCOperand::CreateReg(Register)); - return Success; -} - -static const unsigned DDDDDecoderTable[] = { - ARM64::D0_D1_D2_D3, ARM64::D1_D2_D3_D4, ARM64::D2_D3_D4_D5, - ARM64::D3_D4_D5_D6, ARM64::D4_D5_D6_D7, ARM64::D5_D6_D7_D8, - ARM64::D6_D7_D8_D9, ARM64::D7_D8_D9_D10, ARM64::D8_D9_D10_D11, - ARM64::D9_D10_D11_D12, ARM64::D10_D11_D12_D13, ARM64::D11_D12_D13_D14, - ARM64::D12_D13_D14_D15, ARM64::D13_D14_D15_D16, ARM64::D14_D15_D16_D17, - ARM64::D15_D16_D17_D18, ARM64::D16_D17_D18_D19, ARM64::D17_D18_D19_D20, - ARM64::D18_D19_D20_D21, ARM64::D19_D20_D21_D22, ARM64::D20_D21_D22_D23, - ARM64::D21_D22_D23_D24, ARM64::D22_D23_D24_D25, ARM64::D23_D24_D25_D26, - ARM64::D24_D25_D26_D27, 
ARM64::D25_D26_D27_D28, ARM64::D26_D27_D28_D29, - ARM64::D27_D28_D29_D30, ARM64::D28_D29_D30_D31, ARM64::D29_D30_D31_D0, - ARM64::D30_D31_D0_D1, ARM64::D31_D0_D1_D2 -}; - -static DecodeStatus DecodeDDDDRegisterClass(MCInst &Inst, unsigned RegNo, - uint64_t Addr, - const void *Decoder) { - if (RegNo > 31) - return Fail; - unsigned Register = DDDDDecoderTable[RegNo]; - Inst.addOperand(MCOperand::CreateReg(Register)); - return Success; -} - -static DecodeStatus DecodeFixedPointScaleImm32(llvm::MCInst &Inst, unsigned Imm, - uint64_t Addr, - const void *Decoder) { - // scale{5} is asserted as 1 in tblgen. - Imm |= 0x20; - Inst.addOperand(MCOperand::CreateImm(64 - Imm)); - return Success; -} - -static DecodeStatus DecodeFixedPointScaleImm64(llvm::MCInst &Inst, unsigned Imm, - uint64_t Addr, - const void *Decoder) { - Inst.addOperand(MCOperand::CreateImm(64 - Imm)); - return Success; -} - -static DecodeStatus DecodePCRelLabel19(llvm::MCInst &Inst, unsigned Imm, - uint64_t Addr, const void *Decoder) { - int64_t ImmVal = Imm; - const ARM64Disassembler *Dis = - static_cast(Decoder); - - // Sign-extend 19-bit immediate. - if (ImmVal & (1 << (19 - 1))) - ImmVal |= ~((1LL << 19) - 1); - - if (!Dis->tryAddingSymbolicOperand(Inst, ImmVal << 2, Addr, - Inst.getOpcode() != ARM64::LDRXl, 0, 4)) - Inst.addOperand(MCOperand::CreateImm(ImmVal)); - return Success; -} - -static DecodeStatus DecodeMemExtend(llvm::MCInst &Inst, unsigned Imm, - uint64_t Address, const void *Decoder) { - Inst.addOperand(MCOperand::CreateImm((Imm >> 1) & 1)); - Inst.addOperand(MCOperand::CreateImm(Imm & 1)); - return Success; -} - -static DecodeStatus DecodeMRSSystemRegister(llvm::MCInst &Inst, unsigned Imm, - uint64_t Address, - const void *Decoder) { - const ARM64Disassembler *Dis = - static_cast(Decoder); - const MCSubtargetInfo &STI = Dis->getSubtargetInfo(); - - Imm |= 0x8000; - Inst.addOperand(MCOperand::CreateImm(Imm)); - - bool ValidNamed; - (void)ARM64SysReg::MRSMapper(STI.getFeatureBits()).toString(Imm, ValidNamed); - - return ValidNamed ? Success : Fail; -} - -static DecodeStatus DecodeMSRSystemRegister(llvm::MCInst &Inst, unsigned Imm, - uint64_t Address, - const void *Decoder) { - const ARM64Disassembler *Dis = - static_cast(Decoder); - const MCSubtargetInfo &STI = Dis->getSubtargetInfo(); - - Imm |= 0x8000; - Inst.addOperand(MCOperand::CreateImm(Imm)); - - bool ValidNamed; - (void)ARM64SysReg::MSRMapper(STI.getFeatureBits()).toString(Imm, ValidNamed); - - return ValidNamed ? Success : Fail; -} - -static DecodeStatus DecodeFMOVLaneInstruction(llvm::MCInst &Inst, unsigned Insn, - uint64_t Address, - const void *Decoder) { - // This decoder exists to add the dummy Lane operand to the MCInst, which must - // be 1 in assembly but has no other real manifestation. 
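// (Illustrative aside, not part of the patch.) Several decoders in this file
// sign-extend a narrow immediate field by hand, e.g. the 19-bit PC-relative
// label in DecodePCRelLabel19 above, and later the 9-, 14-, 21- and 26-bit
// fields. The same idiom as a standalone helper, assuming only <cstdint>
// (signExtend is a name chosen here, not an LLVM API):

#include <cstdint>

// Sign-extend the low Width bits of Value (Width < 64) to a full int64_t.
static int64_t signExtend(uint64_t Value, unsigned Width) {
  const uint64_t SignBit = 1ULL << (Width - 1);
  Value &= (SignBit << 1) - 1; // keep only the low Width bits
  return static_cast<int64_t>((Value ^ SignBit) - SignBit);
}

// e.g. signExtend(0x7FFFF, 19) == -1, matching the
// "if (ImmVal & (1 << (19 - 1))) ImmVal |= ~((1LL << 19) - 1);" pattern above.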
- unsigned Rd = fieldFromInstruction(Insn, 0, 5); - unsigned Rn = fieldFromInstruction(Insn, 5, 5); - unsigned IsToVec = fieldFromInstruction(Insn, 16, 1); - - if (IsToVec) { - DecodeFPR128RegisterClass(Inst, Rd, Address, Decoder); - DecodeGPR64RegisterClass(Inst, Rn, Address, Decoder); - } else { - DecodeGPR64RegisterClass(Inst, Rd, Address, Decoder); - DecodeFPR128RegisterClass(Inst, Rn, Address, Decoder); - } - - // Add the lane - Inst.addOperand(MCOperand::CreateImm(1)); - - return Success; -} - -static DecodeStatus DecodeVecShiftRImm(llvm::MCInst &Inst, unsigned Imm, - unsigned Add) { - Inst.addOperand(MCOperand::CreateImm(Add - Imm)); - return Success; -} - -static DecodeStatus DecodeVecShiftLImm(llvm::MCInst &Inst, unsigned Imm, - unsigned Add) { - Inst.addOperand(MCOperand::CreateImm((Imm + Add) & (Add - 1))); - return Success; -} - -static DecodeStatus DecodeVecShiftR64Imm(llvm::MCInst &Inst, unsigned Imm, - uint64_t Addr, const void *Decoder) { - return DecodeVecShiftRImm(Inst, Imm, 64); -} - -static DecodeStatus DecodeVecShiftR64ImmNarrow(llvm::MCInst &Inst, unsigned Imm, - uint64_t Addr, - const void *Decoder) { - return DecodeVecShiftRImm(Inst, Imm | 0x20, 64); -} - -static DecodeStatus DecodeVecShiftR32Imm(llvm::MCInst &Inst, unsigned Imm, - uint64_t Addr, const void *Decoder) { - return DecodeVecShiftRImm(Inst, Imm, 32); -} - -static DecodeStatus DecodeVecShiftR32ImmNarrow(llvm::MCInst &Inst, unsigned Imm, - uint64_t Addr, - const void *Decoder) { - return DecodeVecShiftRImm(Inst, Imm | 0x10, 32); -} - -static DecodeStatus DecodeVecShiftR16Imm(llvm::MCInst &Inst, unsigned Imm, - uint64_t Addr, const void *Decoder) { - return DecodeVecShiftRImm(Inst, Imm, 16); -} - -static DecodeStatus DecodeVecShiftR16ImmNarrow(llvm::MCInst &Inst, unsigned Imm, - uint64_t Addr, - const void *Decoder) { - return DecodeVecShiftRImm(Inst, Imm | 0x8, 16); -} - -static DecodeStatus DecodeVecShiftR8Imm(llvm::MCInst &Inst, unsigned Imm, - uint64_t Addr, const void *Decoder) { - return DecodeVecShiftRImm(Inst, Imm, 8); -} - -static DecodeStatus DecodeVecShiftL64Imm(llvm::MCInst &Inst, unsigned Imm, - uint64_t Addr, const void *Decoder) { - return DecodeVecShiftLImm(Inst, Imm, 64); -} - -static DecodeStatus DecodeVecShiftL32Imm(llvm::MCInst &Inst, unsigned Imm, - uint64_t Addr, const void *Decoder) { - return DecodeVecShiftLImm(Inst, Imm, 32); -} - -static DecodeStatus DecodeVecShiftL16Imm(llvm::MCInst &Inst, unsigned Imm, - uint64_t Addr, const void *Decoder) { - return DecodeVecShiftLImm(Inst, Imm, 16); -} - -static DecodeStatus DecodeVecShiftL8Imm(llvm::MCInst &Inst, unsigned Imm, - uint64_t Addr, const void *Decoder) { - return DecodeVecShiftLImm(Inst, Imm, 8); -} - -static DecodeStatus DecodeThreeAddrSRegInstruction(llvm::MCInst &Inst, - uint32_t insn, uint64_t Addr, - const void *Decoder) { - unsigned Rd = fieldFromInstruction(insn, 0, 5); - unsigned Rn = fieldFromInstruction(insn, 5, 5); - unsigned Rm = fieldFromInstruction(insn, 16, 5); - unsigned shiftHi = fieldFromInstruction(insn, 22, 2); - unsigned shiftLo = fieldFromInstruction(insn, 10, 6); - unsigned shift = (shiftHi << 6) | shiftLo; - switch (Inst.getOpcode()) { - default: - return Fail; - case ARM64::ADDWrs: - case ARM64::ADDSWrs: - case ARM64::SUBWrs: - case ARM64::SUBSWrs: - // if shift == '11' then ReservedValue() - if (shiftHi == 0x3) - return Fail; - // Deliberate fallthrough - case ARM64::ANDWrs: - case ARM64::ANDSWrs: - case ARM64::BICWrs: - case ARM64::BICSWrs: - case ARM64::ORRWrs: - case ARM64::ORNWrs: - case ARM64::EORWrs: 
- case ARM64::EONWrs: { - // if sf == '0' and imm6<5> == '1' then ReservedValue() - if (shiftLo >> 5 == 1) - return Fail; - DecodeGPR32RegisterClass(Inst, Rd, Addr, Decoder); - DecodeGPR32RegisterClass(Inst, Rn, Addr, Decoder); - DecodeGPR32RegisterClass(Inst, Rm, Addr, Decoder); - break; - } - case ARM64::ADDXrs: - case ARM64::ADDSXrs: - case ARM64::SUBXrs: - case ARM64::SUBSXrs: - // if shift == '11' then ReservedValue() - if (shiftHi == 0x3) - return Fail; - // Deliberate fallthrough - case ARM64::ANDXrs: - case ARM64::ANDSXrs: - case ARM64::BICXrs: - case ARM64::BICSXrs: - case ARM64::ORRXrs: - case ARM64::ORNXrs: - case ARM64::EORXrs: - case ARM64::EONXrs: - DecodeGPR64RegisterClass(Inst, Rd, Addr, Decoder); - DecodeGPR64RegisterClass(Inst, Rn, Addr, Decoder); - DecodeGPR64RegisterClass(Inst, Rm, Addr, Decoder); - break; - } - - Inst.addOperand(MCOperand::CreateImm(shift)); - return Success; -} - -static DecodeStatus DecodeMoveImmInstruction(llvm::MCInst &Inst, uint32_t insn, - uint64_t Addr, - const void *Decoder) { - unsigned Rd = fieldFromInstruction(insn, 0, 5); - unsigned imm = fieldFromInstruction(insn, 5, 16); - unsigned shift = fieldFromInstruction(insn, 21, 2); - shift <<= 4; - switch (Inst.getOpcode()) { - default: - return Fail; - case ARM64::MOVZWi: - case ARM64::MOVNWi: - case ARM64::MOVKWi: - if (shift & (1U << 5)) - return Fail; - DecodeGPR32RegisterClass(Inst, Rd, Addr, Decoder); - break; - case ARM64::MOVZXi: - case ARM64::MOVNXi: - case ARM64::MOVKXi: - DecodeGPR64RegisterClass(Inst, Rd, Addr, Decoder); - break; - } - - if (Inst.getOpcode() == ARM64::MOVKWi || Inst.getOpcode() == ARM64::MOVKXi) - Inst.addOperand(Inst.getOperand(0)); - - Inst.addOperand(MCOperand::CreateImm(imm)); - Inst.addOperand(MCOperand::CreateImm(shift)); - return Success; -} - -static DecodeStatus DecodeUnsignedLdStInstruction(llvm::MCInst &Inst, - uint32_t insn, uint64_t Addr, - const void *Decoder) { - unsigned Rt = fieldFromInstruction(insn, 0, 5); - unsigned Rn = fieldFromInstruction(insn, 5, 5); - unsigned offset = fieldFromInstruction(insn, 10, 12); - const ARM64Disassembler *Dis = - static_cast(Decoder); - - switch (Inst.getOpcode()) { - default: - return Fail; - case ARM64::PRFMui: - // Rt is an immediate in prefetch. 
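// (An aside for reference, not part of the patch.) All of these decoders slice
// the 32-bit instruction word with fieldFromInstruction(insn, Start, Width),
// as in the Rt/Rn/offset extraction at the top of DecodeUnsignedLdStInstruction
// above. A minimal equivalent (extractField is a name invented here):

#include <cstdint>

// Return the Width-bit field (Width < 32) of Insn starting at bit Start.
static uint32_t extractField(uint32_t Insn, unsigned Start, unsigned Width) {
  return (Insn >> Start) & ((1u << Width) - 1);
}

// For the unsigned load/store encoding that would be, for example:
//   unsigned Rt    = extractField(Insn, 0, 5);   // transfer register
//   unsigned Rn    = extractField(Insn, 5, 5);   // base register
//   unsigned Imm12 = extractField(Insn, 10, 12); // scaled offset field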
- Inst.addOperand(MCOperand::CreateImm(Rt)); - break; - case ARM64::STRBBui: - case ARM64::LDRBBui: - case ARM64::LDRSBWui: - case ARM64::STRHHui: - case ARM64::LDRHHui: - case ARM64::LDRSHWui: - case ARM64::STRWui: - case ARM64::LDRWui: - DecodeGPR32RegisterClass(Inst, Rt, Addr, Decoder); - break; - case ARM64::LDRSBXui: - case ARM64::LDRSHXui: - case ARM64::LDRSWui: - case ARM64::STRXui: - case ARM64::LDRXui: - DecodeGPR64RegisterClass(Inst, Rt, Addr, Decoder); - break; - case ARM64::LDRQui: - case ARM64::STRQui: - DecodeFPR128RegisterClass(Inst, Rt, Addr, Decoder); - break; - case ARM64::LDRDui: - case ARM64::STRDui: - DecodeFPR64RegisterClass(Inst, Rt, Addr, Decoder); - break; - case ARM64::LDRSui: - case ARM64::STRSui: - DecodeFPR32RegisterClass(Inst, Rt, Addr, Decoder); - break; - case ARM64::LDRHui: - case ARM64::STRHui: - DecodeFPR16RegisterClass(Inst, Rt, Addr, Decoder); - break; - case ARM64::LDRBui: - case ARM64::STRBui: - DecodeFPR8RegisterClass(Inst, Rt, Addr, Decoder); - break; - } - - DecodeGPR64spRegisterClass(Inst, Rn, Addr, Decoder); - if (!Dis->tryAddingSymbolicOperand(Inst, offset, Addr, Fail, 0, 4)) - Inst.addOperand(MCOperand::CreateImm(offset)); - return Success; -} - -static DecodeStatus DecodeSignedLdStInstruction(llvm::MCInst &Inst, - uint32_t insn, uint64_t Addr, - const void *Decoder) { - unsigned Rt = fieldFromInstruction(insn, 0, 5); - unsigned Rn = fieldFromInstruction(insn, 5, 5); - int64_t offset = fieldFromInstruction(insn, 12, 9); - - // offset is a 9-bit signed immediate, so sign extend it to - // fill the unsigned. - if (offset & (1 << (9 - 1))) - offset |= ~((1LL << 9) - 1); - - // First operand is always the writeback to the address register, if needed. - switch (Inst.getOpcode()) { - default: - break; - case ARM64::LDRSBWpre: - case ARM64::LDRSHWpre: - case ARM64::STRBBpre: - case ARM64::LDRBBpre: - case ARM64::STRHHpre: - case ARM64::LDRHHpre: - case ARM64::STRWpre: - case ARM64::LDRWpre: - case ARM64::LDRSBWpost: - case ARM64::LDRSHWpost: - case ARM64::STRBBpost: - case ARM64::LDRBBpost: - case ARM64::STRHHpost: - case ARM64::LDRHHpost: - case ARM64::STRWpost: - case ARM64::LDRWpost: - case ARM64::LDRSBXpre: - case ARM64::LDRSHXpre: - case ARM64::STRXpre: - case ARM64::LDRSWpre: - case ARM64::LDRXpre: - case ARM64::LDRSBXpost: - case ARM64::LDRSHXpost: - case ARM64::STRXpost: - case ARM64::LDRSWpost: - case ARM64::LDRXpost: - case ARM64::LDRQpre: - case ARM64::STRQpre: - case ARM64::LDRQpost: - case ARM64::STRQpost: - case ARM64::LDRDpre: - case ARM64::STRDpre: - case ARM64::LDRDpost: - case ARM64::STRDpost: - case ARM64::LDRSpre: - case ARM64::STRSpre: - case ARM64::LDRSpost: - case ARM64::STRSpost: - case ARM64::LDRHpre: - case ARM64::STRHpre: - case ARM64::LDRHpost: - case ARM64::STRHpost: - case ARM64::LDRBpre: - case ARM64::STRBpre: - case ARM64::LDRBpost: - case ARM64::STRBpost: - DecodeGPR64spRegisterClass(Inst, Rn, Addr, Decoder); - break; - } - - switch (Inst.getOpcode()) { - default: - return Fail; - case ARM64::PRFUMi: - // Rt is an immediate in prefetch. 
- Inst.addOperand(MCOperand::CreateImm(Rt)); - break; - case ARM64::STURBBi: - case ARM64::LDURBBi: - case ARM64::LDURSBWi: - case ARM64::STURHHi: - case ARM64::LDURHHi: - case ARM64::LDURSHWi: - case ARM64::STURWi: - case ARM64::LDURWi: - case ARM64::LDTRSBWi: - case ARM64::LDTRSHWi: - case ARM64::STTRWi: - case ARM64::LDTRWi: - case ARM64::STTRHi: - case ARM64::LDTRHi: - case ARM64::LDTRBi: - case ARM64::STTRBi: - case ARM64::LDRSBWpre: - case ARM64::LDRSHWpre: - case ARM64::STRBBpre: - case ARM64::LDRBBpre: - case ARM64::STRHHpre: - case ARM64::LDRHHpre: - case ARM64::STRWpre: - case ARM64::LDRWpre: - case ARM64::LDRSBWpost: - case ARM64::LDRSHWpost: - case ARM64::STRBBpost: - case ARM64::LDRBBpost: - case ARM64::STRHHpost: - case ARM64::LDRHHpost: - case ARM64::STRWpost: - case ARM64::LDRWpost: - DecodeGPR32RegisterClass(Inst, Rt, Addr, Decoder); - break; - case ARM64::LDURSBXi: - case ARM64::LDURSHXi: - case ARM64::LDURSWi: - case ARM64::STURXi: - case ARM64::LDURXi: - case ARM64::LDTRSBXi: - case ARM64::LDTRSHXi: - case ARM64::LDTRSWi: - case ARM64::STTRXi: - case ARM64::LDTRXi: - case ARM64::LDRSBXpre: - case ARM64::LDRSHXpre: - case ARM64::STRXpre: - case ARM64::LDRSWpre: - case ARM64::LDRXpre: - case ARM64::LDRSBXpost: - case ARM64::LDRSHXpost: - case ARM64::STRXpost: - case ARM64::LDRSWpost: - case ARM64::LDRXpost: - DecodeGPR64RegisterClass(Inst, Rt, Addr, Decoder); - break; - case ARM64::LDURQi: - case ARM64::STURQi: - case ARM64::LDRQpre: - case ARM64::STRQpre: - case ARM64::LDRQpost: - case ARM64::STRQpost: - DecodeFPR128RegisterClass(Inst, Rt, Addr, Decoder); - break; - case ARM64::LDURDi: - case ARM64::STURDi: - case ARM64::LDRDpre: - case ARM64::STRDpre: - case ARM64::LDRDpost: - case ARM64::STRDpost: - DecodeFPR64RegisterClass(Inst, Rt, Addr, Decoder); - break; - case ARM64::LDURSi: - case ARM64::STURSi: - case ARM64::LDRSpre: - case ARM64::STRSpre: - case ARM64::LDRSpost: - case ARM64::STRSpost: - DecodeFPR32RegisterClass(Inst, Rt, Addr, Decoder); - break; - case ARM64::LDURHi: - case ARM64::STURHi: - case ARM64::LDRHpre: - case ARM64::STRHpre: - case ARM64::LDRHpost: - case ARM64::STRHpost: - DecodeFPR16RegisterClass(Inst, Rt, Addr, Decoder); - break; - case ARM64::LDURBi: - case ARM64::STURBi: - case ARM64::LDRBpre: - case ARM64::STRBpre: - case ARM64::LDRBpost: - case ARM64::STRBpost: - DecodeFPR8RegisterClass(Inst, Rt, Addr, Decoder); - break; - } - - DecodeGPR64spRegisterClass(Inst, Rn, Addr, Decoder); - Inst.addOperand(MCOperand::CreateImm(offset)); - - bool IsLoad = fieldFromInstruction(insn, 22, 1); - bool IsIndexed = fieldFromInstruction(insn, 10, 2) != 0; - bool IsFP = fieldFromInstruction(insn, 26, 1); - - // Cannot write back to a transfer register (but xzr != sp). 
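// (Illustrative aside, not part of the patch.) The check just below marks
// pre/post-indexed integer loads whose transfer register equals the base
// register as SoftFail: that combination is unpredictable, while register
// number 31 can never alias because it means SP as a base but XZR as a
// transfer register. The same predicate in isolation (hasWritebackHazard is a
// name invented here):

// True when a writeback load would both load into Rt and write back to Rn
// while Rt and Rn name the same physical register.
static bool hasWritebackHazard(unsigned Rt, unsigned Rn,
                               bool IsLoad, bool IsIndexed, bool IsFP) {
  return IsLoad && IsIndexed && !IsFP && Rn != 31 && Rt == Rn;
}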
- if (IsLoad && IsIndexed && !IsFP && Rn != 31 && Rt == Rn) - return SoftFail; - - return Success; -} - -static DecodeStatus DecodeExclusiveLdStInstruction(llvm::MCInst &Inst, - uint32_t insn, uint64_t Addr, - const void *Decoder) { - unsigned Rt = fieldFromInstruction(insn, 0, 5); - unsigned Rn = fieldFromInstruction(insn, 5, 5); - unsigned Rt2 = fieldFromInstruction(insn, 10, 5); - unsigned Rs = fieldFromInstruction(insn, 16, 5); - - unsigned Opcode = Inst.getOpcode(); - switch (Opcode) { - default: - return Fail; - case ARM64::STLXRW: - case ARM64::STLXRB: - case ARM64::STLXRH: - case ARM64::STXRW: - case ARM64::STXRB: - case ARM64::STXRH: - DecodeGPR32RegisterClass(Inst, Rs, Addr, Decoder); - // FALLTHROUGH - case ARM64::LDARW: - case ARM64::LDARB: - case ARM64::LDARH: - case ARM64::LDAXRW: - case ARM64::LDAXRB: - case ARM64::LDAXRH: - case ARM64::LDXRW: - case ARM64::LDXRB: - case ARM64::LDXRH: - case ARM64::STLRW: - case ARM64::STLRB: - case ARM64::STLRH: - DecodeGPR32RegisterClass(Inst, Rt, Addr, Decoder); - break; - case ARM64::STLXRX: - case ARM64::STXRX: - DecodeGPR32RegisterClass(Inst, Rs, Addr, Decoder); - // FALLTHROUGH - case ARM64::LDARX: - case ARM64::LDAXRX: - case ARM64::LDXRX: - case ARM64::STLRX: - DecodeGPR64RegisterClass(Inst, Rt, Addr, Decoder); - break; - case ARM64::STLXPW: - case ARM64::STXPW: - DecodeGPR32RegisterClass(Inst, Rs, Addr, Decoder); - // FALLTHROUGH - case ARM64::LDAXPW: - case ARM64::LDXPW: - DecodeGPR32RegisterClass(Inst, Rt, Addr, Decoder); - DecodeGPR32RegisterClass(Inst, Rt2, Addr, Decoder); - break; - case ARM64::STLXPX: - case ARM64::STXPX: - DecodeGPR32RegisterClass(Inst, Rs, Addr, Decoder); - // FALLTHROUGH - case ARM64::LDAXPX: - case ARM64::LDXPX: - DecodeGPR64RegisterClass(Inst, Rt, Addr, Decoder); - DecodeGPR64RegisterClass(Inst, Rt2, Addr, Decoder); - break; - } - - DecodeGPR64spRegisterClass(Inst, Rn, Addr, Decoder); - - // You shouldn't load to the same register twice in an instruction... - if ((Opcode == ARM64::LDAXPW || Opcode == ARM64::LDXPW || - Opcode == ARM64::LDAXPX || Opcode == ARM64::LDXPX) && - Rt == Rt2) - return SoftFail; - - return Success; -} - -static DecodeStatus DecodePairLdStInstruction(llvm::MCInst &Inst, uint32_t insn, - uint64_t Addr, - const void *Decoder) { - unsigned Rt = fieldFromInstruction(insn, 0, 5); - unsigned Rn = fieldFromInstruction(insn, 5, 5); - unsigned Rt2 = fieldFromInstruction(insn, 10, 5); - int64_t offset = fieldFromInstruction(insn, 15, 7); - bool IsLoad = fieldFromInstruction(insn, 22, 1); - - // offset is a 7-bit signed immediate, so sign extend it to - // fill the unsigned. - if (offset & (1 << (7 - 1))) - offset |= ~((1LL << 7) - 1); - - unsigned Opcode = Inst.getOpcode(); - bool NeedsDisjointWritebackTransfer = false; - - // First operand is always writeback of base register. 
- switch (Opcode) { - default: - break; - case ARM64::LDPXpost: - case ARM64::STPXpost: - case ARM64::LDPSWpost: - case ARM64::LDPXpre: - case ARM64::STPXpre: - case ARM64::LDPSWpre: - case ARM64::LDPWpost: - case ARM64::STPWpost: - case ARM64::LDPWpre: - case ARM64::STPWpre: - case ARM64::LDPQpost: - case ARM64::STPQpost: - case ARM64::LDPQpre: - case ARM64::STPQpre: - case ARM64::LDPDpost: - case ARM64::STPDpost: - case ARM64::LDPDpre: - case ARM64::STPDpre: - case ARM64::LDPSpost: - case ARM64::STPSpost: - case ARM64::LDPSpre: - case ARM64::STPSpre: - DecodeGPR64spRegisterClass(Inst, Rn, Addr, Decoder); - break; - } - - switch (Opcode) { - default: - return Fail; - case ARM64::LDPXpost: - case ARM64::STPXpost: - case ARM64::LDPSWpost: - case ARM64::LDPXpre: - case ARM64::STPXpre: - case ARM64::LDPSWpre: - NeedsDisjointWritebackTransfer = true; - // Fallthrough - case ARM64::LDNPXi: - case ARM64::STNPXi: - case ARM64::LDPXi: - case ARM64::STPXi: - case ARM64::LDPSWi: - DecodeGPR64RegisterClass(Inst, Rt, Addr, Decoder); - DecodeGPR64RegisterClass(Inst, Rt2, Addr, Decoder); - break; - case ARM64::LDPWpost: - case ARM64::STPWpost: - case ARM64::LDPWpre: - case ARM64::STPWpre: - NeedsDisjointWritebackTransfer = true; - // Fallthrough - case ARM64::LDNPWi: - case ARM64::STNPWi: - case ARM64::LDPWi: - case ARM64::STPWi: - DecodeGPR32RegisterClass(Inst, Rt, Addr, Decoder); - DecodeGPR32RegisterClass(Inst, Rt2, Addr, Decoder); - break; - case ARM64::LDNPQi: - case ARM64::STNPQi: - case ARM64::LDPQpost: - case ARM64::STPQpost: - case ARM64::LDPQi: - case ARM64::STPQi: - case ARM64::LDPQpre: - case ARM64::STPQpre: - DecodeFPR128RegisterClass(Inst, Rt, Addr, Decoder); - DecodeFPR128RegisterClass(Inst, Rt2, Addr, Decoder); - break; - case ARM64::LDNPDi: - case ARM64::STNPDi: - case ARM64::LDPDpost: - case ARM64::STPDpost: - case ARM64::LDPDi: - case ARM64::STPDi: - case ARM64::LDPDpre: - case ARM64::STPDpre: - DecodeFPR64RegisterClass(Inst, Rt, Addr, Decoder); - DecodeFPR64RegisterClass(Inst, Rt2, Addr, Decoder); - break; - case ARM64::LDNPSi: - case ARM64::STNPSi: - case ARM64::LDPSpost: - case ARM64::STPSpost: - case ARM64::LDPSi: - case ARM64::STPSi: - case ARM64::LDPSpre: - case ARM64::STPSpre: - DecodeFPR32RegisterClass(Inst, Rt, Addr, Decoder); - DecodeFPR32RegisterClass(Inst, Rt2, Addr, Decoder); - break; - } - - DecodeGPR64spRegisterClass(Inst, Rn, Addr, Decoder); - Inst.addOperand(MCOperand::CreateImm(offset)); - - // You shouldn't load to the same register twice in an instruction... - if (IsLoad && Rt == Rt2) - return SoftFail; - - // ... or do any operation that writes-back to a transfer register. But note - // that "stp xzr, xzr, [sp], #4" is fine because xzr and sp are different. 
- if (NeedsDisjointWritebackTransfer && Rn != 31 && (Rt == Rn || Rt2 == Rn)) - return SoftFail; - - return Success; -} - -static DecodeStatus DecodeAddSubERegInstruction(llvm::MCInst &Inst, - uint32_t insn, uint64_t Addr, - const void *Decoder) { - unsigned Rd = fieldFromInstruction(insn, 0, 5); - unsigned Rn = fieldFromInstruction(insn, 5, 5); - unsigned Rm = fieldFromInstruction(insn, 16, 5); - unsigned extend = fieldFromInstruction(insn, 10, 6); - - unsigned shift = extend & 0x7; - if (shift > 4) - return Fail; - - switch (Inst.getOpcode()) { - default: - return Fail; - case ARM64::ADDWrx: - case ARM64::SUBWrx: - DecodeGPR32spRegisterClass(Inst, Rd, Addr, Decoder); - DecodeGPR32spRegisterClass(Inst, Rn, Addr, Decoder); - DecodeGPR32RegisterClass(Inst, Rm, Addr, Decoder); - break; - case ARM64::ADDSWrx: - case ARM64::SUBSWrx: - DecodeGPR32RegisterClass(Inst, Rd, Addr, Decoder); - DecodeGPR32spRegisterClass(Inst, Rn, Addr, Decoder); - DecodeGPR32RegisterClass(Inst, Rm, Addr, Decoder); - break; - case ARM64::ADDXrx: - case ARM64::SUBXrx: - DecodeGPR64spRegisterClass(Inst, Rd, Addr, Decoder); - DecodeGPR64spRegisterClass(Inst, Rn, Addr, Decoder); - DecodeGPR32RegisterClass(Inst, Rm, Addr, Decoder); - break; - case ARM64::ADDSXrx: - case ARM64::SUBSXrx: - DecodeGPR64RegisterClass(Inst, Rd, Addr, Decoder); - DecodeGPR64spRegisterClass(Inst, Rn, Addr, Decoder); - DecodeGPR32RegisterClass(Inst, Rm, Addr, Decoder); - break; - case ARM64::ADDXrx64: - case ARM64::SUBXrx64: - DecodeGPR64spRegisterClass(Inst, Rd, Addr, Decoder); - DecodeGPR64spRegisterClass(Inst, Rn, Addr, Decoder); - DecodeGPR64RegisterClass(Inst, Rm, Addr, Decoder); - break; - case ARM64::SUBSXrx64: - case ARM64::ADDSXrx64: - DecodeGPR64RegisterClass(Inst, Rd, Addr, Decoder); - DecodeGPR64spRegisterClass(Inst, Rn, Addr, Decoder); - DecodeGPR64RegisterClass(Inst, Rm, Addr, Decoder); - break; - } - - Inst.addOperand(MCOperand::CreateImm(extend)); - return Success; -} - -static DecodeStatus DecodeLogicalImmInstruction(llvm::MCInst &Inst, - uint32_t insn, uint64_t Addr, - const void *Decoder) { - unsigned Rd = fieldFromInstruction(insn, 0, 5); - unsigned Rn = fieldFromInstruction(insn, 5, 5); - unsigned Datasize = fieldFromInstruction(insn, 31, 1); - unsigned imm; - - if (Datasize) { - if (Inst.getOpcode() == ARM64::ANDSXri) - DecodeGPR64RegisterClass(Inst, Rd, Addr, Decoder); - else - DecodeGPR64spRegisterClass(Inst, Rd, Addr, Decoder); - DecodeGPR64RegisterClass(Inst, Rn, Addr, Decoder); - imm = fieldFromInstruction(insn, 10, 13); - if (!ARM64_AM::isValidDecodeLogicalImmediate(imm, 64)) - return Fail; - } else { - if (Inst.getOpcode() == ARM64::ANDSWri) - DecodeGPR32RegisterClass(Inst, Rd, Addr, Decoder); - else - DecodeGPR32spRegisterClass(Inst, Rd, Addr, Decoder); - DecodeGPR32RegisterClass(Inst, Rn, Addr, Decoder); - imm = fieldFromInstruction(insn, 10, 12); - if (!ARM64_AM::isValidDecodeLogicalImmediate(imm, 32)) - return Fail; - } - Inst.addOperand(MCOperand::CreateImm(imm)); - return Success; -} - -static DecodeStatus DecodeModImmInstruction(llvm::MCInst &Inst, uint32_t insn, - uint64_t Addr, - const void *Decoder) { - unsigned Rd = fieldFromInstruction(insn, 0, 5); - unsigned cmode = fieldFromInstruction(insn, 12, 4); - unsigned imm = fieldFromInstruction(insn, 16, 3) << 5; - imm |= fieldFromInstruction(insn, 5, 5); - - if (Inst.getOpcode() == ARM64::MOVID) - DecodeFPR64RegisterClass(Inst, Rd, Addr, Decoder); - else - DecodeVectorRegisterClass(Inst, Rd, Addr, Decoder); - - Inst.addOperand(MCOperand::CreateImm(imm)); - - 
switch (Inst.getOpcode()) { - default: - break; - case ARM64::MOVIv4i16: - case ARM64::MOVIv8i16: - case ARM64::MVNIv4i16: - case ARM64::MVNIv8i16: - case ARM64::MOVIv2i32: - case ARM64::MOVIv4i32: - case ARM64::MVNIv2i32: - case ARM64::MVNIv4i32: - Inst.addOperand(MCOperand::CreateImm((cmode & 6) << 2)); - break; - case ARM64::MOVIv2s_msl: - case ARM64::MOVIv4s_msl: - case ARM64::MVNIv2s_msl: - case ARM64::MVNIv4s_msl: - Inst.addOperand(MCOperand::CreateImm(cmode & 1 ? 0x110 : 0x108)); - break; - } - - return Success; -} - -static DecodeStatus DecodeModImmTiedInstruction(llvm::MCInst &Inst, - uint32_t insn, uint64_t Addr, - const void *Decoder) { - unsigned Rd = fieldFromInstruction(insn, 0, 5); - unsigned cmode = fieldFromInstruction(insn, 12, 4); - unsigned imm = fieldFromInstruction(insn, 16, 3) << 5; - imm |= fieldFromInstruction(insn, 5, 5); - - // Tied operands added twice. - DecodeVectorRegisterClass(Inst, Rd, Addr, Decoder); - DecodeVectorRegisterClass(Inst, Rd, Addr, Decoder); - - Inst.addOperand(MCOperand::CreateImm(imm)); - Inst.addOperand(MCOperand::CreateImm((cmode & 6) << 2)); - - return Success; -} - -static DecodeStatus DecodeAdrInstruction(llvm::MCInst &Inst, uint32_t insn, - uint64_t Addr, const void *Decoder) { - unsigned Rd = fieldFromInstruction(insn, 0, 5); - int64_t imm = fieldFromInstruction(insn, 5, 19) << 2; - imm |= fieldFromInstruction(insn, 29, 2); - const ARM64Disassembler *Dis = - static_cast(Decoder); - - // Sign-extend the 21-bit immediate. - if (imm & (1 << (21 - 1))) - imm |= ~((1LL << 21) - 1); - - DecodeGPR64RegisterClass(Inst, Rd, Addr, Decoder); - if (!Dis->tryAddingSymbolicOperand(Inst, imm, Addr, Fail, 0, 4)) - Inst.addOperand(MCOperand::CreateImm(imm)); - - return Success; -} - -static DecodeStatus DecodeBaseAddSubImm(llvm::MCInst &Inst, uint32_t insn, - uint64_t Addr, const void *Decoder) { - unsigned Rd = fieldFromInstruction(insn, 0, 5); - unsigned Rn = fieldFromInstruction(insn, 5, 5); - unsigned Imm = fieldFromInstruction(insn, 10, 14); - unsigned S = fieldFromInstruction(insn, 29, 1); - unsigned Datasize = fieldFromInstruction(insn, 31, 1); - - unsigned ShifterVal = (Imm >> 12) & 3; - unsigned ImmVal = Imm & 0xFFF; - const ARM64Disassembler *Dis = - static_cast(Decoder); - - if (ShifterVal != 0 && ShifterVal != 1) - return Fail; - - if (Datasize) { - if (Rd == 31 && !S) - DecodeGPR64spRegisterClass(Inst, Rd, Addr, Decoder); - else - DecodeGPR64RegisterClass(Inst, Rd, Addr, Decoder); - DecodeGPR64spRegisterClass(Inst, Rn, Addr, Decoder); - } else { - if (Rd == 31 && !S) - DecodeGPR32spRegisterClass(Inst, Rd, Addr, Decoder); - else - DecodeGPR32RegisterClass(Inst, Rd, Addr, Decoder); - DecodeGPR32spRegisterClass(Inst, Rn, Addr, Decoder); - } - - if (!Dis->tryAddingSymbolicOperand(Inst, Imm, Addr, Fail, 0, 4)) - Inst.addOperand(MCOperand::CreateImm(ImmVal)); - Inst.addOperand(MCOperand::CreateImm(12 * ShifterVal)); - return Success; -} - -static DecodeStatus DecodeUnconditionalBranch(llvm::MCInst &Inst, uint32_t insn, - uint64_t Addr, - const void *Decoder) { - int64_t imm = fieldFromInstruction(insn, 0, 26); - const ARM64Disassembler *Dis = - static_cast(Decoder); - - // Sign-extend the 26-bit immediate. 
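// (Illustrative aside, not part of the patch.) DecodeAdrInstruction above
// stitches ADR/ADRP's split immediate back together: the low two bits (immlo)
// sit in bits 30:29 of the word and the high 19 bits (immhi) in bits 23:5,
// giving a 21-bit signed value; the 26-bit branch immediate just below is
// sign-extended the same way. A standalone sketch (decodeAdrImm is a name
// invented here):

#include <cstdint>

static int64_t decodeAdrImm(uint32_t Insn) {
  uint64_t ImmLo = (Insn >> 29) & 0x3;    // bits 30:29
  uint64_t ImmHi = (Insn >> 5) & 0x7FFFF; // bits 23:5
  uint64_t Imm21 = (ImmHi << 2) | ImmLo;  // reassembled 21-bit field
  // Sign-extend from 21 bits.
  return static_cast<int64_t>((Imm21 ^ (1ULL << 20)) - (1ULL << 20));
}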
- if (imm & (1 << (26 - 1))) - imm |= ~((1LL << 26) - 1); - - if (!Dis->tryAddingSymbolicOperand(Inst, imm << 2, Addr, true, 0, 4)) - Inst.addOperand(MCOperand::CreateImm(imm)); - - return Success; -} - -static DecodeStatus DecodeSystemPStateInstruction(llvm::MCInst &Inst, - uint32_t insn, uint64_t Addr, - const void *Decoder) { - uint64_t op1 = fieldFromInstruction(insn, 16, 3); - uint64_t op2 = fieldFromInstruction(insn, 5, 3); - uint64_t crm = fieldFromInstruction(insn, 8, 4); - - uint64_t pstate_field = (op1 << 3) | op2; - - Inst.addOperand(MCOperand::CreateImm(pstate_field)); - Inst.addOperand(MCOperand::CreateImm(crm)); - - bool ValidNamed; - (void)ARM64PState::PStateMapper().toString(pstate_field, ValidNamed); - - return ValidNamed ? Success : Fail; -} - -static DecodeStatus DecodeTestAndBranch(llvm::MCInst &Inst, uint32_t insn, - uint64_t Addr, const void *Decoder) { - uint64_t Rt = fieldFromInstruction(insn, 0, 5); - uint64_t bit = fieldFromInstruction(insn, 31, 1) << 5; - bit |= fieldFromInstruction(insn, 19, 5); - int64_t dst = fieldFromInstruction(insn, 5, 14); - const ARM64Disassembler *Dis = - static_cast(Decoder); - - // Sign-extend 14-bit immediate. - if (dst & (1 << (14 - 1))) - dst |= ~((1LL << 14) - 1); - - if (fieldFromInstruction(insn, 31, 1) == 0) - DecodeGPR32RegisterClass(Inst, Rt, Addr, Decoder); - else - DecodeGPR64RegisterClass(Inst, Rt, Addr, Decoder); - Inst.addOperand(MCOperand::CreateImm(bit)); - if (!Dis->tryAddingSymbolicOperand(Inst, dst << 2, Addr, true, 0, 4)) - Inst.addOperand(MCOperand::CreateImm(dst)); - - return Success; -} diff --git a/lib/Target/ARM64/Disassembler/ARM64Disassembler.h b/lib/Target/ARM64/Disassembler/ARM64Disassembler.h deleted file mode 100644 index 8989925f36b..00000000000 --- a/lib/Target/ARM64/Disassembler/ARM64Disassembler.h +++ /dev/null @@ -1,40 +0,0 @@ -//===- ARM64Disassembler.h - Disassembler for ARM64 -------------*- C++ -*-===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -// -//===----------------------------------------------------------------------===// - -#ifndef ARM64DISASSEMBLER_H -#define ARM64DISASSEMBLER_H - -#include "llvm/MC/MCDisassembler.h" - -namespace llvm { - -class MCInst; -class MemoryObject; -class raw_ostream; - -class ARM64Disassembler : public MCDisassembler { -public: - ARM64Disassembler(const MCSubtargetInfo &STI, MCContext &Ctx) - : MCDisassembler(STI, Ctx) {} - - ~ARM64Disassembler() {} - - /// getInstruction - See MCDisassembler. - MCDisassembler::DecodeStatus - getInstruction(MCInst &instr, uint64_t &size, const MemoryObject ®ion, - uint64_t address, raw_ostream &vStream, - raw_ostream &cStream) const override; -}; - -} // namespace llvm - -#endif diff --git a/lib/Target/ARM64/Disassembler/ARM64ExternalSymbolizer.cpp b/lib/Target/ARM64/Disassembler/ARM64ExternalSymbolizer.cpp deleted file mode 100644 index 2f8e516d185..00000000000 --- a/lib/Target/ARM64/Disassembler/ARM64ExternalSymbolizer.cpp +++ /dev/null @@ -1,226 +0,0 @@ -//===- ARM64ExternalSymbolizer.cpp - Symbolizer for ARM64 -------*- C++ -*-===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. 
-// -//===----------------------------------------------------------------------===// - -#include "ARM64ExternalSymbolizer.h" -#include "ARM64Subtarget.h" -#include "MCTargetDesc/ARM64AddressingModes.h" -#include "Utils/ARM64BaseInfo.h" -#include "llvm/MC/MCContext.h" -#include "llvm/MC/MCExpr.h" -#include "llvm/MC/MCInst.h" -#include "llvm/Support/Format.h" -#include "llvm/Support/raw_ostream.h" - -using namespace llvm; - -#define DEBUG_TYPE "arm64-disassembler" - -static MCSymbolRefExpr::VariantKind -getVariant(uint64_t LLVMDisassembler_VariantKind) { - switch (LLVMDisassembler_VariantKind) { - case LLVMDisassembler_VariantKind_None: - return MCSymbolRefExpr::VK_None; - case LLVMDisassembler_VariantKind_ARM64_PAGE: - return MCSymbolRefExpr::VK_PAGE; - case LLVMDisassembler_VariantKind_ARM64_PAGEOFF: - return MCSymbolRefExpr::VK_PAGEOFF; - case LLVMDisassembler_VariantKind_ARM64_GOTPAGE: - return MCSymbolRefExpr::VK_GOTPAGE; - case LLVMDisassembler_VariantKind_ARM64_GOTPAGEOFF: - return MCSymbolRefExpr::VK_GOTPAGEOFF; - case LLVMDisassembler_VariantKind_ARM64_TLVP: - case LLVMDisassembler_VariantKind_ARM64_TLVOFF: - default: - assert(0 && "bad LLVMDisassembler_VariantKind"); - return MCSymbolRefExpr::VK_None; - } -} - -/// tryAddingSymbolicOperand - tryAddingSymbolicOperand tries to add a symbolic -/// operand in place of the immediate Value in the MCInst. The immediate -/// Value has not had any PC adjustment made by the caller. If the instruction -/// is a branch that adds the PC to the immediate Value then isBranch is -/// true, else false. If GetOpInfo is non-null, then it is called to get any -/// symbolic information at the Address for this instruction. If that returns -/// non-zero then the symbolic information it returns is used to create an -/// MCExpr and that is added as an operand to the MCInst. If GetOpInfo() -/// returns zero and isBranch is true then a symbol lookup for -/// Address + Value is done and if a symbol is found an MCExpr is created with -/// that, else an MCExpr with Address + Value is created. If GetOpInfo() -/// returns zero and isBranch is false then the Opcode of the MCInst is -/// tested, and for ADRP and other instructions that help to load pointers -/// a symbol lookup is done to see if it returns a specific reference type -/// to add to the comment stream. This function returns true if it adds -/// an operand to the MCInst and false otherwise. -bool ARM64ExternalSymbolizer::tryAddingSymbolicOperand( - MCInst &MI, - raw_ostream &CommentStream, - int64_t Value, - uint64_t Address, - bool IsBranch, - uint64_t Offset, - uint64_t InstSize) { - // FIXME: This method shares a lot of code with - // MCExternalSymbolizer::tryAddingSymbolicOperand. It may be possible to - // refactor the MCExternalSymbolizer interface to allow more of this - // implementation to be shared.
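The callback protocol described in the comment above is the same one exposed through LLVM's public C disassembler API, so it can be exercised without the private C++ classes in this patch. A hypothetical, minimal client sketch follows; the triple, addresses, byte pattern and symbol name are invented for illustration, and it assumes an LLVM build with this target's disassembler enabled:

#include <llvm-c/Disassembler.h>
#include <llvm-c/Target.h>
#include <cstdint>
#include <cstdio>

// SymbolLookUp callback: pretend the branch target 0x4000 has a name.
static const char *lookupSymbol(void *DisInfo, uint64_t ReferenceValue,
                                uint64_t *ReferenceType, uint64_t ReferencePC,
                                const char **ReferenceName) {
  (void)DisInfo;
  (void)ReferencePC;
  *ReferenceName = nullptr;
  if (*ReferenceType == LLVMDisassembler_ReferenceType_In_Branch &&
      ReferenceValue == 0x4000) {
    *ReferenceType = LLVMDisassembler_ReferenceType_InOut_None;
    return "_my_function"; // becomes the symbolic branch operand
  }
  *ReferenceType = LLVMDisassembler_ReferenceType_InOut_None;
  return nullptr;
}

int main() {
  LLVMInitializeAllTargetInfos();
  LLVMInitializeAllTargetMCs();
  LLVMInitializeAllDisassemblers();

  LLVMDisasmContextRef DC = LLVMCreateDisasm("arm64-apple-darwin", nullptr, 0,
                                             /*GetOpInfo=*/nullptr, lookupSymbol);
  if (!DC)
    return 1;

  // "bl #0x1000" at PC 0x3000 targets 0x4000, so the callback above should
  // be asked about that address.
  uint8_t Bytes[] = {0x00, 0x04, 0x00, 0x94};
  char Text[128];
  if (LLVMDisasmInstruction(DC, Bytes, sizeof(Bytes), /*PC=*/0x3000, Text,
                            sizeof(Text)))
    std::printf("%s\n", Text);

  LLVMDisasmDispose(DC);
  return 0;
}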
- // - struct LLVMOpInfo1 SymbolicOp; - memset(&SymbolicOp, '\0', sizeof(struct LLVMOpInfo1)); - SymbolicOp.Value = Value; - uint64_t ReferenceType; - const char *ReferenceName; - if (!GetOpInfo || - !GetOpInfo(DisInfo, Address, 0 /* Offset */, InstSize, 1, &SymbolicOp)) { - if (IsBranch) { - ReferenceType = LLVMDisassembler_ReferenceType_In_Branch; - const char *Name = SymbolLookUp(DisInfo, Address + Value, &ReferenceType, - Address, &ReferenceName); - if (Name) { - SymbolicOp.AddSymbol.Name = Name; - SymbolicOp.AddSymbol.Present = true; - SymbolicOp.Value = 0; - } else { - SymbolicOp.Value = Address + Value; - } - if (ReferenceType == LLVMDisassembler_ReferenceType_Out_SymbolStub) - CommentStream << "symbol stub for: " << ReferenceName; - else if (ReferenceType == - LLVMDisassembler_ReferenceType_Out_Objc_Message) - CommentStream << "Objc message: " << ReferenceName; - } else if (MI.getOpcode() == ARM64::ADRP) { - ReferenceType = LLVMDisassembler_ReferenceType_In_ARM64_ADRP; - // otool expects the fully encoded ADRP instruction to be passed in as - // the value here, so reconstruct it: - const MCRegisterInfo &MCRI = *Ctx.getRegisterInfo(); - uint32_t EncodedInst = 0x90000000; - EncodedInst |= (Value & 0x3) << 29; // immlo - EncodedInst |= ((Value >> 2) & 0x7FFFF) << 5; // immhi - EncodedInst |= MCRI.getEncodingValue(MI.getOperand(0).getReg()); // reg - SymbolLookUp(DisInfo, EncodedInst, &ReferenceType, Address, - &ReferenceName); - CommentStream << format("0x%llx", - 0xfffffffffffff000LL & (Address + Value)); - } else if (MI.getOpcode() == ARM64::ADDXri || - MI.getOpcode() == ARM64::LDRXui || - MI.getOpcode() == ARM64::LDRXl || - MI.getOpcode() == ARM64::ADR) { - if (MI.getOpcode() == ARM64::ADDXri) - ReferenceType = LLVMDisassembler_ReferenceType_In_ARM64_ADDXri; - else if (MI.getOpcode() == ARM64::LDRXui) - ReferenceType = LLVMDisassembler_ReferenceType_In_ARM64_LDRXui; - if (MI.getOpcode() == ARM64::LDRXl) { - ReferenceType = LLVMDisassembler_ReferenceType_In_ARM64_LDRXl; - SymbolLookUp(DisInfo, Address + Value, &ReferenceType, Address, - &ReferenceName); - } else if (MI.getOpcode() == ARM64::ADR) { - ReferenceType = LLVMDisassembler_ReferenceType_In_ARM64_ADR; - SymbolLookUp(DisInfo, Address + Value, &ReferenceType, Address, - &ReferenceName); - } else { - const MCRegisterInfo &MCRI = *Ctx.getRegisterInfo(); - // otool expects the fully encoded ADD/LDR instruction to be passed in - // as the value here, so reconstruct it: - unsigned EncodedInst = - MI.getOpcode() == ARM64::ADDXri ? 
0x91000000: 0xF9400000; - EncodedInst |= Value << 10; // imm12 [+ shift:2 for ADD] - EncodedInst |= - MCRI.getEncodingValue(MI.getOperand(1).getReg()) << 5; // Rn - EncodedInst |= MCRI.getEncodingValue(MI.getOperand(0).getReg()); // Rd - - SymbolLookUp(DisInfo, EncodedInst, &ReferenceType, Address, - &ReferenceName); - } - if (ReferenceType == LLVMDisassembler_ReferenceType_Out_LitPool_SymAddr) - CommentStream << "literal pool symbol address: " << ReferenceName; - else if (ReferenceType == - LLVMDisassembler_ReferenceType_Out_LitPool_CstrAddr) - CommentStream << "literal pool for: \"" << ReferenceName << "\""; - else if (ReferenceType == - LLVMDisassembler_ReferenceType_Out_Objc_CFString_Ref) - CommentStream << "Objc cfstring ref: @\"" << ReferenceName << "\""; - else if (ReferenceType == - LLVMDisassembler_ReferenceType_Out_Objc_Message) - CommentStream << "Objc message: " << ReferenceName; - else if (ReferenceType == - LLVMDisassembler_ReferenceType_Out_Objc_Message_Ref) - CommentStream << "Objc message ref: " << ReferenceName; - else if (ReferenceType == - LLVMDisassembler_ReferenceType_Out_Objc_Selector_Ref) - CommentStream << "Objc selector ref: " << ReferenceName; - else if (ReferenceType == - LLVMDisassembler_ReferenceType_Out_Objc_Class_Ref) - CommentStream << "Objc class ref: " << ReferenceName; - // For these instructions, the SymbolLookUp() above is just to get the - // ReferenceType and ReferenceName. We want to make sure not to - // fall through so we don't build an MCExpr to leave the disassembly - // of the immediate values of these instructions to the InstPrinter. - return false; - } else { - return false; - } - } - - const MCExpr *Add = nullptr; - if (SymbolicOp.AddSymbol.Present) { - if (SymbolicOp.AddSymbol.Name) { - StringRef Name(SymbolicOp.AddSymbol.Name); - MCSymbol *Sym = Ctx.GetOrCreateSymbol(Name); - MCSymbolRefExpr::VariantKind Variant = getVariant(SymbolicOp.VariantKind); - if (Variant != MCSymbolRefExpr::VK_None) - Add = MCSymbolRefExpr::Create(Sym, Variant, Ctx); - else - Add = MCSymbolRefExpr::Create(Sym, Ctx); - } else { - Add = MCConstantExpr::Create(SymbolicOp.AddSymbol.Value, Ctx); - } - } - - const MCExpr *Sub = nullptr; - if (SymbolicOp.SubtractSymbol.Present) { - if (SymbolicOp.SubtractSymbol.Name) { - StringRef Name(SymbolicOp.SubtractSymbol.Name); - MCSymbol *Sym = Ctx.GetOrCreateSymbol(Name); - Sub = MCSymbolRefExpr::Create(Sym, Ctx); - } else { - Sub = MCConstantExpr::Create(SymbolicOp.SubtractSymbol.Value, Ctx); - } - } - - const MCExpr *Off = nullptr; - if (SymbolicOp.Value != 0) - Off = MCConstantExpr::Create(SymbolicOp.Value, Ctx); - - const MCExpr *Expr; - if (Sub) { - const MCExpr *LHS; - if (Add) - LHS = MCBinaryExpr::CreateSub(Add, Sub, Ctx); - else - LHS = MCUnaryExpr::CreateMinus(Sub, Ctx); - if (Off) - Expr = MCBinaryExpr::CreateAdd(LHS, Off, Ctx); - else - Expr = LHS; - } else if (Add) { - if (Off) - Expr = MCBinaryExpr::CreateAdd(Add, Off, Ctx); - else - Expr = Add; - } else { - if (Off) - Expr = Off; - else - Expr = MCConstantExpr::Create(0, Ctx); - } - - MI.addOperand(MCOperand::CreateExpr(Expr)); - - return true; -} diff --git a/lib/Target/ARM64/Disassembler/ARM64ExternalSymbolizer.h b/lib/Target/ARM64/Disassembler/ARM64ExternalSymbolizer.h deleted file mode 100644 index 45f07a5e258..00000000000 --- a/lib/Target/ARM64/Disassembler/ARM64ExternalSymbolizer.h +++ /dev/null @@ -1,37 +0,0 @@ -//===- ARM64ExternalSymbolizer.h - Symbolizer for ARM64 ---------*- C++ -*-===// -// -// The LLVM Compiler Infrastructure -// -// This file is 
distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -// Symbolize ARM64 assembly code during disassembly using callbacks. -// -//===----------------------------------------------------------------------===// - -#ifndef ARM64EXTERNALSYMBOLIZER_H -#define ARM64EXTERNALSYMBOLIZER_H - -#include "llvm/MC/MCExternalSymbolizer.h" - -namespace llvm { - -class ARM64ExternalSymbolizer : public MCExternalSymbolizer { -public: - ARM64ExternalSymbolizer(MCContext &Ctx, - std::unique_ptr RelInfo, - LLVMOpInfoCallback GetOpInfo, - LLVMSymbolLookupCallback SymbolLookUp, void *DisInfo) - : MCExternalSymbolizer(Ctx, std::move(RelInfo), GetOpInfo, SymbolLookUp, - DisInfo) {} - - bool tryAddingSymbolicOperand(MCInst &MI, raw_ostream &CommentStream, - int64_t Value, uint64_t Address, bool IsBranch, - uint64_t Offset, uint64_t InstSize) override; -}; - -} // namespace llvm - -#endif diff --git a/lib/Target/ARM64/Disassembler/CMakeLists.txt b/lib/Target/ARM64/Disassembler/CMakeLists.txt deleted file mode 100644 index 43ade66be14..00000000000 --- a/lib/Target/ARM64/Disassembler/CMakeLists.txt +++ /dev/null @@ -1,14 +0,0 @@ -include_directories( ${CMAKE_CURRENT_BINARY_DIR}/.. ${CMAKE_CURRENT_SOURCE_DIR}/.. ) - -add_llvm_library(LLVMARM64Disassembler - ARM64Disassembler.cpp - ARM64ExternalSymbolizer.cpp - ) -# workaround for hanging compilation on MSVC8, 9 and 10 -#if( MSVC_VERSION EQUAL 1400 OR MSVC_VERSION EQUAL 1500 OR MSVC_VERSION EQUAL 1600 ) -#set_property( -# SOURCE ARMDisassembler.cpp -# PROPERTY COMPILE_FLAGS "/Od" -# ) -#endif() -add_dependencies(LLVMARM64Disassembler ARM64CommonTableGen) diff --git a/lib/Target/ARM64/Disassembler/LLVMBuild.txt b/lib/Target/ARM64/Disassembler/LLVMBuild.txt deleted file mode 100644 index 5bbe88ddb49..00000000000 --- a/lib/Target/ARM64/Disassembler/LLVMBuild.txt +++ /dev/null @@ -1,23 +0,0 @@ -;===- ./lib/Target/ARM64/Disassembler/LLVMBuild.txt ------------*- Conf -*--===; -; -; The LLVM Compiler Infrastructure -; -; This file is distributed under the University of Illinois Open Source -; License. See LICENSE.TXT for details. -; -;===------------------------------------------------------------------------===; -; -; This is an LLVMBuild description file for the components in this subdirectory. -; -; For more information on the LLVMBuild system, please see: -; -; http://llvm.org/docs/LLVMBuild.html -; -;===------------------------------------------------------------------------===; - -[component_0] -type = Library -name = ARM64Disassembler -parent = ARM64 -required_libraries = ARM64Info ARM64Utils MC Support -add_to_library_groups = ARM64 diff --git a/lib/Target/ARM64/Disassembler/Makefile b/lib/Target/ARM64/Disassembler/Makefile deleted file mode 100644 index 479d00c2494..00000000000 --- a/lib/Target/ARM64/Disassembler/Makefile +++ /dev/null @@ -1,16 +0,0 @@ -##===- lib/Target/ARM64/Disassembler/Makefile --------------*- Makefile -*-===## -# -# The LLVM Compiler Infrastructure -# -# This file is distributed under the University of Illinois Open Source -# License. See LICENSE.TXT for details. -# -##===----------------------------------------------------------------------===## - -LEVEL = ../../../.. -LIBRARYNAME = LLVMARM64Disassembler - -# Hack: we need to include 'main' arm target directory to grab private headers -CPPFLAGS = -I$(PROJ_OBJ_DIR)/.. -I$(PROJ_SRC_DIR)/.. 
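The symbolizer earlier in this diff rebuilds the raw ADRP (and ADD/LDR) instruction words so otool-style clients can re-inspect the full encoding. A standalone sketch of just the ADRP bit layout it reassembles (encodeADRP is an illustrative helper name, not an LLVM API):

#include <cstdint>
#include <cstdio>

// ADRP: bit 31 set, immlo in bits 30:29, 0b10000 in bits 28:24,
// immhi in bits 23:5, Rd in bits 4:0 -- the same fields the symbolizer
// ORs into the base opcode 0x90000000.
static uint32_t encodeADRP(int64_t PageDelta, unsigned Rd) {
  uint32_t Inst = 0x90000000;
  Inst |= (static_cast<uint32_t>(PageDelta) & 0x3) << 29;         // immlo
  Inst |= (static_cast<uint32_t>(PageDelta >> 2) & 0x7FFFF) << 5; // immhi
  Inst |= Rd & 0x1F;                                              // Rd
  return Inst;
}

int main() {
  // adrp x0, <one page forward> -> 0xb0000000
  std::printf("%#010x\n", (unsigned)encodeADRP(1, 0));
  return 0;
}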
- -include $(LEVEL)/Makefile.common diff --git a/lib/Target/ARM64/InstPrinter/ARM64InstPrinter.cpp b/lib/Target/ARM64/InstPrinter/ARM64InstPrinter.cpp deleted file mode 100644 index 529b450352e..00000000000 --- a/lib/Target/ARM64/InstPrinter/ARM64InstPrinter.cpp +++ /dev/null @@ -1,1312 +0,0 @@ -//===-- ARM64InstPrinter.cpp - Convert ARM64 MCInst to assembly syntax ----===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -// This class prints an ARM64 MCInst to a .s file. -// -//===----------------------------------------------------------------------===// - -#include "ARM64InstPrinter.h" -#include "MCTargetDesc/ARM64AddressingModes.h" -#include "Utils/ARM64BaseInfo.h" -#include "llvm/ADT/STLExtras.h" -#include "llvm/ADT/StringExtras.h" -#include "llvm/MC/MCInst.h" -#include "llvm/MC/MCExpr.h" -#include "llvm/MC/MCRegisterInfo.h" -#include "llvm/Support/Format.h" -#include "llvm/Support/raw_ostream.h" -using namespace llvm; - -#define DEBUG_TYPE "asm-printer" - -#define GET_INSTRUCTION_NAME -#define PRINT_ALIAS_INSTR -#include "ARM64GenAsmWriter.inc" -#define GET_INSTRUCTION_NAME -#define PRINT_ALIAS_INSTR -#include "ARM64GenAsmWriter1.inc" - -ARM64InstPrinter::ARM64InstPrinter(const MCAsmInfo &MAI, const MCInstrInfo &MII, - const MCRegisterInfo &MRI, - const MCSubtargetInfo &STI) - : MCInstPrinter(MAI, MII, MRI) { - // Initialize the set of available features. - setAvailableFeatures(STI.getFeatureBits()); -} - -ARM64AppleInstPrinter::ARM64AppleInstPrinter(const MCAsmInfo &MAI, - const MCInstrInfo &MII, - const MCRegisterInfo &MRI, - const MCSubtargetInfo &STI) - : ARM64InstPrinter(MAI, MII, MRI, STI) {} - -void ARM64InstPrinter::printRegName(raw_ostream &OS, unsigned RegNo) const { - // This is for .cfi directives. - OS << getRegisterName(RegNo); -} - -void ARM64InstPrinter::printInst(const MCInst *MI, raw_ostream &O, - StringRef Annot) { - // Check for special encodings and print the canonical alias instead. - - unsigned Opcode = MI->getOpcode(); - - if (Opcode == ARM64::SYSxt) - if (printSysAlias(MI, O)) { - printAnnotation(O, Annot); - return; - } - - // SBFM/UBFM should print to a nicer aliased form if possible. - if (Opcode == ARM64::SBFMXri || Opcode == ARM64::SBFMWri || - Opcode == ARM64::UBFMXri || Opcode == ARM64::UBFMWri) { - const MCOperand &Op0 = MI->getOperand(0); - const MCOperand &Op1 = MI->getOperand(1); - const MCOperand &Op2 = MI->getOperand(2); - const MCOperand &Op3 = MI->getOperand(3); - - bool IsSigned = (Opcode == ARM64::SBFMXri || Opcode == ARM64::SBFMWri); - bool Is64Bit = (Opcode == ARM64::SBFMXri || Opcode == ARM64::UBFMXri); - if (Op2.isImm() && Op2.getImm() == 0 && Op3.isImm()) { - const char *AsmMnemonic = nullptr; - - switch (Op3.getImm()) { - default: - break; - case 7: - if (IsSigned) - AsmMnemonic = "sxtb"; - else if (!Is64Bit) - AsmMnemonic = "uxtb"; - break; - case 15: - if (IsSigned) - AsmMnemonic = "sxth"; - else if (!Is64Bit) - AsmMnemonic = "uxth"; - break; - case 31: - // *xtw is only valid for signed 64-bit operations. 
- if (Is64Bit && IsSigned) - AsmMnemonic = "sxtw"; - break; - } - - if (AsmMnemonic) { - O << '\t' << AsmMnemonic << '\t' << getRegisterName(Op0.getReg()) - << ", " << getRegisterName(getWRegFromXReg(Op1.getReg())); - printAnnotation(O, Annot); - return; - } - } - - // All immediate shifts are aliases, implemented using the Bitfield - // instruction. In all cases the immediate shift amount shift must be in - // the range 0 to (reg.size -1). - if (Op2.isImm() && Op3.isImm()) { - const char *AsmMnemonic = nullptr; - int shift = 0; - int64_t immr = Op2.getImm(); - int64_t imms = Op3.getImm(); - if (Opcode == ARM64::UBFMWri && imms != 0x1F && ((imms + 1) == immr)) { - AsmMnemonic = "lsl"; - shift = 31 - imms; - } else if (Opcode == ARM64::UBFMXri && imms != 0x3f && - ((imms + 1 == immr))) { - AsmMnemonic = "lsl"; - shift = 63 - imms; - } else if (Opcode == ARM64::UBFMWri && imms == 0x1f) { - AsmMnemonic = "lsr"; - shift = immr; - } else if (Opcode == ARM64::UBFMXri && imms == 0x3f) { - AsmMnemonic = "lsr"; - shift = immr; - } else if (Opcode == ARM64::SBFMWri && imms == 0x1f) { - AsmMnemonic = "asr"; - shift = immr; - } else if (Opcode == ARM64::SBFMXri && imms == 0x3f) { - AsmMnemonic = "asr"; - shift = immr; - } - if (AsmMnemonic) { - O << '\t' << AsmMnemonic << '\t' << getRegisterName(Op0.getReg()) - << ", " << getRegisterName(Op1.getReg()) << ", #" << shift; - printAnnotation(O, Annot); - return; - } - } - - // SBFIZ/UBFIZ aliases - if (Op2.getImm() > Op3.getImm()) { - O << '\t' << (IsSigned ? "sbfiz" : "ubfiz") << '\t' - << getRegisterName(Op0.getReg()) << ", " << getRegisterName(Op1.getReg()) - << ", #" << (Is64Bit ? 64 : 32) - Op2.getImm() << ", #" << Op3.getImm() + 1; - printAnnotation(O, Annot); - return; - } - - // Otherwise SBFX/UBFX is the preferred form - O << '\t' << (IsSigned ? "sbfx" : "ubfx") << '\t' - << getRegisterName(Op0.getReg()) << ", " << getRegisterName(Op1.getReg()) - << ", #" << Op2.getImm() << ", #" << Op3.getImm() - Op2.getImm() + 1; - printAnnotation(O, Annot); - return; - } - - if (Opcode == ARM64::BFMXri || Opcode == ARM64::BFMWri) { - const MCOperand &Op0 = MI->getOperand(0); // Op1 == Op0 - const MCOperand &Op2 = MI->getOperand(2); - int ImmR = MI->getOperand(3).getImm(); - int ImmS = MI->getOperand(4).getImm(); - - // BFI alias - if (ImmS < ImmR) { - int BitWidth = Opcode == ARM64::BFMXri ? 64 : 32; - int LSB = (BitWidth - ImmR) % BitWidth; - int Width = ImmS + 1; - O << "\tbfi\t" << getRegisterName(Op0.getReg()) << ", " - << getRegisterName(Op2.getReg()) << ", #" << LSB << ", #" << Width; - printAnnotation(O, Annot); - return; - } - - int LSB = ImmR; - int Width = ImmS - ImmR + 1; - // Otherwise BFXIL the preferred form - O << "\tbfxil\t" - << getRegisterName(Op0.getReg()) << ", " << getRegisterName(Op2.getReg()) - << ", #" << LSB << ", #" << Width; - printAnnotation(O, Annot); - return; - } - - // Symbolic operands for MOVZ, MOVN and MOVK already imply a shift - // (e.g. :gottprel_g1: is always going to be "lsl #16") so it should not be - // printed. 
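The bitfield handling above is dense: a single UBFM/SBFM encoding covers the lsl, lsr, asr, *bfiz and *bfx aliases, selected purely from the immr/imms pair. A standalone sketch of the 32-bit unsigned case, mirroring the selection order used above (ubfmAlias32 is an illustrative helper, not part of the printer):

#include <cstdio>
#include <string>

// Given the immr/imms fields of a 32-bit UBFM, return the alias the printer
// above would choose (destination/source registers omitted for brevity).
static std::string ubfmAlias32(unsigned immr, unsigned imms) {
  char Buf[64];
  if (imms != 0x1f && imms + 1 == immr)          // lsl #(31 - imms)
    std::snprintf(Buf, sizeof(Buf), "lsl #%u", 31 - imms);
  else if (imms == 0x1f)                         // lsr #immr
    std::snprintf(Buf, sizeof(Buf), "lsr #%u", immr);
  else if (imms >= immr)                         // ubfx #lsb, #width
    std::snprintf(Buf, sizeof(Buf), "ubfx #%u, #%u", immr, imms - immr + 1);
  else                                           // ubfiz #lsb, #width
    std::snprintf(Buf, sizeof(Buf), "ubfiz #%u, #%u", 32 - immr, imms + 1);
  return Buf;
}

int main() {
  std::printf("%s\n", ubfmAlias32(28, 27).c_str()); // lsl #4
  std::printf("%s\n", ubfmAlias32(4, 31).c_str());  // lsr #4
  std::printf("%s\n", ubfmAlias32(4, 11).c_str());  // ubfx #4, #8
  return 0;
}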
- if ((Opcode == ARM64::MOVZXi || Opcode == ARM64::MOVZWi || - Opcode == ARM64::MOVNXi || Opcode == ARM64::MOVNWi) && - MI->getOperand(1).isExpr()) { - if (Opcode == ARM64::MOVZXi || Opcode == ARM64::MOVZWi) - O << "\tmovz\t"; - else - O << "\tmovn\t"; - - O << getRegisterName(MI->getOperand(0).getReg()) << ", #" - << *MI->getOperand(1).getExpr(); - return; - } - - if ((Opcode == ARM64::MOVKXi || Opcode == ARM64::MOVKWi) && - MI->getOperand(2).isExpr()) { - O << "\tmovk\t" << getRegisterName(MI->getOperand(0).getReg()) << ", #" - << *MI->getOperand(2).getExpr(); - return; - } - - if (!printAliasInstr(MI, O)) - printInstruction(MI, O); - - printAnnotation(O, Annot); -} - -static bool isTblTbxInstruction(unsigned Opcode, StringRef &Layout, - bool &IsTbx) { - switch (Opcode) { - case ARM64::TBXv8i8One: - case ARM64::TBXv8i8Two: - case ARM64::TBXv8i8Three: - case ARM64::TBXv8i8Four: - IsTbx = true; - Layout = ".8b"; - return true; - case ARM64::TBLv8i8One: - case ARM64::TBLv8i8Two: - case ARM64::TBLv8i8Three: - case ARM64::TBLv8i8Four: - IsTbx = false; - Layout = ".8b"; - return true; - case ARM64::TBXv16i8One: - case ARM64::TBXv16i8Two: - case ARM64::TBXv16i8Three: - case ARM64::TBXv16i8Four: - IsTbx = true; - Layout = ".16b"; - return true; - case ARM64::TBLv16i8One: - case ARM64::TBLv16i8Two: - case ARM64::TBLv16i8Three: - case ARM64::TBLv16i8Four: - IsTbx = false; - Layout = ".16b"; - return true; - default: - return false; - } -} - -struct LdStNInstrDesc { - unsigned Opcode; - const char *Mnemonic; - const char *Layout; - int ListOperand; - bool HasLane; - int NaturalOffset; -}; - -static LdStNInstrDesc LdStNInstInfo[] = { - { ARM64::LD1i8, "ld1", ".b", 1, true, 0 }, - { ARM64::LD1i16, "ld1", ".h", 1, true, 0 }, - { ARM64::LD1i32, "ld1", ".s", 1, true, 0 }, - { ARM64::LD1i64, "ld1", ".d", 1, true, 0 }, - { ARM64::LD1i8_POST, "ld1", ".b", 2, true, 1 }, - { ARM64::LD1i16_POST, "ld1", ".h", 2, true, 2 }, - { ARM64::LD1i32_POST, "ld1", ".s", 2, true, 4 }, - { ARM64::LD1i64_POST, "ld1", ".d", 2, true, 8 }, - { ARM64::LD1Rv16b, "ld1r", ".16b", 0, false, 0 }, - { ARM64::LD1Rv8h, "ld1r", ".8h", 0, false, 0 }, - { ARM64::LD1Rv4s, "ld1r", ".4s", 0, false, 0 }, - { ARM64::LD1Rv2d, "ld1r", ".2d", 0, false, 0 }, - { ARM64::LD1Rv8b, "ld1r", ".8b", 0, false, 0 }, - { ARM64::LD1Rv4h, "ld1r", ".4h", 0, false, 0 }, - { ARM64::LD1Rv2s, "ld1r", ".2s", 0, false, 0 }, - { ARM64::LD1Rv1d, "ld1r", ".1d", 0, false, 0 }, - { ARM64::LD1Rv16b_POST, "ld1r", ".16b", 1, false, 1 }, - { ARM64::LD1Rv8h_POST, "ld1r", ".8h", 1, false, 2 }, - { ARM64::LD1Rv4s_POST, "ld1r", ".4s", 1, false, 4 }, - { ARM64::LD1Rv2d_POST, "ld1r", ".2d", 1, false, 8 }, - { ARM64::LD1Rv8b_POST, "ld1r", ".8b", 1, false, 1 }, - { ARM64::LD1Rv4h_POST, "ld1r", ".4h", 1, false, 2 }, - { ARM64::LD1Rv2s_POST, "ld1r", ".2s", 1, false, 4 }, - { ARM64::LD1Rv1d_POST, "ld1r", ".1d", 1, false, 8 }, - { ARM64::LD1Onev16b, "ld1", ".16b", 0, false, 0 }, - { ARM64::LD1Onev8h, "ld1", ".8h", 0, false, 0 }, - { ARM64::LD1Onev4s, "ld1", ".4s", 0, false, 0 }, - { ARM64::LD1Onev2d, "ld1", ".2d", 0, false, 0 }, - { ARM64::LD1Onev8b, "ld1", ".8b", 0, false, 0 }, - { ARM64::LD1Onev4h, "ld1", ".4h", 0, false, 0 }, - { ARM64::LD1Onev2s, "ld1", ".2s", 0, false, 0 }, - { ARM64::LD1Onev1d, "ld1", ".1d", 0, false, 0 }, - { ARM64::LD1Onev16b_POST, "ld1", ".16b", 1, false, 16 }, - { ARM64::LD1Onev8h_POST, "ld1", ".8h", 1, false, 16 }, - { ARM64::LD1Onev4s_POST, "ld1", ".4s", 1, false, 16 }, - { ARM64::LD1Onev2d_POST, "ld1", ".2d", 1, false, 16 }, - { ARM64::LD1Onev8b_POST, 
"ld1", ".8b", 1, false, 8 }, - { ARM64::LD1Onev4h_POST, "ld1", ".4h", 1, false, 8 }, - { ARM64::LD1Onev2s_POST, "ld1", ".2s", 1, false, 8 }, - { ARM64::LD1Onev1d_POST, "ld1", ".1d", 1, false, 8 }, - { ARM64::LD1Twov16b, "ld1", ".16b", 0, false, 0 }, - { ARM64::LD1Twov8h, "ld1", ".8h", 0, false, 0 }, - { ARM64::LD1Twov4s, "ld1", ".4s", 0, false, 0 }, - { ARM64::LD1Twov2d, "ld1", ".2d", 0, false, 0 }, - { ARM64::LD1Twov8b, "ld1", ".8b", 0, false, 0 }, - { ARM64::LD1Twov4h, "ld1", ".4h", 0, false, 0 }, - { ARM64::LD1Twov2s, "ld1", ".2s", 0, false, 0 }, - { ARM64::LD1Twov1d, "ld1", ".1d", 0, false, 0 }, - { ARM64::LD1Twov16b_POST, "ld1", ".16b", 1, false, 32 }, - { ARM64::LD1Twov8h_POST, "ld1", ".8h", 1, false, 32 }, - { ARM64::LD1Twov4s_POST, "ld1", ".4s", 1, false, 32 }, - { ARM64::LD1Twov2d_POST, "ld1", ".2d", 1, false, 32 }, - { ARM64::LD1Twov8b_POST, "ld1", ".8b", 1, false, 16 }, - { ARM64::LD1Twov4h_POST, "ld1", ".4h", 1, false, 16 }, - { ARM64::LD1Twov2s_POST, "ld1", ".2s", 1, false, 16 }, - { ARM64::LD1Twov1d_POST, "ld1", ".1d", 1, false, 16 }, - { ARM64::LD1Threev16b, "ld1", ".16b", 0, false, 0 }, - { ARM64::LD1Threev8h, "ld1", ".8h", 0, false, 0 }, - { ARM64::LD1Threev4s, "ld1", ".4s", 0, false, 0 }, - { ARM64::LD1Threev2d, "ld1", ".2d", 0, false, 0 }, - { ARM64::LD1Threev8b, "ld1", ".8b", 0, false, 0 }, - { ARM64::LD1Threev4h, "ld1", ".4h", 0, false, 0 }, - { ARM64::LD1Threev2s, "ld1", ".2s", 0, false, 0 }, - { ARM64::LD1Threev1d, "ld1", ".1d", 0, false, 0 }, - { ARM64::LD1Threev16b_POST, "ld1", ".16b", 1, false, 48 }, - { ARM64::LD1Threev8h_POST, "ld1", ".8h", 1, false, 48 }, - { ARM64::LD1Threev4s_POST, "ld1", ".4s", 1, false, 48 }, - { ARM64::LD1Threev2d_POST, "ld1", ".2d", 1, false, 48 }, - { ARM64::LD1Threev8b_POST, "ld1", ".8b", 1, false, 24 }, - { ARM64::LD1Threev4h_POST, "ld1", ".4h", 1, false, 24 }, - { ARM64::LD1Threev2s_POST, "ld1", ".2s", 1, false, 24 }, - { ARM64::LD1Threev1d_POST, "ld1", ".1d", 1, false, 24 }, - { ARM64::LD1Fourv16b, "ld1", ".16b", 0, false, 0 }, - { ARM64::LD1Fourv8h, "ld1", ".8h", 0, false, 0 }, - { ARM64::LD1Fourv4s, "ld1", ".4s", 0, false, 0 }, - { ARM64::LD1Fourv2d, "ld1", ".2d", 0, false, 0 }, - { ARM64::LD1Fourv8b, "ld1", ".8b", 0, false, 0 }, - { ARM64::LD1Fourv4h, "ld1", ".4h", 0, false, 0 }, - { ARM64::LD1Fourv2s, "ld1", ".2s", 0, false, 0 }, - { ARM64::LD1Fourv1d, "ld1", ".1d", 0, false, 0 }, - { ARM64::LD1Fourv16b_POST, "ld1", ".16b", 1, false, 64 }, - { ARM64::LD1Fourv8h_POST, "ld1", ".8h", 1, false, 64 }, - { ARM64::LD1Fourv4s_POST, "ld1", ".4s", 1, false, 64 }, - { ARM64::LD1Fourv2d_POST, "ld1", ".2d", 1, false, 64 }, - { ARM64::LD1Fourv8b_POST, "ld1", ".8b", 1, false, 32 }, - { ARM64::LD1Fourv4h_POST, "ld1", ".4h", 1, false, 32 }, - { ARM64::LD1Fourv2s_POST, "ld1", ".2s", 1, false, 32 }, - { ARM64::LD1Fourv1d_POST, "ld1", ".1d", 1, false, 32 }, - { ARM64::LD2i8, "ld2", ".b", 1, true, 0 }, - { ARM64::LD2i16, "ld2", ".h", 1, true, 0 }, - { ARM64::LD2i32, "ld2", ".s", 1, true, 0 }, - { ARM64::LD2i64, "ld2", ".d", 1, true, 0 }, - { ARM64::LD2i8_POST, "ld2", ".b", 2, true, 2 }, - { ARM64::LD2i16_POST, "ld2", ".h", 2, true, 4 }, - { ARM64::LD2i32_POST, "ld2", ".s", 2, true, 8 }, - { ARM64::LD2i64_POST, "ld2", ".d", 2, true, 16 }, - { ARM64::LD2Rv16b, "ld2r", ".16b", 0, false, 0 }, - { ARM64::LD2Rv8h, "ld2r", ".8h", 0, false, 0 }, - { ARM64::LD2Rv4s, "ld2r", ".4s", 0, false, 0 }, - { ARM64::LD2Rv2d, "ld2r", ".2d", 0, false, 0 }, - { ARM64::LD2Rv8b, "ld2r", ".8b", 0, false, 0 }, - { ARM64::LD2Rv4h, "ld2r", ".4h", 0, false, 0 }, - { 
ARM64::LD2Rv2s, "ld2r", ".2s", 0, false, 0 }, - { ARM64::LD2Rv1d, "ld2r", ".1d", 0, false, 0 }, - { ARM64::LD2Rv16b_POST, "ld2r", ".16b", 1, false, 2 }, - { ARM64::LD2Rv8h_POST, "ld2r", ".8h", 1, false, 4 }, - { ARM64::LD2Rv4s_POST, "ld2r", ".4s", 1, false, 8 }, - { ARM64::LD2Rv2d_POST, "ld2r", ".2d", 1, false, 16 }, - { ARM64::LD2Rv8b_POST, "ld2r", ".8b", 1, false, 2 }, - { ARM64::LD2Rv4h_POST, "ld2r", ".4h", 1, false, 4 }, - { ARM64::LD2Rv2s_POST, "ld2r", ".2s", 1, false, 8 }, - { ARM64::LD2Rv1d_POST, "ld2r", ".1d", 1, false, 16 }, - { ARM64::LD2Twov16b, "ld2", ".16b", 0, false, 0 }, - { ARM64::LD2Twov8h, "ld2", ".8h", 0, false, 0 }, - { ARM64::LD2Twov4s, "ld2", ".4s", 0, false, 0 }, - { ARM64::LD2Twov2d, "ld2", ".2d", 0, false, 0 }, - { ARM64::LD2Twov8b, "ld2", ".8b", 0, false, 0 }, - { ARM64::LD2Twov4h, "ld2", ".4h", 0, false, 0 }, - { ARM64::LD2Twov2s, "ld2", ".2s", 0, false, 0 }, - { ARM64::LD2Twov16b_POST, "ld2", ".16b", 1, false, 32 }, - { ARM64::LD2Twov8h_POST, "ld2", ".8h", 1, false, 32 }, - { ARM64::LD2Twov4s_POST, "ld2", ".4s", 1, false, 32 }, - { ARM64::LD2Twov2d_POST, "ld2", ".2d", 1, false, 32 }, - { ARM64::LD2Twov8b_POST, "ld2", ".8b", 1, false, 16 }, - { ARM64::LD2Twov4h_POST, "ld2", ".4h", 1, false, 16 }, - { ARM64::LD2Twov2s_POST, "ld2", ".2s", 1, false, 16 }, - { ARM64::LD3i8, "ld3", ".b", 1, true, 0 }, - { ARM64::LD3i16, "ld3", ".h", 1, true, 0 }, - { ARM64::LD3i32, "ld3", ".s", 1, true, 0 }, - { ARM64::LD3i64, "ld3", ".d", 1, true, 0 }, - { ARM64::LD3i8_POST, "ld3", ".b", 2, true, 3 }, - { ARM64::LD3i16_POST, "ld3", ".h", 2, true, 6 }, - { ARM64::LD3i32_POST, "ld3", ".s", 2, true, 12 }, - { ARM64::LD3i64_POST, "ld3", ".d", 2, true, 24 }, - { ARM64::LD3Rv16b, "ld3r", ".16b", 0, false, 0 }, - { ARM64::LD3Rv8h, "ld3r", ".8h", 0, false, 0 }, - { ARM64::LD3Rv4s, "ld3r", ".4s", 0, false, 0 }, - { ARM64::LD3Rv2d, "ld3r", ".2d", 0, false, 0 }, - { ARM64::LD3Rv8b, "ld3r", ".8b", 0, false, 0 }, - { ARM64::LD3Rv4h, "ld3r", ".4h", 0, false, 0 }, - { ARM64::LD3Rv2s, "ld3r", ".2s", 0, false, 0 }, - { ARM64::LD3Rv1d, "ld3r", ".1d", 0, false, 0 }, - { ARM64::LD3Rv16b_POST, "ld3r", ".16b", 1, false, 3 }, - { ARM64::LD3Rv8h_POST, "ld3r", ".8h", 1, false, 6 }, - { ARM64::LD3Rv4s_POST, "ld3r", ".4s", 1, false, 12 }, - { ARM64::LD3Rv2d_POST, "ld3r", ".2d", 1, false, 24 }, - { ARM64::LD3Rv8b_POST, "ld3r", ".8b", 1, false, 3 }, - { ARM64::LD3Rv4h_POST, "ld3r", ".4h", 1, false, 6 }, - { ARM64::LD3Rv2s_POST, "ld3r", ".2s", 1, false, 12 }, - { ARM64::LD3Rv1d_POST, "ld3r", ".1d", 1, false, 24 }, - { ARM64::LD3Threev16b, "ld3", ".16b", 0, false, 0 }, - { ARM64::LD3Threev8h, "ld3", ".8h", 0, false, 0 }, - { ARM64::LD3Threev4s, "ld3", ".4s", 0, false, 0 }, - { ARM64::LD3Threev2d, "ld3", ".2d", 0, false, 0 }, - { ARM64::LD3Threev8b, "ld3", ".8b", 0, false, 0 }, - { ARM64::LD3Threev4h, "ld3", ".4h", 0, false, 0 }, - { ARM64::LD3Threev2s, "ld3", ".2s", 0, false, 0 }, - { ARM64::LD3Threev16b_POST, "ld3", ".16b", 1, false, 48 }, - { ARM64::LD3Threev8h_POST, "ld3", ".8h", 1, false, 48 }, - { ARM64::LD3Threev4s_POST, "ld3", ".4s", 1, false, 48 }, - { ARM64::LD3Threev2d_POST, "ld3", ".2d", 1, false, 48 }, - { ARM64::LD3Threev8b_POST, "ld3", ".8b", 1, false, 24 }, - { ARM64::LD3Threev4h_POST, "ld3", ".4h", 1, false, 24 }, - { ARM64::LD3Threev2s_POST, "ld3", ".2s", 1, false, 24 }, - { ARM64::LD4i8, "ld4", ".b", 1, true, 0 }, - { ARM64::LD4i16, "ld4", ".h", 1, true, 0 }, - { ARM64::LD4i32, "ld4", ".s", 1, true, 0 }, - { ARM64::LD4i64, "ld4", ".d", 1, true, 0 }, - { ARM64::LD4i8_POST, "ld4", ".b", 2, true, 4 
}, - { ARM64::LD4i16_POST, "ld4", ".h", 2, true, 8 }, - { ARM64::LD4i32_POST, "ld4", ".s", 2, true, 16 }, - { ARM64::LD4i64_POST, "ld4", ".d", 2, true, 32 }, - { ARM64::LD4Rv16b, "ld4r", ".16b", 0, false, 0 }, - { ARM64::LD4Rv8h, "ld4r", ".8h", 0, false, 0 }, - { ARM64::LD4Rv4s, "ld4r", ".4s", 0, false, 0 }, - { ARM64::LD4Rv2d, "ld4r", ".2d", 0, false, 0 }, - { ARM64::LD4Rv8b, "ld4r", ".8b", 0, false, 0 }, - { ARM64::LD4Rv4h, "ld4r", ".4h", 0, false, 0 }, - { ARM64::LD4Rv2s, "ld4r", ".2s", 0, false, 0 }, - { ARM64::LD4Rv1d, "ld4r", ".1d", 0, false, 0 }, - { ARM64::LD4Rv16b_POST, "ld4r", ".16b", 1, false, 4 }, - { ARM64::LD4Rv8h_POST, "ld4r", ".8h", 1, false, 8 }, - { ARM64::LD4Rv4s_POST, "ld4r", ".4s", 1, false, 16 }, - { ARM64::LD4Rv2d_POST, "ld4r", ".2d", 1, false, 32 }, - { ARM64::LD4Rv8b_POST, "ld4r", ".8b", 1, false, 4 }, - { ARM64::LD4Rv4h_POST, "ld4r", ".4h", 1, false, 8 }, - { ARM64::LD4Rv2s_POST, "ld4r", ".2s", 1, false, 16 }, - { ARM64::LD4Rv1d_POST, "ld4r", ".1d", 1, false, 32 }, - { ARM64::LD4Fourv16b, "ld4", ".16b", 0, false, 0 }, - { ARM64::LD4Fourv8h, "ld4", ".8h", 0, false, 0 }, - { ARM64::LD4Fourv4s, "ld4", ".4s", 0, false, 0 }, - { ARM64::LD4Fourv2d, "ld4", ".2d", 0, false, 0 }, - { ARM64::LD4Fourv8b, "ld4", ".8b", 0, false, 0 }, - { ARM64::LD4Fourv4h, "ld4", ".4h", 0, false, 0 }, - { ARM64::LD4Fourv2s, "ld4", ".2s", 0, false, 0 }, - { ARM64::LD4Fourv16b_POST, "ld4", ".16b", 1, false, 64 }, - { ARM64::LD4Fourv8h_POST, "ld4", ".8h", 1, false, 64 }, - { ARM64::LD4Fourv4s_POST, "ld4", ".4s", 1, false, 64 }, - { ARM64::LD4Fourv2d_POST, "ld4", ".2d", 1, false, 64 }, - { ARM64::LD4Fourv8b_POST, "ld4", ".8b", 1, false, 32 }, - { ARM64::LD4Fourv4h_POST, "ld4", ".4h", 1, false, 32 }, - { ARM64::LD4Fourv2s_POST, "ld4", ".2s", 1, false, 32 }, - { ARM64::ST1i8, "st1", ".b", 0, true, 0 }, - { ARM64::ST1i16, "st1", ".h", 0, true, 0 }, - { ARM64::ST1i32, "st1", ".s", 0, true, 0 }, - { ARM64::ST1i64, "st1", ".d", 0, true, 0 }, - { ARM64::ST1i8_POST, "st1", ".b", 1, true, 1 }, - { ARM64::ST1i16_POST, "st1", ".h", 1, true, 2 }, - { ARM64::ST1i32_POST, "st1", ".s", 1, true, 4 }, - { ARM64::ST1i64_POST, "st1", ".d", 1, true, 8 }, - { ARM64::ST1Onev16b, "st1", ".16b", 0, false, 0 }, - { ARM64::ST1Onev8h, "st1", ".8h", 0, false, 0 }, - { ARM64::ST1Onev4s, "st1", ".4s", 0, false, 0 }, - { ARM64::ST1Onev2d, "st1", ".2d", 0, false, 0 }, - { ARM64::ST1Onev8b, "st1", ".8b", 0, false, 0 }, - { ARM64::ST1Onev4h, "st1", ".4h", 0, false, 0 }, - { ARM64::ST1Onev2s, "st1", ".2s", 0, false, 0 }, - { ARM64::ST1Onev1d, "st1", ".1d", 0, false, 0 }, - { ARM64::ST1Onev16b_POST, "st1", ".16b", 1, false, 16 }, - { ARM64::ST1Onev8h_POST, "st1", ".8h", 1, false, 16 }, - { ARM64::ST1Onev4s_POST, "st1", ".4s", 1, false, 16 }, - { ARM64::ST1Onev2d_POST, "st1", ".2d", 1, false, 16 }, - { ARM64::ST1Onev8b_POST, "st1", ".8b", 1, false, 8 }, - { ARM64::ST1Onev4h_POST, "st1", ".4h", 1, false, 8 }, - { ARM64::ST1Onev2s_POST, "st1", ".2s", 1, false, 8 }, - { ARM64::ST1Onev1d_POST, "st1", ".1d", 1, false, 8 }, - { ARM64::ST1Twov16b, "st1", ".16b", 0, false, 0 }, - { ARM64::ST1Twov8h, "st1", ".8h", 0, false, 0 }, - { ARM64::ST1Twov4s, "st1", ".4s", 0, false, 0 }, - { ARM64::ST1Twov2d, "st1", ".2d", 0, false, 0 }, - { ARM64::ST1Twov8b, "st1", ".8b", 0, false, 0 }, - { ARM64::ST1Twov4h, "st1", ".4h", 0, false, 0 }, - { ARM64::ST1Twov2s, "st1", ".2s", 0, false, 0 }, - { ARM64::ST1Twov1d, "st1", ".1d", 0, false, 0 }, - { ARM64::ST1Twov16b_POST, "st1", ".16b", 1, false, 32 }, - { ARM64::ST1Twov8h_POST, "st1", ".8h", 1, false, 32 
}, - { ARM64::ST1Twov4s_POST, "st1", ".4s", 1, false, 32 }, - { ARM64::ST1Twov2d_POST, "st1", ".2d", 1, false, 32 }, - { ARM64::ST1Twov8b_POST, "st1", ".8b", 1, false, 16 }, - { ARM64::ST1Twov4h_POST, "st1", ".4h", 1, false, 16 }, - { ARM64::ST1Twov2s_POST, "st1", ".2s", 1, false, 16 }, - { ARM64::ST1Twov1d_POST, "st1", ".1d", 1, false, 16 }, - { ARM64::ST1Threev16b, "st1", ".16b", 0, false, 0 }, - { ARM64::ST1Threev8h, "st1", ".8h", 0, false, 0 }, - { ARM64::ST1Threev4s, "st1", ".4s", 0, false, 0 }, - { ARM64::ST1Threev2d, "st1", ".2d", 0, false, 0 }, - { ARM64::ST1Threev8b, "st1", ".8b", 0, false, 0 }, - { ARM64::ST1Threev4h, "st1", ".4h", 0, false, 0 }, - { ARM64::ST1Threev2s, "st1", ".2s", 0, false, 0 }, - { ARM64::ST1Threev1d, "st1", ".1d", 0, false, 0 }, - { ARM64::ST1Threev16b_POST, "st1", ".16b", 1, false, 48 }, - { ARM64::ST1Threev8h_POST, "st1", ".8h", 1, false, 48 }, - { ARM64::ST1Threev4s_POST, "st1", ".4s", 1, false, 48 }, - { ARM64::ST1Threev2d_POST, "st1", ".2d", 1, false, 48 }, - { ARM64::ST1Threev8b_POST, "st1", ".8b", 1, false, 24 }, - { ARM64::ST1Threev4h_POST, "st1", ".4h", 1, false, 24 }, - { ARM64::ST1Threev2s_POST, "st1", ".2s", 1, false, 24 }, - { ARM64::ST1Threev1d_POST, "st1", ".1d", 1, false, 24 }, - { ARM64::ST1Fourv16b, "st1", ".16b", 0, false, 0 }, - { ARM64::ST1Fourv8h, "st1", ".8h", 0, false, 0 }, - { ARM64::ST1Fourv4s, "st1", ".4s", 0, false, 0 }, - { ARM64::ST1Fourv2d, "st1", ".2d", 0, false, 0 }, - { ARM64::ST1Fourv8b, "st1", ".8b", 0, false, 0 }, - { ARM64::ST1Fourv4h, "st1", ".4h", 0, false, 0 }, - { ARM64::ST1Fourv2s, "st1", ".2s", 0, false, 0 }, - { ARM64::ST1Fourv1d, "st1", ".1d", 0, false, 0 }, - { ARM64::ST1Fourv16b_POST, "st1", ".16b", 1, false, 64 }, - { ARM64::ST1Fourv8h_POST, "st1", ".8h", 1, false, 64 }, - { ARM64::ST1Fourv4s_POST, "st1", ".4s", 1, false, 64 }, - { ARM64::ST1Fourv2d_POST, "st1", ".2d", 1, false, 64 }, - { ARM64::ST1Fourv8b_POST, "st1", ".8b", 1, false, 32 }, - { ARM64::ST1Fourv4h_POST, "st1", ".4h", 1, false, 32 }, - { ARM64::ST1Fourv2s_POST, "st1", ".2s", 1, false, 32 }, - { ARM64::ST1Fourv1d_POST, "st1", ".1d", 1, false, 32 }, - { ARM64::ST2i8, "st2", ".b", 0, true, 0 }, - { ARM64::ST2i16, "st2", ".h", 0, true, 0 }, - { ARM64::ST2i32, "st2", ".s", 0, true, 0 }, - { ARM64::ST2i64, "st2", ".d", 0, true, 0 }, - { ARM64::ST2i8_POST, "st2", ".b", 1, true, 2 }, - { ARM64::ST2i16_POST, "st2", ".h", 1, true, 4 }, - { ARM64::ST2i32_POST, "st2", ".s", 1, true, 8 }, - { ARM64::ST2i64_POST, "st2", ".d", 1, true, 16 }, - { ARM64::ST2Twov16b, "st2", ".16b", 0, false, 0 }, - { ARM64::ST2Twov8h, "st2", ".8h", 0, false, 0 }, - { ARM64::ST2Twov4s, "st2", ".4s", 0, false, 0 }, - { ARM64::ST2Twov2d, "st2", ".2d", 0, false, 0 }, - { ARM64::ST2Twov8b, "st2", ".8b", 0, false, 0 }, - { ARM64::ST2Twov4h, "st2", ".4h", 0, false, 0 }, - { ARM64::ST2Twov2s, "st2", ".2s", 0, false, 0 }, - { ARM64::ST2Twov16b_POST, "st2", ".16b", 1, false, 32 }, - { ARM64::ST2Twov8h_POST, "st2", ".8h", 1, false, 32 }, - { ARM64::ST2Twov4s_POST, "st2", ".4s", 1, false, 32 }, - { ARM64::ST2Twov2d_POST, "st2", ".2d", 1, false, 32 }, - { ARM64::ST2Twov8b_POST, "st2", ".8b", 1, false, 16 }, - { ARM64::ST2Twov4h_POST, "st2", ".4h", 1, false, 16 }, - { ARM64::ST2Twov2s_POST, "st2", ".2s", 1, false, 16 }, - { ARM64::ST3i8, "st3", ".b", 0, true, 0 }, - { ARM64::ST3i16, "st3", ".h", 0, true, 0 }, - { ARM64::ST3i32, "st3", ".s", 0, true, 0 }, - { ARM64::ST3i64, "st3", ".d", 0, true, 0 }, - { ARM64::ST3i8_POST, "st3", ".b", 1, true, 3 }, - { ARM64::ST3i16_POST, "st3", ".h", 1, true, 
6 }, - { ARM64::ST3i32_POST, "st3", ".s", 1, true, 12 }, - { ARM64::ST3i64_POST, "st3", ".d", 1, true, 24 }, - { ARM64::ST3Threev16b, "st3", ".16b", 0, false, 0 }, - { ARM64::ST3Threev8h, "st3", ".8h", 0, false, 0 }, - { ARM64::ST3Threev4s, "st3", ".4s", 0, false, 0 }, - { ARM64::ST3Threev2d, "st3", ".2d", 0, false, 0 }, - { ARM64::ST3Threev8b, "st3", ".8b", 0, false, 0 }, - { ARM64::ST3Threev4h, "st3", ".4h", 0, false, 0 }, - { ARM64::ST3Threev2s, "st3", ".2s", 0, false, 0 }, - { ARM64::ST3Threev16b_POST, "st3", ".16b", 1, false, 48 }, - { ARM64::ST3Threev8h_POST, "st3", ".8h", 1, false, 48 }, - { ARM64::ST3Threev4s_POST, "st3", ".4s", 1, false, 48 }, - { ARM64::ST3Threev2d_POST, "st3", ".2d", 1, false, 48 }, - { ARM64::ST3Threev8b_POST, "st3", ".8b", 1, false, 24 }, - { ARM64::ST3Threev4h_POST, "st3", ".4h", 1, false, 24 }, - { ARM64::ST3Threev2s_POST, "st3", ".2s", 1, false, 24 }, - { ARM64::ST4i8, "st4", ".b", 0, true, 0 }, - { ARM64::ST4i16, "st4", ".h", 0, true, 0 }, - { ARM64::ST4i32, "st4", ".s", 0, true, 0 }, - { ARM64::ST4i64, "st4", ".d", 0, true, 0 }, - { ARM64::ST4i8_POST, "st4", ".b", 1, true, 4 }, - { ARM64::ST4i16_POST, "st4", ".h", 1, true, 8 }, - { ARM64::ST4i32_POST, "st4", ".s", 1, true, 16 }, - { ARM64::ST4i64_POST, "st4", ".d", 1, true, 32 }, - { ARM64::ST4Fourv16b, "st4", ".16b", 0, false, 0 }, - { ARM64::ST4Fourv8h, "st4", ".8h", 0, false, 0 }, - { ARM64::ST4Fourv4s, "st4", ".4s", 0, false, 0 }, - { ARM64::ST4Fourv2d, "st4", ".2d", 0, false, 0 }, - { ARM64::ST4Fourv8b, "st4", ".8b", 0, false, 0 }, - { ARM64::ST4Fourv4h, "st4", ".4h", 0, false, 0 }, - { ARM64::ST4Fourv2s, "st4", ".2s", 0, false, 0 }, - { ARM64::ST4Fourv16b_POST, "st4", ".16b", 1, false, 64 }, - { ARM64::ST4Fourv8h_POST, "st4", ".8h", 1, false, 64 }, - { ARM64::ST4Fourv4s_POST, "st4", ".4s", 1, false, 64 }, - { ARM64::ST4Fourv2d_POST, "st4", ".2d", 1, false, 64 }, - { ARM64::ST4Fourv8b_POST, "st4", ".8b", 1, false, 32 }, - { ARM64::ST4Fourv4h_POST, "st4", ".4h", 1, false, 32 }, - { ARM64::ST4Fourv2s_POST, "st4", ".2s", 1, false, 32 }, -}; - -static LdStNInstrDesc *getLdStNInstrDesc(unsigned Opcode) { - unsigned Idx; - for (Idx = 0; Idx != array_lengthof(LdStNInstInfo); ++Idx) - if (LdStNInstInfo[Idx].Opcode == Opcode) - return &LdStNInstInfo[Idx]; - - return nullptr; -} - -void ARM64AppleInstPrinter::printInst(const MCInst *MI, raw_ostream &O, - StringRef Annot) { - unsigned Opcode = MI->getOpcode(); - StringRef Layout, Mnemonic; - - bool IsTbx; - if (isTblTbxInstruction(MI->getOpcode(), Layout, IsTbx)) { - O << "\t" << (IsTbx ? "tbx" : "tbl") << Layout << '\t' - << getRegisterName(MI->getOperand(0).getReg(), ARM64::vreg) << ", "; - - unsigned ListOpNum = IsTbx ? 2 : 1; - printVectorList(MI, ListOpNum, O, ""); - - O << ", " - << getRegisterName(MI->getOperand(ListOpNum + 1).getReg(), ARM64::vreg); - printAnnotation(O, Annot); - return; - } - - if (LdStNInstrDesc *LdStDesc = getLdStNInstrDesc(Opcode)) { - O << "\t" << LdStDesc->Mnemonic << LdStDesc->Layout << '\t'; - - // Now onto the operands: first a vector list with possible lane - // specifier. E.g. { v0 }[2] - int OpNum = LdStDesc->ListOperand; - printVectorList(MI, OpNum++, O, ""); - - if (LdStDesc->HasLane) - O << '[' << MI->getOperand(OpNum++).getImm() << ']'; - - // Next the address: [xN] - unsigned AddrReg = MI->getOperand(OpNum++).getReg(); - O << ", [" << getRegisterName(AddrReg) << ']'; - - // Finally, there might be a post-indexed offset. 
- if (LdStDesc->NaturalOffset != 0) { - unsigned Reg = MI->getOperand(OpNum++).getReg(); - if (Reg != ARM64::XZR) - O << ", " << getRegisterName(Reg); - else { - assert(LdStDesc->NaturalOffset && "no offset on post-inc instruction?"); - O << ", #" << LdStDesc->NaturalOffset; - } - } - - printAnnotation(O, Annot); - return; - } - - ARM64InstPrinter::printInst(MI, O, Annot); -} - -bool ARM64InstPrinter::printSysAlias(const MCInst *MI, raw_ostream &O) { -#ifndef NDEBUG - unsigned Opcode = MI->getOpcode(); - assert(Opcode == ARM64::SYSxt && "Invalid opcode for SYS alias!"); -#endif - - const char *Asm = nullptr; - const MCOperand &Op1 = MI->getOperand(0); - const MCOperand &Cn = MI->getOperand(1); - const MCOperand &Cm = MI->getOperand(2); - const MCOperand &Op2 = MI->getOperand(3); - - unsigned Op1Val = Op1.getImm(); - unsigned CnVal = Cn.getImm(); - unsigned CmVal = Cm.getImm(); - unsigned Op2Val = Op2.getImm(); - - if (CnVal == 7) { - switch (CmVal) { - default: - break; - - // IC aliases - case 1: - if (Op1Val == 0 && Op2Val == 0) - Asm = "ic\tialluis"; - break; - case 5: - if (Op1Val == 0 && Op2Val == 0) - Asm = "ic\tiallu"; - else if (Op1Val == 3 && Op2Val == 1) - Asm = "ic\tivau"; - break; - - // DC aliases - case 4: - if (Op1Val == 3 && Op2Val == 1) - Asm = "dc\tzva"; - break; - case 6: - if (Op1Val == 0 && Op2Val == 1) - Asm = "dc\tivac"; - if (Op1Val == 0 && Op2Val == 2) - Asm = "dc\tisw"; - break; - case 10: - if (Op1Val == 3 && Op2Val == 1) - Asm = "dc\tcvac"; - else if (Op1Val == 0 && Op2Val == 2) - Asm = "dc\tcsw"; - break; - case 11: - if (Op1Val == 3 && Op2Val == 1) - Asm = "dc\tcvau"; - break; - case 14: - if (Op1Val == 3 && Op2Val == 1) - Asm = "dc\tcivac"; - else if (Op1Val == 0 && Op2Val == 2) - Asm = "dc\tcisw"; - break; - - // AT aliases - case 8: - switch (Op1Val) { - default: - break; - case 0: - switch (Op2Val) { - default: - break; - case 0: Asm = "at\ts1e1r"; break; - case 1: Asm = "at\ts1e1w"; break; - case 2: Asm = "at\ts1e0r"; break; - case 3: Asm = "at\ts1e0w"; break; - } - break; - case 4: - switch (Op2Val) { - default: - break; - case 0: Asm = "at\ts1e2r"; break; - case 1: Asm = "at\ts1e2w"; break; - case 4: Asm = "at\ts12e1r"; break; - case 5: Asm = "at\ts12e1w"; break; - case 6: Asm = "at\ts12e0r"; break; - case 7: Asm = "at\ts12e0w"; break; - } - break; - case 6: - switch (Op2Val) { - default: - break; - case 0: Asm = "at\ts1e3r"; break; - case 1: Asm = "at\ts1e3w"; break; - } - break; - } - break; - } - } else if (CnVal == 8) { - // TLBI aliases - switch (CmVal) { - default: - break; - case 3: - switch (Op1Val) { - default: - break; - case 0: - switch (Op2Val) { - default: - break; - case 0: Asm = "tlbi\tvmalle1is"; break; - case 1: Asm = "tlbi\tvae1is"; break; - case 2: Asm = "tlbi\taside1is"; break; - case 3: Asm = "tlbi\tvaae1is"; break; - case 5: Asm = "tlbi\tvale1is"; break; - case 7: Asm = "tlbi\tvaale1is"; break; - } - break; - case 4: - switch (Op2Val) { - default: - break; - case 0: Asm = "tlbi\talle2is"; break; - case 1: Asm = "tlbi\tvae2is"; break; - case 4: Asm = "tlbi\talle1is"; break; - case 5: Asm = "tlbi\tvale2is"; break; - case 6: Asm = "tlbi\tvmalls12e1is"; break; - } - break; - case 6: - switch (Op2Val) { - default: - break; - case 0: Asm = "tlbi\talle3is"; break; - case 1: Asm = "tlbi\tvae3is"; break; - case 5: Asm = "tlbi\tvale3is"; break; - } - break; - } - break; - case 0: - switch (Op1Val) { - default: - break; - case 4: - switch (Op2Val) { - default: - break; - case 1: Asm = "tlbi\tipas2e1is"; break; - case 5: Asm = 
"tlbi\tipas2le1is"; break; - } - break; - } - break; - case 4: - switch (Op1Val) { - default: - break; - case 4: - switch (Op2Val) { - default: - break; - case 1: Asm = "tlbi\tipas2e1"; break; - case 5: Asm = "tlbi\tipas2le1"; break; - } - break; - } - break; - case 7: - switch (Op1Val) { - default: - break; - case 0: - switch (Op2Val) { - default: - break; - case 0: Asm = "tlbi\tvmalle1"; break; - case 1: Asm = "tlbi\tvae1"; break; - case 2: Asm = "tlbi\taside1"; break; - case 3: Asm = "tlbi\tvaae1"; break; - case 5: Asm = "tlbi\tvale1"; break; - case 7: Asm = "tlbi\tvaale1"; break; - } - break; - case 4: - switch (Op2Val) { - default: - break; - case 0: Asm = "tlbi\talle2"; break; - case 1: Asm = "tlbi\tvae2"; break; - case 4: Asm = "tlbi\talle1"; break; - case 5: Asm = "tlbi\tvale2"; break; - case 6: Asm = "tlbi\tvmalls12e1"; break; - } - break; - case 6: - switch (Op2Val) { - default: - break; - case 0: Asm = "tlbi\talle3"; break; - case 1: Asm = "tlbi\tvae3"; break; - case 5: Asm = "tlbi\tvale3"; break; - } - break; - } - break; - } - } - - if (Asm) { - unsigned Reg = MI->getOperand(4).getReg(); - - O << '\t' << Asm; - if (StringRef(Asm).lower().find("all") == StringRef::npos) - O << ", " << getRegisterName(Reg); - } - - return Asm != nullptr; -} - -void ARM64InstPrinter::printOperand(const MCInst *MI, unsigned OpNo, - raw_ostream &O) { - const MCOperand &Op = MI->getOperand(OpNo); - if (Op.isReg()) { - unsigned Reg = Op.getReg(); - O << getRegisterName(Reg); - } else if (Op.isImm()) { - O << '#' << Op.getImm(); - } else { - assert(Op.isExpr() && "unknown operand kind in printOperand"); - O << *Op.getExpr(); - } -} - -void ARM64InstPrinter::printHexImm(const MCInst *MI, unsigned OpNo, - raw_ostream &O) { - const MCOperand &Op = MI->getOperand(OpNo); - O << format("#%#llx", Op.getImm()); -} - -void ARM64InstPrinter::printPostIncOperand(const MCInst *MI, unsigned OpNo, - unsigned Imm, raw_ostream &O) { - const MCOperand &Op = MI->getOperand(OpNo); - if (Op.isReg()) { - unsigned Reg = Op.getReg(); - if (Reg == ARM64::XZR) - O << "#" << Imm; - else - O << getRegisterName(Reg); - } else - assert(0 && "unknown operand kind in printPostIncOperand64"); -} - -void ARM64InstPrinter::printVRegOperand(const MCInst *MI, unsigned OpNo, - raw_ostream &O) { - const MCOperand &Op = MI->getOperand(OpNo); - assert(Op.isReg() && "Non-register vreg operand!"); - unsigned Reg = Op.getReg(); - O << getRegisterName(Reg, ARM64::vreg); -} - -void ARM64InstPrinter::printSysCROperand(const MCInst *MI, unsigned OpNo, - raw_ostream &O) { - const MCOperand &Op = MI->getOperand(OpNo); - assert(Op.isImm() && "System instruction C[nm] operands must be immediates!"); - O << "c" << Op.getImm(); -} - -void ARM64InstPrinter::printAddSubImm(const MCInst *MI, unsigned OpNum, - raw_ostream &O) { - const MCOperand &MO = MI->getOperand(OpNum); - if (MO.isImm()) { - unsigned Val = (MO.getImm() & 0xfff); - assert(Val == MO.getImm() && "Add/sub immediate out of range!"); - unsigned Shift = - ARM64_AM::getShiftValue(MI->getOperand(OpNum + 1).getImm()); - O << '#' << Val; - if (Shift != 0) - printShifter(MI, OpNum + 1, O); - - if (CommentStream) - *CommentStream << '=' << (Val << Shift) << '\n'; - } else { - assert(MO.isExpr() && "Unexpected operand type!"); - O << *MO.getExpr(); - printShifter(MI, OpNum + 1, O); - } -} - -void ARM64InstPrinter::printLogicalImm32(const MCInst *MI, unsigned OpNum, - raw_ostream &O) { - uint64_t Val = MI->getOperand(OpNum).getImm(); - O << "#0x"; - O.write_hex(ARM64_AM::decodeLogicalImmediate(Val, 
32)); -} - -void ARM64InstPrinter::printLogicalImm64(const MCInst *MI, unsigned OpNum, - raw_ostream &O) { - uint64_t Val = MI->getOperand(OpNum).getImm(); - O << "#0x"; - O.write_hex(ARM64_AM::decodeLogicalImmediate(Val, 64)); -} - -void ARM64InstPrinter::printShifter(const MCInst *MI, unsigned OpNum, - raw_ostream &O) { - unsigned Val = MI->getOperand(OpNum).getImm(); - // LSL #0 should not be printed. - if (ARM64_AM::getShiftType(Val) == ARM64_AM::LSL && - ARM64_AM::getShiftValue(Val) == 0) - return; - O << ", " << ARM64_AM::getShiftExtendName(ARM64_AM::getShiftType(Val)) << " #" - << ARM64_AM::getShiftValue(Val); -} - -void ARM64InstPrinter::printShiftedRegister(const MCInst *MI, unsigned OpNum, - raw_ostream &O) { - O << getRegisterName(MI->getOperand(OpNum).getReg()); - printShifter(MI, OpNum + 1, O); -} - -void ARM64InstPrinter::printExtendedRegister(const MCInst *MI, unsigned OpNum, - raw_ostream &O) { - O << getRegisterName(MI->getOperand(OpNum).getReg()); - printArithExtend(MI, OpNum + 1, O); -} - -void ARM64InstPrinter::printArithExtend(const MCInst *MI, unsigned OpNum, - raw_ostream &O) { - unsigned Val = MI->getOperand(OpNum).getImm(); - ARM64_AM::ShiftExtendType ExtType = ARM64_AM::getArithExtendType(Val); - unsigned ShiftVal = ARM64_AM::getArithShiftValue(Val); - - // If the destination or first source register operand is [W]SP, print - // UXTW/UXTX as LSL, and if the shift amount is also zero, print nothing at - // all. - if (ExtType == ARM64_AM::UXTW || ExtType == ARM64_AM::UXTX) { - unsigned Dest = MI->getOperand(0).getReg(); - unsigned Src1 = MI->getOperand(1).getReg(); - if ( ((Dest == ARM64::SP || Src1 == ARM64::SP) && - ExtType == ARM64_AM::UXTX) || - ((Dest == ARM64::WSP || Src1 == ARM64::WSP) && - ExtType == ARM64_AM::UXTW) ) { - if (ShiftVal != 0) - O << ", lsl #" << ShiftVal; - return; - } - } - O << ", " << ARM64_AM::getShiftExtendName(ExtType); - if (ShiftVal != 0) - O << " #" << ShiftVal; -} - -void ARM64InstPrinter::printMemExtend(const MCInst *MI, unsigned OpNum, - raw_ostream &O, char SrcRegKind, - unsigned Width) { - unsigned SignExtend = MI->getOperand(OpNum).getImm(); - unsigned DoShift = MI->getOperand(OpNum + 1).getImm(); - - // sxtw, sxtx, uxtw or lsl (== uxtx) - bool IsLSL = !SignExtend && SrcRegKind == 'x'; - if (IsLSL) - O << "lsl"; - else - O << (SignExtend ? 
's' : 'u') << "xt" << SrcRegKind; - - if (DoShift || IsLSL) - O << " #" << Log2_32(Width / 8); -} - -void ARM64InstPrinter::printCondCode(const MCInst *MI, unsigned OpNum, - raw_ostream &O) { - ARM64CC::CondCode CC = (ARM64CC::CondCode)MI->getOperand(OpNum).getImm(); - O << ARM64CC::getCondCodeName(CC); -} - -void ARM64InstPrinter::printInverseCondCode(const MCInst *MI, unsigned OpNum, - raw_ostream &O) { - ARM64CC::CondCode CC = (ARM64CC::CondCode)MI->getOperand(OpNum).getImm(); - O << ARM64CC::getCondCodeName(ARM64CC::getInvertedCondCode(CC)); -} - -void ARM64InstPrinter::printAMNoIndex(const MCInst *MI, unsigned OpNum, - raw_ostream &O) { - O << '[' << getRegisterName(MI->getOperand(OpNum).getReg()) << ']'; -} - -template -void ARM64InstPrinter::printImmScale(const MCInst *MI, unsigned OpNum, - raw_ostream &O) { - O << '#' << Scale * MI->getOperand(OpNum).getImm(); -} - -void ARM64InstPrinter::printUImm12Offset(const MCInst *MI, unsigned OpNum, - unsigned Scale, raw_ostream &O) { - const MCOperand MO = MI->getOperand(OpNum); - if (MO.isImm()) { - O << "#" << (MO.getImm() * Scale); - } else { - assert(MO.isExpr() && "Unexpected operand type!"); - O << *MO.getExpr(); - } -} - -void ARM64InstPrinter::printAMIndexedWB(const MCInst *MI, unsigned OpNum, - unsigned Scale, raw_ostream &O) { - const MCOperand MO1 = MI->getOperand(OpNum + 1); - O << '[' << getRegisterName(MI->getOperand(OpNum).getReg()); - if (MO1.isImm()) { - O << ", #" << (MO1.getImm() * Scale); - } else { - assert(MO1.isExpr() && "Unexpected operand type!"); - O << ", " << *MO1.getExpr(); - } - O << ']'; -} - -void ARM64InstPrinter::printPrefetchOp(const MCInst *MI, unsigned OpNum, - raw_ostream &O) { - unsigned prfop = MI->getOperand(OpNum).getImm(); - bool Valid; - StringRef Name = ARM64PRFM::PRFMMapper().toString(prfop, Valid); - if (Valid) - O << Name; - else - O << '#' << prfop; -} - -void ARM64InstPrinter::printFPImmOperand(const MCInst *MI, unsigned OpNum, - raw_ostream &O) { - const MCOperand &MO = MI->getOperand(OpNum); - float FPImm = MO.isFPImm() ? MO.getFPImm() : ARM64_AM::getFPImmFloat(MO.getImm()); - - // 8 decimal places are enough to perfectly represent permitted floats. 
- O << format("#%.8f", FPImm); -} - -static unsigned getNextVectorRegister(unsigned Reg, unsigned Stride = 1) { - while (Stride--) { - switch (Reg) { - default: - assert(0 && "Vector register expected!"); - case ARM64::Q0: Reg = ARM64::Q1; break; - case ARM64::Q1: Reg = ARM64::Q2; break; - case ARM64::Q2: Reg = ARM64::Q3; break; - case ARM64::Q3: Reg = ARM64::Q4; break; - case ARM64::Q4: Reg = ARM64::Q5; break; - case ARM64::Q5: Reg = ARM64::Q6; break; - case ARM64::Q6: Reg = ARM64::Q7; break; - case ARM64::Q7: Reg = ARM64::Q8; break; - case ARM64::Q8: Reg = ARM64::Q9; break; - case ARM64::Q9: Reg = ARM64::Q10; break; - case ARM64::Q10: Reg = ARM64::Q11; break; - case ARM64::Q11: Reg = ARM64::Q12; break; - case ARM64::Q12: Reg = ARM64::Q13; break; - case ARM64::Q13: Reg = ARM64::Q14; break; - case ARM64::Q14: Reg = ARM64::Q15; break; - case ARM64::Q15: Reg = ARM64::Q16; break; - case ARM64::Q16: Reg = ARM64::Q17; break; - case ARM64::Q17: Reg = ARM64::Q18; break; - case ARM64::Q18: Reg = ARM64::Q19; break; - case ARM64::Q19: Reg = ARM64::Q20; break; - case ARM64::Q20: Reg = ARM64::Q21; break; - case ARM64::Q21: Reg = ARM64::Q22; break; - case ARM64::Q22: Reg = ARM64::Q23; break; - case ARM64::Q23: Reg = ARM64::Q24; break; - case ARM64::Q24: Reg = ARM64::Q25; break; - case ARM64::Q25: Reg = ARM64::Q26; break; - case ARM64::Q26: Reg = ARM64::Q27; break; - case ARM64::Q27: Reg = ARM64::Q28; break; - case ARM64::Q28: Reg = ARM64::Q29; break; - case ARM64::Q29: Reg = ARM64::Q30; break; - case ARM64::Q30: Reg = ARM64::Q31; break; - // Vector lists can wrap around. - case ARM64::Q31: - Reg = ARM64::Q0; - break; - } - } - return Reg; -} - -void ARM64InstPrinter::printVectorList(const MCInst *MI, unsigned OpNum, - raw_ostream &O, StringRef LayoutSuffix) { - unsigned Reg = MI->getOperand(OpNum).getReg(); - - O << "{ "; - - // Work out how many registers there are in the list (if there is an actual - // list). - unsigned NumRegs = 1; - if (MRI.getRegClass(ARM64::DDRegClassID).contains(Reg) || - MRI.getRegClass(ARM64::QQRegClassID).contains(Reg)) - NumRegs = 2; - else if (MRI.getRegClass(ARM64::DDDRegClassID).contains(Reg) || - MRI.getRegClass(ARM64::QQQRegClassID).contains(Reg)) - NumRegs = 3; - else if (MRI.getRegClass(ARM64::DDDDRegClassID).contains(Reg) || - MRI.getRegClass(ARM64::QQQQRegClassID).contains(Reg)) - NumRegs = 4; - - // Now forget about the list and find out what the first register is. - if (unsigned FirstReg = MRI.getSubReg(Reg, ARM64::dsub0)) - Reg = FirstReg; - else if (unsigned FirstReg = MRI.getSubReg(Reg, ARM64::qsub0)) - Reg = FirstReg; - - // If it's a D-reg, we need to promote it to the equivalent Q-reg before - // printing (otherwise getRegisterName fails). 
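getNextVectorRegister above spells out the Q0..Q31 walk as a switch because the generated register enumerators are not guaranteed to be contiguous. If one tracks the register index rather than the enum value (an assumption made only for this sketch), the same wrap-around is plain modular arithmetic:

#include <cstdio>

// Advance a vector register *index* by Stride, wrapping q31 back to q0 the
// way vector lists above are allowed to.
static unsigned nextVectorIndex(unsigned QIndex, unsigned Stride = 1) {
  return (QIndex + Stride) % 32;
}

int main() {
  std::printf("q%u\n", nextVectorIndex(30));    // q31
  std::printf("q%u\n", nextVectorIndex(31));    // q0, wraps
  std::printf("q%u\n", nextVectorIndex(30, 3)); // q1, wraps past the end
  return 0;
}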
- if (MRI.getRegClass(ARM64::FPR64RegClassID).contains(Reg)) { - const MCRegisterClass &FPR128RC = MRI.getRegClass(ARM64::FPR128RegClassID); - Reg = MRI.getMatchingSuperReg(Reg, ARM64::dsub, &FPR128RC); - } - - for (unsigned i = 0; i < NumRegs; ++i, Reg = getNextVectorRegister(Reg)) { - O << getRegisterName(Reg, ARM64::vreg) << LayoutSuffix; - if (i + 1 != NumRegs) - O << ", "; - } - - O << " }"; -} - -void ARM64InstPrinter::printImplicitlyTypedVectorList(const MCInst *MI, - unsigned OpNum, - raw_ostream &O) { - printVectorList(MI, OpNum, O, ""); -} - -template -void ARM64InstPrinter::printTypedVectorList(const MCInst *MI, unsigned OpNum, - raw_ostream &O) { - std::string Suffix("."); - if (NumLanes) - Suffix += itostr(NumLanes) + LaneKind; - else - Suffix += LaneKind; - - printVectorList(MI, OpNum, O, Suffix); -} - -void ARM64InstPrinter::printVectorIndex(const MCInst *MI, unsigned OpNum, - raw_ostream &O) { - O << "[" << MI->getOperand(OpNum).getImm() << "]"; -} - -void ARM64InstPrinter::printAlignedLabel(const MCInst *MI, unsigned OpNum, - raw_ostream &O) { - const MCOperand &Op = MI->getOperand(OpNum); - - // If the label has already been resolved to an immediate offset (say, when - // we're running the disassembler), just print the immediate. - if (Op.isImm()) { - O << "#" << (Op.getImm() << 2); - return; - } - - // If the branch target is simply an address then print it in hex. - const MCConstantExpr *BranchTarget = - dyn_cast(MI->getOperand(OpNum).getExpr()); - int64_t Address; - if (BranchTarget && BranchTarget->EvaluateAsAbsolute(Address)) { - O << "0x"; - O.write_hex(Address); - } else { - // Otherwise, just print the expression. - O << *MI->getOperand(OpNum).getExpr(); - } -} - -void ARM64InstPrinter::printAdrpLabel(const MCInst *MI, unsigned OpNum, - raw_ostream &O) { - const MCOperand &Op = MI->getOperand(OpNum); - - // If the label has already been resolved to an immediate offset (say, when - // we're running the disassembler), just print the immediate. - if (Op.isImm()) { - O << "#" << (Op.getImm() << 12); - return; - } - - // Otherwise, just print the expression. 
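printAlignedLabel and printAdrpLabel above apply the two fixed scale factors of the PC-relative forms: immediates handled by printAlignedLabel are stored in 4-byte instruction units, while printAdrpLabel's are stored in 4 KiB pages. A trivial restatement of those conversions (helper names are illustrative):

#include <cstdint>
#include <cstdio>

// Convert the stored immediates back to byte offsets, as the two printers
// above do before emitting them.
static int64_t alignedLabelBytes(int64_t Imm) { return Imm << 2; }  // words
static int64_t adrpLabelBytes(int64_t Imm)    { return Imm << 12; } // pages

int main() {
  std::printf("%#llx\n", (unsigned long long)alignedLabelBytes(0x400)); // 0x1000
  std::printf("%#llx\n", (unsigned long long)adrpLabelBytes(3));        // 0x3000
  return 0;
}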
- O << *MI->getOperand(OpNum).getExpr(); -} - -void ARM64InstPrinter::printBarrierOption(const MCInst *MI, unsigned OpNo, - raw_ostream &O) { - unsigned Val = MI->getOperand(OpNo).getImm(); - unsigned Opcode = MI->getOpcode(); - - bool Valid; - StringRef Name; - if (Opcode == ARM64::ISB) - Name = ARM64ISB::ISBMapper().toString(Val, Valid); - else - Name = ARM64DB::DBarrierMapper().toString(Val, Valid); - if (Valid) - O << Name; - else - O << "#" << Val; -} - -void ARM64InstPrinter::printMRSSystemRegister(const MCInst *MI, unsigned OpNo, - raw_ostream &O) { - unsigned Val = MI->getOperand(OpNo).getImm(); - - bool Valid; - auto Mapper = ARM64SysReg::MRSMapper(getAvailableFeatures()); - std::string Name = Mapper.toString(Val, Valid); - - if (Valid) - O << StringRef(Name).upper(); -} - -void ARM64InstPrinter::printMSRSystemRegister(const MCInst *MI, unsigned OpNo, - raw_ostream &O) { - unsigned Val = MI->getOperand(OpNo).getImm(); - - bool Valid; - auto Mapper = ARM64SysReg::MSRMapper(getAvailableFeatures()); - std::string Name = Mapper.toString(Val, Valid); - - if (Valid) - O << StringRef(Name).upper(); -} - -void ARM64InstPrinter::printSystemPStateField(const MCInst *MI, unsigned OpNo, - raw_ostream &O) { - unsigned Val = MI->getOperand(OpNo).getImm(); - - bool Valid; - StringRef Name = ARM64PState::PStateMapper().toString(Val, Valid); - if (Valid) - O << StringRef(Name.str()).upper(); - else - O << "#" << Val; -} - -void ARM64InstPrinter::printSIMDType10Operand(const MCInst *MI, unsigned OpNo, - raw_ostream &O) { - unsigned RawVal = MI->getOperand(OpNo).getImm(); - uint64_t Val = ARM64_AM::decodeAdvSIMDModImmType10(RawVal); - O << format("#%#016llx", Val); -} diff --git a/lib/Target/ARM64/InstPrinter/ARM64InstPrinter.h b/lib/Target/ARM64/InstPrinter/ARM64InstPrinter.h deleted file mode 100644 index 0fd6f100712..00000000000 --- a/lib/Target/ARM64/InstPrinter/ARM64InstPrinter.h +++ /dev/null @@ -1,140 +0,0 @@ -//===-- ARM64InstPrinter.h - Convert ARM64 MCInst to assembly syntax ------===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -// This class prints an ARM64 MCInst to a .s file. -// -//===----------------------------------------------------------------------===// - -#ifndef ARM64INSTPRINTER_H -#define ARM64INSTPRINTER_H - -#include "MCTargetDesc/ARM64MCTargetDesc.h" -#include "llvm/ADT/StringRef.h" -#include "llvm/MC/MCInstPrinter.h" -#include "llvm/MC/MCSubtargetInfo.h" - -namespace llvm { - -class MCOperand; - -class ARM64InstPrinter : public MCInstPrinter { -public: - ARM64InstPrinter(const MCAsmInfo &MAI, const MCInstrInfo &MII, - const MCRegisterInfo &MRI, const MCSubtargetInfo &STI); - - void printInst(const MCInst *MI, raw_ostream &O, StringRef Annot) override; - void printRegName(raw_ostream &OS, unsigned RegNo) const override; - - // Autogenerated by tblgen. 
- virtual void printInstruction(const MCInst *MI, raw_ostream &O); - virtual bool printAliasInstr(const MCInst *MI, raw_ostream &O); - virtual void printCustomAliasOperand(const MCInst *MI, unsigned OpIdx, - unsigned PrintMethodIdx, raw_ostream &O); - virtual StringRef getRegName(unsigned RegNo) const { - return getRegisterName(RegNo); - } - static const char *getRegisterName(unsigned RegNo, - unsigned AltIdx = ARM64::NoRegAltName); - -protected: - bool printSysAlias(const MCInst *MI, raw_ostream &O); - // Operand printers - void printOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O); - void printHexImm(const MCInst *MI, unsigned OpNo, raw_ostream &O); - void printPostIncOperand(const MCInst *MI, unsigned OpNo, unsigned Imm, - raw_ostream &O); - template - void printPostIncOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O) { - printPostIncOperand(MI, OpNo, Amount, O); - } - - void printVRegOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O); - void printSysCROperand(const MCInst *MI, unsigned OpNo, raw_ostream &O); - void printAddSubImm(const MCInst *MI, unsigned OpNum, raw_ostream &O); - void printLogicalImm32(const MCInst *MI, unsigned OpNum, raw_ostream &O); - void printLogicalImm64(const MCInst *MI, unsigned OpNum, raw_ostream &O); - void printShifter(const MCInst *MI, unsigned OpNum, raw_ostream &O); - void printShiftedRegister(const MCInst *MI, unsigned OpNum, raw_ostream &O); - void printExtendedRegister(const MCInst *MI, unsigned OpNum, raw_ostream &O); - void printArithExtend(const MCInst *MI, unsigned OpNum, raw_ostream &O); - - void printMemExtend(const MCInst *MI, unsigned OpNum, raw_ostream &O, - char SrcRegKind, unsigned Width); - template - void printMemExtend(const MCInst *MI, unsigned OpNum, raw_ostream &O) { - printMemExtend(MI, OpNum, O, SrcRegKind, Width); - } - - void printCondCode(const MCInst *MI, unsigned OpNum, raw_ostream &O); - void printInverseCondCode(const MCInst *MI, unsigned OpNum, raw_ostream &O); - void printAlignedLabel(const MCInst *MI, unsigned OpNum, raw_ostream &O); - void printUImm12Offset(const MCInst *MI, unsigned OpNum, unsigned Scale, - raw_ostream &O); - void printAMIndexedWB(const MCInst *MI, unsigned OpNum, unsigned Scale, - raw_ostream &O); - - template - void printUImm12Offset(const MCInst *MI, unsigned OpNum, raw_ostream &O) { - printUImm12Offset(MI, OpNum, Scale, O); - } - - template - void printAMIndexedWB(const MCInst *MI, unsigned OpNum, raw_ostream &O) { - printAMIndexedWB(MI, OpNum, BitWidth / 8, O); - } - - void printAMNoIndex(const MCInst *MI, unsigned OpNum, raw_ostream &O); - - template - void printImmScale(const MCInst *MI, unsigned OpNum, raw_ostream &O); - - void printPrefetchOp(const MCInst *MI, unsigned OpNum, raw_ostream &O); - - void printFPImmOperand(const MCInst *MI, unsigned OpNum, raw_ostream &O); - - void printVectorList(const MCInst *MI, unsigned OpNum, raw_ostream &O, - StringRef LayoutSuffix); - - /// Print a list of vector registers where the type suffix is implicit - /// (i.e. attached to the instruction rather than the registers). 
- void printImplicitlyTypedVectorList(const MCInst *MI, unsigned OpNum, - raw_ostream &O); - - template - void printTypedVectorList(const MCInst *MI, unsigned OpNum, raw_ostream &O); - - void printVectorIndex(const MCInst *MI, unsigned OpNum, raw_ostream &O); - void printAdrpLabel(const MCInst *MI, unsigned OpNum, raw_ostream &O); - void printBarrierOption(const MCInst *MI, unsigned OpNum, raw_ostream &O); - void printMSRSystemRegister(const MCInst *MI, unsigned OpNum, raw_ostream &O); - void printMRSSystemRegister(const MCInst *MI, unsigned OpNum, raw_ostream &O); - void printSystemPStateField(const MCInst *MI, unsigned OpNum, raw_ostream &O); - void printSIMDType10Operand(const MCInst *MI, unsigned OpNum, raw_ostream &O); -}; - -class ARM64AppleInstPrinter : public ARM64InstPrinter { -public: - ARM64AppleInstPrinter(const MCAsmInfo &MAI, const MCInstrInfo &MII, - const MCRegisterInfo &MRI, const MCSubtargetInfo &STI); - - void printInst(const MCInst *MI, raw_ostream &O, StringRef Annot) override; - - void printInstruction(const MCInst *MI, raw_ostream &O) override; - bool printAliasInstr(const MCInst *MI, raw_ostream &O) override; - virtual void printCustomAliasOperand(const MCInst *MI, unsigned OpIdx, - unsigned PrintMethodIdx, raw_ostream &O); - StringRef getRegName(unsigned RegNo) const override { - return getRegisterName(RegNo); - } - static const char *getRegisterName(unsigned RegNo, - unsigned AltIdx = ARM64::NoRegAltName); -}; -} - -#endif diff --git a/lib/Target/ARM64/InstPrinter/CMakeLists.txt b/lib/Target/ARM64/InstPrinter/CMakeLists.txt deleted file mode 100644 index b8ee12c5541..00000000000 --- a/lib/Target/ARM64/InstPrinter/CMakeLists.txt +++ /dev/null @@ -1,7 +0,0 @@ -include_directories( ${CMAKE_CURRENT_BINARY_DIR}/.. ${CMAKE_CURRENT_SOURCE_DIR}/.. ) - -add_llvm_library(LLVMARM64AsmPrinter - ARM64InstPrinter.cpp - ) - -add_dependencies(LLVMARM64AsmPrinter ARM64CommonTableGen) diff --git a/lib/Target/ARM64/InstPrinter/LLVMBuild.txt b/lib/Target/ARM64/InstPrinter/LLVMBuild.txt deleted file mode 100644 index 7ab43924921..00000000000 --- a/lib/Target/ARM64/InstPrinter/LLVMBuild.txt +++ /dev/null @@ -1,24 +0,0 @@ -;===- ./lib/Target/ARM64/InstPrinter/LLVMBuild.txt -------------*- Conf -*--===; -; -; The LLVM Compiler Infrastructure -; -; This file is distributed under the University of Illinois Open Source -; License. See LICENSE.TXT for details. -; -;===------------------------------------------------------------------------===; -; -; This is an LLVMBuild description file for the components in this subdirectory. -; -; For more information on the LLVMBuild system, please see: -; -; http://llvm.org/docs/LLVMBuild.html -; -;===------------------------------------------------------------------------===; - -[component_0] -type = Library -name = ARM64AsmPrinter -parent = ARM64 -required_libraries = ARM64Utils MC Support -add_to_library_groups = ARM64 - diff --git a/lib/Target/ARM64/InstPrinter/Makefile b/lib/Target/ARM64/InstPrinter/Makefile deleted file mode 100644 index a59efb08465..00000000000 --- a/lib/Target/ARM64/InstPrinter/Makefile +++ /dev/null @@ -1,15 +0,0 @@ -##===- lib/Target/ARM64/AsmPrinter/Makefile ----------------*- Makefile -*-===## -# -# The LLVM Compiler Infrastructure -# -# This file is distributed under the University of Illinois Open Source -# License. See LICENSE.TXT for details. -# -##===----------------------------------------------------------------------===## -LEVEL = ../../../.. 
-LIBRARYNAME = LLVMARM64AsmPrinter - -# Hack: we need to include 'main' arm target directory to grab private headers -CPP.Flags += -I$(PROJ_OBJ_DIR)/.. -I$(PROJ_SRC_DIR)/.. - -include $(LEVEL)/Makefile.common diff --git a/lib/Target/ARM64/LLVMBuild.txt b/lib/Target/ARM64/LLVMBuild.txt deleted file mode 100644 index 3d1e56e7ca6..00000000000 --- a/lib/Target/ARM64/LLVMBuild.txt +++ /dev/null @@ -1,35 +0,0 @@ -;===- ./lib/Target/ARM64/LLVMBuild.txt -------------------------*- Conf -*--===; -; -; The LLVM Compiler Infrastructure -; -; This file is distributed under the University of Illinois Open Source -; License. See LICENSE.TXT for details. -; -;===------------------------------------------------------------------------===; -; -; This is an LLVMBuild description file for the components in this subdirectory. -; -; For more information on the LLVMBuild system, please see: -; -; http://llvm.org/docs/LLVMBuild.html -; -;===------------------------------------------------------------------------===; - -[common] -subdirectories = AsmParser Disassembler InstPrinter MCTargetDesc TargetInfo Utils - -[component_0] -type = TargetGroup -name = ARM64 -parent = Target -has_asmparser = 1 -has_asmprinter = 1 -has_disassembler = 1 -has_jit = 1 - -[component_1] -type = Library -name = ARM64CodeGen -parent = ARM64 -required_libraries = ARM64AsmPrinter ARM64Desc ARM64Info ARM64Utils Analysis AsmPrinter CodeGen Core MC Scalar SelectionDAG Support Target -add_to_library_groups = ARM64 diff --git a/lib/Target/ARM64/MCTargetDesc/ARM64AddressingModes.h b/lib/Target/ARM64/MCTargetDesc/ARM64AddressingModes.h deleted file mode 100644 index 53bd3545a59..00000000000 --- a/lib/Target/ARM64/MCTargetDesc/ARM64AddressingModes.h +++ /dev/null @@ -1,738 +0,0 @@ -//===- ARM64AddressingModes.h - ARM64 Addressing Modes ----------*- C++ -*-===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -// This file contains the ARM64 addressing mode implementation stuff. -// -//===----------------------------------------------------------------------===// - -#ifndef LLVM_TARGET_ARM64_ARM64ADDRESSINGMODES_H -#define LLVM_TARGET_ARM64_ARM64ADDRESSINGMODES_H - -#include "llvm/ADT/APFloat.h" -#include "llvm/ADT/APInt.h" -#include "llvm/Support/ErrorHandling.h" -#include "llvm/Support/MathExtras.h" -#include - -namespace llvm { - -/// ARM64_AM - ARM64 Addressing Mode Stuff -namespace ARM64_AM { - -//===----------------------------------------------------------------------===// -// Shifts -// - -enum ShiftExtendType { - InvalidShiftExtend = -1, - LSL = 0, - LSR, - ASR, - ROR, - MSL, - - UXTB, - UXTH, - UXTW, - UXTX, - - SXTB, - SXTH, - SXTW, - SXTX, -}; - -/// getShiftName - Get the string encoding for the shift type. 
-static inline const char *getShiftExtendName(ARM64_AM::ShiftExtendType ST) { - switch (ST) { - default: assert(false && "unhandled shift type!"); - case ARM64_AM::LSL: return "lsl"; - case ARM64_AM::LSR: return "lsr"; - case ARM64_AM::ASR: return "asr"; - case ARM64_AM::ROR: return "ror"; - case ARM64_AM::MSL: return "msl"; - case ARM64_AM::UXTB: return "uxtb"; - case ARM64_AM::UXTH: return "uxth"; - case ARM64_AM::UXTW: return "uxtw"; - case ARM64_AM::UXTX: return "uxtx"; - case ARM64_AM::SXTB: return "sxtb"; - case ARM64_AM::SXTH: return "sxth"; - case ARM64_AM::SXTW: return "sxtw"; - case ARM64_AM::SXTX: return "sxtx"; - } - return nullptr; -} - -/// getShiftType - Extract the shift type. -static inline ARM64_AM::ShiftExtendType getShiftType(unsigned Imm) { - switch ((Imm >> 6) & 0x7) { - default: return ARM64_AM::InvalidShiftExtend; - case 0: return ARM64_AM::LSL; - case 1: return ARM64_AM::LSR; - case 2: return ARM64_AM::ASR; - case 3: return ARM64_AM::ROR; - case 4: return ARM64_AM::MSL; - } -} - -/// getShiftValue - Extract the shift value. -static inline unsigned getShiftValue(unsigned Imm) { - return Imm & 0x3f; -} - -/// getShifterImm - Encode the shift type and amount: -/// imm: 6-bit shift amount -/// shifter: 000 ==> lsl -/// 001 ==> lsr -/// 010 ==> asr -/// 011 ==> ror -/// 100 ==> msl -/// {8-6} = shifter -/// {5-0} = imm -static inline unsigned getShifterImm(ARM64_AM::ShiftExtendType ST, - unsigned Imm) { - assert((Imm & 0x3f) == Imm && "Illegal shifted immedate value!"); - unsigned STEnc = 0; - switch (ST) { - default: llvm_unreachable("Invalid shift requested"); - case ARM64_AM::LSL: STEnc = 0; break; - case ARM64_AM::LSR: STEnc = 1; break; - case ARM64_AM::ASR: STEnc = 2; break; - case ARM64_AM::ROR: STEnc = 3; break; - case ARM64_AM::MSL: STEnc = 4; break; - } - return (STEnc << 6) | (Imm & 0x3f); -} - -//===----------------------------------------------------------------------===// -// Extends -// - -/// getArithShiftValue - get the arithmetic shift value. -static inline unsigned getArithShiftValue(unsigned Imm) { - return Imm & 0x7; -} - -/// getExtendType - Extract the extend type for operands of arithmetic ops. 
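For reference, a minimal C++ restatement of the shifter-operand packing documented above (bits {8-6} hold the shift kind, bits {5-0} the amount). The enum mirrors the header's values, but this is a standalone sketch, not the LLVM code.

#include <cassert>

enum Shift { LSL = 0, LSR, ASR, ROR, MSL };

// Pack a shift kind and a 6-bit amount the same way getShifterImm does.
static unsigned packShifter(Shift ST, unsigned Imm) {
  assert(Imm <= 0x3f && "shift amount must fit in 6 bits");
  return (unsigned(ST) << 6) | Imm;
}

int main() {
  unsigned Enc = packShifter(LSL, 12);   // e.g. the "lsl #12" on ADD (shifted register)
  assert(Enc == 12);                     // LSL encodes as 0, so only the amount is set
  assert(((Enc >> 6) & 0x7) == LSL);     // what getShiftType would recover
  assert((Enc & 0x3f) == 12);            // what getShiftValue would recover
  return 0;
}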
-static inline ARM64_AM::ShiftExtendType getExtendType(unsigned Imm) { - assert((Imm & 0x7) == Imm && "invalid immediate!"); - switch (Imm) { - default: llvm_unreachable("Compiler bug!"); - case 0: return ARM64_AM::UXTB; - case 1: return ARM64_AM::UXTH; - case 2: return ARM64_AM::UXTW; - case 3: return ARM64_AM::UXTX; - case 4: return ARM64_AM::SXTB; - case 5: return ARM64_AM::SXTH; - case 6: return ARM64_AM::SXTW; - case 7: return ARM64_AM::SXTX; - } -} - -static inline ARM64_AM::ShiftExtendType getArithExtendType(unsigned Imm) { - return getExtendType((Imm >> 3) & 0x7); -} - -/// Mapping from extend bits to required operation: -/// shifter: 000 ==> uxtb -/// 001 ==> uxth -/// 010 ==> uxtw -/// 011 ==> uxtx -/// 100 ==> sxtb -/// 101 ==> sxth -/// 110 ==> sxtw -/// 111 ==> sxtx -inline unsigned getExtendEncoding(ARM64_AM::ShiftExtendType ET) { - switch (ET) { - default: llvm_unreachable("Invalid extend type requested"); - case ARM64_AM::UXTB: return 0; break; - case ARM64_AM::UXTH: return 1; break; - case ARM64_AM::UXTW: return 2; break; - case ARM64_AM::UXTX: return 3; break; - case ARM64_AM::SXTB: return 4; break; - case ARM64_AM::SXTH: return 5; break; - case ARM64_AM::SXTW: return 6; break; - case ARM64_AM::SXTX: return 7; break; - } -} - -/// getArithExtendImm - Encode the extend type and shift amount for an -/// arithmetic instruction: -/// imm: 3-bit extend amount -/// {5-3} = shifter -/// {2-0} = imm3 -static inline unsigned getArithExtendImm(ARM64_AM::ShiftExtendType ET, - unsigned Imm) { - assert((Imm & 0x7) == Imm && "Illegal shifted immedate value!"); - return (getExtendEncoding(ET) << 3) | (Imm & 0x7); -} - -/// getMemDoShift - Extract the "do shift" flag value for load/store -/// instructions. -static inline bool getMemDoShift(unsigned Imm) { - return (Imm & 0x1) != 0; -} - -/// getExtendType - Extract the extend type for the offset operand of -/// loads/stores. -static inline ARM64_AM::ShiftExtendType getMemExtendType(unsigned Imm) { - return getExtendType((Imm >> 1) & 0x7); -} - -/// getExtendImm - Encode the extend type and amount for a load/store inst: -/// doshift: should the offset be scaled by the access size -/// shifter: 000 ==> uxtb -/// 001 ==> uxth -/// 010 ==> uxtw -/// 011 ==> uxtx -/// 100 ==> sxtb -/// 101 ==> sxth -/// 110 ==> sxtw -/// 111 ==> sxtx -/// {3-1} = shifter -/// {0} = doshift -static inline unsigned getMemExtendImm(ARM64_AM::ShiftExtendType ET, - bool DoShift) { - return (getExtendEncoding(ET) << 1) | unsigned(DoShift); -} - -static inline uint64_t ror(uint64_t elt, unsigned size) { - return ((elt & 1) << (size-1)) | (elt >> 1); -} - -/// processLogicalImmediate - Determine if an immediate value can be encoded -/// as the immediate operand of a logical instruction for the given register -/// size. If so, return true with "encoding" set to the encoded value in -/// the form N:immr:imms. -static inline bool processLogicalImmediate(uint64_t imm, unsigned regSize, - uint64_t &encoding) { - if (imm == 0ULL || imm == ~0ULL || - (regSize != 64 && (imm >> regSize != 0 || imm == ~0U))) - return false; - - unsigned size = 2; - uint64_t eltVal = imm; - - // First, determine the element size. 
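Similarly, a small sketch of the arithmetic-extend packing described above: bits {5-3} select the extend (uxtb..sxtx) and bits {2-0} the left-shift amount. Values here are only illustrative.

#include <cassert>

enum Extend { UXTB = 0, UXTH, UXTW, UXTX, SXTB, SXTH, SXTW, SXTX };

// Pack an extend kind and 3-bit shift the same way getArithExtendImm does.
static unsigned packArithExtend(Extend ET, unsigned Imm3) {
  assert(Imm3 <= 7 && "shift amount occupies 3 bits");
  return (unsigned(ET) << 3) | Imm3;
}

int main() {
  unsigned Enc = packArithExtend(UXTW, 2);   // e.g. "add x0, x1, w2, uxtw #2"
  assert(Enc == 0x12);
  assert(((Enc >> 3) & 0x7) == UXTW);        // what getArithExtendType recovers
  assert((Enc & 0x7) == 2);                  // what getArithShiftValue recovers
  return 0;
}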
- while (size < regSize) { - unsigned numElts = regSize / size; - unsigned mask = (1ULL << size) - 1; - uint64_t lowestEltVal = imm & mask; - - bool allMatched = true; - for (unsigned i = 1; i < numElts; ++i) { - uint64_t currEltVal = (imm >> (i*size)) & mask; - if (currEltVal != lowestEltVal) { - allMatched = false; - break; - } - } - - if (allMatched) { - eltVal = lowestEltVal; - break; - } - - size *= 2; - } - - // Second, determine the rotation to make the element be: 0^m 1^n. - for (unsigned i = 0; i < size; ++i) { - eltVal = ror(eltVal, size); - uint32_t clz = countLeadingZeros(eltVal) - (64 - size); - uint32_t cto = CountTrailingOnes_64(eltVal); - - if (clz + cto == size) { - // Encode in immr the number of RORs it would take to get *from* this - // element value to our target value, where i+1 is the number of RORs - // to go the opposite direction. - unsigned immr = size - (i + 1); - - // If size has a 1 in the n'th bit, create a value that has zeroes in - // bits [0, n] and ones above that. - uint64_t nimms = ~(size-1) << 1; - - // Or the CTO value into the low bits, which must be below the Nth bit - // bit mentioned above. - nimms |= (cto-1); - - // Extract the seventh bit and toggle it to create the N field. - unsigned N = ((nimms >> 6) & 1) ^ 1; - - encoding = (N << 12) | (immr << 6) | (nimms & 0x3f); - return true; - } - } - - return false; -} - -/// isLogicalImmediate - Return true if the immediate is valid for a logical -/// immediate instruction of the given register size. Return false otherwise. -static inline bool isLogicalImmediate(uint64_t imm, unsigned regSize) { - uint64_t encoding; - return processLogicalImmediate(imm, regSize, encoding); -} - -/// encodeLogicalImmediate - Return the encoded immediate value for a logical -/// immediate instruction of the given register size. -static inline uint64_t encodeLogicalImmediate(uint64_t imm, unsigned regSize) { - uint64_t encoding = 0; - bool res = processLogicalImmediate(imm, regSize, encoding); - assert(res && "invalid logical immediate"); - (void)res; - return encoding; -} - -/// decodeLogicalImmediate - Decode a logical immediate value in the form -/// "N:immr:imms" (where the immr and imms fields are each 6 bits) into the -/// integer value it represents with regSize bits. -static inline uint64_t decodeLogicalImmediate(uint64_t val, unsigned regSize) { - // Extract the N, imms, and immr fields. - unsigned N = (val >> 12) & 1; - unsigned immr = (val >> 6) & 0x3f; - unsigned imms = val & 0x3f; - - assert((regSize == 64 || N == 0) && "undefined logical immediate encoding"); - int len = 31 - countLeadingZeros((N << 6) | (~imms & 0x3f)); - assert(len >= 0 && "undefined logical immediate encoding"); - unsigned size = (1 << len); - unsigned R = immr & (size - 1); - unsigned S = imms & (size - 1); - assert(S != size - 1 && "undefined logical immediate encoding"); - uint64_t pattern = (1ULL << (S + 1)) - 1; - for (unsigned i = 0; i < R; ++i) - pattern = ror(pattern, size); - - // Replicate the pattern to fill the regSize. - while (size != regSize) { - pattern |= (pattern << size); - size *= 2; - } - return pattern; -} - -/// isValidDecodeLogicalImmediate - Check to see if the logical immediate value -/// in the form "N:immr:imms" (where the immr and imms fields are each 6 bits) -/// is a valid encoding for an integer value with regSize bits. -static inline bool isValidDecodeLogicalImmediate(uint64_t val, - unsigned regSize) { - // Extract the N and imms fields needed for checking. 
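To make the N:immr:imms scheme above concrete, here is a self-contained decode of one known encoding. It mirrors the decodeLogicalImmediate logic but is not the LLVM function; __builtin_clz assumes a GCC/Clang-compatible compiler.

#include <cassert>
#include <cstdint>

// Rotate a Size-bit element right by one, as the header's ror() helper does.
static uint64_t rorOnce(uint64_t Elt, unsigned Size) {
  return ((Elt & 1) << (Size - 1)) | (Elt >> 1);
}

static uint64_t decodeLogicalImm(uint64_t Val, unsigned RegSize) {
  unsigned N = (Val >> 12) & 1;
  unsigned Immr = (Val >> 6) & 0x3f;
  unsigned Imms = Val & 0x3f;

  unsigned Hi = (N << 6) | (~Imms & 0x3f);
  int Len = 31 - __builtin_clz(Hi);          // index of the highest set bit
  unsigned Size = 1u << Len;                 // element size in bits
  unsigned R = Immr & (Size - 1);
  unsigned S = Imms & (Size - 1);

  uint64_t Pattern = (1ULL << (S + 1)) - 1;  // S+1 trailing ones
  for (unsigned I = 0; I < R; ++I)
    Pattern = rorOnce(Pattern, Size);

  while (Size != RegSize) {                  // replicate the element to full width
    Pattern |= Pattern << Size;
    Size *= 2;
  }
  return Pattern;
}

int main() {
  // N:immr:imms = 0:000000:111100 describes a 2-bit element "01" replicated
  // across a 32-bit register, i.e. 0x55555555.
  uint64_t Val = (0u << 12) | (0u << 6) | 0x3c;
  assert(decodeLogicalImm(Val, 32) == 0x55555555ULL);
  return 0;
}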
- unsigned N = (val >> 12) & 1; - unsigned imms = val & 0x3f; - - if (regSize == 32 && N != 0) // undefined logical immediate encoding - return false; - int len = 31 - countLeadingZeros((N << 6) | (~imms & 0x3f)); - if (len < 0) // undefined logical immediate encoding - return false; - unsigned size = (1 << len); - unsigned S = imms & (size - 1); - if (S == size - 1) // undefined logical immediate encoding - return false; - - return true; -} - -//===----------------------------------------------------------------------===// -// Floating-point Immediates -// -static inline float getFPImmFloat(unsigned Imm) { - // We expect an 8-bit binary encoding of a floating-point number here. - union { - uint32_t I; - float F; - } FPUnion; - - uint8_t Sign = (Imm >> 7) & 0x1; - uint8_t Exp = (Imm >> 4) & 0x7; - uint8_t Mantissa = Imm & 0xf; - - // 8-bit FP iEEEE Float Encoding - // abcd efgh aBbbbbbc defgh000 00000000 00000000 - // - // where B = NOT(b); - - FPUnion.I = 0; - FPUnion.I |= Sign << 31; - FPUnion.I |= ((Exp & 0x4) != 0 ? 0 : 1) << 30; - FPUnion.I |= ((Exp & 0x4) != 0 ? 0x1f : 0) << 25; - FPUnion.I |= (Exp & 0x3) << 23; - FPUnion.I |= Mantissa << 19; - return FPUnion.F; -} - -/// getFP32Imm - Return an 8-bit floating-point version of the 32-bit -/// floating-point value. If the value cannot be represented as an 8-bit -/// floating-point value, then return -1. -static inline int getFP32Imm(const APInt &Imm) { - uint32_t Sign = Imm.lshr(31).getZExtValue() & 1; - int32_t Exp = (Imm.lshr(23).getSExtValue() & 0xff) - 127; // -126 to 127 - int64_t Mantissa = Imm.getZExtValue() & 0x7fffff; // 23 bits - - // We can handle 4 bits of mantissa. - // mantissa = (16+UInt(e:f:g:h))/16. - if (Mantissa & 0x7ffff) - return -1; - Mantissa >>= 19; - if ((Mantissa & 0xf) != Mantissa) - return -1; - - // We can handle 3 bits of exponent: exp == UInt(NOT(b):c:d)-3 - if (Exp < -3 || Exp > 4) - return -1; - Exp = ((Exp+3) & 0x7) ^ 4; - - return ((int)Sign << 7) | (Exp << 4) | Mantissa; -} - -static inline int getFP32Imm(const APFloat &FPImm) { - return getFP32Imm(FPImm.bitcastToAPInt()); -} - -/// getFP64Imm - Return an 8-bit floating-point version of the 64-bit -/// floating-point value. If the value cannot be represented as an 8-bit -/// floating-point value, then return -1. -static inline int getFP64Imm(const APInt &Imm) { - uint64_t Sign = Imm.lshr(63).getZExtValue() & 1; - int64_t Exp = (Imm.lshr(52).getSExtValue() & 0x7ff) - 1023; // -1022 to 1023 - uint64_t Mantissa = Imm.getZExtValue() & 0xfffffffffffffULL; - - // We can handle 4 bits of mantissa. - // mantissa = (16+UInt(e:f:g:h))/16. 
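A standalone check of the 8-bit FP immediate layout described above (sign:1, exp:3, mantissa:4). The helper is a simplified stand-in for getFPImmFloat, not the LLVM routine itself.

#include <cassert>
#include <cstdint>
#include <cstring>

// Expand the 8-bit FMOV immediate into an IEEE-754 single.
static float decodeFP8(uint8_t Imm) {
  uint32_t Sign = (Imm >> 7) & 0x1;
  uint32_t Exp = (Imm >> 4) & 0x7;
  uint32_t Mantissa = Imm & 0xf;

  uint32_t I = Sign << 31;
  I |= ((Exp & 0x4) ? 0u : 1u) << 30;      // NOT(b)
  I |= ((Exp & 0x4) ? 0x1fu : 0u) << 25;   // b replicated five times
  I |= (Exp & 0x3) << 23;                  // c:d
  I |= Mantissa << 19;                     // e:f:g:h

  float F;
  std::memcpy(&F, &I, sizeof(F));
  return F;
}

int main() {
  assert(decodeFP8(0x70) == 1.0f);    // 0x70 expands to 0x3f800000
  assert(decodeFP8(0xF0) == -1.0f);   // same pattern with the sign bit set
  return 0;
}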
- if (Mantissa & 0xffffffffffffULL) - return -1; - Mantissa >>= 48; - if ((Mantissa & 0xf) != Mantissa) - return -1; - - // We can handle 3 bits of exponent: exp == UInt(NOT(b):c:d)-3 - if (Exp < -3 || Exp > 4) - return -1; - Exp = ((Exp+3) & 0x7) ^ 4; - - return ((int)Sign << 7) | (Exp << 4) | Mantissa; -} - -static inline int getFP64Imm(const APFloat &FPImm) { - return getFP64Imm(FPImm.bitcastToAPInt()); -} - -//===--------------------------------------------------------------------===// -// AdvSIMD Modified Immediates -//===--------------------------------------------------------------------===// - -// 0x00 0x00 0x00 abcdefgh 0x00 0x00 0x00 abcdefgh -static inline bool isAdvSIMDModImmType1(uint64_t Imm) { - return ((Imm >> 32) == (Imm & 0xffffffffULL)) && - ((Imm & 0xffffff00ffffff00ULL) == 0); -} - -static inline uint8_t encodeAdvSIMDModImmType1(uint64_t Imm) { - return (Imm & 0xffULL); -} - -static inline uint64_t decodeAdvSIMDModImmType1(uint8_t Imm) { - uint64_t EncVal = Imm; - return (EncVal << 32) | EncVal; -} - -// 0x00 0x00 abcdefgh 0x00 0x00 0x00 abcdefgh 0x00 -static inline bool isAdvSIMDModImmType2(uint64_t Imm) { - return ((Imm >> 32) == (Imm & 0xffffffffULL)) && - ((Imm & 0xffff00ffffff00ffULL) == 0); -} - -static inline uint8_t encodeAdvSIMDModImmType2(uint64_t Imm) { - return (Imm & 0xff00ULL) >> 8; -} - -static inline uint64_t decodeAdvSIMDModImmType2(uint8_t Imm) { - uint64_t EncVal = Imm; - return (EncVal << 40) | (EncVal << 8); -} - -// 0x00 abcdefgh 0x00 0x00 0x00 abcdefgh 0x00 0x00 -static inline bool isAdvSIMDModImmType3(uint64_t Imm) { - return ((Imm >> 32) == (Imm & 0xffffffffULL)) && - ((Imm & 0xff00ffffff00ffffULL) == 0); -} - -static inline uint8_t encodeAdvSIMDModImmType3(uint64_t Imm) { - return (Imm & 0xff0000ULL) >> 16; -} - -static inline uint64_t decodeAdvSIMDModImmType3(uint8_t Imm) { - uint64_t EncVal = Imm; - return (EncVal << 48) | (EncVal << 16); -} - -// abcdefgh 0x00 0x00 0x00 abcdefgh 0x00 0x00 0x00 -static inline bool isAdvSIMDModImmType4(uint64_t Imm) { - return ((Imm >> 32) == (Imm & 0xffffffffULL)) && - ((Imm & 0x00ffffff00ffffffULL) == 0); -} - -static inline uint8_t encodeAdvSIMDModImmType4(uint64_t Imm) { - return (Imm & 0xff000000ULL) >> 24; -} - -static inline uint64_t decodeAdvSIMDModImmType4(uint8_t Imm) { - uint64_t EncVal = Imm; - return (EncVal << 56) | (EncVal << 24); -} - -// 0x00 abcdefgh 0x00 abcdefgh 0x00 abcdefgh 0x00 abcdefgh -static inline bool isAdvSIMDModImmType5(uint64_t Imm) { - return ((Imm >> 32) == (Imm & 0xffffffffULL)) && - (((Imm & 0x00ff0000ULL) >> 16) == (Imm & 0x000000ffULL)) && - ((Imm & 0xff00ff00ff00ff00ULL) == 0); -} - -static inline uint8_t encodeAdvSIMDModImmType5(uint64_t Imm) { - return (Imm & 0xffULL); -} - -static inline uint64_t decodeAdvSIMDModImmType5(uint8_t Imm) { - uint64_t EncVal = Imm; - return (EncVal << 48) | (EncVal << 32) | (EncVal << 16) | EncVal; -} - -// abcdefgh 0x00 abcdefgh 0x00 abcdefgh 0x00 abcdefgh 0x00 -static inline bool isAdvSIMDModImmType6(uint64_t Imm) { - return ((Imm >> 32) == (Imm & 0xffffffffULL)) && - (((Imm & 0xff000000ULL) >> 16) == (Imm & 0x0000ff00ULL)) && - ((Imm & 0x00ff00ff00ff00ffULL) == 0); -} - -static inline uint8_t encodeAdvSIMDModImmType6(uint64_t Imm) { - return (Imm & 0xff00ULL) >> 8; -} - -static inline uint64_t decodeAdvSIMDModImmType6(uint8_t Imm) { - uint64_t EncVal = Imm; - return (EncVal << 56) | (EncVal << 40) | (EncVal << 24) | (EncVal << 8); -} - -// 0x00 0x00 abcdefgh 0xFF 0x00 0x00 abcdefgh 0xFF -static inline bool isAdvSIMDModImmType7(uint64_t 
Imm) { - return ((Imm >> 32) == (Imm & 0xffffffffULL)) && - ((Imm & 0xffff00ffffff00ffULL) == 0x000000ff000000ffULL); -} - -static inline uint8_t encodeAdvSIMDModImmType7(uint64_t Imm) { - return (Imm & 0xff00ULL) >> 8; -} - -static inline uint64_t decodeAdvSIMDModImmType7(uint8_t Imm) { - uint64_t EncVal = Imm; - return (EncVal << 40) | (EncVal << 8) | 0x000000ff000000ffULL; -} - -// 0x00 abcdefgh 0xFF 0xFF 0x00 abcdefgh 0xFF 0xFF -static inline bool isAdvSIMDModImmType8(uint64_t Imm) { - return ((Imm >> 32) == (Imm & 0xffffffffULL)) && - ((Imm & 0xff00ffffff00ffffULL) == 0x0000ffff0000ffffULL); -} - -static inline uint64_t decodeAdvSIMDModImmType8(uint8_t Imm) { - uint64_t EncVal = Imm; - return (EncVal << 48) | (EncVal << 16) | 0x0000ffff0000ffffULL; -} - -static inline uint8_t encodeAdvSIMDModImmType8(uint64_t Imm) { - return (Imm & 0x00ff0000ULL) >> 16; -} - -// abcdefgh abcdefgh abcdefgh abcdefgh abcdefgh abcdefgh abcdefgh abcdefgh -static inline bool isAdvSIMDModImmType9(uint64_t Imm) { - return ((Imm >> 32) == (Imm & 0xffffffffULL)) && - ((Imm >> 48) == (Imm & 0x0000ffffULL)) && - ((Imm >> 56) == (Imm & 0x000000ffULL)); -} - -static inline uint8_t encodeAdvSIMDModImmType9(uint64_t Imm) { - return (Imm & 0xffULL); -} - -static inline uint64_t decodeAdvSIMDModImmType9(uint8_t Imm) { - uint64_t EncVal = Imm; - EncVal |= (EncVal << 8); - EncVal |= (EncVal << 16); - EncVal |= (EncVal << 32); - return EncVal; -} - -// aaaaaaaa bbbbbbbb cccccccc dddddddd eeeeeeee ffffffff gggggggg hhhhhhhh -// cmode: 1110, op: 1 -static inline bool isAdvSIMDModImmType10(uint64_t Imm) { - uint64_t ByteA = Imm & 0xff00000000000000ULL; - uint64_t ByteB = Imm & 0x00ff000000000000ULL; - uint64_t ByteC = Imm & 0x0000ff0000000000ULL; - uint64_t ByteD = Imm & 0x000000ff00000000ULL; - uint64_t ByteE = Imm & 0x00000000ff000000ULL; - uint64_t ByteF = Imm & 0x0000000000ff0000ULL; - uint64_t ByteG = Imm & 0x000000000000ff00ULL; - uint64_t ByteH = Imm & 0x00000000000000ffULL; - - return (ByteA == 0ULL || ByteA == 0xff00000000000000ULL) && - (ByteB == 0ULL || ByteB == 0x00ff000000000000ULL) && - (ByteC == 0ULL || ByteC == 0x0000ff0000000000ULL) && - (ByteD == 0ULL || ByteD == 0x000000ff00000000ULL) && - (ByteE == 0ULL || ByteE == 0x00000000ff000000ULL) && - (ByteF == 0ULL || ByteF == 0x0000000000ff0000ULL) && - (ByteG == 0ULL || ByteG == 0x000000000000ff00ULL) && - (ByteH == 0ULL || ByteH == 0x00000000000000ffULL); -} - -static inline uint8_t encodeAdvSIMDModImmType10(uint64_t Imm) { - uint8_t BitA = (Imm & 0xff00000000000000ULL) != 0; - uint8_t BitB = (Imm & 0x00ff000000000000ULL) != 0; - uint8_t BitC = (Imm & 0x0000ff0000000000ULL) != 0; - uint8_t BitD = (Imm & 0x000000ff00000000ULL) != 0; - uint8_t BitE = (Imm & 0x00000000ff000000ULL) != 0; - uint8_t BitF = (Imm & 0x0000000000ff0000ULL) != 0; - uint8_t BitG = (Imm & 0x000000000000ff00ULL) != 0; - uint8_t BitH = (Imm & 0x00000000000000ffULL) != 0; - - uint8_t EncVal = BitA; - EncVal <<= 1; - EncVal |= BitB; - EncVal <<= 1; - EncVal |= BitC; - EncVal <<= 1; - EncVal |= BitD; - EncVal <<= 1; - EncVal |= BitE; - EncVal <<= 1; - EncVal |= BitF; - EncVal <<= 1; - EncVal |= BitG; - EncVal <<= 1; - EncVal |= BitH; - return EncVal; -} - -static inline uint64_t decodeAdvSIMDModImmType10(uint8_t Imm) { - uint64_t EncVal = 0; - if (Imm & 0x80) EncVal |= 0xff00000000000000ULL; - if (Imm & 0x40) EncVal |= 0x00ff000000000000ULL; - if (Imm & 0x20) EncVal |= 0x0000ff0000000000ULL; - if (Imm & 0x10) EncVal |= 0x000000ff00000000ULL; - if (Imm & 0x08) EncVal |= 0x00000000ff000000ULL; - 
if (Imm & 0x04) EncVal |= 0x0000000000ff0000ULL; - if (Imm & 0x02) EncVal |= 0x000000000000ff00ULL; - if (Imm & 0x01) EncVal |= 0x00000000000000ffULL; - return EncVal; -} - -// aBbbbbbc defgh000 0x00 0x00 aBbbbbbc defgh000 0x00 0x00 -static inline bool isAdvSIMDModImmType11(uint64_t Imm) { - uint64_t BString = (Imm & 0x7E000000ULL) >> 25; - return ((Imm >> 32) == (Imm & 0xffffffffULL)) && - (BString == 0x1f || BString == 0x20) && - ((Imm & 0x0007ffff0007ffffULL) == 0); -} - -static inline uint8_t encodeAdvSIMDModImmType11(uint64_t Imm) { - uint8_t BitA = (Imm & 0x80000000ULL) != 0; - uint8_t BitB = (Imm & 0x20000000ULL) != 0; - uint8_t BitC = (Imm & 0x01000000ULL) != 0; - uint8_t BitD = (Imm & 0x00800000ULL) != 0; - uint8_t BitE = (Imm & 0x00400000ULL) != 0; - uint8_t BitF = (Imm & 0x00200000ULL) != 0; - uint8_t BitG = (Imm & 0x00100000ULL) != 0; - uint8_t BitH = (Imm & 0x00080000ULL) != 0; - - uint8_t EncVal = BitA; - EncVal <<= 1; - EncVal |= BitB; - EncVal <<= 1; - EncVal |= BitC; - EncVal <<= 1; - EncVal |= BitD; - EncVal <<= 1; - EncVal |= BitE; - EncVal <<= 1; - EncVal |= BitF; - EncVal <<= 1; - EncVal |= BitG; - EncVal <<= 1; - EncVal |= BitH; - return EncVal; -} - -static inline uint64_t decodeAdvSIMDModImmType11(uint8_t Imm) { - uint64_t EncVal = 0; - if (Imm & 0x80) EncVal |= 0x80000000ULL; - if (Imm & 0x40) EncVal |= 0x3e000000ULL; - else EncVal |= 0x40000000ULL; - if (Imm & 0x20) EncVal |= 0x01000000ULL; - if (Imm & 0x10) EncVal |= 0x00800000ULL; - if (Imm & 0x08) EncVal |= 0x00400000ULL; - if (Imm & 0x04) EncVal |= 0x00200000ULL; - if (Imm & 0x02) EncVal |= 0x00100000ULL; - if (Imm & 0x01) EncVal |= 0x00080000ULL; - return (EncVal << 32) | EncVal; -} - -// aBbbbbbb bbcdefgh 0x00 0x00 0x00 0x00 0x00 0x00 -static inline bool isAdvSIMDModImmType12(uint64_t Imm) { - uint64_t BString = (Imm & 0x7fc0000000000000ULL) >> 54; - return ((BString == 0xff || BString == 0x100) && - ((Imm & 0x0000ffffffffffffULL) == 0)); -} - -static inline uint8_t encodeAdvSIMDModImmType12(uint64_t Imm) { - uint8_t BitA = (Imm & 0x8000000000000000ULL) != 0; - uint8_t BitB = (Imm & 0x0040000000000000ULL) != 0; - uint8_t BitC = (Imm & 0x0020000000000000ULL) != 0; - uint8_t BitD = (Imm & 0x0010000000000000ULL) != 0; - uint8_t BitE = (Imm & 0x0008000000000000ULL) != 0; - uint8_t BitF = (Imm & 0x0004000000000000ULL) != 0; - uint8_t BitG = (Imm & 0x0002000000000000ULL) != 0; - uint8_t BitH = (Imm & 0x0001000000000000ULL) != 0; - - uint8_t EncVal = BitA; - EncVal <<= 1; - EncVal |= BitB; - EncVal <<= 1; - EncVal |= BitC; - EncVal <<= 1; - EncVal |= BitD; - EncVal <<= 1; - EncVal |= BitE; - EncVal <<= 1; - EncVal |= BitF; - EncVal <<= 1; - EncVal |= BitG; - EncVal <<= 1; - EncVal |= BitH; - return EncVal; -} - -static inline uint64_t decodeAdvSIMDModImmType12(uint8_t Imm) { - uint64_t EncVal = 0; - if (Imm & 0x80) EncVal |= 0x8000000000000000ULL; - if (Imm & 0x40) EncVal |= 0x3fc0000000000000ULL; - else EncVal |= 0x4000000000000000ULL; - if (Imm & 0x20) EncVal |= 0x0020000000000000ULL; - if (Imm & 0x10) EncVal |= 0x0010000000000000ULL; - if (Imm & 0x08) EncVal |= 0x0008000000000000ULL; - if (Imm & 0x04) EncVal |= 0x0004000000000000ULL; - if (Imm & 0x02) EncVal |= 0x0002000000000000ULL; - if (Imm & 0x01) EncVal |= 0x0001000000000000ULL; - return (EncVal << 32) | EncVal; -} - -} // end namespace ARM64_AM - -} // end namespace llvm - -#endif diff --git a/lib/Target/ARM64/MCTargetDesc/ARM64AsmBackend.cpp b/lib/Target/ARM64/MCTargetDesc/ARM64AsmBackend.cpp deleted file mode 100644 index ba5025ab620..00000000000 --- 
a/lib/Target/ARM64/MCTargetDesc/ARM64AsmBackend.cpp +++ /dev/null @@ -1,564 +0,0 @@ -//===-- ARM64AsmBackend.cpp - ARM64 Assembler Backend ---------------------===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// - -#include "ARM64.h" -#include "ARM64RegisterInfo.h" -#include "MCTargetDesc/ARM64FixupKinds.h" -#include "llvm/ADT/Triple.h" -#include "llvm/MC/MCAsmBackend.h" -#include "llvm/MC/MCDirectives.h" -#include "llvm/MC/MCFixupKindInfo.h" -#include "llvm/MC/MCObjectWriter.h" -#include "llvm/MC/MCSectionMachO.h" -#include "llvm/MC/MCSectionELF.h" -#include "llvm/Support/ErrorHandling.h" -#include "llvm/Support/MachO.h" -using namespace llvm; - -namespace { - -class ARM64AsmBackend : public MCAsmBackend { - static const unsigned PCRelFlagVal = - MCFixupKindInfo::FKF_IsAlignedDownTo32Bits | MCFixupKindInfo::FKF_IsPCRel; - -public: - ARM64AsmBackend(const Target &T) : MCAsmBackend() {} - - unsigned getNumFixupKinds() const override { - return ARM64::NumTargetFixupKinds; - } - - const MCFixupKindInfo &getFixupKindInfo(MCFixupKind Kind) const override { - const static MCFixupKindInfo Infos[ARM64::NumTargetFixupKinds] = { - // This table *must* be in the order that the fixup_* kinds are defined in - // ARM64FixupKinds.h. - // - // Name Offset (bits) Size (bits) Flags - { "fixup_arm64_pcrel_adr_imm21", 0, 32, PCRelFlagVal }, - { "fixup_arm64_pcrel_adrp_imm21", 0, 32, PCRelFlagVal }, - { "fixup_arm64_add_imm12", 10, 12, 0 }, - { "fixup_arm64_ldst_imm12_scale1", 10, 12, 0 }, - { "fixup_arm64_ldst_imm12_scale2", 10, 12, 0 }, - { "fixup_arm64_ldst_imm12_scale4", 10, 12, 0 }, - { "fixup_arm64_ldst_imm12_scale8", 10, 12, 0 }, - { "fixup_arm64_ldst_imm12_scale16", 10, 12, 0 }, - { "fixup_arm64_ldr_pcrel_imm19", 5, 19, PCRelFlagVal }, - { "fixup_arm64_movw", 5, 16, 0 }, - { "fixup_arm64_pcrel_branch14", 5, 14, PCRelFlagVal }, - { "fixup_arm64_pcrel_branch19", 5, 19, PCRelFlagVal }, - { "fixup_arm64_pcrel_branch26", 0, 26, PCRelFlagVal }, - { "fixup_arm64_pcrel_call26", 0, 26, PCRelFlagVal }, - { "fixup_arm64_tlsdesc_call", 0, 0, 0 } - }; - - if (Kind < FirstTargetFixupKind) - return MCAsmBackend::getFixupKindInfo(Kind); - - assert(unsigned(Kind - FirstTargetFixupKind) < getNumFixupKinds() && - "Invalid kind!"); - return Infos[Kind - FirstTargetFixupKind]; - } - - void applyFixup(const MCFixup &Fixup, char *Data, unsigned DataSize, - uint64_t Value, bool IsPCRel) const override; - - bool mayNeedRelaxation(const MCInst &Inst) const override; - bool fixupNeedsRelaxation(const MCFixup &Fixup, uint64_t Value, - const MCRelaxableFragment *DF, - const MCAsmLayout &Layout) const override; - void relaxInstruction(const MCInst &Inst, MCInst &Res) const override; - bool writeNopData(uint64_t Count, MCObjectWriter *OW) const override; - - void HandleAssemblerFlag(MCAssemblerFlag Flag) {} - - unsigned getPointerSize() const { return 8; } -}; - -} // end anonymous namespace - -/// \brief The number of bytes the fixup may change. 
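To illustrate what the fixup table above encodes: fixup_arm64_add_imm12 occupies 12 bits starting at bit 10 of the instruction word, so applyFixup shifts the adjusted value left by that TargetOffset and ORs it into the fragment bytes. A minimal sketch; the opcode skeleton here is only an example, not taken from the backend.

#include <cassert>
#include <cstdint>

int main() {
  uint32_t Inst = 0x91000000;       // illustrative ADD (immediate) skeleton
  uint64_t FixupValue = 0x123;      // resolved 12-bit immediate
  unsigned TargetOffset = 10;       // from the fixup_arm64_add_imm12 entry

  Inst |= uint32_t(FixupValue << TargetOffset);   // what applyFixup effectively does
  assert(((Inst >> 10) & 0xfff) == 0x123);        // value landed in bits [21:10]
  return 0;
}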
-static unsigned getFixupKindNumBytes(unsigned Kind) { - switch (Kind) { - default: - assert(0 && "Unknown fixup kind!"); - - case ARM64::fixup_arm64_tlsdesc_call: - return 0; - - case FK_Data_1: - return 1; - - case FK_Data_2: - case ARM64::fixup_arm64_movw: - return 2; - - case ARM64::fixup_arm64_pcrel_branch14: - case ARM64::fixup_arm64_add_imm12: - case ARM64::fixup_arm64_ldst_imm12_scale1: - case ARM64::fixup_arm64_ldst_imm12_scale2: - case ARM64::fixup_arm64_ldst_imm12_scale4: - case ARM64::fixup_arm64_ldst_imm12_scale8: - case ARM64::fixup_arm64_ldst_imm12_scale16: - case ARM64::fixup_arm64_ldr_pcrel_imm19: - case ARM64::fixup_arm64_pcrel_branch19: - return 3; - - case ARM64::fixup_arm64_pcrel_adr_imm21: - case ARM64::fixup_arm64_pcrel_adrp_imm21: - case ARM64::fixup_arm64_pcrel_branch26: - case ARM64::fixup_arm64_pcrel_call26: - case FK_Data_4: - return 4; - - case FK_Data_8: - return 8; - } -} - -static unsigned AdrImmBits(unsigned Value) { - unsigned lo2 = Value & 0x3; - unsigned hi19 = (Value & 0x1ffffc) >> 2; - return (hi19 << 5) | (lo2 << 29); -} - -static uint64_t adjustFixupValue(unsigned Kind, uint64_t Value) { - int64_t SignedValue = static_cast(Value); - switch (Kind) { - default: - assert(false && "Unknown fixup kind!"); - case ARM64::fixup_arm64_pcrel_adr_imm21: - if (SignedValue > 2097151 || SignedValue < -2097152) - report_fatal_error("fixup value out of range"); - return AdrImmBits(Value & 0x1fffffULL); - case ARM64::fixup_arm64_pcrel_adrp_imm21: - return AdrImmBits((Value & 0x1fffff000ULL) >> 12); - case ARM64::fixup_arm64_ldr_pcrel_imm19: - case ARM64::fixup_arm64_pcrel_branch19: - // Signed 21-bit immediate - if (SignedValue > 2097151 || SignedValue < -2097152) - report_fatal_error("fixup value out of range"); - // Low two bits are not encoded. - return (Value >> 2) & 0x7ffff; - case ARM64::fixup_arm64_add_imm12: - case ARM64::fixup_arm64_ldst_imm12_scale1: - // Unsigned 12-bit immediate - if (Value >= 0x1000) - report_fatal_error("invalid imm12 fixup value"); - return Value; - case ARM64::fixup_arm64_ldst_imm12_scale2: - // Unsigned 12-bit immediate which gets multiplied by 2 - if (Value & 1 || Value >= 0x2000) - report_fatal_error("invalid imm12 fixup value"); - return Value >> 1; - case ARM64::fixup_arm64_ldst_imm12_scale4: - // Unsigned 12-bit immediate which gets multiplied by 4 - if (Value & 3 || Value >= 0x4000) - report_fatal_error("invalid imm12 fixup value"); - return Value >> 2; - case ARM64::fixup_arm64_ldst_imm12_scale8: - // Unsigned 12-bit immediate which gets multiplied by 8 - if (Value & 7 || Value >= 0x8000) - report_fatal_error("invalid imm12 fixup value"); - return Value >> 3; - case ARM64::fixup_arm64_ldst_imm12_scale16: - // Unsigned 12-bit immediate which gets multiplied by 16 - if (Value & 15 || Value >= 0x10000) - report_fatal_error("invalid imm12 fixup value"); - return Value >> 4; - case ARM64::fixup_arm64_movw: - report_fatal_error("no resolvable MOVZ/MOVK fixups supported yet"); - return Value; - case ARM64::fixup_arm64_pcrel_branch14: - // Signed 16-bit immediate - if (SignedValue > 32767 || SignedValue < -32768) - report_fatal_error("fixup value out of range"); - // Low two bits are not encoded (4-byte alignment assumed). 
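A small illustration of the imm12 scaling performed by adjustFixupValue: for an 8-byte load/store fixup the resolved byte offset is range-checked and divided by the access size before encoding. The offset value is made up.

#include <cassert>
#include <cstdint>

int main() {
  uint64_t Value = 0x40;                        // resolved byte offset
  assert((Value & 7) == 0 && Value < 0x8000);   // the checks the backend performs
  uint64_t Imm12 = Value >> 3;                  // scale for an 8-byte access
  assert(Imm12 == 8);                           // offset 0x40 encodes as imm12 = 8
  return 0;
}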
- if (Value & 0x3) - report_fatal_error("fixup not sufficiently aligned"); - return (Value >> 2) & 0x3fff; - case ARM64::fixup_arm64_pcrel_branch26: - case ARM64::fixup_arm64_pcrel_call26: - // Signed 28-bit immediate - if (SignedValue > 134217727 || SignedValue < -134217728) - report_fatal_error("fixup value out of range"); - // Low two bits are not encoded (4-byte alignment assumed). - if (Value & 0x3) - report_fatal_error("fixup not sufficiently aligned"); - return (Value >> 2) & 0x3ffffff; - case FK_Data_1: - case FK_Data_2: - case FK_Data_4: - case FK_Data_8: - return Value; - } -} - -void ARM64AsmBackend::applyFixup(const MCFixup &Fixup, char *Data, - unsigned DataSize, uint64_t Value, - bool IsPCRel) const { - unsigned NumBytes = getFixupKindNumBytes(Fixup.getKind()); - if (!Value) - return; // Doesn't change encoding. - MCFixupKindInfo Info = getFixupKindInfo(Fixup.getKind()); - // Apply any target-specific value adjustments. - Value = adjustFixupValue(Fixup.getKind(), Value); - - // Shift the value into position. - Value <<= Info.TargetOffset; - - unsigned Offset = Fixup.getOffset(); - assert(Offset + NumBytes <= DataSize && "Invalid fixup offset!"); - - // For each byte of the fragment that the fixup touches, mask in the - // bits from the fixup value. - for (unsigned i = 0; i != NumBytes; ++i) - Data[Offset + i] |= uint8_t((Value >> (i * 8)) & 0xff); -} - -bool ARM64AsmBackend::mayNeedRelaxation(const MCInst &Inst) const { - return false; -} - -bool ARM64AsmBackend::fixupNeedsRelaxation(const MCFixup &Fixup, uint64_t Value, - const MCRelaxableFragment *DF, - const MCAsmLayout &Layout) const { - // FIXME: This isn't correct for ARM64. Just moving the "generic" logic - // into the targets for now. - // - // Relax if the value is too big for a (signed) i8. - return int64_t(Value) != int64_t(int8_t(Value)); -} - -void ARM64AsmBackend::relaxInstruction(const MCInst &Inst, MCInst &Res) const { - assert(false && "ARM64AsmBackend::relaxInstruction() unimplemented"); -} - -bool ARM64AsmBackend::writeNopData(uint64_t Count, MCObjectWriter *OW) const { - // If the count is not 4-byte aligned, we must be writing data into the text - // section (otherwise we have unaligned instructions, and thus have far - // bigger problems), so just write zeros instead. - if ((Count & 3) != 0) { - for (uint64_t i = 0, e = (Count & 3); i != e; ++i) - OW->Write8(0); - } - - // We are properly aligned, so write NOPs as requested. - Count /= 4; - for (uint64_t i = 0; i != Count; ++i) - OW->Write32(0xd503201f); - return true; -} - -namespace { - -namespace CU { - -/// \brief Compact unwind encoding values. -enum CompactUnwindEncodings { - /// \brief A "frameless" leaf function, where no non-volatile registers are - /// saved. The return remains in LR throughout the function. - UNWIND_ARM64_MODE_FRAMELESS = 0x02000000, - - /// \brief No compact unwind encoding available. Instead the low 23-bits of - /// the compact unwind encoding is the offset of the DWARF FDE in the - /// __eh_frame section. This mode is never used in object files. It is only - /// generated by the linker in final linked images, which have only DWARF info - /// for a function. - UNWIND_ARM64_MODE_DWARF = 0x03000000, - - /// \brief This is a standard arm64 prologue where FP/LR are immediately - /// pushed on the stack, then SP is copied to FP. If there are any - /// non-volatile register saved, they are copied into the stack fame in pairs - /// in a contiguous ranger right below the saved FP/LR pair. 
Any subset of the - /// five X pairs and four D pairs can be saved, but the memory layout must be - /// in register number order. - UNWIND_ARM64_MODE_FRAME = 0x04000000, - - /// \brief Frame register pair encodings. - UNWIND_ARM64_FRAME_X19_X20_PAIR = 0x00000001, - UNWIND_ARM64_FRAME_X21_X22_PAIR = 0x00000002, - UNWIND_ARM64_FRAME_X23_X24_PAIR = 0x00000004, - UNWIND_ARM64_FRAME_X25_X26_PAIR = 0x00000008, - UNWIND_ARM64_FRAME_X27_X28_PAIR = 0x00000010, - UNWIND_ARM64_FRAME_D8_D9_PAIR = 0x00000100, - UNWIND_ARM64_FRAME_D10_D11_PAIR = 0x00000200, - UNWIND_ARM64_FRAME_D12_D13_PAIR = 0x00000400, - UNWIND_ARM64_FRAME_D14_D15_PAIR = 0x00000800 -}; - -} // end CU namespace - -// FIXME: This should be in a separate file. -class DarwinARM64AsmBackend : public ARM64AsmBackend { - const MCRegisterInfo &MRI; - - /// \brief Encode compact unwind stack adjustment for frameless functions. - /// See UNWIND_ARM64_FRAMELESS_STACK_SIZE_MASK in compact_unwind_encoding.h. - /// The stack size always needs to be 16 byte aligned. - uint32_t encodeStackAdjustment(uint32_t StackSize) const { - return (StackSize / 16) << 12; - } - -public: - DarwinARM64AsmBackend(const Target &T, const MCRegisterInfo &MRI) - : ARM64AsmBackend(T), MRI(MRI) {} - - MCObjectWriter *createObjectWriter(raw_ostream &OS) const override { - return createARM64MachObjectWriter(OS, MachO::CPU_TYPE_ARM64, - MachO::CPU_SUBTYPE_ARM64_ALL); - } - - bool doesSectionRequireSymbols(const MCSection &Section) const override { - // Any section for which the linker breaks things into atoms needs to - // preserve symbols, including assembler local symbols, to identify - // those atoms. These sections are: - // Sections of type: - // - // S_CSTRING_LITERALS (e.g. __cstring) - // S_LITERAL_POINTERS (e.g. objc selector pointers) - // S_16BYTE_LITERALS, S_8BYTE_LITERALS, S_4BYTE_LITERALS - // - // Sections named: - // - // __TEXT,__eh_frame - // __TEXT,__ustring - // __DATA,__cfstring - // __DATA,__objc_classrefs - // __DATA,__objc_catlist - // - // FIXME: It would be better if the compiler used actual linker local - // symbols for each of these sections rather than preserving what - // are ostensibly assembler local symbols. - const MCSectionMachO &SMO = static_cast(Section); - return (SMO.getType() == MachO::S_CSTRING_LITERALS || - SMO.getType() == MachO::S_4BYTE_LITERALS || - SMO.getType() == MachO::S_8BYTE_LITERALS || - SMO.getType() == MachO::S_16BYTE_LITERALS || - SMO.getType() == MachO::S_LITERAL_POINTERS || - (SMO.getSegmentName() == "__TEXT" && - (SMO.getSectionName() == "__eh_frame" || - SMO.getSectionName() == "__ustring")) || - (SMO.getSegmentName() == "__DATA" && - (SMO.getSectionName() == "__cfstring" || - SMO.getSectionName() == "__objc_classrefs" || - SMO.getSectionName() == "__objc_catlist"))); - } - - /// \brief Generate the compact unwind encoding from the CFI directives. - uint32_t generateCompactUnwindEncoding( - ArrayRef Instrs) const override { - if (Instrs.empty()) - return CU::UNWIND_ARM64_MODE_FRAMELESS; - - bool HasFP = false; - unsigned StackSize = 0; - - uint32_t CompactUnwindEncoding = 0; - for (size_t i = 0, e = Instrs.size(); i != e; ++i) { - const MCCFIInstruction &Inst = Instrs[i]; - - switch (Inst.getOperation()) { - default: - // Cannot handle this directive: bail out. - return CU::UNWIND_ARM64_MODE_DWARF; - case MCCFIInstruction::OpDefCfa: { - // Defines a frame pointer. 
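For the frameless case handled at the end of generateCompactUnwindEncoding, the encoding is just the FRAMELESS mode bits plus the 16-byte-scaled stack size placed at bit 12. A quick numeric sketch using the CU constant shown above; the stack size is an example value.

#include <cassert>
#include <cstdint>

int main() {
  const uint32_t UNWIND_ARM64_MODE_FRAMELESS = 0x02000000;
  uint32_t StackSize = 64;   // bytes; must be 16-byte aligned

  // encodeStackAdjustment(StackSize) == (StackSize / 16) << 12
  uint32_t Encoding = UNWIND_ARM64_MODE_FRAMELESS | ((StackSize / 16) << 12);
  assert(Encoding == 0x02004000);
  return 0;
}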
- assert(getXRegFromWReg(MRI.getLLVMRegNum(Inst.getRegister(), true)) == - ARM64::FP && - "Invalid frame pointer!"); - assert(i + 2 < e && "Insufficient CFI instructions to define a frame!"); - - const MCCFIInstruction &LRPush = Instrs[++i]; - assert(LRPush.getOperation() == MCCFIInstruction::OpOffset && - "Link register not pushed!"); - const MCCFIInstruction &FPPush = Instrs[++i]; - assert(FPPush.getOperation() == MCCFIInstruction::OpOffset && - "Frame pointer not pushed!"); - - unsigned LRReg = MRI.getLLVMRegNum(LRPush.getRegister(), true); - unsigned FPReg = MRI.getLLVMRegNum(FPPush.getRegister(), true); - - LRReg = getXRegFromWReg(LRReg); - FPReg = getXRegFromWReg(FPReg); - - assert(LRReg == ARM64::LR && FPReg == ARM64::FP && - "Pushing invalid registers for frame!"); - - // Indicate that the function has a frame. - CompactUnwindEncoding |= CU::UNWIND_ARM64_MODE_FRAME; - HasFP = true; - break; - } - case MCCFIInstruction::OpDefCfaOffset: { - assert(StackSize == 0 && "We already have the CFA offset!"); - StackSize = std::abs(Inst.getOffset()); - break; - } - case MCCFIInstruction::OpOffset: { - // Registers are saved in pairs. We expect there to be two consecutive - // `.cfi_offset' instructions with the appropriate registers specified. - unsigned Reg1 = MRI.getLLVMRegNum(Inst.getRegister(), true); - if (i + 1 == e) - return CU::UNWIND_ARM64_MODE_DWARF; - - const MCCFIInstruction &Inst2 = Instrs[++i]; - if (Inst2.getOperation() != MCCFIInstruction::OpOffset) - return CU::UNWIND_ARM64_MODE_DWARF; - unsigned Reg2 = MRI.getLLVMRegNum(Inst2.getRegister(), true); - - // N.B. The encodings must be in register number order, and the X - // registers before the D registers. - - // X19/X20 pair = 0x00000001, - // X21/X22 pair = 0x00000002, - // X23/X24 pair = 0x00000004, - // X25/X26 pair = 0x00000008, - // X27/X28 pair = 0x00000010 - Reg1 = getXRegFromWReg(Reg1); - Reg2 = getXRegFromWReg(Reg2); - - if (Reg1 == ARM64::X19 && Reg2 == ARM64::X20 && - (CompactUnwindEncoding & 0xF1E) == 0) - CompactUnwindEncoding |= CU::UNWIND_ARM64_FRAME_X19_X20_PAIR; - else if (Reg1 == ARM64::X21 && Reg2 == ARM64::X22 && - (CompactUnwindEncoding & 0xF1C) == 0) - CompactUnwindEncoding |= CU::UNWIND_ARM64_FRAME_X21_X22_PAIR; - else if (Reg1 == ARM64::X23 && Reg2 == ARM64::X24 && - (CompactUnwindEncoding & 0xF18) == 0) - CompactUnwindEncoding |= CU::UNWIND_ARM64_FRAME_X23_X24_PAIR; - else if (Reg1 == ARM64::X25 && Reg2 == ARM64::X26 && - (CompactUnwindEncoding & 0xF10) == 0) - CompactUnwindEncoding |= CU::UNWIND_ARM64_FRAME_X25_X26_PAIR; - else if (Reg1 == ARM64::X27 && Reg2 == ARM64::X28 && - (CompactUnwindEncoding & 0xF00) == 0) - CompactUnwindEncoding |= CU::UNWIND_ARM64_FRAME_X27_X28_PAIR; - else { - Reg1 = getDRegFromBReg(Reg1); - Reg2 = getDRegFromBReg(Reg2); - - // D8/D9 pair = 0x00000100, - // D10/D11 pair = 0x00000200, - // D12/D13 pair = 0x00000400, - // D14/D15 pair = 0x00000800 - if (Reg1 == ARM64::D8 && Reg2 == ARM64::D9 && - (CompactUnwindEncoding & 0xE00) == 0) - CompactUnwindEncoding |= CU::UNWIND_ARM64_FRAME_D8_D9_PAIR; - else if (Reg1 == ARM64::D10 && Reg2 == ARM64::D11 && - (CompactUnwindEncoding & 0xC00) == 0) - CompactUnwindEncoding |= CU::UNWIND_ARM64_FRAME_D10_D11_PAIR; - else if (Reg1 == ARM64::D12 && Reg2 == ARM64::D13 && - (CompactUnwindEncoding & 0x800) == 0) - CompactUnwindEncoding |= CU::UNWIND_ARM64_FRAME_D12_D13_PAIR; - else if (Reg1 == ARM64::D14 && Reg2 == ARM64::D15) - CompactUnwindEncoding |= CU::UNWIND_ARM64_FRAME_D14_D15_PAIR; - else - // A pair was pushed which we cannot handle. 
- return CU::UNWIND_ARM64_MODE_DWARF; - } - - break; - } - } - } - - if (!HasFP) { - // With compact unwind info we can only represent stack adjustments of up - // to 65520 bytes. - if (StackSize > 65520) - return CU::UNWIND_ARM64_MODE_DWARF; - - CompactUnwindEncoding |= CU::UNWIND_ARM64_MODE_FRAMELESS; - CompactUnwindEncoding |= encodeStackAdjustment(StackSize); - } - - return CompactUnwindEncoding; - } -}; - -} // end anonymous namespace - -namespace { - -class ELFARM64AsmBackend : public ARM64AsmBackend { -public: - uint8_t OSABI; - bool IsLittleEndian; - - ELFARM64AsmBackend(const Target &T, uint8_t OSABI, bool IsLittleEndian) - : ARM64AsmBackend(T), OSABI(OSABI), IsLittleEndian(IsLittleEndian) {} - - MCObjectWriter *createObjectWriter(raw_ostream &OS) const override { - return createARM64ELFObjectWriter(OS, OSABI, IsLittleEndian); - } - - void processFixupValue(const MCAssembler &Asm, const MCAsmLayout &Layout, - const MCFixup &Fixup, const MCFragment *DF, - const MCValue &Target, uint64_t &Value, - bool &IsResolved) override; - - void applyFixup(const MCFixup &Fixup, char *Data, unsigned DataSize, - uint64_t Value, bool IsPCRel) const override; -}; - -void ELFARM64AsmBackend::processFixupValue(const MCAssembler &Asm, - const MCAsmLayout &Layout, - const MCFixup &Fixup, - const MCFragment *DF, - const MCValue &Target, - uint64_t &Value, bool &IsResolved) { - // The ADRP instruction adds some multiple of 0x1000 to the current PC & - // ~0xfff. This means that the required offset to reach a symbol can vary by - // up to one step depending on where the ADRP is in memory. For example: - // - // ADRP x0, there - // there: - // - // If the ADRP occurs at address 0xffc then "there" will be at 0x1000 and - // we'll need that as an offset. At any other address "there" will be in the - // same page as the ADRP and the instruction should encode 0x0. Assuming the - // section isn't 0x1000-aligned, we therefore need to delegate this decision - // to the linker -- a relocation! 
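A numeric restatement of the ADRP comment above: the instruction materialises page(target) - page(pc), so the immediate needed for the same symbol differs with the page the ADRP itself sits in. The addresses are invented for the example.

#include <cassert>
#include <cstdint>

// Page delta an ADRP at PC would need to reach Target (4 KiB pages).
static int64_t adrpDelta(uint64_t PC, uint64_t Target) {
  return int64_t(Target & ~0xfffULL) - int64_t(PC & ~0xfffULL);
}

int main() {
  uint64_t There = 0x1000;                              // label just after the ADRP
  assert(adrpDelta(/*PC=*/0x0ffc, There) == 0x1000);    // ADRP at 0xffc: one page away
  assert(adrpDelta(/*PC=*/0x1000, There) == 0);         // ADRP in the same page: zero
  return 0;
}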
- if ((uint32_t)Fixup.getKind() == ARM64::fixup_arm64_pcrel_adrp_imm21) - IsResolved = false; -} - -void ELFARM64AsmBackend::applyFixup(const MCFixup &Fixup, char *Data, - unsigned DataSize, uint64_t Value, - bool IsPCRel) const { - // store fixups in .eh_frame section in big endian order - if (!IsLittleEndian && Fixup.getKind() == FK_Data_4) { - const MCSection *Sec = Fixup.getValue()->FindAssociatedSection(); - const MCSectionELF *SecELF = static_cast(Sec); - if (SecELF->getSectionName() == ".eh_frame") - Value = ByteSwap_32(unsigned(Value)); - } - ARM64AsmBackend::applyFixup (Fixup, Data, DataSize, Value, IsPCRel); -} -} - -MCAsmBackend *llvm::createARM64leAsmBackend(const Target &T, - const MCRegisterInfo &MRI, - StringRef TT, StringRef CPU) { - Triple TheTriple(TT); - - if (TheTriple.isOSDarwin()) - return new DarwinARM64AsmBackend(T, MRI); - - assert(TheTriple.isOSBinFormatELF() && "Expect either MachO or ELF target"); - return new ELFARM64AsmBackend(T, TheTriple.getOS(), /*IsLittleEndian=*/true); -} - -MCAsmBackend *llvm::createARM64beAsmBackend(const Target &T, - const MCRegisterInfo &MRI, - StringRef TT, StringRef CPU) { - Triple TheTriple(TT); - - assert(TheTriple.isOSBinFormatELF() && "Big endian is only supported for ELF targets!"); - return new ELFARM64AsmBackend(T, TheTriple.getOS(), /*IsLittleEndian=*/false); -} diff --git a/lib/Target/ARM64/MCTargetDesc/ARM64ELFObjectWriter.cpp b/lib/Target/ARM64/MCTargetDesc/ARM64ELFObjectWriter.cpp deleted file mode 100644 index 0990a701bc8..00000000000 --- a/lib/Target/ARM64/MCTargetDesc/ARM64ELFObjectWriter.cpp +++ /dev/null @@ -1,255 +0,0 @@ -//===-- ARM64ELFObjectWriter.cpp - ARM64 ELF Writer -----------------------===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -// This file handles ELF-specific object emission, converting LLVM's internal -// fixups into the appropriate relocations. 
-// -//===----------------------------------------------------------------------===// - -#include "MCTargetDesc/ARM64FixupKinds.h" -#include "MCTargetDesc/ARM64MCExpr.h" -#include "MCTargetDesc/ARM64MCTargetDesc.h" -#include "llvm/MC/MCELFObjectWriter.h" -#include "llvm/MC/MCValue.h" -#include "llvm/Support/ErrorHandling.h" - -using namespace llvm; - -namespace { -class ARM64ELFObjectWriter : public MCELFObjectTargetWriter { -public: - ARM64ELFObjectWriter(uint8_t OSABI, bool IsLittleEndian); - - virtual ~ARM64ELFObjectWriter(); - -protected: - unsigned GetRelocType(const MCValue &Target, const MCFixup &Fixup, - bool IsPCRel) const override; - -private: -}; -} - -ARM64ELFObjectWriter::ARM64ELFObjectWriter(uint8_t OSABI, bool IsLittleEndian) - : MCELFObjectTargetWriter(/*Is64Bit*/ true, OSABI, ELF::EM_AARCH64, - /*HasRelocationAddend*/ true) {} - -ARM64ELFObjectWriter::~ARM64ELFObjectWriter() {} - -unsigned ARM64ELFObjectWriter::GetRelocType(const MCValue &Target, - const MCFixup &Fixup, - bool IsPCRel) const { - ARM64MCExpr::VariantKind RefKind = - static_cast(Target.getRefKind()); - ARM64MCExpr::VariantKind SymLoc = ARM64MCExpr::getSymbolLoc(RefKind); - bool IsNC = ARM64MCExpr::isNotChecked(RefKind); - - assert((!Target.getSymA() || - Target.getSymA()->getKind() == MCSymbolRefExpr::VK_None) && - "Should only be expression-level modifiers here"); - - assert((!Target.getSymB() || - Target.getSymB()->getKind() == MCSymbolRefExpr::VK_None) && - "Should only be expression-level modifiers here"); - - if (IsPCRel) { - switch ((unsigned)Fixup.getKind()) { - case FK_Data_2: - return ELF::R_AARCH64_PREL16; - case FK_Data_4: - return ELF::R_AARCH64_PREL32; - case FK_Data_8: - return ELF::R_AARCH64_PREL64; - case ARM64::fixup_arm64_pcrel_adr_imm21: - assert(SymLoc == ARM64MCExpr::VK_NONE && "unexpected ADR relocation"); - return ELF::R_AARCH64_ADR_PREL_LO21; - case ARM64::fixup_arm64_pcrel_adrp_imm21: - if (SymLoc == ARM64MCExpr::VK_ABS && !IsNC) - return ELF::R_AARCH64_ADR_PREL_PG_HI21; - if (SymLoc == ARM64MCExpr::VK_GOT && !IsNC) - return ELF::R_AARCH64_ADR_GOT_PAGE; - if (SymLoc == ARM64MCExpr::VK_GOTTPREL && !IsNC) - return ELF::R_AARCH64_TLSIE_ADR_GOTTPREL_PAGE21; - if (SymLoc == ARM64MCExpr::VK_TLSDESC && !IsNC) - return ELF::R_AARCH64_TLSDESC_ADR_PAGE; - llvm_unreachable("invalid symbol kind for ADRP relocation"); - case ARM64::fixup_arm64_pcrel_branch26: - return ELF::R_AARCH64_JUMP26; - case ARM64::fixup_arm64_pcrel_call26: - return ELF::R_AARCH64_CALL26; - case ARM64::fixup_arm64_ldr_pcrel_imm19: - if (SymLoc == ARM64MCExpr::VK_GOTTPREL) - return ELF::R_AARCH64_TLSIE_LD_GOTTPREL_PREL19; - return ELF::R_AARCH64_LD_PREL_LO19; - case ARM64::fixup_arm64_pcrel_branch14: - return ELF::R_AARCH64_TSTBR14; - case ARM64::fixup_arm64_pcrel_branch19: - return ELF::R_AARCH64_CONDBR19; - default: - llvm_unreachable("Unsupported pc-relative fixup kind"); - } - } else { - switch ((unsigned)Fixup.getKind()) { - case FK_Data_2: - return ELF::R_AARCH64_ABS16; - case FK_Data_4: - return ELF::R_AARCH64_ABS32; - case FK_Data_8: - return ELF::R_AARCH64_ABS64; - case ARM64::fixup_arm64_add_imm12: - if (RefKind == ARM64MCExpr::VK_DTPREL_HI12) - return ELF::R_AARCH64_TLSLD_ADD_DTPREL_HI12; - if (RefKind == ARM64MCExpr::VK_TPREL_HI12) - return ELF::R_AARCH64_TLSLE_ADD_TPREL_HI12; - if (RefKind == ARM64MCExpr::VK_DTPREL_LO12_NC) - return ELF::R_AARCH64_TLSLD_ADD_DTPREL_LO12_NC; - if (RefKind == ARM64MCExpr::VK_DTPREL_LO12) - return ELF::R_AARCH64_TLSLD_ADD_DTPREL_LO12; - if (RefKind == ARM64MCExpr::VK_TPREL_LO12_NC) 
- return ELF::R_AARCH64_TLSLE_ADD_TPREL_LO12_NC; - if (RefKind == ARM64MCExpr::VK_TPREL_LO12) - return ELF::R_AARCH64_TLSLE_ADD_TPREL_LO12; - if (RefKind == ARM64MCExpr::VK_TLSDESC_LO12) - return ELF::R_AARCH64_TLSDESC_ADD_LO12_NC; - if (SymLoc == ARM64MCExpr::VK_ABS && IsNC) - return ELF::R_AARCH64_ADD_ABS_LO12_NC; - - report_fatal_error("invalid fixup for add (uimm12) instruction"); - return 0; - case ARM64::fixup_arm64_ldst_imm12_scale1: - if (SymLoc == ARM64MCExpr::VK_ABS && IsNC) - return ELF::R_AARCH64_LDST8_ABS_LO12_NC; - if (SymLoc == ARM64MCExpr::VK_DTPREL && !IsNC) - return ELF::R_AARCH64_TLSLD_LDST8_DTPREL_LO12; - if (SymLoc == ARM64MCExpr::VK_DTPREL && IsNC) - return ELF::R_AARCH64_TLSLD_LDST8_DTPREL_LO12_NC; - if (SymLoc == ARM64MCExpr::VK_TPREL && !IsNC) - return ELF::R_AARCH64_TLSLE_LDST8_TPREL_LO12; - if (SymLoc == ARM64MCExpr::VK_TPREL && IsNC) - return ELF::R_AARCH64_TLSLE_LDST8_TPREL_LO12_NC; - - report_fatal_error("invalid fixup for 8-bit load/store instruction"); - return 0; - case ARM64::fixup_arm64_ldst_imm12_scale2: - if (SymLoc == ARM64MCExpr::VK_ABS && IsNC) - return ELF::R_AARCH64_LDST16_ABS_LO12_NC; - if (SymLoc == ARM64MCExpr::VK_DTPREL && !IsNC) - return ELF::R_AARCH64_TLSLD_LDST16_DTPREL_LO12; - if (SymLoc == ARM64MCExpr::VK_DTPREL && IsNC) - return ELF::R_AARCH64_TLSLD_LDST16_DTPREL_LO12_NC; - if (SymLoc == ARM64MCExpr::VK_TPREL && !IsNC) - return ELF::R_AARCH64_TLSLE_LDST16_TPREL_LO12; - if (SymLoc == ARM64MCExpr::VK_TPREL && IsNC) - return ELF::R_AARCH64_TLSLE_LDST16_TPREL_LO12_NC; - - report_fatal_error("invalid fixup for 16-bit load/store instruction"); - return 0; - case ARM64::fixup_arm64_ldst_imm12_scale4: - if (SymLoc == ARM64MCExpr::VK_ABS && IsNC) - return ELF::R_AARCH64_LDST32_ABS_LO12_NC; - if (SymLoc == ARM64MCExpr::VK_DTPREL && !IsNC) - return ELF::R_AARCH64_TLSLD_LDST32_DTPREL_LO12; - if (SymLoc == ARM64MCExpr::VK_DTPREL && IsNC) - return ELF::R_AARCH64_TLSLD_LDST32_DTPREL_LO12_NC; - if (SymLoc == ARM64MCExpr::VK_TPREL && !IsNC) - return ELF::R_AARCH64_TLSLE_LDST32_TPREL_LO12; - if (SymLoc == ARM64MCExpr::VK_TPREL && IsNC) - return ELF::R_AARCH64_TLSLE_LDST32_TPREL_LO12_NC; - - report_fatal_error("invalid fixup for 32-bit load/store instruction"); - return 0; - case ARM64::fixup_arm64_ldst_imm12_scale8: - if (SymLoc == ARM64MCExpr::VK_ABS && IsNC) - return ELF::R_AARCH64_LDST64_ABS_LO12_NC; - if (SymLoc == ARM64MCExpr::VK_GOT && IsNC) - return ELF::R_AARCH64_LD64_GOT_LO12_NC; - if (SymLoc == ARM64MCExpr::VK_DTPREL && !IsNC) - return ELF::R_AARCH64_TLSLD_LDST64_DTPREL_LO12; - if (SymLoc == ARM64MCExpr::VK_DTPREL && IsNC) - return ELF::R_AARCH64_TLSLD_LDST64_DTPREL_LO12_NC; - if (SymLoc == ARM64MCExpr::VK_TPREL && !IsNC) - return ELF::R_AARCH64_TLSLE_LDST64_TPREL_LO12; - if (SymLoc == ARM64MCExpr::VK_TPREL && IsNC) - return ELF::R_AARCH64_TLSLE_LDST64_TPREL_LO12_NC; - if (SymLoc == ARM64MCExpr::VK_GOTTPREL && IsNC) - return ELF::R_AARCH64_TLSIE_LD64_GOTTPREL_LO12_NC; - if (SymLoc == ARM64MCExpr::VK_TLSDESC && IsNC) - return ELF::R_AARCH64_TLSDESC_LD64_LO12_NC; - - report_fatal_error("invalid fixup for 64-bit load/store instruction"); - return 0; - case ARM64::fixup_arm64_ldst_imm12_scale16: - if (SymLoc == ARM64MCExpr::VK_ABS && IsNC) - return ELF::R_AARCH64_LDST128_ABS_LO12_NC; - - report_fatal_error("invalid fixup for 128-bit load/store instruction"); - return 0; - case ARM64::fixup_arm64_movw: - if (RefKind == ARM64MCExpr::VK_ABS_G3) - return ELF::R_AARCH64_MOVW_UABS_G3; - if (RefKind == ARM64MCExpr::VK_ABS_G2) - return 
ELF::R_AARCH64_MOVW_UABS_G2; - if (RefKind == ARM64MCExpr::VK_ABS_G2_S) - return ELF::R_AARCH64_MOVW_SABS_G2; - if (RefKind == ARM64MCExpr::VK_ABS_G2_NC) - return ELF::R_AARCH64_MOVW_UABS_G2_NC; - if (RefKind == ARM64MCExpr::VK_ABS_G1) - return ELF::R_AARCH64_MOVW_UABS_G1; - if (RefKind == ARM64MCExpr::VK_ABS_G1_S) - return ELF::R_AARCH64_MOVW_SABS_G1; - if (RefKind == ARM64MCExpr::VK_ABS_G1_NC) - return ELF::R_AARCH64_MOVW_UABS_G1_NC; - if (RefKind == ARM64MCExpr::VK_ABS_G0) - return ELF::R_AARCH64_MOVW_UABS_G0; - if (RefKind == ARM64MCExpr::VK_ABS_G0_S) - return ELF::R_AARCH64_MOVW_SABS_G0; - if (RefKind == ARM64MCExpr::VK_ABS_G0_NC) - return ELF::R_AARCH64_MOVW_UABS_G0_NC; - if (RefKind == ARM64MCExpr::VK_DTPREL_G2) - return ELF::R_AARCH64_TLSLD_MOVW_DTPREL_G2; - if (RefKind == ARM64MCExpr::VK_DTPREL_G1) - return ELF::R_AARCH64_TLSLD_MOVW_DTPREL_G1; - if (RefKind == ARM64MCExpr::VK_DTPREL_G1_NC) - return ELF::R_AARCH64_TLSLD_MOVW_DTPREL_G1_NC; - if (RefKind == ARM64MCExpr::VK_DTPREL_G0) - return ELF::R_AARCH64_TLSLD_MOVW_DTPREL_G0; - if (RefKind == ARM64MCExpr::VK_DTPREL_G0_NC) - return ELF::R_AARCH64_TLSLD_MOVW_DTPREL_G0_NC; - if (RefKind == ARM64MCExpr::VK_TPREL_G2) - return ELF::R_AARCH64_TLSLE_MOVW_TPREL_G2; - if (RefKind == ARM64MCExpr::VK_TPREL_G1) - return ELF::R_AARCH64_TLSLE_MOVW_TPREL_G1; - if (RefKind == ARM64MCExpr::VK_TPREL_G1_NC) - return ELF::R_AARCH64_TLSLE_MOVW_TPREL_G1_NC; - if (RefKind == ARM64MCExpr::VK_TPREL_G0) - return ELF::R_AARCH64_TLSLE_MOVW_TPREL_G0; - if (RefKind == ARM64MCExpr::VK_TPREL_G0_NC) - return ELF::R_AARCH64_TLSLE_MOVW_TPREL_G0_NC; - if (RefKind == ARM64MCExpr::VK_GOTTPREL_G1) - return ELF::R_AARCH64_TLSIE_MOVW_GOTTPREL_G1; - if (RefKind == ARM64MCExpr::VK_GOTTPREL_G0_NC) - return ELF::R_AARCH64_TLSIE_MOVW_GOTTPREL_G0_NC; - report_fatal_error("invalid fixup for movz/movk instruction"); - return 0; - case ARM64::fixup_arm64_tlsdesc_call: - return ELF::R_AARCH64_TLSDESC_CALL; - default: - llvm_unreachable("Unknown ELF relocation type"); - } - } - - llvm_unreachable("Unimplemented fixup -> relocation"); -} - -MCObjectWriter *llvm::createARM64ELFObjectWriter(raw_ostream &OS, - uint8_t OSABI, - bool IsLittleEndian) { - MCELFObjectTargetWriter *MOTW = new ARM64ELFObjectWriter(OSABI, IsLittleEndian); - return createELFObjectWriter(MOTW, OS, IsLittleEndian); -} diff --git a/lib/Target/ARM64/MCTargetDesc/ARM64ELFStreamer.cpp b/lib/Target/ARM64/MCTargetDesc/ARM64ELFStreamer.cpp deleted file mode 100644 index adbf8307972..00000000000 --- a/lib/Target/ARM64/MCTargetDesc/ARM64ELFStreamer.cpp +++ /dev/null @@ -1,160 +0,0 @@ -//===- lib/MC/ARM64ELFStreamer.cpp - ELF Object Output for ARM64 ----------===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -// This file assembles .s files and emits AArch64 ELF .o object files. Different -// from generic ELF streamer in emitting mapping symbols ($x and $d) to delimit -// regions of data and code. 
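
For readers skimming the deleted object writer above: GetRelocType is essentially a pure function of the fixup kind, the symbol-location part of the modifier, and the NC flag. A minimal standalone C++ sketch of two of the cases it handles, using invented plain enums rather than the real LLVM/ELF types, so the shape of the mapping is visible without the surrounding machinery:

#include <cstdio>

// Invented stand-ins for the symbol locations and ELF reloc codes; only the
// mapping below mirrors the deleted GetRelocType cases.
enum SymLoc { SL_ABS, SL_GOT };
enum Reloc { ADD_ABS_LO12_NC, LD64_GOT_LO12_NC, UNSUPPORTED };

// "add x0, x0, :lo12:sym"        -> R_AARCH64_ADD_ABS_LO12_NC
static Reloc pickAddImm12(SymLoc Loc, bool IsNC) {
  return (Loc == SL_ABS && IsNC) ? ADD_ABS_LO12_NC : UNSUPPORTED;
}

// "ldr x0, [x0, :got_lo12:sym]"  -> R_AARCH64_LD64_GOT_LO12_NC
static Reloc pickLdst64(SymLoc Loc, bool IsNC) {
  return (Loc == SL_GOT && IsNC) ? LD64_GOT_LO12_NC : UNSUPPORTED;
}

int main() {
  std::printf("%d %d\n", pickAddImm12(SL_ABS, true), pickLdst64(SL_GOT, true));
  return 0;
}
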
-// -//===----------------------------------------------------------------------===// - -#include "llvm/MC/MCELFStreamer.h" -#include "llvm/ADT/SmallPtrSet.h" -#include "llvm/ADT/Twine.h" -#include "llvm/MC/MCAsmBackend.h" -#include "llvm/MC/MCAssembler.h" -#include "llvm/MC/MCCodeEmitter.h" -#include "llvm/MC/MCContext.h" -#include "llvm/MC/MCELF.h" -#include "llvm/MC/MCELFStreamer.h" -#include "llvm/MC/MCELFSymbolFlags.h" -#include "llvm/MC/MCExpr.h" -#include "llvm/MC/MCInst.h" -#include "llvm/MC/MCObjectStreamer.h" -#include "llvm/MC/MCSection.h" -#include "llvm/MC/MCSectionELF.h" -#include "llvm/MC/MCStreamer.h" -#include "llvm/MC/MCSymbol.h" -#include "llvm/MC/MCValue.h" -#include "llvm/Support/Debug.h" -#include "llvm/Support/ELF.h" -#include "llvm/Support/ErrorHandling.h" -#include "llvm/Support/raw_ostream.h" - -using namespace llvm; - -namespace { - -/// Extend the generic ELFStreamer class so that it can emit mapping symbols at -/// the appropriate points in the object files. These symbols are defined in the -/// AArch64 ELF ABI: -/// infocenter.arm.com/help/topic/com.arm.doc.ihi0056a/IHI0056A_aaelf64.pdf -/// -/// In brief: $x or $d should be emitted at the start of each contiguous region -/// of A64 code or data in a section. In practice, this emission does not rely -/// on explicit assembler directives but on inherent properties of the -/// directives doing the emission (e.g. ".byte" is data, "add x0, x0, x0" an -/// instruction). -/// -/// As a result this system is orthogonal to the DataRegion infrastructure used -/// by MachO. Beware! -class ARM64ELFStreamer : public MCELFStreamer { -public: - ARM64ELFStreamer(MCContext &Context, MCAsmBackend &TAB, raw_ostream &OS, - MCCodeEmitter *Emitter) - : MCELFStreamer(Context, TAB, OS, Emitter), MappingSymbolCounter(0), - LastEMS(EMS_None) {} - - ~ARM64ELFStreamer() {} - - void ChangeSection(const MCSection *Section, - const MCExpr *Subsection) override { - // We have to keep track of the mapping symbol state of any sections we - // use. Each one should start off as EMS_None, which is provided as the - // default constructor by DenseMap::lookup. - LastMappingSymbols[getPreviousSection().first] = LastEMS; - LastEMS = LastMappingSymbols.lookup(Section); - - MCELFStreamer::ChangeSection(Section, Subsection); - } - - /// This function is the one used to emit instruction data into the ELF - /// streamer. We override it to add the appropriate mapping symbol if - /// necessary. - void EmitInstruction(const MCInst &Inst, - const MCSubtargetInfo &STI) override { - EmitA64MappingSymbol(); - MCELFStreamer::EmitInstruction(Inst, STI); - } - - /// This is one of the functions used to emit data into an ELF section, so the - /// ARM64 streamer overrides it to add the appropriate mapping symbol ($d) - /// if necessary. - void EmitBytes(StringRef Data) override { - EmitDataMappingSymbol(); - MCELFStreamer::EmitBytes(Data); - } - - /// This is one of the functions used to emit data into an ELF section, so the - /// ARM64 streamer overrides it to add the appropriate mapping symbol ($d) - /// if necessary. 
- void EmitValueImpl(const MCExpr *Value, unsigned Size, - const SMLoc &Loc) override { - EmitDataMappingSymbol(); - MCELFStreamer::EmitValueImpl(Value, Size); - } - -private: - enum ElfMappingSymbol { - EMS_None, - EMS_A64, - EMS_Data - }; - - void EmitDataMappingSymbol() { - if (LastEMS == EMS_Data) - return; - EmitMappingSymbol("$d"); - LastEMS = EMS_Data; - } - - void EmitA64MappingSymbol() { - if (LastEMS == EMS_A64) - return; - EmitMappingSymbol("$x"); - LastEMS = EMS_A64; - } - - void EmitMappingSymbol(StringRef Name) { - MCSymbol *Start = getContext().CreateTempSymbol(); - EmitLabel(Start); - - MCSymbol *Symbol = getContext().GetOrCreateSymbol( - Name + "." + Twine(MappingSymbolCounter++)); - - MCSymbolData &SD = getAssembler().getOrCreateSymbolData(*Symbol); - MCELF::SetType(SD, ELF::STT_NOTYPE); - MCELF::SetBinding(SD, ELF::STB_LOCAL); - SD.setExternal(false); - Symbol->setSection(*getCurrentSection().first); - - const MCExpr *Value = MCSymbolRefExpr::Create(Start, getContext()); - Symbol->setVariableValue(Value); - } - - int64_t MappingSymbolCounter; - - DenseMap LastMappingSymbols; - ElfMappingSymbol LastEMS; - - /// @} -}; -} - -namespace llvm { -MCELFStreamer *createARM64ELFStreamer(MCContext &Context, MCAsmBackend &TAB, - raw_ostream &OS, MCCodeEmitter *Emitter, - bool RelaxAll, bool NoExecStack) { - ARM64ELFStreamer *S = new ARM64ELFStreamer(Context, TAB, OS, Emitter); - if (RelaxAll) - S->getAssembler().setRelaxAll(true); - if (NoExecStack) - S->getAssembler().setNoExecStack(true); - return S; -} -} diff --git a/lib/Target/ARM64/MCTargetDesc/ARM64ELFStreamer.h b/lib/Target/ARM64/MCTargetDesc/ARM64ELFStreamer.h deleted file mode 100644 index 72dadbc50aa..00000000000 --- a/lib/Target/ARM64/MCTargetDesc/ARM64ELFStreamer.h +++ /dev/null @@ -1,26 +0,0 @@ -//===-- ARM64ELFStreamer.h - ELF Streamer for ARM64 -------------*- C++ -*-===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -// This file implements ELF streamer information for the ARM64 backend. -// -//===----------------------------------------------------------------------===// - -#ifndef LLVM_AARCH64_ELF_STREAMER_H -#define LLVM_AARCH64_ELF_STREAMER_H - -#include "llvm/MC/MCELFStreamer.h" - -namespace llvm { - -MCELFStreamer *createARM64ELFStreamer(MCContext &Context, MCAsmBackend &TAB, - raw_ostream &OS, MCCodeEmitter *Emitter, - bool RelaxAll, bool NoExecStack); -} - -#endif // ARM64_ELF_STREAMER_H diff --git a/lib/Target/ARM64/MCTargetDesc/ARM64FixupKinds.h b/lib/Target/ARM64/MCTargetDesc/ARM64FixupKinds.h deleted file mode 100644 index 7106b314ea2..00000000000 --- a/lib/Target/ARM64/MCTargetDesc/ARM64FixupKinds.h +++ /dev/null @@ -1,76 +0,0 @@ -//===-- ARM64FixupKinds.h - ARM64 Specific Fixup Entries --------*- C++ -*-===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// - -#ifndef LLVM_ARM64FIXUPKINDS_H -#define LLVM_ARM64FIXUPKINDS_H - -#include "llvm/MC/MCFixup.h" - -namespace llvm { -namespace ARM64 { - -enum Fixups { - // fixup_arm64_pcrel_adr_imm21 - A 21-bit pc-relative immediate inserted into - // an ADR instruction. 
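
The mapping-symbol handling in the deleted ARM64ELFStreamer above reduces to a two-state tracker per section: emit "$x" before the first instruction of a code run, "$d" before the first byte of a data run, and nothing while the emission kind is unchanged. A standalone sketch of just that tracker (the class name and interface here are invented for illustration):

#include <cstdio>

// Mirrors the ElfMappingSymbol / LastEMS bookkeeping from the deleted streamer.
enum MappingState { MS_None, MS_A64, MS_Data };

struct MappingTracker {
  MappingState Last = MS_None;

  // Mapping symbol to emit before an instruction, or nullptr if none is needed.
  const char *beforeInstruction() {
    if (Last == MS_A64)
      return nullptr;
    Last = MS_A64;
    return "$x";
  }

  // Mapping symbol to emit before raw data, or nullptr if none is needed.
  const char *beforeData() {
    if (Last == MS_Data)
      return nullptr;
    Last = MS_Data;
    return "$d";
  }
};

int main() {
  MappingTracker T;
  std::printf("%s\n", T.beforeInstruction());            // "$x": a code run starts
  std::printf("%d\n", T.beforeInstruction() == nullptr);  // 1: still in code, no symbol
  std::printf("%s\n", T.beforeData());                    // "$d": a data run starts
  return 0;
}
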
- fixup_arm64_pcrel_adr_imm21 = FirstTargetFixupKind, - - // fixup_arm64_pcrel_adrp_imm21 - A 21-bit pc-relative immediate inserted into - // an ADRP instruction. - fixup_arm64_pcrel_adrp_imm21, - - // fixup_arm64_imm12 - 12-bit fixup for add/sub instructions. - // No alignment adjustment. All value bits are encoded. - fixup_arm64_add_imm12, - - // fixup_arm64_ldst_imm12_* - unsigned 12-bit fixups for load and - // store instructions. - fixup_arm64_ldst_imm12_scale1, - fixup_arm64_ldst_imm12_scale2, - fixup_arm64_ldst_imm12_scale4, - fixup_arm64_ldst_imm12_scale8, - fixup_arm64_ldst_imm12_scale16, - - // fixup_arm64_ldr_pcrel_imm19 - The high 19 bits of a 21-bit pc-relative - // immediate. Same encoding as fixup_arm64_pcrel_adrhi, except this is used by - // pc-relative loads and generates relocations directly when necessary. - fixup_arm64_ldr_pcrel_imm19, - - // FIXME: comment - fixup_arm64_movw, - - // fixup_arm64_pcrel_imm14 - The high 14 bits of a 21-bit pc-relative - // immediate. - fixup_arm64_pcrel_branch14, - - // fixup_arm64_pcrel_branch19 - The high 19 bits of a 21-bit pc-relative - // immediate. Same encoding as fixup_arm64_pcrel_adrhi, except this is use by - // b.cc and generates relocations directly when necessary. - fixup_arm64_pcrel_branch19, - - // fixup_arm64_pcrel_branch26 - The high 26 bits of a 28-bit pc-relative - // immediate. - fixup_arm64_pcrel_branch26, - - // fixup_arm64_pcrel_call26 - The high 26 bits of a 28-bit pc-relative - // immediate. Distinguished from branch26 only on ELF. - fixup_arm64_pcrel_call26, - - // fixup_arm64_tlsdesc_call - zero-space placeholder for the ELF - // R_AARCH64_TLSDESC_CALL relocation. - fixup_arm64_tlsdesc_call, - - // Marker - LastTargetFixupKind, - NumTargetFixupKinds = LastTargetFixupKind - FirstTargetFixupKind -}; - -} // end namespace ARM64 -} // end namespace llvm - -#endif diff --git a/lib/Target/ARM64/MCTargetDesc/ARM64MCAsmInfo.cpp b/lib/Target/ARM64/MCTargetDesc/ARM64MCAsmInfo.cpp deleted file mode 100644 index e211d3428bf..00000000000 --- a/lib/Target/ARM64/MCTargetDesc/ARM64MCAsmInfo.cpp +++ /dev/null @@ -1,99 +0,0 @@ -//===-- ARM64MCAsmInfo.cpp - ARM64 asm properties -----------------------===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -// This file contains the declarations of the ARM64MCAsmInfo properties. -// -//===----------------------------------------------------------------------===// - -#include "ARM64MCAsmInfo.h" -#include "llvm/ADT/Triple.h" -#include "llvm/MC/MCExpr.h" -#include "llvm/MC/MCContext.h" -#include "llvm/MC/MCStreamer.h" -#include "llvm/Support/CommandLine.h" -using namespace llvm; - -enum AsmWriterVariantTy { - Default = -1, - Generic = 0, - Apple = 1 -}; - -static cl::opt AsmWriterVariant( - "arm64-neon-syntax", cl::init(Default), - cl::desc("Choose style of NEON code to emit from ARM64 backend:"), - cl::values(clEnumValN(Generic, "generic", "Emit generic NEON assembly"), - clEnumValN(Apple, "apple", "Emit Apple-style NEON assembly"), - clEnumValEnd)); - -ARM64MCAsmInfoDarwin::ARM64MCAsmInfoDarwin() { - // We prefer NEON instructions to be printed in the short form. - AssemblerDialect = AsmWriterVariant == Default ? 
1 : AsmWriterVariant; - - PrivateGlobalPrefix = "L"; - SeparatorString = "%%"; - CommentString = ";"; - PointerSize = CalleeSaveStackSlotSize = 8; - - AlignmentIsInBytes = false; - UsesELFSectionDirectiveForBSS = true; - SupportsDebugInformation = true; - UseDataRegionDirectives = true; - - ExceptionsType = ExceptionHandling::DwarfCFI; -} - -const MCExpr *ARM64MCAsmInfoDarwin::getExprForPersonalitySymbol( - const MCSymbol *Sym, unsigned Encoding, MCStreamer &Streamer) const { - // On Darwin, we can reference dwarf symbols with foo@GOT-., which - // is an indirect pc-relative reference. The default implementation - // won't reference using the GOT, so we need this target-specific - // version. - MCContext &Context = Streamer.getContext(); - const MCExpr *Res = - MCSymbolRefExpr::Create(Sym, MCSymbolRefExpr::VK_GOT, Context); - MCSymbol *PCSym = Context.CreateTempSymbol(); - Streamer.EmitLabel(PCSym); - const MCExpr *PC = MCSymbolRefExpr::Create(PCSym, Context); - return MCBinaryExpr::CreateSub(Res, PC, Context); -} - -ARM64MCAsmInfoELF::ARM64MCAsmInfoELF(StringRef TT) { - Triple T(TT); - if (T.getArch() == Triple::arm64_be) - IsLittleEndian = false; - - // We prefer NEON instructions to be printed in the short form. - AssemblerDialect = AsmWriterVariant == Default ? 0 : AsmWriterVariant; - - PointerSize = 8; - - // ".comm align is in bytes but .align is pow-2." - AlignmentIsInBytes = false; - - CommentString = "//"; - PrivateGlobalPrefix = ".L"; - Code32Directive = ".code\t32"; - - Data16bitsDirective = "\t.hword\t"; - Data32bitsDirective = "\t.word\t"; - Data64bitsDirective = "\t.xword\t"; - - UseDataRegionDirectives = false; - - WeakRefDirective = "\t.weak\t"; - - HasLEB128 = true; - SupportsDebugInformation = true; - - // Exceptions handling - ExceptionsType = ExceptionHandling::DwarfCFI; - - UseIntegratedAssembler = true; -} diff --git a/lib/Target/ARM64/MCTargetDesc/ARM64MCAsmInfo.h b/lib/Target/ARM64/MCTargetDesc/ARM64MCAsmInfo.h deleted file mode 100644 index 324bc39560f..00000000000 --- a/lib/Target/ARM64/MCTargetDesc/ARM64MCAsmInfo.h +++ /dev/null @@ -1,36 +0,0 @@ -//=====-- ARM64MCAsmInfo.h - ARM64 asm properties -----------*- C++ -*--====// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -// This file contains the declaration of the ARM64MCAsmInfo class. 
-// -//===----------------------------------------------------------------------===// - -#ifndef ARM64TARGETASMINFO_H -#define ARM64TARGETASMINFO_H - -#include "llvm/MC/MCAsmInfoDarwin.h" - -namespace llvm { -class Target; -class StringRef; -class MCStreamer; -struct ARM64MCAsmInfoDarwin : public MCAsmInfoDarwin { - explicit ARM64MCAsmInfoDarwin(); - const MCExpr * - getExprForPersonalitySymbol(const MCSymbol *Sym, unsigned Encoding, - MCStreamer &Streamer) const override; -}; - -struct ARM64MCAsmInfoELF : public MCAsmInfo { - explicit ARM64MCAsmInfoELF(StringRef TT); -}; - -} // namespace llvm - -#endif diff --git a/lib/Target/ARM64/MCTargetDesc/ARM64MCCodeEmitter.cpp b/lib/Target/ARM64/MCTargetDesc/ARM64MCCodeEmitter.cpp deleted file mode 100644 index 0db08f422e4..00000000000 --- a/lib/Target/ARM64/MCTargetDesc/ARM64MCCodeEmitter.cpp +++ /dev/null @@ -1,658 +0,0 @@ -//===-- ARM64/ARM64MCCodeEmitter.cpp - Convert ARM64 code to machine code -===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -// This file implements the ARM64MCCodeEmitter class. -// -//===----------------------------------------------------------------------===// - -#include "MCTargetDesc/ARM64AddressingModes.h" -#include "MCTargetDesc/ARM64FixupKinds.h" -#include "MCTargetDesc/ARM64MCExpr.h" -#include "Utils/ARM64BaseInfo.h" -#include "llvm/MC/MCCodeEmitter.h" -#include "llvm/MC/MCContext.h" -#include "llvm/MC/MCInst.h" -#include "llvm/MC/MCInstrInfo.h" -#include "llvm/MC/MCRegisterInfo.h" -#include "llvm/MC/MCSubtargetInfo.h" -#include "llvm/ADT/Statistic.h" -#include "llvm/Support/raw_ostream.h" -using namespace llvm; - -#define DEBUG_TYPE "mccodeemitter" - -STATISTIC(MCNumEmitted, "Number of MC instructions emitted."); -STATISTIC(MCNumFixups, "Number of MC fixups created."); - -namespace { - -class ARM64MCCodeEmitter : public MCCodeEmitter { - MCContext &Ctx; - - ARM64MCCodeEmitter(const ARM64MCCodeEmitter &); // DO NOT IMPLEMENT - void operator=(const ARM64MCCodeEmitter &); // DO NOT IMPLEMENT -public: - ARM64MCCodeEmitter(const MCInstrInfo &mcii, const MCSubtargetInfo &sti, - MCContext &ctx) - : Ctx(ctx) {} - - ~ARM64MCCodeEmitter() {} - - // getBinaryCodeForInstr - TableGen'erated function for getting the - // binary encoding for an instruction. - uint64_t getBinaryCodeForInstr(const MCInst &MI, - SmallVectorImpl &Fixups, - const MCSubtargetInfo &STI) const; - - /// getMachineOpValue - Return binary encoding of operand. If the machine - /// operand requires relocation, record the relocation and return zero. - unsigned getMachineOpValue(const MCInst &MI, const MCOperand &MO, - SmallVectorImpl &Fixups, - const MCSubtargetInfo &STI) const; - - /// getLdStUImm12OpValue - Return encoding info for 12-bit unsigned immediate - /// attached to a load, store or prfm instruction. If operand requires a - /// relocation, record it and return zero in that part of the encoding. - template - uint32_t getLdStUImm12OpValue(const MCInst &MI, unsigned OpIdx, - SmallVectorImpl &Fixups, - const MCSubtargetInfo &STI) const; - - /// getAdrLabelOpValue - Return encoding info for 21-bit immediate ADR label - /// target. 
- uint32_t getAdrLabelOpValue(const MCInst &MI, unsigned OpIdx, - SmallVectorImpl &Fixups, - const MCSubtargetInfo &STI) const; - - /// getAddSubImmOpValue - Return encoding for the 12-bit immediate value and - /// the 2-bit shift field. - uint32_t getAddSubImmOpValue(const MCInst &MI, unsigned OpIdx, - SmallVectorImpl &Fixups, - const MCSubtargetInfo &STI) const; - - /// getCondBranchTargetOpValue - Return the encoded value for a conditional - /// branch target. - uint32_t getCondBranchTargetOpValue(const MCInst &MI, unsigned OpIdx, - SmallVectorImpl &Fixups, - const MCSubtargetInfo &STI) const; - - /// getLoadLiteralOpValue - Return the encoded value for a load-literal - /// pc-relative address. - uint32_t getLoadLiteralOpValue(const MCInst &MI, unsigned OpIdx, - SmallVectorImpl &Fixups, - const MCSubtargetInfo &STI) const; - - /// getMemExtendOpValue - Return the encoded value for a reg-extend load/store - /// instruction: bit 0 is whether a shift is present, bit 1 is whether the - /// operation is a sign extend (as opposed to a zero extend). - uint32_t getMemExtendOpValue(const MCInst &MI, unsigned OpIdx, - SmallVectorImpl &Fixups, - const MCSubtargetInfo &STI) const; - - /// getTestBranchTargetOpValue - Return the encoded value for a test-bit-and- - /// branch target. - uint32_t getTestBranchTargetOpValue(const MCInst &MI, unsigned OpIdx, - SmallVectorImpl &Fixups, - const MCSubtargetInfo &STI) const; - - /// getBranchTargetOpValue - Return the encoded value for an unconditional - /// branch target. - uint32_t getBranchTargetOpValue(const MCInst &MI, unsigned OpIdx, - SmallVectorImpl &Fixups, - const MCSubtargetInfo &STI) const; - - /// getMoveWideImmOpValue - Return the encoded value for the immediate operand - /// of a MOVZ or MOVK instruction. - uint32_t getMoveWideImmOpValue(const MCInst &MI, unsigned OpIdx, - SmallVectorImpl &Fixups, - const MCSubtargetInfo &STI) const; - - /// getVecShifterOpValue - Return the encoded value for the vector shifter. - uint32_t getVecShifterOpValue(const MCInst &MI, unsigned OpIdx, - SmallVectorImpl &Fixups, - const MCSubtargetInfo &STI) const; - - /// getMoveVecShifterOpValue - Return the encoded value for the vector move - /// shifter (MSL). - uint32_t getMoveVecShifterOpValue(const MCInst &MI, unsigned OpIdx, - SmallVectorImpl &Fixups, - const MCSubtargetInfo &STI) const; - - /// getFixedPointScaleOpValue - Return the encoded value for the - // FP-to-fixed-point scale factor. 
- uint32_t getFixedPointScaleOpValue(const MCInst &MI, unsigned OpIdx, - SmallVectorImpl &Fixups, - const MCSubtargetInfo &STI) const; - - uint32_t getVecShiftR64OpValue(const MCInst &MI, unsigned OpIdx, - SmallVectorImpl &Fixups, - const MCSubtargetInfo &STI) const; - uint32_t getVecShiftR32OpValue(const MCInst &MI, unsigned OpIdx, - SmallVectorImpl &Fixups, - const MCSubtargetInfo &STI) const; - uint32_t getVecShiftR16OpValue(const MCInst &MI, unsigned OpIdx, - SmallVectorImpl &Fixups, - const MCSubtargetInfo &STI) const; - uint32_t getVecShiftR8OpValue(const MCInst &MI, unsigned OpIdx, - SmallVectorImpl &Fixups, - const MCSubtargetInfo &STI) const; - uint32_t getVecShiftL64OpValue(const MCInst &MI, unsigned OpIdx, - SmallVectorImpl &Fixups, - const MCSubtargetInfo &STI) const; - uint32_t getVecShiftL32OpValue(const MCInst &MI, unsigned OpIdx, - SmallVectorImpl &Fixups, - const MCSubtargetInfo &STI) const; - uint32_t getVecShiftL16OpValue(const MCInst &MI, unsigned OpIdx, - SmallVectorImpl &Fixups, - const MCSubtargetInfo &STI) const; - uint32_t getVecShiftL8OpValue(const MCInst &MI, unsigned OpIdx, - SmallVectorImpl &Fixups, - const MCSubtargetInfo &STI) const; - - /// getSIMDShift64OpValue - Return the encoded value for the - // shift-by-immediate AdvSIMD instructions. - uint32_t getSIMDShift64OpValue(const MCInst &MI, unsigned OpIdx, - SmallVectorImpl &Fixups, - const MCSubtargetInfo &STI) const; - - uint32_t getSIMDShift64_32OpValue(const MCInst &MI, unsigned OpIdx, - SmallVectorImpl &Fixups, - const MCSubtargetInfo &STI) const; - - uint32_t getSIMDShift32OpValue(const MCInst &MI, unsigned OpIdx, - SmallVectorImpl &Fixups, - const MCSubtargetInfo &STI) const; - - uint32_t getSIMDShift16OpValue(const MCInst &MI, unsigned OpIdx, - SmallVectorImpl &Fixups, - const MCSubtargetInfo &STI) const; - - unsigned fixMOVZ(const MCInst &MI, unsigned EncodedValue, - const MCSubtargetInfo &STI) const; - - void EmitByte(unsigned char C, raw_ostream &OS) const { OS << (char)C; } - - void EmitConstant(uint64_t Val, unsigned Size, raw_ostream &OS) const { - // Output the constant in little endian byte order. - for (unsigned i = 0; i != Size; ++i) { - EmitByte(Val & 255, OS); - Val >>= 8; - } - } - - void EncodeInstruction(const MCInst &MI, raw_ostream &OS, - SmallVectorImpl &Fixups, - const MCSubtargetInfo &STI) const override; - - unsigned fixMulHigh(const MCInst &MI, unsigned EncodedValue, - const MCSubtargetInfo &STI) const; - - template unsigned - fixLoadStoreExclusive(const MCInst &MI, unsigned EncodedValue, - const MCSubtargetInfo &STI) const; - - unsigned fixOneOperandFPComparison(const MCInst &MI, unsigned EncodedValue, - const MCSubtargetInfo &STI) const; -}; - -} // end anonymous namespace - -MCCodeEmitter *llvm::createARM64MCCodeEmitter(const MCInstrInfo &MCII, - const MCRegisterInfo &MRI, - const MCSubtargetInfo &STI, - MCContext &Ctx) { - return new ARM64MCCodeEmitter(MCII, STI, Ctx); -} - -/// getMachineOpValue - Return binary encoding of operand. If the machine -/// operand requires relocation, record the relocation and return zero. 
-unsigned -ARM64MCCodeEmitter::getMachineOpValue(const MCInst &MI, const MCOperand &MO, - SmallVectorImpl &Fixups, - const MCSubtargetInfo &STI) const { - if (MO.isReg()) - return Ctx.getRegisterInfo()->getEncodingValue(MO.getReg()); - else { - assert(MO.isImm() && "did not expect relocated expression"); - return static_cast(MO.getImm()); - } - - assert(0 && "Unable to encode MCOperand!"); - return 0; -} - -template uint32_t -ARM64MCCodeEmitter::getLdStUImm12OpValue(const MCInst &MI, unsigned OpIdx, - SmallVectorImpl &Fixups, - const MCSubtargetInfo &STI) const { - const MCOperand &MO = MI.getOperand(OpIdx); - uint32_t ImmVal = 0; - - if (MO.isImm()) - ImmVal = static_cast(MO.getImm()); - else { - assert(MO.isExpr() && "unable to encode load/store imm operand"); - MCFixupKind Kind = MCFixupKind(FixupKind); - Fixups.push_back(MCFixup::Create(0, MO.getExpr(), Kind, MI.getLoc())); - ++MCNumFixups; - } - - return ImmVal; -} - -/// getAdrLabelOpValue - Return encoding info for 21-bit immediate ADR label -/// target. -uint32_t -ARM64MCCodeEmitter::getAdrLabelOpValue(const MCInst &MI, unsigned OpIdx, - SmallVectorImpl &Fixups, - const MCSubtargetInfo &STI) const { - const MCOperand &MO = MI.getOperand(OpIdx); - - // If the destination is an immediate, we have nothing to do. - if (MO.isImm()) - return MO.getImm(); - assert(MO.isExpr() && "Unexpected target type!"); - const MCExpr *Expr = MO.getExpr(); - - MCFixupKind Kind = MI.getOpcode() == ARM64::ADR - ? MCFixupKind(ARM64::fixup_arm64_pcrel_adr_imm21) - : MCFixupKind(ARM64::fixup_arm64_pcrel_adrp_imm21); - Fixups.push_back(MCFixup::Create(0, Expr, Kind, MI.getLoc())); - - MCNumFixups += 1; - - // All of the information is in the fixup. - return 0; -} - -/// getAddSubImmOpValue - Return encoding for the 12-bit immediate value and -/// the 2-bit shift field. The shift field is stored in bits 13-14 of the -/// return value. -uint32_t -ARM64MCCodeEmitter::getAddSubImmOpValue(const MCInst &MI, unsigned OpIdx, - SmallVectorImpl &Fixups, - const MCSubtargetInfo &STI) const { - // Suboperands are [imm, shifter]. - const MCOperand &MO = MI.getOperand(OpIdx); - const MCOperand &MO1 = MI.getOperand(OpIdx + 1); - assert(ARM64_AM::getShiftType(MO1.getImm()) == ARM64_AM::LSL && - "unexpected shift type for add/sub immediate"); - unsigned ShiftVal = ARM64_AM::getShiftValue(MO1.getImm()); - assert((ShiftVal == 0 || ShiftVal == 12) && - "unexpected shift value for add/sub immediate"); - if (MO.isImm()) - return MO.getImm() | (ShiftVal == 0 ? 0 : (1 << 12)); - assert(MO.isExpr() && "Unable to encode MCOperand!"); - const MCExpr *Expr = MO.getExpr(); - - // Encode the 12 bits of the fixup. - MCFixupKind Kind = MCFixupKind(ARM64::fixup_arm64_add_imm12); - Fixups.push_back(MCFixup::Create(0, Expr, Kind, MI.getLoc())); - - ++MCNumFixups; - - return 0; -} - -/// getCondBranchTargetOpValue - Return the encoded value for a conditional -/// branch target. -uint32_t ARM64MCCodeEmitter::getCondBranchTargetOpValue( - const MCInst &MI, unsigned OpIdx, SmallVectorImpl &Fixups, - const MCSubtargetInfo &STI) const { - const MCOperand &MO = MI.getOperand(OpIdx); - - // If the destination is an immediate, we have nothing to do. - if (MO.isImm()) - return MO.getImm(); - assert(MO.isExpr() && "Unexpected target type!"); - - MCFixupKind Kind = MCFixupKind(ARM64::fixup_arm64_pcrel_branch19); - Fixups.push_back(MCFixup::Create(0, MO.getExpr(), Kind, MI.getLoc())); - - ++MCNumFixups; - - // All of the information is in the fixup. 
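
As a concrete instance of getAddSubImmOpValue above: per the deleted code, the 12-bit immediate lands in bits 0-11 of the returned value and an LSL #12 shifter sets bit 12 (the "bits 13-14" wording in the doc comment does not match the code). A standalone sketch of that packing:

#include <cassert>
#include <cstdint>
#include <cstdio>

// Packs an add/sub immediate the way the deleted getAddSubImmOpValue does:
// 12-bit value in the low bits, plus a flag bit for an LSL #12 shifter.
static uint32_t encodeAddSubImm(uint32_t Imm12, unsigned ShiftVal) {
  assert(Imm12 < (1u << 12) && "immediate must fit in 12 bits");
  assert((ShiftVal == 0 || ShiftVal == 12) && "only LSL #0 or LSL #12 is legal");
  return Imm12 | (ShiftVal == 0 ? 0 : (1u << 12));
}

int main() {
  std::printf("0x%x\n", encodeAddSubImm(0x123, 0));  // add x0, x1, #0x123          -> 0x123
  std::printf("0x%x\n", encodeAddSubImm(0x123, 12)); // add x0, x1, #0x123, lsl #12 -> 0x1123
  return 0;
}
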
- return 0; -} - -/// getLoadLiteralOpValue - Return the encoded value for a load-literal -/// pc-relative address. -uint32_t -ARM64MCCodeEmitter::getLoadLiteralOpValue(const MCInst &MI, unsigned OpIdx, - SmallVectorImpl &Fixups, - const MCSubtargetInfo &STI) const { - const MCOperand &MO = MI.getOperand(OpIdx); - - // If the destination is an immediate, we have nothing to do. - if (MO.isImm()) - return MO.getImm(); - assert(MO.isExpr() && "Unexpected target type!"); - - MCFixupKind Kind = MCFixupKind(ARM64::fixup_arm64_ldr_pcrel_imm19); - Fixups.push_back(MCFixup::Create(0, MO.getExpr(), Kind, MI.getLoc())); - - ++MCNumFixups; - - // All of the information is in the fixup. - return 0; -} - -uint32_t -ARM64MCCodeEmitter::getMemExtendOpValue(const MCInst &MI, unsigned OpIdx, - SmallVectorImpl &Fixups, - const MCSubtargetInfo &STI) const { - unsigned SignExtend = MI.getOperand(OpIdx).getImm(); - unsigned DoShift = MI.getOperand(OpIdx + 1).getImm(); - return (SignExtend << 1) | DoShift; -} - -uint32_t -ARM64MCCodeEmitter::getMoveWideImmOpValue(const MCInst &MI, unsigned OpIdx, - SmallVectorImpl &Fixups, - const MCSubtargetInfo &STI) const { - const MCOperand &MO = MI.getOperand(OpIdx); - - if (MO.isImm()) - return MO.getImm(); - assert(MO.isExpr() && "Unexpected movz/movk immediate"); - - Fixups.push_back(MCFixup::Create( - 0, MO.getExpr(), MCFixupKind(ARM64::fixup_arm64_movw), MI.getLoc())); - - ++MCNumFixups; - - return 0; -} - -/// getTestBranchTargetOpValue - Return the encoded value for a test-bit-and- -/// branch target. -uint32_t ARM64MCCodeEmitter::getTestBranchTargetOpValue( - const MCInst &MI, unsigned OpIdx, SmallVectorImpl &Fixups, - const MCSubtargetInfo &STI) const { - const MCOperand &MO = MI.getOperand(OpIdx); - - // If the destination is an immediate, we have nothing to do. - if (MO.isImm()) - return MO.getImm(); - assert(MO.isExpr() && "Unexpected ADR target type!"); - - MCFixupKind Kind = MCFixupKind(ARM64::fixup_arm64_pcrel_branch14); - Fixups.push_back(MCFixup::Create(0, MO.getExpr(), Kind, MI.getLoc())); - - ++MCNumFixups; - - // All of the information is in the fixup. - return 0; -} - -/// getBranchTargetOpValue - Return the encoded value for an unconditional -/// branch target. -uint32_t -ARM64MCCodeEmitter::getBranchTargetOpValue(const MCInst &MI, unsigned OpIdx, - SmallVectorImpl &Fixups, - const MCSubtargetInfo &STI) const { - const MCOperand &MO = MI.getOperand(OpIdx); - - // If the destination is an immediate, we have nothing to do. - if (MO.isImm()) - return MO.getImm(); - assert(MO.isExpr() && "Unexpected ADR target type!"); - - MCFixupKind Kind = MI.getOpcode() == ARM64::BL - ? MCFixupKind(ARM64::fixup_arm64_pcrel_call26) - : MCFixupKind(ARM64::fixup_arm64_pcrel_branch26); - Fixups.push_back(MCFixup::Create(0, MO.getExpr(), Kind, MI.getLoc())); - - ++MCNumFixups; - - // All of the information is in the fixup. 
- return 0; -} - -/// getVecShifterOpValue - Return the encoded value for the vector shifter: -/// -/// 00 -> 0 -/// 01 -> 8 -/// 10 -> 16 -/// 11 -> 24 -uint32_t -ARM64MCCodeEmitter::getVecShifterOpValue(const MCInst &MI, unsigned OpIdx, - SmallVectorImpl &Fixups, - const MCSubtargetInfo &STI) const { - const MCOperand &MO = MI.getOperand(OpIdx); - assert(MO.isImm() && "Expected an immediate value for the shift amount!"); - - switch (MO.getImm()) { - default: - break; - case 0: - return 0; - case 8: - return 1; - case 16: - return 2; - case 24: - return 3; - } - - assert(false && "Invalid value for vector shift amount!"); - return 0; -} - -uint32_t -ARM64MCCodeEmitter::getSIMDShift64OpValue(const MCInst &MI, unsigned OpIdx, - SmallVectorImpl &Fixups, - const MCSubtargetInfo &STI) const { - const MCOperand &MO = MI.getOperand(OpIdx); - assert(MO.isImm() && "Expected an immediate value for the shift amount!"); - return 64 - (MO.getImm()); -} - -uint32_t -ARM64MCCodeEmitter::getSIMDShift64_32OpValue(const MCInst &MI, unsigned OpIdx, - SmallVectorImpl &Fixups, - const MCSubtargetInfo &STI) const { - const MCOperand &MO = MI.getOperand(OpIdx); - assert(MO.isImm() && "Expected an immediate value for the shift amount!"); - return 64 - (MO.getImm() | 32); -} - -uint32_t -ARM64MCCodeEmitter::getSIMDShift32OpValue(const MCInst &MI, unsigned OpIdx, - SmallVectorImpl &Fixups, - const MCSubtargetInfo &STI) const { - const MCOperand &MO = MI.getOperand(OpIdx); - assert(MO.isImm() && "Expected an immediate value for the shift amount!"); - return 32 - (MO.getImm() | 16); -} - -uint32_t -ARM64MCCodeEmitter::getSIMDShift16OpValue(const MCInst &MI, unsigned OpIdx, - SmallVectorImpl &Fixups, - const MCSubtargetInfo &STI) const { - const MCOperand &MO = MI.getOperand(OpIdx); - assert(MO.isImm() && "Expected an immediate value for the shift amount!"); - return 16 - (MO.getImm() | 8); -} - -/// getFixedPointScaleOpValue - Return the encoded value for the -// FP-to-fixed-point scale factor. 
-uint32_t ARM64MCCodeEmitter::getFixedPointScaleOpValue( - const MCInst &MI, unsigned OpIdx, SmallVectorImpl &Fixups, - const MCSubtargetInfo &STI) const { - const MCOperand &MO = MI.getOperand(OpIdx); - assert(MO.isImm() && "Expected an immediate value for the scale amount!"); - return 64 - MO.getImm(); -} - -uint32_t -ARM64MCCodeEmitter::getVecShiftR64OpValue(const MCInst &MI, unsigned OpIdx, - SmallVectorImpl &Fixups, - const MCSubtargetInfo &STI) const { - const MCOperand &MO = MI.getOperand(OpIdx); - assert(MO.isImm() && "Expected an immediate value for the scale amount!"); - return 64 - MO.getImm(); -} - -uint32_t -ARM64MCCodeEmitter::getVecShiftR32OpValue(const MCInst &MI, unsigned OpIdx, - SmallVectorImpl &Fixups, - const MCSubtargetInfo &STI) const { - const MCOperand &MO = MI.getOperand(OpIdx); - assert(MO.isImm() && "Expected an immediate value for the scale amount!"); - return 32 - MO.getImm(); -} - -uint32_t -ARM64MCCodeEmitter::getVecShiftR16OpValue(const MCInst &MI, unsigned OpIdx, - SmallVectorImpl &Fixups, - const MCSubtargetInfo &STI) const { - const MCOperand &MO = MI.getOperand(OpIdx); - assert(MO.isImm() && "Expected an immediate value for the scale amount!"); - return 16 - MO.getImm(); -} - -uint32_t -ARM64MCCodeEmitter::getVecShiftR8OpValue(const MCInst &MI, unsigned OpIdx, - SmallVectorImpl &Fixups, - const MCSubtargetInfo &STI) const { - const MCOperand &MO = MI.getOperand(OpIdx); - assert(MO.isImm() && "Expected an immediate value for the scale amount!"); - return 8 - MO.getImm(); -} - -uint32_t -ARM64MCCodeEmitter::getVecShiftL64OpValue(const MCInst &MI, unsigned OpIdx, - SmallVectorImpl &Fixups, - const MCSubtargetInfo &STI) const { - const MCOperand &MO = MI.getOperand(OpIdx); - assert(MO.isImm() && "Expected an immediate value for the scale amount!"); - return MO.getImm() - 64; -} - -uint32_t -ARM64MCCodeEmitter::getVecShiftL32OpValue(const MCInst &MI, unsigned OpIdx, - SmallVectorImpl &Fixups, - const MCSubtargetInfo &STI) const { - const MCOperand &MO = MI.getOperand(OpIdx); - assert(MO.isImm() && "Expected an immediate value for the scale amount!"); - return MO.getImm() - 32; -} - -uint32_t -ARM64MCCodeEmitter::getVecShiftL16OpValue(const MCInst &MI, unsigned OpIdx, - SmallVectorImpl &Fixups, - const MCSubtargetInfo &STI) const { - const MCOperand &MO = MI.getOperand(OpIdx); - assert(MO.isImm() && "Expected an immediate value for the scale amount!"); - return MO.getImm() - 16; -} - -uint32_t -ARM64MCCodeEmitter::getVecShiftL8OpValue(const MCInst &MI, unsigned OpIdx, - SmallVectorImpl &Fixups, - const MCSubtargetInfo &STI) const { - const MCOperand &MO = MI.getOperand(OpIdx); - assert(MO.isImm() && "Expected an immediate value for the scale amount!"); - return MO.getImm() - 8; -} - -/// getMoveVecShifterOpValue - Return the encoded value for the vector move -/// shifter (MSL). -uint32_t -ARM64MCCodeEmitter::getMoveVecShifterOpValue(const MCInst &MI, unsigned OpIdx, - SmallVectorImpl &Fixups, - const MCSubtargetInfo &STI) const { - const MCOperand &MO = MI.getOperand(OpIdx); - assert(MO.isImm() && - "Expected an immediate value for the move shift amount!"); - unsigned ShiftVal = ARM64_AM::getShiftValue(MO.getImm()); - assert((ShiftVal == 8 || ShiftVal == 16) && "Invalid shift amount!"); - return ShiftVal == 8 ? 
0 : 1; -} - -unsigned ARM64MCCodeEmitter::fixMOVZ(const MCInst &MI, unsigned EncodedValue, - const MCSubtargetInfo &STI) const { - // If one of the signed fixup kinds is applied to a MOVZ instruction, the - // eventual result could be either a MOVZ or a MOVN. It's the MCCodeEmitter's - // job to ensure that any bits possibly affected by this are 0. This means we - // must zero out bit 30 (essentially emitting a MOVN). - MCOperand UImm16MO = MI.getOperand(1); - - // Nothing to do if there's no fixup. - if (UImm16MO.isImm()) - return EncodedValue; - - const ARM64MCExpr *A64E = cast(UImm16MO.getExpr()); - switch (A64E->getKind()) { - case ARM64MCExpr::VK_DTPREL_G2: - case ARM64MCExpr::VK_DTPREL_G1: - case ARM64MCExpr::VK_DTPREL_G0: - case ARM64MCExpr::VK_GOTTPREL_G1: - case ARM64MCExpr::VK_TPREL_G2: - case ARM64MCExpr::VK_TPREL_G1: - case ARM64MCExpr::VK_TPREL_G0: - return EncodedValue & ~(1u << 30); - default: - // Nothing to do for an unsigned fixup. - return EncodedValue; - } - - - return EncodedValue & ~(1u << 30); -} - -void ARM64MCCodeEmitter::EncodeInstruction(const MCInst &MI, raw_ostream &OS, - SmallVectorImpl &Fixups, - const MCSubtargetInfo &STI) const { - if (MI.getOpcode() == ARM64::TLSDESCCALL) { - // This is a directive which applies an R_AARCH64_TLSDESC_CALL to the - // following (BLR) instruction. It doesn't emit any code itself so it - // doesn't go through the normal TableGenerated channels. - MCFixupKind Fixup = MCFixupKind(ARM64::fixup_arm64_tlsdesc_call); - Fixups.push_back(MCFixup::Create(0, MI.getOperand(0).getExpr(), Fixup)); - return; - } - - uint64_t Binary = getBinaryCodeForInstr(MI, Fixups, STI); - EmitConstant(Binary, 4, OS); - ++MCNumEmitted; // Keep track of the # of mi's emitted. -} - -unsigned -ARM64MCCodeEmitter::fixMulHigh(const MCInst &MI, - unsigned EncodedValue, - const MCSubtargetInfo &STI) const { - // The Ra field of SMULH and UMULH is unused: it should be assembled as 31 - // (i.e. all bits 1) but is ignored by the processor. - EncodedValue |= 0x1f << 10; - return EncodedValue; -} - -template unsigned -ARM64MCCodeEmitter::fixLoadStoreExclusive(const MCInst &MI, - unsigned EncodedValue, - const MCSubtargetInfo &STI) const { - if (!hasRs) EncodedValue |= 0x001F0000; - if (!hasRt2) EncodedValue |= 0x00007C00; - - return EncodedValue; -} - -unsigned -ARM64MCCodeEmitter::fixOneOperandFPComparison(const MCInst &MI, - unsigned EncodedValue, - const MCSubtargetInfo &STI) const { - // The Rm field of FCMP and friends is unused - it should be assembled - // as 0, but is ignored by the processor. - EncodedValue &= ~(0x1f << 16); - return EncodedValue; -} - -#include "ARM64GenMCCodeEmitter.inc" diff --git a/lib/Target/ARM64/MCTargetDesc/ARM64MCExpr.cpp b/lib/Target/ARM64/MCTargetDesc/ARM64MCExpr.cpp deleted file mode 100644 index efa820b097f..00000000000 --- a/lib/Target/ARM64/MCTargetDesc/ARM64MCExpr.cpp +++ /dev/null @@ -1,174 +0,0 @@ -//===-- ARM64MCExpr.cpp - ARM64 specific MC expression classes --------===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -// This file contains the implementation of the assembly expression modifiers -// accepted by the AArch64 architecture (e.g. ":lo12:", ":gottprel_g1:", ...). 
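
The bit-30 handling in fixMOVZ above is easier to see numerically: MOVZ and MOVN share the move-wide encoding and differ in the opc field, so clearing bit 30 leaves the fixup free to materialize either form once the signed TLS offset is known. A standalone sketch of the masking (the sample encoding is the 32-bit "movz w0, #0" pattern, quoted from memory rather than from this patch):

#include <cstdint>
#include <cstdio>

// Mirrors fixMOVZ: with a signed TLS modifier attached, the emitter must not
// bake in the MOVZ/MOVN choice, so it zeroes bit 30 of the encoded instruction.
static uint32_t neutralizeMovzOpc(uint32_t EncodedValue) {
  return EncodedValue & ~(1u << 30);
}

int main() {
  const uint32_t Movz = 0x52800000; // assumed encoding of "movz w0, #0" (bit 30 set)
  std::printf("0x%08x -> 0x%08x\n", Movz, neutralizeMovzOpc(Movz));
  // Expected: 0x52800000 -> 0x12800000, the corresponding MOVN pattern.
  return 0;
}
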
-// -//===----------------------------------------------------------------------===// - -#include "ARM64MCExpr.h" -#include "llvm/MC/MCAssembler.h" -#include "llvm/MC/MCContext.h" -#include "llvm/MC/MCELF.h" -#include "llvm/MC/MCSymbol.h" -#include "llvm/MC/MCValue.h" -#include "llvm/Object/ELF.h" -#include "llvm/Support/ErrorHandling.h" - -using namespace llvm; - -#define DEBUG_TYPE "aarch64symbolrefexpr" - -const ARM64MCExpr *ARM64MCExpr::Create(const MCExpr *Expr, VariantKind Kind, - MCContext &Ctx) { - return new (Ctx) ARM64MCExpr(Expr, Kind); -} - -StringRef ARM64MCExpr::getVariantKindName() const { - switch (static_cast(getKind())) { - case VK_CALL: return ""; - case VK_LO12: return ":lo12:"; - case VK_ABS_G3: return ":abs_g3:"; - case VK_ABS_G2: return ":abs_g2:"; - case VK_ABS_G2_S: return ":abs_g2_s:"; - case VK_ABS_G2_NC: return ":abs_g2_nc:"; - case VK_ABS_G1: return ":abs_g1:"; - case VK_ABS_G1_S: return ":abs_g1_s:"; - case VK_ABS_G1_NC: return ":abs_g1_nc:"; - case VK_ABS_G0: return ":abs_g0:"; - case VK_ABS_G0_S: return ":abs_g0_s:"; - case VK_ABS_G0_NC: return ":abs_g0_nc:"; - case VK_DTPREL_G2: return ":dtprel_g2:"; - case VK_DTPREL_G1: return ":dtprel_g1:"; - case VK_DTPREL_G1_NC: return ":dtprel_g1_nc:"; - case VK_DTPREL_G0: return ":dtprel_g0:"; - case VK_DTPREL_G0_NC: return ":dtprel_g0_nc:"; - case VK_DTPREL_HI12: return ":dtprel_hi12:"; - case VK_DTPREL_LO12: return ":dtprel_lo12:"; - case VK_DTPREL_LO12_NC: return ":dtprel_lo12_nc:"; - case VK_TPREL_G2: return ":tprel_g2:"; - case VK_TPREL_G1: return ":tprel_g1:"; - case VK_TPREL_G1_NC: return ":tprel_g1_nc:"; - case VK_TPREL_G0: return ":tprel_g0:"; - case VK_TPREL_G0_NC: return ":tprel_g0_nc:"; - case VK_TPREL_HI12: return ":tprel_hi12:"; - case VK_TPREL_LO12: return ":tprel_lo12:"; - case VK_TPREL_LO12_NC: return ":tprel_lo12_nc:"; - case VK_TLSDESC_LO12: return ":tlsdesc_lo12:"; - case VK_ABS_PAGE: return ""; - case VK_GOT_PAGE: return ":got:"; - case VK_GOT_LO12: return ":got_lo12:"; - case VK_GOTTPREL_PAGE: return ":gottprel:"; - case VK_GOTTPREL_LO12_NC: return ":gottprel_lo12:"; - case VK_GOTTPREL_G1: return ":gottprel_g1:"; - case VK_GOTTPREL_G0_NC: return ":gottprel_g0_nc:"; - case VK_TLSDESC: return ""; - case VK_TLSDESC_PAGE: return ":tlsdesc:"; - default: - llvm_unreachable("Invalid ELF symbol kind"); - } -} - -void ARM64MCExpr::PrintImpl(raw_ostream &OS) const { - if (getKind() != VK_NONE) - OS << getVariantKindName(); - OS << *Expr; -} - -// FIXME: This basically copies MCObjectStreamer::AddValueSymbols. Perhaps -// that method should be made public? -// FIXME: really do above: now that two backends are using it. 
-static void AddValueSymbolsImpl(const MCExpr *Value, MCAssembler *Asm) { - switch (Value->getKind()) { - case MCExpr::Target: - llvm_unreachable("Can't handle nested target expr!"); - break; - - case MCExpr::Constant: - break; - - case MCExpr::Binary: { - const MCBinaryExpr *BE = cast(Value); - AddValueSymbolsImpl(BE->getLHS(), Asm); - AddValueSymbolsImpl(BE->getRHS(), Asm); - break; - } - - case MCExpr::SymbolRef: - Asm->getOrCreateSymbolData(cast(Value)->getSymbol()); - break; - - case MCExpr::Unary: - AddValueSymbolsImpl(cast(Value)->getSubExpr(), Asm); - break; - } -} - -void ARM64MCExpr::AddValueSymbols(MCAssembler *Asm) const { - AddValueSymbolsImpl(getSubExpr(), Asm); -} - -const MCSection *ARM64MCExpr::FindAssociatedSection() const { - llvm_unreachable("FIXME: what goes here?"); -} - -bool ARM64MCExpr::EvaluateAsRelocatableImpl(MCValue &Res, - const MCAsmLayout *Layout) const { - if (!getSubExpr()->EvaluateAsRelocatable(Res, Layout)) - return false; - - Res = - MCValue::get(Res.getSymA(), Res.getSymB(), Res.getConstant(), getKind()); - - return true; -} - -static void fixELFSymbolsInTLSFixupsImpl(const MCExpr *Expr, MCAssembler &Asm) { - switch (Expr->getKind()) { - case MCExpr::Target: - llvm_unreachable("Can't handle nested target expression"); - break; - case MCExpr::Constant: - break; - - case MCExpr::Binary: { - const MCBinaryExpr *BE = cast(Expr); - fixELFSymbolsInTLSFixupsImpl(BE->getLHS(), Asm); - fixELFSymbolsInTLSFixupsImpl(BE->getRHS(), Asm); - break; - } - - case MCExpr::SymbolRef: { - // We're known to be under a TLS fixup, so any symbol should be - // modified. There should be only one. - const MCSymbolRefExpr &SymRef = *cast(Expr); - MCSymbolData &SD = Asm.getOrCreateSymbolData(SymRef.getSymbol()); - MCELF::SetType(SD, ELF::STT_TLS); - break; - } - - case MCExpr::Unary: - fixELFSymbolsInTLSFixupsImpl(cast(Expr)->getSubExpr(), Asm); - break; - } -} - -void ARM64MCExpr::fixELFSymbolsInTLSFixups(MCAssembler &Asm) const { - switch (getSymbolLoc(Kind)) { - default: - return; - case VK_DTPREL: - case VK_GOTTPREL: - case VK_TPREL: - case VK_TLSDESC: - break; - } - - fixELFSymbolsInTLSFixupsImpl(getSubExpr(), Asm); -} diff --git a/lib/Target/ARM64/MCTargetDesc/ARM64MCExpr.h b/lib/Target/ARM64/MCTargetDesc/ARM64MCExpr.h deleted file mode 100644 index d8325465178..00000000000 --- a/lib/Target/ARM64/MCTargetDesc/ARM64MCExpr.h +++ /dev/null @@ -1,168 +0,0 @@ -//=---- ARM64MCExpr.h - ARM64 specific MC expression classes ------*- C++ -*-=// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -// This file describes ARM64-specific MCExprs, used for modifiers like -// ":lo12:" or ":gottprel_g1:". -// -//===----------------------------------------------------------------------===// - -#ifndef LLVM_ARM64MCEXPR_H -#define LLVM_ARM64MCEXPR_H - -#include "llvm/MC/MCExpr.h" -#include "llvm/Support/ErrorHandling.h" - -namespace llvm { - -class ARM64MCExpr : public MCTargetExpr { -public: - enum VariantKind { - VK_NONE = 0x000, - - // Symbol locations specifying (roughly speaking) what calculation should be - // performed to construct the final address for the relocated - // symbol. E.g. direct, via the GOT, ... 
- VK_ABS = 0x001, - VK_SABS = 0x002, - VK_GOT = 0x003, - VK_DTPREL = 0x004, - VK_GOTTPREL = 0x005, - VK_TPREL = 0x006, - VK_TLSDESC = 0x007, - VK_SymLocBits = 0x00f, - - // Variants specifying which part of the final address calculation is - // used. E.g. the low 12 bits for an ADD/LDR, the middle 16 bits for a - // MOVZ/MOVK. - VK_PAGE = 0x010, - VK_PAGEOFF = 0x020, - VK_HI12 = 0x030, - VK_G0 = 0x040, - VK_G1 = 0x050, - VK_G2 = 0x060, - VK_G3 = 0x070, - VK_AddressFragBits = 0x0f0, - - // Whether the final relocation is a checked one (where a linker should - // perform a range-check on the final address) or not. Note that this field - // is unfortunately sometimes omitted from the assembly syntax. E.g. :lo12: - // on its own is a non-checked relocation. We side with ELF on being - // explicit about this! - VK_NC = 0x100, - - // Convenience definitions for referring to specific textual representations - // of relocation specifiers. Note that this means the "_NC" is sometimes - // omitted in line with assembly syntax here (VK_LO12 rather than VK_LO12_NC - // since a user would write ":lo12:"). - VK_CALL = VK_ABS, - VK_ABS_PAGE = VK_ABS | VK_PAGE, - VK_ABS_G3 = VK_ABS | VK_G3, - VK_ABS_G2 = VK_ABS | VK_G2, - VK_ABS_G2_S = VK_SABS | VK_G2, - VK_ABS_G2_NC = VK_ABS | VK_G2 | VK_NC, - VK_ABS_G1 = VK_ABS | VK_G1, - VK_ABS_G1_S = VK_SABS | VK_G1, - VK_ABS_G1_NC = VK_ABS | VK_G1 | VK_NC, - VK_ABS_G0 = VK_ABS | VK_G0, - VK_ABS_G0_S = VK_SABS | VK_G0, - VK_ABS_G0_NC = VK_ABS | VK_G0 | VK_NC, - VK_LO12 = VK_ABS | VK_PAGEOFF | VK_NC, - VK_GOT_LO12 = VK_GOT | VK_PAGEOFF | VK_NC, - VK_GOT_PAGE = VK_GOT | VK_PAGE, - VK_DTPREL_G2 = VK_DTPREL | VK_G2, - VK_DTPREL_G1 = VK_DTPREL | VK_G1, - VK_DTPREL_G1_NC = VK_DTPREL | VK_G1 | VK_NC, - VK_DTPREL_G0 = VK_DTPREL | VK_G0, - VK_DTPREL_G0_NC = VK_DTPREL | VK_G0 | VK_NC, - VK_DTPREL_HI12 = VK_DTPREL | VK_HI12, - VK_DTPREL_LO12 = VK_DTPREL | VK_PAGEOFF, - VK_DTPREL_LO12_NC = VK_DTPREL | VK_PAGEOFF | VK_NC, - VK_GOTTPREL_PAGE = VK_GOTTPREL | VK_PAGE, - VK_GOTTPREL_LO12_NC = VK_GOTTPREL | VK_PAGEOFF | VK_NC, - VK_GOTTPREL_G1 = VK_GOTTPREL | VK_G1, - VK_GOTTPREL_G0_NC = VK_GOTTPREL | VK_G0 | VK_NC, - VK_TPREL_G2 = VK_TPREL | VK_G2, - VK_TPREL_G1 = VK_TPREL | VK_G1, - VK_TPREL_G1_NC = VK_TPREL | VK_G1 | VK_NC, - VK_TPREL_G0 = VK_TPREL | VK_G0, - VK_TPREL_G0_NC = VK_TPREL | VK_G0 | VK_NC, - VK_TPREL_HI12 = VK_TPREL | VK_HI12, - VK_TPREL_LO12 = VK_TPREL | VK_PAGEOFF, - VK_TPREL_LO12_NC = VK_TPREL | VK_PAGEOFF | VK_NC, - VK_TLSDESC_LO12 = VK_TLSDESC | VK_PAGEOFF | VK_NC, - VK_TLSDESC_PAGE = VK_TLSDESC | VK_PAGE, - - VK_INVALID = 0xfff - }; - -private: - const MCExpr *Expr; - const VariantKind Kind; - - explicit ARM64MCExpr(const MCExpr *Expr, VariantKind Kind) - : Expr(Expr), Kind(Kind) {} - -public: - /// @name Construction - /// @{ - - static const ARM64MCExpr *Create(const MCExpr *Expr, VariantKind Kind, - MCContext &Ctx); - - /// @} - /// @name Accessors - /// @{ - - /// Get the kind of this expression. - VariantKind getKind() const { return static_cast(Kind); } - - /// Get the expression this modifier applies to. - const MCExpr *getSubExpr() const { return Expr; } - - /// @} - /// @name VariantKind information extractors. 
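
All of the composed VK_* constants above follow the one layout spelled out at the top of the enum: symbol location in the low nibble, address fragment in the next nibble, NC flag at 0x100, which is what the extractor helpers declared next rely on. A standalone sketch of the decomposition, reusing the same constant values:

#include <cstdint>
#include <cstdio>

// Same bit layout as ARM64MCExpr::VariantKind (values copied from the enum above).
enum : uint32_t {
  VK_DTPREL = 0x004,            // symbol location
  VK_PAGEOFF = 0x020,           // address fragment (low 12 bits, ":lo12:" style)
  VK_NC = 0x100,                // "not checked" flag
  VK_SymLocBits = 0x00f,
  VK_AddressFragBits = 0x0f0,
  VK_DTPREL_LO12_NC = VK_DTPREL | VK_PAGEOFF | VK_NC, // ":dtprel_lo12_nc:" -> 0x124
};

int main() {
  const uint32_t K = VK_DTPREL_LO12_NC;
  std::printf("symloc=0x%x frag=0x%x nc=%d\n",
              K & VK_SymLocBits,        // 0x04 -> DTPREL
              K & VK_AddressFragBits,   // 0x20 -> PAGEOFF
              (K & VK_NC) != 0);        // 1    -> unchecked variant
  return 0;
}
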
- /// @{ - - static VariantKind getSymbolLoc(VariantKind Kind) { - return static_cast(Kind & VK_SymLocBits); - } - - static VariantKind getAddressFrag(VariantKind Kind) { - return static_cast(Kind & VK_AddressFragBits); - } - - static bool isNotChecked(VariantKind Kind) { return Kind & VK_NC; } - - /// @} - - /// Convert the variant kind into an ELF-appropriate modifier - /// (e.g. ":got:", ":lo12:"). - StringRef getVariantKindName() const; - - void PrintImpl(raw_ostream &OS) const override; - - void AddValueSymbols(MCAssembler *) const override; - - const MCSection *FindAssociatedSection() const override; - - bool EvaluateAsRelocatableImpl(MCValue &Res, - const MCAsmLayout *Layout) const override; - - void fixELFSymbolsInTLSFixups(MCAssembler &Asm) const override; - - static bool classof(const MCExpr *E) { - return E->getKind() == MCExpr::Target; - } - - static bool classof(const ARM64MCExpr *) { return true; } - -}; -} // end namespace llvm - -#endif diff --git a/lib/Target/ARM64/MCTargetDesc/ARM64MCTargetDesc.cpp b/lib/Target/ARM64/MCTargetDesc/ARM64MCTargetDesc.cpp deleted file mode 100644 index 079d3588f6e..00000000000 --- a/lib/Target/ARM64/MCTargetDesc/ARM64MCTargetDesc.cpp +++ /dev/null @@ -1,210 +0,0 @@ -//===-- ARM64MCTargetDesc.cpp - ARM64 Target Descriptions -------*- C++ -*-===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -// This file provides ARM64 specific target descriptions. -// -//===----------------------------------------------------------------------===// - -#include "ARM64MCTargetDesc.h" -#include "ARM64ELFStreamer.h" -#include "ARM64MCAsmInfo.h" -#include "InstPrinter/ARM64InstPrinter.h" -#include "llvm/MC/MCCodeGenInfo.h" -#include "llvm/MC/MCInstrInfo.h" -#include "llvm/MC/MCRegisterInfo.h" -#include "llvm/MC/MCStreamer.h" -#include "llvm/MC/MCSubtargetInfo.h" -#include "llvm/Support/ErrorHandling.h" -#include "llvm/Support/TargetRegistry.h" - -using namespace llvm; - -#define GET_INSTRINFO_MC_DESC -#include "ARM64GenInstrInfo.inc" - -#define GET_SUBTARGETINFO_MC_DESC -#include "ARM64GenSubtargetInfo.inc" - -#define GET_REGINFO_MC_DESC -#include "ARM64GenRegisterInfo.inc" - -static MCInstrInfo *createARM64MCInstrInfo() { - MCInstrInfo *X = new MCInstrInfo(); - InitARM64MCInstrInfo(X); - return X; -} - -static MCSubtargetInfo *createARM64MCSubtargetInfo(StringRef TT, StringRef CPU, - StringRef FS) { - MCSubtargetInfo *X = new MCSubtargetInfo(); - - if (CPU.empty()) - CPU = "generic"; - - InitARM64MCSubtargetInfo(X, TT, CPU, FS); - return X; -} - -static MCRegisterInfo *createARM64MCRegisterInfo(StringRef Triple) { - MCRegisterInfo *X = new MCRegisterInfo(); - InitARM64MCRegisterInfo(X, ARM64::LR); - return X; -} - -static MCAsmInfo *createARM64MCAsmInfo(const MCRegisterInfo &MRI, - StringRef TT) { - Triple TheTriple(TT); - - MCAsmInfo *MAI; - if (TheTriple.isOSDarwin()) - MAI = new ARM64MCAsmInfoDarwin(); - else { - assert(TheTriple.isOSBinFormatELF() && "Only expect Darwin or ELF"); - MAI = new ARM64MCAsmInfoELF(TT); - } - - // Initial state of the frame pointer is SP. 
- unsigned Reg = MRI.getDwarfRegNum(ARM64::SP, true); - MCCFIInstruction Inst = MCCFIInstruction::createDefCfa(nullptr, Reg, 0); - MAI->addInitialFrameState(Inst); - - return MAI; -} - -static MCCodeGenInfo *createARM64MCCodeGenInfo(StringRef TT, Reloc::Model RM, - CodeModel::Model CM, - CodeGenOpt::Level OL) { - Triple TheTriple(TT); - assert((TheTriple.isOSBinFormatELF() || TheTriple.isOSBinFormatMachO()) && - "Only expect Darwin and ELF targets"); - - if (CM == CodeModel::Default) - CM = CodeModel::Small; - // The default MCJIT memory managers make no guarantees about where they can - // find an executable page; JITed code needs to be able to refer to globals - // no matter how far away they are. - else if (CM == CodeModel::JITDefault) - CM = CodeModel::Large; - else if (CM != CodeModel::Small && CM != CodeModel::Large) - report_fatal_error("Only small and large code models are allowed on ARM64"); - - // ARM64 Darwin is always PIC. - if (TheTriple.isOSDarwin()) - RM = Reloc::PIC_; - // On ELF platforms the default static relocation model has a smart enough - // linker to cope with referencing external symbols defined in a shared - // library. Hence DynamicNoPIC doesn't need to be promoted to PIC. - else if (RM == Reloc::Default || RM == Reloc::DynamicNoPIC) - RM = Reloc::Static; - - MCCodeGenInfo *X = new MCCodeGenInfo(); - X->InitMCCodeGenInfo(RM, CM, OL); - return X; -} - -static MCInstPrinter *createARM64MCInstPrinter(const Target &T, - unsigned SyntaxVariant, - const MCAsmInfo &MAI, - const MCInstrInfo &MII, - const MCRegisterInfo &MRI, - const MCSubtargetInfo &STI) { - if (SyntaxVariant == 0) - return new ARM64InstPrinter(MAI, MII, MRI, STI); - if (SyntaxVariant == 1) - return new ARM64AppleInstPrinter(MAI, MII, MRI, STI); - - return nullptr; -} - -static MCStreamer *createMCStreamer(const Target &T, StringRef TT, - MCContext &Ctx, MCAsmBackend &TAB, - raw_ostream &OS, MCCodeEmitter *Emitter, - const MCSubtargetInfo &STI, bool RelaxAll, - bool NoExecStack) { - Triple TheTriple(TT); - - if (TheTriple.isOSDarwin()) - return createMachOStreamer(Ctx, TAB, OS, Emitter, RelaxAll, - /*LabelSections*/ true); - - return createARM64ELFStreamer(Ctx, TAB, OS, Emitter, RelaxAll, NoExecStack); -} - -// Force static initialization. -extern "C" void LLVMInitializeARM64TargetMC() { - // Register the MC asm info. - RegisterMCAsmInfoFn X(TheARM64leTarget, createARM64MCAsmInfo); - RegisterMCAsmInfoFn Y(TheARM64beTarget, createARM64MCAsmInfo); - RegisterMCAsmInfoFn Z(TheAArch64leTarget, createARM64MCAsmInfo); - RegisterMCAsmInfoFn W(TheAArch64beTarget, createARM64MCAsmInfo); - - // Register the MC codegen info. - TargetRegistry::RegisterMCCodeGenInfo(TheARM64leTarget, - createARM64MCCodeGenInfo); - TargetRegistry::RegisterMCCodeGenInfo(TheARM64beTarget, - createARM64MCCodeGenInfo); - TargetRegistry::RegisterMCCodeGenInfo(TheAArch64leTarget, - createARM64MCCodeGenInfo); - TargetRegistry::RegisterMCCodeGenInfo(TheAArch64beTarget, - createARM64MCCodeGenInfo); - - // Register the MC instruction info. - TargetRegistry::RegisterMCInstrInfo(TheARM64leTarget, createARM64MCInstrInfo); - TargetRegistry::RegisterMCInstrInfo(TheARM64beTarget, createARM64MCInstrInfo); - TargetRegistry::RegisterMCInstrInfo(TheAArch64leTarget, createARM64MCInstrInfo); - TargetRegistry::RegisterMCInstrInfo(TheAArch64beTarget, createARM64MCInstrInfo); - - // Register the MC register info. 
- TargetRegistry::RegisterMCRegInfo(TheARM64leTarget, createARM64MCRegisterInfo); - TargetRegistry::RegisterMCRegInfo(TheARM64beTarget, createARM64MCRegisterInfo); - TargetRegistry::RegisterMCRegInfo(TheAArch64leTarget, createARM64MCRegisterInfo); - TargetRegistry::RegisterMCRegInfo(TheAArch64beTarget, createARM64MCRegisterInfo); - - // Register the MC subtarget info. - TargetRegistry::RegisterMCSubtargetInfo(TheARM64leTarget, - createARM64MCSubtargetInfo); - TargetRegistry::RegisterMCSubtargetInfo(TheARM64beTarget, - createARM64MCSubtargetInfo); - TargetRegistry::RegisterMCSubtargetInfo(TheAArch64leTarget, - createARM64MCSubtargetInfo); - TargetRegistry::RegisterMCSubtargetInfo(TheAArch64beTarget, - createARM64MCSubtargetInfo); - - // Register the asm backend. - TargetRegistry::RegisterMCAsmBackend(TheARM64leTarget, createARM64leAsmBackend); - TargetRegistry::RegisterMCAsmBackend(TheARM64beTarget, createARM64beAsmBackend); - TargetRegistry::RegisterMCAsmBackend(TheAArch64leTarget, createARM64leAsmBackend); - TargetRegistry::RegisterMCAsmBackend(TheAArch64beTarget, createARM64beAsmBackend); - - // Register the MC Code Emitter - TargetRegistry::RegisterMCCodeEmitter(TheARM64leTarget, - createARM64MCCodeEmitter); - TargetRegistry::RegisterMCCodeEmitter(TheARM64beTarget, - createARM64MCCodeEmitter); - TargetRegistry::RegisterMCCodeEmitter(TheAArch64leTarget, - createARM64MCCodeEmitter); - TargetRegistry::RegisterMCCodeEmitter(TheAArch64beTarget, - createARM64MCCodeEmitter); - - // Register the object streamer. - TargetRegistry::RegisterMCObjectStreamer(TheARM64leTarget, createMCStreamer); - TargetRegistry::RegisterMCObjectStreamer(TheARM64beTarget, createMCStreamer); - TargetRegistry::RegisterMCObjectStreamer(TheAArch64leTarget, createMCStreamer); - TargetRegistry::RegisterMCObjectStreamer(TheAArch64beTarget, createMCStreamer); - - // Register the MCInstPrinter. - TargetRegistry::RegisterMCInstPrinter(TheARM64leTarget, - createARM64MCInstPrinter); - TargetRegistry::RegisterMCInstPrinter(TheARM64beTarget, - createARM64MCInstPrinter); - TargetRegistry::RegisterMCInstPrinter(TheAArch64leTarget, - createARM64MCInstPrinter); - TargetRegistry::RegisterMCInstPrinter(TheAArch64beTarget, - createARM64MCInstPrinter); -} diff --git a/lib/Target/ARM64/MCTargetDesc/ARM64MCTargetDesc.h b/lib/Target/ARM64/MCTargetDesc/ARM64MCTargetDesc.h deleted file mode 100644 index f2e9c17a378..00000000000 --- a/lib/Target/ARM64/MCTargetDesc/ARM64MCTargetDesc.h +++ /dev/null @@ -1,68 +0,0 @@ -//===-- ARM64MCTargetDesc.h - ARM64 Target Descriptions ---------*- C++ -*-===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -// This file provides ARM64 specific target descriptions. 
-// -//===----------------------------------------------------------------------===// - -#ifndef ARM64MCTARGETDESC_H -#define ARM64MCTARGETDESC_H - -#include "llvm/Support/DataTypes.h" -#include - -namespace llvm { -class MCAsmBackend; -class MCCodeEmitter; -class MCContext; -class MCInstrInfo; -class MCRegisterInfo; -class MCObjectWriter; -class MCSubtargetInfo; -class StringRef; -class Target; -class raw_ostream; - -extern Target TheARM64leTarget; -extern Target TheARM64beTarget; -extern Target TheAArch64leTarget; -extern Target TheAArch64beTarget; - -MCCodeEmitter *createARM64MCCodeEmitter(const MCInstrInfo &MCII, - const MCRegisterInfo &MRI, - const MCSubtargetInfo &STI, - MCContext &Ctx); -MCAsmBackend *createARM64leAsmBackend(const Target &T, const MCRegisterInfo &MRI, - StringRef TT, StringRef CPU); -MCAsmBackend *createARM64beAsmBackend(const Target &T, const MCRegisterInfo &MRI, - StringRef TT, StringRef CPU); - - MCObjectWriter *createARM64ELFObjectWriter(raw_ostream &OS, uint8_t OSABI, - bool IsLittleEndian); - -MCObjectWriter *createARM64MachObjectWriter(raw_ostream &OS, uint32_t CPUType, - uint32_t CPUSubtype); - -} // End llvm namespace - -// Defines symbolic names for ARM64 registers. This defines a mapping from -// register name to register number. -// -#define GET_REGINFO_ENUM -#include "ARM64GenRegisterInfo.inc" - -// Defines symbolic names for the ARM64 instructions. -// -#define GET_INSTRINFO_ENUM -#include "ARM64GenInstrInfo.inc" - -#define GET_SUBTARGETINFO_ENUM -#include "ARM64GenSubtargetInfo.inc" - -#endif diff --git a/lib/Target/ARM64/MCTargetDesc/ARM64MachObjectWriter.cpp b/lib/Target/ARM64/MCTargetDesc/ARM64MachObjectWriter.cpp deleted file mode 100644 index 1c48159bbe9..00000000000 --- a/lib/Target/ARM64/MCTargetDesc/ARM64MachObjectWriter.cpp +++ /dev/null @@ -1,395 +0,0 @@ -//===-- ARMMachObjectWriter.cpp - ARM Mach Object Writer ------------------===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. 
-// -//===----------------------------------------------------------------------===// - -#include "MCTargetDesc/ARM64FixupKinds.h" -#include "MCTargetDesc/ARM64MCTargetDesc.h" -#include "llvm/MC/MCAssembler.h" -#include "llvm/MC/MCAsmLayout.h" -#include "llvm/MC/MCContext.h" -#include "llvm/MC/MCExpr.h" -#include "llvm/MC/MCFixup.h" -#include "llvm/MC/MCMachObjectWriter.h" -#include "llvm/MC/MCSectionMachO.h" -#include "llvm/MC/MCValue.h" -#include "llvm/ADT/Twine.h" -#include "llvm/Support/ErrorHandling.h" -#include "llvm/Support/MachO.h" -using namespace llvm; - -namespace { -class ARM64MachObjectWriter : public MCMachObjectTargetWriter { - bool getARM64FixupKindMachOInfo(const MCFixup &Fixup, unsigned &RelocType, - const MCSymbolRefExpr *Sym, - unsigned &Log2Size, const MCAssembler &Asm); - -public: - ARM64MachObjectWriter(uint32_t CPUType, uint32_t CPUSubtype) - : MCMachObjectTargetWriter(true /* is64Bit */, CPUType, CPUSubtype, - /*UseAggressiveSymbolFolding=*/true) {} - - void RecordRelocation(MachObjectWriter *Writer, const MCAssembler &Asm, - const MCAsmLayout &Layout, const MCFragment *Fragment, - const MCFixup &Fixup, MCValue Target, - uint64_t &FixedValue) override; -}; -} - -bool ARM64MachObjectWriter::getARM64FixupKindMachOInfo( - const MCFixup &Fixup, unsigned &RelocType, const MCSymbolRefExpr *Sym, - unsigned &Log2Size, const MCAssembler &Asm) { - RelocType = unsigned(MachO::ARM64_RELOC_UNSIGNED); - Log2Size = ~0U; - - switch ((unsigned)Fixup.getKind()) { - default: - return false; - - case FK_Data_1: - Log2Size = llvm::Log2_32(1); - return true; - case FK_Data_2: - Log2Size = llvm::Log2_32(2); - return true; - case FK_Data_4: - Log2Size = llvm::Log2_32(4); - if (Sym->getKind() == MCSymbolRefExpr::VK_GOT) - RelocType = unsigned(MachO::ARM64_RELOC_POINTER_TO_GOT); - return true; - case FK_Data_8: - Log2Size = llvm::Log2_32(8); - if (Sym->getKind() == MCSymbolRefExpr::VK_GOT) - RelocType = unsigned(MachO::ARM64_RELOC_POINTER_TO_GOT); - return true; - case ARM64::fixup_arm64_add_imm12: - case ARM64::fixup_arm64_ldst_imm12_scale1: - case ARM64::fixup_arm64_ldst_imm12_scale2: - case ARM64::fixup_arm64_ldst_imm12_scale4: - case ARM64::fixup_arm64_ldst_imm12_scale8: - case ARM64::fixup_arm64_ldst_imm12_scale16: - Log2Size = llvm::Log2_32(4); - switch (Sym->getKind()) { - default: - assert(0 && "Unexpected symbol reference variant kind!"); - case MCSymbolRefExpr::VK_PAGEOFF: - RelocType = unsigned(MachO::ARM64_RELOC_PAGEOFF12); - return true; - case MCSymbolRefExpr::VK_GOTPAGEOFF: - RelocType = unsigned(MachO::ARM64_RELOC_GOT_LOAD_PAGEOFF12); - return true; - case MCSymbolRefExpr::VK_TLVPPAGEOFF: - RelocType = unsigned(MachO::ARM64_RELOC_TLVP_LOAD_PAGEOFF12); - return true; - } - case ARM64::fixup_arm64_pcrel_adrp_imm21: - Log2Size = llvm::Log2_32(4); - // This encompasses the relocation for the whole 21-bit value. 
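[Editor's sketch] getARM64FixupKindMachOInfo above reports every fixup's width as a log2 value because Mach-O relocation entries store the width of the patched field in a 2-bit r_length slot; data fixups use their byte size and the instruction fixups all use 4 bytes. A standalone sketch of that calculation (the helper name is illustrative; the real code calls llvm::Log2_32):

    #include <cassert>

    // Width of the patched field, encoded as log2(bytes) for Mach-O r_length.
    static unsigned log2Size(unsigned Bytes) {
      unsigned L = 0;
      while ((1u << L) < Bytes)
        ++L;
      return L;
    }

    int main() {
      assert(log2Size(1) == 0); // FK_Data_1
      assert(log2Size(2) == 1); // FK_Data_2
      assert(log2Size(4) == 2); // FK_Data_4 and the 32-bit instruction fixups
      assert(log2Size(8) == 3); // FK_Data_8
      return 0;
    }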
- switch (Sym->getKind()) { - default: - Asm.getContext().FatalError(Fixup.getLoc(), - "ADR/ADRP relocations must be GOT relative"); - case MCSymbolRefExpr::VK_PAGE: - RelocType = unsigned(MachO::ARM64_RELOC_PAGE21); - return true; - case MCSymbolRefExpr::VK_GOTPAGE: - RelocType = unsigned(MachO::ARM64_RELOC_GOT_LOAD_PAGE21); - return true; - case MCSymbolRefExpr::VK_TLVPPAGE: - RelocType = unsigned(MachO::ARM64_RELOC_TLVP_LOAD_PAGE21); - return true; - } - return true; - case ARM64::fixup_arm64_pcrel_branch26: - case ARM64::fixup_arm64_pcrel_call26: - Log2Size = llvm::Log2_32(4); - RelocType = unsigned(MachO::ARM64_RELOC_BRANCH26); - return true; - } -} - -void ARM64MachObjectWriter::RecordRelocation( - MachObjectWriter *Writer, const MCAssembler &Asm, const MCAsmLayout &Layout, - const MCFragment *Fragment, const MCFixup &Fixup, MCValue Target, - uint64_t &FixedValue) { - unsigned IsPCRel = Writer->isFixupKindPCRel(Asm, Fixup.getKind()); - - // See . - uint32_t FixupOffset = Layout.getFragmentOffset(Fragment); - unsigned Log2Size = 0; - int64_t Value = 0; - unsigned Index = 0; - unsigned IsExtern = 0; - unsigned Type = 0; - unsigned Kind = Fixup.getKind(); - - FixupOffset += Fixup.getOffset(); - - // ARM64 pcrel relocation addends do not include the section offset. - if (IsPCRel) - FixedValue += FixupOffset; - - // ADRP fixups use relocations for the whole symbol value and only - // put the addend in the instruction itself. Clear out any value the - // generic code figured out from the sybmol definition. - if (Kind == ARM64::fixup_arm64_pcrel_adrp_imm21) - FixedValue = 0; - - // imm19 relocations are for conditional branches, which require - // assembler local symbols. If we got here, that's not what we have, - // so complain loudly. - if (Kind == ARM64::fixup_arm64_pcrel_branch19) { - Asm.getContext().FatalError(Fixup.getLoc(), - "conditional branch requires assembler-local" - " label. '" + - Target.getSymA()->getSymbol().getName() + - "' is external."); - return; - } - - // 14-bit branch relocations should only target internal labels, and so - // should never get here. - if (Kind == ARM64::fixup_arm64_pcrel_branch14) { - Asm.getContext().FatalError(Fixup.getLoc(), - "Invalid relocation on conditional branch!"); - return; - } - - if (!getARM64FixupKindMachOInfo(Fixup, Type, Target.getSymA(), Log2Size, - Asm)) { - Asm.getContext().FatalError(Fixup.getLoc(), "unknown ARM64 fixup kind!"); - return; - } - - Value = Target.getConstant(); - - if (Target.isAbsolute()) { // constant - // FIXME: Should this always be extern? - // SymbolNum of 0 indicates the absolute section. - Type = MachO::ARM64_RELOC_UNSIGNED; - Index = 0; - - if (IsPCRel) { - IsExtern = 1; - Asm.getContext().FatalError(Fixup.getLoc(), - "PC relative absolute relocation!"); - - // FIXME: x86_64 sets the type to a branch reloc here. Should we do - // something similar? - } - } else if (Target.getSymB()) { // A - B + constant - const MCSymbol *A = &Target.getSymA()->getSymbol(); - const MCSymbolData &A_SD = Asm.getSymbolData(*A); - const MCSymbolData *A_Base = Asm.getAtom(&A_SD); - - const MCSymbol *B = &Target.getSymB()->getSymbol(); - const MCSymbolData &B_SD = Asm.getSymbolData(*B); - const MCSymbolData *B_Base = Asm.getAtom(&B_SD); - - // Check for "_foo@got - .", which comes through here as: - // Ltmp0: - // ... 
_foo@got - Ltmp0 - if (Target.getSymA()->getKind() == MCSymbolRefExpr::VK_GOT && - Target.getSymB()->getKind() == MCSymbolRefExpr::VK_None && - Layout.getSymbolOffset(&B_SD) == - Layout.getFragmentOffset(Fragment) + Fixup.getOffset()) { - // SymB is the PC, so use a PC-rel pointer-to-GOT relocation. - Index = A_Base->getIndex(); - IsExtern = 1; - Type = MachO::ARM64_RELOC_POINTER_TO_GOT; - IsPCRel = 1; - MachO::any_relocation_info MRE; - MRE.r_word0 = FixupOffset; - MRE.r_word1 = ((Index << 0) | (IsPCRel << 24) | (Log2Size << 25) | - (IsExtern << 27) | (Type << 28)); - Writer->addRelocation(Fragment->getParent(), MRE); - return; - } else if (Target.getSymA()->getKind() != MCSymbolRefExpr::VK_None || - Target.getSymB()->getKind() != MCSymbolRefExpr::VK_None) - // Otherwise, neither symbol can be modified. - Asm.getContext().FatalError(Fixup.getLoc(), - "unsupported relocation of modified symbol"); - - // We don't support PCrel relocations of differences. - if (IsPCRel) - Asm.getContext().FatalError(Fixup.getLoc(), - "unsupported pc-relative relocation of " - "difference"); - - // ARM64 always uses external relocations. If there is no symbol to use as - // a base address (a local symbol with no preceding non-local symbol), - // error out. - // - // FIXME: We should probably just synthesize an external symbol and use - // that. - if (!A_Base) - Asm.getContext().FatalError( - Fixup.getLoc(), - "unsupported relocation of local symbol '" + A->getName() + - "'. Must have non-local symbol earlier in section."); - if (!B_Base) - Asm.getContext().FatalError( - Fixup.getLoc(), - "unsupported relocation of local symbol '" + B->getName() + - "'. Must have non-local symbol earlier in section."); - - if (A_Base == B_Base && A_Base) - Asm.getContext().FatalError(Fixup.getLoc(), - "unsupported relocation with identical base"); - - Value += (!A_SD.getFragment() ? 0 - : Writer->getSymbolAddress(&A_SD, Layout)) - - (!A_Base || !A_Base->getFragment() - ? 0 - : Writer->getSymbolAddress(A_Base, Layout)); - Value -= (!B_SD.getFragment() ? 0 - : Writer->getSymbolAddress(&B_SD, Layout)) - - (!B_Base || !B_Base->getFragment() - ? 0 - : Writer->getSymbolAddress(B_Base, Layout)); - - Index = A_Base->getIndex(); - IsExtern = 1; - Type = MachO::ARM64_RELOC_UNSIGNED; - - MachO::any_relocation_info MRE; - MRE.r_word0 = FixupOffset; - MRE.r_word1 = ((Index << 0) | (IsPCRel << 24) | (Log2Size << 25) | - (IsExtern << 27) | (Type << 28)); - Writer->addRelocation(Fragment->getParent(), MRE); - - Index = B_Base->getIndex(); - IsExtern = 1; - Type = MachO::ARM64_RELOC_SUBTRACTOR; - } else { // A + constant - const MCSymbol *Symbol = &Target.getSymA()->getSymbol(); - const MCSymbolData &SD = Asm.getSymbolData(*Symbol); - const MCSymbolData *Base = Asm.getAtom(&SD); - const MCSectionMachO &Section = static_cast( - Fragment->getParent()->getSection()); - - // If the symbol is a variable and we weren't able to get a Base for it - // (i.e., it's not in the symbol table associated with a section) resolve - // the relocation based its expansion instead. - if (Symbol->isVariable() && !Base) { - // If the evaluation is an absolute value, just use that directly - // to keep things easy. - int64_t Res; - if (SD.getSymbol().getVariableValue()->EvaluateAsAbsolute( - Res, Layout, Writer->getSectionAddressMap())) { - FixedValue = Res; - return; - } - - // FIXME: Will the Target we already have ever have any data in it - // we need to preserve and merge with the new Target? How about - // the FixedValue? 
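[Editor's sketch] Every record emitted by RecordRelocation above packs the same five fields into r_word1 of MachO::any_relocation_info: a 24-bit symbol or section index, the pc-relative bit at 24, a 2-bit log2 size at 25, the external bit at 27 and a 4-bit type at 28. A standalone helper mirroring those shifts (the function name is illustrative):

    #include <cstdint>

    // r_word1 layout used throughout RecordRelocation above:
    // bits 0-23 index, bit 24 IsPCRel, bits 25-26 Log2Size,
    // bit 27 IsExtern, bits 28-31 relocation type.
    static uint32_t packRWord1(uint32_t Index, bool IsPCRel, uint32_t Log2Size,
                               bool IsExtern, uint32_t Type) {
      return (Index << 0) | (uint32_t(IsPCRel) << 24) | (Log2Size << 25) |
             (uint32_t(IsExtern) << 27) | (Type << 28);
    }

    int main() {
      // Arbitrary example: index 5, pc-relative, 4-byte field, external, type 2.
      return packRWord1(5, true, 2, true, 2) == 0x2d000005u ? 0 : 1;
    }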
- if (!Symbol->getVariableValue()->EvaluateAsRelocatable(Target, &Layout)) - Asm.getContext().FatalError(Fixup.getLoc(), - "unable to resolve variable '" + - Symbol->getName() + "'"); - return RecordRelocation(Writer, Asm, Layout, Fragment, Fixup, Target, - FixedValue); - } - - // Relocations inside debug sections always use local relocations when - // possible. This seems to be done because the debugger doesn't fully - // understand relocation entries and expects to find values that - // have already been fixed up. - if (Symbol->isInSection()) { - if (Section.hasAttribute(MachO::S_ATTR_DEBUG)) - Base = nullptr; - } - - // ARM64 uses external relocations as much as possible. For debug sections, - // and for pointer-sized relocations (.quad), we allow section relocations. - // It's code sections that run into trouble. - if (Base) { - Index = Base->getIndex(); - IsExtern = 1; - - // Add the local offset, if needed. - if (Base != &SD) - Value += Layout.getSymbolOffset(&SD) - Layout.getSymbolOffset(Base); - } else if (Symbol->isInSection()) { - // Pointer-sized relocations can use a local relocation. Otherwise, - // we have to be in a debug info section. - if (!Section.hasAttribute(MachO::S_ATTR_DEBUG) && Log2Size != 3) - Asm.getContext().FatalError( - Fixup.getLoc(), - "unsupported relocation of local symbol '" + Symbol->getName() + - "'. Must have non-local symbol earlier in section."); - // Adjust the relocation to be section-relative. - // The index is the section ordinal (1-based). - const MCSectionData &SymSD = - Asm.getSectionData(SD.getSymbol().getSection()); - Index = SymSD.getOrdinal() + 1; - IsExtern = 0; - Value += Writer->getSymbolAddress(&SD, Layout); - - if (IsPCRel) - Value -= Writer->getFragmentAddress(Fragment, Layout) + - Fixup.getOffset() + (1ULL << Log2Size); - } else { - // Resolve constant variables. - if (SD.getSymbol().isVariable()) { - int64_t Res; - if (SD.getSymbol().getVariableValue()->EvaluateAsAbsolute( - Res, Layout, Writer->getSectionAddressMap())) { - FixedValue = Res; - return; - } - } - Asm.getContext().FatalError(Fixup.getLoc(), - "unsupported relocation of variable '" + - Symbol->getName() + "'"); - } - } - - // If the relocation kind is Branch26, Page21, or Pageoff12, any addend - // is represented via an Addend relocation, not encoded directly into - // the instruction. - if ((Type == MachO::ARM64_RELOC_BRANCH26 || - Type == MachO::ARM64_RELOC_PAGE21 || - Type == MachO::ARM64_RELOC_PAGEOFF12) && - Value) { - assert((Value & 0xff000000) == 0 && "Added relocation out of range!"); - - MachO::any_relocation_info MRE; - MRE.r_word0 = FixupOffset; - MRE.r_word1 = ((Index << 0) | (IsPCRel << 24) | (Log2Size << 25) | - (IsExtern << 27) | (Type << 28)); - Writer->addRelocation(Fragment->getParent(), MRE); - - // Now set up the Addend relocation. - Type = MachO::ARM64_RELOC_ADDEND; - Index = Value; - IsPCRel = 0; - Log2Size = 2; - IsExtern = 0; - - // Put zero into the instruction itself. The addend is in the relocation. - Value = 0; - } - - // If there's any addend left to handle, encode it in the instruction. 
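[Editor's sketch] For BRANCH26, PAGE21 and PAGEOFF12 relocations the addend cannot be encoded in the instruction, so the writer above pairs the primary record with an ARM64_RELOC_ADDEND record that carries the value in its index field (not pc-relative, not external, Log2Size 2) and leaves zero in the instruction itself. A standalone sketch of that split; the struct and function names are illustrative:

    #include <cassert>
    #include <cstdint>

    // What ends up in the companion ARM64_RELOC_ADDEND record.
    struct AddendInfo {
      uint32_t Index;    // the addend itself rides in the 24-bit index field
      unsigned Log2Size; // 2: a 32-bit instruction word
      bool IsPCRel, IsExtern;
    };

    static AddendInfo splitOutAddend(int64_t &Value) {
      assert((Value & 0xff000000) == 0 && "addend relocation out of range");
      AddendInfo A = { uint32_t(Value), 2, false, false };
      Value = 0;         // nothing is left to encode in the instruction
      return A;
    }

    int main() {
      int64_t V = 0x123;
      AddendInfo A = splitOutAddend(V);
      return (A.Index == 0x123 && V == 0) ? 0 : 1;
    }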
- FixedValue = Value; - - // struct relocation_info (8 bytes) - MachO::any_relocation_info MRE; - MRE.r_word0 = FixupOffset; - MRE.r_word1 = ((Index << 0) | (IsPCRel << 24) | (Log2Size << 25) | - (IsExtern << 27) | (Type << 28)); - Writer->addRelocation(Fragment->getParent(), MRE); -} - -MCObjectWriter *llvm::createARM64MachObjectWriter(raw_ostream &OS, - uint32_t CPUType, - uint32_t CPUSubtype) { - return createMachObjectWriter(new ARM64MachObjectWriter(CPUType, CPUSubtype), - OS, /*IsLittleEndian=*/true); -} diff --git a/lib/Target/ARM64/MCTargetDesc/CMakeLists.txt b/lib/Target/ARM64/MCTargetDesc/CMakeLists.txt deleted file mode 100644 index f8665bcfe94..00000000000 --- a/lib/Target/ARM64/MCTargetDesc/CMakeLists.txt +++ /dev/null @@ -1,14 +0,0 @@ -add_llvm_library(LLVMARM64Desc - ARM64AsmBackend.cpp - ARM64ELFObjectWriter.cpp - ARM64ELFStreamer.cpp - ARM64MCAsmInfo.cpp - ARM64MCCodeEmitter.cpp - ARM64MCExpr.cpp - ARM64MCTargetDesc.cpp - ARM64MachObjectWriter.cpp -) -add_dependencies(LLVMARM64Desc ARM64CommonTableGen) - -# Hack: we need to include 'main' target directory to grab private headers -include_directories(${CMAKE_CURRENT_SOURCE_DIR}/.. ${CMAKE_CURRENT_BINARY_DIR}/..) diff --git a/lib/Target/ARM64/MCTargetDesc/LLVMBuild.txt b/lib/Target/ARM64/MCTargetDesc/LLVMBuild.txt deleted file mode 100644 index e4c74d285d4..00000000000 --- a/lib/Target/ARM64/MCTargetDesc/LLVMBuild.txt +++ /dev/null @@ -1,24 +0,0 @@ -;===- ./lib/Target/ARM64/MCTargetDesc/LLVMBuild.txt ------------*- Conf -*--===; -; -; The LLVM Compiler Infrastructure -; -; This file is distributed under the University of Illinois Open Source -; License. See LICENSE.TXT for details. -; -;===------------------------------------------------------------------------===; -; -; This is an LLVMBuild description file for the components in this subdirectory. -; -; For more information on the LLVMBuild system, please see: -; -; http://llvm.org/docs/LLVMBuild.html -; -;===------------------------------------------------------------------------===; - -[component_0] -type = Library -name = ARM64Desc -parent = ARM64 -required_libraries = ARM64AsmPrinter ARM64Info MC Support -add_to_library_groups = ARM64 - diff --git a/lib/Target/ARM64/MCTargetDesc/Makefile b/lib/Target/ARM64/MCTargetDesc/Makefile deleted file mode 100644 index 013cc633f66..00000000000 --- a/lib/Target/ARM64/MCTargetDesc/Makefile +++ /dev/null @@ -1,16 +0,0 @@ -##===- lib/Target/ARM64/TargetDesc/Makefile ----------------*- Makefile -*-===## -# -# The LLVM Compiler Infrastructure -# -# This file is distributed under the University of Illinois Open Source -# License. See LICENSE.TXT for details. -# -##===----------------------------------------------------------------------===## - -LEVEL = ../../../.. -LIBRARYNAME = LLVMARM64Desc - -# Hack: we need to include 'main' target directory to grab private headers -CPP.Flags += -I$(PROJ_OBJ_DIR)/.. -I$(PROJ_SRC_DIR)/.. - -include $(LEVEL)/Makefile.common diff --git a/lib/Target/ARM64/Makefile b/lib/Target/ARM64/Makefile deleted file mode 100644 index cfb05d2a87b..00000000000 --- a/lib/Target/ARM64/Makefile +++ /dev/null @@ -1,25 +0,0 @@ -##===- lib/Target/ARM64/Makefile ---------------------------*- Makefile -*-===## -# -# The LLVM Compiler Infrastructure -# -# This file is distributed under the University of Illinois Open Source -# License. See LICENSE.TXT for details. -# -##===----------------------------------------------------------------------===## - -LEVEL = ../../.. 
-LIBRARYNAME = LLVMARM64CodeGen -TARGET = ARM64 - -# Make sure that tblgen is run, first thing. -BUILT_SOURCES = ARM64GenRegisterInfo.inc ARM64GenInstrInfo.inc \ - ARM64GenAsmWriter.inc ARM64GenAsmWriter1.inc \ - ARM64GenDAGISel.inc \ - ARM64GenCallingConv.inc ARM64GenAsmMatcher.inc \ - ARM64GenSubtargetInfo.inc ARM64GenMCCodeEmitter.inc \ - ARM64GenFastISel.inc ARM64GenDisassemblerTables.inc \ - ARM64GenMCPseudoLowering.inc - -DIRS = TargetInfo InstPrinter AsmParser Disassembler MCTargetDesc Utils - -include $(LEVEL)/Makefile.common diff --git a/lib/Target/ARM64/TargetInfo/ARM64TargetInfo.cpp b/lib/Target/ARM64/TargetInfo/ARM64TargetInfo.cpp deleted file mode 100644 index 247566825ab..00000000000 --- a/lib/Target/ARM64/TargetInfo/ARM64TargetInfo.cpp +++ /dev/null @@ -1,31 +0,0 @@ -//===-- ARM64TargetInfo.cpp - ARM64 Target Implementation -----------------===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// - -#include "llvm/ADT/Triple.h" -#include "llvm/Support/TargetRegistry.h" -using namespace llvm; - -namespace llvm { -Target TheARM64leTarget; -Target TheARM64beTarget; -Target TheAArch64leTarget; -Target TheAArch64beTarget; -} // end namespace llvm - -extern "C" void LLVMInitializeARM64TargetInfo() { - RegisterTarget X(TheARM64leTarget, "arm64", - "ARM64 (little endian)"); - RegisterTarget Y(TheARM64beTarget, "arm64_be", - "ARM64 (big endian)"); - - RegisterTarget Z( - TheAArch64leTarget, "aarch64", "ARM64 (little endian)"); - RegisterTarget W( - TheAArch64beTarget, "aarch64_be", "ARM64 (big endian)"); -} diff --git a/lib/Target/ARM64/TargetInfo/CMakeLists.txt b/lib/Target/ARM64/TargetInfo/CMakeLists.txt deleted file mode 100644 index a0142c40713..00000000000 --- a/lib/Target/ARM64/TargetInfo/CMakeLists.txt +++ /dev/null @@ -1,7 +0,0 @@ -include_directories( ${CMAKE_CURRENT_BINARY_DIR}/.. ${CMAKE_CURRENT_SOURCE_DIR}/.. ) - -add_llvm_library(LLVMARM64Info - ARM64TargetInfo.cpp - ) - -add_dependencies(LLVMARM64Info ARM64CommonTableGen) diff --git a/lib/Target/ARM64/TargetInfo/LLVMBuild.txt b/lib/Target/ARM64/TargetInfo/LLVMBuild.txt deleted file mode 100644 index b9ecb706952..00000000000 --- a/lib/Target/ARM64/TargetInfo/LLVMBuild.txt +++ /dev/null @@ -1,23 +0,0 @@ -;===- ./lib/Target/ARM64/TargetInfo/LLVMBuild.txt --------------*- Conf -*--===; -; -; The LLVM Compiler Infrastructure -; -; This file is distributed under the University of Illinois Open Source -; License. See LICENSE.TXT for details. -; -;===------------------------------------------------------------------------===; -; -; This is an LLVMBuild description file for the components in this subdirectory. 
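[Editor's sketch] LLVMInitializeARM64TargetInfo above registers a separate Target object for each accepted spelling and endianness: "arm64", "arm64_be", "aarch64" and "aarch64_be". A small sketch of how the two little-endian spellings look from the triple side, assuming an LLVM tree of this vintage in which arm64 and aarch64 are still distinct Triple::ArchType values:

    #include "llvm/ADT/Triple.h"
    #include <cassert>

    int main() {
      // Two spellings of the same 64-bit architecture; each has its own
      // ArchType and, via the registrations above, its own Target object.
      llvm::Triple IOSTriple("arm64-apple-ios");
      llvm::Triple LinuxTriple("aarch64-none-linux-gnu");
      assert(IOSTriple.getArch() == llvm::Triple::arm64);
      assert(LinuxTriple.getArch() == llvm::Triple::aarch64);
      return 0;
    }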
-; -; For more information on the LLVMBuild system, please see: -; -; http://llvm.org/docs/LLVMBuild.html -; -;===------------------------------------------------------------------------===; - -[component_0] -type = Library -name = ARM64Info -parent = ARM64 -required_libraries = Support -add_to_library_groups = ARM64 diff --git a/lib/Target/ARM64/TargetInfo/Makefile b/lib/Target/ARM64/TargetInfo/Makefile deleted file mode 100644 index 2d5a1a087a5..00000000000 --- a/lib/Target/ARM64/TargetInfo/Makefile +++ /dev/null @@ -1,15 +0,0 @@ -##===- lib/Target/ARM64/TargetInfo/Makefile ----------------*- Makefile -*-===## -# -# The LLVM Compiler Infrastructure -# -# This file is distributed under the University of Illinois Open Source -# License. See LICENSE.TXT for details. -# -##===----------------------------------------------------------------------===## -LEVEL = ../../../.. -LIBRARYNAME = LLVMARM64Info - -# Hack: we need to include 'main' target directory to grab private headers -CPPFLAGS = -I$(PROJ_OBJ_DIR)/.. -I$(PROJ_SRC_DIR)/.. - -include $(LEVEL)/Makefile.common diff --git a/lib/Target/ARM64/Utils/ARM64BaseInfo.cpp b/lib/Target/ARM64/Utils/ARM64BaseInfo.cpp deleted file mode 100644 index 5142d18c23c..00000000000 --- a/lib/Target/ARM64/Utils/ARM64BaseInfo.cpp +++ /dev/null @@ -1,901 +0,0 @@ -//===-- ARM64BaseInfo.cpp - ARM64 Base encoding information------------===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -// This file provides basic encoding and assembly information for ARM64. -// -//===----------------------------------------------------------------------===// -#include "ARM64BaseInfo.h" -#include "llvm/ADT/APFloat.h" -#include "llvm/ADT/SmallVector.h" -#include "llvm/ADT/StringExtras.h" -#include "llvm/Support/Regex.h" - -using namespace llvm; - -StringRef ARM64NamedImmMapper::toString(uint32_t Value, bool &Valid) const { - for (unsigned i = 0; i < NumPairs; ++i) { - if (Pairs[i].Value == Value) { - Valid = true; - return Pairs[i].Name; - } - } - - Valid = false; - return StringRef(); -} - -uint32_t ARM64NamedImmMapper::fromString(StringRef Name, bool &Valid) const { - std::string LowerCaseName = Name.lower(); - for (unsigned i = 0; i < NumPairs; ++i) { - if (Pairs[i].Name == LowerCaseName) { - Valid = true; - return Pairs[i].Value; - } - } - - Valid = false; - return -1; -} - -bool ARM64NamedImmMapper::validImm(uint32_t Value) const { - return Value < TooBigImm; -} - -const ARM64NamedImmMapper::Mapping ARM64AT::ATMapper::ATPairs[] = { - {"s1e1r", S1E1R}, - {"s1e2r", S1E2R}, - {"s1e3r", S1E3R}, - {"s1e1w", S1E1W}, - {"s1e2w", S1E2W}, - {"s1e3w", S1E3W}, - {"s1e0r", S1E0R}, - {"s1e0w", S1E0W}, - {"s12e1r", S12E1R}, - {"s12e1w", S12E1W}, - {"s12e0r", S12E0R}, - {"s12e0w", S12E0W}, -}; - -ARM64AT::ATMapper::ATMapper() - : ARM64NamedImmMapper(ATPairs, 0) {} - -const ARM64NamedImmMapper::Mapping ARM64DB::DBarrierMapper::DBarrierPairs[] = { - {"oshld", OSHLD}, - {"oshst", OSHST}, - {"osh", OSH}, - {"nshld", NSHLD}, - {"nshst", NSHST}, - {"nsh", NSH}, - {"ishld", ISHLD}, - {"ishst", ISHST}, - {"ish", ISH}, - {"ld", LD}, - {"st", ST}, - {"sy", SY} -}; - -ARM64DB::DBarrierMapper::DBarrierMapper() - : ARM64NamedImmMapper(DBarrierPairs, 16u) {} - -const ARM64NamedImmMapper::Mapping ARM64DC::DCMapper::DCPairs[] = { - {"zva", ZVA}, - {"ivac", IVAC}, - {"isw", ISW}, - {"cvac", CVAC}, - {"csw", 
CSW}, - {"cvau", CVAU}, - {"civac", CIVAC}, - {"cisw", CISW} -}; - -ARM64DC::DCMapper::DCMapper() - : ARM64NamedImmMapper(DCPairs, 0) {} - -const ARM64NamedImmMapper::Mapping ARM64IC::ICMapper::ICPairs[] = { - {"ialluis", IALLUIS}, - {"iallu", IALLU}, - {"ivau", IVAU} -}; - -ARM64IC::ICMapper::ICMapper() - : ARM64NamedImmMapper(ICPairs, 0) {} - -const ARM64NamedImmMapper::Mapping ARM64ISB::ISBMapper::ISBPairs[] = { - {"sy", SY}, -}; - -ARM64ISB::ISBMapper::ISBMapper() - : ARM64NamedImmMapper(ISBPairs, 16) {} - -const ARM64NamedImmMapper::Mapping ARM64PRFM::PRFMMapper::PRFMPairs[] = { - {"pldl1keep", PLDL1KEEP}, - {"pldl1strm", PLDL1STRM}, - {"pldl2keep", PLDL2KEEP}, - {"pldl2strm", PLDL2STRM}, - {"pldl3keep", PLDL3KEEP}, - {"pldl3strm", PLDL3STRM}, - {"plil1keep", PLIL1KEEP}, - {"plil1strm", PLIL1STRM}, - {"plil2keep", PLIL2KEEP}, - {"plil2strm", PLIL2STRM}, - {"plil3keep", PLIL3KEEP}, - {"plil3strm", PLIL3STRM}, - {"pstl1keep", PSTL1KEEP}, - {"pstl1strm", PSTL1STRM}, - {"pstl2keep", PSTL2KEEP}, - {"pstl2strm", PSTL2STRM}, - {"pstl3keep", PSTL3KEEP}, - {"pstl3strm", PSTL3STRM} -}; - -ARM64PRFM::PRFMMapper::PRFMMapper() - : ARM64NamedImmMapper(PRFMPairs, 32) {} - -const ARM64NamedImmMapper::Mapping ARM64PState::PStateMapper::PStatePairs[] = { - {"spsel", SPSel}, - {"daifset", DAIFSet}, - {"daifclr", DAIFClr} -}; - -ARM64PState::PStateMapper::PStateMapper() - : ARM64NamedImmMapper(PStatePairs, 0) {} - -const ARM64NamedImmMapper::Mapping ARM64SysReg::MRSMapper::MRSPairs[] = { - {"mdccsr_el0", MDCCSR_EL0}, - {"dbgdtrrx_el0", DBGDTRRX_EL0}, - {"mdrar_el1", MDRAR_EL1}, - {"oslsr_el1", OSLSR_EL1}, - {"dbgauthstatus_el1", DBGAUTHSTATUS_EL1}, - {"pmceid0_el0", PMCEID0_EL0}, - {"pmceid1_el0", PMCEID1_EL0}, - {"midr_el1", MIDR_EL1}, - {"ccsidr_el1", CCSIDR_EL1}, - {"clidr_el1", CLIDR_EL1}, - {"ctr_el0", CTR_EL0}, - {"mpidr_el1", MPIDR_EL1}, - {"revidr_el1", REVIDR_EL1}, - {"aidr_el1", AIDR_EL1}, - {"dczid_el0", DCZID_EL0}, - {"id_pfr0_el1", ID_PFR0_EL1}, - {"id_pfr1_el1", ID_PFR1_EL1}, - {"id_dfr0_el1", ID_DFR0_EL1}, - {"id_afr0_el1", ID_AFR0_EL1}, - {"id_mmfr0_el1", ID_MMFR0_EL1}, - {"id_mmfr1_el1", ID_MMFR1_EL1}, - {"id_mmfr2_el1", ID_MMFR2_EL1}, - {"id_mmfr3_el1", ID_MMFR3_EL1}, - {"id_isar0_el1", ID_ISAR0_EL1}, - {"id_isar1_el1", ID_ISAR1_EL1}, - {"id_isar2_el1", ID_ISAR2_EL1}, - {"id_isar3_el1", ID_ISAR3_EL1}, - {"id_isar4_el1", ID_ISAR4_EL1}, - {"id_isar5_el1", ID_ISAR5_EL1}, - {"id_aa64pfr0_el1", ID_AARM64PFR0_EL1}, - {"id_aa64pfr1_el1", ID_AARM64PFR1_EL1}, - {"id_aa64dfr0_el1", ID_AARM64DFR0_EL1}, - {"id_aa64dfr1_el1", ID_AARM64DFR1_EL1}, - {"id_aa64afr0_el1", ID_AARM64AFR0_EL1}, - {"id_aa64afr1_el1", ID_AARM64AFR1_EL1}, - {"id_aa64isar0_el1", ID_AARM64ISAR0_EL1}, - {"id_aa64isar1_el1", ID_AARM64ISAR1_EL1}, - {"id_aa64mmfr0_el1", ID_AARM64MMFR0_EL1}, - {"id_aa64mmfr1_el1", ID_AARM64MMFR1_EL1}, - {"mvfr0_el1", MVFR0_EL1}, - {"mvfr1_el1", MVFR1_EL1}, - {"mvfr2_el1", MVFR2_EL1}, - {"rvbar_el1", RVBAR_EL1}, - {"rvbar_el2", RVBAR_EL2}, - {"rvbar_el3", RVBAR_EL3}, - {"isr_el1", ISR_EL1}, - {"cntpct_el0", CNTPCT_EL0}, - {"cntvct_el0", CNTVCT_EL0}, - - // Trace registers - {"trcstatr", TRCSTATR}, - {"trcidr8", TRCIDR8}, - {"trcidr9", TRCIDR9}, - {"trcidr10", TRCIDR10}, - {"trcidr11", TRCIDR11}, - {"trcidr12", TRCIDR12}, - {"trcidr13", TRCIDR13}, - {"trcidr0", TRCIDR0}, - {"trcidr1", TRCIDR1}, - {"trcidr2", TRCIDR2}, - {"trcidr3", TRCIDR3}, - {"trcidr4", TRCIDR4}, - {"trcidr5", TRCIDR5}, - {"trcidr6", TRCIDR6}, - {"trcidr7", TRCIDR7}, - {"trcoslsr", TRCOSLSR}, - {"trcpdsr", TRCPDSR}, - {"trcdevaff0", 
TRCDEVAFF0}, - {"trcdevaff1", TRCDEVAFF1}, - {"trclsr", TRCLSR}, - {"trcauthstatus", TRCAUTHSTATUS}, - {"trcdevarch", TRCDEVARCH}, - {"trcdevid", TRCDEVID}, - {"trcdevtype", TRCDEVTYPE}, - {"trcpidr4", TRCPIDR4}, - {"trcpidr5", TRCPIDR5}, - {"trcpidr6", TRCPIDR6}, - {"trcpidr7", TRCPIDR7}, - {"trcpidr0", TRCPIDR0}, - {"trcpidr1", TRCPIDR1}, - {"trcpidr2", TRCPIDR2}, - {"trcpidr3", TRCPIDR3}, - {"trccidr0", TRCCIDR0}, - {"trccidr1", TRCCIDR1}, - {"trccidr2", TRCCIDR2}, - {"trccidr3", TRCCIDR3}, - - // GICv3 registers - {"icc_iar1_el1", ICC_IAR1_EL1}, - {"icc_iar0_el1", ICC_IAR0_EL1}, - {"icc_hppir1_el1", ICC_HPPIR1_EL1}, - {"icc_hppir0_el1", ICC_HPPIR0_EL1}, - {"icc_rpr_el1", ICC_RPR_EL1}, - {"ich_vtr_el2", ICH_VTR_EL2}, - {"ich_eisr_el2", ICH_EISR_EL2}, - {"ich_elsr_el2", ICH_ELSR_EL2} -}; - -ARM64SysReg::MRSMapper::MRSMapper(uint64_t FeatureBits) - : SysRegMapper(FeatureBits) { - InstPairs = &MRSPairs[0]; - NumInstPairs = llvm::array_lengthof(MRSPairs); -} - -const ARM64NamedImmMapper::Mapping ARM64SysReg::MSRMapper::MSRPairs[] = { - {"dbgdtrtx_el0", DBGDTRTX_EL0}, - {"oslar_el1", OSLAR_EL1}, - {"pmswinc_el0", PMSWINC_EL0}, - - // Trace registers - {"trcoslar", TRCOSLAR}, - {"trclar", TRCLAR}, - - // GICv3 registers - {"icc_eoir1_el1", ICC_EOIR1_EL1}, - {"icc_eoir0_el1", ICC_EOIR0_EL1}, - {"icc_dir_el1", ICC_DIR_EL1}, - {"icc_sgi1r_el1", ICC_SGI1R_EL1}, - {"icc_asgi1r_el1", ICC_ASGI1R_EL1}, - {"icc_sgi0r_el1", ICC_SGI0R_EL1} -}; - -ARM64SysReg::MSRMapper::MSRMapper(uint64_t FeatureBits) - : SysRegMapper(FeatureBits) { - InstPairs = &MSRPairs[0]; - NumInstPairs = llvm::array_lengthof(MSRPairs); -} - - -const ARM64NamedImmMapper::Mapping ARM64SysReg::SysRegMapper::SysRegPairs[] = { - {"osdtrrx_el1", OSDTRRX_EL1}, - {"osdtrtx_el1", OSDTRTX_EL1}, - {"teecr32_el1", TEECR32_EL1}, - {"mdccint_el1", MDCCINT_EL1}, - {"mdscr_el1", MDSCR_EL1}, - {"dbgdtr_el0", DBGDTR_EL0}, - {"oseccr_el1", OSECCR_EL1}, - {"dbgvcr32_el2", DBGVCR32_EL2}, - {"dbgbvr0_el1", DBGBVR0_EL1}, - {"dbgbvr1_el1", DBGBVR1_EL1}, - {"dbgbvr2_el1", DBGBVR2_EL1}, - {"dbgbvr3_el1", DBGBVR3_EL1}, - {"dbgbvr4_el1", DBGBVR4_EL1}, - {"dbgbvr5_el1", DBGBVR5_EL1}, - {"dbgbvr6_el1", DBGBVR6_EL1}, - {"dbgbvr7_el1", DBGBVR7_EL1}, - {"dbgbvr8_el1", DBGBVR8_EL1}, - {"dbgbvr9_el1", DBGBVR9_EL1}, - {"dbgbvr10_el1", DBGBVR10_EL1}, - {"dbgbvr11_el1", DBGBVR11_EL1}, - {"dbgbvr12_el1", DBGBVR12_EL1}, - {"dbgbvr13_el1", DBGBVR13_EL1}, - {"dbgbvr14_el1", DBGBVR14_EL1}, - {"dbgbvr15_el1", DBGBVR15_EL1}, - {"dbgbcr0_el1", DBGBCR0_EL1}, - {"dbgbcr1_el1", DBGBCR1_EL1}, - {"dbgbcr2_el1", DBGBCR2_EL1}, - {"dbgbcr3_el1", DBGBCR3_EL1}, - {"dbgbcr4_el1", DBGBCR4_EL1}, - {"dbgbcr5_el1", DBGBCR5_EL1}, - {"dbgbcr6_el1", DBGBCR6_EL1}, - {"dbgbcr7_el1", DBGBCR7_EL1}, - {"dbgbcr8_el1", DBGBCR8_EL1}, - {"dbgbcr9_el1", DBGBCR9_EL1}, - {"dbgbcr10_el1", DBGBCR10_EL1}, - {"dbgbcr11_el1", DBGBCR11_EL1}, - {"dbgbcr12_el1", DBGBCR12_EL1}, - {"dbgbcr13_el1", DBGBCR13_EL1}, - {"dbgbcr14_el1", DBGBCR14_EL1}, - {"dbgbcr15_el1", DBGBCR15_EL1}, - {"dbgwvr0_el1", DBGWVR0_EL1}, - {"dbgwvr1_el1", DBGWVR1_EL1}, - {"dbgwvr2_el1", DBGWVR2_EL1}, - {"dbgwvr3_el1", DBGWVR3_EL1}, - {"dbgwvr4_el1", DBGWVR4_EL1}, - {"dbgwvr5_el1", DBGWVR5_EL1}, - {"dbgwvr6_el1", DBGWVR6_EL1}, - {"dbgwvr7_el1", DBGWVR7_EL1}, - {"dbgwvr8_el1", DBGWVR8_EL1}, - {"dbgwvr9_el1", DBGWVR9_EL1}, - {"dbgwvr10_el1", DBGWVR10_EL1}, - {"dbgwvr11_el1", DBGWVR11_EL1}, - {"dbgwvr12_el1", DBGWVR12_EL1}, - {"dbgwvr13_el1", DBGWVR13_EL1}, - {"dbgwvr14_el1", DBGWVR14_EL1}, - {"dbgwvr15_el1", DBGWVR15_EL1}, - {"dbgwcr0_el1", 
DBGWCR0_EL1}, - {"dbgwcr1_el1", DBGWCR1_EL1}, - {"dbgwcr2_el1", DBGWCR2_EL1}, - {"dbgwcr3_el1", DBGWCR3_EL1}, - {"dbgwcr4_el1", DBGWCR4_EL1}, - {"dbgwcr5_el1", DBGWCR5_EL1}, - {"dbgwcr6_el1", DBGWCR6_EL1}, - {"dbgwcr7_el1", DBGWCR7_EL1}, - {"dbgwcr8_el1", DBGWCR8_EL1}, - {"dbgwcr9_el1", DBGWCR9_EL1}, - {"dbgwcr10_el1", DBGWCR10_EL1}, - {"dbgwcr11_el1", DBGWCR11_EL1}, - {"dbgwcr12_el1", DBGWCR12_EL1}, - {"dbgwcr13_el1", DBGWCR13_EL1}, - {"dbgwcr14_el1", DBGWCR14_EL1}, - {"dbgwcr15_el1", DBGWCR15_EL1}, - {"teehbr32_el1", TEEHBR32_EL1}, - {"osdlr_el1", OSDLR_EL1}, - {"dbgprcr_el1", DBGPRCR_EL1}, - {"dbgclaimset_el1", DBGCLAIMSET_EL1}, - {"dbgclaimclr_el1", DBGCLAIMCLR_EL1}, - {"csselr_el1", CSSELR_EL1}, - {"vpidr_el2", VPIDR_EL2}, - {"vmpidr_el2", VMPIDR_EL2}, - {"sctlr_el1", SCTLR_EL1}, - {"sctlr_el2", SCTLR_EL2}, - {"sctlr_el3", SCTLR_EL3}, - {"actlr_el1", ACTLR_EL1}, - {"actlr_el2", ACTLR_EL2}, - {"actlr_el3", ACTLR_EL3}, - {"cpacr_el1", CPACR_EL1}, - {"hcr_el2", HCR_EL2}, - {"scr_el3", SCR_EL3}, - {"mdcr_el2", MDCR_EL2}, - {"sder32_el3", SDER32_EL3}, - {"cptr_el2", CPTR_EL2}, - {"cptr_el3", CPTR_EL3}, - {"hstr_el2", HSTR_EL2}, - {"hacr_el2", HACR_EL2}, - {"mdcr_el3", MDCR_EL3}, - {"ttbr0_el1", TTBR0_EL1}, - {"ttbr0_el2", TTBR0_EL2}, - {"ttbr0_el3", TTBR0_EL3}, - {"ttbr1_el1", TTBR1_EL1}, - {"tcr_el1", TCR_EL1}, - {"tcr_el2", TCR_EL2}, - {"tcr_el3", TCR_EL3}, - {"vttbr_el2", VTTBR_EL2}, - {"vtcr_el2", VTCR_EL2}, - {"dacr32_el2", DACR32_EL2}, - {"spsr_el1", SPSR_EL1}, - {"spsr_el2", SPSR_EL2}, - {"spsr_el3", SPSR_EL3}, - {"elr_el1", ELR_EL1}, - {"elr_el2", ELR_EL2}, - {"elr_el3", ELR_EL3}, - {"sp_el0", SP_EL0}, - {"sp_el1", SP_EL1}, - {"sp_el2", SP_EL2}, - {"spsel", SPSel}, - {"nzcv", NZCV}, - {"daif", DAIF}, - {"currentel", CurrentEL}, - {"spsr_irq", SPSR_irq}, - {"spsr_abt", SPSR_abt}, - {"spsr_und", SPSR_und}, - {"spsr_fiq", SPSR_fiq}, - {"fpcr", FPCR}, - {"fpsr", FPSR}, - {"dspsr_el0", DSPSR_EL0}, - {"dlr_el0", DLR_EL0}, - {"ifsr32_el2", IFSR32_EL2}, - {"afsr0_el1", AFSR0_EL1}, - {"afsr0_el2", AFSR0_EL2}, - {"afsr0_el3", AFSR0_EL3}, - {"afsr1_el1", AFSR1_EL1}, - {"afsr1_el2", AFSR1_EL2}, - {"afsr1_el3", AFSR1_EL3}, - {"esr_el1", ESR_EL1}, - {"esr_el2", ESR_EL2}, - {"esr_el3", ESR_EL3}, - {"fpexc32_el2", FPEXC32_EL2}, - {"far_el1", FAR_EL1}, - {"far_el2", FAR_EL2}, - {"far_el3", FAR_EL3}, - {"hpfar_el2", HPFAR_EL2}, - {"par_el1", PAR_EL1}, - {"pmcr_el0", PMCR_EL0}, - {"pmcntenset_el0", PMCNTENSET_EL0}, - {"pmcntenclr_el0", PMCNTENCLR_EL0}, - {"pmovsclr_el0", PMOVSCLR_EL0}, - {"pmselr_el0", PMSELR_EL0}, - {"pmccntr_el0", PMCCNTR_EL0}, - {"pmxevtyper_el0", PMXEVTYPER_EL0}, - {"pmxevcntr_el0", PMXEVCNTR_EL0}, - {"pmuserenr_el0", PMUSERENR_EL0}, - {"pmintenset_el1", PMINTENSET_EL1}, - {"pmintenclr_el1", PMINTENCLR_EL1}, - {"pmovsset_el0", PMOVSSET_EL0}, - {"mair_el1", MAIR_EL1}, - {"mair_el2", MAIR_EL2}, - {"mair_el3", MAIR_EL3}, - {"amair_el1", AMAIR_EL1}, - {"amair_el2", AMAIR_EL2}, - {"amair_el3", AMAIR_EL3}, - {"vbar_el1", VBAR_EL1}, - {"vbar_el2", VBAR_EL2}, - {"vbar_el3", VBAR_EL3}, - {"rmr_el1", RMR_EL1}, - {"rmr_el2", RMR_EL2}, - {"rmr_el3", RMR_EL3}, - {"contextidr_el1", CONTEXTIDR_EL1}, - {"tpidr_el0", TPIDR_EL0}, - {"tpidr_el2", TPIDR_EL2}, - {"tpidr_el3", TPIDR_EL3}, - {"tpidrro_el0", TPIDRRO_EL0}, - {"tpidr_el1", TPIDR_EL1}, - {"cntfrq_el0", CNTFRQ_EL0}, - {"cntvoff_el2", CNTVOFF_EL2}, - {"cntkctl_el1", CNTKCTL_EL1}, - {"cnthctl_el2", CNTHCTL_EL2}, - {"cntp_tval_el0", CNTP_TVAL_EL0}, - {"cnthp_tval_el2", CNTHP_TVAL_EL2}, - {"cntps_tval_el1", CNTPS_TVAL_EL1}, - {"cntp_ctl_el0", 
CNTP_CTL_EL0}, - {"cnthp_ctl_el2", CNTHP_CTL_EL2}, - {"cntps_ctl_el1", CNTPS_CTL_EL1}, - {"cntp_cval_el0", CNTP_CVAL_EL0}, - {"cnthp_cval_el2", CNTHP_CVAL_EL2}, - {"cntps_cval_el1", CNTPS_CVAL_EL1}, - {"cntv_tval_el0", CNTV_TVAL_EL0}, - {"cntv_ctl_el0", CNTV_CTL_EL0}, - {"cntv_cval_el0", CNTV_CVAL_EL0}, - {"pmevcntr0_el0", PMEVCNTR0_EL0}, - {"pmevcntr1_el0", PMEVCNTR1_EL0}, - {"pmevcntr2_el0", PMEVCNTR2_EL0}, - {"pmevcntr3_el0", PMEVCNTR3_EL0}, - {"pmevcntr4_el0", PMEVCNTR4_EL0}, - {"pmevcntr5_el0", PMEVCNTR5_EL0}, - {"pmevcntr6_el0", PMEVCNTR6_EL0}, - {"pmevcntr7_el0", PMEVCNTR7_EL0}, - {"pmevcntr8_el0", PMEVCNTR8_EL0}, - {"pmevcntr9_el0", PMEVCNTR9_EL0}, - {"pmevcntr10_el0", PMEVCNTR10_EL0}, - {"pmevcntr11_el0", PMEVCNTR11_EL0}, - {"pmevcntr12_el0", PMEVCNTR12_EL0}, - {"pmevcntr13_el0", PMEVCNTR13_EL0}, - {"pmevcntr14_el0", PMEVCNTR14_EL0}, - {"pmevcntr15_el0", PMEVCNTR15_EL0}, - {"pmevcntr16_el0", PMEVCNTR16_EL0}, - {"pmevcntr17_el0", PMEVCNTR17_EL0}, - {"pmevcntr18_el0", PMEVCNTR18_EL0}, - {"pmevcntr19_el0", PMEVCNTR19_EL0}, - {"pmevcntr20_el0", PMEVCNTR20_EL0}, - {"pmevcntr21_el0", PMEVCNTR21_EL0}, - {"pmevcntr22_el0", PMEVCNTR22_EL0}, - {"pmevcntr23_el0", PMEVCNTR23_EL0}, - {"pmevcntr24_el0", PMEVCNTR24_EL0}, - {"pmevcntr25_el0", PMEVCNTR25_EL0}, - {"pmevcntr26_el0", PMEVCNTR26_EL0}, - {"pmevcntr27_el0", PMEVCNTR27_EL0}, - {"pmevcntr28_el0", PMEVCNTR28_EL0}, - {"pmevcntr29_el0", PMEVCNTR29_EL0}, - {"pmevcntr30_el0", PMEVCNTR30_EL0}, - {"pmccfiltr_el0", PMCCFILTR_EL0}, - {"pmevtyper0_el0", PMEVTYPER0_EL0}, - {"pmevtyper1_el0", PMEVTYPER1_EL0}, - {"pmevtyper2_el0", PMEVTYPER2_EL0}, - {"pmevtyper3_el0", PMEVTYPER3_EL0}, - {"pmevtyper4_el0", PMEVTYPER4_EL0}, - {"pmevtyper5_el0", PMEVTYPER5_EL0}, - {"pmevtyper6_el0", PMEVTYPER6_EL0}, - {"pmevtyper7_el0", PMEVTYPER7_EL0}, - {"pmevtyper8_el0", PMEVTYPER8_EL0}, - {"pmevtyper9_el0", PMEVTYPER9_EL0}, - {"pmevtyper10_el0", PMEVTYPER10_EL0}, - {"pmevtyper11_el0", PMEVTYPER11_EL0}, - {"pmevtyper12_el0", PMEVTYPER12_EL0}, - {"pmevtyper13_el0", PMEVTYPER13_EL0}, - {"pmevtyper14_el0", PMEVTYPER14_EL0}, - {"pmevtyper15_el0", PMEVTYPER15_EL0}, - {"pmevtyper16_el0", PMEVTYPER16_EL0}, - {"pmevtyper17_el0", PMEVTYPER17_EL0}, - {"pmevtyper18_el0", PMEVTYPER18_EL0}, - {"pmevtyper19_el0", PMEVTYPER19_EL0}, - {"pmevtyper20_el0", PMEVTYPER20_EL0}, - {"pmevtyper21_el0", PMEVTYPER21_EL0}, - {"pmevtyper22_el0", PMEVTYPER22_EL0}, - {"pmevtyper23_el0", PMEVTYPER23_EL0}, - {"pmevtyper24_el0", PMEVTYPER24_EL0}, - {"pmevtyper25_el0", PMEVTYPER25_EL0}, - {"pmevtyper26_el0", PMEVTYPER26_EL0}, - {"pmevtyper27_el0", PMEVTYPER27_EL0}, - {"pmevtyper28_el0", PMEVTYPER28_EL0}, - {"pmevtyper29_el0", PMEVTYPER29_EL0}, - {"pmevtyper30_el0", PMEVTYPER30_EL0}, - - // Trace registers - {"trcprgctlr", TRCPRGCTLR}, - {"trcprocselr", TRCPROCSELR}, - {"trcconfigr", TRCCONFIGR}, - {"trcauxctlr", TRCAUXCTLR}, - {"trceventctl0r", TRCEVENTCTL0R}, - {"trceventctl1r", TRCEVENTCTL1R}, - {"trcstallctlr", TRCSTALLCTLR}, - {"trctsctlr", TRCTSCTLR}, - {"trcsyncpr", TRCSYNCPR}, - {"trcccctlr", TRCCCCTLR}, - {"trcbbctlr", TRCBBCTLR}, - {"trctraceidr", TRCTRACEIDR}, - {"trcqctlr", TRCQCTLR}, - {"trcvictlr", TRCVICTLR}, - {"trcviiectlr", TRCVIIECTLR}, - {"trcvissctlr", TRCVISSCTLR}, - {"trcvipcssctlr", TRCVIPCSSCTLR}, - {"trcvdctlr", TRCVDCTLR}, - {"trcvdsacctlr", TRCVDSACCTLR}, - {"trcvdarcctlr", TRCVDARCCTLR}, - {"trcseqevr0", TRCSEQEVR0}, - {"trcseqevr1", TRCSEQEVR1}, - {"trcseqevr2", TRCSEQEVR2}, - {"trcseqrstevr", TRCSEQRSTEVR}, - {"trcseqstr", TRCSEQSTR}, - {"trcextinselr", TRCEXTINSELR}, - 
{"trccntrldvr0", TRCCNTRLDVR0}, - {"trccntrldvr1", TRCCNTRLDVR1}, - {"trccntrldvr2", TRCCNTRLDVR2}, - {"trccntrldvr3", TRCCNTRLDVR3}, - {"trccntctlr0", TRCCNTCTLR0}, - {"trccntctlr1", TRCCNTCTLR1}, - {"trccntctlr2", TRCCNTCTLR2}, - {"trccntctlr3", TRCCNTCTLR3}, - {"trccntvr0", TRCCNTVR0}, - {"trccntvr1", TRCCNTVR1}, - {"trccntvr2", TRCCNTVR2}, - {"trccntvr3", TRCCNTVR3}, - {"trcimspec0", TRCIMSPEC0}, - {"trcimspec1", TRCIMSPEC1}, - {"trcimspec2", TRCIMSPEC2}, - {"trcimspec3", TRCIMSPEC3}, - {"trcimspec4", TRCIMSPEC4}, - {"trcimspec5", TRCIMSPEC5}, - {"trcimspec6", TRCIMSPEC6}, - {"trcimspec7", TRCIMSPEC7}, - {"trcrsctlr2", TRCRSCTLR2}, - {"trcrsctlr3", TRCRSCTLR3}, - {"trcrsctlr4", TRCRSCTLR4}, - {"trcrsctlr5", TRCRSCTLR5}, - {"trcrsctlr6", TRCRSCTLR6}, - {"trcrsctlr7", TRCRSCTLR7}, - {"trcrsctlr8", TRCRSCTLR8}, - {"trcrsctlr9", TRCRSCTLR9}, - {"trcrsctlr10", TRCRSCTLR10}, - {"trcrsctlr11", TRCRSCTLR11}, - {"trcrsctlr12", TRCRSCTLR12}, - {"trcrsctlr13", TRCRSCTLR13}, - {"trcrsctlr14", TRCRSCTLR14}, - {"trcrsctlr15", TRCRSCTLR15}, - {"trcrsctlr16", TRCRSCTLR16}, - {"trcrsctlr17", TRCRSCTLR17}, - {"trcrsctlr18", TRCRSCTLR18}, - {"trcrsctlr19", TRCRSCTLR19}, - {"trcrsctlr20", TRCRSCTLR20}, - {"trcrsctlr21", TRCRSCTLR21}, - {"trcrsctlr22", TRCRSCTLR22}, - {"trcrsctlr23", TRCRSCTLR23}, - {"trcrsctlr24", TRCRSCTLR24}, - {"trcrsctlr25", TRCRSCTLR25}, - {"trcrsctlr26", TRCRSCTLR26}, - {"trcrsctlr27", TRCRSCTLR27}, - {"trcrsctlr28", TRCRSCTLR28}, - {"trcrsctlr29", TRCRSCTLR29}, - {"trcrsctlr30", TRCRSCTLR30}, - {"trcrsctlr31", TRCRSCTLR31}, - {"trcssccr0", TRCSSCCR0}, - {"trcssccr1", TRCSSCCR1}, - {"trcssccr2", TRCSSCCR2}, - {"trcssccr3", TRCSSCCR3}, - {"trcssccr4", TRCSSCCR4}, - {"trcssccr5", TRCSSCCR5}, - {"trcssccr6", TRCSSCCR6}, - {"trcssccr7", TRCSSCCR7}, - {"trcsscsr0", TRCSSCSR0}, - {"trcsscsr1", TRCSSCSR1}, - {"trcsscsr2", TRCSSCSR2}, - {"trcsscsr3", TRCSSCSR3}, - {"trcsscsr4", TRCSSCSR4}, - {"trcsscsr5", TRCSSCSR5}, - {"trcsscsr6", TRCSSCSR6}, - {"trcsscsr7", TRCSSCSR7}, - {"trcsspcicr0", TRCSSPCICR0}, - {"trcsspcicr1", TRCSSPCICR1}, - {"trcsspcicr2", TRCSSPCICR2}, - {"trcsspcicr3", TRCSSPCICR3}, - {"trcsspcicr4", TRCSSPCICR4}, - {"trcsspcicr5", TRCSSPCICR5}, - {"trcsspcicr6", TRCSSPCICR6}, - {"trcsspcicr7", TRCSSPCICR7}, - {"trcpdcr", TRCPDCR}, - {"trcacvr0", TRCACVR0}, - {"trcacvr1", TRCACVR1}, - {"trcacvr2", TRCACVR2}, - {"trcacvr3", TRCACVR3}, - {"trcacvr4", TRCACVR4}, - {"trcacvr5", TRCACVR5}, - {"trcacvr6", TRCACVR6}, - {"trcacvr7", TRCACVR7}, - {"trcacvr8", TRCACVR8}, - {"trcacvr9", TRCACVR9}, - {"trcacvr10", TRCACVR10}, - {"trcacvr11", TRCACVR11}, - {"trcacvr12", TRCACVR12}, - {"trcacvr13", TRCACVR13}, - {"trcacvr14", TRCACVR14}, - {"trcacvr15", TRCACVR15}, - {"trcacatr0", TRCACATR0}, - {"trcacatr1", TRCACATR1}, - {"trcacatr2", TRCACATR2}, - {"trcacatr3", TRCACATR3}, - {"trcacatr4", TRCACATR4}, - {"trcacatr5", TRCACATR5}, - {"trcacatr6", TRCACATR6}, - {"trcacatr7", TRCACATR7}, - {"trcacatr8", TRCACATR8}, - {"trcacatr9", TRCACATR9}, - {"trcacatr10", TRCACATR10}, - {"trcacatr11", TRCACATR11}, - {"trcacatr12", TRCACATR12}, - {"trcacatr13", TRCACATR13}, - {"trcacatr14", TRCACATR14}, - {"trcacatr15", TRCACATR15}, - {"trcdvcvr0", TRCDVCVR0}, - {"trcdvcvr1", TRCDVCVR1}, - {"trcdvcvr2", TRCDVCVR2}, - {"trcdvcvr3", TRCDVCVR3}, - {"trcdvcvr4", TRCDVCVR4}, - {"trcdvcvr5", TRCDVCVR5}, - {"trcdvcvr6", TRCDVCVR6}, - {"trcdvcvr7", TRCDVCVR7}, - {"trcdvcmr0", TRCDVCMR0}, - {"trcdvcmr1", TRCDVCMR1}, - {"trcdvcmr2", TRCDVCMR2}, - {"trcdvcmr3", TRCDVCMR3}, - {"trcdvcmr4", TRCDVCMR4}, - {"trcdvcmr5", 
TRCDVCMR5}, - {"trcdvcmr6", TRCDVCMR6}, - {"trcdvcmr7", TRCDVCMR7}, - {"trccidcvr0", TRCCIDCVR0}, - {"trccidcvr1", TRCCIDCVR1}, - {"trccidcvr2", TRCCIDCVR2}, - {"trccidcvr3", TRCCIDCVR3}, - {"trccidcvr4", TRCCIDCVR4}, - {"trccidcvr5", TRCCIDCVR5}, - {"trccidcvr6", TRCCIDCVR6}, - {"trccidcvr7", TRCCIDCVR7}, - {"trcvmidcvr0", TRCVMIDCVR0}, - {"trcvmidcvr1", TRCVMIDCVR1}, - {"trcvmidcvr2", TRCVMIDCVR2}, - {"trcvmidcvr3", TRCVMIDCVR3}, - {"trcvmidcvr4", TRCVMIDCVR4}, - {"trcvmidcvr5", TRCVMIDCVR5}, - {"trcvmidcvr6", TRCVMIDCVR6}, - {"trcvmidcvr7", TRCVMIDCVR7}, - {"trccidcctlr0", TRCCIDCCTLR0}, - {"trccidcctlr1", TRCCIDCCTLR1}, - {"trcvmidcctlr0", TRCVMIDCCTLR0}, - {"trcvmidcctlr1", TRCVMIDCCTLR1}, - {"trcitctrl", TRCITCTRL}, - {"trcclaimset", TRCCLAIMSET}, - {"trcclaimclr", TRCCLAIMCLR}, - - // GICv3 registers - {"icc_bpr1_el1", ICC_BPR1_EL1}, - {"icc_bpr0_el1", ICC_BPR0_EL1}, - {"icc_pmr_el1", ICC_PMR_EL1}, - {"icc_ctlr_el1", ICC_CTLR_EL1}, - {"icc_ctlr_el3", ICC_CTLR_EL3}, - {"icc_sre_el1", ICC_SRE_EL1}, - {"icc_sre_el2", ICC_SRE_EL2}, - {"icc_sre_el3", ICC_SRE_EL3}, - {"icc_igrpen0_el1", ICC_IGRPEN0_EL1}, - {"icc_igrpen1_el1", ICC_IGRPEN1_EL1}, - {"icc_igrpen1_el3", ICC_IGRPEN1_EL3}, - {"icc_seien_el1", ICC_SEIEN_EL1}, - {"icc_ap0r0_el1", ICC_AP0R0_EL1}, - {"icc_ap0r1_el1", ICC_AP0R1_EL1}, - {"icc_ap0r2_el1", ICC_AP0R2_EL1}, - {"icc_ap0r3_el1", ICC_AP0R3_EL1}, - {"icc_ap1r0_el1", ICC_AP1R0_EL1}, - {"icc_ap1r1_el1", ICC_AP1R1_EL1}, - {"icc_ap1r2_el1", ICC_AP1R2_EL1}, - {"icc_ap1r3_el1", ICC_AP1R3_EL1}, - {"ich_ap0r0_el2", ICH_AP0R0_EL2}, - {"ich_ap0r1_el2", ICH_AP0R1_EL2}, - {"ich_ap0r2_el2", ICH_AP0R2_EL2}, - {"ich_ap0r3_el2", ICH_AP0R3_EL2}, - {"ich_ap1r0_el2", ICH_AP1R0_EL2}, - {"ich_ap1r1_el2", ICH_AP1R1_EL2}, - {"ich_ap1r2_el2", ICH_AP1R2_EL2}, - {"ich_ap1r3_el2", ICH_AP1R3_EL2}, - {"ich_hcr_el2", ICH_HCR_EL2}, - {"ich_misr_el2", ICH_MISR_EL2}, - {"ich_vmcr_el2", ICH_VMCR_EL2}, - {"ich_vseir_el2", ICH_VSEIR_EL2}, - {"ich_lr0_el2", ICH_LR0_EL2}, - {"ich_lr1_el2", ICH_LR1_EL2}, - {"ich_lr2_el2", ICH_LR2_EL2}, - {"ich_lr3_el2", ICH_LR3_EL2}, - {"ich_lr4_el2", ICH_LR4_EL2}, - {"ich_lr5_el2", ICH_LR5_EL2}, - {"ich_lr6_el2", ICH_LR6_EL2}, - {"ich_lr7_el2", ICH_LR7_EL2}, - {"ich_lr8_el2", ICH_LR8_EL2}, - {"ich_lr9_el2", ICH_LR9_EL2}, - {"ich_lr10_el2", ICH_LR10_EL2}, - {"ich_lr11_el2", ICH_LR11_EL2}, - {"ich_lr12_el2", ICH_LR12_EL2}, - {"ich_lr13_el2", ICH_LR13_EL2}, - {"ich_lr14_el2", ICH_LR14_EL2}, - {"ich_lr15_el2", ICH_LR15_EL2} -}; - -const ARM64NamedImmMapper::Mapping -ARM64SysReg::SysRegMapper::CycloneSysRegPairs[] = { - {"cpm_ioacc_ctl_el3", CPM_IOACC_CTL_EL3} -}; - -uint32_t -ARM64SysReg::SysRegMapper::fromString(StringRef Name, bool &Valid) const { - std::string NameLower = Name.lower(); - - // First search the registers shared by all - for (unsigned i = 0; i < array_lengthof(SysRegPairs); ++i) { - if (SysRegPairs[i].Name == NameLower) { - Valid = true; - return SysRegPairs[i].Value; - } - } - - // Next search for target specific registers - if (FeatureBits & ARM64::ProcCyclone) { - for (unsigned i = 0; i < array_lengthof(CycloneSysRegPairs); ++i) { - if (CycloneSysRegPairs[i].Name == NameLower) { - Valid = true; - return CycloneSysRegPairs[i].Value; - } - } - } - - // Now try the instruction-specific registers (either read-only or - // write-only). 
- for (unsigned i = 0; i < NumInstPairs; ++i) { - if (InstPairs[i].Name == NameLower) { - Valid = true; - return InstPairs[i].Value; - } - } - - // Try to parse an S____ register name, where the bits - // are: 11 xxx 1x11 xxxx xxx - Regex GenericRegPattern("^s3_([0-7])_c(1[15])_c([0-9]|1[0-5])_([0-7])$"); - - SmallVector Ops; - if (!GenericRegPattern.match(NameLower, &Ops)) { - Valid = false; - return -1; - } - - uint32_t Op0 = 3, Op1 = 0, CRn = 0, CRm = 0, Op2 = 0; - uint32_t Bits; - Ops[1].getAsInteger(10, Op1); - Ops[2].getAsInteger(10, CRn); - Ops[3].getAsInteger(10, CRm); - Ops[4].getAsInteger(10, Op2); - Bits = (Op0 << 14) | (Op1 << 11) | (CRn << 7) | (CRm << 3) | Op2; - - Valid = true; - return Bits; -} - -std::string -ARM64SysReg::SysRegMapper::toString(uint32_t Bits, bool &Valid) const { - // First search the registers shared by all - for (unsigned i = 0; i < array_lengthof(SysRegPairs); ++i) { - if (SysRegPairs[i].Value == Bits) { - Valid = true; - return SysRegPairs[i].Name; - } - } - - // Next search for target specific registers - if (FeatureBits & ARM64::ProcCyclone) { - for (unsigned i = 0; i < array_lengthof(CycloneSysRegPairs); ++i) { - if (CycloneSysRegPairs[i].Value == Bits) { - Valid = true; - return CycloneSysRegPairs[i].Name; - } - } - } - - // Now try the instruction-specific registers (either read-only or - // write-only). - for (unsigned i = 0; i < NumInstPairs; ++i) { - if (InstPairs[i].Value == Bits) { - Valid = true; - return InstPairs[i].Name; - } - } - - uint32_t Op0 = (Bits >> 14) & 0x3; - uint32_t Op1 = (Bits >> 11) & 0x7; - uint32_t CRn = (Bits >> 7) & 0xf; - uint32_t CRm = (Bits >> 3) & 0xf; - uint32_t Op2 = Bits & 0x7; - - // Only combinations matching: 11 xxx 1x11 xxxx xxx are valid for a generic - // name. - if (Op0 != 3 || (CRn != 11 && CRn != 15)) { - Valid = false; - return ""; - } - - assert(Op0 == 3 && (CRn == 11 || CRn == 15) && "Invalid generic sysreg"); - - Valid = true; - return "s3_" + utostr(Op1) + "_c" + utostr(CRn) - + "_c" + utostr(CRm) + "_" + utostr(Op2); -} - -const ARM64NamedImmMapper::Mapping ARM64TLBI::TLBIMapper::TLBIPairs[] = { - {"ipas2e1is", IPAS2E1IS}, - {"ipas2le1is", IPAS2LE1IS}, - {"vmalle1is", VMALLE1IS}, - {"alle2is", ALLE2IS}, - {"alle3is", ALLE3IS}, - {"vae1is", VAE1IS}, - {"vae2is", VAE2IS}, - {"vae3is", VAE3IS}, - {"aside1is", ASIDE1IS}, - {"vaae1is", VAAE1IS}, - {"alle1is", ALLE1IS}, - {"vale1is", VALE1IS}, - {"vale2is", VALE2IS}, - {"vale3is", VALE3IS}, - {"vmalls12e1is", VMALLS12E1IS}, - {"vaale1is", VAALE1IS}, - {"ipas2e1", IPAS2E1}, - {"ipas2le1", IPAS2LE1}, - {"vmalle1", VMALLE1}, - {"alle2", ALLE2}, - {"alle3", ALLE3}, - {"vae1", VAE1}, - {"vae2", VAE2}, - {"vae3", VAE3}, - {"aside1", ASIDE1}, - {"vaae1", VAAE1}, - {"alle1", ALLE1}, - {"vale1", VALE1}, - {"vale2", VALE2}, - {"vale3", VALE3}, - {"vmalls12e1", VMALLS12E1}, - {"vaale1", VAALE1} -}; - -ARM64TLBI::TLBIMapper::TLBIMapper() - : ARM64NamedImmMapper(TLBIPairs, 0) {} diff --git a/lib/Target/ARM64/Utils/ARM64BaseInfo.h b/lib/Target/ARM64/Utils/ARM64BaseInfo.h deleted file mode 100644 index 8075d6b37c9..00000000000 --- a/lib/Target/ARM64/Utils/ARM64BaseInfo.h +++ /dev/null @@ -1,1294 +0,0 @@ -//===-- ARM64BaseInfo.h - Top level definitions for ARM64 -------*- C++ -*-===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. 
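[Editor's sketch] When a name misses every table, SysRegMapper::fromString above falls back to the generic s<op0>_<op1>_c<CRn>_c<CRm>_<op2> spelling (with op0 fixed at 3 and CRn restricted to 11 or 15) and packs the five fields as (Op0<<14) | (Op1<<11) | (CRn<<7) | (CRm<<3) | Op2; toString reverses the same layout. A standalone sketch with one worked value (the helper name is illustrative):

    #include <cassert>
    #include <cstdint>

    // Field packing used by the generic system-register spelling above.
    static uint32_t encodeSysReg(uint32_t Op0, uint32_t Op1, uint32_t CRn,
                                 uint32_t CRm, uint32_t Op2) {
      return (Op0 << 14) | (Op1 << 11) | (CRn << 7) | (CRm << 3) | Op2;
    }

    int main() {
      // "s3_0_c15_c7_2" packs to 0xc7ba; toString prints such a value back in
      // the same generic form unless it happens to match a named entry.
      assert(encodeSysReg(3, 0, 15, 7, 2) == 0xc7ba);
      return 0;
    }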
-// -//===----------------------------------------------------------------------===// -// -// This file contains small standalone helper functions and enum definitions for -// the ARM64 target useful for the compiler back-end and the MC libraries. -// As such, it deliberately does not include references to LLVM core -// code gen types, passes, etc.. -// -//===----------------------------------------------------------------------===// - -#ifndef ARM64BASEINFO_H -#define ARM64BASEINFO_H - -// FIXME: Is it easiest to fix this layering violation by moving the .inc -// #includes from ARM64MCTargetDesc.h to here? -#include "MCTargetDesc/ARM64MCTargetDesc.h" // For ARM64::X0 and friends. -#include "llvm/ADT/STLExtras.h" -#include "llvm/ADT/StringSwitch.h" -#include "llvm/Support/ErrorHandling.h" - -namespace llvm { - -inline static unsigned getWRegFromXReg(unsigned Reg) { - switch (Reg) { - case ARM64::X0: return ARM64::W0; - case ARM64::X1: return ARM64::W1; - case ARM64::X2: return ARM64::W2; - case ARM64::X3: return ARM64::W3; - case ARM64::X4: return ARM64::W4; - case ARM64::X5: return ARM64::W5; - case ARM64::X6: return ARM64::W6; - case ARM64::X7: return ARM64::W7; - case ARM64::X8: return ARM64::W8; - case ARM64::X9: return ARM64::W9; - case ARM64::X10: return ARM64::W10; - case ARM64::X11: return ARM64::W11; - case ARM64::X12: return ARM64::W12; - case ARM64::X13: return ARM64::W13; - case ARM64::X14: return ARM64::W14; - case ARM64::X15: return ARM64::W15; - case ARM64::X16: return ARM64::W16; - case ARM64::X17: return ARM64::W17; - case ARM64::X18: return ARM64::W18; - case ARM64::X19: return ARM64::W19; - case ARM64::X20: return ARM64::W20; - case ARM64::X21: return ARM64::W21; - case ARM64::X22: return ARM64::W22; - case ARM64::X23: return ARM64::W23; - case ARM64::X24: return ARM64::W24; - case ARM64::X25: return ARM64::W25; - case ARM64::X26: return ARM64::W26; - case ARM64::X27: return ARM64::W27; - case ARM64::X28: return ARM64::W28; - case ARM64::FP: return ARM64::W29; - case ARM64::LR: return ARM64::W30; - case ARM64::SP: return ARM64::WSP; - case ARM64::XZR: return ARM64::WZR; - } - // For anything else, return it unchanged. - return Reg; -} - -inline static unsigned getXRegFromWReg(unsigned Reg) { - switch (Reg) { - case ARM64::W0: return ARM64::X0; - case ARM64::W1: return ARM64::X1; - case ARM64::W2: return ARM64::X2; - case ARM64::W3: return ARM64::X3; - case ARM64::W4: return ARM64::X4; - case ARM64::W5: return ARM64::X5; - case ARM64::W6: return ARM64::X6; - case ARM64::W7: return ARM64::X7; - case ARM64::W8: return ARM64::X8; - case ARM64::W9: return ARM64::X9; - case ARM64::W10: return ARM64::X10; - case ARM64::W11: return ARM64::X11; - case ARM64::W12: return ARM64::X12; - case ARM64::W13: return ARM64::X13; - case ARM64::W14: return ARM64::X14; - case ARM64::W15: return ARM64::X15; - case ARM64::W16: return ARM64::X16; - case ARM64::W17: return ARM64::X17; - case ARM64::W18: return ARM64::X18; - case ARM64::W19: return ARM64::X19; - case ARM64::W20: return ARM64::X20; - case ARM64::W21: return ARM64::X21; - case ARM64::W22: return ARM64::X22; - case ARM64::W23: return ARM64::X23; - case ARM64::W24: return ARM64::X24; - case ARM64::W25: return ARM64::X25; - case ARM64::W26: return ARM64::X26; - case ARM64::W27: return ARM64::X27; - case ARM64::W28: return ARM64::X28; - case ARM64::W29: return ARM64::FP; - case ARM64::W30: return ARM64::LR; - case ARM64::WSP: return ARM64::SP; - case ARM64::WZR: return ARM64::XZR; - } - // For anything else, return it unchanged. 
- return Reg; -} - -static inline unsigned getBRegFromDReg(unsigned Reg) { - switch (Reg) { - case ARM64::D0: return ARM64::B0; - case ARM64::D1: return ARM64::B1; - case ARM64::D2: return ARM64::B2; - case ARM64::D3: return ARM64::B3; - case ARM64::D4: return ARM64::B4; - case ARM64::D5: return ARM64::B5; - case ARM64::D6: return ARM64::B6; - case ARM64::D7: return ARM64::B7; - case ARM64::D8: return ARM64::B8; - case ARM64::D9: return ARM64::B9; - case ARM64::D10: return ARM64::B10; - case ARM64::D11: return ARM64::B11; - case ARM64::D12: return ARM64::B12; - case ARM64::D13: return ARM64::B13; - case ARM64::D14: return ARM64::B14; - case ARM64::D15: return ARM64::B15; - case ARM64::D16: return ARM64::B16; - case ARM64::D17: return ARM64::B17; - case ARM64::D18: return ARM64::B18; - case ARM64::D19: return ARM64::B19; - case ARM64::D20: return ARM64::B20; - case ARM64::D21: return ARM64::B21; - case ARM64::D22: return ARM64::B22; - case ARM64::D23: return ARM64::B23; - case ARM64::D24: return ARM64::B24; - case ARM64::D25: return ARM64::B25; - case ARM64::D26: return ARM64::B26; - case ARM64::D27: return ARM64::B27; - case ARM64::D28: return ARM64::B28; - case ARM64::D29: return ARM64::B29; - case ARM64::D30: return ARM64::B30; - case ARM64::D31: return ARM64::B31; - } - // For anything else, return it unchanged. - return Reg; -} - - -static inline unsigned getDRegFromBReg(unsigned Reg) { - switch (Reg) { - case ARM64::B0: return ARM64::D0; - case ARM64::B1: return ARM64::D1; - case ARM64::B2: return ARM64::D2; - case ARM64::B3: return ARM64::D3; - case ARM64::B4: return ARM64::D4; - case ARM64::B5: return ARM64::D5; - case ARM64::B6: return ARM64::D6; - case ARM64::B7: return ARM64::D7; - case ARM64::B8: return ARM64::D8; - case ARM64::B9: return ARM64::D9; - case ARM64::B10: return ARM64::D10; - case ARM64::B11: return ARM64::D11; - case ARM64::B12: return ARM64::D12; - case ARM64::B13: return ARM64::D13; - case ARM64::B14: return ARM64::D14; - case ARM64::B15: return ARM64::D15; - case ARM64::B16: return ARM64::D16; - case ARM64::B17: return ARM64::D17; - case ARM64::B18: return ARM64::D18; - case ARM64::B19: return ARM64::D19; - case ARM64::B20: return ARM64::D20; - case ARM64::B21: return ARM64::D21; - case ARM64::B22: return ARM64::D22; - case ARM64::B23: return ARM64::D23; - case ARM64::B24: return ARM64::D24; - case ARM64::B25: return ARM64::D25; - case ARM64::B26: return ARM64::D26; - case ARM64::B27: return ARM64::D27; - case ARM64::B28: return ARM64::D28; - case ARM64::B29: return ARM64::D29; - case ARM64::B30: return ARM64::D30; - case ARM64::B31: return ARM64::D31; - } - // For anything else, return it unchanged. - return Reg; -} - -namespace ARM64CC { - -// The CondCodes constants map directly to the 4-bit encoding of the condition -// field for predicated instructions. 
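[Editor's sketch] The getWRegFromXReg, getXRegFromWReg, getBRegFromDReg and getDRegFromBReg helpers above are exhaustive switches from one register class to its alias in another width, with an identity fallback for anything that has no counterpart. A miniature standalone sketch of that shape; the enum and its values are illustrative stand-ins, not the tablegen'd register numbers:

    #include <cassert>

    // Arbitrary stand-in register enum, not the real generated values.
    enum Reg { X0, X1, SP, XZR, NZCV, W0, W1, WSP, WZR };

    // Same shape as getWRegFromXReg above: map each 64-bit register to its
    // 32-bit alias and return anything unrecognized unchanged.
    static Reg toWReg(Reg R) {
      switch (R) {
      case X0:  return W0;
      case X1:  return W1;
      case SP:  return WSP;
      case XZR: return WZR;
      default:  return R;
      }
    }

    int main() {
      assert(toWReg(X0) == W0);
      assert(toWReg(SP) == WSP);
      assert(toWReg(NZCV) == NZCV); // identity fallback
      return 0;
    }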
-enum CondCode { // Meaning (integer) Meaning (floating-point) - EQ = 0x0, // Equal Equal - NE = 0x1, // Not equal Not equal, or unordered - HS = 0x2, // Unsigned higher or same >, ==, or unordered - LO = 0x3, // Unsigned lower Less than - MI = 0x4, // Minus, negative Less than - PL = 0x5, // Plus, positive or zero >, ==, or unordered - VS = 0x6, // Overflow Unordered - VC = 0x7, // No overflow Not unordered - HI = 0x8, // Unsigned higher Greater than, or unordered - LS = 0x9, // Unsigned lower or same Less than or equal - GE = 0xa, // Greater than or equal Greater than or equal - LT = 0xb, // Less than Less than, or unordered - GT = 0xc, // Greater than Greater than - LE = 0xd, // Less than or equal <, ==, or unordered - AL = 0xe, // Always (unconditional) Always (unconditional) - NV = 0xf, // Always (unconditional) Always (unconditional) - // Note the NV exists purely to disassemble 0b1111. Execution is "always". - Invalid -}; - -inline static const char *getCondCodeName(CondCode Code) { - switch (Code) { - default: llvm_unreachable("Unknown condition code"); - case EQ: return "eq"; - case NE: return "ne"; - case HS: return "hs"; - case LO: return "lo"; - case MI: return "mi"; - case PL: return "pl"; - case VS: return "vs"; - case VC: return "vc"; - case HI: return "hi"; - case LS: return "ls"; - case GE: return "ge"; - case LT: return "lt"; - case GT: return "gt"; - case LE: return "le"; - case AL: return "al"; - case NV: return "nv"; - } -} - -inline static CondCode getInvertedCondCode(CondCode Code) { - switch (Code) { - default: llvm_unreachable("Unknown condition code"); - case EQ: return NE; - case NE: return EQ; - case HS: return LO; - case LO: return HS; - case MI: return PL; - case PL: return MI; - case VS: return VC; - case VC: return VS; - case HI: return LS; - case LS: return HI; - case GE: return LT; - case LT: return GE; - case GT: return LE; - case LE: return GT; - } -} - -/// Given a condition code, return NZCV flags that would satisfy that condition. -/// The flag bits are in the format expected by the ccmp instructions. -/// Note that many different flag settings can satisfy a given condition code, -/// this function just returns one of them. -inline static unsigned getNZCVToSatisfyCondCode(CondCode Code) { - // NZCV flags encoded as expected by ccmp instructions, ARMv8 ISA 5.5.7. - enum { N = 8, Z = 4, C = 2, V = 1 }; - switch (Code) { - default: llvm_unreachable("Unknown condition code"); - case EQ: return Z; // Z == 1 - case NE: return 0; // Z == 0 - case HS: return C; // C == 1 - case LO: return 0; // C == 0 - case MI: return N; // N == 1 - case PL: return 0; // N == 0 - case VS: return V; // V == 1 - case VC: return 0; // V == 0 - case HI: return C; // C == 1 && Z == 0 - case LS: return 0; // C == 0 || Z == 1 - case GE: return 0; // N == V - case LT: return N; // N != V - case GT: return 0; // Z == 0 && N == V - case LE: return Z; // Z == 1 || N != V - } -} -} // end namespace ARM64CC - -/// Instances of this class can perform bidirectional mapping from random -/// identifier strings to operand encodings. For example "MSR" takes a named -/// system-register which must be encoded somehow and decoded for printing. This -/// central location means that the information for those transformations is not -/// duplicated and remains in sync. -/// -/// FIXME: currently the algorithm is a completely unoptimised linear -/// search. Obviously this could be improved, but we would probably want to work -/// out just how often these instructions are emitted before working on it. 
It -/// might even be optimal to just reorder the tables for the common instructions -/// rather than changing the algorithm. -struct ARM64NamedImmMapper { - struct Mapping { - const char *Name; - uint32_t Value; - }; - - template - ARM64NamedImmMapper(const Mapping (&Pairs)[N], uint32_t TooBigImm) - : Pairs(&Pairs[0]), NumPairs(N), TooBigImm(TooBigImm) {} - - StringRef toString(uint32_t Value, bool &Valid) const; - uint32_t fromString(StringRef Name, bool &Valid) const; - - /// Many of the instructions allow an alternative assembly form consisting of - /// a simple immediate. Currently the only valid forms are ranges [0, N) where - /// N being 0 indicates no immediate syntax-form is allowed. - bool validImm(uint32_t Value) const; -protected: - const Mapping *Pairs; - size_t NumPairs; - uint32_t TooBigImm; -}; - -namespace ARM64AT { - enum ATValues { - Invalid = -1, // Op0 Op1 CRn CRm Op2 - S1E1R = 0x43c0, // 01 000 0111 1000 000 - S1E2R = 0x63c0, // 01 100 0111 1000 000 - S1E3R = 0x73c0, // 01 110 0111 1000 000 - S1E1W = 0x43c1, // 01 000 0111 1000 001 - S1E2W = 0x63c1, // 01 100 0111 1000 001 - S1E3W = 0x73c1, // 01 110 0111 1000 001 - S1E0R = 0x43c2, // 01 000 0111 1000 010 - S1E0W = 0x43c3, // 01 000 0111 1000 011 - S12E1R = 0x63c4, // 01 100 0111 1000 100 - S12E1W = 0x63c5, // 01 100 0111 1000 101 - S12E0R = 0x63c6, // 01 100 0111 1000 110 - S12E0W = 0x63c7 // 01 100 0111 1000 111 - }; - - struct ATMapper : ARM64NamedImmMapper { - const static Mapping ATPairs[]; - - ATMapper(); - }; - -} -namespace ARM64DB { - enum DBValues { - Invalid = -1, - OSHLD = 0x1, - OSHST = 0x2, - OSH = 0x3, - NSHLD = 0x5, - NSHST = 0x6, - NSH = 0x7, - ISHLD = 0x9, - ISHST = 0xa, - ISH = 0xb, - LD = 0xd, - ST = 0xe, - SY = 0xf - }; - - struct DBarrierMapper : ARM64NamedImmMapper { - const static Mapping DBarrierPairs[]; - - DBarrierMapper(); - }; -} - -namespace ARM64DC { - enum DCValues { - Invalid = -1, // Op1 CRn CRm Op2 - ZVA = 0x5ba1, // 01 011 0111 0100 001 - IVAC = 0x43b1, // 01 000 0111 0110 001 - ISW = 0x43b2, // 01 000 0111 0110 010 - CVAC = 0x5bd1, // 01 011 0111 1010 001 - CSW = 0x43d2, // 01 000 0111 1010 010 - CVAU = 0x5bd9, // 01 011 0111 1011 001 - CIVAC = 0x5bf1, // 01 011 0111 1110 001 - CISW = 0x43f2 // 01 000 0111 1110 010 - }; - - struct DCMapper : ARM64NamedImmMapper { - const static Mapping DCPairs[]; - - DCMapper(); - }; - -} - -namespace ARM64IC { - enum ICValues { - Invalid = -1, // Op1 CRn CRm Op2 - IALLUIS = 0x0388, // 000 0111 0001 000 - IALLU = 0x03a8, // 000 0111 0101 000 - IVAU = 0x1ba9 // 011 0111 0101 001 - }; - - - struct ICMapper : ARM64NamedImmMapper { - const static Mapping ICPairs[]; - - ICMapper(); - }; - - static inline bool NeedsRegister(ICValues Val) { - return Val == IVAU; - } -} - -namespace ARM64ISB { - enum ISBValues { - Invalid = -1, - SY = 0xf - }; - struct ISBMapper : ARM64NamedImmMapper { - const static Mapping ISBPairs[]; - - ISBMapper(); - }; -} - -namespace ARM64PRFM { - enum PRFMValues { - Invalid = -1, - PLDL1KEEP = 0x00, - PLDL1STRM = 0x01, - PLDL2KEEP = 0x02, - PLDL2STRM = 0x03, - PLDL3KEEP = 0x04, - PLDL3STRM = 0x05, - PLIL1KEEP = 0x08, - PLIL1STRM = 0x09, - PLIL2KEEP = 0x0a, - PLIL2STRM = 0x0b, - PLIL3KEEP = 0x0c, - PLIL3STRM = 0x0d, - PSTL1KEEP = 0x10, - PSTL1STRM = 0x11, - PSTL2KEEP = 0x12, - PSTL2STRM = 0x13, - PSTL3KEEP = 0x14, - PSTL3STRM = 0x15 - }; - - struct PRFMMapper : ARM64NamedImmMapper { - const static Mapping PRFMPairs[]; - - PRFMMapper(); - }; -} - -namespace ARM64PState { - enum PStateValues { - Invalid = -1, - SPSel = 0x05, - DAIFSet 
= 0x1e, - DAIFClr = 0x1f - }; - - struct PStateMapper : ARM64NamedImmMapper { - const static Mapping PStatePairs[]; - - PStateMapper(); - }; - -} - -namespace ARM64SE { - enum ShiftExtSpecifiers { - Invalid = -1, - LSL, - MSL, - LSR, - ASR, - ROR, - - UXTB, - UXTH, - UXTW, - UXTX, - - SXTB, - SXTH, - SXTW, - SXTX - }; -} - -namespace ARM64Layout { - enum VectorLayout { - Invalid = -1, - VL_8B, - VL_4H, - VL_2S, - VL_1D, - - VL_16B, - VL_8H, - VL_4S, - VL_2D, - - // Bare layout for the 128-bit vector - // (only show ".b", ".h", ".s", ".d" without vector number) - VL_B, - VL_H, - VL_S, - VL_D - }; -} - -inline static const char * -ARM64VectorLayoutToString(ARM64Layout::VectorLayout Layout) { - switch (Layout) { - case ARM64Layout::VL_8B: return ".8b"; - case ARM64Layout::VL_4H: return ".4h"; - case ARM64Layout::VL_2S: return ".2s"; - case ARM64Layout::VL_1D: return ".1d"; - case ARM64Layout::VL_16B: return ".16b"; - case ARM64Layout::VL_8H: return ".8h"; - case ARM64Layout::VL_4S: return ".4s"; - case ARM64Layout::VL_2D: return ".2d"; - case ARM64Layout::VL_B: return ".b"; - case ARM64Layout::VL_H: return ".h"; - case ARM64Layout::VL_S: return ".s"; - case ARM64Layout::VL_D: return ".d"; - default: llvm_unreachable("Unknown Vector Layout"); - } -} - -inline static ARM64Layout::VectorLayout -ARM64StringToVectorLayout(StringRef LayoutStr) { - return StringSwitch(LayoutStr) - .Case(".8b", ARM64Layout::VL_8B) - .Case(".4h", ARM64Layout::VL_4H) - .Case(".2s", ARM64Layout::VL_2S) - .Case(".1d", ARM64Layout::VL_1D) - .Case(".16b", ARM64Layout::VL_16B) - .Case(".8h", ARM64Layout::VL_8H) - .Case(".4s", ARM64Layout::VL_4S) - .Case(".2d", ARM64Layout::VL_2D) - .Case(".b", ARM64Layout::VL_B) - .Case(".h", ARM64Layout::VL_H) - .Case(".s", ARM64Layout::VL_S) - .Case(".d", ARM64Layout::VL_D) - .Default(ARM64Layout::Invalid); -} - -namespace ARM64SysReg { - enum SysRegROValues { - MDCCSR_EL0 = 0x9808, // 10 011 0000 0001 000 - DBGDTRRX_EL0 = 0x9828, // 10 011 0000 0101 000 - MDRAR_EL1 = 0x8080, // 10 000 0001 0000 000 - OSLSR_EL1 = 0x808c, // 10 000 0001 0001 100 - DBGAUTHSTATUS_EL1 = 0x83f6, // 10 000 0111 1110 110 - PMCEID0_EL0 = 0xdce6, // 11 011 1001 1100 110 - PMCEID1_EL0 = 0xdce7, // 11 011 1001 1100 111 - MIDR_EL1 = 0xc000, // 11 000 0000 0000 000 - CCSIDR_EL1 = 0xc800, // 11 001 0000 0000 000 - CLIDR_EL1 = 0xc801, // 11 001 0000 0000 001 - CTR_EL0 = 0xd801, // 11 011 0000 0000 001 - MPIDR_EL1 = 0xc005, // 11 000 0000 0000 101 - REVIDR_EL1 = 0xc006, // 11 000 0000 0000 110 - AIDR_EL1 = 0xc807, // 11 001 0000 0000 111 - DCZID_EL0 = 0xd807, // 11 011 0000 0000 111 - ID_PFR0_EL1 = 0xc008, // 11 000 0000 0001 000 - ID_PFR1_EL1 = 0xc009, // 11 000 0000 0001 001 - ID_DFR0_EL1 = 0xc00a, // 11 000 0000 0001 010 - ID_AFR0_EL1 = 0xc00b, // 11 000 0000 0001 011 - ID_MMFR0_EL1 = 0xc00c, // 11 000 0000 0001 100 - ID_MMFR1_EL1 = 0xc00d, // 11 000 0000 0001 101 - ID_MMFR2_EL1 = 0xc00e, // 11 000 0000 0001 110 - ID_MMFR3_EL1 = 0xc00f, // 11 000 0000 0001 111 - ID_ISAR0_EL1 = 0xc010, // 11 000 0000 0010 000 - ID_ISAR1_EL1 = 0xc011, // 11 000 0000 0010 001 - ID_ISAR2_EL1 = 0xc012, // 11 000 0000 0010 010 - ID_ISAR3_EL1 = 0xc013, // 11 000 0000 0010 011 - ID_ISAR4_EL1 = 0xc014, // 11 000 0000 0010 100 - ID_ISAR5_EL1 = 0xc015, // 11 000 0000 0010 101 - ID_AARM64PFR0_EL1 = 0xc020, // 11 000 0000 0100 000 - ID_AARM64PFR1_EL1 = 0xc021, // 11 000 0000 0100 001 - ID_AARM64DFR0_EL1 = 0xc028, // 11 000 0000 0101 000 - ID_AARM64DFR1_EL1 = 0xc029, // 11 000 0000 0101 001 - ID_AARM64AFR0_EL1 = 0xc02c, // 11 000 0000 0101 100 - 
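(For reference while reading the removed header above: the ARM64NamedImmMapper comment describes a bidirectional name/value lookup implemented as an unoptimised linear scan, but only declares toString/fromString. The following stand-alone C++ sketch models that behaviour; it is not LLVM code, and it reuses just two encodings from the AT table above as sample pairs.)

// Minimal stand-in for the NamedImmMapper idea: a linear scan over
// (name, value) pairs, usable in both directions. The two pairs are the
// S1E1R/S1E2R encodings from the AT table above, used purely as examples.
#include <cstdint>
#include <cstring>
#include <cstdio>

struct Mapping { const char *Name; uint32_t Value; };

static const Mapping Pairs[] = {{"s1e1r", 0x43c0}, {"s1e2r", 0x63c0}};

static const char *toString(uint32_t Value, bool &Valid) {
  for (const Mapping &M : Pairs)
    if (M.Value == Value) { Valid = true; return M.Name; }
  Valid = false;
  return nullptr;
}

static uint32_t fromString(const char *Name, bool &Valid) {
  for (const Mapping &M : Pairs)
    if (std::strcmp(M.Name, Name) == 0) { Valid = true; return M.Value; }
  Valid = false;
  return 0;
}

int main() {
  bool Valid;
  std::printf("%#x\n", (unsigned)fromString("s1e1r", Valid)); // 0x43c0
  std::printf("%s\n", toString(0x63c0, Valid));               // s1e2r
  return 0;
}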
ID_AARM64AFR1_EL1 = 0xc02d, // 11 000 0000 0101 101 - ID_AARM64ISAR0_EL1 = 0xc030, // 11 000 0000 0110 000 - ID_AARM64ISAR1_EL1 = 0xc031, // 11 000 0000 0110 001 - ID_AARM64MMFR0_EL1 = 0xc038, // 11 000 0000 0111 000 - ID_AARM64MMFR1_EL1 = 0xc039, // 11 000 0000 0111 001 - MVFR0_EL1 = 0xc018, // 11 000 0000 0011 000 - MVFR1_EL1 = 0xc019, // 11 000 0000 0011 001 - MVFR2_EL1 = 0xc01a, // 11 000 0000 0011 010 - RVBAR_EL1 = 0xc601, // 11 000 1100 0000 001 - RVBAR_EL2 = 0xe601, // 11 100 1100 0000 001 - RVBAR_EL3 = 0xf601, // 11 110 1100 0000 001 - ISR_EL1 = 0xc608, // 11 000 1100 0001 000 - CNTPCT_EL0 = 0xdf01, // 11 011 1110 0000 001 - CNTVCT_EL0 = 0xdf02, // 11 011 1110 0000 010 - - // Trace registers - TRCSTATR = 0x8818, // 10 001 0000 0011 000 - TRCIDR8 = 0x8806, // 10 001 0000 0000 110 - TRCIDR9 = 0x880e, // 10 001 0000 0001 110 - TRCIDR10 = 0x8816, // 10 001 0000 0010 110 - TRCIDR11 = 0x881e, // 10 001 0000 0011 110 - TRCIDR12 = 0x8826, // 10 001 0000 0100 110 - TRCIDR13 = 0x882e, // 10 001 0000 0101 110 - TRCIDR0 = 0x8847, // 10 001 0000 1000 111 - TRCIDR1 = 0x884f, // 10 001 0000 1001 111 - TRCIDR2 = 0x8857, // 10 001 0000 1010 111 - TRCIDR3 = 0x885f, // 10 001 0000 1011 111 - TRCIDR4 = 0x8867, // 10 001 0000 1100 111 - TRCIDR5 = 0x886f, // 10 001 0000 1101 111 - TRCIDR6 = 0x8877, // 10 001 0000 1110 111 - TRCIDR7 = 0x887f, // 10 001 0000 1111 111 - TRCOSLSR = 0x888c, // 10 001 0001 0001 100 - TRCPDSR = 0x88ac, // 10 001 0001 0101 100 - TRCDEVAFF0 = 0x8bd6, // 10 001 0111 1010 110 - TRCDEVAFF1 = 0x8bde, // 10 001 0111 1011 110 - TRCLSR = 0x8bee, // 10 001 0111 1101 110 - TRCAUTHSTATUS = 0x8bf6, // 10 001 0111 1110 110 - TRCDEVARCH = 0x8bfe, // 10 001 0111 1111 110 - TRCDEVID = 0x8b97, // 10 001 0111 0010 111 - TRCDEVTYPE = 0x8b9f, // 10 001 0111 0011 111 - TRCPIDR4 = 0x8ba7, // 10 001 0111 0100 111 - TRCPIDR5 = 0x8baf, // 10 001 0111 0101 111 - TRCPIDR6 = 0x8bb7, // 10 001 0111 0110 111 - TRCPIDR7 = 0x8bbf, // 10 001 0111 0111 111 - TRCPIDR0 = 0x8bc7, // 10 001 0111 1000 111 - TRCPIDR1 = 0x8bcf, // 10 001 0111 1001 111 - TRCPIDR2 = 0x8bd7, // 10 001 0111 1010 111 - TRCPIDR3 = 0x8bdf, // 10 001 0111 1011 111 - TRCCIDR0 = 0x8be7, // 10 001 0111 1100 111 - TRCCIDR1 = 0x8bef, // 10 001 0111 1101 111 - TRCCIDR2 = 0x8bf7, // 10 001 0111 1110 111 - TRCCIDR3 = 0x8bff, // 10 001 0111 1111 111 - - // GICv3 registers - ICC_IAR1_EL1 = 0xc660, // 11 000 1100 1100 000 - ICC_IAR0_EL1 = 0xc640, // 11 000 1100 1000 000 - ICC_HPPIR1_EL1 = 0xc662, // 11 000 1100 1100 010 - ICC_HPPIR0_EL1 = 0xc642, // 11 000 1100 1000 010 - ICC_RPR_EL1 = 0xc65b, // 11 000 1100 1011 011 - ICH_VTR_EL2 = 0xe659, // 11 100 1100 1011 001 - ICH_EISR_EL2 = 0xe65b, // 11 100 1100 1011 011 - ICH_ELSR_EL2 = 0xe65d // 11 100 1100 1011 101 - }; - - enum SysRegWOValues { - DBGDTRTX_EL0 = 0x9828, // 10 011 0000 0101 000 - OSLAR_EL1 = 0x8084, // 10 000 0001 0000 100 - PMSWINC_EL0 = 0xdce4, // 11 011 1001 1100 100 - - // Trace Registers - TRCOSLAR = 0x8884, // 10 001 0001 0000 100 - TRCLAR = 0x8be6, // 10 001 0111 1100 110 - - // GICv3 registers - ICC_EOIR1_EL1 = 0xc661, // 11 000 1100 1100 001 - ICC_EOIR0_EL1 = 0xc641, // 11 000 1100 1000 001 - ICC_DIR_EL1 = 0xc659, // 11 000 1100 1011 001 - ICC_SGI1R_EL1 = 0xc65d, // 11 000 1100 1011 101 - ICC_ASGI1R_EL1 = 0xc65e, // 11 000 1100 1011 110 - ICC_SGI0R_EL1 = 0xc65f // 11 000 1100 1011 111 - }; - - enum SysRegValues { - Invalid = -1, // Op0 Op1 CRn CRm Op2 - OSDTRRX_EL1 = 0x8002, // 10 000 0000 0000 010 - OSDTRTX_EL1 = 0x801a, // 10 000 0000 0011 010 - TEECR32_EL1 = 0x9000, // 10 010 0000 
0000 000 - MDCCINT_EL1 = 0x8010, // 10 000 0000 0010 000 - MDSCR_EL1 = 0x8012, // 10 000 0000 0010 010 - DBGDTR_EL0 = 0x9820, // 10 011 0000 0100 000 - OSECCR_EL1 = 0x8032, // 10 000 0000 0110 010 - DBGVCR32_EL2 = 0xa038, // 10 100 0000 0111 000 - DBGBVR0_EL1 = 0x8004, // 10 000 0000 0000 100 - DBGBVR1_EL1 = 0x800c, // 10 000 0000 0001 100 - DBGBVR2_EL1 = 0x8014, // 10 000 0000 0010 100 - DBGBVR3_EL1 = 0x801c, // 10 000 0000 0011 100 - DBGBVR4_EL1 = 0x8024, // 10 000 0000 0100 100 - DBGBVR5_EL1 = 0x802c, // 10 000 0000 0101 100 - DBGBVR6_EL1 = 0x8034, // 10 000 0000 0110 100 - DBGBVR7_EL1 = 0x803c, // 10 000 0000 0111 100 - DBGBVR8_EL1 = 0x8044, // 10 000 0000 1000 100 - DBGBVR9_EL1 = 0x804c, // 10 000 0000 1001 100 - DBGBVR10_EL1 = 0x8054, // 10 000 0000 1010 100 - DBGBVR11_EL1 = 0x805c, // 10 000 0000 1011 100 - DBGBVR12_EL1 = 0x8064, // 10 000 0000 1100 100 - DBGBVR13_EL1 = 0x806c, // 10 000 0000 1101 100 - DBGBVR14_EL1 = 0x8074, // 10 000 0000 1110 100 - DBGBVR15_EL1 = 0x807c, // 10 000 0000 1111 100 - DBGBCR0_EL1 = 0x8005, // 10 000 0000 0000 101 - DBGBCR1_EL1 = 0x800d, // 10 000 0000 0001 101 - DBGBCR2_EL1 = 0x8015, // 10 000 0000 0010 101 - DBGBCR3_EL1 = 0x801d, // 10 000 0000 0011 101 - DBGBCR4_EL1 = 0x8025, // 10 000 0000 0100 101 - DBGBCR5_EL1 = 0x802d, // 10 000 0000 0101 101 - DBGBCR6_EL1 = 0x8035, // 10 000 0000 0110 101 - DBGBCR7_EL1 = 0x803d, // 10 000 0000 0111 101 - DBGBCR8_EL1 = 0x8045, // 10 000 0000 1000 101 - DBGBCR9_EL1 = 0x804d, // 10 000 0000 1001 101 - DBGBCR10_EL1 = 0x8055, // 10 000 0000 1010 101 - DBGBCR11_EL1 = 0x805d, // 10 000 0000 1011 101 - DBGBCR12_EL1 = 0x8065, // 10 000 0000 1100 101 - DBGBCR13_EL1 = 0x806d, // 10 000 0000 1101 101 - DBGBCR14_EL1 = 0x8075, // 10 000 0000 1110 101 - DBGBCR15_EL1 = 0x807d, // 10 000 0000 1111 101 - DBGWVR0_EL1 = 0x8006, // 10 000 0000 0000 110 - DBGWVR1_EL1 = 0x800e, // 10 000 0000 0001 110 - DBGWVR2_EL1 = 0x8016, // 10 000 0000 0010 110 - DBGWVR3_EL1 = 0x801e, // 10 000 0000 0011 110 - DBGWVR4_EL1 = 0x8026, // 10 000 0000 0100 110 - DBGWVR5_EL1 = 0x802e, // 10 000 0000 0101 110 - DBGWVR6_EL1 = 0x8036, // 10 000 0000 0110 110 - DBGWVR7_EL1 = 0x803e, // 10 000 0000 0111 110 - DBGWVR8_EL1 = 0x8046, // 10 000 0000 1000 110 - DBGWVR9_EL1 = 0x804e, // 10 000 0000 1001 110 - DBGWVR10_EL1 = 0x8056, // 10 000 0000 1010 110 - DBGWVR11_EL1 = 0x805e, // 10 000 0000 1011 110 - DBGWVR12_EL1 = 0x8066, // 10 000 0000 1100 110 - DBGWVR13_EL1 = 0x806e, // 10 000 0000 1101 110 - DBGWVR14_EL1 = 0x8076, // 10 000 0000 1110 110 - DBGWVR15_EL1 = 0x807e, // 10 000 0000 1111 110 - DBGWCR0_EL1 = 0x8007, // 10 000 0000 0000 111 - DBGWCR1_EL1 = 0x800f, // 10 000 0000 0001 111 - DBGWCR2_EL1 = 0x8017, // 10 000 0000 0010 111 - DBGWCR3_EL1 = 0x801f, // 10 000 0000 0011 111 - DBGWCR4_EL1 = 0x8027, // 10 000 0000 0100 111 - DBGWCR5_EL1 = 0x802f, // 10 000 0000 0101 111 - DBGWCR6_EL1 = 0x8037, // 10 000 0000 0110 111 - DBGWCR7_EL1 = 0x803f, // 10 000 0000 0111 111 - DBGWCR8_EL1 = 0x8047, // 10 000 0000 1000 111 - DBGWCR9_EL1 = 0x804f, // 10 000 0000 1001 111 - DBGWCR10_EL1 = 0x8057, // 10 000 0000 1010 111 - DBGWCR11_EL1 = 0x805f, // 10 000 0000 1011 111 - DBGWCR12_EL1 = 0x8067, // 10 000 0000 1100 111 - DBGWCR13_EL1 = 0x806f, // 10 000 0000 1101 111 - DBGWCR14_EL1 = 0x8077, // 10 000 0000 1110 111 - DBGWCR15_EL1 = 0x807f, // 10 000 0000 1111 111 - TEEHBR32_EL1 = 0x9080, // 10 010 0001 0000 000 - OSDLR_EL1 = 0x809c, // 10 000 0001 0011 100 - DBGPRCR_EL1 = 0x80a4, // 10 000 0001 0100 100 - DBGCLAIMSET_EL1 = 0x83c6, // 10 000 0111 1000 110 - DBGCLAIMCLR_EL1 
= 0x83ce, // 10 000 0111 1001 110 - CSSELR_EL1 = 0xd000, // 11 010 0000 0000 000 - VPIDR_EL2 = 0xe000, // 11 100 0000 0000 000 - VMPIDR_EL2 = 0xe005, // 11 100 0000 0000 101 - CPACR_EL1 = 0xc082, // 11 000 0001 0000 010 - SCTLR_EL1 = 0xc080, // 11 000 0001 0000 000 - SCTLR_EL2 = 0xe080, // 11 100 0001 0000 000 - SCTLR_EL3 = 0xf080, // 11 110 0001 0000 000 - ACTLR_EL1 = 0xc081, // 11 000 0001 0000 001 - ACTLR_EL2 = 0xe081, // 11 100 0001 0000 001 - ACTLR_EL3 = 0xf081, // 11 110 0001 0000 001 - HCR_EL2 = 0xe088, // 11 100 0001 0001 000 - SCR_EL3 = 0xf088, // 11 110 0001 0001 000 - MDCR_EL2 = 0xe089, // 11 100 0001 0001 001 - SDER32_EL3 = 0xf089, // 11 110 0001 0001 001 - CPTR_EL2 = 0xe08a, // 11 100 0001 0001 010 - CPTR_EL3 = 0xf08a, // 11 110 0001 0001 010 - HSTR_EL2 = 0xe08b, // 11 100 0001 0001 011 - HACR_EL2 = 0xe08f, // 11 100 0001 0001 111 - MDCR_EL3 = 0xf099, // 11 110 0001 0011 001 - TTBR0_EL1 = 0xc100, // 11 000 0010 0000 000 - TTBR0_EL2 = 0xe100, // 11 100 0010 0000 000 - TTBR0_EL3 = 0xf100, // 11 110 0010 0000 000 - TTBR1_EL1 = 0xc101, // 11 000 0010 0000 001 - TCR_EL1 = 0xc102, // 11 000 0010 0000 010 - TCR_EL2 = 0xe102, // 11 100 0010 0000 010 - TCR_EL3 = 0xf102, // 11 110 0010 0000 010 - VTTBR_EL2 = 0xe108, // 11 100 0010 0001 000 - VTCR_EL2 = 0xe10a, // 11 100 0010 0001 010 - DACR32_EL2 = 0xe180, // 11 100 0011 0000 000 - SPSR_EL1 = 0xc200, // 11 000 0100 0000 000 - SPSR_EL2 = 0xe200, // 11 100 0100 0000 000 - SPSR_EL3 = 0xf200, // 11 110 0100 0000 000 - ELR_EL1 = 0xc201, // 11 000 0100 0000 001 - ELR_EL2 = 0xe201, // 11 100 0100 0000 001 - ELR_EL3 = 0xf201, // 11 110 0100 0000 001 - SP_EL0 = 0xc208, // 11 000 0100 0001 000 - SP_EL1 = 0xe208, // 11 100 0100 0001 000 - SP_EL2 = 0xf208, // 11 110 0100 0001 000 - SPSel = 0xc210, // 11 000 0100 0010 000 - NZCV = 0xda10, // 11 011 0100 0010 000 - DAIF = 0xda11, // 11 011 0100 0010 001 - CurrentEL = 0xc212, // 11 000 0100 0010 010 - SPSR_irq = 0xe218, // 11 100 0100 0011 000 - SPSR_abt = 0xe219, // 11 100 0100 0011 001 - SPSR_und = 0xe21a, // 11 100 0100 0011 010 - SPSR_fiq = 0xe21b, // 11 100 0100 0011 011 - FPCR = 0xda20, // 11 011 0100 0100 000 - FPSR = 0xda21, // 11 011 0100 0100 001 - DSPSR_EL0 = 0xda28, // 11 011 0100 0101 000 - DLR_EL0 = 0xda29, // 11 011 0100 0101 001 - IFSR32_EL2 = 0xe281, // 11 100 0101 0000 001 - AFSR0_EL1 = 0xc288, // 11 000 0101 0001 000 - AFSR0_EL2 = 0xe288, // 11 100 0101 0001 000 - AFSR0_EL3 = 0xf288, // 11 110 0101 0001 000 - AFSR1_EL1 = 0xc289, // 11 000 0101 0001 001 - AFSR1_EL2 = 0xe289, // 11 100 0101 0001 001 - AFSR1_EL3 = 0xf289, // 11 110 0101 0001 001 - ESR_EL1 = 0xc290, // 11 000 0101 0010 000 - ESR_EL2 = 0xe290, // 11 100 0101 0010 000 - ESR_EL3 = 0xf290, // 11 110 0101 0010 000 - FPEXC32_EL2 = 0xe298, // 11 100 0101 0011 000 - FAR_EL1 = 0xc300, // 11 000 0110 0000 000 - FAR_EL2 = 0xe300, // 11 100 0110 0000 000 - FAR_EL3 = 0xf300, // 11 110 0110 0000 000 - HPFAR_EL2 = 0xe304, // 11 100 0110 0000 100 - PAR_EL1 = 0xc3a0, // 11 000 0111 0100 000 - PMCR_EL0 = 0xdce0, // 11 011 1001 1100 000 - PMCNTENSET_EL0 = 0xdce1, // 11 011 1001 1100 001 - PMCNTENCLR_EL0 = 0xdce2, // 11 011 1001 1100 010 - PMOVSCLR_EL0 = 0xdce3, // 11 011 1001 1100 011 - PMSELR_EL0 = 0xdce5, // 11 011 1001 1100 101 - PMCCNTR_EL0 = 0xdce8, // 11 011 1001 1101 000 - PMXEVTYPER_EL0 = 0xdce9, // 11 011 1001 1101 001 - PMXEVCNTR_EL0 = 0xdcea, // 11 011 1001 1101 010 - PMUSERENR_EL0 = 0xdcf0, // 11 011 1001 1110 000 - PMINTENSET_EL1 = 0xc4f1, // 11 000 1001 1110 001 - PMINTENCLR_EL1 = 0xc4f2, // 11 000 1001 1110 010 - 
PMOVSSET_EL0 = 0xdcf3, // 11 011 1001 1110 011 - MAIR_EL1 = 0xc510, // 11 000 1010 0010 000 - MAIR_EL2 = 0xe510, // 11 100 1010 0010 000 - MAIR_EL3 = 0xf510, // 11 110 1010 0010 000 - AMAIR_EL1 = 0xc518, // 11 000 1010 0011 000 - AMAIR_EL2 = 0xe518, // 11 100 1010 0011 000 - AMAIR_EL3 = 0xf518, // 11 110 1010 0011 000 - VBAR_EL1 = 0xc600, // 11 000 1100 0000 000 - VBAR_EL2 = 0xe600, // 11 100 1100 0000 000 - VBAR_EL3 = 0xf600, // 11 110 1100 0000 000 - RMR_EL1 = 0xc602, // 11 000 1100 0000 010 - RMR_EL2 = 0xe602, // 11 100 1100 0000 010 - RMR_EL3 = 0xf602, // 11 110 1100 0000 010 - CONTEXTIDR_EL1 = 0xc681, // 11 000 1101 0000 001 - TPIDR_EL0 = 0xde82, // 11 011 1101 0000 010 - TPIDR_EL2 = 0xe682, // 11 100 1101 0000 010 - TPIDR_EL3 = 0xf682, // 11 110 1101 0000 010 - TPIDRRO_EL0 = 0xde83, // 11 011 1101 0000 011 - TPIDR_EL1 = 0xc684, // 11 000 1101 0000 100 - CNTFRQ_EL0 = 0xdf00, // 11 011 1110 0000 000 - CNTVOFF_EL2 = 0xe703, // 11 100 1110 0000 011 - CNTKCTL_EL1 = 0xc708, // 11 000 1110 0001 000 - CNTHCTL_EL2 = 0xe708, // 11 100 1110 0001 000 - CNTP_TVAL_EL0 = 0xdf10, // 11 011 1110 0010 000 - CNTHP_TVAL_EL2 = 0xe710, // 11 100 1110 0010 000 - CNTPS_TVAL_EL1 = 0xff10, // 11 111 1110 0010 000 - CNTP_CTL_EL0 = 0xdf11, // 11 011 1110 0010 001 - CNTHP_CTL_EL2 = 0xe711, // 11 100 1110 0010 001 - CNTPS_CTL_EL1 = 0xff11, // 11 111 1110 0010 001 - CNTP_CVAL_EL0 = 0xdf12, // 11 011 1110 0010 010 - CNTHP_CVAL_EL2 = 0xe712, // 11 100 1110 0010 010 - CNTPS_CVAL_EL1 = 0xff12, // 11 111 1110 0010 010 - CNTV_TVAL_EL0 = 0xdf18, // 11 011 1110 0011 000 - CNTV_CTL_EL0 = 0xdf19, // 11 011 1110 0011 001 - CNTV_CVAL_EL0 = 0xdf1a, // 11 011 1110 0011 010 - PMEVCNTR0_EL0 = 0xdf40, // 11 011 1110 1000 000 - PMEVCNTR1_EL0 = 0xdf41, // 11 011 1110 1000 001 - PMEVCNTR2_EL0 = 0xdf42, // 11 011 1110 1000 010 - PMEVCNTR3_EL0 = 0xdf43, // 11 011 1110 1000 011 - PMEVCNTR4_EL0 = 0xdf44, // 11 011 1110 1000 100 - PMEVCNTR5_EL0 = 0xdf45, // 11 011 1110 1000 101 - PMEVCNTR6_EL0 = 0xdf46, // 11 011 1110 1000 110 - PMEVCNTR7_EL0 = 0xdf47, // 11 011 1110 1000 111 - PMEVCNTR8_EL0 = 0xdf48, // 11 011 1110 1001 000 - PMEVCNTR9_EL0 = 0xdf49, // 11 011 1110 1001 001 - PMEVCNTR10_EL0 = 0xdf4a, // 11 011 1110 1001 010 - PMEVCNTR11_EL0 = 0xdf4b, // 11 011 1110 1001 011 - PMEVCNTR12_EL0 = 0xdf4c, // 11 011 1110 1001 100 - PMEVCNTR13_EL0 = 0xdf4d, // 11 011 1110 1001 101 - PMEVCNTR14_EL0 = 0xdf4e, // 11 011 1110 1001 110 - PMEVCNTR15_EL0 = 0xdf4f, // 11 011 1110 1001 111 - PMEVCNTR16_EL0 = 0xdf50, // 11 011 1110 1010 000 - PMEVCNTR17_EL0 = 0xdf51, // 11 011 1110 1010 001 - PMEVCNTR18_EL0 = 0xdf52, // 11 011 1110 1010 010 - PMEVCNTR19_EL0 = 0xdf53, // 11 011 1110 1010 011 - PMEVCNTR20_EL0 = 0xdf54, // 11 011 1110 1010 100 - PMEVCNTR21_EL0 = 0xdf55, // 11 011 1110 1010 101 - PMEVCNTR22_EL0 = 0xdf56, // 11 011 1110 1010 110 - PMEVCNTR23_EL0 = 0xdf57, // 11 011 1110 1010 111 - PMEVCNTR24_EL0 = 0xdf58, // 11 011 1110 1011 000 - PMEVCNTR25_EL0 = 0xdf59, // 11 011 1110 1011 001 - PMEVCNTR26_EL0 = 0xdf5a, // 11 011 1110 1011 010 - PMEVCNTR27_EL0 = 0xdf5b, // 11 011 1110 1011 011 - PMEVCNTR28_EL0 = 0xdf5c, // 11 011 1110 1011 100 - PMEVCNTR29_EL0 = 0xdf5d, // 11 011 1110 1011 101 - PMEVCNTR30_EL0 = 0xdf5e, // 11 011 1110 1011 110 - PMCCFILTR_EL0 = 0xdf7f, // 11 011 1110 1111 111 - PMEVTYPER0_EL0 = 0xdf60, // 11 011 1110 1100 000 - PMEVTYPER1_EL0 = 0xdf61, // 11 011 1110 1100 001 - PMEVTYPER2_EL0 = 0xdf62, // 11 011 1110 1100 010 - PMEVTYPER3_EL0 = 0xdf63, // 11 011 1110 1100 011 - PMEVTYPER4_EL0 = 0xdf64, // 11 011 1110 1100 100 - 
PMEVTYPER5_EL0 = 0xdf65, // 11 011 1110 1100 101 - PMEVTYPER6_EL0 = 0xdf66, // 11 011 1110 1100 110 - PMEVTYPER7_EL0 = 0xdf67, // 11 011 1110 1100 111 - PMEVTYPER8_EL0 = 0xdf68, // 11 011 1110 1101 000 - PMEVTYPER9_EL0 = 0xdf69, // 11 011 1110 1101 001 - PMEVTYPER10_EL0 = 0xdf6a, // 11 011 1110 1101 010 - PMEVTYPER11_EL0 = 0xdf6b, // 11 011 1110 1101 011 - PMEVTYPER12_EL0 = 0xdf6c, // 11 011 1110 1101 100 - PMEVTYPER13_EL0 = 0xdf6d, // 11 011 1110 1101 101 - PMEVTYPER14_EL0 = 0xdf6e, // 11 011 1110 1101 110 - PMEVTYPER15_EL0 = 0xdf6f, // 11 011 1110 1101 111 - PMEVTYPER16_EL0 = 0xdf70, // 11 011 1110 1110 000 - PMEVTYPER17_EL0 = 0xdf71, // 11 011 1110 1110 001 - PMEVTYPER18_EL0 = 0xdf72, // 11 011 1110 1110 010 - PMEVTYPER19_EL0 = 0xdf73, // 11 011 1110 1110 011 - PMEVTYPER20_EL0 = 0xdf74, // 11 011 1110 1110 100 - PMEVTYPER21_EL0 = 0xdf75, // 11 011 1110 1110 101 - PMEVTYPER22_EL0 = 0xdf76, // 11 011 1110 1110 110 - PMEVTYPER23_EL0 = 0xdf77, // 11 011 1110 1110 111 - PMEVTYPER24_EL0 = 0xdf78, // 11 011 1110 1111 000 - PMEVTYPER25_EL0 = 0xdf79, // 11 011 1110 1111 001 - PMEVTYPER26_EL0 = 0xdf7a, // 11 011 1110 1111 010 - PMEVTYPER27_EL0 = 0xdf7b, // 11 011 1110 1111 011 - PMEVTYPER28_EL0 = 0xdf7c, // 11 011 1110 1111 100 - PMEVTYPER29_EL0 = 0xdf7d, // 11 011 1110 1111 101 - PMEVTYPER30_EL0 = 0xdf7e, // 11 011 1110 1111 110 - - // Trace registers - TRCPRGCTLR = 0x8808, // 10 001 0000 0001 000 - TRCPROCSELR = 0x8810, // 10 001 0000 0010 000 - TRCCONFIGR = 0x8820, // 10 001 0000 0100 000 - TRCAUXCTLR = 0x8830, // 10 001 0000 0110 000 - TRCEVENTCTL0R = 0x8840, // 10 001 0000 1000 000 - TRCEVENTCTL1R = 0x8848, // 10 001 0000 1001 000 - TRCSTALLCTLR = 0x8858, // 10 001 0000 1011 000 - TRCTSCTLR = 0x8860, // 10 001 0000 1100 000 - TRCSYNCPR = 0x8868, // 10 001 0000 1101 000 - TRCCCCTLR = 0x8870, // 10 001 0000 1110 000 - TRCBBCTLR = 0x8878, // 10 001 0000 1111 000 - TRCTRACEIDR = 0x8801, // 10 001 0000 0000 001 - TRCQCTLR = 0x8809, // 10 001 0000 0001 001 - TRCVICTLR = 0x8802, // 10 001 0000 0000 010 - TRCVIIECTLR = 0x880a, // 10 001 0000 0001 010 - TRCVISSCTLR = 0x8812, // 10 001 0000 0010 010 - TRCVIPCSSCTLR = 0x881a, // 10 001 0000 0011 010 - TRCVDCTLR = 0x8842, // 10 001 0000 1000 010 - TRCVDSACCTLR = 0x884a, // 10 001 0000 1001 010 - TRCVDARCCTLR = 0x8852, // 10 001 0000 1010 010 - TRCSEQEVR0 = 0x8804, // 10 001 0000 0000 100 - TRCSEQEVR1 = 0x880c, // 10 001 0000 0001 100 - TRCSEQEVR2 = 0x8814, // 10 001 0000 0010 100 - TRCSEQRSTEVR = 0x8834, // 10 001 0000 0110 100 - TRCSEQSTR = 0x883c, // 10 001 0000 0111 100 - TRCEXTINSELR = 0x8844, // 10 001 0000 1000 100 - TRCCNTRLDVR0 = 0x8805, // 10 001 0000 0000 101 - TRCCNTRLDVR1 = 0x880d, // 10 001 0000 0001 101 - TRCCNTRLDVR2 = 0x8815, // 10 001 0000 0010 101 - TRCCNTRLDVR3 = 0x881d, // 10 001 0000 0011 101 - TRCCNTCTLR0 = 0x8825, // 10 001 0000 0100 101 - TRCCNTCTLR1 = 0x882d, // 10 001 0000 0101 101 - TRCCNTCTLR2 = 0x8835, // 10 001 0000 0110 101 - TRCCNTCTLR3 = 0x883d, // 10 001 0000 0111 101 - TRCCNTVR0 = 0x8845, // 10 001 0000 1000 101 - TRCCNTVR1 = 0x884d, // 10 001 0000 1001 101 - TRCCNTVR2 = 0x8855, // 10 001 0000 1010 101 - TRCCNTVR3 = 0x885d, // 10 001 0000 1011 101 - TRCIMSPEC0 = 0x8807, // 10 001 0000 0000 111 - TRCIMSPEC1 = 0x880f, // 10 001 0000 0001 111 - TRCIMSPEC2 = 0x8817, // 10 001 0000 0010 111 - TRCIMSPEC3 = 0x881f, // 10 001 0000 0011 111 - TRCIMSPEC4 = 0x8827, // 10 001 0000 0100 111 - TRCIMSPEC5 = 0x882f, // 10 001 0000 0101 111 - TRCIMSPEC6 = 0x8837, // 10 001 0000 0110 111 - TRCIMSPEC7 = 0x883f, // 10 001 0000 0111 111 - 
TRCRSCTLR2 = 0x8890, // 10 001 0001 0010 000 - TRCRSCTLR3 = 0x8898, // 10 001 0001 0011 000 - TRCRSCTLR4 = 0x88a0, // 10 001 0001 0100 000 - TRCRSCTLR5 = 0x88a8, // 10 001 0001 0101 000 - TRCRSCTLR6 = 0x88b0, // 10 001 0001 0110 000 - TRCRSCTLR7 = 0x88b8, // 10 001 0001 0111 000 - TRCRSCTLR8 = 0x88c0, // 10 001 0001 1000 000 - TRCRSCTLR9 = 0x88c8, // 10 001 0001 1001 000 - TRCRSCTLR10 = 0x88d0, // 10 001 0001 1010 000 - TRCRSCTLR11 = 0x88d8, // 10 001 0001 1011 000 - TRCRSCTLR12 = 0x88e0, // 10 001 0001 1100 000 - TRCRSCTLR13 = 0x88e8, // 10 001 0001 1101 000 - TRCRSCTLR14 = 0x88f0, // 10 001 0001 1110 000 - TRCRSCTLR15 = 0x88f8, // 10 001 0001 1111 000 - TRCRSCTLR16 = 0x8881, // 10 001 0001 0000 001 - TRCRSCTLR17 = 0x8889, // 10 001 0001 0001 001 - TRCRSCTLR18 = 0x8891, // 10 001 0001 0010 001 - TRCRSCTLR19 = 0x8899, // 10 001 0001 0011 001 - TRCRSCTLR20 = 0x88a1, // 10 001 0001 0100 001 - TRCRSCTLR21 = 0x88a9, // 10 001 0001 0101 001 - TRCRSCTLR22 = 0x88b1, // 10 001 0001 0110 001 - TRCRSCTLR23 = 0x88b9, // 10 001 0001 0111 001 - TRCRSCTLR24 = 0x88c1, // 10 001 0001 1000 001 - TRCRSCTLR25 = 0x88c9, // 10 001 0001 1001 001 - TRCRSCTLR26 = 0x88d1, // 10 001 0001 1010 001 - TRCRSCTLR27 = 0x88d9, // 10 001 0001 1011 001 - TRCRSCTLR28 = 0x88e1, // 10 001 0001 1100 001 - TRCRSCTLR29 = 0x88e9, // 10 001 0001 1101 001 - TRCRSCTLR30 = 0x88f1, // 10 001 0001 1110 001 - TRCRSCTLR31 = 0x88f9, // 10 001 0001 1111 001 - TRCSSCCR0 = 0x8882, // 10 001 0001 0000 010 - TRCSSCCR1 = 0x888a, // 10 001 0001 0001 010 - TRCSSCCR2 = 0x8892, // 10 001 0001 0010 010 - TRCSSCCR3 = 0x889a, // 10 001 0001 0011 010 - TRCSSCCR4 = 0x88a2, // 10 001 0001 0100 010 - TRCSSCCR5 = 0x88aa, // 10 001 0001 0101 010 - TRCSSCCR6 = 0x88b2, // 10 001 0001 0110 010 - TRCSSCCR7 = 0x88ba, // 10 001 0001 0111 010 - TRCSSCSR0 = 0x88c2, // 10 001 0001 1000 010 - TRCSSCSR1 = 0x88ca, // 10 001 0001 1001 010 - TRCSSCSR2 = 0x88d2, // 10 001 0001 1010 010 - TRCSSCSR3 = 0x88da, // 10 001 0001 1011 010 - TRCSSCSR4 = 0x88e2, // 10 001 0001 1100 010 - TRCSSCSR5 = 0x88ea, // 10 001 0001 1101 010 - TRCSSCSR6 = 0x88f2, // 10 001 0001 1110 010 - TRCSSCSR7 = 0x88fa, // 10 001 0001 1111 010 - TRCSSPCICR0 = 0x8883, // 10 001 0001 0000 011 - TRCSSPCICR1 = 0x888b, // 10 001 0001 0001 011 - TRCSSPCICR2 = 0x8893, // 10 001 0001 0010 011 - TRCSSPCICR3 = 0x889b, // 10 001 0001 0011 011 - TRCSSPCICR4 = 0x88a3, // 10 001 0001 0100 011 - TRCSSPCICR5 = 0x88ab, // 10 001 0001 0101 011 - TRCSSPCICR6 = 0x88b3, // 10 001 0001 0110 011 - TRCSSPCICR7 = 0x88bb, // 10 001 0001 0111 011 - TRCPDCR = 0x88a4, // 10 001 0001 0100 100 - TRCACVR0 = 0x8900, // 10 001 0010 0000 000 - TRCACVR1 = 0x8910, // 10 001 0010 0010 000 - TRCACVR2 = 0x8920, // 10 001 0010 0100 000 - TRCACVR3 = 0x8930, // 10 001 0010 0110 000 - TRCACVR4 = 0x8940, // 10 001 0010 1000 000 - TRCACVR5 = 0x8950, // 10 001 0010 1010 000 - TRCACVR6 = 0x8960, // 10 001 0010 1100 000 - TRCACVR7 = 0x8970, // 10 001 0010 1110 000 - TRCACVR8 = 0x8901, // 10 001 0010 0000 001 - TRCACVR9 = 0x8911, // 10 001 0010 0010 001 - TRCACVR10 = 0x8921, // 10 001 0010 0100 001 - TRCACVR11 = 0x8931, // 10 001 0010 0110 001 - TRCACVR12 = 0x8941, // 10 001 0010 1000 001 - TRCACVR13 = 0x8951, // 10 001 0010 1010 001 - TRCACVR14 = 0x8961, // 10 001 0010 1100 001 - TRCACVR15 = 0x8971, // 10 001 0010 1110 001 - TRCACATR0 = 0x8902, // 10 001 0010 0000 010 - TRCACATR1 = 0x8912, // 10 001 0010 0010 010 - TRCACATR2 = 0x8922, // 10 001 0010 0100 010 - TRCACATR3 = 0x8932, // 10 001 0010 0110 010 - TRCACATR4 = 0x8942, // 10 001 0010 1000 010 - 
TRCACATR5 = 0x8952, // 10 001 0010 1010 010 - TRCACATR6 = 0x8962, // 10 001 0010 1100 010 - TRCACATR7 = 0x8972, // 10 001 0010 1110 010 - TRCACATR8 = 0x8903, // 10 001 0010 0000 011 - TRCACATR9 = 0x8913, // 10 001 0010 0010 011 - TRCACATR10 = 0x8923, // 10 001 0010 0100 011 - TRCACATR11 = 0x8933, // 10 001 0010 0110 011 - TRCACATR12 = 0x8943, // 10 001 0010 1000 011 - TRCACATR13 = 0x8953, // 10 001 0010 1010 011 - TRCACATR14 = 0x8963, // 10 001 0010 1100 011 - TRCACATR15 = 0x8973, // 10 001 0010 1110 011 - TRCDVCVR0 = 0x8904, // 10 001 0010 0000 100 - TRCDVCVR1 = 0x8924, // 10 001 0010 0100 100 - TRCDVCVR2 = 0x8944, // 10 001 0010 1000 100 - TRCDVCVR3 = 0x8964, // 10 001 0010 1100 100 - TRCDVCVR4 = 0x8905, // 10 001 0010 0000 101 - TRCDVCVR5 = 0x8925, // 10 001 0010 0100 101 - TRCDVCVR6 = 0x8945, // 10 001 0010 1000 101 - TRCDVCVR7 = 0x8965, // 10 001 0010 1100 101 - TRCDVCMR0 = 0x8906, // 10 001 0010 0000 110 - TRCDVCMR1 = 0x8926, // 10 001 0010 0100 110 - TRCDVCMR2 = 0x8946, // 10 001 0010 1000 110 - TRCDVCMR3 = 0x8966, // 10 001 0010 1100 110 - TRCDVCMR4 = 0x8907, // 10 001 0010 0000 111 - TRCDVCMR5 = 0x8927, // 10 001 0010 0100 111 - TRCDVCMR6 = 0x8947, // 10 001 0010 1000 111 - TRCDVCMR7 = 0x8967, // 10 001 0010 1100 111 - TRCCIDCVR0 = 0x8980, // 10 001 0011 0000 000 - TRCCIDCVR1 = 0x8990, // 10 001 0011 0010 000 - TRCCIDCVR2 = 0x89a0, // 10 001 0011 0100 000 - TRCCIDCVR3 = 0x89b0, // 10 001 0011 0110 000 - TRCCIDCVR4 = 0x89c0, // 10 001 0011 1000 000 - TRCCIDCVR5 = 0x89d0, // 10 001 0011 1010 000 - TRCCIDCVR6 = 0x89e0, // 10 001 0011 1100 000 - TRCCIDCVR7 = 0x89f0, // 10 001 0011 1110 000 - TRCVMIDCVR0 = 0x8981, // 10 001 0011 0000 001 - TRCVMIDCVR1 = 0x8991, // 10 001 0011 0010 001 - TRCVMIDCVR2 = 0x89a1, // 10 001 0011 0100 001 - TRCVMIDCVR3 = 0x89b1, // 10 001 0011 0110 001 - TRCVMIDCVR4 = 0x89c1, // 10 001 0011 1000 001 - TRCVMIDCVR5 = 0x89d1, // 10 001 0011 1010 001 - TRCVMIDCVR6 = 0x89e1, // 10 001 0011 1100 001 - TRCVMIDCVR7 = 0x89f1, // 10 001 0011 1110 001 - TRCCIDCCTLR0 = 0x8982, // 10 001 0011 0000 010 - TRCCIDCCTLR1 = 0x898a, // 10 001 0011 0001 010 - TRCVMIDCCTLR0 = 0x8992, // 10 001 0011 0010 010 - TRCVMIDCCTLR1 = 0x899a, // 10 001 0011 0011 010 - TRCITCTRL = 0x8b84, // 10 001 0111 0000 100 - TRCCLAIMSET = 0x8bc6, // 10 001 0111 1000 110 - TRCCLAIMCLR = 0x8bce, // 10 001 0111 1001 110 - - // GICv3 registers - ICC_BPR1_EL1 = 0xc663, // 11 000 1100 1100 011 - ICC_BPR0_EL1 = 0xc643, // 11 000 1100 1000 011 - ICC_PMR_EL1 = 0xc230, // 11 000 0100 0110 000 - ICC_CTLR_EL1 = 0xc664, // 11 000 1100 1100 100 - ICC_CTLR_EL3 = 0xf664, // 11 110 1100 1100 100 - ICC_SRE_EL1 = 0xc665, // 11 000 1100 1100 101 - ICC_SRE_EL2 = 0xe64d, // 11 100 1100 1001 101 - ICC_SRE_EL3 = 0xf665, // 11 110 1100 1100 101 - ICC_IGRPEN0_EL1 = 0xc666, // 11 000 1100 1100 110 - ICC_IGRPEN1_EL1 = 0xc667, // 11 000 1100 1100 111 - ICC_IGRPEN1_EL3 = 0xf667, // 11 110 1100 1100 111 - ICC_SEIEN_EL1 = 0xc668, // 11 000 1100 1101 000 - ICC_AP0R0_EL1 = 0xc644, // 11 000 1100 1000 100 - ICC_AP0R1_EL1 = 0xc645, // 11 000 1100 1000 101 - ICC_AP0R2_EL1 = 0xc646, // 11 000 1100 1000 110 - ICC_AP0R3_EL1 = 0xc647, // 11 000 1100 1000 111 - ICC_AP1R0_EL1 = 0xc648, // 11 000 1100 1001 000 - ICC_AP1R1_EL1 = 0xc649, // 11 000 1100 1001 001 - ICC_AP1R2_EL1 = 0xc64a, // 11 000 1100 1001 010 - ICC_AP1R3_EL1 = 0xc64b, // 11 000 1100 1001 011 - ICH_AP0R0_EL2 = 0xe640, // 11 100 1100 1000 000 - ICH_AP0R1_EL2 = 0xe641, // 11 100 1100 1000 001 - ICH_AP0R2_EL2 = 0xe642, // 11 100 1100 1000 010 - ICH_AP0R3_EL2 = 0xe643, // 11 100 1100 
1000 011 - ICH_AP1R0_EL2 = 0xe648, // 11 100 1100 1001 000 - ICH_AP1R1_EL2 = 0xe649, // 11 100 1100 1001 001 - ICH_AP1R2_EL2 = 0xe64a, // 11 100 1100 1001 010 - ICH_AP1R3_EL2 = 0xe64b, // 11 100 1100 1001 011 - ICH_HCR_EL2 = 0xe658, // 11 100 1100 1011 000 - ICH_MISR_EL2 = 0xe65a, // 11 100 1100 1011 010 - ICH_VMCR_EL2 = 0xe65f, // 11 100 1100 1011 111 - ICH_VSEIR_EL2 = 0xe64c, // 11 100 1100 1001 100 - ICH_LR0_EL2 = 0xe660, // 11 100 1100 1100 000 - ICH_LR1_EL2 = 0xe661, // 11 100 1100 1100 001 - ICH_LR2_EL2 = 0xe662, // 11 100 1100 1100 010 - ICH_LR3_EL2 = 0xe663, // 11 100 1100 1100 011 - ICH_LR4_EL2 = 0xe664, // 11 100 1100 1100 100 - ICH_LR5_EL2 = 0xe665, // 11 100 1100 1100 101 - ICH_LR6_EL2 = 0xe666, // 11 100 1100 1100 110 - ICH_LR7_EL2 = 0xe667, // 11 100 1100 1100 111 - ICH_LR8_EL2 = 0xe668, // 11 100 1100 1101 000 - ICH_LR9_EL2 = 0xe669, // 11 100 1100 1101 001 - ICH_LR10_EL2 = 0xe66a, // 11 100 1100 1101 010 - ICH_LR11_EL2 = 0xe66b, // 11 100 1100 1101 011 - ICH_LR12_EL2 = 0xe66c, // 11 100 1100 1101 100 - ICH_LR13_EL2 = 0xe66d, // 11 100 1100 1101 101 - ICH_LR14_EL2 = 0xe66e, // 11 100 1100 1101 110 - ICH_LR15_EL2 = 0xe66f, // 11 100 1100 1101 111 - }; - - // Cyclone specific system registers - enum CycloneSysRegValues { - CPM_IOACC_CTL_EL3 = 0xff90 - }; - - // Note that these do not inherit from ARM64NamedImmMapper. This class is - // sufficiently different in its behaviour that I don't believe it's worth - // burdening the common ARM64NamedImmMapper with abstractions only needed in - // this one case. - struct SysRegMapper { - static const ARM64NamedImmMapper::Mapping SysRegPairs[]; - static const ARM64NamedImmMapper::Mapping CycloneSysRegPairs[]; - - const ARM64NamedImmMapper::Mapping *InstPairs; - size_t NumInstPairs; - uint64_t FeatureBits; - - SysRegMapper(uint64_t FeatureBits) : FeatureBits(FeatureBits) { } - uint32_t fromString(StringRef Name, bool &Valid) const; - std::string toString(uint32_t Bits, bool &Valid) const; - }; - - struct MSRMapper : SysRegMapper { - static const ARM64NamedImmMapper::Mapping MSRPairs[]; - MSRMapper(uint64_t FeatureBits); - }; - - struct MRSMapper : SysRegMapper { - static const ARM64NamedImmMapper::Mapping MRSPairs[]; - MRSMapper(uint64_t FeatureBits); - }; - - uint32_t ParseGenericRegister(StringRef Name, bool &Valid); -} - -namespace ARM64TLBI { - enum TLBIValues { - Invalid = -1, // Op0 Op1 CRn CRm Op2 - IPAS2E1IS = 0x6401, // 01 100 1000 0000 001 - IPAS2LE1IS = 0x6405, // 01 100 1000 0000 101 - VMALLE1IS = 0x4418, // 01 000 1000 0011 000 - ALLE2IS = 0x6418, // 01 100 1000 0011 000 - ALLE3IS = 0x7418, // 01 110 1000 0011 000 - VAE1IS = 0x4419, // 01 000 1000 0011 001 - VAE2IS = 0x6419, // 01 100 1000 0011 001 - VAE3IS = 0x7419, // 01 110 1000 0011 001 - ASIDE1IS = 0x441a, // 01 000 1000 0011 010 - VAAE1IS = 0x441b, // 01 000 1000 0011 011 - ALLE1IS = 0x641c, // 01 100 1000 0011 100 - VALE1IS = 0x441d, // 01 000 1000 0011 101 - VALE2IS = 0x641d, // 01 100 1000 0011 101 - VALE3IS = 0x741d, // 01 110 1000 0011 101 - VMALLS12E1IS = 0x641e, // 01 100 1000 0011 110 - VAALE1IS = 0x441f, // 01 000 1000 0011 111 - IPAS2E1 = 0x6421, // 01 100 1000 0100 001 - IPAS2LE1 = 0x6425, // 01 100 1000 0100 101 - VMALLE1 = 0x4438, // 01 000 1000 0111 000 - ALLE2 = 0x6438, // 01 100 1000 0111 000 - ALLE3 = 0x7438, // 01 110 1000 0111 000 - VAE1 = 0x4439, // 01 000 1000 0111 001 - VAE2 = 0x6439, // 01 100 1000 0111 001 - VAE3 = 0x7439, // 01 110 1000 0111 001 - ASIDE1 = 0x443a, // 01 000 1000 0111 010 - VAAE1 = 0x443b, // 01 000 1000 0111 011 - ALLE1 = 
0x643c, // 01 100 1000 0111 100 - VALE1 = 0x443d, // 01 000 1000 0111 101 - VALE2 = 0x643d, // 01 100 1000 0111 101 - VALE3 = 0x743d, // 01 110 1000 0111 101 - VMALLS12E1 = 0x643e, // 01 100 1000 0111 110 - VAALE1 = 0x443f // 01 000 1000 0111 111 - }; - - struct TLBIMapper : ARM64NamedImmMapper { - const static Mapping TLBIPairs[]; - - TLBIMapper(); - }; - - static inline bool NeedsRegister(TLBIValues Val) { - switch (Val) { - case VMALLE1IS: - case ALLE2IS: - case ALLE3IS: - case ALLE1IS: - case VMALLS12E1IS: - case VMALLE1: - case ALLE2: - case ALLE3: - case ALLE1: - case VMALLS12E1: - return false; - default: - return true; - } - } -} - -namespace ARM64II { - /// Target Operand Flag enum. - enum TOF { - //===------------------------------------------------------------------===// - // ARM64 Specific MachineOperand flags. - - MO_NO_FLAG, - - MO_FRAGMENT = 0x7, - - /// MO_PAGE - A symbol operand with this flag represents the pc-relative - /// offset of the 4K page containing the symbol. This is used with the - /// ADRP instruction. - MO_PAGE = 1, - - /// MO_PAGEOFF - A symbol operand with this flag represents the offset of - /// that symbol within a 4K page. This offset is added to the page address - /// to produce the complete address. - MO_PAGEOFF = 2, - - /// MO_G3 - A symbol operand with this flag (granule 3) represents the high - /// 16-bits of a 64-bit address, used in a MOVZ or MOVK instruction - MO_G3 = 3, - - /// MO_G2 - A symbol operand with this flag (granule 2) represents the bits - /// 32-47 of a 64-bit address, used in a MOVZ or MOVK instruction - MO_G2 = 4, - - /// MO_G1 - A symbol operand with this flag (granule 1) represents the bits - /// 16-31 of a 64-bit address, used in a MOVZ or MOVK instruction - MO_G1 = 5, - - /// MO_G0 - A symbol operand with this flag (granule 0) represents the bits - /// 0-15 of a 64-bit address, used in a MOVZ or MOVK instruction - MO_G0 = 6, - - /// MO_GOT - This flag indicates that a symbol operand represents the - /// address of the GOT entry for the symbol, rather than the address of - /// the symbol itself. - MO_GOT = 8, - - /// MO_NC - Indicates whether the linker is expected to check the symbol - /// reference for overflow. For example in an ADRP/ADD pair of relocations - /// the ADRP usually does check, but not the ADD. - MO_NC = 0x10, - - /// MO_TLS - Indicates that the operand being accessed is some kind of - /// thread-local symbol. On Darwin, only one type of thread-local access - /// exists (pre linker-relaxation), but on ELF the TLSModel used for the - /// referee will affect interpretation. - MO_TLS = 0x20 - }; -} // end namespace ARM64II - -} // end namespace llvm - -#endif diff --git a/lib/Target/ARM64/Utils/CMakeLists.txt b/lib/Target/ARM64/Utils/CMakeLists.txt deleted file mode 100644 index f69076f4ef6..00000000000 --- a/lib/Target/ARM64/Utils/CMakeLists.txt +++ /dev/null @@ -1,3 +0,0 @@ -add_llvm_library(LLVMARM64Utils - ARM64BaseInfo.cpp - ) diff --git a/lib/Target/ARM64/Utils/LLVMBuild.txt b/lib/Target/ARM64/Utils/LLVMBuild.txt deleted file mode 100644 index 232dca29f40..00000000000 --- a/lib/Target/ARM64/Utils/LLVMBuild.txt +++ /dev/null @@ -1,23 +0,0 @@ -;===- ./lib/Target/ARM64/Utils/LLVMBuild.txt ----------------*- Conf -*--===; -; -; The LLVM Compiler Infrastructure -; -; This file is distributed under the University of Illinois Open Source -; License. See LICENSE.TXT for details. 
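(The MO_PAGE/MO_PAGEOFF and MO_G0..MO_G3 operand-flag comments above describe how a 64-bit address is split for materialisation: ADRP works in units of the 4 KiB page containing the symbol, the offset within that page is added separately, and MOVZ/MOVK view the address as four 16-bit granules. A small stand-alone sketch of that arithmetic follows; the address used is an arbitrary example, not anything from this patch.)

// Splits an address the way the operand-flag comments describe: the 4 KiB
// page containing the symbol (ADRP uses its PC-relative form, MO_PAGE),
// the offset within that page (MO_PAGEOFF), and the four 16-bit granules
// G0..G3 used by MOVZ/MOVK.
#include <cstdint>
#include <cstdio>

int main() {
  uint64_t Addr = 0x0000123456789abcULL; // hypothetical symbol address

  uint64_t Page    = Addr & ~UINT64_C(0xfff); // 4 KiB page containing Addr
  uint64_t PageOff = Addr &  UINT64_C(0xfff); // offset added to the page address

  // Granules: G0 = bits 0-15, G1 = bits 16-31, G2 = bits 32-47, G3 = bits 48-63.
  for (int G = 0; G < 4; ++G)
    std::printf("G%d = 0x%04llx\n", G,
                (unsigned long long)((Addr >> (16 * G)) & 0xffff));

  std::printf("page = 0x%llx, pageoff = 0x%llx\n",
              (unsigned long long)Page, (unsigned long long)PageOff);
  return 0;
}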
-; -;===------------------------------------------------------------------------===; -; -; This is an LLVMBuild description file for the components in this subdirectory. -; -; For more information on the LLVMBuild system, please see: -; -; http://llvm.org/docs/LLVMBuild.html -; -;===------------------------------------------------------------------------===; - -[component_0] -type = Library -name = ARM64Utils -parent = ARM64 -required_libraries = Support -add_to_library_groups = ARM64 diff --git a/lib/Target/ARM64/Utils/Makefile b/lib/Target/ARM64/Utils/Makefile deleted file mode 100644 index 6491ad9a07b..00000000000 --- a/lib/Target/ARM64/Utils/Makefile +++ /dev/null @@ -1,15 +0,0 @@ -##===- lib/Target/ARM64/Utils/Makefile -------------------*- Makefile -*-===## -# -# The LLVM Compiler Infrastructure -# -# This file is distributed under the University of Illinois Open Source -# License. See LICENSE.TXT for details. -# -##===----------------------------------------------------------------------===## -LEVEL = ../../../.. -LIBRARYNAME = LLVMARM64Utils - -# Hack: we need to include 'main' ARM64 target directory to grab private headers -CPP.Flags += -I$(PROJ_OBJ_DIR)/.. -I$(PROJ_SRC_DIR)/.. - -include $(LEVEL)/Makefile.common diff --git a/lib/Target/LLVMBuild.txt b/lib/Target/LLVMBuild.txt index da2309ba0cb..1b0837cb3b5 100644 --- a/lib/Target/LLVMBuild.txt +++ b/lib/Target/LLVMBuild.txt @@ -16,7 +16,7 @@ ;===------------------------------------------------------------------------===; [common] -subdirectories = ARM ARM64 CppBackend Hexagon MSP430 NVPTX Mips PowerPC R600 Sparc SystemZ X86 XCore +subdirectories = ARM AArch64 CppBackend Hexagon MSP430 NVPTX Mips PowerPC R600 Sparc SystemZ X86 XCore ; This is a special group whose required libraries are extended (by llvm-build) ; with the best execution engine (the native JIT, if available, or the diff --git a/lib/Transforms/InstCombine/InstCombineCalls.cpp b/lib/Transforms/InstCombine/InstCombineCalls.cpp index 18a0f9c167a..be1b5aa50b1 100644 --- a/lib/Transforms/InstCombine/InstCombineCalls.cpp +++ b/lib/Transforms/InstCombine/InstCombineCalls.cpp @@ -836,8 +836,8 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) { case Intrinsic::arm_neon_vmulls: case Intrinsic::arm_neon_vmullu: - case Intrinsic::arm64_neon_smull: - case Intrinsic::arm64_neon_umull: { + case Intrinsic::aarch64_neon_smull: + case Intrinsic::aarch64_neon_umull: { Value *Arg0 = II->getArgOperand(0); Value *Arg1 = II->getArgOperand(1); @@ -848,7 +848,7 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) { // Check for constant LHS & RHS - in this case we just simplify. 
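(For context on the InstCombine hunk above: smull/umull are widening multiplies, so when both operands are constants the intrinsic call can simply be folded, with the Zext flag choosing zero- versus sign-extension. The sketch below models the per-lane semantics in plain C++; it is an illustration of the fold, not the InstCombine code itself.)

// Per-lane model of the NEON widening multiplies referenced above: umull
// zero-extends its 32-bit lanes, smull sign-extends them, and the product
// is 64 bits wide. With constant inputs this is what the simplification
// can compute directly.
#include <cstdint>
#include <cstdio>

static uint64_t umullLane(uint32_t A, uint32_t B) {
  return (uint64_t)A * (uint64_t)B;   // zero-extend, then multiply
}

static int64_t smullLane(int32_t A, int32_t B) {
  return (int64_t)A * (int64_t)B;     // sign-extend, then multiply
}

int main() {
  std::printf("umull: %llu\n", (unsigned long long)umullLane(0xffffffffu, 2));
  std::printf("smull: %lld\n", (long long)smullLane(-1, 2));
  return 0;
}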
bool Zext = (II->getIntrinsicID() == Intrinsic::arm_neon_vmullu || - II->getIntrinsicID() == Intrinsic::arm64_neon_umull); + II->getIntrinsicID() == Intrinsic::aarch64_neon_umull); VectorType *NewVT = cast(II->getType()); if (Constant *CV0 = dyn_cast(Arg0)) { if (Constant *CV1 = dyn_cast(Arg1)) { diff --git a/test/Analysis/CostModel/AArch64/lit.local.cfg b/test/Analysis/CostModel/AArch64/lit.local.cfg new file mode 100644 index 00000000000..c42034979fc --- /dev/null +++ b/test/Analysis/CostModel/AArch64/lit.local.cfg @@ -0,0 +1,3 @@ +targets = set(config.root.targets_to_build.split()) +if not 'AArch64' in targets: + config.unsupported = True diff --git a/test/Analysis/CostModel/AArch64/select.ll b/test/Analysis/CostModel/AArch64/select.ll new file mode 100644 index 00000000000..216dc5ddc48 --- /dev/null +++ b/test/Analysis/CostModel/AArch64/select.ll @@ -0,0 +1,38 @@ +; RUN: opt < %s -cost-model -analyze -mtriple=arm64-apple-ios -mcpu=cyclone | FileCheck %s +target datalayout = "e-p:32:32:32-i1:8:32-i8:8:32-i16:16:32-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:32:64-v128:32:128-a0:0:32-n32-S32" + +; CHECK-LABEL: select +define void @select() { + ; Scalar values + ; CHECK: cost of 1 {{.*}} select + %v1 = select i1 undef, i8 undef, i8 undef + ; CHECK: cost of 1 {{.*}} select + %v2 = select i1 undef, i16 undef, i16 undef + ; CHECK: cost of 1 {{.*}} select + %v3 = select i1 undef, i32 undef, i32 undef + ; CHECK: cost of 1 {{.*}} select + %v4 = select i1 undef, i64 undef, i64 undef + ; CHECK: cost of 1 {{.*}} select + %v5 = select i1 undef, float undef, float undef + ; CHECK: cost of 1 {{.*}} select + %v6 = select i1 undef, double undef, double undef + + ; Vector values - check for vectors that have a high cost because they end up + ; scalarized. + ; CHECK: cost of 320 {{.*}} select + %v13b = select <16 x i1> undef, <16 x i16> undef, <16 x i16> undef + + ; CHECK: cost of 160 {{.*}} select + %v15b = select <8 x i1> undef, <8 x i32> undef, <8 x i32> undef + ; CHECK: cost of 320 {{.*}} select + %v15c = select <16 x i1> undef, <16 x i32> undef, <16 x i32> undef + + ; CHECK: cost of 80 {{.*}} select + %v16a = select <4 x i1> undef, <4 x i64> undef, <4 x i64> undef + ; CHECK: cost of 160 {{.*}} select + %v16b = select <8 x i1> undef, <8 x i64> undef, <8 x i64> undef + ; CHECK: cost of 320 {{.*}} select + %v16c = select <16 x i1> undef, <16 x i64> undef, <16 x i64> undef + + ret void +} diff --git a/test/Analysis/CostModel/AArch64/store.ll b/test/Analysis/CostModel/AArch64/store.ll new file mode 100644 index 00000000000..0c9883cf2a2 --- /dev/null +++ b/test/Analysis/CostModel/AArch64/store.ll @@ -0,0 +1,22 @@ +; RUN: opt < %s -cost-model -analyze -mtriple=arm64-apple-ios -mcpu=cyclone | FileCheck %s +target datalayout = "e-p:32:32:32-i1:8:32-i8:8:32-i16:16:32-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:32:64-v128:32:128-a0:0:32-n32-S32" +; CHECK-LABEL: store +define void @store() { + ; Stores of <2 x i64> should be expensive because we don't split them and + ; and unaligned 16b stores have bad performance. + ; CHECK: cost of 12 {{.*}} store + store <2 x i64> undef, <2 x i64> * undef + + ; We scalarize the loads/stores because there is no vector register name for + ; these types (they get extended to v.4h/v.2s). 
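(The select costs checked in the test above scale with the number of lanes once the operation is scalarized. The quick sanity check below assumes a per-lane cost of 20, a figure inferred only from the CHECK lines themselves (80/4, 160/8, 320/16), not taken from the cost-model source.)

// Reproduces the scalarised select totals from the CHECK lines above under
// the assumption of 20 units per lane.
#include <cstdio>

int main() {
  const int PerLaneCost = 20;          // assumption derived from the test
  const int Lanes[] = {4, 8, 16};
  for (int N : Lanes)
    std::printf("<%d x ...> select: expected cost %d\n", N, N * PerLaneCost);
  return 0;                            // prints 80, 160, 320
}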
+ ; CHECK: cost of 16 {{.*}} store + store <2 x i8> undef, <2 x i8> * undef + ; CHECK: cost of 64 {{.*}} store + store <4 x i8> undef, <4 x i8> * undef + ; CHECK: cost of 16 {{.*}} load + load <2 x i8> * undef + ; CHECK: cost of 64 {{.*}} load + load <4 x i8> * undef + + ret void +} diff --git a/test/Analysis/CostModel/ARM64/lit.local.cfg b/test/Analysis/CostModel/ARM64/lit.local.cfg deleted file mode 100644 index 84ac9811f01..00000000000 --- a/test/Analysis/CostModel/ARM64/lit.local.cfg +++ /dev/null @@ -1,3 +0,0 @@ -targets = set(config.root.targets_to_build.split()) -if not 'ARM64' in targets: - config.unsupported = True diff --git a/test/Analysis/CostModel/ARM64/select.ll b/test/Analysis/CostModel/ARM64/select.ll deleted file mode 100644 index 216dc5ddc48..00000000000 --- a/test/Analysis/CostModel/ARM64/select.ll +++ /dev/null @@ -1,38 +0,0 @@ -; RUN: opt < %s -cost-model -analyze -mtriple=arm64-apple-ios -mcpu=cyclone | FileCheck %s -target datalayout = "e-p:32:32:32-i1:8:32-i8:8:32-i16:16:32-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:32:64-v128:32:128-a0:0:32-n32-S32" - -; CHECK-LABEL: select -define void @select() { - ; Scalar values - ; CHECK: cost of 1 {{.*}} select - %v1 = select i1 undef, i8 undef, i8 undef - ; CHECK: cost of 1 {{.*}} select - %v2 = select i1 undef, i16 undef, i16 undef - ; CHECK: cost of 1 {{.*}} select - %v3 = select i1 undef, i32 undef, i32 undef - ; CHECK: cost of 1 {{.*}} select - %v4 = select i1 undef, i64 undef, i64 undef - ; CHECK: cost of 1 {{.*}} select - %v5 = select i1 undef, float undef, float undef - ; CHECK: cost of 1 {{.*}} select - %v6 = select i1 undef, double undef, double undef - - ; Vector values - check for vectors that have a high cost because they end up - ; scalarized. - ; CHECK: cost of 320 {{.*}} select - %v13b = select <16 x i1> undef, <16 x i16> undef, <16 x i16> undef - - ; CHECK: cost of 160 {{.*}} select - %v15b = select <8 x i1> undef, <8 x i32> undef, <8 x i32> undef - ; CHECK: cost of 320 {{.*}} select - %v15c = select <16 x i1> undef, <16 x i32> undef, <16 x i32> undef - - ; CHECK: cost of 80 {{.*}} select - %v16a = select <4 x i1> undef, <4 x i64> undef, <4 x i64> undef - ; CHECK: cost of 160 {{.*}} select - %v16b = select <8 x i1> undef, <8 x i64> undef, <8 x i64> undef - ; CHECK: cost of 320 {{.*}} select - %v16c = select <16 x i1> undef, <16 x i64> undef, <16 x i64> undef - - ret void -} diff --git a/test/Analysis/CostModel/ARM64/store.ll b/test/Analysis/CostModel/ARM64/store.ll deleted file mode 100644 index 0c9883cf2a2..00000000000 --- a/test/Analysis/CostModel/ARM64/store.ll +++ /dev/null @@ -1,22 +0,0 @@ -; RUN: opt < %s -cost-model -analyze -mtriple=arm64-apple-ios -mcpu=cyclone | FileCheck %s -target datalayout = "e-p:32:32:32-i1:8:32-i8:8:32-i16:16:32-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:32:64-v128:32:128-a0:0:32-n32-S32" -; CHECK-LABEL: store -define void @store() { - ; Stores of <2 x i64> should be expensive because we don't split them and - ; and unaligned 16b stores have bad performance. - ; CHECK: cost of 12 {{.*}} store - store <2 x i64> undef, <2 x i64> * undef - - ; We scalarize the loads/stores because there is no vector register name for - ; these types (they get extended to v.4h/v.2s). 
- ; CHECK: cost of 16 {{.*}} store - store <2 x i8> undef, <2 x i8> * undef - ; CHECK: cost of 64 {{.*}} store - store <4 x i8> undef, <4 x i8> * undef - ; CHECK: cost of 16 {{.*}} load - load <2 x i8> * undef - ; CHECK: cost of 64 {{.*}} load - load <4 x i8> * undef - - ret void -} diff --git a/test/CodeGen/AArch64/128bit_load_store.ll b/test/CodeGen/AArch64/128bit_load_store.ll index 56f67873f84..a6f077698e4 100644 --- a/test/CodeGen/AArch64/128bit_load_store.ll +++ b/test/CodeGen/AArch64/128bit_load_store.ll @@ -1,4 +1,4 @@ -; RUN: llc < %s -verify-machineinstrs -mtriple=arm64-none-linux-gnu -mattr=neon | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-ARM64 +; RUN: llc < %s -verify-machineinstrs -mtriple=aarch64-none-linux-gnu -mattr=neon | FileCheck %s --check-prefix=CHECK define void @test_store_f128(fp128* %ptr, fp128 %val) #0 { ; CHECK-LABEL: test_store_f128 @@ -17,8 +17,8 @@ entry: } define void @test_vstrq_p128(i128* %ptr, i128 %val) #0 { -; CHECK-ARM64-LABEL: test_vstrq_p128 -; CHECK-ARM64: stp {{x[0-9]+}}, {{x[0-9]+}}, [{{x[0-9]+}}] +; CHECK-LABEL: test_vstrq_p128 +; CHECK: stp {{x[0-9]+}}, {{x[0-9]+}}, [{{x[0-9]+}}] entry: %0 = bitcast i128* %ptr to fp128* @@ -28,8 +28,8 @@ entry: } define i128 @test_vldrq_p128(i128* readonly %ptr) #2 { -; CHECK-ARM64-LABEL: test_vldrq_p128 -; CHECK-ARM64: ldp {{x[0-9]+}}, {{x[0-9]+}}, [{{x[0-9]+}}] +; CHECK-LABEL: test_vldrq_p128 +; CHECK: ldp {{x[0-9]+}}, {{x[0-9]+}}, [{{x[0-9]+}}] entry: %0 = bitcast i128* %ptr to fp128* diff --git a/test/CodeGen/AArch64/aarch64-neon-v1i1-setcc.ll b/test/CodeGen/AArch64/aarch64-neon-v1i1-setcc.ll new file mode 100644 index 00000000000..c932253049e --- /dev/null +++ b/test/CodeGen/AArch64/aarch64-neon-v1i1-setcc.ll @@ -0,0 +1,69 @@ +; RUN: llc < %s -verify-machineinstrs -mtriple=arm64-none-linux-gnu -mattr=+neon -fp-contract=fast | FileCheck %s +; arm64 has a separate copy as aarch64-neon-v1i1-setcc.ll + +; This file test the DAG node like "v1i1 SETCC v1i64, v1i64". As the v1i1 type +; is illegal in AArch64 backend, the legalizer tries to scalarize this node. +; As the v1i64 operands of SETCC are legal types, they will not be scalarized. +; Currently the type legalizer will have an assertion failure as it assumes all +; operands of SETCC have been legalized. +; FIXME: If the algorithm of type scalarization is improved and can legaize +; "v1i1 SETCC" correctly, these test cases are not needed. 
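(The comment block above spells out what scalarising a "v1i1 SETCC" must produce: the single-lane vector compare becomes an ordinary scalar compare whose i1 result is then sign-extended, which is why the CHECK lines below expect a plain cmp or fcmp. The following C++ snippet is only a model of the first test's semantics, not of the type legalizer.)

// Scalar model of test_sext_extr_cmp_0 below: a <1 x i64> signed >= compare,
// extraction of the single i1 lane, and sign-extension to i64. After
// scalarisation this is "compare, then materialise all-ones or zero".
#include <cstdint>
#include <cstdio>

static int64_t sextExtrCmp(int64_t V1, int64_t V2) {
  bool Lane = (V1 >= V2);            // the scalarised SETCC
  return Lane ? INT64_C(-1) : 0;     // sext i1 -> i64
}

int main() {
  std::printf("%lld %lld\n", (long long)sextExtrCmp(3, 2),
              (long long)sextExtrCmp(1, 2)); // prints: -1 0
  return 0;
}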
+ +define i64 @test_sext_extr_cmp_0(<1 x i64> %v1, <1 x i64> %v2) { +; CHECK-LABEL: test_sext_extr_cmp_0: +; CHECK: cmp {{x[0-9]+}}, {{x[0-9]+}} + %1 = icmp sge <1 x i64> %v1, %v2 + %2 = extractelement <1 x i1> %1, i32 0 + %vget_lane = sext i1 %2 to i64 + ret i64 %vget_lane +} + +define i64 @test_sext_extr_cmp_1(<1 x double> %v1, <1 x double> %v2) { +; CHECK-LABEL: test_sext_extr_cmp_1: +; CHECK: fcmp {{d[0-9]+}}, {{d[0-9]+}} + %1 = fcmp oeq <1 x double> %v1, %v2 + %2 = extractelement <1 x i1> %1, i32 0 + %vget_lane = sext i1 %2 to i64 + ret i64 %vget_lane +} + +define <1 x i64> @test_select_v1i1_0(<1 x i64> %v1, <1 x i64> %v2, <1 x i64> %v3) { +; CHECK-LABEL: test_select_v1i1_0: +; CHECK: cmeq d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} +; CHECK: bic v{{[0-9]+}}.8b, v{{[0-9]+}}.8b, v{{[0-9]+}}.8b + %1 = icmp eq <1 x i64> %v1, %v2 + %res = select <1 x i1> %1, <1 x i64> zeroinitializer, <1 x i64> %v3 + ret <1 x i64> %res +} + +define <1 x i64> @test_select_v1i1_1(<1 x double> %v1, <1 x double> %v2, <1 x i64> %v3) { +; CHECK-LABEL: test_select_v1i1_1: +; CHECK: fcmeq d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} +; CHECK: bic v{{[0-9]+}}.8b, v{{[0-9]+}}.8b, v{{[0-9]+}}.8b + %1 = fcmp oeq <1 x double> %v1, %v2 + %res = select <1 x i1> %1, <1 x i64> zeroinitializer, <1 x i64> %v3 + ret <1 x i64> %res +} + +define <1 x double> @test_select_v1i1_2(<1 x i64> %v1, <1 x i64> %v2, <1 x double> %v3) { +; CHECK-LABEL: test_select_v1i1_2: +; CHECK: cmeq d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} +; CHECK: bic v{{[0-9]+}}.8b, v{{[0-9]+}}.8b, v{{[0-9]+}}.8b + %1 = icmp eq <1 x i64> %v1, %v2 + %res = select <1 x i1> %1, <1 x double> zeroinitializer, <1 x double> %v3 + ret <1 x double> %res +} + +define i32 @test_br_extr_cmp(<1 x i64> %v1, <1 x i64> %v2) { +; CHECK-LABEL: test_br_extr_cmp: +; CHECK: cmp x{{[0-9]+}}, x{{[0-9]+}} + %1 = icmp eq <1 x i64> %v1, %v2 + %2 = extractelement <1 x i1> %1, i32 0 + br i1 %2, label %if.end, label %if.then + +if.then: + ret i32 0; + +if.end: + ret i32 1; +} diff --git a/test/CodeGen/AArch64/addsub.ll b/test/CodeGen/AArch64/addsub.ll index 3aa427c352c..b85fdbb14ce 100644 --- a/test/CodeGen/AArch64/addsub.ll +++ b/test/CodeGen/AArch64/addsub.ll @@ -1,4 +1,4 @@ -; RUN: llc -verify-machineinstrs < %s -mtriple=arm64-linux-gnu | FileCheck %s +; RUN: llc -verify-machineinstrs < %s -mtriple=aarch64-linux-gnu | FileCheck %s ; Note that this should be refactored (for efficiency if nothing else) ; when the PCS is implemented so we don't have to worry about the diff --git a/test/CodeGen/AArch64/addsub_ext.ll b/test/CodeGen/AArch64/addsub_ext.ll index cd01f594dcd..a2266b1d36d 100644 --- a/test/CodeGen/AArch64/addsub_ext.ll +++ b/test/CodeGen/AArch64/addsub_ext.ll @@ -1,4 +1,4 @@ -; RUN: llc -verify-machineinstrs %s -o - -mtriple=arm64-linux-gnu | FileCheck %s +; RUN: llc -verify-machineinstrs %s -o - -mtriple=aarch64-linux-gnu | FileCheck %s @var8 = global i8 0 @var16 = global i16 0 diff --git a/test/CodeGen/AArch64/alloca.ll b/test/CodeGen/AArch64/alloca.ll index 7cab200b1ea..f93efbc42e6 100644 --- a/test/CodeGen/AArch64/alloca.ll +++ b/test/CodeGen/AArch64/alloca.ll @@ -1,5 +1,5 @@ -; RUN: llc -mtriple=arm64-linux-gnu -verify-machineinstrs -o - %s | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-ARM64 -; RUN: llc -mtriple=arm64-none-linux-gnu -mattr=-fp-armv8 -verify-machineinstrs < %s | FileCheck --check-prefix=CHECK-NOFP-ARM64 %s +; RUN: llc -mtriple=aarch64-linux-gnu -verify-machineinstrs -o - %s | FileCheck %s --check-prefix=CHECK +; RUN: llc -mtriple=aarch64-none-linux-gnu 
-mattr=-fp-armv8 -verify-machineinstrs < %s | FileCheck --check-prefix=CHECK-NOFP-ARM64 %s declare void @use_addr(i8*) @@ -53,7 +53,7 @@ define i64 @test_alloca_with_local(i64 %n) { %val = load i64* %loc -; CHECK-ARM64: ldur x0, [x29, #-[[LOC_FROM_FP]]] +; CHECK: ldur x0, [x29, #-[[LOC_FROM_FP]]] ret i64 %val ; Make sure epilogue restores sp from fp @@ -74,16 +74,16 @@ define void @test_variadic_alloca(i64 %n, ...) { ; CHECK-NOFP-AARCH64: add x8, [[TMP]], #0 -; CHECK-ARM64: stp x29, x30, [sp, #-16]! -; CHECK-ARM64: mov x29, sp -; CHECK-ARM64: sub sp, sp, #192 -; CHECK-ARM64: stp q6, q7, [x29, #-96] +; CHECK: stp x29, x30, [sp, #-16]! +; CHECK: mov x29, sp +; CHECK: sub sp, sp, #192 +; CHECK: stp q6, q7, [x29, #-96] ; [...] -; CHECK-ARM64: stp q0, q1, [x29, #-192] +; CHECK: stp q0, q1, [x29, #-192] -; CHECK-ARM64: stp x6, x7, [x29, #-16] +; CHECK: stp x6, x7, [x29, #-16] ; [...] -; CHECK-ARM64: stp x2, x3, [x29, #-48] +; CHECK: stp x2, x3, [x29, #-48] ; CHECK-NOFP-ARM64: stp x29, x30, [sp, #-16]! ; CHECK-NOFP-ARM64: mov x29, sp @@ -115,11 +115,11 @@ define void @test_alloca_large_frame(i64 %n) { ; CHECK-LABEL: test_alloca_large_frame: -; CHECK-ARM64: stp x20, x19, [sp, #-32]! -; CHECK-ARM64: stp x29, x30, [sp, #16] -; CHECK-ARM64: add x29, sp, #16 -; CHECK-ARM64: sub sp, sp, #1953, lsl #12 -; CHECK-ARM64: sub sp, sp, #512 +; CHECK: stp x20, x19, [sp, #-32]! +; CHECK: stp x29, x30, [sp, #16] +; CHECK: add x29, sp, #16 +; CHECK: sub sp, sp, #1953, lsl #12 +; CHECK: sub sp, sp, #512 %addr1 = alloca i8, i64 %n %addr2 = alloca i64, i64 1000000 @@ -128,9 +128,9 @@ define void @test_alloca_large_frame(i64 %n) { ret void -; CHECK-ARM64: sub sp, x29, #16 -; CHECK-ARM64: ldp x29, x30, [sp, #16] -; CHECK-ARM64: ldp x20, x19, [sp], #32 +; CHECK: sub sp, x29, #16 +; CHECK: ldp x29, x30, [sp, #16] +; CHECK: ldp x20, x19, [sp], #32 } declare i8* @llvm.stacksave() diff --git a/test/CodeGen/AArch64/analyze-branch.ll b/test/CodeGen/AArch64/analyze-branch.ll index 1d4daec5f43..6616b27c45b 100644 --- a/test/CodeGen/AArch64/analyze-branch.ll +++ b/test/CodeGen/AArch64/analyze-branch.ll @@ -1,4 +1,4 @@ -; RUN: llc -mtriple=arm64-none-linux-gnu < %s | FileCheck %s +; RUN: llc -mtriple=aarch64-none-linux-gnu < %s | FileCheck %s ; This test checks that LLVM can do basic stripping and reapplying of branches ; to basic blocks. diff --git a/test/CodeGen/AArch64/arm64-2011-03-09-CPSRSpill.ll b/test/CodeGen/AArch64/arm64-2011-03-09-CPSRSpill.ll new file mode 100644 index 00000000000..6fb7c3fb5e0 --- /dev/null +++ b/test/CodeGen/AArch64/arm64-2011-03-09-CPSRSpill.ll @@ -0,0 +1,47 @@ +; RUN: llc < %s -mtriple=arm64-apple-darwin + +; Can't copy or spill / restore CPSR. 
+; rdar://9105206 + +define fastcc void @t() ssp align 2 { +entry: + br i1 undef, label %bb3.i, label %bb2.i + +bb2.i: ; preds = %entry + br label %bb3.i + +bb3.i: ; preds = %bb2.i, %entry + br i1 undef, label %_ZN12gjkepa2_impl3EPA6appendERNS0_5sListEPNS0_5sFaceE.exit71, label %bb.i69 + +bb.i69: ; preds = %bb3.i + br label %_ZN12gjkepa2_impl3EPA6appendERNS0_5sListEPNS0_5sFaceE.exit71 + +_ZN12gjkepa2_impl3EPA6appendERNS0_5sListEPNS0_5sFaceE.exit71: ; preds = %bb.i69, %bb3.i + %0 = select i1 undef, float 0.000000e+00, float undef + %1 = fdiv float %0, undef + %2 = fcmp ult float %1, 0xBF847AE140000000 + %storemerge9 = select i1 %2, float %1, float 0.000000e+00 + store float %storemerge9, float* undef, align 4 + br i1 undef, label %bb42, label %bb47 + +bb42: ; preds = %_ZN12gjkepa2_impl3EPA6appendERNS0_5sListEPNS0_5sFaceE.exit71 + br i1 undef, label %bb46, label %bb53 + +bb46: ; preds = %bb42 + br label %bb48 + +bb47: ; preds = %_ZN12gjkepa2_impl3EPA6appendERNS0_5sListEPNS0_5sFaceE.exit71 + br label %bb48 + +bb48: ; preds = %bb47, %bb46 + br i1 undef, label %bb1.i14, label %bb.i13 + +bb.i13: ; preds = %bb48 + br label %bb1.i14 + +bb1.i14: ; preds = %bb.i13, %bb48 + br label %bb53 + +bb53: ; preds = %bb1.i14, %bb42 + ret void +} diff --git a/test/CodeGen/AArch64/arm64-2011-03-17-AsmPrinterCrash.ll b/test/CodeGen/AArch64/arm64-2011-03-17-AsmPrinterCrash.ll new file mode 100644 index 00000000000..2b083d80491 --- /dev/null +++ b/test/CodeGen/AArch64/arm64-2011-03-17-AsmPrinterCrash.ll @@ -0,0 +1,45 @@ +; RUN: llc < %s -mtriple=arm64-apple-darwin + +; rdar://9146594 + +define void @drt_vsprintf() nounwind ssp { +entry: + %do_tab_convert = alloca i32, align 4 + br i1 undef, label %if.then24, label %if.else295, !dbg !13 + +if.then24: ; preds = %entry + unreachable + +if.else295: ; preds = %entry + call void @llvm.dbg.declare(metadata !{i32* %do_tab_convert}, metadata !16), !dbg !18 + store i32 0, i32* %do_tab_convert, align 4, !dbg !19 + unreachable +} + +declare void @llvm.dbg.declare(metadata, metadata) nounwind readnone + +!llvm.dbg.gv = !{!0} +!llvm.dbg.sp = !{!1, !7, !10, !11, !12} + +!0 = metadata !{i32 589876, i32 0, metadata !1, metadata !"vsplive", metadata !"vsplive", metadata !"", metadata !2, i32 617, metadata !6, i32 1, i32 1, null, null} ; [ DW_TAG_variable ] +!1 = metadata !{i32 589870, metadata !20, metadata !2, metadata !"drt_vsprintf", metadata !"drt_vsprintf", metadata !"", i32 616, metadata !4, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 false, null, null, null, null, i32 0} ; [ DW_TAG_subprogram ] +!2 = metadata !{i32 589865, metadata !20} ; [ DW_TAG_file_type ] +!3 = metadata !{i32 589841, metadata !20, i32 12, metadata !"clang version 3.0 (http://llvm.org/git/clang.git git:/git/puzzlebox/clang.git/ c4d1aea01c4444eb81bdbf391f1be309127c3cf1)", i1 true, metadata !"", i32 0, metadata !21, metadata !21, null, null, null, metadata !""} ; [ DW_TAG_compile_unit ] +!4 = metadata !{i32 589845, metadata !20, metadata !2, metadata !"", i32 0, i64 0, i64 0, i32 0, i32 0, null, metadata !5, i32 0, i32 0} ; [ DW_TAG_subroutine_type ] +!5 = metadata !{metadata !6} +!6 = metadata !{i32 589860, null, metadata !3, metadata !"int", i32 0, i64 32, i64 32, i64 0, i32 0, i32 5} ; [ DW_TAG_base_type ] +!7 = metadata !{i32 589870, metadata !20, metadata !2, metadata !"putc_mem", metadata !"putc_mem", metadata !"", i32 30, metadata !8, i1 true, i1 true, i32 0, i32 0, null, i32 256, i1 false, null, null, null, null, i32 0} ; [ DW_TAG_subprogram ] +!8 = metadata !{i32 589845, metadata !20, 
metadata !2, metadata !"", i32 0, i64 0, i64 0, i32 0, i32 0, null, metadata !9, i32 0, i32 0} ; [ DW_TAG_subroutine_type ] +!9 = metadata !{null} +!10 = metadata !{i32 589870, metadata !20, metadata !2, metadata !"print_double", metadata !"print_double", metadata !"", i32 203, metadata !4, i1 true, i1 true, i32 0, i32 0, null, i32 256, i1 false, null, null, null, null, i32 0} ; [ DW_TAG_subprogram ] +!11 = metadata !{i32 589870, metadata !20, metadata !2, metadata !"print_number", metadata !"print_number", metadata !"", i32 75, metadata !4, i1 true, i1 true, i32 0, i32 0, i32 0, i32 256, i1 false, null, null, null, null, i32 0} ; [ DW_TAG_subprogram ] +!12 = metadata !{i32 589870, metadata !20, metadata !2, metadata !"get_flags", metadata !"get_flags", metadata !"", i32 508, metadata !8, i1 true, i1 true, i32 0, i32 0, null, i32 256, i1 false, null, null, null, null, i32 0} ; [ DW_TAG_subprogram ] +!13 = metadata !{i32 653, i32 5, metadata !14, null} +!14 = metadata !{i32 589835, metadata !20, metadata !15, i32 652, i32 35, i32 2} ; [ DW_TAG_lexical_block ] +!15 = metadata !{i32 589835, metadata !20, metadata !1, i32 616, i32 1, i32 0} ; [ DW_TAG_lexical_block ] +!16 = metadata !{i32 590080, metadata !17, metadata !"do_tab_convert", metadata !2, i32 853, metadata !6, i32 0, null} ; [ DW_TAG_auto_variable ] +!17 = metadata !{i32 589835, metadata !20, metadata !14, i32 850, i32 12, i32 33} ; [ DW_TAG_lexical_block ] +!18 = metadata !{i32 853, i32 11, metadata !17, null} +!19 = metadata !{i32 853, i32 29, metadata !17, null} +!20 = metadata !{metadata !"print.i", metadata !"/Volumes/Ebi/echeng/radars/r9146594"} +!21 = metadata !{i32 0} diff --git a/test/CodeGen/AArch64/arm64-2011-03-21-Unaligned-Frame-Index.ll b/test/CodeGen/AArch64/arm64-2011-03-21-Unaligned-Frame-Index.ll new file mode 100644 index 00000000000..6f0ec34fc1d --- /dev/null +++ b/test/CodeGen/AArch64/arm64-2011-03-21-Unaligned-Frame-Index.ll @@ -0,0 +1,12 @@ +; RUN: llc < %s -march=arm64 | FileCheck %s +define void @foo(i64 %val) { +; CHECK: foo +; The stack frame store is not 64-bit aligned. Make sure we use an +; instruction that can handle that. +; CHECK: stur x0, [sp, #20] + %a = alloca [49 x i32], align 4 + %p32 = getelementptr inbounds [49 x i32]* %a, i64 0, i64 2 + %p = bitcast i32* %p32 to i64* + store i64 %val, i64* %p, align 8 + ret void +} diff --git a/test/CodeGen/AArch64/arm64-2011-04-21-CPSRBug.ll b/test/CodeGen/AArch64/arm64-2011-04-21-CPSRBug.ll new file mode 100644 index 00000000000..88232fcc0b4 --- /dev/null +++ b/test/CodeGen/AArch64/arm64-2011-04-21-CPSRBug.ll @@ -0,0 +1,26 @@ +; RUN: llc < %s -mtriple=arm64-apple-iOS5.0 + +; CPSR is not allocatable so fast allocatable wouldn't mark them killed. 
+; rdar://9313272 + +define hidden void @t() nounwind { +entry: + %cmp = icmp eq i32* null, undef + %frombool = zext i1 %cmp to i8 + store i8 %frombool, i8* undef, align 1 + %tmp4 = load i8* undef, align 1 + %tobool = trunc i8 %tmp4 to i1 + br i1 %tobool, label %land.lhs.true, label %if.end + +land.lhs.true: ; preds = %entry + unreachable + +if.end: ; preds = %entry + br i1 undef, label %land.lhs.true14, label %if.end33 + +land.lhs.true14: ; preds = %if.end + unreachable + +if.end33: ; preds = %if.end + unreachable +} diff --git a/test/CodeGen/AArch64/arm64-2011-10-18-LdStOptBug.ll b/test/CodeGen/AArch64/arm64-2011-10-18-LdStOptBug.ll new file mode 100644 index 00000000000..8f99bc30a55 --- /dev/null +++ b/test/CodeGen/AArch64/arm64-2011-10-18-LdStOptBug.ll @@ -0,0 +1,31 @@ +; RUN: llc < %s -mtriple=arm64-apple-ios | FileCheck %s + +; Can't fold the increment by 1<<12 into a post-increment load +; rdar://10301335 + +@test_data = common global i32 0, align 4 + +define void @t() nounwind ssp { +; CHECK-LABEL: t: +entry: + br label %for.body + +for.body: +; CHECK: for.body +; CHECK: ldr w{{[0-9]+}}, [x{{[0-9]+}}] +; CHECK: add x[[REG:[0-9]+]], +; CHECK: x[[REG]], #1, lsl #12 + %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] + %0 = shl nsw i64 %indvars.iv, 12 + %add = add nsw i64 %0, 34628173824 + %1 = inttoptr i64 %add to i32* + %2 = load volatile i32* %1, align 4096 + store volatile i32 %2, i32* @test_data, align 4 + %indvars.iv.next = add i64 %indvars.iv, 1 + %lftr.wideiv = trunc i64 %indvars.iv.next to i32 + %exitcond = icmp eq i32 %lftr.wideiv, 200 + br i1 %exitcond, label %for.end, label %for.body + +for.end: + ret void +} diff --git a/test/CodeGen/AArch64/arm64-2012-01-11-ComparisonDAGCrash.ll b/test/CodeGen/AArch64/arm64-2012-01-11-ComparisonDAGCrash.ll new file mode 100644 index 00000000000..d47dbb28164 --- /dev/null +++ b/test/CodeGen/AArch64/arm64-2012-01-11-ComparisonDAGCrash.ll @@ -0,0 +1,40 @@ +; RUN: llc < %s -march=arm64 + +; The target lowering for integer comparisons was replacing some DAG nodes +; during operation legalization, which resulted in dangling pointers, +; cycles in DAGs, and eventually crashes. This is the testcase for +; one of those crashes. 
(rdar://10653656) + +define void @test(i1 zeroext %IsArrow) nounwind ssp align 2 { +entry: + br i1 undef, label %return, label %lor.lhs.false + +lor.lhs.false: + br i1 undef, label %return, label %if.end + +if.end: + %tmp.i = load i64* undef, align 8 + %and.i.i.i = and i64 %tmp.i, -16 + br i1 %IsArrow, label %if.else_crit_edge, label %if.end32 + +if.else_crit_edge: + br i1 undef, label %if.end32, label %return + +if.end32: + %0 = icmp ult i32 undef, 3 + %1 = zext i64 %tmp.i to i320 + %.pn.v = select i1 %0, i320 128, i320 64 + %.pn = shl i320 %1, %.pn.v + %ins346392 = or i320 %.pn, 0 + store i320 %ins346392, i320* undef, align 8 + br i1 undef, label %sw.bb.i.i, label %exit + +sw.bb.i.i: + unreachable + +exit: + unreachable + +return: + ret void +} diff --git a/test/CodeGen/AArch64/arm64-2012-05-07-DAGCombineVectorExtract.ll b/test/CodeGen/AArch64/arm64-2012-05-07-DAGCombineVectorExtract.ll new file mode 100644 index 00000000000..a4d37e48685 --- /dev/null +++ b/test/CodeGen/AArch64/arm64-2012-05-07-DAGCombineVectorExtract.ll @@ -0,0 +1,20 @@ +; RUN: llc < %s -march=arm64 | FileCheck %s + +define i32 @foo(<4 x i32> %a, i32 %n) nounwind { +; CHECK-LABEL: foo: +; CHECK: fmov w0, s0 +; CHECK-NEXT: ret + %b = bitcast <4 x i32> %a to i128 + %c = trunc i128 %b to i32 + ret i32 %c +} + +define i64 @bar(<2 x i64> %a, i64 %n) nounwind { +; CHECK-LABEL: bar: +; CHECK: fmov x0, d0 +; CHECK-NEXT: ret + %b = bitcast <2 x i64> %a to i128 + %c = trunc i128 %b to i64 + ret i64 %c +} + diff --git a/test/CodeGen/AArch64/arm64-2012-05-07-MemcpyAlignBug.ll b/test/CodeGen/AArch64/arm64-2012-05-07-MemcpyAlignBug.ll new file mode 100644 index 00000000000..d59b0d00438 --- /dev/null +++ b/test/CodeGen/AArch64/arm64-2012-05-07-MemcpyAlignBug.ll @@ -0,0 +1,21 @@ +; RUN: llc < %s -march arm64 -mcpu=cyclone | FileCheck %s +; + +@b = private unnamed_addr constant [3 x i32] [i32 1768775988, i32 1685481784, i32 1836253201], align 4 + +; The important thing for this test is that we need an unaligned load of `l_b' +; ("ldr w2, [x1, #8]" in this case). + +; CHECK: adrp x[[PAGE:[0-9]+]], {{l_b@PAGE|.Lb}} +; CHECK: add x[[ADDR:[0-9]+]], x[[PAGE]], {{l_b@PAGEOFF|:lo12:.Lb}} +; CHECK-NEXT: ldr [[VAL:w[0-9]+]], [x[[ADDR]], #8] +; CHECK-NEXT: str [[VAL]], [x0, #8] +; CHECK-NEXT: ldr [[VAL2:x[0-9]+]], [x[[ADDR]]] +; CHECK-NEXT: str [[VAL2]], [x0] + +define void @foo(i8* %a) { + call void @llvm.memcpy.p0i8.p0i8.i64(i8* %a, i8* bitcast ([3 x i32]* @b to i8*), i64 12, i32 4, i1 false) + ret void +} + +declare void @llvm.memcpy.p0i8.p0i8.i64(i8* nocapture, i8* nocapture, i64, i32, i1) nounwind diff --git a/test/CodeGen/AArch64/arm64-2012-05-09-LOADgot-bug.ll b/test/CodeGen/AArch64/arm64-2012-05-09-LOADgot-bug.ll new file mode 100644 index 00000000000..d1840d35942 --- /dev/null +++ b/test/CodeGen/AArch64/arm64-2012-05-09-LOADgot-bug.ll @@ -0,0 +1,22 @@ +; RUN: llc -mtriple=arm64-apple-ios < %s | FileCheck %s +; RUN: llc -mtriple=arm64-linux-gnu -relocation-model=pic < %s | FileCheck %s --check-prefix=CHECK-LINUX +; + +define hidden void @t() optsize ssp { +entry: + store i64 zext (i32 ptrtoint (i64 (i32)* @x to i32) to i64), i64* undef, align 8 +; CHECK: adrp x{{[0-9]+}}, _x@GOTPAGE +; CHECK: ldr x{{[0-9]+}}, [x{{[0-9]+}}, _x@GOTPAGEOFF] +; CHECK-NEXT: and x{{[0-9]+}}, x{{[0-9]+}}, #0xffffffff +; CHECK-NEXT: str x{{[0-9]+}}, [x{{[0-9]+}}] + unreachable +} + +declare i64 @x(i32) optsize + +; Worth checking the Linux code is sensible too: only way to access +; the GOT is via a 64-bit load. 
Just loading wN is unacceptable +; (there's no ELF relocation to do that). + +; CHECK-LINUX: adrp {{x[0-9]+}}, :got:x +; CHECK-LINUX: ldr {{x[0-9]+}}, [{{x[0-9]+}}, :got_lo12:x] diff --git a/test/CodeGen/AArch64/arm64-2012-05-22-LdStOptBug.ll b/test/CodeGen/AArch64/arm64-2012-05-22-LdStOptBug.ll new file mode 100644 index 00000000000..4b037db9c84 --- /dev/null +++ b/test/CodeGen/AArch64/arm64-2012-05-22-LdStOptBug.ll @@ -0,0 +1,50 @@ +; RUN: llc < %s -mtriple=arm64-apple-ios -verify-machineinstrs | FileCheck %s + +; LdStOpt bug created illegal instruction: +; %D1, %D2 = LDPSi %X0, 1 +; rdar://11512047 + +%0 = type opaque +%struct.CGRect = type { %struct.CGPoint, %struct.CGSize } +%struct.CGPoint = type { double, double } +%struct.CGSize = type { double, double } + +@"OBJC_IVAR_$_UIScreen._bounds" = external hidden global i64, section "__DATA, __objc_ivar", align 8 + +define hidden %struct.CGRect @t(%0* nocapture %self, i8* nocapture %_cmd) nounwind readonly optsize ssp { +entry: +; CHECK-LABEL: t: +; CHECK: ldp d{{[0-9]+}}, d{{[0-9]+}} + %ivar = load i64* @"OBJC_IVAR_$_UIScreen._bounds", align 8, !invariant.load !4 + %0 = bitcast %0* %self to i8* + %add.ptr = getelementptr inbounds i8* %0, i64 %ivar + %add.ptr10.0 = bitcast i8* %add.ptr to double* + %tmp11 = load double* %add.ptr10.0, align 8 + %add.ptr.sum = add i64 %ivar, 8 + %add.ptr10.1 = getelementptr inbounds i8* %0, i64 %add.ptr.sum + %1 = bitcast i8* %add.ptr10.1 to double* + %tmp12 = load double* %1, align 8 + %add.ptr.sum17 = add i64 %ivar, 16 + %add.ptr4.1 = getelementptr inbounds i8* %0, i64 %add.ptr.sum17 + %add.ptr4.1.0 = bitcast i8* %add.ptr4.1 to double* + %tmp = load double* %add.ptr4.1.0, align 8 + %add.ptr4.1.sum = add i64 %ivar, 24 + %add.ptr4.1.1 = getelementptr inbounds i8* %0, i64 %add.ptr4.1.sum + %2 = bitcast i8* %add.ptr4.1.1 to double* + %tmp5 = load double* %2, align 8 + %insert14 = insertvalue %struct.CGPoint undef, double %tmp11, 0 + %insert16 = insertvalue %struct.CGPoint %insert14, double %tmp12, 1 + %insert = insertvalue %struct.CGRect undef, %struct.CGPoint %insert16, 0 + %insert7 = insertvalue %struct.CGSize undef, double %tmp, 0 + %insert9 = insertvalue %struct.CGSize %insert7, double %tmp5, 1 + %insert3 = insertvalue %struct.CGRect %insert, %struct.CGSize %insert9, 1 + ret %struct.CGRect %insert3 +} + +!llvm.module.flags = !{!0, !1, !2, !3} + +!0 = metadata !{i32 1, metadata !"Objective-C Version", i32 2} +!1 = metadata !{i32 1, metadata !"Objective-C Image Info Version", i32 0} +!2 = metadata !{i32 1, metadata !"Objective-C Image Info Section", metadata !"__DATA, __objc_imageinfo, regular, no_dead_strip"} +!3 = metadata !{i32 4, metadata !"Objective-C Garbage Collection", i32 0} +!4 = metadata !{} diff --git a/test/CodeGen/AArch64/arm64-2012-06-06-FPToUI.ll b/test/CodeGen/AArch64/arm64-2012-06-06-FPToUI.ll new file mode 100644 index 00000000000..168e921bcc0 --- /dev/null +++ b/test/CodeGen/AArch64/arm64-2012-06-06-FPToUI.ll @@ -0,0 +1,67 @@ +; RUN: llc -march=arm64 -O0 < %s | FileCheck %s +; RUN: llc -march=arm64 -O3 < %s | FileCheck %s + +@.str = private unnamed_addr constant [9 x i8] c"%lf %lu\0A\00", align 1 +@.str1 = private unnamed_addr constant [8 x i8] c"%lf %u\0A\00", align 1 +@.str2 = private unnamed_addr constant [8 x i8] c"%f %lu\0A\00", align 1 +@.str3 = private unnamed_addr constant [7 x i8] c"%f %u\0A\00", align 1 + +define void @testDouble(double %d) ssp { +; CHECK-LABEL: testDouble: +; CHECK: fcvtzu x{{[0-9]+}}, d{{[0-9]+}} +; CHECK: fcvtzu w{{[0-9]+}}, d{{[0-9]+}} +entry: + %d.addr = 
alloca double, align 8 + store double %d, double* %d.addr, align 8 + %0 = load double* %d.addr, align 8 + %1 = load double* %d.addr, align 8 + %conv = fptoui double %1 to i64 + %call = call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([9 x i8]* @.str, i32 0, i32 0), double %0, i64 %conv) + %2 = load double* %d.addr, align 8 + %3 = load double* %d.addr, align 8 + %conv1 = fptoui double %3 to i32 + %call2 = call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([8 x i8]* @.str1, i32 0, i32 0), double %2, i32 %conv1) + ret void +} + +declare i32 @printf(i8*, ...) + +define void @testFloat(float %f) ssp { +; CHECK-LABEL: testFloat: +; CHECK: fcvtzu x{{[0-9]+}}, s{{[0-9]+}} +; CHECK: fcvtzu w{{[0-9]+}}, s{{[0-9]+}} +entry: + %f.addr = alloca float, align 4 + store float %f, float* %f.addr, align 4 + %0 = load float* %f.addr, align 4 + %conv = fpext float %0 to double + %1 = load float* %f.addr, align 4 + %conv1 = fptoui float %1 to i64 + %call = call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([8 x i8]* @.str2, i32 0, i32 0), double %conv, i64 %conv1) + %2 = load float* %f.addr, align 4 + %conv2 = fpext float %2 to double + %3 = load float* %f.addr, align 4 + %conv3 = fptoui float %3 to i32 + %call4 = call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([7 x i8]* @.str3, i32 0, i32 0), double %conv2, i32 %conv3) + ret void +} + +define i32 @main(i32 %argc, i8** %argv) ssp { +entry: + %retval = alloca i32, align 4 + %argc.addr = alloca i32, align 4 + %argv.addr = alloca i8**, align 8 + store i32 0, i32* %retval + store i32 %argc, i32* %argc.addr, align 4 + store i8** %argv, i8*** %argv.addr, align 8 + call void @testDouble(double 1.159198e+01) + call void @testFloat(float 0x40272F1800000000) + ret i32 0 +} + +!llvm.module.flags = !{!0, !1, !2, !3} + +!0 = metadata !{i32 1, metadata !"Objective-C Version", i32 2} +!1 = metadata !{i32 1, metadata !"Objective-C Image Info Version", i32 0} +!2 = metadata !{i32 1, metadata !"Objective-C Image Info Section", metadata !"__DATA, __objc_imageinfo, regular, no_dead_strip"} +!3 = metadata !{i32 4, metadata !"Objective-C Garbage Collection", i32 0} diff --git a/test/CodeGen/AArch64/arm64-2012-07-11-InstrEmitterBug.ll b/test/CodeGen/AArch64/arm64-2012-07-11-InstrEmitterBug.ll new file mode 100644 index 00000000000..55ecfb5d2bd --- /dev/null +++ b/test/CodeGen/AArch64/arm64-2012-07-11-InstrEmitterBug.ll @@ -0,0 +1,56 @@ +; RUN: llc < %s -mtriple=arm64-apple-ios +; rdar://11849816 + +@shlib_path_substitutions = external hidden unnamed_addr global i8**, align 8 + +declare i64 @llvm.objectsize.i64(i8*, i1) nounwind readnone + +declare noalias i8* @xmalloc(i64) optsize + +declare i64 @strlen(i8* nocapture) nounwind readonly optsize + +declare i8* @__strcpy_chk(i8*, i8*, i64) nounwind optsize + +declare i8* @__strcat_chk(i8*, i8*, i64) nounwind optsize + +declare noalias i8* @xstrdup(i8*) optsize + +define i8* @dyld_fix_path(i8* %path) nounwind optsize ssp { +entry: + br i1 undef, label %if.end56, label %for.cond + +for.cond: ; preds = %entry + br i1 undef, label %for.cond10, label %for.body + +for.body: ; preds = %for.cond + unreachable + +for.cond10: ; preds = %for.cond + br i1 undef, label %if.end56, label %for.body14 + +for.body14: ; preds = %for.cond10 + %call22 = tail call i64 @strlen(i8* undef) nounwind optsize + %sext = shl i64 %call22, 32 + %conv30 = ashr exact i64 %sext, 32 + %add29 = sub i64 0, %conv30 + %sub = add i64 %add29, 0 + %add31 = shl i64 %sub, 32 + %sext59 = add i64 %add31, 4294967296 + %conv33 = ashr exact i64 %sext59, 32 
+ %call34 = tail call noalias i8* @xmalloc(i64 %conv33) nounwind optsize + br i1 undef, label %cond.false45, label %cond.true43 + +cond.true43: ; preds = %for.body14 + unreachable + +cond.false45: ; preds = %for.body14 + %add.ptr = getelementptr inbounds i8* %path, i64 %conv30 + unreachable + +if.end56: ; preds = %for.cond10, %entry + ret i8* null +} + +declare i32 @strncmp(i8* nocapture, i8* nocapture, i64) nounwind readonly optsize + +declare i8* @strcpy(i8*, i8* nocapture) nounwind diff --git a/test/CodeGen/AArch64/arm64-2013-01-13-ffast-fcmp.ll b/test/CodeGen/AArch64/arm64-2013-01-13-ffast-fcmp.ll new file mode 100644 index 00000000000..e2c43d953bb --- /dev/null +++ b/test/CodeGen/AArch64/arm64-2013-01-13-ffast-fcmp.ll @@ -0,0 +1,19 @@ +; RUN: llc < %s -march=arm64 -aarch64-neon-syntax=apple | FileCheck %s +; RUN: llc < %s -march=arm64 -aarch64-neon-syntax=apple -fp-contract=fast | FileCheck %s --check-prefix=FAST + +target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-n32:64-S128" +target triple = "arm64-apple-ios7.0.0" + +;FAST-LABEL: _Z9example25v: +;FAST: fcmgt.4s +;FAST: ret + +;CHECK-LABEL: _Z9example25v: +;CHECK: fcmgt.4s +;CHECK: ret + +define <4 x i32> @_Z9example25v( <4 x float> %N0, <4 x float> %N1) { + %A = fcmp olt <4 x float> %N0, %N1 + %B = zext <4 x i1> %A to <4 x i32> + ret <4 x i32> %B +} diff --git a/test/CodeGen/AArch64/arm64-2013-01-23-frem-crash.ll b/test/CodeGen/AArch64/arm64-2013-01-23-frem-crash.ll new file mode 100644 index 00000000000..94511243a49 --- /dev/null +++ b/test/CodeGen/AArch64/arm64-2013-01-23-frem-crash.ll @@ -0,0 +1,15 @@ +; RUN: llc < %s -march=arm64 +; Make sure we are not crashing on this test. + +define void @autogen_SD13158() { +entry: + %B26 = frem float 0.000000e+00, undef + br i1 undef, label %CF, label %CF77 + +CF: ; preds = %CF, %CF76 + store float %B26, float* undef + br i1 undef, label %CF, label %CF77 + +CF77: ; preds = %CF + ret void +} diff --git a/test/CodeGen/AArch64/arm64-2013-01-23-sext-crash.ll b/test/CodeGen/AArch64/arm64-2013-01-23-sext-crash.ll new file mode 100644 index 00000000000..404027bfd5f --- /dev/null +++ b/test/CodeGen/AArch64/arm64-2013-01-23-sext-crash.ll @@ -0,0 +1,37 @@ +; RUN: llc < %s -march=arm64 + +; Make sure we are not crashing on this test. 
+
+define void @autogen_SD12881() {
+BB:
+ %B17 = ashr <4 x i32> zeroinitializer, zeroinitializer
+ br label %CF
+
+CF: ; preds = %CF83, %CF, %BB
+ br i1 undef, label %CF, label %CF83
+
+CF83: ; preds = %CF
+ %FC70 = sitofp <4 x i32> %B17 to <4 x double>
+ br label %CF
+}
+
+
+define void @autogen_SD12881_2() {
+BB:
+ %B17 = ashr <4 x i32> zeroinitializer, zeroinitializer
+ br label %CF
+
+CF: ; preds = %CF83, %CF, %BB
+ br i1 undef, label %CF, label %CF83
+
+CF83: ; preds = %CF
+ %FC70 = uitofp <4 x i32> %B17 to <4 x double>
+ br label %CF
+}
+
+define void @_Z12my_example2bv() nounwind noinline ssp {
+entry:
+ %0 = fptosi <2 x double> undef to <2 x i32>
+ store <2 x i32> %0, <2 x i32>* undef, align 8
+ ret void
+}
diff --git a/test/CodeGen/AArch64/arm64-2013-02-12-shufv8i8.ll b/test/CodeGen/AArch64/arm64-2013-02-12-shufv8i8.ll
new file mode 100644
index 00000000000..a350ba1472c
--- /dev/null
+++ b/test/CodeGen/AArch64/arm64-2013-02-12-shufv8i8.ll
@@ -0,0 +1,11 @@
+; RUN: llc < %s -march=arm64 -aarch64-neon-syntax=apple
+
+;CHECK-LABEL: Shuff:
+;CHECK: tbl.8b
+;CHECK: ret
+define <8 x i8 > @Shuff(<8 x i8> %in, <8 x i8>* %out) nounwind ssp {
+ %value = shufflevector <8 x i8> %in, <8 x i8> zeroinitializer, <8 x i32>
+ ret <8 x i8> %value
+}
+
+
diff --git a/test/CodeGen/AArch64/arm64-2014-04-16-AnInfiniteLoopInDAGCombine.ll b/test/CodeGen/AArch64/arm64-2014-04-16-AnInfiniteLoopInDAGCombine.ll
new file mode 100644
index 00000000000..a73b7071801
--- /dev/null
+++ b/test/CodeGen/AArch64/arm64-2014-04-16-AnInfiniteLoopInDAGCombine.ll
@@ -0,0 +1,23 @@
+; RUN: llc < %s -march=arm64
+
+; This test case tests an infinite loop bug in DAG combiner.
+; It just tries to do the following replacing endlessly:
+; (1) Replacing.3 0x2c509f0: v4i32 = any_extend 0x2c4cd08 [ORD=4]
+; With: 0x2c4d128: v4i32 = sign_extend 0x2c4cd08 [ORD=4]
+;
+; (2) Replacing.2 0x2c4d128: v4i32 = sign_extend 0x2c4cd08 [ORD=4]
+; With: 0x2c509f0: v4i32 = any_extend 0x2c4cd08 [ORD=4]
+; As we think the (2) optimization from SIGN_EXTEND to ANY_EXTEND is
+; an optimization to replace unused bits with undefined bits, we remove
+; the (1) optimization (It doesn't make sense to replace undefined bits
+; with signed bits).
+
+define <4 x i32> @infiniteLoop(<4 x i32> %in0, <4 x i16> %in1) {
+entry:
+ %cmp.i = icmp sge <4 x i16> %in1,
+ %sext.i = sext <4 x i1> %cmp.i to <4 x i32>
+ %mul.i = mul <4 x i32> %in0, %sext.i
+ %sext = shl <4 x i32> %mul.i,
+ %vmovl.i.i = ashr <4 x i32> %sext,
+ ret <4 x i32> %vmovl.i.i
+}
\ No newline at end of file
diff --git a/test/CodeGen/AArch64/arm64-2014-04-28-sqshl-uqshl-i64Contant.ll b/test/CodeGen/AArch64/arm64-2014-04-28-sqshl-uqshl-i64Contant.ll
new file mode 100644
index 00000000000..3949b85fbd3
--- /dev/null
+++ b/test/CodeGen/AArch64/arm64-2014-04-28-sqshl-uqshl-i64Contant.ll
@@ -0,0 +1,19 @@
+; RUN: llc < %s -verify-machineinstrs -march=arm64 | FileCheck %s
+
+; Check if sqshl/uqshl with constant shift amount can be selected.
+define i64 @test_vqshld_s64_i(i64 %a) { +; CHECK-LABEL: test_vqshld_s64_i: +; CHECK: sqshl {{d[0-9]+}}, {{d[0-9]+}}, #36 + %1 = tail call i64 @llvm.aarch64.neon.sqshl.i64(i64 %a, i64 36) + ret i64 %1 +} + +define i64 @test_vqshld_u64_i(i64 %a) { +; CHECK-LABEL: test_vqshld_u64_i: +; CHECK: uqshl {{d[0-9]+}}, {{d[0-9]+}}, #36 + %1 = tail call i64 @llvm.aarch64.neon.uqshl.i64(i64 %a, i64 36) + ret i64 %1 +} + +declare i64 @llvm.aarch64.neon.uqshl.i64(i64, i64) +declare i64 @llvm.aarch64.neon.sqshl.i64(i64, i64) diff --git a/test/CodeGen/AArch64/arm64-2014-04-29-EXT-undef-mask.ll b/test/CodeGen/AArch64/arm64-2014-04-29-EXT-undef-mask.ll new file mode 100644 index 00000000000..1b2d54317c2 --- /dev/null +++ b/test/CodeGen/AArch64/arm64-2014-04-29-EXT-undef-mask.ll @@ -0,0 +1,23 @@ +; RUN: llc < %s -O0 -march=arm64 -aarch64-neon-syntax=apple | FileCheck %s + +; The following 2 test cases test shufflevector with beginning UNDEF mask. +define <8 x i16> @test_vext_undef_traverse(<8 x i16> %in) { +;CHECK-LABEL: test_vext_undef_traverse: +;CHECK: {{ext.16b.*v0, #4}} + %vext = shufflevector <8 x i16> , <8 x i16> %in, <8 x i32> + ret <8 x i16> %vext +} + +define <8 x i16> @test_vext_undef_traverse2(<8 x i16> %in) { +;CHECK-LABEL: test_vext_undef_traverse2: +;CHECK: {{ext.16b.*v0, #6}} + %vext = shufflevector <8 x i16> %in, <8 x i16> , <8 x i32> + ret <8 x i16> %vext +} + +define <8 x i8> @test_vext_undef_traverse3(<8 x i8> %in) { +;CHECK-LABEL: test_vext_undef_traverse3: +;CHECK: {{ext.8b.*v0, #6}} + %vext = shufflevector <8 x i8> %in, <8 x i8> , <8 x i32> + ret <8 x i8> %vext +} diff --git a/test/CodeGen/AArch64/arm64-AdvSIMD-Scalar.ll b/test/CodeGen/AArch64/arm64-AdvSIMD-Scalar.ll new file mode 100644 index 00000000000..c4597d5a481 --- /dev/null +++ b/test/CodeGen/AArch64/arm64-AdvSIMD-Scalar.ll @@ -0,0 +1,67 @@ +; RUN: llc < %s -march=arm64 -aarch64-neon-syntax=apple -aarch64-simd-scalar=true -asm-verbose=false | FileCheck %s +; RUN: llc < %s -march=arm64 -aarch64-neon-syntax=generic -aarch64-simd-scalar=true -asm-verbose=false | FileCheck %s -check-prefix=GENERIC + +define <2 x i64> @bar(<2 x i64> %a, <2 x i64> %b) nounwind readnone { +; CHECK-LABEL: bar: +; CHECK: add.2d v[[REG:[0-9]+]], v0, v1 +; CHECK: add d[[REG3:[0-9]+]], d[[REG]], d1 +; CHECK: sub d[[REG2:[0-9]+]], d[[REG]], d1 +; GENERIC-LABEL: bar: +; GENERIC: add v[[REG:[0-9]+]].2d, v0.2d, v1.2d +; GENERIC: add d[[REG3:[0-9]+]], d[[REG]], d1 +; GENERIC: sub d[[REG2:[0-9]+]], d[[REG]], d1 + %add = add <2 x i64> %a, %b + %vgetq_lane = extractelement <2 x i64> %add, i32 0 + %vgetq_lane2 = extractelement <2 x i64> %b, i32 0 + %add3 = add i64 %vgetq_lane, %vgetq_lane2 + %sub = sub i64 %vgetq_lane, %vgetq_lane2 + %vecinit = insertelement <2 x i64> undef, i64 %add3, i32 0 + %vecinit8 = insertelement <2 x i64> %vecinit, i64 %sub, i32 1 + ret <2 x i64> %vecinit8 +} + +define double @subdd_su64(<2 x i64> %a, <2 x i64> %b) nounwind readnone { +; CHECK-LABEL: subdd_su64: +; CHECK: sub d0, d1, d0 +; CHECK-NEXT: ret +; GENERIC-LABEL: subdd_su64: +; GENERIC: sub d0, d1, d0 +; GENERIC-NEXT: ret + %vecext = extractelement <2 x i64> %a, i32 0 + %vecext1 = extractelement <2 x i64> %b, i32 0 + %sub.i = sub nsw i64 %vecext1, %vecext + %retval = bitcast i64 %sub.i to double + ret double %retval +} + +define double @vaddd_su64(<2 x i64> %a, <2 x i64> %b) nounwind readnone { +; CHECK-LABEL: vaddd_su64: +; CHECK: add d0, d1, d0 +; CHECK-NEXT: ret +; GENERIC-LABEL: vaddd_su64: +; GENERIC: add d0, d1, d0 +; GENERIC-NEXT: ret + %vecext = extractelement <2 x i64> %a, 
i32 0 + %vecext1 = extractelement <2 x i64> %b, i32 0 + %add.i = add nsw i64 %vecext1, %vecext + %retval = bitcast i64 %add.i to double + ret double %retval +} + +; sub MI doesn't access dsub register. +define double @add_sub_su64(<2 x i64> %a, <2 x i64> %b) nounwind readnone { +; CHECK-LABEL: add_sub_su64: +; CHECK: add d0, d1, d0 +; CHECK: sub d0, {{d[0-9]+}}, d0 +; CHECK-NEXT: ret +; GENERIC-LABEL: add_sub_su64: +; GENERIC: add d0, d1, d0 +; GENERIC: sub d0, {{d[0-9]+}}, d0 +; GENERIC-NEXT: ret + %vecext = extractelement <2 x i64> %a, i32 0 + %vecext1 = extractelement <2 x i64> %b, i32 0 + %add.i = add i64 %vecext1, %vecext + %sub.i = sub i64 0, %add.i + %retval = bitcast i64 %sub.i to double + ret double %retval +} diff --git a/test/CodeGen/AArch64/arm64-aapcs.ll b/test/CodeGen/AArch64/arm64-aapcs.ll new file mode 100644 index 00000000000..b713f0d5a53 --- /dev/null +++ b/test/CodeGen/AArch64/arm64-aapcs.ll @@ -0,0 +1,103 @@ +; RUN: llc -mtriple=arm64-linux-gnu -enable-misched=false < %s | FileCheck %s + +@var = global i32 0, align 4 + +define i128 @test_i128_align(i32, i128 %arg, i32 %after) { + store i32 %after, i32* @var, align 4 +; CHECK: str w4, [{{x[0-9]+}}, :lo12:var] + + ret i128 %arg +; CHECK: mov x0, x2 +; CHECK: mov x1, x3 +} + +@var64 = global i64 0, align 8 + + ; Check stack slots are 64-bit at all times. +define void @test_stack_slots([8 x i32], i1 %bool, i8 %char, i16 %short, + i32 %int, i64 %long) { + ; Part of last store. Blasted scheduler. +; CHECK: ldr [[LONG:x[0-9]+]], [sp, #32] + + %ext_bool = zext i1 %bool to i64 + store volatile i64 %ext_bool, i64* @var64, align 8 +; CHECK: ldrb w[[EXT:[0-9]+]], [sp] +; CHECK: and x[[EXTED:[0-9]+]], x[[EXT]], #0x1 +; CHECK: str x[[EXTED]], [{{x[0-9]+}}, :lo12:var64] + + %ext_char = zext i8 %char to i64 + store volatile i64 %ext_char, i64* @var64, align 8 +; CHECK: ldrb w[[EXT:[0-9]+]], [sp, #8] +; CHECK: str x[[EXT]], [{{x[0-9]+}}, :lo12:var64] + + %ext_short = zext i16 %short to i64 + store volatile i64 %ext_short, i64* @var64, align 8 +; CHECK: ldrh w[[EXT:[0-9]+]], [sp, #16] +; CHECK: str x[[EXT]], [{{x[0-9]+}}, :lo12:var64] + + %ext_int = zext i32 %int to i64 + store volatile i64 %ext_int, i64* @var64, align 8 +; CHECK: ldr{{b?}} w[[EXT:[0-9]+]], [sp, #24] +; CHECK: str x[[EXT]], [{{x[0-9]+}}, :lo12:var64] + + store volatile i64 %long, i64* @var64, align 8 +; CHECK: str [[LONG]], [{{x[0-9]+}}, :lo12:var64] + + ret void +} + +; Make sure the callee does extensions (in the absence of zext/sext +; keyword on args) while we're here. + +define void @test_extension(i1 %bool, i8 %char, i16 %short, i32 %int) { + %ext_bool = zext i1 %bool to i64 + store volatile i64 %ext_bool, i64* @var64 +; CHECK: and [[EXT:x[0-9]+]], x0, #0x1 +; CHECK: str [[EXT]], [{{x[0-9]+}}, :lo12:var64] + + %ext_char = sext i8 %char to i64 + store volatile i64 %ext_char, i64* @var64 +; CHECK: sxtb [[EXT:x[0-9]+]], w1 +; CHECK: str [[EXT]], [{{x[0-9]+}}, :lo12:var64] + + %ext_short = zext i16 %short to i64 + store volatile i64 %ext_short, i64* @var64 +; CHECK: and [[EXT:x[0-9]+]], x2, #0xffff +; CHECK: str [[EXT]], [{{x[0-9]+}}, :lo12:var64] + + %ext_int = zext i32 %int to i64 + store volatile i64 %ext_int, i64* @var64 +; CHECK: ubfx [[EXT:x[0-9]+]], x3, #0, #32 +; CHECK: str [[EXT]], [{{x[0-9]+}}, :lo12:var64] + + ret void +} + +declare void @variadic(i32 %a, ...) + + ; Under AAPCS variadic functions have the same calling convention as + ; others. The extra arguments should go in registers rather than on the stack. 
+define void @test_variadic() { + call void(i32, ...)* @variadic(i32 0, i64 1, double 2.0) +; CHECK: fmov d0, #2.0 +; CHECK: orr w1, wzr, #0x1 +; CHECK: bl variadic + ret void +} + +; We weren't marking x7 as used after deciding that the i128 didn't fit into +; registers and putting the first half on the stack, so the *second* half went +; into x7. Yuck! +define i128 @test_i128_shadow([7 x i64] %x0_x6, i128 %sp) { +; CHECK-LABEL: test_i128_shadow: +; CHECK: ldp x0, x1, [sp] + + ret i128 %sp +} + +; This test is to check if fp128 can be correctly handled on stack. +define fp128 @test_fp128([8 x float] %arg0, fp128 %arg1) { +; CHECK-LABEL: test_fp128: +; CHECK: ldr {{q[0-9]+}}, [sp] + ret fp128 %arg1 +} diff --git a/test/CodeGen/AArch64/arm64-abi-varargs.ll b/test/CodeGen/AArch64/arm64-abi-varargs.ll new file mode 100644 index 00000000000..92db392cd04 --- /dev/null +++ b/test/CodeGen/AArch64/arm64-abi-varargs.ll @@ -0,0 +1,191 @@ +; RUN: llc < %s -march=arm64 -mcpu=cyclone -enable-misched=false | FileCheck %s +target triple = "arm64-apple-ios7.0.0" + +; rdar://13625505 +; Here we have 9 fixed integer arguments the 9th argument in on stack, the +; varargs start right after at 8-byte alignment. +define void @fn9(i32 %a1, i32 %a2, i32 %a3, i32 %a4, i32 %a5, i32 %a6, i32 %a7, i32 %a8, i32 %a9, ...) nounwind noinline ssp { +; CHECK-LABEL: fn9: +; 9th fixed argument +; CHECK: ldr {{w[0-9]+}}, [sp, #64] +; CHECK: add [[ARGS:x[0-9]+]], sp, #72 +; CHECK: add {{x[0-9]+}}, [[ARGS]], #8 +; First vararg +; CHECK: ldr {{w[0-9]+}}, [sp, #72] +; CHECK: add {{x[0-9]+}}, {{x[0-9]+}}, #8 +; Second vararg +; CHECK: ldr {{w[0-9]+}}, [{{x[0-9]+}}] +; CHECK: add {{x[0-9]+}}, {{x[0-9]+}}, #8 +; Third vararg +; CHECK: ldr {{w[0-9]+}}, [{{x[0-9]+}}] + %1 = alloca i32, align 4 + %2 = alloca i32, align 4 + %3 = alloca i32, align 4 + %4 = alloca i32, align 4 + %5 = alloca i32, align 4 + %6 = alloca i32, align 4 + %7 = alloca i32, align 4 + %8 = alloca i32, align 4 + %9 = alloca i32, align 4 + %args = alloca i8*, align 8 + %a10 = alloca i32, align 4 + %a11 = alloca i32, align 4 + %a12 = alloca i32, align 4 + store i32 %a1, i32* %1, align 4 + store i32 %a2, i32* %2, align 4 + store i32 %a3, i32* %3, align 4 + store i32 %a4, i32* %4, align 4 + store i32 %a5, i32* %5, align 4 + store i32 %a6, i32* %6, align 4 + store i32 %a7, i32* %7, align 4 + store i32 %a8, i32* %8, align 4 + store i32 %a9, i32* %9, align 4 + %10 = bitcast i8** %args to i8* + call void @llvm.va_start(i8* %10) + %11 = va_arg i8** %args, i32 + store i32 %11, i32* %a10, align 4 + %12 = va_arg i8** %args, i32 + store i32 %12, i32* %a11, align 4 + %13 = va_arg i8** %args, i32 + store i32 %13, i32* %a12, align 4 + ret void +} + +declare void @llvm.va_start(i8*) nounwind + +define i32 @main() nounwind ssp { +; CHECK-LABEL: main: +; CHECK: stp {{x[0-9]+}}, {{x[0-9]+}}, [sp, #16] +; CHECK: str {{x[0-9]+}}, [sp, #8] +; CHECK: str {{w[0-9]+}}, [sp] + %a1 = alloca i32, align 4 + %a2 = alloca i32, align 4 + %a3 = alloca i32, align 4 + %a4 = alloca i32, align 4 + %a5 = alloca i32, align 4 + %a6 = alloca i32, align 4 + %a7 = alloca i32, align 4 + %a8 = alloca i32, align 4 + %a9 = alloca i32, align 4 + %a10 = alloca i32, align 4 + %a11 = alloca i32, align 4 + %a12 = alloca i32, align 4 + store i32 1, i32* %a1, align 4 + store i32 2, i32* %a2, align 4 + store i32 3, i32* %a3, align 4 + store i32 4, i32* %a4, align 4 + store i32 5, i32* %a5, align 4 + store i32 6, i32* %a6, align 4 + store i32 7, i32* %a7, align 4 + store i32 8, i32* %a8, align 4 + store i32 9, i32* %a9, 
align 4 + store i32 10, i32* %a10, align 4 + store i32 11, i32* %a11, align 4 + store i32 12, i32* %a12, align 4 + %1 = load i32* %a1, align 4 + %2 = load i32* %a2, align 4 + %3 = load i32* %a3, align 4 + %4 = load i32* %a4, align 4 + %5 = load i32* %a5, align 4 + %6 = load i32* %a6, align 4 + %7 = load i32* %a7, align 4 + %8 = load i32* %a8, align 4 + %9 = load i32* %a9, align 4 + %10 = load i32* %a10, align 4 + %11 = load i32* %a11, align 4 + %12 = load i32* %a12, align 4 + call void (i32, i32, i32, i32, i32, i32, i32, i32, i32, ...)* @fn9(i32 %1, i32 %2, i32 %3, i32 %4, i32 %5, i32 %6, i32 %7, i32 %8, i32 %9, i32 %10, i32 %11, i32 %12) + ret i32 0 +} + +;rdar://13668483 +@.str = private unnamed_addr constant [4 x i8] c"fmt\00", align 1 +define void @foo(i8* %fmt, ...) nounwind { +entry: +; CHECK-LABEL: foo: +; CHECK: orr {{x[0-9]+}}, {{x[0-9]+}}, #0x8 +; CHECK: ldr {{w[0-9]+}}, [sp, #48] +; CHECK: add {{x[0-9]+}}, {{x[0-9]+}}, #15 +; CHECK: and x[[ADDR:[0-9]+]], {{x[0-9]+}}, #0xfffffffffffffff0 +; CHECK: ldr {{q[0-9]+}}, [x[[ADDR]]] + %fmt.addr = alloca i8*, align 8 + %args = alloca i8*, align 8 + %vc = alloca i32, align 4 + %vv = alloca <4 x i32>, align 16 + store i8* %fmt, i8** %fmt.addr, align 8 + %args1 = bitcast i8** %args to i8* + call void @llvm.va_start(i8* %args1) + %0 = va_arg i8** %args, i32 + store i32 %0, i32* %vc, align 4 + %1 = va_arg i8** %args, <4 x i32> + store <4 x i32> %1, <4 x i32>* %vv, align 16 + ret void +} + +define void @bar(i32 %x, <4 x i32> %y) nounwind { +entry: +; CHECK-LABEL: bar: +; CHECK: str {{q[0-9]+}}, [sp, #16] +; CHECK: str {{x[0-9]+}}, [sp] + %x.addr = alloca i32, align 4 + %y.addr = alloca <4 x i32>, align 16 + store i32 %x, i32* %x.addr, align 4 + store <4 x i32> %y, <4 x i32>* %y.addr, align 16 + %0 = load i32* %x.addr, align 4 + %1 = load <4 x i32>* %y.addr, align 16 + call void (i8*, ...)* @foo(i8* getelementptr inbounds ([4 x i8]* @.str, i32 0, i32 0), i32 %0, <4 x i32> %1) + ret void +} + +; rdar://13668927 +; When passing 16-byte aligned small structs as vararg, make sure the caller +; side is 16-byte aligned on stack. +%struct.s41 = type { i32, i16, i32, i16 } +define void @foo2(i8* %fmt, ...) 
nounwind { +entry: +; CHECK-LABEL: foo2: +; CHECK: orr {{x[0-9]+}}, {{x[0-9]+}}, #0x8 +; CHECK: ldr {{w[0-9]+}}, [sp, #48] +; CHECK: add {{x[0-9]+}}, {{x[0-9]+}}, #15 +; CHECK: and x[[ADDR:[0-9]+]], {{x[0-9]+}}, #0xfffffffffffffff0 +; CHECK: ldr {{q[0-9]+}}, [x[[ADDR]]] + %fmt.addr = alloca i8*, align 8 + %args = alloca i8*, align 8 + %vc = alloca i32, align 4 + %vs = alloca %struct.s41, align 16 + store i8* %fmt, i8** %fmt.addr, align 8 + %args1 = bitcast i8** %args to i8* + call void @llvm.va_start(i8* %args1) + %0 = va_arg i8** %args, i32 + store i32 %0, i32* %vc, align 4 + %ap.cur = load i8** %args + %1 = getelementptr i8* %ap.cur, i32 15 + %2 = ptrtoint i8* %1 to i64 + %3 = and i64 %2, -16 + %ap.align = inttoptr i64 %3 to i8* + %ap.next = getelementptr i8* %ap.align, i32 16 + store i8* %ap.next, i8** %args + %4 = bitcast i8* %ap.align to %struct.s41* + %5 = bitcast %struct.s41* %vs to i8* + %6 = bitcast %struct.s41* %4 to i8* + call void @llvm.memcpy.p0i8.p0i8.i64(i8* %5, i8* %6, i64 16, i32 16, i1 false) + ret void +} +declare void @llvm.memcpy.p0i8.p0i8.i64(i8* nocapture, i8* nocapture, i64, i32, i1) nounwind + +define void @bar2(i32 %x, i128 %s41.coerce) nounwind { +entry: +; CHECK-LABEL: bar2: +; CHECK: stp {{x[0-9]+}}, {{x[0-9]+}}, [sp, #16] +; CHECK: str {{x[0-9]+}}, [sp] + %x.addr = alloca i32, align 4 + %s41 = alloca %struct.s41, align 16 + store i32 %x, i32* %x.addr, align 4 + %0 = bitcast %struct.s41* %s41 to i128* + store i128 %s41.coerce, i128* %0, align 1 + %1 = load i32* %x.addr, align 4 + %2 = bitcast %struct.s41* %s41 to i128* + %3 = load i128* %2, align 1 + call void (i8*, ...)* @foo2(i8* getelementptr inbounds ([4 x i8]* @.str, i32 0, i32 0), i32 %1, i128 %3) + ret void +} diff --git a/test/CodeGen/AArch64/arm64-abi.ll b/test/CodeGen/AArch64/arm64-abi.ll new file mode 100644 index 00000000000..e2de434c7b0 --- /dev/null +++ b/test/CodeGen/AArch64/arm64-abi.ll @@ -0,0 +1,238 @@ +; RUN: llc < %s -march=arm64 -mcpu=cyclone -enable-misched=false | FileCheck %s +; RUN: llc < %s -O0 | FileCheck -check-prefix=FAST %s +target triple = "arm64-apple-darwin" + +; rdar://9932559 +define i64 @i8i16callee(i64 %a1, i64 %a2, i64 %a3, i8 signext %a4, i16 signext %a5, i64 %a6, i64 %a7, i64 %a8, i8 signext %b1, i16 signext %b2, i8 signext %b3, i8 signext %b4) nounwind readnone noinline { +entry: +; CHECK-LABEL: i8i16callee: +; The 8th, 9th, 10th and 11th arguments are passed at sp, sp+2, sp+4, sp+5. +; They are i8, i16, i8 and i8. 
+; CHECK: ldrsb {{w[0-9]+}}, [sp, #5]
+; CHECK: ldrsh {{w[0-9]+}}, [sp, #2]
+; CHECK: ldrsb {{w[0-9]+}}, [sp]
+; CHECK: ldrsb {{w[0-9]+}}, [sp, #4]
+; FAST-LABEL: i8i16callee:
+; FAST: ldrb {{w[0-9]+}}, [sp, #5]
+; FAST: ldrb {{w[0-9]+}}, [sp, #4]
+; FAST: ldrh {{w[0-9]+}}, [sp, #2]
+; FAST: ldrb {{w[0-9]+}}, [sp]
+ %conv = sext i8 %a4 to i64
+ %conv3 = sext i16 %a5 to i64
+ %conv8 = sext i8 %b1 to i64
+ %conv9 = sext i16 %b2 to i64
+ %conv11 = sext i8 %b3 to i64
+ %conv13 = sext i8 %b4 to i64
+ %add10 = add i64 %a2, %a1
+ %add12 = add i64 %add10, %a3
+ %add14 = add i64 %add12, %conv
+ %add = add i64 %add14, %conv3
+ %add1 = add i64 %add, %a6
+ %add2 = add i64 %add1, %a7
+ %add4 = add i64 %add2, %a8
+ %add5 = add i64 %add4, %conv8
+ %add6 = add i64 %add5, %conv9
+ %add7 = add i64 %add6, %conv11
+ %add15 = add i64 %add7, %conv13
+ %sext = shl i64 %add15, 32
+ %conv17 = ashr exact i64 %sext, 32
+ ret i64 %conv17
+}
+
+define i32 @i8i16caller() nounwind readnone {
+entry:
+; CHECK: i8i16caller
+; The 8th, 9th, 10th and 11th arguments are passed at sp, sp+2, sp+4, sp+5.
+; They are i8, i16, i8 and i8.
+; CHECK: strb {{w[0-9]+}}, [sp, #5]
+; CHECK: strb {{w[0-9]+}}, [sp, #4]
+; CHECK: strh {{w[0-9]+}}, [sp, #2]
+; CHECK: strb {{w[0-9]+}}, [sp]
+; CHECK: bl
+; FAST: i8i16caller
+; FAST: strb {{w[0-9]+}}, [sp]
+; FAST: strh {{w[0-9]+}}, [sp, #2]
+; FAST: strb {{w[0-9]+}}, [sp, #4]
+; FAST: strb {{w[0-9]+}}, [sp, #5]
+; FAST: bl
+ %call = tail call i64 @i8i16callee(i64 0, i64 1, i64 2, i8 signext 3, i16 signext 4, i64 5, i64 6, i64 7, i8 signext 97, i16 signext 98, i8 signext 99, i8 signext 100)
+ %conv = trunc i64 %call to i32
+ ret i32 %conv
+}
+
+; rdar://12651543
+define double @circle_center([2 x float] %a) nounwind ssp {
+ %call = tail call double @ext([2 x float] %a) nounwind
+; CHECK: circle_center
+; CHECK: bl
+ ret double %call
+}
+declare double @ext([2 x float])
+
+; rdar://12656141
+; 16-byte vector should be aligned at 16-byte when passing on stack.
+; A double argument will be passed on stack, so vector should be at sp+16.
+define double @fixed_4i(<4 x i32>* nocapture %in) nounwind {
+entry:
+; CHECK: fixed_4i
+; CHECK: str [[REG_1:q[0-9]+]], [sp, #16]
+; FAST: fixed_4i
+; FAST: sub sp, sp, #64
+; FAST: mov x[[ADDR:[0-9]+]], sp
+; FAST: str [[REG_1:q[0-9]+]], [x[[ADDR]], #16]
+ %0 = load <4 x i32>* %in, align 16
+ %call = tail call double @args_vec_4i(double 3.000000e+00, <4 x i32> %0, <4 x i32> %0, <4 x i32> %0, <4 x i32> %0, <4 x i32> %0, <4 x i32> %0, <4 x i32> %0, double 3.000000e+00, <4 x i32> %0, i8 signext 3)
+ ret double %call
+}
+declare double @args_vec_4i(double, <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, double, <4 x i32>, i8 signext)
+
+; rdar://12695237
+; d8 at sp, i in register w0.
+@g_d = common global double 0.000000e+00, align 8
+define void @test1(float %f1, double %d1, double %d2, double %d3, double %d4,
+ double %d5, double %d6, double %d7, double %d8, i32 %i) nounwind ssp {
+entry:
+; CHECK: test1
+; CHECK: ldr [[REG_1:d[0-9]+]], [sp]
+; CHECK: scvtf [[REG_2:s[0-9]+]], w0
+; CHECK: fadd s0, [[REG_2]], s0
+ %conv = sitofp i32 %i to float
+ %add = fadd float %conv, %f1
+ %conv1 = fpext float %add to double
+ %add2 = fadd double %conv1, %d7
+ %add3 = fadd double %add2, %d8
+ store double %add3, double* @g_d, align 8
+ ret void
+}
+
+; i9 at sp, d1 in register s0.
+define void @test2(i32 %i1, i32 %i2, i32 %i3, i32 %i4, i32 %i5, i32 %i6, + i32 %i7, i32 %i8, i32 %i9, float %d1) nounwind ssp { +entry: +; CHECK: test2 +; CHECK: scvtf [[REG_2:s[0-9]+]], w0 +; CHECK: fadd s0, [[REG_2]], s0 +; CHECK: ldr [[REG_1:s[0-9]+]], [sp] + %conv = sitofp i32 %i1 to float + %add = fadd float %conv, %d1 + %conv1 = fpext float %add to double + %conv2 = sitofp i32 %i8 to double + %add3 = fadd double %conv2, %conv1 + %conv4 = sitofp i32 %i9 to double + %add5 = fadd double %conv4, %add3 + store double %add5, double* @g_d, align 8 + ret void +} + +; rdar://12648441 +; Check alignment on stack for v64, f64, i64, f32, i32. +define double @test3(<2 x i32>* nocapture %in) nounwind { +entry: +; CHECK: test3 +; CHECK: str [[REG_1:d[0-9]+]], [sp, #8] +; FAST: test3 +; FAST: sub sp, sp, #32 +; FAST: mov x[[ADDR:[0-9]+]], sp +; FAST: str [[REG_1:d[0-9]+]], [x[[ADDR]], #8] + %0 = load <2 x i32>* %in, align 8 + %call = tail call double @args_vec_2i(double 3.000000e+00, <2 x i32> %0, + <2 x i32> %0, <2 x i32> %0, <2 x i32> %0, <2 x i32> %0, <2 x i32> %0, + <2 x i32> %0, float 3.000000e+00, <2 x i32> %0, i8 signext 3) + ret double %call +} +declare double @args_vec_2i(double, <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, + <2 x i32>, <2 x i32>, <2 x i32>, float, <2 x i32>, i8 signext) + +define double @test4(double* nocapture %in) nounwind { +entry: +; CHECK: test4 +; CHECK: str [[REG_1:d[0-9]+]], [sp, #8] +; CHECK: str [[REG_2:w[0-9]+]], [sp] +; CHECK: orr w0, wzr, #0x3 + %0 = load double* %in, align 8 + %call = tail call double @args_f64(double 3.000000e+00, double %0, double %0, + double %0, double %0, double %0, double %0, double %0, + float 3.000000e+00, double %0, i8 signext 3) + ret double %call +} +declare double @args_f64(double, double, double, double, double, double, double, + double, float, double, i8 signext) + +define i64 @test5(i64* nocapture %in) nounwind { +entry: +; CHECK: test5 +; CHECK: strb [[REG_3:w[0-9]+]], [sp, #16] +; CHECK: str [[REG_1:x[0-9]+]], [sp, #8] +; CHECK: str [[REG_2:w[0-9]+]], [sp] + %0 = load i64* %in, align 8 + %call = tail call i64 @args_i64(i64 3, i64 %0, i64 %0, i64 %0, i64 %0, i64 %0, + i64 %0, i64 %0, i32 3, i64 %0, i8 signext 3) + ret i64 %call +} +declare i64 @args_i64(i64, i64, i64, i64, i64, i64, i64, i64, i32, i64, + i8 signext) + +define i32 @test6(float* nocapture %in) nounwind { +entry: +; CHECK: test6 +; CHECK: strb [[REG_2:w[0-9]+]], [sp, #8] +; CHECK: str [[REG_1:s[0-9]+]], [sp, #4] +; CHECK: strh [[REG_3:w[0-9]+]], [sp] + %0 = load float* %in, align 4 + %call = tail call i32 @args_f32(i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, + i32 7, i32 8, float 1.0, float 2.0, float 3.0, float 4.0, float 5.0, + float 6.0, float 7.0, float 8.0, i16 signext 3, float %0, + i8 signext 3) + ret i32 %call +} +declare i32 @args_f32(i32, i32, i32, i32, i32, i32, i32, i32, + float, float, float, float, float, float, float, float, + i16 signext, float, i8 signext) + +define i32 @test7(i32* nocapture %in) nounwind { +entry: +; CHECK: test7 +; CHECK: strb [[REG_2:w[0-9]+]], [sp, #8] +; CHECK: str [[REG_1:w[0-9]+]], [sp, #4] +; CHECK: strh [[REG_3:w[0-9]+]], [sp] + %0 = load i32* %in, align 4 + %call = tail call i32 @args_i32(i32 3, i32 %0, i32 %0, i32 %0, i32 %0, i32 %0, + i32 %0, i32 %0, i16 signext 3, i32 %0, i8 signext 4) + ret i32 %call +} +declare i32 @args_i32(i32, i32, i32, i32, i32, i32, i32, i32, i16 signext, i32, + i8 signext) + +define i32 @test8(i32 %argc, i8** nocapture %argv) nounwind { +entry: +; CHECK: test8 +; CHECK: strb {{w[0-9]+}}, [sp, 
#3] +; CHECK: strb wzr, [sp, #2] +; CHECK: strb {{w[0-9]+}}, [sp, #1] +; CHECK: strb wzr, [sp] +; CHECK: bl +; FAST: test8 +; FAST: strb {{w[0-9]+}}, [sp] +; FAST: strb {{w[0-9]+}}, [sp, #1] +; FAST: strb {{w[0-9]+}}, [sp, #2] +; FAST: strb {{w[0-9]+}}, [sp, #3] +; FAST: bl + tail call void @args_i1(i1 zeroext false, i1 zeroext true, i1 zeroext false, + i1 zeroext true, i1 zeroext false, i1 zeroext true, + i1 zeroext false, i1 zeroext true, i1 zeroext false, + i1 zeroext true, i1 zeroext false, i1 zeroext true) + ret i32 0 +} + +declare void @args_i1(i1 zeroext, i1 zeroext, i1 zeroext, i1 zeroext, + i1 zeroext, i1 zeroext, i1 zeroext, i1 zeroext, + i1 zeroext, i1 zeroext, i1 zeroext, i1 zeroext) + +define i32 @i1_stack_incoming(i64 %a, i64 %b, i64 %c, i64 %d, i64 %e, i64 %f, + i64 %g, i64 %h, i64 %i, i1 zeroext %j) { +; CHECK-LABEL: i1_stack_incoming: +; CHECK: ldrb w0, [sp, #8] +; CHECK: ret + %v = zext i1 %j to i32 + ret i32 %v +} diff --git a/test/CodeGen/AArch64/arm64-abi_align.ll b/test/CodeGen/AArch64/arm64-abi_align.ll new file mode 100644 index 00000000000..44c5a07ce39 --- /dev/null +++ b/test/CodeGen/AArch64/arm64-abi_align.ll @@ -0,0 +1,532 @@ +; RUN: llc < %s -march=arm64 -mcpu=cyclone -enable-misched=false | FileCheck %s +; RUN: llc < %s -O0 | FileCheck -check-prefix=FAST %s +target triple = "arm64-apple-darwin" + +; rdar://12648441 +; Generated from arm64-arguments.c with -O2. +; Test passing structs with size < 8, < 16 and > 16 +; with alignment of 16 and without + +; Structs with size < 8 +%struct.s38 = type { i32, i16 } +; With alignment of 16, the size will be padded to multiple of 16 bytes. +%struct.s39 = type { i32, i16, [10 x i8] } +; Structs with size < 16 +%struct.s40 = type { i32, i16, i32, i16 } +%struct.s41 = type { i32, i16, i32, i16 } +; Structs with size > 16 +%struct.s42 = type { i32, i16, i32, i16, i32, i16 } +%struct.s43 = type { i32, i16, i32, i16, i32, i16, [10 x i8] } + +@g38 = common global %struct.s38 zeroinitializer, align 4 +@g38_2 = common global %struct.s38 zeroinitializer, align 4 +@g39 = common global %struct.s39 zeroinitializer, align 16 +@g39_2 = common global %struct.s39 zeroinitializer, align 16 +@g40 = common global %struct.s40 zeroinitializer, align 4 +@g40_2 = common global %struct.s40 zeroinitializer, align 4 +@g41 = common global %struct.s41 zeroinitializer, align 16 +@g41_2 = common global %struct.s41 zeroinitializer, align 16 +@g42 = common global %struct.s42 zeroinitializer, align 4 +@g42_2 = common global %struct.s42 zeroinitializer, align 4 +@g43 = common global %struct.s43 zeroinitializer, align 16 +@g43_2 = common global %struct.s43 zeroinitializer, align 16 + +; structs with size < 8 bytes, passed via i64 in x1 and x2 +define i32 @f38(i32 %i, i64 %s1.coerce, i64 %s2.coerce) #0 { +entry: +; CHECK: f38 +; CHECK: add w[[A:[0-9]+]], w1, w0 +; CHECK: add {{w[0-9]+}}, w[[A]], w2 + %s1.sroa.0.0.extract.trunc = trunc i64 %s1.coerce to i32 + %s1.sroa.1.4.extract.shift = lshr i64 %s1.coerce, 32 + %s2.sroa.0.0.extract.trunc = trunc i64 %s2.coerce to i32 + %s2.sroa.1.4.extract.shift = lshr i64 %s2.coerce, 32 + %sext8 = shl nuw nsw i64 %s1.sroa.1.4.extract.shift, 16 + %sext = trunc i64 %sext8 to i32 + %conv = ashr exact i32 %sext, 16 + %sext1011 = shl nuw nsw i64 %s2.sroa.1.4.extract.shift, 16 + %sext10 = trunc i64 %sext1011 to i32 + %conv6 = ashr exact i32 %sext10, 16 + %add = add i32 %s1.sroa.0.0.extract.trunc, %i + %add3 = add i32 %add, %s2.sroa.0.0.extract.trunc + %add4 = add i32 %add3, %conv + %add7 = add i32 %add4, %conv6 + ret i32 %add7 
+} + +define i32 @caller38() #1 { +entry: +; CHECK: caller38 +; CHECK: ldr x1, +; CHECK: ldr x2, + %0 = load i64* bitcast (%struct.s38* @g38 to i64*), align 4 + %1 = load i64* bitcast (%struct.s38* @g38_2 to i64*), align 4 + %call = tail call i32 @f38(i32 3, i64 %0, i64 %1) #5 + ret i32 %call +} + +declare i32 @f38_stack(i32 %i, i32 %i2, i32 %i3, i32 %i4, i32 %i5, i32 %i6, + i32 %i7, i32 %i8, i32 %i9, i64 %s1.coerce, i64 %s2.coerce) #0 + +; structs with size < 8 bytes, passed on stack at [sp+8] and [sp+16] +; i9 at [sp] +define i32 @caller38_stack() #1 { +entry: +; CHECK: caller38_stack +; CHECK: stp {{x[0-9]+}}, {{x[0-9]+}}, [sp, #8] +; CHECK: movz w[[C:[0-9]+]], #0x9 +; CHECK: str w[[C]], [sp] + %0 = load i64* bitcast (%struct.s38* @g38 to i64*), align 4 + %1 = load i64* bitcast (%struct.s38* @g38_2 to i64*), align 4 + %call = tail call i32 @f38_stack(i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, + i32 7, i32 8, i32 9, i64 %0, i64 %1) #5 + ret i32 %call +} + +; structs with size < 8 bytes, alignment of 16 +; passed via i128 in x1 and x3 +define i32 @f39(i32 %i, i128 %s1.coerce, i128 %s2.coerce) #0 { +entry: +; CHECK: f39 +; CHECK: add w[[A:[0-9]+]], w1, w0 +; CHECK: add {{w[0-9]+}}, w[[A]], w3 + %s1.sroa.0.0.extract.trunc = trunc i128 %s1.coerce to i32 + %s1.sroa.1.4.extract.shift = lshr i128 %s1.coerce, 32 + %s2.sroa.0.0.extract.trunc = trunc i128 %s2.coerce to i32 + %s2.sroa.1.4.extract.shift = lshr i128 %s2.coerce, 32 + %sext8 = shl nuw nsw i128 %s1.sroa.1.4.extract.shift, 16 + %sext = trunc i128 %sext8 to i32 + %conv = ashr exact i32 %sext, 16 + %sext1011 = shl nuw nsw i128 %s2.sroa.1.4.extract.shift, 16 + %sext10 = trunc i128 %sext1011 to i32 + %conv6 = ashr exact i32 %sext10, 16 + %add = add i32 %s1.sroa.0.0.extract.trunc, %i + %add3 = add i32 %add, %s2.sroa.0.0.extract.trunc + %add4 = add i32 %add3, %conv + %add7 = add i32 %add4, %conv6 + ret i32 %add7 +} + +define i32 @caller39() #1 { +entry: +; CHECK: caller39 +; CHECK: ldp x1, x2, +; CHECK: ldp x3, x4, + %0 = load i128* bitcast (%struct.s39* @g39 to i128*), align 16 + %1 = load i128* bitcast (%struct.s39* @g39_2 to i128*), align 16 + %call = tail call i32 @f39(i32 3, i128 %0, i128 %1) #5 + ret i32 %call +} + +declare i32 @f39_stack(i32 %i, i32 %i2, i32 %i3, i32 %i4, i32 %i5, i32 %i6, + i32 %i7, i32 %i8, i32 %i9, i128 %s1.coerce, i128 %s2.coerce) #0 + +; structs with size < 8 bytes, alignment 16 +; passed on stack at [sp+16] and [sp+32] +define i32 @caller39_stack() #1 { +entry: +; CHECK: caller39_stack +; CHECK: stp {{x[0-9]+}}, {{x[0-9]+}}, [sp, #32] +; CHECK: stp {{x[0-9]+}}, {{x[0-9]+}}, [sp, #16] +; CHECK: movz w[[C:[0-9]+]], #0x9 +; CHECK: str w[[C]], [sp] + %0 = load i128* bitcast (%struct.s39* @g39 to i128*), align 16 + %1 = load i128* bitcast (%struct.s39* @g39_2 to i128*), align 16 + %call = tail call i32 @f39_stack(i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, + i32 7, i32 8, i32 9, i128 %0, i128 %1) #5 + ret i32 %call +} + +; structs with size < 16 bytes +; passed via i128 in x1 and x3 +define i32 @f40(i32 %i, [2 x i64] %s1.coerce, [2 x i64] %s2.coerce) #0 { +entry: +; CHECK: f40 +; CHECK: add w[[A:[0-9]+]], w1, w0 +; CHECK: add {{w[0-9]+}}, w[[A]], w3 + %s1.coerce.fca.0.extract = extractvalue [2 x i64] %s1.coerce, 0 + %s2.coerce.fca.0.extract = extractvalue [2 x i64] %s2.coerce, 0 + %s1.sroa.0.0.extract.trunc = trunc i64 %s1.coerce.fca.0.extract to i32 + %s2.sroa.0.0.extract.trunc = trunc i64 %s2.coerce.fca.0.extract to i32 + %s1.sroa.0.4.extract.shift = lshr i64 %s1.coerce.fca.0.extract, 32 + %sext8 = shl nuw nsw i64 
%s1.sroa.0.4.extract.shift, 16 + %sext = trunc i64 %sext8 to i32 + %conv = ashr exact i32 %sext, 16 + %s2.sroa.0.4.extract.shift = lshr i64 %s2.coerce.fca.0.extract, 32 + %sext1011 = shl nuw nsw i64 %s2.sroa.0.4.extract.shift, 16 + %sext10 = trunc i64 %sext1011 to i32 + %conv6 = ashr exact i32 %sext10, 16 + %add = add i32 %s1.sroa.0.0.extract.trunc, %i + %add3 = add i32 %add, %s2.sroa.0.0.extract.trunc + %add4 = add i32 %add3, %conv + %add7 = add i32 %add4, %conv6 + ret i32 %add7 +} + +define i32 @caller40() #1 { +entry: +; CHECK: caller40 +; CHECK: ldp x1, x2, +; CHECK: ldp x3, x4, + %0 = load [2 x i64]* bitcast (%struct.s40* @g40 to [2 x i64]*), align 4 + %1 = load [2 x i64]* bitcast (%struct.s40* @g40_2 to [2 x i64]*), align 4 + %call = tail call i32 @f40(i32 3, [2 x i64] %0, [2 x i64] %1) #5 + ret i32 %call +} + +declare i32 @f40_stack(i32 %i, i32 %i2, i32 %i3, i32 %i4, i32 %i5, i32 %i6, + i32 %i7, i32 %i8, i32 %i9, [2 x i64] %s1.coerce, [2 x i64] %s2.coerce) #0 + +; structs with size < 16 bytes +; passed on stack at [sp+8] and [sp+24] +define i32 @caller40_stack() #1 { +entry: +; CHECK: caller40_stack +; CHECK: stp {{x[0-9]+}}, {{x[0-9]+}}, [sp, #24] +; CHECK: stp {{x[0-9]+}}, {{x[0-9]+}}, [sp, #8] +; CHECK: movz w[[C:[0-9]+]], #0x9 +; CHECK: str w[[C]], [sp] + %0 = load [2 x i64]* bitcast (%struct.s40* @g40 to [2 x i64]*), align 4 + %1 = load [2 x i64]* bitcast (%struct.s40* @g40_2 to [2 x i64]*), align 4 + %call = tail call i32 @f40_stack(i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, + i32 7, i32 8, i32 9, [2 x i64] %0, [2 x i64] %1) #5 + ret i32 %call +} + +; structs with size < 16 bytes, alignment of 16 +; passed via i128 in x1 and x3 +define i32 @f41(i32 %i, i128 %s1.coerce, i128 %s2.coerce) #0 { +entry: +; CHECK: f41 +; CHECK: add w[[A:[0-9]+]], w1, w0 +; CHECK: add {{w[0-9]+}}, w[[A]], w3 + %s1.sroa.0.0.extract.trunc = trunc i128 %s1.coerce to i32 + %s1.sroa.1.4.extract.shift = lshr i128 %s1.coerce, 32 + %s2.sroa.0.0.extract.trunc = trunc i128 %s2.coerce to i32 + %s2.sroa.1.4.extract.shift = lshr i128 %s2.coerce, 32 + %sext8 = shl nuw nsw i128 %s1.sroa.1.4.extract.shift, 16 + %sext = trunc i128 %sext8 to i32 + %conv = ashr exact i32 %sext, 16 + %sext1011 = shl nuw nsw i128 %s2.sroa.1.4.extract.shift, 16 + %sext10 = trunc i128 %sext1011 to i32 + %conv6 = ashr exact i32 %sext10, 16 + %add = add i32 %s1.sroa.0.0.extract.trunc, %i + %add3 = add i32 %add, %s2.sroa.0.0.extract.trunc + %add4 = add i32 %add3, %conv + %add7 = add i32 %add4, %conv6 + ret i32 %add7 +} + +define i32 @caller41() #1 { +entry: +; CHECK: caller41 +; CHECK: ldp x1, x2, +; CHECK: ldp x3, x4, + %0 = load i128* bitcast (%struct.s41* @g41 to i128*), align 16 + %1 = load i128* bitcast (%struct.s41* @g41_2 to i128*), align 16 + %call = tail call i32 @f41(i32 3, i128 %0, i128 %1) #5 + ret i32 %call +} + +declare i32 @f41_stack(i32 %i, i32 %i2, i32 %i3, i32 %i4, i32 %i5, i32 %i6, + i32 %i7, i32 %i8, i32 %i9, i128 %s1.coerce, i128 %s2.coerce) #0 + +; structs with size < 16 bytes, alignment of 16 +; passed on stack at [sp+16] and [sp+32] +define i32 @caller41_stack() #1 { +entry: +; CHECK: caller41_stack +; CHECK: stp {{x[0-9]+}}, {{x[0-9]+}}, [sp, #32] +; CHECK: stp {{x[0-9]+}}, {{x[0-9]+}}, [sp, #16] +; CHECK: movz w[[C:[0-9]+]], #0x9 +; CHECK: str w[[C]], [sp] + %0 = load i128* bitcast (%struct.s41* @g41 to i128*), align 16 + %1 = load i128* bitcast (%struct.s41* @g41_2 to i128*), align 16 + %call = tail call i32 @f41_stack(i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, + i32 7, i32 8, i32 9, i128 %0, i128 %1) #5 + ret i32 
%call +} + +; structs with size of 22 bytes, passed indirectly in x1 and x2 +define i32 @f42(i32 %i, %struct.s42* nocapture %s1, %struct.s42* nocapture %s2) #2 { +entry: +; CHECK: f42 +; CHECK: ldr w[[A:[0-9]+]], [x1] +; CHECK: ldr w[[B:[0-9]+]], [x2] +; CHECK: add w[[C:[0-9]+]], w[[A]], w0 +; CHECK: add {{w[0-9]+}}, w[[C]], w[[B]] +; FAST: f42 +; FAST: ldr w[[A:[0-9]+]], [x1] +; FAST: ldr w[[B:[0-9]+]], [x2] +; FAST: add w[[C:[0-9]+]], w[[A]], w0 +; FAST: add {{w[0-9]+}}, w[[C]], w[[B]] + %i1 = getelementptr inbounds %struct.s42* %s1, i64 0, i32 0 + %0 = load i32* %i1, align 4, !tbaa !0 + %i2 = getelementptr inbounds %struct.s42* %s2, i64 0, i32 0 + %1 = load i32* %i2, align 4, !tbaa !0 + %s = getelementptr inbounds %struct.s42* %s1, i64 0, i32 1 + %2 = load i16* %s, align 2, !tbaa !3 + %conv = sext i16 %2 to i32 + %s5 = getelementptr inbounds %struct.s42* %s2, i64 0, i32 1 + %3 = load i16* %s5, align 2, !tbaa !3 + %conv6 = sext i16 %3 to i32 + %add = add i32 %0, %i + %add3 = add i32 %add, %1 + %add4 = add i32 %add3, %conv + %add7 = add i32 %add4, %conv6 + ret i32 %add7 +} + +; For s1, we allocate a 22-byte space, pass its address via x1 +define i32 @caller42() #3 { +entry: +; CHECK: caller42 +; CHECK: str {{x[0-9]+}}, [sp, #48] +; CHECK: str {{q[0-9]+}}, [sp, #32] +; CHECK: str {{x[0-9]+}}, [sp, #16] +; CHECK: str {{q[0-9]+}}, [sp] +; CHECK: add x1, sp, #32 +; CHECK: mov x2, sp +; Space for s1 is allocated at sp+32 +; Space for s2 is allocated at sp + +; FAST: caller42 +; FAST: sub sp, sp, #96 +; Space for s1 is allocated at fp-24 = sp+72 +; Space for s2 is allocated at sp+48 +; FAST: sub x[[A:[0-9]+]], x29, #24 +; FAST: add x[[A:[0-9]+]], sp, #48 +; Call memcpy with size = 24 (0x18) +; FAST: orr {{x[0-9]+}}, xzr, #0x18 + %tmp = alloca %struct.s42, align 4 + %tmp1 = alloca %struct.s42, align 4 + %0 = bitcast %struct.s42* %tmp to i8* + call void @llvm.memcpy.p0i8.p0i8.i64(i8* %0, i8* bitcast (%struct.s42* @g42 to i8*), i64 24, i32 4, i1 false), !tbaa.struct !4 + %1 = bitcast %struct.s42* %tmp1 to i8* + call void @llvm.memcpy.p0i8.p0i8.i64(i8* %1, i8* bitcast (%struct.s42* @g42_2 to i8*), i64 24, i32 4, i1 false), !tbaa.struct !4 + %call = call i32 @f42(i32 3, %struct.s42* %tmp, %struct.s42* %tmp1) #5 + ret i32 %call +} + +declare void @llvm.memcpy.p0i8.p0i8.i64(i8* nocapture, i8* nocapture, i64, i32, i1) #4 + +declare i32 @f42_stack(i32 %i, i32 %i2, i32 %i3, i32 %i4, i32 %i5, i32 %i6, + i32 %i7, i32 %i8, i32 %i9, %struct.s42* nocapture %s1, + %struct.s42* nocapture %s2) #2 + +define i32 @caller42_stack() #3 { +entry: +; CHECK: caller42_stack +; CHECK: mov x29, sp +; CHECK: sub sp, sp, #96 +; CHECK: stur {{x[0-9]+}}, [x29, #-16] +; CHECK: stur {{q[0-9]+}}, [x29, #-32] +; CHECK: str {{x[0-9]+}}, [sp, #48] +; CHECK: str {{q[0-9]+}}, [sp, #32] +; Space for s1 is allocated at x29-32 = sp+64 +; Space for s2 is allocated at sp+32 +; CHECK: add x[[B:[0-9]+]], sp, #32 +; CHECK: str x[[B]], [sp, #16] +; CHECK: sub x[[A:[0-9]+]], x29, #32 +; Address of s1 is passed on stack at sp+8 +; CHECK: str x[[A]], [sp, #8] +; CHECK: movz w[[C:[0-9]+]], #0x9 +; CHECK: str w[[C]], [sp] + +; FAST: caller42_stack +; Space for s1 is allocated at fp-24 +; Space for s2 is allocated at fp-48 +; FAST: sub x[[A:[0-9]+]], x29, #24 +; FAST: sub x[[B:[0-9]+]], x29, #48 +; Call memcpy with size = 24 (0x18) +; FAST: orr {{x[0-9]+}}, xzr, #0x18 +; FAST: str {{w[0-9]+}}, [sp] +; Address of s1 is passed on stack at sp+8 +; FAST: str {{x[0-9]+}}, [sp, #8] +; FAST: str {{x[0-9]+}}, [sp, #16] + %tmp = alloca %struct.s42, align 4 + 
%tmp1 = alloca %struct.s42, align 4 + %0 = bitcast %struct.s42* %tmp to i8* + call void @llvm.memcpy.p0i8.p0i8.i64(i8* %0, i8* bitcast (%struct.s42* @g42 to i8*), i64 24, i32 4, i1 false), !tbaa.struct !4 + %1 = bitcast %struct.s42* %tmp1 to i8* + call void @llvm.memcpy.p0i8.p0i8.i64(i8* %1, i8* bitcast (%struct.s42* @g42_2 to i8*), i64 24, i32 4, i1 false), !tbaa.struct !4 + %call = call i32 @f42_stack(i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, + i32 8, i32 9, %struct.s42* %tmp, %struct.s42* %tmp1) #5 + ret i32 %call +} + +; structs with size of 22 bytes, alignment of 16 +; passed indirectly in x1 and x2 +define i32 @f43(i32 %i, %struct.s43* nocapture %s1, %struct.s43* nocapture %s2) #2 { +entry: +; CHECK: f43 +; CHECK: ldr w[[A:[0-9]+]], [x1] +; CHECK: ldr w[[B:[0-9]+]], [x2] +; CHECK: add w[[C:[0-9]+]], w[[A]], w0 +; CHECK: add {{w[0-9]+}}, w[[C]], w[[B]] +; FAST: f43 +; FAST: ldr w[[A:[0-9]+]], [x1] +; FAST: ldr w[[B:[0-9]+]], [x2] +; FAST: add w[[C:[0-9]+]], w[[A]], w0 +; FAST: add {{w[0-9]+}}, w[[C]], w[[B]] + %i1 = getelementptr inbounds %struct.s43* %s1, i64 0, i32 0 + %0 = load i32* %i1, align 4, !tbaa !0 + %i2 = getelementptr inbounds %struct.s43* %s2, i64 0, i32 0 + %1 = load i32* %i2, align 4, !tbaa !0 + %s = getelementptr inbounds %struct.s43* %s1, i64 0, i32 1 + %2 = load i16* %s, align 2, !tbaa !3 + %conv = sext i16 %2 to i32 + %s5 = getelementptr inbounds %struct.s43* %s2, i64 0, i32 1 + %3 = load i16* %s5, align 2, !tbaa !3 + %conv6 = sext i16 %3 to i32 + %add = add i32 %0, %i + %add3 = add i32 %add, %1 + %add4 = add i32 %add3, %conv + %add7 = add i32 %add4, %conv6 + ret i32 %add7 +} + +define i32 @caller43() #3 { +entry: +; CHECK: caller43 +; CHECK: str {{q[0-9]+}}, [sp, #48] +; CHECK: str {{q[0-9]+}}, [sp, #32] +; CHECK: str {{q[0-9]+}}, [sp, #16] +; CHECK: str {{q[0-9]+}}, [sp] +; CHECK: add x1, sp, #32 +; CHECK: mov x2, sp +; Space for s1 is allocated at sp+32 +; Space for s2 is allocated at sp + +; FAST: caller43 +; FAST: mov x29, sp +; Space for s1 is allocated at sp+32 +; Space for s2 is allocated at sp +; FAST: add x1, sp, #32 +; FAST: mov x2, sp +; FAST: str {{x[0-9]+}}, [sp, #32] +; FAST: str {{x[0-9]+}}, [sp, #40] +; FAST: str {{x[0-9]+}}, [sp, #48] +; FAST: str {{x[0-9]+}}, [sp, #56] +; FAST: str {{x[0-9]+}}, [sp] +; FAST: str {{x[0-9]+}}, [sp, #8] +; FAST: str {{x[0-9]+}}, [sp, #16] +; FAST: str {{x[0-9]+}}, [sp, #24] + %tmp = alloca %struct.s43, align 16 + %tmp1 = alloca %struct.s43, align 16 + %0 = bitcast %struct.s43* %tmp to i8* + call void @llvm.memcpy.p0i8.p0i8.i64(i8* %0, i8* bitcast (%struct.s43* @g43 to i8*), i64 32, i32 16, i1 false), !tbaa.struct !4 + %1 = bitcast %struct.s43* %tmp1 to i8* + call void @llvm.memcpy.p0i8.p0i8.i64(i8* %1, i8* bitcast (%struct.s43* @g43_2 to i8*), i64 32, i32 16, i1 false), !tbaa.struct !4 + %call = call i32 @f43(i32 3, %struct.s43* %tmp, %struct.s43* %tmp1) #5 + ret i32 %call +} + +declare i32 @f43_stack(i32 %i, i32 %i2, i32 %i3, i32 %i4, i32 %i5, i32 %i6, + i32 %i7, i32 %i8, i32 %i9, %struct.s43* nocapture %s1, + %struct.s43* nocapture %s2) #2 + +define i32 @caller43_stack() #3 { +entry: +; CHECK: caller43_stack +; CHECK: mov x29, sp +; CHECK: sub sp, sp, #96 +; CHECK: stur {{q[0-9]+}}, [x29, #-16] +; CHECK: stur {{q[0-9]+}}, [x29, #-32] +; CHECK: str {{q[0-9]+}}, [sp, #48] +; CHECK: str {{q[0-9]+}}, [sp, #32] +; Space for s1 is allocated at x29-32 = sp+64 +; Space for s2 is allocated at sp+32 +; CHECK: add x[[B:[0-9]+]], sp, #32 +; CHECK: str x[[B]], [sp, #16] +; CHECK: sub x[[A:[0-9]+]], x29, #32 +; Address of 
s1 is passed on stack at sp+8 +; CHECK: str x[[A]], [sp, #8] +; CHECK: movz w[[C:[0-9]+]], #0x9 +; CHECK: str w[[C]], [sp] + +; FAST: caller43_stack +; FAST: sub sp, sp, #96 +; Space for s1 is allocated at fp-32 = sp+64 +; Space for s2 is allocated at sp+32 +; FAST: sub x[[A:[0-9]+]], x29, #32 +; FAST: add x[[B:[0-9]+]], sp, #32 +; FAST: stur {{x[0-9]+}}, [x29, #-32] +; FAST: stur {{x[0-9]+}}, [x29, #-24] +; FAST: stur {{x[0-9]+}}, [x29, #-16] +; FAST: stur {{x[0-9]+}}, [x29, #-8] +; FAST: str {{x[0-9]+}}, [sp, #32] +; FAST: str {{x[0-9]+}}, [sp, #40] +; FAST: str {{x[0-9]+}}, [sp, #48] +; FAST: str {{x[0-9]+}}, [sp, #56] +; FAST: str {{w[0-9]+}}, [sp] +; Address of s1 is passed on stack at sp+8 +; FAST: str {{x[0-9]+}}, [sp, #8] +; FAST: str {{x[0-9]+}}, [sp, #16] + %tmp = alloca %struct.s43, align 16 + %tmp1 = alloca %struct.s43, align 16 + %0 = bitcast %struct.s43* %tmp to i8* + call void @llvm.memcpy.p0i8.p0i8.i64(i8* %0, i8* bitcast (%struct.s43* @g43 to i8*), i64 32, i32 16, i1 false), !tbaa.struct !4 + %1 = bitcast %struct.s43* %tmp1 to i8* + call void @llvm.memcpy.p0i8.p0i8.i64(i8* %1, i8* bitcast (%struct.s43* @g43_2 to i8*), i64 32, i32 16, i1 false), !tbaa.struct !4 + %call = call i32 @f43_stack(i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, + i32 8, i32 9, %struct.s43* %tmp, %struct.s43* %tmp1) #5 + ret i32 %call +} + +; rdar://13668927 +; Check that we don't split an i128. +declare i32 @callee_i128_split(i32 %i, i32 %i2, i32 %i3, i32 %i4, i32 %i5, + i32 %i6, i32 %i7, i128 %s1, i32 %i8) + +define i32 @i128_split() { +entry: +; CHECK: i128_split +; "i128 %0" should be on stack at [sp]. +; "i32 8" should be on stack at [sp, #16]. +; CHECK: str {{w[0-9]+}}, [sp, #16] +; CHECK: stp {{x[0-9]+}}, {{x[0-9]+}}, [sp] +; FAST: i128_split +; FAST: sub sp, sp, #48 +; FAST: mov x[[ADDR:[0-9]+]], sp +; FAST: str {{w[0-9]+}}, [x[[ADDR]], #16] +; Load/Store opt is disabled with -O0, so the i128 is split. +; FAST: str {{x[0-9]+}}, [x[[ADDR]], #8] +; FAST: str {{x[0-9]+}}, [x[[ADDR]]] + %0 = load i128* bitcast (%struct.s41* @g41 to i128*), align 16 + %call = tail call i32 @callee_i128_split(i32 1, i32 2, i32 3, i32 4, i32 5, + i32 6, i32 7, i128 %0, i32 8) #5 + ret i32 %call +} + +declare i32 @callee_i64(i32 %i, i32 %i2, i32 %i3, i32 %i4, i32 %i5, + i32 %i6, i32 %i7, i64 %s1, i32 %i8) + +define i32 @i64_split() { +entry: +; CHECK: i64_split +; "i64 %0" should be in register x7. +; "i32 8" should be on stack at [sp]. 
+; CHECK: ldr x7, [{{x[0-9]+}}] +; CHECK: str {{w[0-9]+}}, [sp] +; FAST: i64_split +; FAST: ldr x7, [{{x[0-9]+}}] +; FAST: str {{w[0-9]+}}, [sp] + %0 = load i64* bitcast (%struct.s41* @g41 to i64*), align 16 + %call = tail call i32 @callee_i64(i32 1, i32 2, i32 3, i32 4, i32 5, + i32 6, i32 7, i64 %0, i32 8) #5 + ret i32 %call +} + +attributes #0 = { noinline nounwind readnone "fp-contract-model"="standard" "relocation-model"="pic" "ssp-buffers-size"="8" } +attributes #1 = { nounwind readonly "fp-contract-model"="standard" "relocation-model"="pic" "ssp-buffers-size"="8" } +attributes #2 = { noinline nounwind readonly "fp-contract-model"="standard" "relocation-model"="pic" "ssp-buffers-size"="8" } +attributes #3 = { nounwind "fp-contract-model"="standard" "relocation-model"="pic" "ssp-buffers-size"="8" } +attributes #4 = { nounwind } +attributes #5 = { nobuiltin } + +!0 = metadata !{metadata !"int", metadata !1} +!1 = metadata !{metadata !"omnipotent char", metadata !2} +!2 = metadata !{metadata !"Simple C/C++ TBAA"} +!3 = metadata !{metadata !"short", metadata !1} +!4 = metadata !{i64 0, i64 4, metadata !0, i64 4, i64 2, metadata !3, i64 8, i64 4, metadata !0, i64 12, i64 2, metadata !3, i64 16, i64 4, metadata !0, i64 20, i64 2, metadata !3} diff --git a/test/CodeGen/AArch64/arm64-addp.ll b/test/CodeGen/AArch64/arm64-addp.ll new file mode 100644 index 00000000000..3f1e5c5d44e --- /dev/null +++ b/test/CodeGen/AArch64/arm64-addp.ll @@ -0,0 +1,32 @@ +; RUN: llc < %s -march=arm64 -aarch64-neon-syntax=apple -mcpu=cyclone | FileCheck %s + +define double @foo(<2 x double> %a) nounwind { +; CHECK-LABEL: foo: +; CHECK: faddp.2d d0, v0 +; CHECK-NEXT: ret + %lane0.i = extractelement <2 x double> %a, i32 0 + %lane1.i = extractelement <2 x double> %a, i32 1 + %vpaddd.i = fadd double %lane0.i, %lane1.i + ret double %vpaddd.i +} + +define i64 @foo0(<2 x i64> %a) nounwind { +; CHECK-LABEL: foo0: +; CHECK: addp.2d d0, v0 +; CHECK-NEXT: fmov x0, d0 +; CHECK-NEXT: ret + %lane0.i = extractelement <2 x i64> %a, i32 0 + %lane1.i = extractelement <2 x i64> %a, i32 1 + %vpaddd.i = add i64 %lane0.i, %lane1.i + ret i64 %vpaddd.i +} + +define float @foo1(<2 x float> %a) nounwind { +; CHECK-LABEL: foo1: +; CHECK: faddp.2s +; CHECK-NEXT: ret + %lane0.i = extractelement <2 x float> %a, i32 0 + %lane1.i = extractelement <2 x float> %a, i32 1 + %vpaddd.i = fadd float %lane0.i, %lane1.i + ret float %vpaddd.i +} diff --git a/test/CodeGen/AArch64/arm64-addr-mode-folding.ll b/test/CodeGen/AArch64/arm64-addr-mode-folding.ll new file mode 100644 index 00000000000..08fb8c90c48 --- /dev/null +++ b/test/CodeGen/AArch64/arm64-addr-mode-folding.ll @@ -0,0 +1,171 @@ +; RUN: llc -O3 -mtriple arm64-apple-ios3 %s -o - | FileCheck %s +; + +@block = common global i8* null, align 8 + +define i32 @fct(i32 %i1, i32 %i2) { +; CHECK: @fct +; Sign extension is used more than once, thus it should not be folded. +; CodeGenPrepare is not sharing sext across uses, thus this is folded because +; of that. 
+; _CHECK-NOT_: , sxtw] +entry: + %idxprom = sext i32 %i1 to i64 + %0 = load i8** @block, align 8 + %arrayidx = getelementptr inbounds i8* %0, i64 %idxprom + %1 = load i8* %arrayidx, align 1 + %idxprom1 = sext i32 %i2 to i64 + %arrayidx2 = getelementptr inbounds i8* %0, i64 %idxprom1 + %2 = load i8* %arrayidx2, align 1 + %cmp = icmp eq i8 %1, %2 + br i1 %cmp, label %if.end, label %if.then + +if.then: ; preds = %entry + %cmp7 = icmp ugt i8 %1, %2 + %conv8 = zext i1 %cmp7 to i32 + br label %return + +if.end: ; preds = %entry + %inc = add nsw i32 %i1, 1 + %inc9 = add nsw i32 %i2, 1 + %idxprom10 = sext i32 %inc to i64 + %arrayidx11 = getelementptr inbounds i8* %0, i64 %idxprom10 + %3 = load i8* %arrayidx11, align 1 + %idxprom12 = sext i32 %inc9 to i64 + %arrayidx13 = getelementptr inbounds i8* %0, i64 %idxprom12 + %4 = load i8* %arrayidx13, align 1 + %cmp16 = icmp eq i8 %3, %4 + br i1 %cmp16, label %if.end23, label %if.then18 + +if.then18: ; preds = %if.end + %cmp21 = icmp ugt i8 %3, %4 + %conv22 = zext i1 %cmp21 to i32 + br label %return + +if.end23: ; preds = %if.end + %inc24 = add nsw i32 %i1, 2 + %inc25 = add nsw i32 %i2, 2 + %idxprom26 = sext i32 %inc24 to i64 + %arrayidx27 = getelementptr inbounds i8* %0, i64 %idxprom26 + %5 = load i8* %arrayidx27, align 1 + %idxprom28 = sext i32 %inc25 to i64 + %arrayidx29 = getelementptr inbounds i8* %0, i64 %idxprom28 + %6 = load i8* %arrayidx29, align 1 + %cmp32 = icmp eq i8 %5, %6 + br i1 %cmp32, label %return, label %if.then34 + +if.then34: ; preds = %if.end23 + %cmp37 = icmp ugt i8 %5, %6 + %conv38 = zext i1 %cmp37 to i32 + br label %return + +return: ; preds = %if.end23, %if.then34, %if.then18, %if.then + %retval.0 = phi i32 [ %conv8, %if.then ], [ %conv22, %if.then18 ], [ %conv38, %if.then34 ], [ 1, %if.end23 ] + ret i32 %retval.0 +} + +define i32 @fct1(i32 %i1, i32 %i2) optsize { +; CHECK: @fct1 +; Addressing are folded when optimizing for code size. 
+; CHECK: , sxtw] +; CHECK: , sxtw] +entry: + %idxprom = sext i32 %i1 to i64 + %0 = load i8** @block, align 8 + %arrayidx = getelementptr inbounds i8* %0, i64 %idxprom + %1 = load i8* %arrayidx, align 1 + %idxprom1 = sext i32 %i2 to i64 + %arrayidx2 = getelementptr inbounds i8* %0, i64 %idxprom1 + %2 = load i8* %arrayidx2, align 1 + %cmp = icmp eq i8 %1, %2 + br i1 %cmp, label %if.end, label %if.then + +if.then: ; preds = %entry + %cmp7 = icmp ugt i8 %1, %2 + %conv8 = zext i1 %cmp7 to i32 + br label %return + +if.end: ; preds = %entry + %inc = add nsw i32 %i1, 1 + %inc9 = add nsw i32 %i2, 1 + %idxprom10 = sext i32 %inc to i64 + %arrayidx11 = getelementptr inbounds i8* %0, i64 %idxprom10 + %3 = load i8* %arrayidx11, align 1 + %idxprom12 = sext i32 %inc9 to i64 + %arrayidx13 = getelementptr inbounds i8* %0, i64 %idxprom12 + %4 = load i8* %arrayidx13, align 1 + %cmp16 = icmp eq i8 %3, %4 + br i1 %cmp16, label %if.end23, label %if.then18 + +if.then18: ; preds = %if.end + %cmp21 = icmp ugt i8 %3, %4 + %conv22 = zext i1 %cmp21 to i32 + br label %return + +if.end23: ; preds = %if.end + %inc24 = add nsw i32 %i1, 2 + %inc25 = add nsw i32 %i2, 2 + %idxprom26 = sext i32 %inc24 to i64 + %arrayidx27 = getelementptr inbounds i8* %0, i64 %idxprom26 + %5 = load i8* %arrayidx27, align 1 + %idxprom28 = sext i32 %inc25 to i64 + %arrayidx29 = getelementptr inbounds i8* %0, i64 %idxprom28 + %6 = load i8* %arrayidx29, align 1 + %cmp32 = icmp eq i8 %5, %6 + br i1 %cmp32, label %return, label %if.then34 + +if.then34: ; preds = %if.end23 + %cmp37 = icmp ugt i8 %5, %6 + %conv38 = zext i1 %cmp37 to i32 + br label %return + +return: ; preds = %if.end23, %if.then34, %if.then18, %if.then + %retval.0 = phi i32 [ %conv8, %if.then ], [ %conv22, %if.then18 ], [ %conv38, %if.then34 ], [ 1, %if.end23 ] + ret i32 %retval.0 +} + +; CHECK: @test +; CHECK-NOT: , uxtw #2] +define i32 @test(i32* %array, i8 zeroext %c, i32 %arg) { +entry: + %conv = zext i8 %c to i32 + %add = sub i32 0, %arg + %tobool = icmp eq i32 %conv, %add + br i1 %tobool, label %if.end, label %if.then + +if.then: ; preds = %entry + %idxprom = zext i8 %c to i64 + %arrayidx = getelementptr inbounds i32* %array, i64 %idxprom + %0 = load volatile i32* %arrayidx, align 4 + %1 = load volatile i32* %arrayidx, align 4 + %add3 = add nsw i32 %1, %0 + br label %if.end + +if.end: ; preds = %entry, %if.then + %res.0 = phi i32 [ %add3, %if.then ], [ 0, %entry ] + ret i32 %res.0 +} + + +; CHECK: @test2 +; CHECK: , uxtw #2] +; CHECK: , uxtw #2] +define i32 @test2(i32* %array, i8 zeroext %c, i32 %arg) optsize { +entry: + %conv = zext i8 %c to i32 + %add = sub i32 0, %arg + %tobool = icmp eq i32 %conv, %add + br i1 %tobool, label %if.end, label %if.then + +if.then: ; preds = %entry + %idxprom = zext i8 %c to i64 + %arrayidx = getelementptr inbounds i32* %array, i64 %idxprom + %0 = load volatile i32* %arrayidx, align 4 + %1 = load volatile i32* %arrayidx, align 4 + %add3 = add nsw i32 %1, %0 + br label %if.end + +if.end: ; preds = %entry, %if.then + %res.0 = phi i32 [ %add3, %if.then ], [ 0, %entry ] + ret i32 %res.0 +} diff --git a/test/CodeGen/AArch64/arm64-addr-type-promotion.ll b/test/CodeGen/AArch64/arm64-addr-type-promotion.ll new file mode 100644 index 00000000000..1a3ca8bd5b8 --- /dev/null +++ b/test/CodeGen/AArch64/arm64-addr-type-promotion.ll @@ -0,0 +1,82 @@ +; RUN: llc -march arm64 < %s | FileCheck %s +; rdar://13452552 +; ModuleID = 'reduced_test.ll' +target datalayout = 
"e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-n32:64-S128" +target triple = "arm64-apple-ios3.0.0" + +@block = common global i8* null, align 8 + +define zeroext i8 @fullGtU(i32 %i1, i32 %i2) { +; CHECK: fullGtU +; CHECK: adrp [[PAGE:x[0-9]+]], _block@GOTPAGE +; CHECK: ldr [[ADDR:x[0-9]+]], {{\[}}[[PAGE]], _block@GOTPAGEOFF] +; CHECK-NEXT: ldr [[BLOCKBASE:x[0-9]+]], {{\[}}[[ADDR]]] +; CHECK-NEXT: ldrb [[BLOCKVAL1:w[0-9]+]], {{\[}}[[BLOCKBASE]], w0, sxtw] +; CHECK-NEXT: ldrb [[BLOCKVAL2:w[0-9]+]], {{\[}}[[BLOCKBASE]], w1, sxtw] +; CHECK-NEXT cmp [[BLOCKVAL1]], [[BLOCKVAL2]] +; CHECK-NEXT b.ne +; Next BB +; CHECK: add [[BLOCKBASE2:x[0-9]+]], [[BLOCKBASE]], w1, sxtw +; CHECK-NEXT: add [[BLOCKBASE1:x[0-9]+]], [[BLOCKBASE]], w0, sxtw +; CHECK-NEXT: ldrb [[LOADEDVAL1:w[0-9]+]], {{\[}}[[BLOCKBASE1]], #1] +; CHECK-NEXT: ldrb [[LOADEDVAL2:w[0-9]+]], {{\[}}[[BLOCKBASE2]], #1] +; CHECK-NEXT: cmp [[LOADEDVAL1]], [[LOADEDVAL2]] +; CHECK-NEXT: b.ne +; Next BB +; CHECK: ldrb [[LOADEDVAL3:w[0-9]+]], {{\[}}[[BLOCKBASE1]], #2] +; CHECK-NEXT: ldrb [[LOADEDVAL4:w[0-9]+]], {{\[}}[[BLOCKBASE2]], #2] +; CHECK-NEXT: cmp [[LOADEDVAL3]], [[LOADEDVAL4]] +entry: + %idxprom = sext i32 %i1 to i64 + %tmp = load i8** @block, align 8 + %arrayidx = getelementptr inbounds i8* %tmp, i64 %idxprom + %tmp1 = load i8* %arrayidx, align 1 + %idxprom1 = sext i32 %i2 to i64 + %arrayidx2 = getelementptr inbounds i8* %tmp, i64 %idxprom1 + %tmp2 = load i8* %arrayidx2, align 1 + %cmp = icmp eq i8 %tmp1, %tmp2 + br i1 %cmp, label %if.end, label %if.then + +if.then: ; preds = %entry + %cmp7 = icmp ugt i8 %tmp1, %tmp2 + %conv9 = zext i1 %cmp7 to i8 + br label %return + +if.end: ; preds = %entry + %inc = add nsw i32 %i1, 1 + %inc10 = add nsw i32 %i2, 1 + %idxprom11 = sext i32 %inc to i64 + %arrayidx12 = getelementptr inbounds i8* %tmp, i64 %idxprom11 + %tmp3 = load i8* %arrayidx12, align 1 + %idxprom13 = sext i32 %inc10 to i64 + %arrayidx14 = getelementptr inbounds i8* %tmp, i64 %idxprom13 + %tmp4 = load i8* %arrayidx14, align 1 + %cmp17 = icmp eq i8 %tmp3, %tmp4 + br i1 %cmp17, label %if.end25, label %if.then19 + +if.then19: ; preds = %if.end + %cmp22 = icmp ugt i8 %tmp3, %tmp4 + %conv24 = zext i1 %cmp22 to i8 + br label %return + +if.end25: ; preds = %if.end + %inc26 = add nsw i32 %i1, 2 + %inc27 = add nsw i32 %i2, 2 + %idxprom28 = sext i32 %inc26 to i64 + %arrayidx29 = getelementptr inbounds i8* %tmp, i64 %idxprom28 + %tmp5 = load i8* %arrayidx29, align 1 + %idxprom30 = sext i32 %inc27 to i64 + %arrayidx31 = getelementptr inbounds i8* %tmp, i64 %idxprom30 + %tmp6 = load i8* %arrayidx31, align 1 + %cmp34 = icmp eq i8 %tmp5, %tmp6 + br i1 %cmp34, label %return, label %if.then36 + +if.then36: ; preds = %if.end25 + %cmp39 = icmp ugt i8 %tmp5, %tmp6 + %conv41 = zext i1 %cmp39 to i8 + br label %return + +return: ; preds = %if.then36, %if.end25, %if.then19, %if.then + %retval.0 = phi i8 [ %conv9, %if.then ], [ %conv24, %if.then19 ], [ %conv41, %if.then36 ], [ 0, %if.end25 ] + ret i8 %retval.0 +} diff --git a/test/CodeGen/AArch64/arm64-addrmode.ll b/test/CodeGen/AArch64/arm64-addrmode.ll new file mode 100644 index 00000000000..700fba80149 --- /dev/null +++ b/test/CodeGen/AArch64/arm64-addrmode.ll @@ -0,0 +1,72 @@ +; RUN: llc -march=arm64 < %s | FileCheck %s +; rdar://10232252 + +@object = external hidden global i64, section "__DATA, __objc_ivar", align 8 + +; base + offset (imm9) +; CHECK: @t1 +; CHECK: ldr xzr, [x{{[0-9]+}}, #8] +; CHECK: ret +define void @t1() { + %incdec.ptr = 
getelementptr inbounds i64* @object, i64 1 + %tmp = load volatile i64* %incdec.ptr, align 8 + ret void +} + +; base + offset (> imm9) +; CHECK: @t2 +; CHECK: sub [[ADDREG:x[0-9]+]], x{{[0-9]+}}, #264 +; CHECK: ldr xzr, [ +; CHECK: [[ADDREG]]] +; CHECK: ret +define void @t2() { + %incdec.ptr = getelementptr inbounds i64* @object, i64 -33 + %tmp = load volatile i64* %incdec.ptr, align 8 + ret void +} + +; base + unsigned offset (> imm9 and <= imm12 * size of type in bytes) +; CHECK: @t3 +; CHECK: ldr xzr, [x{{[0-9]+}}, #32760] +; CHECK: ret +define void @t3() { + %incdec.ptr = getelementptr inbounds i64* @object, i64 4095 + %tmp = load volatile i64* %incdec.ptr, align 8 + ret void +} + +; base + unsigned offset (> imm12 * size of type in bytes) +; CHECK: @t4 +; CHECK: add [[ADDREG:x[0-9]+]], x{{[0-9]+}}, #8, lsl #12 +; CHECK: ldr xzr, [ +; CHECK: [[ADDREG]]] +; CHECK: ret +define void @t4() { + %incdec.ptr = getelementptr inbounds i64* @object, i64 4096 + %tmp = load volatile i64* %incdec.ptr, align 8 + ret void +} + +; base + reg +; CHECK: @t5 +; CHECK: ldr xzr, [x{{[0-9]+}}, x{{[0-9]+}}, lsl #3] +; CHECK: ret +define void @t5(i64 %a) { + %incdec.ptr = getelementptr inbounds i64* @object, i64 %a + %tmp = load volatile i64* %incdec.ptr, align 8 + ret void +} + +; base + reg + imm +; CHECK: @t6 +; CHECK: add [[ADDREG:x[0-9]+]], x{{[0-9]+}}, x{{[0-9]+}}, lsl #3 +; CHECK-NEXT: add [[ADDREG]], [[ADDREG]], #8, lsl #12 +; CHECK: ldr xzr, [ +; CHECK: [[ADDREG]]] +; CHECK: ret +define void @t6(i64 %a) { + %tmp1 = getelementptr inbounds i64* @object, i64 %a + %incdec.ptr = getelementptr inbounds i64* %tmp1, i64 4096 + %tmp = load volatile i64* %incdec.ptr, align 8 + ret void +} diff --git a/test/CodeGen/AArch64/arm64-alloc-no-stack-realign.ll b/test/CodeGen/AArch64/arm64-alloc-no-stack-realign.ll new file mode 100644 index 00000000000..f396bc99170 --- /dev/null +++ b/test/CodeGen/AArch64/arm64-alloc-no-stack-realign.ll @@ -0,0 +1,21 @@ +; RUN: llc < %s -mtriple=arm64-apple-darwin -enable-misched=false | FileCheck %s + +; rdar://12713765 +; Make sure we are not creating stack objects that are assumed to be 64-byte +; aligned. 
+@T3_retval = common global <16 x float> zeroinitializer, align 16 + +define void @test(<16 x float>* noalias sret %agg.result) nounwind ssp { +entry: +; CHECK: test +; CHECK: stp [[Q1:q[0-9]+]], [[Q2:q[0-9]+]], [sp, #32] +; CHECK: stp [[Q1:q[0-9]+]], [[Q2:q[0-9]+]], [sp] +; CHECK: stp [[Q1:q[0-9]+]], [[Q2:q[0-9]+]], {{\[}}[[BASE:x[0-9]+]], #32] +; CHECK: stp [[Q1:q[0-9]+]], [[Q2:q[0-9]+]], {{\[}}[[BASE]]] + %retval = alloca <16 x float>, align 16 + %0 = load <16 x float>* @T3_retval, align 16 + store <16 x float> %0, <16 x float>* %retval + %1 = load <16 x float>* %retval + store <16 x float> %1, <16 x float>* %agg.result, align 16 + ret void +} diff --git a/test/CodeGen/AArch64/arm64-alloca-frame-pointer-offset.ll b/test/CodeGen/AArch64/arm64-alloca-frame-pointer-offset.ll new file mode 100644 index 00000000000..3750f31b373 --- /dev/null +++ b/test/CodeGen/AArch64/arm64-alloca-frame-pointer-offset.ll @@ -0,0 +1,29 @@ +; RUN: llc -march=arm64 -mcpu=cyclone < %s | FileCheck %s + +; CHECK: foo +; CHECK: ldr w[[REG:[0-9]+]], [x19, #264] +; CHECK: str w[[REG]], [x19, #132] +; CHECK: ldr w{{[0-9]+}}, [x19, #264] + +define i32 @foo(i32 %a) nounwind { + %retval = alloca i32, align 4 + %a.addr = alloca i32, align 4 + %arr = alloca [32 x i32], align 4 + %i = alloca i32, align 4 + %arr2 = alloca [32 x i32], align 4 + %j = alloca i32, align 4 + store i32 %a, i32* %a.addr, align 4 + %tmp = load i32* %a.addr, align 4 + %tmp1 = zext i32 %tmp to i64 + %v = mul i64 4, %tmp1 + %vla = alloca i8, i64 %v, align 4 + %tmp2 = bitcast i8* %vla to i32* + %tmp3 = load i32* %a.addr, align 4 + store i32 %tmp3, i32* %i, align 4 + %tmp4 = load i32* %a.addr, align 4 + store i32 %tmp4, i32* %j, align 4 + %tmp5 = load i32* %j, align 4 + store i32 %tmp5, i32* %retval + %x = load i32* %retval + ret i32 %x +} diff --git a/test/CodeGen/AArch64/arm64-andCmpBrToTBZ.ll b/test/CodeGen/AArch64/arm64-andCmpBrToTBZ.ll new file mode 100644 index 00000000000..419497722f4 --- /dev/null +++ b/test/CodeGen/AArch64/arm64-andCmpBrToTBZ.ll @@ -0,0 +1,72 @@ +; RUN: llc -O1 -march=arm64 -enable-andcmp-sinking=true < %s | FileCheck %s +; ModuleID = 'and-cbz-extr-mr.bc' +target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-n32:64-S128" +target triple = "arm64-apple-ios7.0.0" + +define zeroext i1 @foo(i1 %IsEditable, i1 %isTextField, i8* %str1, i8* %str2, i8* %str3, i8* %str4, i8* %str5, i8* %str6, i8* %str7, i8* %str8, i8* %str9, i8* %str10, i8* %str11, i8* %str12, i8* %str13, i32 %int1, i8* %str14) unnamed_addr #0 align 2 { +; CHECK: _foo: +entry: + %tobool = icmp eq i8* %str14, null + br i1 %tobool, label %return, label %if.end + +; CHECK: %if.end +; CHECK: tbz +if.end: ; preds = %entry + %and.i.i.i = and i32 %int1, 4 + %tobool.i.i.i = icmp eq i32 %and.i.i.i, 0 + br i1 %tobool.i.i.i, label %if.end12, label %land.rhs.i + +land.rhs.i: ; preds = %if.end + %cmp.i.i.i = icmp eq i8* %str12, %str13 + br i1 %cmp.i.i.i, label %if.then3, label %lor.rhs.i.i.i + +lor.rhs.i.i.i: ; preds = %land.rhs.i + %cmp.i13.i.i.i = icmp eq i8* %str10, %str11 + br i1 %cmp.i13.i.i.i, label %_ZNK7WebCore4Node10hasTagNameERKNS_13QualifiedNameE.exit, label %if.end5 + +_ZNK7WebCore4Node10hasTagNameERKNS_13QualifiedNameE.exit: ; preds = %lor.rhs.i.i.i + %cmp.i.i.i.i = icmp eq i8* %str8, %str9 + br i1 %cmp.i.i.i.i, label %if.then3, label %if.end5 + +if.then3: ; preds = %_ZNK7WebCore4Node10hasTagNameERKNS_13QualifiedNameE.exit, %land.rhs.i + %tmp11 = load i8* %str14, align 8 + %tmp12 = and i8 %tmp11, 
2 + %tmp13 = icmp ne i8 %tmp12, 0 + br label %return + +if.end5: ; preds = %_ZNK7WebCore4Node10hasTagNameERKNS_13QualifiedNameE.exit, %lor.rhs.i.i.i +; CHECK: %if.end5 +; CHECK: tbz + br i1 %tobool.i.i.i, label %if.end12, label %land.rhs.i19 + +land.rhs.i19: ; preds = %if.end5 + %cmp.i.i.i18 = icmp eq i8* %str6, %str7 + br i1 %cmp.i.i.i18, label %if.then7, label %lor.rhs.i.i.i23 + +lor.rhs.i.i.i23: ; preds = %land.rhs.i19 + %cmp.i13.i.i.i22 = icmp eq i8* %str3, %str4 + br i1 %cmp.i13.i.i.i22, label %_ZNK7WebCore4Node10hasTagNameERKNS_13QualifiedNameE.exit28, label %if.end12 + +_ZNK7WebCore4Node10hasTagNameERKNS_13QualifiedNameE.exit28: ; preds = %lor.rhs.i.i.i23 + %cmp.i.i.i.i26 = icmp eq i8* %str1, %str2 + br i1 %cmp.i.i.i.i26, label %if.then7, label %if.end12 + +if.then7: ; preds = %_ZNK7WebCore4Node10hasTagNameERKNS_13QualifiedNameE.exit28, %land.rhs.i19 + br i1 %isTextField, label %if.then9, label %if.end12 + +if.then9: ; preds = %if.then7 + %tmp23 = load i8* %str5, align 8 + %tmp24 = and i8 %tmp23, 2 + %tmp25 = icmp ne i8 %tmp24, 0 + br label %return + +if.end12: ; preds = %if.then7, %_ZNK7WebCore4Node10hasTagNameERKNS_13QualifiedNameE.exit28, %lor.rhs.i.i.i23, %if.end5, %if.end + %lnot = xor i1 %IsEditable, true + br label %return + +return: ; preds = %if.end12, %if.then9, %if.then3, %entry + %retval.0 = phi i1 [ %tmp13, %if.then3 ], [ %tmp25, %if.then9 ], [ %lnot, %if.end12 ], [ true, %entry ] + ret i1 %retval.0 +} + +attributes #0 = { nounwind ssp } diff --git a/test/CodeGen/AArch64/arm64-ands-bad-peephole.ll b/test/CodeGen/AArch64/arm64-ands-bad-peephole.ll new file mode 100644 index 00000000000..34d6287b8b4 --- /dev/null +++ b/test/CodeGen/AArch64/arm64-ands-bad-peephole.ll @@ -0,0 +1,31 @@ +; RUN: llc %s -o - | FileCheck %s +; Check that ANDS (tst) is not merged with ADD when the immediate +; is not 0. +; +target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128" +target triple = "arm64-apple-ios" + +; CHECK-LABEL: tst1: +; CHECK: add [[REG:w[0-9]+]], w{{[0-9]+}}, #1 +; CHECK: tst [[REG]], #0x1 +define void @tst1() { +entry: + br i1 undef, label %for.end, label %for.body + +for.body: ; preds = %for.body, %entry + %result.09 = phi i32 [ %add2.result.0, %for.body ], [ 1, %entry ] + %i.08 = phi i32 [ %inc, %for.body ], [ 2, %entry ] + %and = and i32 %i.08, 1 + %cmp1 = icmp eq i32 %and, 0 + %add2.result.0 = select i1 %cmp1, i32 undef, i32 %result.09 + %inc = add nsw i32 %i.08, 1 + %cmp = icmp slt i32 %i.08, undef + br i1 %cmp, label %for.body, label %for.cond.for.end_crit_edge + +for.cond.for.end_crit_edge: ; preds = %for.body + %add2.result.0.lcssa = phi i32 [ %add2.result.0, %for.body ] + br label %for.end + +for.end: ; preds = %for.cond.for.end_crit_edge, %entry + ret void +} diff --git a/test/CodeGen/AArch64/arm64-anyregcc-crash.ll b/test/CodeGen/AArch64/arm64-anyregcc-crash.ll new file mode 100644 index 00000000000..241cf974c05 --- /dev/null +++ b/test/CodeGen/AArch64/arm64-anyregcc-crash.ll @@ -0,0 +1,19 @@ +; RUN: not llc < %s -mtriple=arm64-apple-darwin 2>&1 | FileCheck %s +; +; Check that misuse of anyregcc results in a compile time error. 
+ +; CHECK: LLVM ERROR: ran out of registers during register allocation +define i64 @anyreglimit(i64 %v1, i64 %v2, i64 %v3, i64 %v4, i64 %v5, i64 %v6, i64 %v7, i64 %v8, + i64 %v9, i64 %v10, i64 %v11, i64 %v12, i64 %v13, i64 %v14, i64 %v15, i64 %v16, + i64 %v17, i64 %v18, i64 %v19, i64 %v20, i64 %v21, i64 %v22, i64 %v23, i64 %v24, + i64 %v25, i64 %v26, i64 %v27, i64 %v28, i64 %v29, i64 %v30, i64 %v31, i64 %v32) { +entry: + %result = tail call anyregcc i64 (i64, i32, i8*, i32, ...)* @llvm.experimental.patchpoint.i64(i64 12, i32 15, i8* inttoptr (i64 0 to i8*), i32 32, + i64 %v1, i64 %v2, i64 %v3, i64 %v4, i64 %v5, i64 %v6, i64 %v7, i64 %v8, + i64 %v9, i64 %v10, i64 %v11, i64 %v12, i64 %v13, i64 %v14, i64 %v15, i64 %v16, + i64 %v17, i64 %v18, i64 %v19, i64 %v20, i64 %v21, i64 %v22, i64 %v23, i64 %v24, + i64 %v25, i64 %v26, i64 %v27, i64 %v28, i64 %v29, i64 %v30, i64 %v31, i64 %v32) + ret i64 %result +} + +declare i64 @llvm.experimental.patchpoint.i64(i64, i32, i8*, i32, ...) diff --git a/test/CodeGen/AArch64/arm64-anyregcc.ll b/test/CodeGen/AArch64/arm64-anyregcc.ll new file mode 100644 index 00000000000..e26875d52f9 --- /dev/null +++ b/test/CodeGen/AArch64/arm64-anyregcc.ll @@ -0,0 +1,363 @@ +; RUN: llc < %s -mtriple=arm64-apple-darwin | FileCheck %s + +; Stackmap Header: no constants - 6 callsites +; CHECK-LABEL: .section __LLVM_STACKMAPS,__llvm_stackmaps +; CHECK-NEXT: __LLVM_StackMaps: +; Header +; CHECK-NEXT: .byte 1 +; CHECK-NEXT: .byte 0 +; CHECK-NEXT: .short 0 +; Num Functions +; CHECK-NEXT: .long 8 +; Num LargeConstants +; CHECK-NEXT: .long 0 +; Num Callsites +; CHECK-NEXT: .long 8 + +; Functions and stack size +; CHECK-NEXT: .quad _test +; CHECK-NEXT: .quad 16 +; CHECK-NEXT: .quad _property_access1 +; CHECK-NEXT: .quad 16 +; CHECK-NEXT: .quad _property_access2 +; CHECK-NEXT: .quad 32 +; CHECK-NEXT: .quad _property_access3 +; CHECK-NEXT: .quad 32 +; CHECK-NEXT: .quad _anyreg_test1 +; CHECK-NEXT: .quad 16 +; CHECK-NEXT: .quad _anyreg_test2 +; CHECK-NEXT: .quad 16 +; CHECK-NEXT: .quad _patchpoint_spilldef +; CHECK-NEXT: .quad 112 +; CHECK-NEXT: .quad _patchpoint_spillargs +; CHECK-NEXT: .quad 128 + + +; test +; CHECK-LABEL: .long L{{.*}}-_test +; CHECK-NEXT: .short 0 +; 3 locations +; CHECK-NEXT: .short 3 +; Loc 0: Register +; CHECK-NEXT: .byte 1 +; CHECK-NEXT: .byte 4 +; CHECK-NEXT: .short {{[0-9]+}} +; CHECK-NEXT: .long 0 +; Loc 1: Register +; CHECK-NEXT: .byte 1 +; CHECK-NEXT: .byte 4 +; CHECK-NEXT: .short {{[0-9]+}} +; CHECK-NEXT: .long 0 +; Loc 2: Constant 3 +; CHECK-NEXT: .byte 4 +; CHECK-NEXT: .byte 8 +; CHECK-NEXT: .short 0 +; CHECK-NEXT: .long 3 +define i64 @test() nounwind ssp uwtable { +entry: + call anyregcc void (i64, i32, i8*, i32, ...)* @llvm.experimental.patchpoint.void(i64 0, i32 16, i8* null, i32 2, i32 1, i32 2, i64 3) + ret i64 0 +} + +; property access 1 - %obj is an anyreg call argument and should therefore be in a register +; CHECK-LABEL: .long L{{.*}}-_property_access1 +; CHECK-NEXT: .short 0 +; 2 locations +; CHECK-NEXT: .short 2 +; Loc 0: Register <-- this is the return register +; CHECK-NEXT: .byte 1 +; CHECK-NEXT: .byte 8 +; CHECK-NEXT: .short {{[0-9]+}} +; CHECK-NEXT: .long 0 +; Loc 1: Register +; CHECK-NEXT: .byte 1 +; CHECK-NEXT: .byte 8 +; CHECK-NEXT: .short {{[0-9]+}} +; CHECK-NEXT: .long 0 +define i64 @property_access1(i8* %obj) nounwind ssp uwtable { +entry: + %f = inttoptr i64 281474417671919 to i8* + %ret = call anyregcc i64 (i64, i32, i8*, i32, ...)* @llvm.experimental.patchpoint.i64(i64 1, i32 20, i8* %f, i32 1, i8* %obj) + ret i64 %ret +} + +; 
property access 2 - %obj is an anyreg call argument and should therefore be in a register +; CHECK-LABEL: .long L{{.*}}-_property_access2 +; CHECK-NEXT: .short 0 +; 2 locations +; CHECK-NEXT: .short 2 +; Loc 0: Register <-- this is the return register +; CHECK-NEXT: .byte 1 +; CHECK-NEXT: .byte 8 +; CHECK-NEXT: .short {{[0-9]+}} +; CHECK-NEXT: .long 0 +; Loc 1: Register +; CHECK-NEXT: .byte 1 +; CHECK-NEXT: .byte 8 +; CHECK-NEXT: .short {{[0-9]+}} +; CHECK-NEXT: .long 0 +define i64 @property_access2() nounwind ssp uwtable { +entry: + %obj = alloca i64, align 8 + %f = inttoptr i64 281474417671919 to i8* + %ret = call anyregcc i64 (i64, i32, i8*, i32, ...)* @llvm.experimental.patchpoint.i64(i64 2, i32 20, i8* %f, i32 1, i64* %obj) + ret i64 %ret +} + +; property access 3 - %obj is a frame index +; CHECK-LABEL: .long L{{.*}}-_property_access3 +; CHECK-NEXT: .short 0 +; 2 locations +; CHECK-NEXT: .short 2 +; Loc 0: Register <-- this is the return register +; CHECK-NEXT: .byte 1 +; CHECK-NEXT: .byte 8 +; CHECK-NEXT: .short {{[0-9]+}} +; CHECK-NEXT: .long 0 +; Loc 1: Direct FP - 8 +; CHECK-NEXT: .byte 2 +; CHECK-NEXT: .byte 8 +; CHECK-NEXT: .short 29 +; CHECK-NEXT: .long -8 +define i64 @property_access3() nounwind ssp uwtable { +entry: + %obj = alloca i64, align 8 + %f = inttoptr i64 281474417671919 to i8* + %ret = call anyregcc i64 (i64, i32, i8*, i32, ...)* @llvm.experimental.patchpoint.i64(i64 3, i32 20, i8* %f, i32 0, i64* %obj) + ret i64 %ret +} + +; anyreg_test1 +; CHECK-LABEL: .long L{{.*}}-_anyreg_test1 +; CHECK-NEXT: .short 0 +; 14 locations +; CHECK-NEXT: .short 14 +; Loc 0: Register <-- this is the return register +; CHECK-NEXT: .byte 1 +; CHECK-NEXT: .byte 8 +; CHECK-NEXT: .short {{[0-9]+}} +; CHECK-NEXT: .long 0 +; Loc 1: Register +; CHECK-NEXT: .byte 1 +; CHECK-NEXT: .byte 8 +; CHECK-NEXT: .short {{[0-9]+}} +; CHECK-NEXT: .long 0 +; Loc 2: Register +; CHECK-NEXT: .byte 1 +; CHECK-NEXT: .byte 8 +; CHECK-NEXT: .short {{[0-9]+}} +; CHECK-NEXT: .long 0 +; Loc 3: Register +; CHECK-NEXT: .byte 1 +; CHECK-NEXT: .byte 8 +; CHECK-NEXT: .short {{[0-9]+}} +; CHECK-NEXT: .long 0 +; Loc 4: Register +; CHECK-NEXT: .byte 1 +; CHECK-NEXT: .byte 8 +; CHECK-NEXT: .short {{[0-9]+}} +; CHECK-NEXT: .long 0 +; Loc 5: Register +; CHECK-NEXT: .byte 1 +; CHECK-NEXT: .byte 8 +; CHECK-NEXT: .short {{[0-9]+}} +; CHECK-NEXT: .long 0 +; Loc 6: Register +; CHECK-NEXT: .byte 1 +; CHECK-NEXT: .byte 8 +; CHECK-NEXT: .short {{[0-9]+}} +; CHECK-NEXT: .long 0 +; Loc 7: Register +; CHECK-NEXT: .byte 1 +; CHECK-NEXT: .byte 8 +; CHECK-NEXT: .short {{[0-9]+}} +; CHECK-NEXT: .long 0 +; Loc 8: Register +; CHECK-NEXT: .byte 1 +; CHECK-NEXT: .byte 8 +; CHECK-NEXT: .short {{[0-9]+}} +; CHECK-NEXT: .long 0 +; Loc 9: Register +; CHECK-NEXT: .byte 1 +; CHECK-NEXT: .byte 8 +; CHECK-NEXT: .short {{[0-9]+}} +; CHECK-NEXT: .long 0 +; Loc 10: Register +; CHECK-NEXT: .byte 1 +; CHECK-NEXT: .byte 8 +; CHECK-NEXT: .short {{[0-9]+}} +; CHECK-NEXT: .long 0 +; Loc 11: Register +; CHECK-NEXT: .byte 1 +; CHECK-NEXT: .byte 8 +; CHECK-NEXT: .short {{[0-9]+}} +; CHECK-NEXT: .long 0 +; Loc 12: Register +; CHECK-NEXT: .byte 1 +; CHECK-NEXT: .byte 8 +; CHECK-NEXT: .short {{[0-9]+}} +; CHECK-NEXT: .long 0 +; Loc 13: Register +; CHECK-NEXT: .byte 1 +; CHECK-NEXT: .byte 8 +; CHECK-NEXT: .short {{[0-9]+}} +; CHECK-NEXT: .long 0 +define i64 @anyreg_test1(i8* %a1, i8* %a2, i8* %a3, i8* %a4, i8* %a5, i8* %a6, i8* %a7, i8* %a8, i8* %a9, i8* %a10, i8* %a11, i8* %a12, i8* %a13) nounwind ssp uwtable { +entry: + %f = inttoptr i64 281474417671919 to i8* + %ret 
= call anyregcc i64 (i64, i32, i8*, i32, ...)* @llvm.experimental.patchpoint.i64(i64 4, i32 20, i8* %f, i32 13, i8* %a1, i8* %a2, i8* %a3, i8* %a4, i8* %a5, i8* %a6, i8* %a7, i8* %a8, i8* %a9, i8* %a10, i8* %a11, i8* %a12, i8* %a13) + ret i64 %ret +} + +; anyreg_test2 +; CHECK-LABEL: .long L{{.*}}-_anyreg_test2 +; CHECK-NEXT: .short 0 +; 14 locations +; CHECK-NEXT: .short 14 +; Loc 0: Register <-- this is the return register +; CHECK-NEXT: .byte 1 +; CHECK-NEXT: .byte 8 +; CHECK-NEXT: .short {{[0-9]+}} +; CHECK-NEXT: .long 0 +; Loc 1: Register +; CHECK-NEXT: .byte 1 +; CHECK-NEXT: .byte 8 +; CHECK-NEXT: .short {{[0-9]+}} +; CHECK-NEXT: .long 0 +; Loc 2: Register +; CHECK-NEXT: .byte 1 +; CHECK-NEXT: .byte 8 +; CHECK-NEXT: .short {{[0-9]+}} +; CHECK-NEXT: .long 0 +; Loc 3: Register +; CHECK-NEXT: .byte 1 +; CHECK-NEXT: .byte 8 +; CHECK-NEXT: .short {{[0-9]+}} +; CHECK-NEXT: .long 0 +; Loc 4: Register +; CHECK-NEXT: .byte 1 +; CHECK-NEXT: .byte 8 +; CHECK-NEXT: .short {{[0-9]+}} +; CHECK-NEXT: .long 0 +; Loc 5: Register +; CHECK-NEXT: .byte 1 +; CHECK-NEXT: .byte 8 +; CHECK-NEXT: .short {{[0-9]+}} +; CHECK-NEXT: .long 0 +; Loc 6: Register +; CHECK-NEXT: .byte 1 +; CHECK-NEXT: .byte 8 +; CHECK-NEXT: .short {{[0-9]+}} +; CHECK-NEXT: .long 0 +; Loc 7: Register +; CHECK-NEXT: .byte 1 +; CHECK-NEXT: .byte 8 +; CHECK-NEXT: .short {{[0-9]+}} +; CHECK-NEXT: .long 0 +; Loc 8: Register +; CHECK-NEXT: .byte 1 +; CHECK-NEXT: .byte 8 +; CHECK-NEXT: .short {{[0-9]+}} +; CHECK-NEXT: .long 0 +; Loc 9: Register +; CHECK-NEXT: .byte 1 +; CHECK-NEXT: .byte 8 +; CHECK-NEXT: .short {{[0-9]+}} +; CHECK-NEXT: .long 0 +; Loc 10: Register +; CHECK-NEXT: .byte 1 +; CHECK-NEXT: .byte 8 +; CHECK-NEXT: .short {{[0-9]+}} +; CHECK-NEXT: .long 0 +; Loc 11: Register +; CHECK-NEXT: .byte 1 +; CHECK-NEXT: .byte 8 +; CHECK-NEXT: .short {{[0-9]+}} +; CHECK-NEXT: .long 0 +; Loc 12: Register +; CHECK-NEXT: .byte 1 +; CHECK-NEXT: .byte 8 +; CHECK-NEXT: .short {{[0-9]+}} +; CHECK-NEXT: .long 0 +; Loc 13: Register +; CHECK-NEXT: .byte 1 +; CHECK-NEXT: .byte 8 +; CHECK-NEXT: .short {{[0-9]+}} +; CHECK-NEXT: .long 0 +define i64 @anyreg_test2(i8* %a1, i8* %a2, i8* %a3, i8* %a4, i8* %a5, i8* %a6, i8* %a7, i8* %a8, i8* %a9, i8* %a10, i8* %a11, i8* %a12, i8* %a13) nounwind ssp uwtable { +entry: + %f = inttoptr i64 281474417671919 to i8* + %ret = call anyregcc i64 (i64, i32, i8*, i32, ...)* @llvm.experimental.patchpoint.i64(i64 5, i32 20, i8* %f, i32 8, i8* %a1, i8* %a2, i8* %a3, i8* %a4, i8* %a5, i8* %a6, i8* %a7, i8* %a8, i8* %a9, i8* %a10, i8* %a11, i8* %a12, i8* %a13) + ret i64 %ret +} + +; Test spilling the return value of an anyregcc call. +; +; [JS] Assertion: "Folded a def to a non-store!" 
+; +; CHECK-LABEL: .long L{{.*}}-_patchpoint_spilldef +; CHECK-NEXT: .short 0 +; CHECK-NEXT: .short 3 +; Loc 0: Register (some register that will be spilled to the stack) +; CHECK-NEXT: .byte 1 +; CHECK-NEXT: .byte 8 +; CHECK-NEXT: .short {{[0-9]+}} +; CHECK-NEXT: .long 0 +; Loc 1: Register +; CHECK-NEXT: .byte 1 +; CHECK-NEXT: .byte 8 +; CHECK-NEXT: .short {{[0-9]+}} +; CHECK-NEXT: .long 0 +; Loc 1: Register +; CHECK-NEXT: .byte 1 +; CHECK-NEXT: .byte 8 +; CHECK-NEXT: .short {{[0-9]+}} +; CHECK-NEXT: .long 0 +define i64 @patchpoint_spilldef(i64 %p1, i64 %p2, i64 %p3, i64 %p4) { +entry: + %result = tail call anyregcc i64 (i64, i32, i8*, i32, ...)* @llvm.experimental.patchpoint.i64(i64 12, i32 16, i8* inttoptr (i64 0 to i8*), i32 2, i64 %p1, i64 %p2) + tail call void asm sideeffect "nop", "~{x0},~{x1},~{x2},~{x3},~{x4},~{x5},~{x6},~{x7},~{x8},~{x9},~{x10},~{x11},~{x12},~{x13},~{x14},~{x15},~{x16},~{x17},~{x18},~{x19},~{x20},~{x21},~{x22},~{x23},~{x24},~{x25},~{x26},~{x27},~{x28},~{x29},~{x30},~{x31}"() nounwind + ret i64 %result +} + +; Test spilling the arguments of an anyregcc call. +; +; [JS] AnyRegCC argument ends up being spilled +; +; CHECK-LABEL: .long L{{.*}}-_patchpoint_spillargs +; CHECK-NEXT: .short 0 +; CHECK-NEXT: .short 5 +; Loc 0: Return a register +; CHECK-NEXT: .byte 1 +; CHECK-NEXT: .byte 8 +; CHECK-NEXT: .short {{[0-9]+}} +; CHECK-NEXT: .long 0 +; Loc 1: Arg0 in a Register +; CHECK-NEXT: .byte 1 +; CHECK-NEXT: .byte 8 +; CHECK-NEXT: .short {{[0-9]+}} +; CHECK-NEXT: .long 0 +; Loc 2: Arg1 in a Register +; CHECK-NEXT: .byte 1 +; CHECK-NEXT: .byte 8 +; CHECK-NEXT: .short {{[0-9]+}} +; CHECK-NEXT: .long 0 +; Loc 3: Arg2 spilled to FP -96 +; CHECK-NEXT: .byte 3 +; CHECK-NEXT: .byte 8 +; CHECK-NEXT: .short 29 +; CHECK-NEXT: .long -96 +; Loc 4: Arg3 spilled to FP - 88 +; CHECK-NEXT: .byte 3 +; CHECK-NEXT: .byte 8 +; CHECK-NEXT: .short 29 +; CHECK-NEXT: .long -88 +define i64 @patchpoint_spillargs(i64 %p1, i64 %p2, i64 %p3, i64 %p4) { +entry: + tail call void asm sideeffect "nop", "~{x0},~{x1},~{x2},~{x3},~{x4},~{x5},~{x6},~{x7},~{x8},~{x9},~{x10},~{x11},~{x12},~{x13},~{x14},~{x15},~{x16},~{x17},~{x18},~{x19},~{x20},~{x21},~{x22},~{x23},~{x24},~{x25},~{x26},~{x27},~{x28},~{x29},~{x30},~{x31}"() nounwind + %result = tail call anyregcc i64 (i64, i32, i8*, i32, ...)* @llvm.experimental.patchpoint.i64(i64 13, i32 16, i8* inttoptr (i64 0 to i8*), i32 2, i64 %p1, i64 %p2, i64 %p3, i64 %p4) + ret i64 %result +} + +declare void @llvm.experimental.patchpoint.void(i64, i32, i8*, i32, ...) +declare i64 @llvm.experimental.patchpoint.i64(i64, i32, i8*, i32, ...) 
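(For reference: each moved test above is self-contained and carries its own RUN line, so it can be exercised outside the lit driver as well. A minimal sketch, assuming an in-tree build with llc and FileCheck on PATH; the test's own RUN line is authoritative:

    llc < test/CodeGen/AArch64/arm64-anyregcc.ll -mtriple=arm64-apple-darwin \
      | FileCheck test/CodeGen/AArch64/arm64-anyregcc.ll

Equivalently, "llvm-lit test/CodeGen/AArch64/arm64-anyregcc.ll" from the build tree runs the RUN line verbatim.)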
diff --git a/test/CodeGen/AArch64/arm64-arith-saturating.ll b/test/CodeGen/AArch64/arm64-arith-saturating.ll new file mode 100644 index 00000000000..78cd1fcb1a2 --- /dev/null +++ b/test/CodeGen/AArch64/arm64-arith-saturating.ll @@ -0,0 +1,153 @@ +; RUN: llc < %s -march=arm64 -mcpu=cyclone | FileCheck %s + +define i32 @qadds(<4 x i32> %b, <4 x i32> %c) nounwind readnone optsize ssp { +; CHECK-LABEL: qadds: +; CHECK: sqadd s0, s0, s1 + %vecext = extractelement <4 x i32> %b, i32 0 + %vecext1 = extractelement <4 x i32> %c, i32 0 + %vqadd.i = tail call i32 @llvm.aarch64.neon.sqadd.i32(i32 %vecext, i32 %vecext1) nounwind + ret i32 %vqadd.i +} + +define i64 @qaddd(<2 x i64> %b, <2 x i64> %c) nounwind readnone optsize ssp { +; CHECK-LABEL: qaddd: +; CHECK: sqadd d0, d0, d1 + %vecext = extractelement <2 x i64> %b, i32 0 + %vecext1 = extractelement <2 x i64> %c, i32 0 + %vqadd.i = tail call i64 @llvm.aarch64.neon.sqadd.i64(i64 %vecext, i64 %vecext1) nounwind + ret i64 %vqadd.i +} + +define i32 @uqadds(<4 x i32> %b, <4 x i32> %c) nounwind readnone optsize ssp { +; CHECK-LABEL: uqadds: +; CHECK: uqadd s0, s0, s1 + %vecext = extractelement <4 x i32> %b, i32 0 + %vecext1 = extractelement <4 x i32> %c, i32 0 + %vqadd.i = tail call i32 @llvm.aarch64.neon.uqadd.i32(i32 %vecext, i32 %vecext1) nounwind + ret i32 %vqadd.i +} + +define i64 @uqaddd(<2 x i64> %b, <2 x i64> %c) nounwind readnone optsize ssp { +; CHECK-LABEL: uqaddd: +; CHECK: uqadd d0, d0, d1 + %vecext = extractelement <2 x i64> %b, i32 0 + %vecext1 = extractelement <2 x i64> %c, i32 0 + %vqadd.i = tail call i64 @llvm.aarch64.neon.uqadd.i64(i64 %vecext, i64 %vecext1) nounwind + ret i64 %vqadd.i +} + +declare i64 @llvm.aarch64.neon.uqadd.i64(i64, i64) nounwind readnone +declare i32 @llvm.aarch64.neon.uqadd.i32(i32, i32) nounwind readnone +declare i64 @llvm.aarch64.neon.sqadd.i64(i64, i64) nounwind readnone +declare i32 @llvm.aarch64.neon.sqadd.i32(i32, i32) nounwind readnone + +define i32 @qsubs(<4 x i32> %b, <4 x i32> %c) nounwind readnone optsize ssp { +; CHECK-LABEL: qsubs: +; CHECK: sqsub s0, s0, s1 + %vecext = extractelement <4 x i32> %b, i32 0 + %vecext1 = extractelement <4 x i32> %c, i32 0 + %vqsub.i = tail call i32 @llvm.aarch64.neon.sqsub.i32(i32 %vecext, i32 %vecext1) nounwind + ret i32 %vqsub.i +} + +define i64 @qsubd(<2 x i64> %b, <2 x i64> %c) nounwind readnone optsize ssp { +; CHECK-LABEL: qsubd: +; CHECK: sqsub d0, d0, d1 + %vecext = extractelement <2 x i64> %b, i32 0 + %vecext1 = extractelement <2 x i64> %c, i32 0 + %vqsub.i = tail call i64 @llvm.aarch64.neon.sqsub.i64(i64 %vecext, i64 %vecext1) nounwind + ret i64 %vqsub.i +} + +define i32 @uqsubs(<4 x i32> %b, <4 x i32> %c) nounwind readnone optsize ssp { +; CHECK-LABEL: uqsubs: +; CHECK: uqsub s0, s0, s1 + %vecext = extractelement <4 x i32> %b, i32 0 + %vecext1 = extractelement <4 x i32> %c, i32 0 + %vqsub.i = tail call i32 @llvm.aarch64.neon.uqsub.i32(i32 %vecext, i32 %vecext1) nounwind + ret i32 %vqsub.i +} + +define i64 @uqsubd(<2 x i64> %b, <2 x i64> %c) nounwind readnone optsize ssp { +; CHECK-LABEL: uqsubd: +; CHECK: uqsub d0, d0, d1 + %vecext = extractelement <2 x i64> %b, i32 0 + %vecext1 = extractelement <2 x i64> %c, i32 0 + %vqsub.i = tail call i64 @llvm.aarch64.neon.uqsub.i64(i64 %vecext, i64 %vecext1) nounwind + ret i64 %vqsub.i +} + +declare i64 @llvm.aarch64.neon.uqsub.i64(i64, i64) nounwind readnone +declare i32 @llvm.aarch64.neon.uqsub.i32(i32, i32) nounwind readnone +declare i64 @llvm.aarch64.neon.sqsub.i64(i64, i64) nounwind readnone +declare i32 
@llvm.aarch64.neon.sqsub.i32(i32, i32) nounwind readnone + +define i32 @qabss(<4 x i32> %b, <4 x i32> %c) nounwind readnone { +; CHECK-LABEL: qabss: +; CHECK: sqabs s0, s0 +; CHECK: ret + %vecext = extractelement <4 x i32> %b, i32 0 + %vqabs.i = tail call i32 @llvm.aarch64.neon.sqabs.i32(i32 %vecext) nounwind + ret i32 %vqabs.i +} + +define i64 @qabsd(<2 x i64> %b, <2 x i64> %c) nounwind readnone { +; CHECK-LABEL: qabsd: +; CHECK: sqabs d0, d0 +; CHECK: ret + %vecext = extractelement <2 x i64> %b, i32 0 + %vqabs.i = tail call i64 @llvm.aarch64.neon.sqabs.i64(i64 %vecext) nounwind + ret i64 %vqabs.i +} + +define i32 @qnegs(<4 x i32> %b, <4 x i32> %c) nounwind readnone { +; CHECK-LABEL: qnegs: +; CHECK: sqneg s0, s0 +; CHECK: ret + %vecext = extractelement <4 x i32> %b, i32 0 + %vqneg.i = tail call i32 @llvm.aarch64.neon.sqneg.i32(i32 %vecext) nounwind + ret i32 %vqneg.i +} + +define i64 @qnegd(<2 x i64> %b, <2 x i64> %c) nounwind readnone { +; CHECK-LABEL: qnegd: +; CHECK: sqneg d0, d0 +; CHECK: ret + %vecext = extractelement <2 x i64> %b, i32 0 + %vqneg.i = tail call i64 @llvm.aarch64.neon.sqneg.i64(i64 %vecext) nounwind + ret i64 %vqneg.i +} + +declare i64 @llvm.aarch64.neon.sqneg.i64(i64) nounwind readnone +declare i32 @llvm.aarch64.neon.sqneg.i32(i32) nounwind readnone +declare i64 @llvm.aarch64.neon.sqabs.i64(i64) nounwind readnone +declare i32 @llvm.aarch64.neon.sqabs.i32(i32) nounwind readnone + + +define i32 @vqmovund(<2 x i64> %b) nounwind readnone { +; CHECK-LABEL: vqmovund: +; CHECK: sqxtun s0, d0 + %vecext = extractelement <2 x i64> %b, i32 0 + %vqmovun.i = tail call i32 @llvm.aarch64.neon.scalar.sqxtun.i32.i64(i64 %vecext) nounwind + ret i32 %vqmovun.i +} + +define i32 @vqmovnd_s(<2 x i64> %b) nounwind readnone { +; CHECK-LABEL: vqmovnd_s: +; CHECK: sqxtn s0, d0 + %vecext = extractelement <2 x i64> %b, i32 0 + %vqmovn.i = tail call i32 @llvm.aarch64.neon.scalar.sqxtn.i32.i64(i64 %vecext) nounwind + ret i32 %vqmovn.i +} + +define i32 @vqmovnd_u(<2 x i64> %b) nounwind readnone { +; CHECK-LABEL: vqmovnd_u: +; CHECK: uqxtn s0, d0 + %vecext = extractelement <2 x i64> %b, i32 0 + %vqmovn.i = tail call i32 @llvm.aarch64.neon.scalar.uqxtn.i32.i64(i64 %vecext) nounwind + ret i32 %vqmovn.i +} + +declare i32 @llvm.aarch64.neon.scalar.uqxtn.i32.i64(i64) nounwind readnone +declare i32 @llvm.aarch64.neon.scalar.sqxtn.i32.i64(i64) nounwind readnone +declare i32 @llvm.aarch64.neon.scalar.sqxtun.i32.i64(i64) nounwind readnone diff --git a/test/CodeGen/AArch64/arm64-arith.ll b/test/CodeGen/AArch64/arm64-arith.ll new file mode 100644 index 00000000000..ed9b569e218 --- /dev/null +++ b/test/CodeGen/AArch64/arm64-arith.ll @@ -0,0 +1,262 @@ +; RUN: llc < %s -march=arm64 -asm-verbose=false | FileCheck %s + +define i32 @t1(i32 %a, i32 %b) nounwind readnone ssp { +entry: +; CHECK-LABEL: t1: +; CHECK: add w0, w1, w0 +; CHECK: ret + %add = add i32 %b, %a + ret i32 %add +} + +define i32 @t2(i32 %a, i32 %b) nounwind readnone ssp { +entry: +; CHECK-LABEL: t2: +; CHECK: udiv w0, w0, w1 +; CHECK: ret + %udiv = udiv i32 %a, %b + ret i32 %udiv +} + +define i64 @t3(i64 %a, i64 %b) nounwind readnone ssp { +entry: +; CHECK-LABEL: t3: +; CHECK: udiv x0, x0, x1 +; CHECK: ret + %udiv = udiv i64 %a, %b + ret i64 %udiv +} + +define i32 @t4(i32 %a, i32 %b) nounwind readnone ssp { +entry: +; CHECK-LABEL: t4: +; CHECK: sdiv w0, w0, w1 +; CHECK: ret + %sdiv = sdiv i32 %a, %b + ret i32 %sdiv +} + +define i64 @t5(i64 %a, i64 %b) nounwind readnone ssp { +entry: +; CHECK-LABEL: t5: +; CHECK: sdiv x0, x0, x1 +; CHECK: ret + 
%sdiv = sdiv i64 %a, %b + ret i64 %sdiv +} + +define i32 @t6(i32 %a, i32 %b) nounwind readnone ssp { +entry: +; CHECK-LABEL: t6: +; CHECK: lsl w0, w0, w1 +; CHECK: ret + %shl = shl i32 %a, %b + ret i32 %shl +} + +define i64 @t7(i64 %a, i64 %b) nounwind readnone ssp { +entry: +; CHECK-LABEL: t7: +; CHECK: lsl x0, x0, x1 +; CHECK: ret + %shl = shl i64 %a, %b + ret i64 %shl +} + +define i32 @t8(i32 %a, i32 %b) nounwind readnone ssp { +entry: +; CHECK-LABEL: t8: +; CHECK: lsr w0, w0, w1 +; CHECK: ret + %lshr = lshr i32 %a, %b + ret i32 %lshr +} + +define i64 @t9(i64 %a, i64 %b) nounwind readnone ssp { +entry: +; CHECK-LABEL: t9: +; CHECK: lsr x0, x0, x1 +; CHECK: ret + %lshr = lshr i64 %a, %b + ret i64 %lshr +} + +define i32 @t10(i32 %a, i32 %b) nounwind readnone ssp { +entry: +; CHECK-LABEL: t10: +; CHECK: asr w0, w0, w1 +; CHECK: ret + %ashr = ashr i32 %a, %b + ret i32 %ashr +} + +define i64 @t11(i64 %a, i64 %b) nounwind readnone ssp { +entry: +; CHECK-LABEL: t11: +; CHECK: asr x0, x0, x1 +; CHECK: ret + %ashr = ashr i64 %a, %b + ret i64 %ashr +} + +define i32 @t12(i16 %a, i32 %x) nounwind ssp { +entry: +; CHECK-LABEL: t12: +; CHECK: add w0, w1, w0, sxth +; CHECK: ret + %c = sext i16 %a to i32 + %e = add i32 %x, %c + ret i32 %e +} + +define i32 @t13(i16 %a, i32 %x) nounwind ssp { +entry: +; CHECK-LABEL: t13: +; CHECK: add w0, w1, w0, sxth #2 +; CHECK: ret + %c = sext i16 %a to i32 + %d = shl i32 %c, 2 + %e = add i32 %x, %d + ret i32 %e +} + +define i64 @t14(i16 %a, i64 %x) nounwind ssp { +entry: +; CHECK-LABEL: t14: +; CHECK: add x0, x1, w0, uxth #3 +; CHECK: ret + %c = zext i16 %a to i64 + %d = shl i64 %c, 3 + %e = add i64 %x, %d + ret i64 %e +} + +; rdar://9160598 +define i64 @t15(i64 %a, i64 %x) nounwind ssp { +entry: +; CHECK-LABEL: t15: +; CHECK: add x0, x1, w0, uxtw +; CHECK: ret + %b = and i64 %a, 4294967295 + %c = add i64 %x, %b + ret i64 %c +} + +define i64 @t16(i64 %x) nounwind ssp { +entry: +; CHECK-LABEL: t16: +; CHECK: lsl x0, x0, #1 +; CHECK: ret + %a = shl i64 %x, 1 + ret i64 %a +} + +; rdar://9166974 +define i64 @t17(i16 %a, i64 %x) nounwind ssp { +entry: +; CHECK-LABEL: t17: +; CHECK: sxth [[REG:x[0-9]+]], w0 +; CHECK: neg x0, [[REG]], lsl #32 +; CHECK: ret + %tmp16 = sext i16 %a to i64 + %tmp17 = mul i64 %tmp16, -4294967296 + ret i64 %tmp17 +} + +define i32 @t18(i32 %a, i32 %b) nounwind readnone ssp { +entry: +; CHECK-LABEL: t18: +; CHECK: sdiv w0, w0, w1 +; CHECK: ret + %sdiv = call i32 @llvm.aarch64.sdiv.i32(i32 %a, i32 %b) + ret i32 %sdiv +} + +define i64 @t19(i64 %a, i64 %b) nounwind readnone ssp { +entry: +; CHECK-LABEL: t19: +; CHECK: sdiv x0, x0, x1 +; CHECK: ret + %sdiv = call i64 @llvm.aarch64.sdiv.i64(i64 %a, i64 %b) + ret i64 %sdiv +} + +define i32 @t20(i32 %a, i32 %b) nounwind readnone ssp { +entry: +; CHECK-LABEL: t20: +; CHECK: udiv w0, w0, w1 +; CHECK: ret + %udiv = call i32 @llvm.aarch64.udiv.i32(i32 %a, i32 %b) + ret i32 %udiv +} + +define i64 @t21(i64 %a, i64 %b) nounwind readnone ssp { +entry: +; CHECK-LABEL: t21: +; CHECK: udiv x0, x0, x1 +; CHECK: ret + %udiv = call i64 @llvm.aarch64.udiv.i64(i64 %a, i64 %b) + ret i64 %udiv +} + +declare i32 @llvm.aarch64.sdiv.i32(i32, i32) nounwind readnone +declare i64 @llvm.aarch64.sdiv.i64(i64, i64) nounwind readnone +declare i32 @llvm.aarch64.udiv.i32(i32, i32) nounwind readnone +declare i64 @llvm.aarch64.udiv.i64(i64, i64) nounwind readnone + +; 32-bit not. 
+define i32 @inv_32(i32 %x) nounwind ssp { +entry: +; CHECK: inv_32 +; CHECK: mvn w0, w0 +; CHECK: ret + %inv = xor i32 %x, -1 + ret i32 %inv +} + +; 64-bit not. +define i64 @inv_64(i64 %x) nounwind ssp { +entry: +; CHECK: inv_64 +; CHECK: mvn x0, x0 +; CHECK: ret + %inv = xor i64 %x, -1 + ret i64 %inv +} + +; Multiplying by a power of two plus or minus one is better done via shift +; and add/sub rather than the madd/msub instructions. The latter are 4+ cycles, +; and the former are two (total for the two instruction sequence for subtract). +define i32 @f0(i32 %a) nounwind readnone ssp { +; CHECK-LABEL: f0: +; CHECK-NEXT: add w0, w0, w0, lsl #3 +; CHECK-NEXT: ret + %res = mul i32 %a, 9 + ret i32 %res +} + +define i64 @f1(i64 %a) nounwind readnone ssp { +; CHECK-LABEL: f1: +; CHECK-NEXT: lsl x8, x0, #4 +; CHECK-NEXT: sub x0, x8, x0 +; CHECK-NEXT: ret + %res = mul i64 %a, 15 + ret i64 %res +} + +define i32 @f2(i32 %a) nounwind readnone ssp { +; CHECK-LABEL: f2: +; CHECK-NEXT: lsl w8, w0, #3 +; CHECK-NEXT: sub w0, w8, w0 +; CHECK-NEXT: ret + %res = mul nsw i32 %a, 7 + ret i32 %res +} + +define i64 @f3(i64 %a) nounwind readnone ssp { +; CHECK-LABEL: f3: +; CHECK-NEXT: add x0, x0, x0, lsl #4 +; CHECK-NEXT: ret + %res = mul nsw i64 %a, 17 + ret i64 %res +} diff --git a/test/CodeGen/AArch64/arm64-arm64-dead-def-elimination-flag.ll b/test/CodeGen/AArch64/arm64-arm64-dead-def-elimination-flag.ll new file mode 100644 index 00000000000..0904b62c403 --- /dev/null +++ b/test/CodeGen/AArch64/arm64-arm64-dead-def-elimination-flag.ll @@ -0,0 +1,16 @@ +; RUN: llc -march=arm64 -aarch64-dead-def-elimination=false < %s | FileCheck %s + +target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128" +target triple = "arm64-apple-ios7.0.0" + +; Function Attrs: nounwind ssp uwtable +define i32 @test1() #0 { + %tmp1 = alloca i8 + %tmp2 = icmp eq i8* %tmp1, null + %tmp3 = zext i1 %tmp2 to i32 + + ret i32 %tmp3 + + ; CHECK-LABEL: test1 + ; CHECK: adds {{x[0-9]+}}, sp, #15 +} diff --git a/test/CodeGen/AArch64/arm64-atomic-128.ll b/test/CodeGen/AArch64/arm64-atomic-128.ll new file mode 100644 index 00000000000..3b43aa16d2b --- /dev/null +++ b/test/CodeGen/AArch64/arm64-atomic-128.ll @@ -0,0 +1,225 @@ +; RUN: llc < %s -march=arm64 -mtriple=arm64-linux-gnu -verify-machineinstrs -mcpu=cyclone | FileCheck %s + +@var = global i128 0 + +define i128 @val_compare_and_swap(i128* %p, i128 %oldval, i128 %newval) { +; CHECK-LABEL: val_compare_and_swap: +; CHECK: [[LABEL:.?LBB[0-9]+_[0-9]+]]: +; CHECK: ldaxp [[RESULTLO:x[0-9]+]], [[RESULTHI:x[0-9]+]], [x[[ADDR:[0-9]+]]] +; CHECK-DAG: eor [[MISMATCH_LO:x[0-9]+]], [[RESULTLO]], x2 +; CHECK-DAG: eor [[MISMATCH_HI:x[0-9]+]], [[RESULTHI]], x3 +; CHECK: orr [[MISMATCH:x[0-9]+]], [[MISMATCH_LO]], [[MISMATCH_HI]] +; CHECK: cbnz [[MISMATCH]], [[DONE:.LBB[0-9]+_[0-9]+]] +; CHECK: stxp [[SCRATCH_RES:w[0-9]+]], x4, x5, [x[[ADDR]]] +; CHECK: cbnz [[SCRATCH_RES]], [[LABEL]] +; CHECK: [[DONE]]: + %val = cmpxchg i128* %p, i128 %oldval, i128 %newval acquire acquire + ret i128 %val +} + +define void @fetch_and_nand(i128* %p, i128 %bits) { +; CHECK-LABEL: fetch_and_nand: +; CHECK: [[LABEL:.?LBB[0-9]+_[0-9]+]]: +; CHECK: ldxp [[DEST_REGLO:x[0-9]+]], [[DEST_REGHI:x[0-9]+]], [x0] +; CHECK-DAG: bic [[SCRATCH_REGLO:x[0-9]+]], [[DEST_REGLO]], x2 +; CHECK-DAG: bic [[SCRATCH_REGHI:x[0-9]+]], [[DEST_REGHI]], x3 +; CHECK: stlxp [[SCRATCH_RES:w[0-9]+]], [[SCRATCH_REGLO]], [[SCRATCH_REGHI]], [x0] +; CHECK: cbnz [[SCRATCH_RES]], [[LABEL]] + +; CHECK-DAG: str [[DEST_REGHI]] +; CHECK-DAG: str [[DEST_REGLO]] + %val = 
atomicrmw nand i128* %p, i128 %bits release + store i128 %val, i128* @var, align 16 + ret void +} + +define void @fetch_and_or(i128* %p, i128 %bits) { +; CHECK-LABEL: fetch_and_or: +; CHECK: [[LABEL:.?LBB[0-9]+_[0-9]+]]: +; CHECK: ldaxp [[DEST_REGLO:x[0-9]+]], [[DEST_REGHI:x[0-9]+]], [x0] +; CHECK-DAG: orr [[SCRATCH_REGLO:x[0-9]+]], [[DEST_REGLO]], x2 +; CHECK-DAG: orr [[SCRATCH_REGHI:x[0-9]+]], [[DEST_REGHI]], x3 +; CHECK: stlxp [[SCRATCH_RES:w[0-9]+]], [[SCRATCH_REGLO]], [[SCRATCH_REGHI]], [x0] +; CHECK: cbnz [[SCRATCH_RES]], [[LABEL]] + +; CHECK-DAG: str [[DEST_REGHI]] +; CHECK-DAG: str [[DEST_REGLO]] + %val = atomicrmw or i128* %p, i128 %bits seq_cst + store i128 %val, i128* @var, align 16 + ret void +} + +define void @fetch_and_add(i128* %p, i128 %bits) { +; CHECK-LABEL: fetch_and_add: +; CHECK: [[LABEL:.?LBB[0-9]+_[0-9]+]]: +; CHECK: ldaxp [[DEST_REGLO:x[0-9]+]], [[DEST_REGHI:x[0-9]+]], [x0] +; CHECK: adds [[SCRATCH_REGLO:x[0-9]+]], [[DEST_REGLO]], x2 +; CHECK: adcs [[SCRATCH_REGHI:x[0-9]+]], [[DEST_REGHI]], x3 +; CHECK: stlxp [[SCRATCH_RES:w[0-9]+]], [[SCRATCH_REGLO]], [[SCRATCH_REGHI]], [x0] +; CHECK: cbnz [[SCRATCH_RES]], [[LABEL]] + +; CHECK-DAG: str [[DEST_REGHI]] +; CHECK-DAG: str [[DEST_REGLO]] + %val = atomicrmw add i128* %p, i128 %bits seq_cst + store i128 %val, i128* @var, align 16 + ret void +} + +define void @fetch_and_sub(i128* %p, i128 %bits) { +; CHECK-LABEL: fetch_and_sub: +; CHECK: [[LABEL:.?LBB[0-9]+_[0-9]+]]: +; CHECK: ldaxp [[DEST_REGLO:x[0-9]+]], [[DEST_REGHI:x[0-9]+]], [x0] +; CHECK: subs [[SCRATCH_REGLO:x[0-9]+]], [[DEST_REGLO]], x2 +; CHECK: sbcs [[SCRATCH_REGHI:x[0-9]+]], [[DEST_REGHI]], x3 +; CHECK: stlxp [[SCRATCH_RES:w[0-9]+]], [[SCRATCH_REGLO]], [[SCRATCH_REGHI]], [x0] +; CHECK: cbnz [[SCRATCH_RES]], [[LABEL]] + +; CHECK-DAG: str [[DEST_REGHI]] +; CHECK-DAG: str [[DEST_REGLO]] + %val = atomicrmw sub i128* %p, i128 %bits seq_cst + store i128 %val, i128* @var, align 16 + ret void +} + +define void @fetch_and_min(i128* %p, i128 %bits) { +; CHECK-LABEL: fetch_and_min: +; CHECK: [[LABEL:.?LBB[0-9]+_[0-9]+]]: +; CHECK: ldaxp [[DEST_REGLO:x[0-9]+]], [[DEST_REGHI:x[0-9]+]], [x0] +; CHECK: cmp [[DEST_REGLO]], x2 +; CHECK: cset [[LOCMP:w[0-9]+]], ls +; CHECK: cmp [[DEST_REGHI:x[0-9]+]], x3 +; CHECK: cset [[HICMP:w[0-9]+]], le +; CHECK: csel [[CMP:w[0-9]+]], [[LOCMP]], [[HICMP]], eq +; CHECK: cmp [[CMP]], #0 +; CHECK-DAG: csel [[SCRATCH_REGHI:x[0-9]+]], [[DEST_REGHI]], x3, ne +; CHECK-DAG: csel [[SCRATCH_REGLO:x[0-9]+]], [[DEST_REGLO]], x2, ne +; CHECK: stlxp [[SCRATCH_RES:w[0-9]+]], [[SCRATCH_REGLO]], [[SCRATCH_REGHI]], [x0] +; CHECK: cbnz [[SCRATCH_RES]], [[LABEL]] + +; CHECK-DAG: str [[DEST_REGHI]] +; CHECK-DAG: str [[DEST_REGLO]] + %val = atomicrmw min i128* %p, i128 %bits seq_cst + store i128 %val, i128* @var, align 16 + ret void +} + +define void @fetch_and_max(i128* %p, i128 %bits) { +; CHECK-LABEL: fetch_and_max: +; CHECK: [[LABEL:.?LBB[0-9]+_[0-9]+]]: +; CHECK: ldaxp [[DEST_REGLO:x[0-9]+]], [[DEST_REGHI:x[0-9]+]], [x0] +; CHECK: cmp [[DEST_REGLO]], x2 +; CHECK: cset [[LOCMP:w[0-9]+]], hi +; CHECK: cmp [[DEST_REGHI:x[0-9]+]], x3 +; CHECK: cset [[HICMP:w[0-9]+]], gt +; CHECK: csel [[CMP:w[0-9]+]], [[LOCMP]], [[HICMP]], eq +; CHECK: cmp [[CMP]], #0 +; CHECK-DAG: csel [[SCRATCH_REGHI:x[0-9]+]], [[DEST_REGHI]], x3, ne +; CHECK-DAG: csel [[SCRATCH_REGLO:x[0-9]+]], [[DEST_REGLO]], x2, ne +; CHECK: stlxp [[SCRATCH_RES:w[0-9]+]], [[SCRATCH_REGLO]], [[SCRATCH_REGHI]], [x0] +; CHECK: cbnz [[SCRATCH_RES]], [[LABEL]] + +; CHECK-DAG: str [[DEST_REGHI]] +; CHECK-DAG: str 
[[DEST_REGLO]] + %val = atomicrmw max i128* %p, i128 %bits seq_cst + store i128 %val, i128* @var, align 16 + ret void +} + +define void @fetch_and_umin(i128* %p, i128 %bits) { +; CHECK-LABEL: fetch_and_umin: +; CHECK: [[LABEL:.?LBB[0-9]+_[0-9]+]]: +; CHECK: ldaxp [[DEST_REGLO:x[0-9]+]], [[DEST_REGHI:x[0-9]+]], [x0] +; CHECK: cmp [[DEST_REGLO]], x2 +; CHECK: cset [[LOCMP:w[0-9]+]], ls +; CHECK: cmp [[DEST_REGHI:x[0-9]+]], x3 +; CHECK: cset [[HICMP:w[0-9]+]], ls +; CHECK: csel [[CMP:w[0-9]+]], [[LOCMP]], [[HICMP]], eq +; CHECK: cmp [[CMP]], #0 +; CHECK-DAG: csel [[SCRATCH_REGHI:x[0-9]+]], [[DEST_REGHI]], x3, ne +; CHECK-DAG: csel [[SCRATCH_REGLO:x[0-9]+]], [[DEST_REGLO]], x2, ne +; CHECK: stlxp [[SCRATCH_RES:w[0-9]+]], [[SCRATCH_REGLO]], [[SCRATCH_REGHI]], [x0] +; CHECK: cbnz [[SCRATCH_RES]], [[LABEL]] + +; CHECK-DAG: str [[DEST_REGHI]] +; CHECK-DAG: str [[DEST_REGLO]] + %val = atomicrmw umin i128* %p, i128 %bits seq_cst + store i128 %val, i128* @var, align 16 + ret void +} + +define void @fetch_and_umax(i128* %p, i128 %bits) { +; CHECK-LABEL: fetch_and_umax: +; CHECK: [[LABEL:.?LBB[0-9]+_[0-9]+]]: +; CHECK: ldaxp [[DEST_REGLO:x[0-9]+]], [[DEST_REGHI:x[0-9]+]], [x0] +; CHECK: cmp [[DEST_REGLO]], x2 +; CHECK: cset [[LOCMP:w[0-9]+]], hi +; CHECK: cmp [[DEST_REGHI:x[0-9]+]], x3 +; CHECK: cset [[HICMP:w[0-9]+]], hi +; CHECK: csel [[CMP:w[0-9]+]], [[LOCMP]], [[HICMP]], eq +; CHECK: cmp [[CMP]], #0 +; CHECK-DAG: csel [[SCRATCH_REGHI:x[0-9]+]], [[DEST_REGHI]], x3, ne +; CHECK-DAG: csel [[SCRATCH_REGLO:x[0-9]+]], [[DEST_REGLO]], x2, ne +; CHECK: stlxp [[SCRATCH_RES:w[0-9]+]], [[SCRATCH_REGLO]], [[SCRATCH_REGHI]], [x0] +; CHECK: cbnz [[SCRATCH_RES]], [[LABEL]] + +; CHECK-DAG: str [[DEST_REGHI]] +; CHECK-DAG: str [[DEST_REGLO]] + %val = atomicrmw umax i128* %p, i128 %bits seq_cst + store i128 %val, i128* @var, align 16 + ret void +} + +define i128 @atomic_load_seq_cst(i128* %p) { +; CHECK-LABEL: atomic_load_seq_cst: +; CHECK-NOT: dmb +; CHECK-LABEL: ldaxp +; CHECK-NOT: dmb + %r = load atomic i128* %p seq_cst, align 16 + ret i128 %r +} + +define i128 @atomic_load_relaxed(i128* %p) { +; CHECK-LABEL: atomic_load_relaxed: +; CHECK-NOT: dmb +; CHECK: ldxp [[LO:x[0-9]+]], [[HI:x[0-9]+]], [x0] +; CHECK-NOT: dmb + %r = load atomic i128* %p monotonic, align 16 + ret i128 %r +} + + +define void @atomic_store_seq_cst(i128 %in, i128* %p) { +; CHECK-LABEL: atomic_store_seq_cst: +; CHECK-NOT: dmb +; CHECK: [[LABEL:.?LBB[0-9]+_[0-9]+]]: +; CHECK: ldaxp xzr, xzr, [x2] +; CHECK: stlxp [[SUCCESS:w[0-9]+]], x0, x1, [x2] +; CHECK: cbnz [[SUCCESS]], [[LABEL]] +; CHECK-NOT: dmb + store atomic i128 %in, i128* %p seq_cst, align 16 + ret void +} + +define void @atomic_store_release(i128 %in, i128* %p) { +; CHECK-LABEL: atomic_store_release: +; CHECK-NOT: dmb +; CHECK: [[LABEL:.?LBB[0-9]+_[0-9]+]]: +; CHECK: ldxp xzr, xzr, [x2] +; CHECK: stlxp [[SUCCESS:w[0-9]+]], x0, x1, [x2] +; CHECK: cbnz [[SUCCESS]], [[LABEL]] +; CHECK-NOT: dmb + store atomic i128 %in, i128* %p release, align 16 + ret void +} + +define void @atomic_store_relaxed(i128 %in, i128* %p) { +; CHECK-LABEL: atomic_store_relaxed: +; CHECK-NOT: dmb +; CHECK: [[LABEL:.?LBB[0-9]+_[0-9]+]]: +; CHECK: ldxp xzr, xzr, [x2] +; CHECK: stxp [[SUCCESS:w[0-9]+]], x0, x1, [x2] +; CHECK: cbnz [[SUCCESS]], [[LABEL]] +; CHECK-NOT: dmb + store atomic i128 %in, i128* %p unordered, align 16 + ret void +} diff --git a/test/CodeGen/AArch64/arm64-atomic.ll b/test/CodeGen/AArch64/arm64-atomic.ll new file mode 100644 index 00000000000..aa9b284410b --- /dev/null +++ 
b/test/CodeGen/AArch64/arm64-atomic.ll @@ -0,0 +1,331 @@ +; RUN: llc < %s -march=arm64 -verify-machineinstrs -mcpu=cyclone | FileCheck %s + +define i32 @val_compare_and_swap(i32* %p) { +; CHECK-LABEL: val_compare_and_swap: +; CHECK: orr [[NEWVAL_REG:w[0-9]+]], wzr, #0x4 +; CHECK: [[LABEL:.?LBB[0-9]+_[0-9]+]]: +; CHECK: ldaxr [[RESULT:w[0-9]+]], [x0] +; CHECK: cmp [[RESULT]], #7 +; CHECK: b.ne [[LABEL2:.?LBB[0-9]+_[0-9]+]] +; CHECK: stxr [[SCRATCH_REG:w[0-9]+]], [[NEWVAL_REG]], [x0] +; CHECK: cbnz [[SCRATCH_REG]], [[LABEL]] +; CHECK: [[LABEL2]]: + %val = cmpxchg i32* %p, i32 7, i32 4 acquire acquire + ret i32 %val +} + +define i64 @val_compare_and_swap_64(i64* %p) { +; CHECK-LABEL: val_compare_and_swap_64: +; CHECK: orr w[[NEWVAL_REG:[0-9]+]], wzr, #0x4 +; CHECK: [[LABEL:.?LBB[0-9]+_[0-9]+]]: +; CHECK: ldxr [[RESULT:x[0-9]+]], [x0] +; CHECK: cmp [[RESULT]], #7 +; CHECK: b.ne [[LABEL2:.?LBB[0-9]+_[0-9]+]] +; CHECK-NOT: stxr x[[NEWVAL_REG]], x[[NEWVAL_REG]] +; CHECK: stxr [[SCRATCH_REG:w[0-9]+]], x[[NEWVAL_REG]], [x0] +; CHECK: cbnz [[SCRATCH_REG]], [[LABEL]] +; CHECK: [[LABEL2]]: + %val = cmpxchg i64* %p, i64 7, i64 4 monotonic monotonic + ret i64 %val +} + +define i32 @fetch_and_nand(i32* %p) { +; CHECK-LABEL: fetch_and_nand: +; CHECK: [[LABEL:.?LBB[0-9]+_[0-9]+]]: +; CHECK: ldxr w[[DEST_REG:[0-9]+]], [x0] +; CHECK: and [[SCRATCH2_REG:w[0-9]+]], w[[DEST_REG]], #0xfffffff8 +; CHECK-NOT: stlxr [[SCRATCH2_REG]], [[SCRATCH2_REG]] +; CHECK: stlxr [[SCRATCH_REG:w[0-9]+]], [[SCRATCH2_REG]], [x0] +; CHECK: cbnz [[SCRATCH_REG]], [[LABEL]] +; CHECK: mov x0, x[[DEST_REG]] + %val = atomicrmw nand i32* %p, i32 7 release + ret i32 %val +} + +define i64 @fetch_and_nand_64(i64* %p) { +; CHECK-LABEL: fetch_and_nand_64: +; CHECK: mov x[[ADDR:[0-9]+]], x0 +; CHECK: [[LABEL:.?LBB[0-9]+_[0-9]+]]: +; CHECK: ldaxr [[DEST_REG:x[0-9]+]], [x[[ADDR]]] +; CHECK: and [[SCRATCH2_REG:x[0-9]+]], [[DEST_REG]], #0xfffffffffffffff8 +; CHECK: stlxr [[SCRATCH_REG:w[0-9]+]], [[SCRATCH2_REG]], [x[[ADDR]]] +; CHECK: cbnz [[SCRATCH_REG]], [[LABEL]] + + %val = atomicrmw nand i64* %p, i64 7 acq_rel + ret i64 %val +} + +define i32 @fetch_and_or(i32* %p) { +; CHECK-LABEL: fetch_and_or: +; CHECK: movz [[OLDVAL_REG:w[0-9]+]], #0x5 +; CHECK: [[LABEL:.?LBB[0-9]+_[0-9]+]]: +; CHECK: ldaxr w[[DEST_REG:[0-9]+]], [x0] +; CHECK: orr [[SCRATCH2_REG:w[0-9]+]], w[[DEST_REG]], [[OLDVAL_REG]] +; CHECK-NOT: stlxr [[SCRATCH2_REG]], [[SCRATCH2_REG]] +; CHECK: stlxr [[SCRATCH_REG:w[0-9]+]], [[SCRATCH2_REG]], [x0] +; CHECK: cbnz [[SCRATCH_REG]], [[LABEL]] +; CHECK: mov x0, x[[DEST_REG]] + %val = atomicrmw or i32* %p, i32 5 seq_cst + ret i32 %val +} + +define i64 @fetch_and_or_64(i64* %p) { +; CHECK: fetch_and_or_64: +; CHECK: mov x[[ADDR:[0-9]+]], x0 +; CHECK: [[LABEL:.?LBB[0-9]+_[0-9]+]]: +; CHECK: ldxr [[DEST_REG:x[0-9]+]], [x[[ADDR]]] +; CHECK: orr [[SCRATCH2_REG:x[0-9]+]], [[DEST_REG]], #0x7 +; CHECK: stxr [[SCRATCH_REG:w[0-9]+]], [[SCRATCH2_REG]], [x[[ADDR]]] +; CHECK: cbnz [[SCRATCH_REG]], [[LABEL]] + %val = atomicrmw or i64* %p, i64 7 monotonic + ret i64 %val +} + +define void @acquire_fence() { + fence acquire + ret void + ; CHECK-LABEL: acquire_fence: + ; CHECK: dmb ishld +} + +define void @release_fence() { + fence release + ret void + ; CHECK-LABEL: release_fence: + ; CHECK: dmb ish{{$}} +} + +define void @seq_cst_fence() { + fence seq_cst + ret void + ; CHECK-LABEL: seq_cst_fence: + ; CHECK: dmb ish{{$}} +} + +define i32 @atomic_load(i32* %p) { + %r = load atomic i32* %p seq_cst, align 4 + ret i32 %r + ; CHECK-LABEL: atomic_load: + ; CHECK: 
ldar +} + +define i8 @atomic_load_relaxed_8(i8* %p, i32 %off32) { +; CHECK-LABEL: atomic_load_relaxed_8: + %ptr_unsigned = getelementptr i8* %p, i32 4095 + %val_unsigned = load atomic i8* %ptr_unsigned monotonic, align 1 +; CHECK: ldrb {{w[0-9]+}}, [x0, #4095] + + %ptr_regoff = getelementptr i8* %p, i32 %off32 + %val_regoff = load atomic i8* %ptr_regoff unordered, align 1 + %tot1 = add i8 %val_unsigned, %val_regoff +; CHECK: ldrb {{w[0-9]+}}, [x0, w1, sxtw] + + %ptr_unscaled = getelementptr i8* %p, i32 -256 + %val_unscaled = load atomic i8* %ptr_unscaled monotonic, align 1 + %tot2 = add i8 %tot1, %val_unscaled +; CHECK: ldurb {{w[0-9]+}}, [x0, #-256] + + %ptr_random = getelementptr i8* %p, i32 1191936 ; 0x123000 (i.e. ADD imm) + %val_random = load atomic i8* %ptr_random unordered, align 1 + %tot3 = add i8 %tot2, %val_random +; CHECK: add x[[ADDR:[0-9]+]], x0, #291, lsl #12 +; CHECK: ldrb {{w[0-9]+}}, [x[[ADDR]]] + + ret i8 %tot3 +} + +define i16 @atomic_load_relaxed_16(i16* %p, i32 %off32) { +; CHECK-LABEL: atomic_load_relaxed_16: + %ptr_unsigned = getelementptr i16* %p, i32 4095 + %val_unsigned = load atomic i16* %ptr_unsigned monotonic, align 2 +; CHECK: ldrh {{w[0-9]+}}, [x0, #8190] + + %ptr_regoff = getelementptr i16* %p, i32 %off32 + %val_regoff = load atomic i16* %ptr_regoff unordered, align 2 + %tot1 = add i16 %val_unsigned, %val_regoff +; CHECK: ldrh {{w[0-9]+}}, [x0, w1, sxtw #1] + + %ptr_unscaled = getelementptr i16* %p, i32 -128 + %val_unscaled = load atomic i16* %ptr_unscaled monotonic, align 2 + %tot2 = add i16 %tot1, %val_unscaled +; CHECK: ldurh {{w[0-9]+}}, [x0, #-256] + + %ptr_random = getelementptr i16* %p, i32 595968 ; 0x123000/2 (i.e. ADD imm) + %val_random = load atomic i16* %ptr_random unordered, align 2 + %tot3 = add i16 %tot2, %val_random +; CHECK: add x[[ADDR:[0-9]+]], x0, #291, lsl #12 +; CHECK: ldrh {{w[0-9]+}}, [x[[ADDR]]] + + ret i16 %tot3 +} + +define i32 @atomic_load_relaxed_32(i32* %p, i32 %off32) { +; CHECK-LABEL: atomic_load_relaxed_32: + %ptr_unsigned = getelementptr i32* %p, i32 4095 + %val_unsigned = load atomic i32* %ptr_unsigned monotonic, align 4 +; CHECK: ldr {{w[0-9]+}}, [x0, #16380] + + %ptr_regoff = getelementptr i32* %p, i32 %off32 + %val_regoff = load atomic i32* %ptr_regoff unordered, align 4 + %tot1 = add i32 %val_unsigned, %val_regoff +; CHECK: ldr {{w[0-9]+}}, [x0, w1, sxtw #2] + + %ptr_unscaled = getelementptr i32* %p, i32 -64 + %val_unscaled = load atomic i32* %ptr_unscaled monotonic, align 4 + %tot2 = add i32 %tot1, %val_unscaled +; CHECK: ldur {{w[0-9]+}}, [x0, #-256] + + %ptr_random = getelementptr i32* %p, i32 297984 ; 0x123000/4 (i.e. 
ADD imm) + %val_random = load atomic i32* %ptr_random unordered, align 4 + %tot3 = add i32 %tot2, %val_random +; CHECK: add x[[ADDR:[0-9]+]], x0, #291, lsl #12 +; CHECK: ldr {{w[0-9]+}}, [x[[ADDR]]] + + ret i32 %tot3 +} + +define i64 @atomic_load_relaxed_64(i64* %p, i32 %off32) { +; CHECK-LABEL: atomic_load_relaxed_64: + %ptr_unsigned = getelementptr i64* %p, i32 4095 + %val_unsigned = load atomic i64* %ptr_unsigned monotonic, align 8 +; CHECK: ldr {{x[0-9]+}}, [x0, #32760] + + %ptr_regoff = getelementptr i64* %p, i32 %off32 + %val_regoff = load atomic i64* %ptr_regoff unordered, align 8 + %tot1 = add i64 %val_unsigned, %val_regoff +; CHECK: ldr {{x[0-9]+}}, [x0, w1, sxtw #3] + + %ptr_unscaled = getelementptr i64* %p, i32 -32 + %val_unscaled = load atomic i64* %ptr_unscaled monotonic, align 8 + %tot2 = add i64 %tot1, %val_unscaled +; CHECK: ldur {{x[0-9]+}}, [x0, #-256] + + %ptr_random = getelementptr i64* %p, i32 148992 ; 0x123000/8 (i.e. ADD imm) + %val_random = load atomic i64* %ptr_random unordered, align 8 + %tot3 = add i64 %tot2, %val_random +; CHECK: add x[[ADDR:[0-9]+]], x0, #291, lsl #12 +; CHECK: ldr {{x[0-9]+}}, [x[[ADDR]]] + + ret i64 %tot3 +} + + +define void @atomc_store(i32* %p) { + store atomic i32 4, i32* %p seq_cst, align 4 + ret void + ; CHECK-LABEL: atomc_store: + ; CHECK: stlr +} + +define void @atomic_store_relaxed_8(i8* %p, i32 %off32, i8 %val) { +; CHECK-LABEL: atomic_store_relaxed_8: + %ptr_unsigned = getelementptr i8* %p, i32 4095 + store atomic i8 %val, i8* %ptr_unsigned monotonic, align 1 +; CHECK: strb {{w[0-9]+}}, [x0, #4095] + + %ptr_regoff = getelementptr i8* %p, i32 %off32 + store atomic i8 %val, i8* %ptr_regoff unordered, align 1 +; CHECK: strb {{w[0-9]+}}, [x0, w1, sxtw] + + %ptr_unscaled = getelementptr i8* %p, i32 -256 + store atomic i8 %val, i8* %ptr_unscaled monotonic, align 1 +; CHECK: sturb {{w[0-9]+}}, [x0, #-256] + + %ptr_random = getelementptr i8* %p, i32 1191936 ; 0x123000 (i.e. ADD imm) + store atomic i8 %val, i8* %ptr_random unordered, align 1 +; CHECK: add x[[ADDR:[0-9]+]], x0, #291, lsl #12 +; CHECK: strb {{w[0-9]+}}, [x[[ADDR]]] + + ret void +} + +define void @atomic_store_relaxed_16(i16* %p, i32 %off32, i16 %val) { +; CHECK-LABEL: atomic_store_relaxed_16: + %ptr_unsigned = getelementptr i16* %p, i32 4095 + store atomic i16 %val, i16* %ptr_unsigned monotonic, align 2 +; CHECK: strh {{w[0-9]+}}, [x0, #8190] + + %ptr_regoff = getelementptr i16* %p, i32 %off32 + store atomic i16 %val, i16* %ptr_regoff unordered, align 2 +; CHECK: strh {{w[0-9]+}}, [x0, w1, sxtw #1] + + %ptr_unscaled = getelementptr i16* %p, i32 -128 + store atomic i16 %val, i16* %ptr_unscaled monotonic, align 2 +; CHECK: sturh {{w[0-9]+}}, [x0, #-256] + + %ptr_random = getelementptr i16* %p, i32 595968 ; 0x123000/2 (i.e. 
ADD imm) + store atomic i16 %val, i16* %ptr_random unordered, align 2 +; CHECK: add x[[ADDR:[0-9]+]], x0, #291, lsl #12 +; CHECK: strh {{w[0-9]+}}, [x[[ADDR]]] + + ret void +} + +define void @atomic_store_relaxed_32(i32* %p, i32 %off32, i32 %val) { +; CHECK-LABEL: atomic_store_relaxed_32: + %ptr_unsigned = getelementptr i32* %p, i32 4095 + store atomic i32 %val, i32* %ptr_unsigned monotonic, align 4 +; CHECK: str {{w[0-9]+}}, [x0, #16380] + + %ptr_regoff = getelementptr i32* %p, i32 %off32 + store atomic i32 %val, i32* %ptr_regoff unordered, align 4 +; CHECK: str {{w[0-9]+}}, [x0, w1, sxtw #2] + + %ptr_unscaled = getelementptr i32* %p, i32 -64 + store atomic i32 %val, i32* %ptr_unscaled monotonic, align 4 +; CHECK: stur {{w[0-9]+}}, [x0, #-256] + + %ptr_random = getelementptr i32* %p, i32 297984 ; 0x123000/4 (i.e. ADD imm) + store atomic i32 %val, i32* %ptr_random unordered, align 4 +; CHECK: add x[[ADDR:[0-9]+]], x0, #291, lsl #12 +; CHECK: str {{w[0-9]+}}, [x[[ADDR]]] + + ret void +} + +define void @atomic_store_relaxed_64(i64* %p, i32 %off32, i64 %val) { +; CHECK-LABEL: atomic_store_relaxed_64: + %ptr_unsigned = getelementptr i64* %p, i32 4095 + store atomic i64 %val, i64* %ptr_unsigned monotonic, align 8 +; CHECK: str {{x[0-9]+}}, [x0, #32760] + + %ptr_regoff = getelementptr i64* %p, i32 %off32 + store atomic i64 %val, i64* %ptr_regoff unordered, align 8 +; CHECK: str {{x[0-9]+}}, [x0, w1, sxtw #3] + + %ptr_unscaled = getelementptr i64* %p, i32 -32 + store atomic i64 %val, i64* %ptr_unscaled monotonic, align 8 +; CHECK: stur {{x[0-9]+}}, [x0, #-256] + + %ptr_random = getelementptr i64* %p, i32 148992 ; 0x123000/8 (i.e. ADD imm) + store atomic i64 %val, i64* %ptr_random unordered, align 8 +; CHECK: add x[[ADDR:[0-9]+]], x0, #291, lsl #12 +; CHECK: str {{x[0-9]+}}, [x[[ADDR]]] + + ret void +} + +; rdar://11531169 +; rdar://11531308 + +%"class.X::Atomic" = type { %struct.x_atomic_t } +%struct.x_atomic_t = type { i32 } + +@counter = external hidden global %"class.X::Atomic", align 4 + +define i32 @next_id() nounwind optsize ssp align 2 { +entry: + %0 = atomicrmw add i32* getelementptr inbounds (%"class.X::Atomic"* @counter, i64 0, i32 0, i32 0), i32 1 seq_cst + %add.i = add i32 %0, 1 + %tobool = icmp eq i32 %add.i, 0 + br i1 %tobool, label %if.else, label %return + +if.else: ; preds = %entry + %1 = atomicrmw add i32* getelementptr inbounds (%"class.X::Atomic"* @counter, i64 0, i32 0, i32 0), i32 1 seq_cst + %add.i2 = add i32 %1, 1 + br label %return + +return: ; preds = %if.else, %entry + %retval.0 = phi i32 [ %add.i2, %if.else ], [ %add.i, %entry ] + ret i32 %retval.0 +} diff --git a/test/CodeGen/AArch64/arm64-basic-pic.ll b/test/CodeGen/AArch64/arm64-basic-pic.ll new file mode 100644 index 00000000000..9fdb1e91385 --- /dev/null +++ b/test/CodeGen/AArch64/arm64-basic-pic.ll @@ -0,0 +1,54 @@ +; RUN: llc -mtriple=arm64-none-linux-gnu -verify-machineinstrs -relocation-model=pic %s -o - | FileCheck %s + +@var = global i32 0 + +define i32 @get_globalvar() { +; CHECK-LABEL: get_globalvar: + + %val = load i32* @var +; CHECK: adrp x[[GOTHI:[0-9]+]], :got:var +; CHECK: ldr x[[GOTLOC:[0-9]+]], [x[[GOTHI]], :got_lo12:var] +; CHECK: ldr w0, [x[[GOTLOC]]] + + ret i32 %val +} + +define i32* @get_globalvaraddr() { +; CHECK-LABEL: get_globalvaraddr: + + %val = load i32* @var +; CHECK: adrp x[[GOTHI:[0-9]+]], :got:var +; CHECK: ldr x0, [x[[GOTHI]], :got_lo12:var] + + ret i32* @var +} + +@hiddenvar = hidden global i32 0 + +define i32 @get_hiddenvar() { +; CHECK-LABEL: get_hiddenvar: + + %val = load i32* 
@hiddenvar +; CHECK: adrp x[[HI:[0-9]+]], hiddenvar +; CHECK: ldr w0, [x[[HI]], :lo12:hiddenvar] + + ret i32 %val +} + +define i32* @get_hiddenvaraddr() { +; CHECK-LABEL: get_hiddenvaraddr: + + %val = load i32* @hiddenvar +; CHECK: adrp [[HI:x[0-9]+]], hiddenvar +; CHECK: add x0, [[HI]], :lo12:hiddenvar + + ret i32* @hiddenvar +} + +define void()* @get_func() { +; CHECK-LABEL: get_func: + + ret void()* bitcast(void()*()* @get_func to void()*) +; CHECK: adrp x[[GOTHI:[0-9]+]], :got:get_func +; CHECK: ldr x0, [x[[GOTHI]], :got_lo12:get_func] +} diff --git a/test/CodeGen/AArch64/arm64-big-endian-bitconverts.ll b/test/CodeGen/AArch64/arm64-big-endian-bitconverts.ll new file mode 100644 index 00000000000..f0e968b2c17 --- /dev/null +++ b/test/CodeGen/AArch64/arm64-big-endian-bitconverts.ll @@ -0,0 +1,1101 @@ +; RUN: llc -mtriple arm64_be < %s -aarch64-load-store-opt=false -O1 -o - | FileCheck %s +; RUN: llc -mtriple arm64_be < %s -aarch64-load-store-opt=false -O0 -fast-isel=true -o - | FileCheck %s + +; CHECK-LABEL: test_i64_f64: +define void @test_i64_f64(double* %p, i64* %q) { +; CHECK: ldr +; CHECK: str + %1 = load double* %p + %2 = fadd double %1, %1 + %3 = bitcast double %2 to i64 + %4 = add i64 %3, %3 + store i64 %4, i64* %q + ret void +} + +; CHECK-LABEL: test_i64_v1i64: +define void @test_i64_v1i64(<1 x i64>* %p, i64* %q) { +; CHECK: ldr +; CHECK: str + %1 = load <1 x i64>* %p + %2 = add <1 x i64> %1, %1 + %3 = bitcast <1 x i64> %2 to i64 + %4 = add i64 %3, %3 + store i64 %4, i64* %q + ret void +} + +; CHECK-LABEL: test_i64_v2f32: +define void @test_i64_v2f32(<2 x float>* %p, i64* %q) { +; CHECK: ld1 { v{{[0-9]+}}.2s } +; CHECK: rev64 v{{[0-9]+}}.2s +; CHECK: str + %1 = load <2 x float>* %p + %2 = fadd <2 x float> %1, %1 + %3 = bitcast <2 x float> %2 to i64 + %4 = add i64 %3, %3 + store i64 %4, i64* %q + ret void +} + +; CHECK-LABEL: test_i64_v2i32: +define void @test_i64_v2i32(<2 x i32>* %p, i64* %q) { +; CHECK: ld1 { v{{[0-9]+}}.2s } +; CHECK: rev64 v{{[0-9]+}}.2s +; CHECK: str + %1 = load <2 x i32>* %p + %2 = add <2 x i32> %1, %1 + %3 = bitcast <2 x i32> %2 to i64 + %4 = add i64 %3, %3 + store i64 %4, i64* %q + ret void +} + +; CHECK-LABEL: test_i64_v4i16: +define void @test_i64_v4i16(<4 x i16>* %p, i64* %q) { +; CHECK: ld1 { v{{[0-9]+}}.4h } +; CHECK: rev64 v{{[0-9]+}}.4h +; CHECK: str + %1 = load <4 x i16>* %p + %2 = add <4 x i16> %1, %1 + %3 = bitcast <4 x i16> %2 to i64 + %4 = add i64 %3, %3 + store i64 %4, i64* %q + ret void +} + +; CHECK-LABEL: test_i64_v8i8: +define void @test_i64_v8i8(<8 x i8>* %p, i64* %q) { +; CHECK: ld1 { v{{[0-9]+}}.8b } +; CHECK: rev64 v{{[0-9]+}}.8b +; CHECK: str + %1 = load <8 x i8>* %p + %2 = add <8 x i8> %1, %1 + %3 = bitcast <8 x i8> %2 to i64 + %4 = add i64 %3, %3 + store i64 %4, i64* %q + ret void +} + +; CHECK-LABEL: test_f64_i64: +define void @test_f64_i64(i64* %p, double* %q) { +; CHECK: ldr +; CHECK: str + %1 = load i64* %p + %2 = add i64 %1, %1 + %3 = bitcast i64 %2 to double + %4 = fadd double %3, %3 + store double %4, double* %q + ret void +} + +; CHECK-LABEL: test_f64_v1i64: +define void @test_f64_v1i64(<1 x i64>* %p, double* %q) { +; CHECK: ldr +; CHECK: str + %1 = load <1 x i64>* %p + %2 = add <1 x i64> %1, %1 + %3 = bitcast <1 x i64> %2 to double + %4 = fadd double %3, %3 + store double %4, double* %q + ret void +} + +; CHECK-LABEL: test_f64_v2f32: +define void @test_f64_v2f32(<2 x float>* %p, double* %q) { +; CHECK: ld1 { v{{[0-9]+}}.2s } +; CHECK: rev64 v{{[0-9]+}}.2s +; CHECK: str + %1 = load <2 x float>* %p + %2 = fadd <2 x float> 
%1, %1 + %3 = bitcast <2 x float> %2 to double + %4 = fadd double %3, %3 + store double %4, double* %q + ret void +} + +; CHECK-LABEL: test_f64_v2i32: +define void @test_f64_v2i32(<2 x i32>* %p, double* %q) { +; CHECK: ld1 { v{{[0-9]+}}.2s } +; CHECK: rev64 v{{[0-9]+}}.2s +; CHECK: str + %1 = load <2 x i32>* %p + %2 = add <2 x i32> %1, %1 + %3 = bitcast <2 x i32> %2 to double + %4 = fadd double %3, %3 + store double %4, double* %q + ret void +} + +; CHECK-LABEL: test_f64_v4i16: +define void @test_f64_v4i16(<4 x i16>* %p, double* %q) { +; CHECK: ld1 { v{{[0-9]+}}.4h } +; CHECK: rev64 v{{[0-9]+}}.4h +; CHECK: str + %1 = load <4 x i16>* %p + %2 = add <4 x i16> %1, %1 + %3 = bitcast <4 x i16> %2 to double + %4 = fadd double %3, %3 + store double %4, double* %q + ret void +} + +; CHECK-LABEL: test_f64_v8i8: +define void @test_f64_v8i8(<8 x i8>* %p, double* %q) { +; CHECK: ld1 { v{{[0-9]+}}.8b } +; CHECK: rev64 v{{[0-9]+}}.8b +; CHECK: str + %1 = load <8 x i8>* %p + %2 = add <8 x i8> %1, %1 + %3 = bitcast <8 x i8> %2 to double + %4 = fadd double %3, %3 + store double %4, double* %q + ret void +} + +; CHECK-LABEL: test_v1i64_i64: +define void @test_v1i64_i64(i64* %p, <1 x i64>* %q) { +; CHECK: ldr +; CHECK: str + %1 = load i64* %p + %2 = add i64 %1, %1 + %3 = bitcast i64 %2 to <1 x i64> + %4 = add <1 x i64> %3, %3 + store <1 x i64> %4, <1 x i64>* %q + ret void +} + +; CHECK-LABEL: test_v1i64_f64: +define void @test_v1i64_f64(double* %p, <1 x i64>* %q) { +; CHECK: ldr +; CHECK: str + %1 = load double* %p + %2 = fadd double %1, %1 + %3 = bitcast double %2 to <1 x i64> + %4 = add <1 x i64> %3, %3 + store <1 x i64> %4, <1 x i64>* %q + ret void +} + +; CHECK-LABEL: test_v1i64_v2f32: +define void @test_v1i64_v2f32(<2 x float>* %p, <1 x i64>* %q) { +; CHECK: ld1 { v{{[0-9]+}}.2s } +; CHECK: rev64 v{{[0-9]+}}.2s +; CHECK: str + %1 = load <2 x float>* %p + %2 = fadd <2 x float> %1, %1 + %3 = bitcast <2 x float> %2 to <1 x i64> + %4 = add <1 x i64> %3, %3 + store <1 x i64> %4, <1 x i64>* %q + ret void +} + +; CHECK-LABEL: test_v1i64_v2i32: +define void @test_v1i64_v2i32(<2 x i32>* %p, <1 x i64>* %q) { +; CHECK: ld1 { v{{[0-9]+}}.2s } +; CHECK: rev64 v{{[0-9]+}}.2s +; CHECK: str + %1 = load <2 x i32>* %p + %2 = add <2 x i32> %1, %1 + %3 = bitcast <2 x i32> %2 to <1 x i64> + %4 = add <1 x i64> %3, %3 + store <1 x i64> %4, <1 x i64>* %q + ret void +} + +; CHECK-LABEL: test_v1i64_v4i16: +define void @test_v1i64_v4i16(<4 x i16>* %p, <1 x i64>* %q) { +; CHECK: ld1 { v{{[0-9]+}}.4h } +; CHECK: rev64 v{{[0-9]+}}.4h +; CHECK: str + %1 = load <4 x i16>* %p + %2 = add <4 x i16> %1, %1 + %3 = bitcast <4 x i16> %2 to <1 x i64> + %4 = add <1 x i64> %3, %3 + store <1 x i64> %4, <1 x i64>* %q + ret void +} + +; CHECK-LABEL: test_v1i64_v8i8: +define void @test_v1i64_v8i8(<8 x i8>* %p, <1 x i64>* %q) { +; CHECK: ld1 { v{{[0-9]+}}.8b } +; CHECK: rev64 v{{[0-9]+}}.8b +; CHECK: str + %1 = load <8 x i8>* %p + %2 = add <8 x i8> %1, %1 + %3 = bitcast <8 x i8> %2 to <1 x i64> + %4 = add <1 x i64> %3, %3 + store <1 x i64> %4, <1 x i64>* %q + ret void +} + +; CHECK-LABEL: test_v2f32_i64: +define void @test_v2f32_i64(i64* %p, <2 x float>* %q) { +; CHECK: ldr +; CHECK: rev64 v{{[0-9]+}}.2s +; CHECK: st1 { v{{[0-9]+}}.2s } + %1 = load i64* %p + %2 = add i64 %1, %1 + %3 = bitcast i64 %2 to <2 x float> + %4 = fadd <2 x float> %3, %3 + store <2 x float> %4, <2 x float>* %q + ret void +} + +; CHECK-LABEL: test_v2f32_f64: +define void @test_v2f32_f64(double* %p, <2 x float>* %q) { +; CHECK: ldr +; CHECK: rev64 v{{[0-9]+}}.2s +; CHECK: st1 { 
v{{[0-9]+}}.2s } + %1 = load double* %p + %2 = fadd double %1, %1 + %3 = bitcast double %2 to <2 x float> + %4 = fadd <2 x float> %3, %3 + store <2 x float> %4, <2 x float>* %q + ret void +} + +; CHECK-LABEL: test_v2f32_v1i64: +define void @test_v2f32_v1i64(<1 x i64>* %p, <2 x float>* %q) { +; CHECK: ldr +; CHECK: rev64 v{{[0-9]+}}.2s +; CHECK: st1 { v{{[0-9]+}}.2s } + %1 = load <1 x i64>* %p + %2 = add <1 x i64> %1, %1 + %3 = bitcast <1 x i64> %2 to <2 x float> + %4 = fadd <2 x float> %3, %3 + store <2 x float> %4, <2 x float>* %q + ret void +} + +; CHECK-LABEL: test_v2f32_v2i32: +define void @test_v2f32_v2i32(<2 x i32>* %p, <2 x float>* %q) { +; CHECK: ld1 { v{{[0-9]+}}.2s } +; CHECK: st1 { v{{[0-9]+}}.2s } + %1 = load <2 x i32>* %p + %2 = add <2 x i32> %1, %1 + %3 = bitcast <2 x i32> %2 to <2 x float> + %4 = fadd <2 x float> %3, %3 + store <2 x float> %4, <2 x float>* %q + ret void +} + +; CHECK-LABEL: test_v2f32_v4i16: +define void @test_v2f32_v4i16(<4 x i16>* %p, <2 x float>* %q) { +; CHECK: ld1 { v{{[0-9]+}}.4h } +; CHECK: rev32 v{{[0-9]+}}.4h +; CHECK: st1 { v{{[0-9]+}}.2s } + %1 = load <4 x i16>* %p + %2 = add <4 x i16> %1, %1 + %3 = bitcast <4 x i16> %2 to <2 x float> + %4 = fadd <2 x float> %3, %3 + store <2 x float> %4, <2 x float>* %q + ret void +} + +; CHECK-LABEL: test_v2f32_v8i8: +define void @test_v2f32_v8i8(<8 x i8>* %p, <2 x float>* %q) { +; CHECK: ld1 { v{{[0-9]+}}.8b } +; CHECK: rev32 v{{[0-9]+}}.8b +; CHECK: st1 { v{{[0-9]+}}.2s } + %1 = load <8 x i8>* %p + %2 = add <8 x i8> %1, %1 + %3 = bitcast <8 x i8> %2 to <2 x float> + %4 = fadd <2 x float> %3, %3 + store <2 x float> %4, <2 x float>* %q + ret void +} + +; CHECK-LABEL: test_v2i32_i64: +define void @test_v2i32_i64(i64* %p, <2 x i32>* %q) { +; CHECK: ldr +; CHECK: rev64 v{{[0-9]+}}.2s +; CHECK: st1 { v{{[0-9]+}}.2s } + %1 = load i64* %p + %2 = add i64 %1, %1 + %3 = bitcast i64 %2 to <2 x i32> + %4 = add <2 x i32> %3, %3 + store <2 x i32> %4, <2 x i32>* %q + ret void +} + +; CHECK-LABEL: test_v2i32_f64: +define void @test_v2i32_f64(double* %p, <2 x i32>* %q) { +; CHECK: ldr +; CHECK: rev64 v{{[0-9]+}}.2s +; CHECK: st1 { v{{[0-9]+}}.2s } + %1 = load double* %p + %2 = fadd double %1, %1 + %3 = bitcast double %2 to <2 x i32> + %4 = add <2 x i32> %3, %3 + store <2 x i32> %4, <2 x i32>* %q + ret void +} + +; CHECK-LABEL: test_v2i32_v1i64: +define void @test_v2i32_v1i64(<1 x i64>* %p, <2 x i32>* %q) { +; CHECK: ldr +; CHECK: rev64 v{{[0-9]+}}.2s +; CHECK: st1 { v{{[0-9]+}}.2s } + %1 = load <1 x i64>* %p + %2 = add <1 x i64> %1, %1 + %3 = bitcast <1 x i64> %2 to <2 x i32> + %4 = add <2 x i32> %3, %3 + store <2 x i32> %4, <2 x i32>* %q + ret void +} + +; CHECK-LABEL: test_v2i32_v2f32: +define void @test_v2i32_v2f32(<2 x float>* %p, <2 x i32>* %q) { +; CHECK: ld1 { v{{[0-9]+}}.2s } +; CHECK: st1 { v{{[0-9]+}}.2s } + %1 = load <2 x float>* %p + %2 = fadd <2 x float> %1, %1 + %3 = bitcast <2 x float> %2 to <2 x i32> + %4 = add <2 x i32> %3, %3 + store <2 x i32> %4, <2 x i32>* %q + ret void +} + +; CHECK-LABEL: test_v2i32_v4i16: +define void @test_v2i32_v4i16(<4 x i16>* %p, <2 x i32>* %q) { +; CHECK: ld1 { v{{[0-9]+}}.4h } +; CHECK: rev32 v{{[0-9]+}}.4h +; CHECK: st1 { v{{[0-9]+}}.2s } + %1 = load <4 x i16>* %p + %2 = add <4 x i16> %1, %1 + %3 = bitcast <4 x i16> %2 to <2 x i32> + %4 = add <2 x i32> %3, %3 + store <2 x i32> %4, <2 x i32>* %q + ret void +} + +; CHECK-LABEL: test_v2i32_v8i8: +define void @test_v2i32_v8i8(<8 x i8>* %p, <2 x i32>* %q) { +; CHECK: ld1 { v{{[0-9]+}}.8b } +; CHECK: rev32 v{{[0-9]+}}.8b +; CHECK: st1 { 
v{{[0-9]+}}.2s } + %1 = load <8 x i8>* %p + %2 = add <8 x i8> %1, %1 + %3 = bitcast <8 x i8> %2 to <2 x i32> + %4 = add <2 x i32> %3, %3 + store <2 x i32> %4, <2 x i32>* %q + ret void +} + +; CHECK-LABEL: test_v4i16_i64: +define void @test_v4i16_i64(i64* %p, <4 x i16>* %q) { +; CHECK: ldr +; CHECK: rev64 v{{[0-9]+}}.4h +; CHECK: st1 { v{{[0-9]+}}.4h } + %1 = load i64* %p + %2 = add i64 %1, %1 + %3 = bitcast i64 %2 to <4 x i16> + %4 = add <4 x i16> %3, %3 + store <4 x i16> %4, <4 x i16>* %q + ret void +} + +; CHECK-LABEL: test_v4i16_f64: +define void @test_v4i16_f64(double* %p, <4 x i16>* %q) { +; CHECK: ldr +; CHECK: rev64 v{{[0-9]+}}.4h +; CHECK: st1 { v{{[0-9]+}}.4h } + %1 = load double* %p + %2 = fadd double %1, %1 + %3 = bitcast double %2 to <4 x i16> + %4 = add <4 x i16> %3, %3 + store <4 x i16> %4, <4 x i16>* %q + ret void +} + +; CHECK-LABEL: test_v4i16_v1i64: +define void @test_v4i16_v1i64(<1 x i64>* %p, <4 x i16>* %q) { +; CHECK: ldr +; CHECK: rev64 v{{[0-9]+}}.4h +; CHECK: st1 { v{{[0-9]+}}.4h } + %1 = load <1 x i64>* %p + %2 = add <1 x i64> %1, %1 + %3 = bitcast <1 x i64> %2 to <4 x i16> + %4 = add <4 x i16> %3, %3 + store <4 x i16> %4, <4 x i16>* %q + ret void +} + +; CHECK-LABEL: test_v4i16_v2f32: +define void @test_v4i16_v2f32(<2 x float>* %p, <4 x i16>* %q) { +; CHECK: ld1 { v{{[0-9]+}}.2s } +; CHECK: rev32 v{{[0-9]+}}.4h +; CHECK: st1 { v{{[0-9]+}}.4h } + %1 = load <2 x float>* %p + %2 = fadd <2 x float> %1, %1 + %3 = bitcast <2 x float> %2 to <4 x i16> + %4 = add <4 x i16> %3, %3 + store <4 x i16> %4, <4 x i16>* %q + ret void +} + +; CHECK-LABEL: test_v4i16_v2i32: +define void @test_v4i16_v2i32(<2 x i32>* %p, <4 x i16>* %q) { +; CHECK: ld1 { v{{[0-9]+}}.2s } +; CHECK: rev32 v{{[0-9]+}}.4h +; CHECK: st1 { v{{[0-9]+}}.4h } + %1 = load <2 x i32>* %p + %2 = add <2 x i32> %1, %1 + %3 = bitcast <2 x i32> %2 to <4 x i16> + %4 = add <4 x i16> %3, %3 + store <4 x i16> %4, <4 x i16>* %q + ret void +} + +; CHECK-LABEL: test_v4i16_v8i8: +define void @test_v4i16_v8i8(<8 x i8>* %p, <4 x i16>* %q) { +; CHECK: ld1 { v{{[0-9]+}}.8b } +; CHECK: rev16 v{{[0-9]+}}.8b +; CHECK: st1 { v{{[0-9]+}}.4h } + %1 = load <8 x i8>* %p + %2 = add <8 x i8> %1, %1 + %3 = bitcast <8 x i8> %2 to <4 x i16> + %4 = add <4 x i16> %3, %3 + store <4 x i16> %4, <4 x i16>* %q + ret void +} + +; CHECK-LABEL: test_v8i8_i64: +define void @test_v8i8_i64(i64* %p, <8 x i8>* %q) { +; CHECK: ldr +; CHECK: rev64 v{{[0-9]+}}.8b +; CHECK: st1 { v{{[0-9]+}}.8b } + %1 = load i64* %p + %2 = add i64 %1, %1 + %3 = bitcast i64 %2 to <8 x i8> + %4 = add <8 x i8> %3, %3 + store <8 x i8> %4, <8 x i8>* %q + ret void +} + +; CHECK-LABEL: test_v8i8_f64: +define void @test_v8i8_f64(double* %p, <8 x i8>* %q) { +; CHECK: ldr +; CHECK: rev64 v{{[0-9]+}}.8b +; CHECK: st1 { v{{[0-9]+}}.8b } + %1 = load double* %p + %2 = fadd double %1, %1 + %3 = bitcast double %2 to <8 x i8> + %4 = add <8 x i8> %3, %3 + store <8 x i8> %4, <8 x i8>* %q + ret void +} + +; CHECK-LABEL: test_v8i8_v1i64: +define void @test_v8i8_v1i64(<1 x i64>* %p, <8 x i8>* %q) { +; CHECK: ldr +; CHECK: rev64 v{{[0-9]+}}.8b +; CHECK: st1 { v{{[0-9]+}}.8b } + %1 = load <1 x i64>* %p + %2 = add <1 x i64> %1, %1 + %3 = bitcast <1 x i64> %2 to <8 x i8> + %4 = add <8 x i8> %3, %3 + store <8 x i8> %4, <8 x i8>* %q + ret void +} + +; CHECK-LABEL: test_v8i8_v2f32: +define void @test_v8i8_v2f32(<2 x float>* %p, <8 x i8>* %q) { +; CHECK: ld1 { v{{[0-9]+}}.2s } +; CHECK: rev32 v{{[0-9]+}}.8b +; CHECK: st1 { v{{[0-9]+}}.8b } + %1 = load <2 x float>* %p + %2 = fadd <2 x float> %1, %1 + %3 = 
bitcast <2 x float> %2 to <8 x i8> + %4 = add <8 x i8> %3, %3 + store <8 x i8> %4, <8 x i8>* %q + ret void +} + +; CHECK-LABEL: test_v8i8_v2i32: +define void @test_v8i8_v2i32(<2 x i32>* %p, <8 x i8>* %q) { +; CHECK: ld1 { v{{[0-9]+}}.2s } +; CHECK: rev32 v{{[0-9]+}}.8b +; CHECK: st1 { v{{[0-9]+}}.8b } + %1 = load <2 x i32>* %p + %2 = add <2 x i32> %1, %1 + %3 = bitcast <2 x i32> %2 to <8 x i8> + %4 = add <8 x i8> %3, %3 + store <8 x i8> %4, <8 x i8>* %q + ret void +} + +; CHECK-LABEL: test_v8i8_v4i16: +define void @test_v8i8_v4i16(<4 x i16>* %p, <8 x i8>* %q) { +; CHECK: ld1 { v{{[0-9]+}}.4h } +; CHECK: rev16 v{{[0-9]+}}.8b +; CHECK: st1 { v{{[0-9]+}}.8b } + %1 = load <4 x i16>* %p + %2 = add <4 x i16> %1, %1 + %3 = bitcast <4 x i16> %2 to <8 x i8> + %4 = add <8 x i8> %3, %3 + store <8 x i8> %4, <8 x i8>* %q + ret void +} + +; CHECK-LABEL: test_f128_v2f64: +define void @test_f128_v2f64(<2 x double>* %p, fp128* %q) { +; CHECK: ld1 { v{{[0-9]+}}.2d } +; CHECK: ext +; CHECK: str + %1 = load <2 x double>* %p + %2 = fadd <2 x double> %1, %1 + %3 = bitcast <2 x double> %2 to fp128 + %4 = fadd fp128 %3, %3 + store fp128 %4, fp128* %q + ret void +} + +; CHECK-LABEL: test_f128_v2i64: +define void @test_f128_v2i64(<2 x i64>* %p, fp128* %q) { +; CHECK: ld1 { v{{[0-9]+}}.2d } +; CHECK: ext +; CHECK: str + %1 = load <2 x i64>* %p + %2 = add <2 x i64> %1, %1 + %3 = bitcast <2 x i64> %2 to fp128 + %4 = fadd fp128 %3, %3 + store fp128 %4, fp128* %q + ret void +} + +; CHECK-LABEL: test_f128_v4f32: +define void @test_f128_v4f32(<4 x float>* %p, fp128* %q) { +; CHECK: ld1 { v{{[0-9]+}}.2d } +; CHECK: rev64 v{{[0-9]+}}.4s +; CHECK: rev64 v{{[0-9]+}}.4s +; CHECK: ext +; CHECK: str q + %1 = load <4 x float>* %p + %2 = fadd <4 x float> %1, %1 + %3 = bitcast <4 x float> %2 to fp128 + %4 = fadd fp128 %3, %3 + store fp128 %4, fp128* %q + ret void +} + +; CHECK-LABEL: test_f128_v4i32: +define void @test_f128_v4i32(<4 x i32>* %p, fp128* %q) { +; CHECK: ld1 { v{{[0-9]+}}.4s } +; CHECK: rev64 v{{[0-9]+}}.4s +; CHECK: ext +; CHECK: str + %1 = load <4 x i32>* %p + %2 = add <4 x i32> %1, %1 + %3 = bitcast <4 x i32> %2 to fp128 + %4 = fadd fp128 %3, %3 + store fp128 %4, fp128* %q + ret void +} + +; CHECK-LABEL: test_f128_v8i16: +define void @test_f128_v8i16(<8 x i16>* %p, fp128* %q) { +; CHECK: ld1 { v{{[0-9]+}}.8h } +; CHECK: rev64 v{{[0-9]+}}.8h +; CHECK: ext +; CHECK: str + %1 = load <8 x i16>* %p + %2 = add <8 x i16> %1, %1 + %3 = bitcast <8 x i16> %2 to fp128 + %4 = fadd fp128 %3, %3 + store fp128 %4, fp128* %q + ret void +} + +; CHECK-LABEL: test_f128_v16i8: +define void @test_f128_v16i8(<16 x i8>* %p, fp128* %q) { +; CHECK: ld1 { v{{[0-9]+}}.16b } +; CHECK: ext +; CHECK: str q + %1 = load <16 x i8>* %p + %2 = add <16 x i8> %1, %1 + %3 = bitcast <16 x i8> %2 to fp128 + %4 = fadd fp128 %3, %3 + store fp128 %4, fp128* %q + ret void +} + +; CHECK-LABEL: test_v2f64_f128: +define void @test_v2f64_f128(fp128* %p, <2 x double>* %q) { +; CHECK: ldr +; CHECK: ext +; CHECK: st1 { v{{[0-9]+}}.2d } + %1 = load fp128* %p + %2 = fadd fp128 %1, %1 + %3 = bitcast fp128 %2 to <2 x double> + %4 = fadd <2 x double> %3, %3 + store <2 x double> %4, <2 x double>* %q + ret void +} + +; CHECK-LABEL: test_v2f64_v2i64: +define void @test_v2f64_v2i64(<2 x i64>* %p, <2 x double>* %q) { +; CHECK: ld1 { v{{[0-9]+}}.2d } +; CHECK: st1 { v{{[0-9]+}}.2d } + %1 = load <2 x i64>* %p + %2 = add <2 x i64> %1, %1 + %3 = bitcast <2 x i64> %2 to <2 x double> + %4 = fadd <2 x double> %3, %3 + store <2 x double> %4, <2 x double>* %q + ret void +} + +; 
CHECK-LABEL: test_v2f64_v4f32: +define void @test_v2f64_v4f32(<4 x float>* %p, <2 x double>* %q) { +; CHECK: ld1 { v{{[0-9]+}}.2d } +; CHECK: rev64 v{{[0-9]+}}.4s +; CHECK: rev64 v{{[0-9]+}}.4s +; CHECK: st1 { v{{[0-9]+}}.2d } + %1 = load <4 x float>* %p + %2 = fadd <4 x float> %1, %1 + %3 = bitcast <4 x float> %2 to <2 x double> + %4 = fadd <2 x double> %3, %3 + store <2 x double> %4, <2 x double>* %q + ret void +} + +; CHECK-LABEL: test_v2f64_v4i32: +define void @test_v2f64_v4i32(<4 x i32>* %p, <2 x double>* %q) { +; CHECK: ld1 { v{{[0-9]+}}.4s } +; CHECK: rev64 v{{[0-9]+}}.4s +; CHECK: st1 { v{{[0-9]+}}.2d } + %1 = load <4 x i32>* %p + %2 = add <4 x i32> %1, %1 + %3 = bitcast <4 x i32> %2 to <2 x double> + %4 = fadd <2 x double> %3, %3 + store <2 x double> %4, <2 x double>* %q + ret void +} + +; CHECK-LABEL: test_v2f64_v8i16: +define void @test_v2f64_v8i16(<8 x i16>* %p, <2 x double>* %q) { +; CHECK: ld1 { v{{[0-9]+}}.8h } +; CHECK: rev64 v{{[0-9]+}}.8h +; CHECK: st1 { v{{[0-9]+}}.2d } + %1 = load <8 x i16>* %p + %2 = add <8 x i16> %1, %1 + %3 = bitcast <8 x i16> %2 to <2 x double> + %4 = fadd <2 x double> %3, %3 + store <2 x double> %4, <2 x double>* %q + ret void +} + +; CHECK-LABEL: test_v2f64_v16i8: +define void @test_v2f64_v16i8(<16 x i8>* %p, <2 x double>* %q) { +; CHECK: ld1 { v{{[0-9]+}}.16b } +; CHECK: rev64 v{{[0-9]+}}.16b +; CHECK: st1 { v{{[0-9]+}}.2d } + %1 = load <16 x i8>* %p + %2 = add <16 x i8> %1, %1 + %3 = bitcast <16 x i8> %2 to <2 x double> + %4 = fadd <2 x double> %3, %3 + store <2 x double> %4, <2 x double>* %q + ret void +} + +; CHECK-LABEL: test_v2i64_f128: +define void @test_v2i64_f128(fp128* %p, <2 x i64>* %q) { +; CHECK: ldr +; CHECK: ext +; CHECK: st1 { v{{[0-9]+}}.2d } + %1 = load fp128* %p + %2 = fadd fp128 %1, %1 + %3 = bitcast fp128 %2 to <2 x i64> + %4 = add <2 x i64> %3, %3 + store <2 x i64> %4, <2 x i64>* %q + ret void +} + +; CHECK-LABEL: test_v2i64_v2f64: +define void @test_v2i64_v2f64(<2 x double>* %p, <2 x i64>* %q) { +; CHECK: ld1 { v{{[0-9]+}}.2d } +; CHECK: st1 { v{{[0-9]+}}.2d } + %1 = load <2 x double>* %p + %2 = fadd <2 x double> %1, %1 + %3 = bitcast <2 x double> %2 to <2 x i64> + %4 = add <2 x i64> %3, %3 + store <2 x i64> %4, <2 x i64>* %q + ret void +} + +; CHECK-LABEL: test_v2i64_v4f32: +define void @test_v2i64_v4f32(<4 x float>* %p, <2 x i64>* %q) { +; CHECK: ld1 { v{{[0-9]+}}.2d } +; CHECK: rev64 v{{[0-9]+}}.4s +; CHECK: rev64 v{{[0-9]+}}.4s +; CHECK: st1 { v{{[0-9]+}}.2d } + %1 = load <4 x float>* %p + %2 = fadd <4 x float> %1, %1 + %3 = bitcast <4 x float> %2 to <2 x i64> + %4 = add <2 x i64> %3, %3 + store <2 x i64> %4, <2 x i64>* %q + ret void +} + +; CHECK-LABEL: test_v2i64_v4i32: +define void @test_v2i64_v4i32(<4 x i32>* %p, <2 x i64>* %q) { +; CHECK: ld1 { v{{[0-9]+}}.4s } +; CHECK: rev64 v{{[0-9]+}}.4s +; CHECK: st1 { v{{[0-9]+}}.2d } + %1 = load <4 x i32>* %p + %2 = add <4 x i32> %1, %1 + %3 = bitcast <4 x i32> %2 to <2 x i64> + %4 = add <2 x i64> %3, %3 + store <2 x i64> %4, <2 x i64>* %q + ret void +} + +; CHECK-LABEL: test_v2i64_v8i16: +define void @test_v2i64_v8i16(<8 x i16>* %p, <2 x i64>* %q) { +; CHECK: ld1 { v{{[0-9]+}}.8h } +; CHECK: rev64 v{{[0-9]+}}.8h +; CHECK: st1 { v{{[0-9]+}}.2d } + %1 = load <8 x i16>* %p + %2 = add <8 x i16> %1, %1 + %3 = bitcast <8 x i16> %2 to <2 x i64> + %4 = add <2 x i64> %3, %3 + store <2 x i64> %4, <2 x i64>* %q + ret void +} + +; CHECK-LABEL: test_v2i64_v16i8: +define void @test_v2i64_v16i8(<16 x i8>* %p, <2 x i64>* %q) { +; CHECK: ld1 { v{{[0-9]+}}.16b } +; CHECK: rev64 
v{{[0-9]+}}.16b +; CHECK: st1 { v{{[0-9]+}}.2d } + %1 = load <16 x i8>* %p + %2 = add <16 x i8> %1, %1 + %3 = bitcast <16 x i8> %2 to <2 x i64> + %4 = add <2 x i64> %3, %3 + store <2 x i64> %4, <2 x i64>* %q + ret void +} + +; CHECK-LABEL: test_v4f32_f128: +define void @test_v4f32_f128(fp128* %p, <4 x float>* %q) { +; CHECK: ldr q +; CHECK: rev64 v{{[0-9]+}}.4s +; CHECK: ext +; CHECK: rev64 v{{[0-9]+}}.4s +; CHECK: st1 { v{{[0-9]+}}.2d } + %1 = load fp128* %p + %2 = fadd fp128 %1, %1 + %3 = bitcast fp128 %2 to <4 x float> + %4 = fadd <4 x float> %3, %3 + store <4 x float> %4, <4 x float>* %q + ret void +} + +; CHECK-LABEL: test_v4f32_v2f64: +define void @test_v4f32_v2f64(<2 x double>* %p, <4 x float>* %q) { +; CHECK: ld1 { v{{[0-9]+}}.2d } +; CHECK: rev64 v{{[0-9]+}}.4s +; CHECK: rev64 v{{[0-9]+}}.4s +; CHECK: st1 { v{{[0-9]+}}.2d } + %1 = load <2 x double>* %p + %2 = fadd <2 x double> %1, %1 + %3 = bitcast <2 x double> %2 to <4 x float> + %4 = fadd <4 x float> %3, %3 + store <4 x float> %4, <4 x float>* %q + ret void +} + +; CHECK-LABEL: test_v4f32_v2i64: +define void @test_v4f32_v2i64(<2 x i64>* %p, <4 x float>* %q) { +; CHECK: ld1 { v{{[0-9]+}}.2d } +; CHECK: rev64 v{{[0-9]+}}.4s +; CHECK: rev64 v{{[0-9]+}}.4s +; CHECK: st1 { v{{[0-9]+}}.2d } + %1 = load <2 x i64>* %p + %2 = add <2 x i64> %1, %1 + %3 = bitcast <2 x i64> %2 to <4 x float> + %4 = fadd <4 x float> %3, %3 + store <4 x float> %4, <4 x float>* %q + ret void +} + +; CHECK-LABEL: test_v4f32_v4i32: +define void @test_v4f32_v4i32(<4 x i32>* %p, <4 x float>* %q) { +; CHECK: ld1 { v{{[0-9]+}}.4s } +; CHECK: rev64 v{{[0-9]+}}.4s +; CHECK: st1 { v{{[0-9]+}}.2d } + %1 = load <4 x i32>* %p + %2 = add <4 x i32> %1, %1 + %3 = bitcast <4 x i32> %2 to <4 x float> + %4 = fadd <4 x float> %3, %3 + store <4 x float> %4, <4 x float>* %q + ret void +} + +; CHECK-LABEL: test_v4f32_v8i16: +define void @test_v4f32_v8i16(<8 x i16>* %p, <4 x float>* %q) { +; CHECK: ld1 { v{{[0-9]+}}.8h } +; CHECK: rev32 v{{[0-9]+}}.8h +; CHECK: rev64 v{{[0-9]+}}.4s +; CHECK: st1 { v{{[0-9]+}}.2d } + %1 = load <8 x i16>* %p + %2 = add <8 x i16> %1, %1 + %3 = bitcast <8 x i16> %2 to <4 x float> + %4 = fadd <4 x float> %3, %3 + store <4 x float> %4, <4 x float>* %q + ret void +} + +; CHECK-LABEL: test_v4f32_v16i8: +define void @test_v4f32_v16i8(<16 x i8>* %p, <4 x float>* %q) { +; CHECK: ld1 { v{{[0-9]+}}.16b } +; CHECK: rev32 v{{[0-9]+}}.16b +; CHECK: rev64 v{{[0-9]+}}.4s +; CHECK: st1 { v{{[0-9]+}}.2d } + %1 = load <16 x i8>* %p + %2 = add <16 x i8> %1, %1 + %3 = bitcast <16 x i8> %2 to <4 x float> + %4 = fadd <4 x float> %3, %3 + store <4 x float> %4, <4 x float>* %q + ret void +} + +; CHECK-LABEL: test_v4i32_f128: +define void @test_v4i32_f128(fp128* %p, <4 x i32>* %q) { +; CHECK: ldr +; CHECK: rev64 v{{[0-9]+}}.4s +; CHECK: ext +; CHECK: st1 { v{{[0-9]+}}.4s } + %1 = load fp128* %p + %2 = fadd fp128 %1, %1 + %3 = bitcast fp128 %2 to <4 x i32> + %4 = add <4 x i32> %3, %3 + store <4 x i32> %4, <4 x i32>* %q + ret void +} + +; CHECK-LABEL: test_v4i32_v2f64: +define void @test_v4i32_v2f64(<2 x double>* %p, <4 x i32>* %q) { +; CHECK: ld1 { v{{[0-9]+}}.2d } +; CHECK: rev64 v{{[0-9]+}}.4s +; CHECK: st1 { v{{[0-9]+}}.4s } + %1 = load <2 x double>* %p + %2 = fadd <2 x double> %1, %1 + %3 = bitcast <2 x double> %2 to <4 x i32> + %4 = add <4 x i32> %3, %3 + store <4 x i32> %4, <4 x i32>* %q + ret void +} + +; CHECK-LABEL: test_v4i32_v2i64: +define void @test_v4i32_v2i64(<2 x i64>* %p, <4 x i32>* %q) { +; CHECK: ld1 { v{{[0-9]+}}.2d } +; CHECK: rev64 v{{[0-9]+}}.4s +; CHECK: 
st1 { v{{[0-9]+}}.4s } + %1 = load <2 x i64>* %p + %2 = add <2 x i64> %1, %1 + %3 = bitcast <2 x i64> %2 to <4 x i32> + %4 = add <4 x i32> %3, %3 + store <4 x i32> %4, <4 x i32>* %q + ret void +} + +; CHECK-LABEL: test_v4i32_v4f32: +define void @test_v4i32_v4f32(<4 x float>* %p, <4 x i32>* %q) { +; CHECK: ld1 { v{{[0-9]+}}.2d } +; CHECK: rev64 v{{[0-9]+}}.4s +; CHECK: st1 { v{{[0-9]+}}.4s } + %1 = load <4 x float>* %p + %2 = fadd <4 x float> %1, %1 + %3 = bitcast <4 x float> %2 to <4 x i32> + %4 = add <4 x i32> %3, %3 + store <4 x i32> %4, <4 x i32>* %q + ret void +} + +; CHECK-LABEL: test_v4i32_v8i16: +define void @test_v4i32_v8i16(<8 x i16>* %p, <4 x i32>* %q) { +; CHECK: ld1 { v{{[0-9]+}}.8h } +; CHECK: rev32 v{{[0-9]+}}.8h +; CHECK: st1 { v{{[0-9]+}}.4s } + %1 = load <8 x i16>* %p + %2 = add <8 x i16> %1, %1 + %3 = bitcast <8 x i16> %2 to <4 x i32> + %4 = add <4 x i32> %3, %3 + store <4 x i32> %4, <4 x i32>* %q + ret void +} + +; CHECK-LABEL: test_v4i32_v16i8: +define void @test_v4i32_v16i8(<16 x i8>* %p, <4 x i32>* %q) { +; CHECK: ld1 { v{{[0-9]+}}.16b } +; CHECK: rev32 v{{[0-9]+}}.16b +; CHECK: st1 { v{{[0-9]+}}.4s } + %1 = load <16 x i8>* %p + %2 = add <16 x i8> %1, %1 + %3 = bitcast <16 x i8> %2 to <4 x i32> + %4 = add <4 x i32> %3, %3 + store <4 x i32> %4, <4 x i32>* %q + ret void +} + +; CHECK-LABEL: test_v8i16_f128: +define void @test_v8i16_f128(fp128* %p, <8 x i16>* %q) { +; CHECK: ldr +; CHECK: rev64 v{{[0-9]+}}.8h +; CHECK: ext +; CHECK: st1 { v{{[0-9]+}}.8h } + %1 = load fp128* %p + %2 = fadd fp128 %1, %1 + %3 = bitcast fp128 %2 to <8 x i16> + %4 = add <8 x i16> %3, %3 + store <8 x i16> %4, <8 x i16>* %q + ret void +} + +; CHECK-LABEL: test_v8i16_v2f64: +define void @test_v8i16_v2f64(<2 x double>* %p, <8 x i16>* %q) { +; CHECK: ld1 { v{{[0-9]+}}.2d } +; CHECK: rev64 v{{[0-9]+}}.8h +; CHECK: st1 { v{{[0-9]+}}.8h } + %1 = load <2 x double>* %p + %2 = fadd <2 x double> %1, %1 + %3 = bitcast <2 x double> %2 to <8 x i16> + %4 = add <8 x i16> %3, %3 + store <8 x i16> %4, <8 x i16>* %q + ret void +} + +; CHECK-LABEL: test_v8i16_v2i64: +define void @test_v8i16_v2i64(<2 x i64>* %p, <8 x i16>* %q) { +; CHECK: ld1 { v{{[0-9]+}}.2d } +; CHECK: rev64 v{{[0-9]+}}.8h +; CHECK: st1 { v{{[0-9]+}}.8h } + %1 = load <2 x i64>* %p + %2 = add <2 x i64> %1, %1 + %3 = bitcast <2 x i64> %2 to <8 x i16> + %4 = add <8 x i16> %3, %3 + store <8 x i16> %4, <8 x i16>* %q + ret void +} + +; CHECK-LABEL: test_v8i16_v4f32: +define void @test_v8i16_v4f32(<4 x float>* %p, <8 x i16>* %q) { +; CHECK: ld1 { v{{[0-9]+}}.2d } +; CHECK: rev64 v{{[0-9]+}}.4s +; CHECK: rev32 v{{[0-9]+}}.8h +; CHECK: st1 { v{{[0-9]+}}.8h } + %1 = load <4 x float>* %p + %2 = fadd <4 x float> %1, %1 + %3 = bitcast <4 x float> %2 to <8 x i16> + %4 = add <8 x i16> %3, %3 + store <8 x i16> %4, <8 x i16>* %q + ret void +} + +; CHECK-LABEL: test_v8i16_v4i32: +define void @test_v8i16_v4i32(<4 x i32>* %p, <8 x i16>* %q) { +; CHECK: ld1 { v{{[0-9]+}}.4s } +; CHECK: rev32 v{{[0-9]+}}.8h +; CHECK: st1 { v{{[0-9]+}}.8h } + %1 = load <4 x i32>* %p + %2 = add <4 x i32> %1, %1 + %3 = bitcast <4 x i32> %2 to <8 x i16> + %4 = add <8 x i16> %3, %3 + store <8 x i16> %4, <8 x i16>* %q + ret void +} + +; CHECK-LABEL: test_v8i16_v16i8: +define void @test_v8i16_v16i8(<16 x i8>* %p, <8 x i16>* %q) { +; CHECK: ld1 { v{{[0-9]+}}.16b } +; CHECK: rev16 v{{[0-9]+}}.16b +; CHECK: st1 { v{{[0-9]+}}.8h } + %1 = load <16 x i8>* %p + %2 = add <16 x i8> %1, %1 + %3 = bitcast <16 x i8> %2 to <8 x i16> + %4 = add <8 x i16> %3, %3 + store <8 x i16> %4, <8 x i16>* %q + ret 
void +} + +; CHECK-LABEL: test_v16i8_f128: +define void @test_v16i8_f128(fp128* %p, <16 x i8>* %q) { +; CHECK: ldr q +; CHECK: rev64 v{{[0-9]+}}.16b +; CHECK: ext +; CHECK: st1 { v{{[0-9]+}}.16b } + %1 = load fp128* %p + %2 = fadd fp128 %1, %1 + %3 = bitcast fp128 %2 to <16 x i8> + %4 = add <16 x i8> %3, %3 + store <16 x i8> %4, <16 x i8>* %q + ret void +} + +; CHECK-LABEL: test_v16i8_v2f64: +define void @test_v16i8_v2f64(<2 x double>* %p, <16 x i8>* %q) { +; CHECK: ld1 { v{{[0-9]+}}.2d } +; CHECK: rev64 v{{[0-9]+}}.16b +; CHECK: st1 { v{{[0-9]+}}.16b } + %1 = load <2 x double>* %p + %2 = fadd <2 x double> %1, %1 + %3 = bitcast <2 x double> %2 to <16 x i8> + %4 = add <16 x i8> %3, %3 + store <16 x i8> %4, <16 x i8>* %q + ret void +} + +; CHECK-LABEL: test_v16i8_v2i64: +define void @test_v16i8_v2i64(<2 x i64>* %p, <16 x i8>* %q) { +; CHECK: ld1 { v{{[0-9]+}}.2d } +; CHECK: rev64 v{{[0-9]+}}.16b +; CHECK: st1 { v{{[0-9]+}}.16b } + %1 = load <2 x i64>* %p + %2 = add <2 x i64> %1, %1 + %3 = bitcast <2 x i64> %2 to <16 x i8> + %4 = add <16 x i8> %3, %3 + store <16 x i8> %4, <16 x i8>* %q + ret void +} + +; CHECK-LABEL: test_v16i8_v4f32: +define void @test_v16i8_v4f32(<4 x float>* %p, <16 x i8>* %q) { +; CHECK: ld1 { v{{[0-9]+}}.2d } +; CHECK: rev64 v{{[0-9]+}}.4s +; CHECK: rev32 v{{[0-9]+}}.16b +; CHECK: st1 { v{{[0-9]+}}.16b } + %1 = load <4 x float>* %p + %2 = fadd <4 x float> %1, %1 + %3 = bitcast <4 x float> %2 to <16 x i8> + %4 = add <16 x i8> %3, %3 + store <16 x i8> %4, <16 x i8>* %q + ret void +} + +; CHECK-LABEL: test_v16i8_v4i32: +define void @test_v16i8_v4i32(<4 x i32>* %p, <16 x i8>* %q) { +; CHECK: ld1 { v{{[0-9]+}}.4s } +; CHECK: rev32 v{{[0-9]+}}.16b +; CHECK: st1 { v{{[0-9]+}}.16b } + %1 = load <4 x i32>* %p + %2 = add <4 x i32> %1, %1 + %3 = bitcast <4 x i32> %2 to <16 x i8> + %4 = add <16 x i8> %3, %3 + store <16 x i8> %4, <16 x i8>* %q + ret void +} + +; CHECK-LABEL: test_v16i8_v8i16: +define void @test_v16i8_v8i16(<8 x i16>* %p, <16 x i8>* %q) { +; CHECK: ld1 { v{{[0-9]+}}.8h } +; CHECK: rev16 v{{[0-9]+}}.16b +; CHECK: st1 { v{{[0-9]+}}.16b } + %1 = load <8 x i16>* %p + %2 = add <8 x i16> %1, %1 + %3 = bitcast <8 x i16> %2 to <16 x i8> + %4 = add <16 x i8> %3, %3 + store <16 x i8> %4, <16 x i8>* %q + ret void +} diff --git a/test/CodeGen/AArch64/arm64-big-endian-eh.ll b/test/CodeGen/AArch64/arm64-big-endian-eh.ll new file mode 100644 index 00000000000..93e7da98de2 --- /dev/null +++ b/test/CodeGen/AArch64/arm64-big-endian-eh.ll @@ -0,0 +1,73 @@ +; RUN: llc -mtriple arm64_be-linux-gnu -filetype obj < %s | llvm-objdump -s - | FileCheck %s + +; ARM EHABI for big endian +; This test case checks whether CIE length record is laid out in big endian format. +; +; This is the LLVM assembly generated from following C++ code: +; +; extern void foo(int); +; void test(int a, int b) { +; try { +; foo(a); +; } catch (...) 
{ +; foo(b); +; } +;} + +define void @_Z4testii(i32 %a, i32 %b) #0 { +entry: + invoke void @_Z3fooi(i32 %a) + to label %try.cont unwind label %lpad + +lpad: ; preds = %entry + %0 = landingpad { i8*, i32 } personality i8* bitcast (i32 (...)* @__gxx_personality_v0 to i8*) + catch i8* null + %1 = extractvalue { i8*, i32 } %0, 0 + %2 = tail call i8* @__cxa_begin_catch(i8* %1) #2 + invoke void @_Z3fooi(i32 %b) + to label %invoke.cont2 unwind label %lpad1 + +invoke.cont2: ; preds = %lpad + tail call void @__cxa_end_catch() + br label %try.cont + +try.cont: ; preds = %entry, %invoke.cont2 + ret void + +lpad1: ; preds = %lpad + %3 = landingpad { i8*, i32 } personality i8* bitcast (i32 (...)* @__gxx_personality_v0 to i8*) + cleanup + invoke void @__cxa_end_catch() + to label %eh.resume unwind label %terminate.lpad + +eh.resume: ; preds = %lpad1 + resume { i8*, i32 } %3 + +terminate.lpad: ; preds = %lpad1 + %4 = landingpad { i8*, i32 } personality i8* bitcast (i32 (...)* @__gxx_personality_v0 to i8*) + catch i8* null + %5 = extractvalue { i8*, i32 } %4, 0 + tail call void @__clang_call_terminate(i8* %5) #3 + unreachable +} + +declare void @_Z3fooi(i32) #0 + +declare i32 @__gxx_personality_v0(...) + +declare i8* @__cxa_begin_catch(i8*) + +declare void @__cxa_end_catch() + +; Function Attrs: noinline noreturn nounwind +define linkonce_odr hidden void @__clang_call_terminate(i8*) #1 { + %2 = tail call i8* @__cxa_begin_catch(i8* %0) #2 + tail call void @_ZSt9terminatev() #3 + unreachable +} + +declare void @_ZSt9terminatev() + +; CHECK-LABEL: Contents of section .eh_frame: +; CHECK-NEXT: 0000 0000001c + diff --git a/test/CodeGen/AArch64/arm64-big-endian-varargs.ll b/test/CodeGen/AArch64/arm64-big-endian-varargs.ll new file mode 100644 index 00000000000..d7b26b97523 --- /dev/null +++ b/test/CodeGen/AArch64/arm64-big-endian-varargs.ll @@ -0,0 +1,58 @@ +; RUN: llc < %s | FileCheck %s + +; Vararg saving must save Q registers using the equivalent of STR/STP. + +target datalayout = "E-m:e-i64:64-i128:128-n32:64-S128" +target triple = "arm64_be-arm-none-eabi" + +%struct.__va_list = type { i8*, i8*, i8*, i32, i32 } + +declare void @llvm.va_start(i8*) nounwind +declare void @llvm.va_end(i8*) nounwind + +define double @callee(i32 %a, ...) 
{ +; CHECK: stp +; CHECK: stp +; CHECK: stp +; CHECK: stp +; CHECK: stp +; CHECK: stp +entry: + %vl = alloca %struct.__va_list, align 8 + %vl1 = bitcast %struct.__va_list* %vl to i8* + call void @llvm.va_start(i8* %vl1) + %vr_offs_p = getelementptr inbounds %struct.__va_list* %vl, i64 0, i32 4 + %vr_offs = load i32* %vr_offs_p, align 4 + %0 = icmp sgt i32 %vr_offs, -1 + br i1 %0, label %vaarg.on_stack, label %vaarg.maybe_reg + +vaarg.maybe_reg: ; preds = %entry + %new_reg_offs = add i32 %vr_offs, 16 + store i32 %new_reg_offs, i32* %vr_offs_p, align 4 + %inreg = icmp slt i32 %new_reg_offs, 1 + br i1 %inreg, label %vaarg.in_reg, label %vaarg.on_stack + +vaarg.in_reg: ; preds = %vaarg.maybe_reg + %reg_top_p = getelementptr inbounds %struct.__va_list* %vl, i64 0, i32 2 + %reg_top = load i8** %reg_top_p, align 8 + %1 = sext i32 %vr_offs to i64 + %2 = getelementptr i8* %reg_top, i64 %1 + %3 = ptrtoint i8* %2 to i64 + %align_be = add i64 %3, 8 + %4 = inttoptr i64 %align_be to i8* + br label %vaarg.end + +vaarg.on_stack: ; preds = %vaarg.maybe_reg, %entry + %stack_p = getelementptr inbounds %struct.__va_list* %vl, i64 0, i32 0 + %stack = load i8** %stack_p, align 8 + %new_stack = getelementptr i8* %stack, i64 8 + store i8* %new_stack, i8** %stack_p, align 8 + br label %vaarg.end + +vaarg.end: ; preds = %vaarg.on_stack, %vaarg.in_reg + %.sink = phi i8* [ %4, %vaarg.in_reg ], [ %stack, %vaarg.on_stack ] + %5 = bitcast i8* %.sink to double* + %6 = load double* %5, align 8 + call void @llvm.va_end(i8* %vl1) + ret double %6 +} diff --git a/test/CodeGen/AArch64/arm64-big-endian-vector-callee.ll b/test/CodeGen/AArch64/arm64-big-endian-vector-callee.ll new file mode 100644 index 00000000000..1dcccf106a2 --- /dev/null +++ b/test/CodeGen/AArch64/arm64-big-endian-vector-callee.ll @@ -0,0 +1,848 @@ +; RUN: llc -mtriple arm64_be < %s -aarch64-load-store-opt=false -o - | FileCheck %s +; RUN: llc -mtriple arm64_be < %s -fast-isel=true -aarch64-load-store-opt=false -o - | FileCheck %s + +; CHECK-LABEL: test_i64_f64: +define i64 @test_i64_f64(double %p) { +; CHECK-NOT: rev + %1 = fadd double %p, %p + %2 = bitcast double %1 to i64 + %3 = add i64 %2, %2 + ret i64 %3 +} + +; CHECK-LABEL: test_i64_v1i64: +define i64 @test_i64_v1i64(<1 x i64> %p) { +; CHECK-NOT: rev + %1 = add <1 x i64> %p, %p + %2 = bitcast <1 x i64> %1 to i64 + %3 = add i64 %2, %2 + ret i64 %3 +} + +; CHECK-LABEL: test_i64_v2f32: +define i64 @test_i64_v2f32(<2 x float> %p) { +; CHECK: rev64 v{{[0-9]+}}.2s + %1 = fadd <2 x float> %p, %p + %2 = bitcast <2 x float> %1 to i64 + %3 = add i64 %2, %2 + ret i64 %3 +} + +; CHECK-LABEL: test_i64_v2i32: +define i64 @test_i64_v2i32(<2 x i32> %p) { +; CHECK: rev64 v{{[0-9]+}}.2s + %1 = add <2 x i32> %p, %p + %2 = bitcast <2 x i32> %1 to i64 + %3 = add i64 %2, %2 + ret i64 %3 +} + +; CHECK-LABEL: test_i64_v4i16: +define i64 @test_i64_v4i16(<4 x i16> %p) { +; CHECK: rev64 v{{[0-9]+}}.4h + %1 = add <4 x i16> %p, %p + %2 = bitcast <4 x i16> %1 to i64 + %3 = add i64 %2, %2 + ret i64 %3 +} + +; CHECK-LABEL: test_i64_v8i8: +define i64 @test_i64_v8i8(<8 x i8> %p) { +; CHECK: rev64 v{{[0-9]+}}.8b + %1 = add <8 x i8> %p, %p + %2 = bitcast <8 x i8> %1 to i64 + %3 = add i64 %2, %2 + ret i64 %3 +} + +; CHECK-LABEL: test_f64_i64: +define double @test_f64_i64(i64 %p) { +; CHECK-NOT: rev + %1 = add i64 %p, %p + %2 = bitcast i64 %1 to double + %3 = fadd double %2, %2 + ret double %3 +} + +; CHECK-LABEL: test_f64_v1i64: +define double @test_f64_v1i64(<1 x i64> %p) { +; CHECK-NOT: rev + %1 = add <1 x i64> %p, %p + %2 = bitcast <1 x 
i64> %1 to double + %3 = fadd double %2, %2 + ret double %3 +} + +; CHECK-LABEL: test_f64_v2f32: +define double @test_f64_v2f32(<2 x float> %p) { +; CHECK: rev64 v{{[0-9]+}}.2s + %1 = fadd <2 x float> %p, %p + %2 = bitcast <2 x float> %1 to double + %3 = fadd double %2, %2 + ret double %3 +} + +; CHECK-LABEL: test_f64_v2i32: +define double @test_f64_v2i32(<2 x i32> %p) { +; CHECK: rev64 v{{[0-9]+}}.2s + %1 = add <2 x i32> %p, %p + %2 = bitcast <2 x i32> %1 to double + %3 = fadd double %2, %2 + ret double %3 +} + +; CHECK-LABEL: test_f64_v4i16: +define double @test_f64_v4i16(<4 x i16> %p) { +; CHECK: rev64 v{{[0-9]+}}.4h + %1 = add <4 x i16> %p, %p + %2 = bitcast <4 x i16> %1 to double + %3 = fadd double %2, %2 + ret double %3 +} + +; CHECK-LABEL: test_f64_v8i8: +define double @test_f64_v8i8(<8 x i8> %p) { +; CHECK: rev64 v{{[0-9]+}}.8b + %1 = add <8 x i8> %p, %p + %2 = bitcast <8 x i8> %1 to double + %3 = fadd double %2, %2 + ret double %3 +} + +; CHECK-LABEL: test_v1i64_i64: +define <1 x i64> @test_v1i64_i64(i64 %p) { +; CHECK-NOT: rev + %1 = add i64 %p, %p + %2 = bitcast i64 %1 to <1 x i64> + %3 = add <1 x i64> %2, %2 + ret <1 x i64> %3 +} + +; CHECK-LABEL: test_v1i64_f64: +define <1 x i64> @test_v1i64_f64(double %p) { +; CHECK-NOT: rev + %1 = fadd double %p, %p + %2 = bitcast double %1 to <1 x i64> + %3 = add <1 x i64> %2, %2 + ret <1 x i64> %3 +} + +; CHECK-LABEL: test_v1i64_v2f32: +define <1 x i64> @test_v1i64_v2f32(<2 x float> %p) { +; CHECK: rev64 v{{[0-9]+}}.2s + %1 = fadd <2 x float> %p, %p + %2 = bitcast <2 x float> %1 to <1 x i64> + %3 = add <1 x i64> %2, %2 + ret <1 x i64> %3 +} + +; CHECK-LABEL: test_v1i64_v2i32: +define <1 x i64> @test_v1i64_v2i32(<2 x i32> %p) { +; CHECK: rev64 v{{[0-9]+}}.2s + %1 = add <2 x i32> %p, %p + %2 = bitcast <2 x i32> %1 to <1 x i64> + %3 = add <1 x i64> %2, %2 + ret <1 x i64> %3 +} + +; CHECK-LABEL: test_v1i64_v4i16: +define <1 x i64> @test_v1i64_v4i16(<4 x i16> %p) { +; CHECK: rev64 v{{[0-9]+}}.4h + %1 = add <4 x i16> %p, %p + %2 = bitcast <4 x i16> %1 to <1 x i64> + %3 = add <1 x i64> %2, %2 + ret <1 x i64> %3 +} + +; CHECK-LABEL: test_v1i64_v8i8: +define <1 x i64> @test_v1i64_v8i8(<8 x i8> %p) { +; CHECK: rev64 v{{[0-9]+}}.8b + %1 = add <8 x i8> %p, %p + %2 = bitcast <8 x i8> %1 to <1 x i64> + %3 = add <1 x i64> %2, %2 + ret <1 x i64> %3 +} + +; CHECK-LABEL: test_v2f32_i64: +define <2 x float> @test_v2f32_i64(i64 %p) { +; CHECK: rev64 v{{[0-9]+}}.2s + %1 = add i64 %p, %p + %2 = bitcast i64 %1 to <2 x float> + %3 = fadd <2 x float> %2, %2 + ret <2 x float> %3 +} + +; CHECK-LABEL: test_v2f32_f64: +define <2 x float> @test_v2f32_f64(double %p) { +; CHECK: rev64 v{{[0-9]+}}.2s + %1 = fadd double %p, %p + %2 = bitcast double %1 to <2 x float> + %3 = fadd <2 x float> %2, %2 + ret <2 x float> %3 +} + +; CHECK-LABEL: test_v2f32_v1i64: +define <2 x float> @test_v2f32_v1i64(<1 x i64> %p) { +; CHECK: rev64 v{{[0-9]+}}.2s + %1 = add <1 x i64> %p, %p + %2 = bitcast <1 x i64> %1 to <2 x float> + %3 = fadd <2 x float> %2, %2 + ret <2 x float> %3 +} + +; CHECK-LABEL: test_v2f32_v2i32: +define <2 x float> @test_v2f32_v2i32(<2 x i32> %p) { +; CHECK: rev64 v{{[0-9]+}}.2s +; CHECK: rev64 v{{[0-9]+}}.2s + %1 = add <2 x i32> %p, %p + %2 = bitcast <2 x i32> %1 to <2 x float> + %3 = fadd <2 x float> %2, %2 + ret <2 x float> %3 +} + +; CHECK-LABEL: test_v2f32_v4i16: +define <2 x float> @test_v2f32_v4i16(<4 x i16> %p) { +; CHECK: rev64 v{{[0-9]+}}.4h +; CHECK: rev64 v{{[0-9]+}}.2s + %1 = add <4 x i16> %p, %p + %2 = bitcast <4 x i16> %1 to <2 x float> + %3 = fadd <2 x 
float> %2, %2 + ret <2 x float> %3 +} + +; CHECK-LABEL: test_v2f32_v8i8: +define <2 x float> @test_v2f32_v8i8(<8 x i8> %p) { +; CHECK: rev64 v{{[0-9]+}}.8b +; CHECK: rev64 v{{[0-9]+}}.2s + %1 = add <8 x i8> %p, %p + %2 = bitcast <8 x i8> %1 to <2 x float> + %3 = fadd <2 x float> %2, %2 + ret <2 x float> %3 +} + +; CHECK-LABEL: test_v2i32_i64: +define <2 x i32> @test_v2i32_i64(i64 %p) { +; CHECK: rev64 v{{[0-9]+}}.2s + %1 = add i64 %p, %p + %2 = bitcast i64 %1 to <2 x i32> + %3 = add <2 x i32> %2, %2 + ret <2 x i32> %3 +} + +; CHECK-LABEL: test_v2i32_f64: +define <2 x i32> @test_v2i32_f64(double %p) { +; CHECK: rev64 v{{[0-9]+}}.2s + %1 = fadd double %p, %p + %2 = bitcast double %1 to <2 x i32> + %3 = add <2 x i32> %2, %2 + ret <2 x i32> %3 +} + +; CHECK-LABEL: test_v2i32_v1i64: +define <2 x i32> @test_v2i32_v1i64(<1 x i64> %p) { +; CHECK: rev64 v{{[0-9]+}}.2s + %1 = add <1 x i64> %p, %p + %2 = bitcast <1 x i64> %1 to <2 x i32> + %3 = add <2 x i32> %2, %2 + ret <2 x i32> %3 +} + +; CHECK-LABEL: test_v2i32_v2f32: +define <2 x i32> @test_v2i32_v2f32(<2 x float> %p) { +; CHECK: rev64 v{{[0-9]+}}.2s +; CHECK: rev64 v{{[0-9]+}}.2s + %1 = fadd <2 x float> %p, %p + %2 = bitcast <2 x float> %1 to <2 x i32> + %3 = add <2 x i32> %2, %2 + ret <2 x i32> %3 +} + +; CHECK-LABEL: test_v2i32_v4i16: +define <2 x i32> @test_v2i32_v4i16(<4 x i16> %p) { +; CHECK: rev64 v{{[0-9]+}}.4h +; CHECK: rev64 v{{[0-9]+}}.2s + %1 = add <4 x i16> %p, %p + %2 = bitcast <4 x i16> %1 to <2 x i32> + %3 = add <2 x i32> %2, %2 + ret <2 x i32> %3 +} + +; CHECK-LABEL: test_v2i32_v8i8: +define <2 x i32> @test_v2i32_v8i8(<8 x i8> %p) { +; CHECK: rev64 v{{[0-9]+}}.8b +; CHECK: rev64 v{{[0-9]+}}.2s + %1 = add <8 x i8> %p, %p + %2 = bitcast <8 x i8> %1 to <2 x i32> + %3 = add <2 x i32> %2, %2 + ret <2 x i32> %3 +} + +; CHECK-LABEL: test_v4i16_i64: +define <4 x i16> @test_v4i16_i64(i64 %p) { +; CHECK: rev64 v{{[0-9]+}}.4h + %1 = add i64 %p, %p + %2 = bitcast i64 %1 to <4 x i16> + %3 = add <4 x i16> %2, %2 + ret <4 x i16> %3 +} + +; CHECK-LABEL: test_v4i16_f64: +define <4 x i16> @test_v4i16_f64(double %p) { +; CHECK: rev64 v{{[0-9]+}}.4h + %1 = fadd double %p, %p + %2 = bitcast double %1 to <4 x i16> + %3 = add <4 x i16> %2, %2 + ret <4 x i16> %3 +} + +; CHECK-LABEL: test_v4i16_v1i64: +define <4 x i16> @test_v4i16_v1i64(<1 x i64> %p) { +; CHECK: rev64 v{{[0-9]+}}.4h + %1 = add <1 x i64> %p, %p + %2 = bitcast <1 x i64> %1 to <4 x i16> + %3 = add <4 x i16> %2, %2 + ret <4 x i16> %3 +} + +; CHECK-LABEL: test_v4i16_v2f32: +define <4 x i16> @test_v4i16_v2f32(<2 x float> %p) { +; CHECK: rev64 v{{[0-9]+}}.2s +; CHECK: rev64 v{{[0-9]+}}.4h + %1 = fadd <2 x float> %p, %p + %2 = bitcast <2 x float> %1 to <4 x i16> + %3 = add <4 x i16> %2, %2 + ret <4 x i16> %3 +} + +; CHECK-LABEL: test_v4i16_v2i32: +define <4 x i16> @test_v4i16_v2i32(<2 x i32> %p) { +; CHECK: rev64 v{{[0-9]+}}.2s +; CHECK: rev64 v{{[0-9]+}}.4h + %1 = add <2 x i32> %p, %p + %2 = bitcast <2 x i32> %1 to <4 x i16> + %3 = add <4 x i16> %2, %2 + ret <4 x i16> %3 +} + +; CHECK-LABEL: test_v4i16_v8i8: +define <4 x i16> @test_v4i16_v8i8(<8 x i8> %p) { +; CHECK: rev64 v{{[0-9]+}}.8b +; CHECK: rev64 v{{[0-9]+}}.4h + %1 = add <8 x i8> %p, %p + %2 = bitcast <8 x i8> %1 to <4 x i16> + %3 = add <4 x i16> %2, %2 + ret <4 x i16> %3 +} + +; CHECK-LABEL: test_v8i8_i64: +define <8 x i8> @test_v8i8_i64(i64 %p) { +; CHECK: rev64 v{{[0-9]+}}.8b + %1 = add i64 %p, %p + %2 = bitcast i64 %1 to <8 x i8> + %3 = add <8 x i8> %2, %2 + ret <8 x i8> %3 +} + +; CHECK-LABEL: test_v8i8_f64: +define <8 x i8> 
@test_v8i8_f64(double %p) { +; CHECK: rev64 v{{[0-9]+}}.8b + %1 = fadd double %p, %p + %2 = bitcast double %1 to <8 x i8> + %3 = add <8 x i8> %2, %2 + ret <8 x i8> %3 +} + +; CHECK-LABEL: test_v8i8_v1i64: +define <8 x i8> @test_v8i8_v1i64(<1 x i64> %p) { +; CHECK: rev64 v{{[0-9]+}}.8b + %1 = add <1 x i64> %p, %p + %2 = bitcast <1 x i64> %1 to <8 x i8> + %3 = add <8 x i8> %2, %2 + ret <8 x i8> %3 +} + +; CHECK-LABEL: test_v8i8_v2f32: +define <8 x i8> @test_v8i8_v2f32(<2 x float> %p) { +; CHECK: rev64 v{{[0-9]+}}.2s +; CHECK: rev64 v{{[0-9]+}}.8b + %1 = fadd <2 x float> %p, %p + %2 = bitcast <2 x float> %1 to <8 x i8> + %3 = add <8 x i8> %2, %2 + ret <8 x i8> %3 +} + +; CHECK-LABEL: test_v8i8_v2i32: +define <8 x i8> @test_v8i8_v2i32(<2 x i32> %p) { +; CHECK: rev64 v{{[0-9]+}}.2s +; CHECK: rev64 v{{[0-9]+}}.8b + %1 = add <2 x i32> %p, %p + %2 = bitcast <2 x i32> %1 to <8 x i8> + %3 = add <8 x i8> %2, %2 + ret <8 x i8> %3 +} + +; CHECK-LABEL: test_v8i8_v4i16: +define <8 x i8> @test_v8i8_v4i16(<4 x i16> %p) { +; CHECK: rev64 v{{[0-9]+}}.4h +; CHECK: rev64 v{{[0-9]+}}.8b + %1 = add <4 x i16> %p, %p + %2 = bitcast <4 x i16> %1 to <8 x i8> + %3 = add <8 x i8> %2, %2 + ret <8 x i8> %3 +} + +; CHECK-LABEL: test_f128_v2f64: +define fp128 @test_f128_v2f64(<2 x double> %p) { +; CHECK: ext + %1 = fadd <2 x double> %p, %p + %2 = bitcast <2 x double> %1 to fp128 + %3 = fadd fp128 %2, %2 + ret fp128 %3 +} + +; CHECK-LABEL: test_f128_v2i64: +define fp128 @test_f128_v2i64(<2 x i64> %p) { +; CHECK: ext + %1 = add <2 x i64> %p, %p + %2 = bitcast <2 x i64> %1 to fp128 + %3 = fadd fp128 %2, %2 + ret fp128 %3 +} + +; CHECK-LABEL: test_f128_v4f32: +define fp128 @test_f128_v4f32(<4 x float> %p) { +; CHECK: rev64 v{{[0-9]+}}.4s +; CHECK: ext + %1 = fadd <4 x float> %p, %p + %2 = bitcast <4 x float> %1 to fp128 + %3 = fadd fp128 %2, %2 + ret fp128 %3 +} + +; CHECK-LABEL: test_f128_v4i32: +define fp128 @test_f128_v4i32(<4 x i32> %p) { +; CHECK: rev64 v{{[0-9]+}}.4s +; CHECK: ext + %1 = add <4 x i32> %p, %p + %2 = bitcast <4 x i32> %1 to fp128 + %3 = fadd fp128 %2, %2 + ret fp128 %3 +} + +; CHECK-LABEL: test_f128_v8i16: +define fp128 @test_f128_v8i16(<8 x i16> %p) { +; CHECK: rev64 v{{[0-9]+}}.8h +; CHECK: ext + %1 = add <8 x i16> %p, %p + %2 = bitcast <8 x i16> %1 to fp128 + %3 = fadd fp128 %2, %2 + ret fp128 %3 +} + +; CHECK-LABEL: test_f128_v16i8: +define fp128 @test_f128_v16i8(<16 x i8> %p) { +; CHECK: rev64 v{{[0-9]+}}.16b +; CHECK: ext + %1 = add <16 x i8> %p, %p + %2 = bitcast <16 x i8> %1 to fp128 + %3 = fadd fp128 %2, %2 + ret fp128 %3 +} + +; CHECK-LABEL: test_v2f64_f128: +define <2 x double> @test_v2f64_f128(fp128 %p) { +; CHECK: ext + %1 = fadd fp128 %p, %p + %2 = bitcast fp128 %1 to <2 x double> + %3 = fadd <2 x double> %2, %2 + ret <2 x double> %3 +} + +; CHECK-LABEL: test_v2f64_v2i64: +define <2 x double> @test_v2f64_v2i64(<2 x i64> %p) { +; CHECK: ext +; CHECK: ext + %1 = add <2 x i64> %p, %p + %2 = bitcast <2 x i64> %1 to <2 x double> + %3 = fadd <2 x double> %2, %2 + ret <2 x double> %3 +} + +; CHECK-LABEL: test_v2f64_v4f32: +define <2 x double> @test_v2f64_v4f32(<4 x float> %p) { +; CHECK: rev64 v{{[0-9]+}}.4s +; CHECK: ext +; CHECK: ext + %1 = fadd <4 x float> %p, %p + %2 = bitcast <4 x float> %1 to <2 x double> + %3 = fadd <2 x double> %2, %2 + ret <2 x double> %3 +} + +; CHECK-LABEL: test_v2f64_v4i32: +define <2 x double> @test_v2f64_v4i32(<4 x i32> %p) { +; CHECK: rev64 v{{[0-9]+}}.4s +; CHECK: ext +; CHECK: ext + %1 = add <4 x i32> %p, %p + %2 = bitcast <4 x i32> %1 to <2 x double> + %3 = fadd 
<2 x double> %2, %2 + ret <2 x double> %3 +} + +; CHECK-LABEL: test_v2f64_v8i16: +define <2 x double> @test_v2f64_v8i16(<8 x i16> %p) { +; CHECK: rev64 v{{[0-9]+}}.8h +; CHECK: ext +; CHECK: ext + %1 = add <8 x i16> %p, %p + %2 = bitcast <8 x i16> %1 to <2 x double> + %3 = fadd <2 x double> %2, %2 + ret <2 x double> %3 +} + +; CHECK-LABEL: test_v2f64_v16i8: +define <2 x double> @test_v2f64_v16i8(<16 x i8> %p) { +; CHECK: rev64 v{{[0-9]+}}.16b +; CHECK: ext +; CHECK: ext + %1 = add <16 x i8> %p, %p + %2 = bitcast <16 x i8> %1 to <2 x double> + %3 = fadd <2 x double> %2, %2 + ret <2 x double> %3 +} + +; CHECK-LABEL: test_v2i64_f128: +define <2 x i64> @test_v2i64_f128(fp128 %p) { +; CHECK: ext + %1 = fadd fp128 %p, %p + %2 = bitcast fp128 %1 to <2 x i64> + %3 = add <2 x i64> %2, %2 + ret <2 x i64> %3 +} + +; CHECK-LABEL: test_v2i64_v2f64: +define <2 x i64> @test_v2i64_v2f64(<2 x double> %p) { +; CHECK: ext +; CHECK: ext + %1 = fadd <2 x double> %p, %p + %2 = bitcast <2 x double> %1 to <2 x i64> + %3 = add <2 x i64> %2, %2 + ret <2 x i64> %3 +} + +; CHECK-LABEL: test_v2i64_v4f32: +define <2 x i64> @test_v2i64_v4f32(<4 x float> %p) { +; CHECK: rev64 v{{[0-9]+}}.4s +; CHECK: ext +; CHECK: ext + %1 = fadd <4 x float> %p, %p + %2 = bitcast <4 x float> %1 to <2 x i64> + %3 = add <2 x i64> %2, %2 + ret <2 x i64> %3 +} + +; CHECK-LABEL: test_v2i64_v4i32: +define <2 x i64> @test_v2i64_v4i32(<4 x i32> %p) { +; CHECK: rev64 v{{[0-9]+}}.4s +; CHECK: ext +; CHECK: ext + %1 = add <4 x i32> %p, %p + %2 = bitcast <4 x i32> %1 to <2 x i64> + %3 = add <2 x i64> %2, %2 + ret <2 x i64> %3 +} + +; CHECK-LABEL: test_v2i64_v8i16: +define <2 x i64> @test_v2i64_v8i16(<8 x i16> %p) { +; CHECK: rev64 v{{[0-9]+}}.8h +; CHECK: ext +; CHECK: ext + %1 = add <8 x i16> %p, %p + %2 = bitcast <8 x i16> %1 to <2 x i64> + %3 = add <2 x i64> %2, %2 + ret <2 x i64> %3 +} + +; CHECK-LABEL: test_v2i64_v16i8: +define <2 x i64> @test_v2i64_v16i8(<16 x i8> %p) { +; CHECK: rev64 v{{[0-9]+}}.16b +; CHECK: ext +; CHECK: ext + %1 = add <16 x i8> %p, %p + %2 = bitcast <16 x i8> %1 to <2 x i64> + %3 = add <2 x i64> %2, %2 + ret <2 x i64> %3 +} + +; CHECK-LABEL: test_v4f32_f128: +define <4 x float> @test_v4f32_f128(fp128 %p) { +; CHECK: rev64 v{{[0-9]+}}.4s +; CHECK: ext + %1 = fadd fp128 %p, %p + %2 = bitcast fp128 %1 to <4 x float> + %3 = fadd <4 x float> %2, %2 + ret <4 x float> %3 +} + +; CHECK-LABEL: test_v4f32_v2f64: +define <4 x float> @test_v4f32_v2f64(<2 x double> %p) { +; CHECK: ext +; CHECK: rev64 v{{[0-9]+}}.4s +; CHECK: ext + %1 = fadd <2 x double> %p, %p + %2 = bitcast <2 x double> %1 to <4 x float> + %3 = fadd <4 x float> %2, %2 + ret <4 x float> %3 +} + +; CHECK-LABEL: test_v4f32_v2i64: +define <4 x float> @test_v4f32_v2i64(<2 x i64> %p) { +; CHECK: ext +; CHECK: rev64 v{{[0-9]+}}.4s +; CHECK: ext + %1 = add <2 x i64> %p, %p + %2 = bitcast <2 x i64> %1 to <4 x float> + %3 = fadd <4 x float> %2, %2 + ret <4 x float> %3 +} + +; CHECK-LABEL: test_v4f32_v4i32: +define <4 x float> @test_v4f32_v4i32(<4 x i32> %p) { +; CHECK: rev64 v{{[0-9]+}}.4s +; CHECK: ext +; CHECK: rev64 v{{[0-9]+}}.4s +; CHECK: ext + %1 = add <4 x i32> %p, %p + %2 = bitcast <4 x i32> %1 to <4 x float> + %3 = fadd <4 x float> %2, %2 + ret <4 x float> %3 +} + +; CHECK-LABEL: test_v4f32_v8i16: +define <4 x float> @test_v4f32_v8i16(<8 x i16> %p) { +; CHECK: rev64 v{{[0-9]+}}.8h +; CHECK: ext +; CHECK: rev64 v{{[0-9]+}}.4s +; CHECK: ext + %1 = add <8 x i16> %p, %p + %2 = bitcast <8 x i16> %1 to <4 x float> + %3 = fadd <4 x float> %2, %2 + ret <4 x float> %3 +} + +; 
CHECK-LABEL: test_v4f32_v16i8: +define <4 x float> @test_v4f32_v16i8(<16 x i8> %p) { +; CHECK: rev64 v{{[0-9]+}}.16b +; CHECK: ext +; CHECK: rev64 v{{[0-9]+}}.4s +; CHECK: ext + %1 = add <16 x i8> %p, %p + %2 = bitcast <16 x i8> %1 to <4 x float> + %3 = fadd <4 x float> %2, %2 + ret <4 x float> %3 +} + +; CHECK-LABEL: test_v4i32_f128: +define <4 x i32> @test_v4i32_f128(fp128 %p) { +; CHECK: rev64 v{{[0-9]+}}.4s +; CHECK: ext + %1 = fadd fp128 %p, %p + %2 = bitcast fp128 %1 to <4 x i32> + %3 = add <4 x i32> %2, %2 + ret <4 x i32> %3 +} + +; CHECK-LABEL: test_v4i32_v2f64: +define <4 x i32> @test_v4i32_v2f64(<2 x double> %p) { +; CHECK: ext +; CHECK: rev64 v{{[0-9]+}}.4s +; CHECK: ext + %1 = fadd <2 x double> %p, %p + %2 = bitcast <2 x double> %1 to <4 x i32> + %3 = add <4 x i32> %2, %2 + ret <4 x i32> %3 +} + +; CHECK-LABEL: test_v4i32_v2i64: +define <4 x i32> @test_v4i32_v2i64(<2 x i64> %p) { +; CHECK: ext +; CHECK: rev64 v{{[0-9]+}}.4s +; CHECK: ext + %1 = add <2 x i64> %p, %p + %2 = bitcast <2 x i64> %1 to <4 x i32> + %3 = add <4 x i32> %2, %2 + ret <4 x i32> %3 +} + +; CHECK-LABEL: test_v4i32_v4f32: +define <4 x i32> @test_v4i32_v4f32(<4 x float> %p) { +; CHECK: rev64 v{{[0-9]+}}.4s +; CHECK: ext +; CHECK: rev64 v{{[0-9]+}}.4s +; CHECK: ext + %1 = fadd <4 x float> %p, %p + %2 = bitcast <4 x float> %1 to <4 x i32> + %3 = add <4 x i32> %2, %2 + ret <4 x i32> %3 +} + +; CHECK-LABEL: test_v4i32_v8i16: +define <4 x i32> @test_v4i32_v8i16(<8 x i16> %p) { +; CHECK: rev64 v{{[0-9]+}}.8h +; CHECK: ext +; CHECK: rev64 v{{[0-9]+}}.4s +; CHECK: ext + %1 = add <8 x i16> %p, %p + %2 = bitcast <8 x i16> %1 to <4 x i32> + %3 = add <4 x i32> %2, %2 + ret <4 x i32> %3 +} + +; CHECK-LABEL: test_v4i32_v16i8: +define <4 x i32> @test_v4i32_v16i8(<16 x i8> %p) { +; CHECK: rev64 v{{[0-9]+}}.16b +; CHECK: ext +; CHECK: rev64 v{{[0-9]+}}.4s +; CHECK: ext + %1 = add <16 x i8> %p, %p + %2 = bitcast <16 x i8> %1 to <4 x i32> + %3 = add <4 x i32> %2, %2 + ret <4 x i32> %3 +} + +; CHECK-LABEL: test_v8i16_f128: +define <8 x i16> @test_v8i16_f128(fp128 %p) { +; CHECK: rev64 v{{[0-9]+}}.8h +; CHECK: ext + %1 = fadd fp128 %p, %p + %2 = bitcast fp128 %1 to <8 x i16> + %3 = add <8 x i16> %2, %2 + ret <8 x i16> %3 +} + +; CHECK-LABEL: test_v8i16_v2f64: +define <8 x i16> @test_v8i16_v2f64(<2 x double> %p) { +; CHECK: ext +; CHECK: rev64 v{{[0-9]+}}.8h +; CHECK: ext + %1 = fadd <2 x double> %p, %p + %2 = bitcast <2 x double> %1 to <8 x i16> + %3 = add <8 x i16> %2, %2 + ret <8 x i16> %3 +} + +; CHECK-LABEL: test_v8i16_v2i64: +define <8 x i16> @test_v8i16_v2i64(<2 x i64> %p) { +; CHECK: ext +; CHECK: rev64 v{{[0-9]+}}.8h +; CHECK: ext + %1 = add <2 x i64> %p, %p + %2 = bitcast <2 x i64> %1 to <8 x i16> + %3 = add <8 x i16> %2, %2 + ret <8 x i16> %3 +} + +; CHECK-LABEL: test_v8i16_v4f32: +define <8 x i16> @test_v8i16_v4f32(<4 x float> %p) { +; CHECK: rev64 v{{[0-9]+}}.4s +; CHECK: ext +; CHECK: rev64 v{{[0-9]+}}.8h +; CHECK: ext + %1 = fadd <4 x float> %p, %p + %2 = bitcast <4 x float> %1 to <8 x i16> + %3 = add <8 x i16> %2, %2 + ret <8 x i16> %3 +} + +; CHECK-LABEL: test_v8i16_v4i32: +define <8 x i16> @test_v8i16_v4i32(<4 x i32> %p) { +; CHECK: rev64 v{{[0-9]+}}.4s +; CHECK: ext +; CHECK: rev64 v{{[0-9]+}}.8h +; CHECK: ext + %1 = add <4 x i32> %p, %p + %2 = bitcast <4 x i32> %1 to <8 x i16> + %3 = add <8 x i16> %2, %2 + ret <8 x i16> %3 +} + +; CHECK-LABEL: test_v8i16_v16i8: +define <8 x i16> @test_v8i16_v16i8(<16 x i8> %p) { +; CHECK: rev64 v{{[0-9]+}}.16b +; CHECK: ext +; CHECK: rev64 v{{[0-9]+}}.8h +; CHECK: ext + %1 = add 
<16 x i8> %p, %p + %2 = bitcast <16 x i8> %1 to <8 x i16> + %3 = add <8 x i16> %2, %2 + ret <8 x i16> %3 +} + +; CHECK-LABEL: test_v16i8_f128: +define <16 x i8> @test_v16i8_f128(fp128 %p) { +; CHECK: rev64 v{{[0-9]+}}.16b +; CHECK: ext + %1 = fadd fp128 %p, %p + %2 = bitcast fp128 %1 to <16 x i8> + %3 = add <16 x i8> %2, %2 + ret <16 x i8> %3 +} + +; CHECK-LABEL: test_v16i8_v2f64: +define <16 x i8> @test_v16i8_v2f64(<2 x double> %p) { +; CHECK: ext +; CHECK: rev64 v{{[0-9]+}}.16b +; CHECK: ext + %1 = fadd <2 x double> %p, %p + %2 = bitcast <2 x double> %1 to <16 x i8> + %3 = add <16 x i8> %2, %2 + ret <16 x i8> %3 +} + +; CHECK-LABEL: test_v16i8_v2i64: +define <16 x i8> @test_v16i8_v2i64(<2 x i64> %p) { +; CHECK: ext +; CHECK: rev64 v{{[0-9]+}}.16b +; CHECK: ext + %1 = add <2 x i64> %p, %p + %2 = bitcast <2 x i64> %1 to <16 x i8> + %3 = add <16 x i8> %2, %2 + ret <16 x i8> %3 +} + +; CHECK-LABEL: test_v16i8_v4f32: +define <16 x i8> @test_v16i8_v4f32(<4 x float> %p) { +; CHECK: rev64 v{{[0-9]+}}.4s +; CHECK: ext +; CHECK: rev64 v{{[0-9]+}}.16b +; CHECK: ext + %1 = fadd <4 x float> %p, %p + %2 = bitcast <4 x float> %1 to <16 x i8> + %3 = add <16 x i8> %2, %2 + ret <16 x i8> %3 +} + +; CHECK-LABEL: test_v16i8_v4i32: +define <16 x i8> @test_v16i8_v4i32(<4 x i32> %p) { +; CHECK: rev64 v{{[0-9]+}}.4s +; CHECK: ext +; CHECK: rev64 v{{[0-9]+}}.16b +; CHECK: ext + %1 = add <4 x i32> %p, %p + %2 = bitcast <4 x i32> %1 to <16 x i8> + %3 = add <16 x i8> %2, %2 + ret <16 x i8> %3 +} + +; CHECK-LABEL: test_v16i8_v8i16: +define <16 x i8> @test_v16i8_v8i16(<8 x i16> %p) { +; CHECK: rev64 v{{[0-9]+}}.8h +; CHECK: ext +; CHECK: rev64 v{{[0-9]+}}.16b +; CHECK: ext + %1 = add <8 x i16> %p, %p + %2 = bitcast <8 x i16> %1 to <16 x i8> + %3 = add <16 x i8> %2, %2 + ret <16 x i8> %3 +} diff --git a/test/CodeGen/AArch64/arm64-big-endian-vector-caller.ll b/test/CodeGen/AArch64/arm64-big-endian-vector-caller.ll new file mode 100644 index 00000000000..9a12b7a0115 --- /dev/null +++ b/test/CodeGen/AArch64/arm64-big-endian-vector-caller.ll @@ -0,0 +1,1100 @@ +; RUN: llc -mtriple arm64_be < %s -aarch64-load-store-opt=false -o - | FileCheck %s +; RUN: llc -mtriple arm64_be < %s -aarch64-load-store-opt=false -fast-isel=true -O0 -o - | FileCheck %s + +; CHECK-LABEL: test_i64_f64: +declare i64 @test_i64_f64_helper(double %p) +define void @test_i64_f64(double* %p, i64* %q) { +; CHECK-NOT: rev + %1 = load double* %p + %2 = fadd double %1, %1 + %3 = call i64 @test_i64_f64_helper(double %2) + %4 = add i64 %3, %3 + store i64 %4, i64* %q + ret void +} + +; CHECK-LABEL: test_i64_v1i64: +declare i64 @test_i64_v1i64_helper(<1 x i64> %p) +define void @test_i64_v1i64(<1 x i64>* %p, i64* %q) { +; CHECK-NOT: rev + %1 = load <1 x i64>* %p + %2 = add <1 x i64> %1, %1 + %3 = call i64 @test_i64_v1i64_helper(<1 x i64> %2) + %4 = add i64 %3, %3 + store i64 %4, i64* %q + ret void +} + +; CHECK-LABEL: test_i64_v2f32: +declare i64 @test_i64_v2f32_helper(<2 x float> %p) +define void @test_i64_v2f32(<2 x float>* %p, i64* %q) { +; CHECK: rev64 v{{[0-9]+}}.2s + %1 = load <2 x float>* %p + %2 = fadd <2 x float> %1, %1 + %3 = call i64 @test_i64_v2f32_helper(<2 x float> %2) + %4 = add i64 %3, %3 + store i64 %4, i64* %q + ret void +} + +; CHECK-LABEL: test_i64_v2i32: +declare i64 @test_i64_v2i32_helper(<2 x i32> %p) +define void @test_i64_v2i32(<2 x i32>* %p, i64* %q) { +; CHECK: rev64 v{{[0-9]+}}.2s + %1 = load <2 x i32>* %p + %2 = add <2 x i32> %1, %1 + %3 = call i64 @test_i64_v2i32_helper(<2 x i32> %2) + %4 = add i64 %3, %3 + store i64 %4, i64* %q + 
ret void +} + +; CHECK-LABEL: test_i64_v4i16: +declare i64 @test_i64_v4i16_helper(<4 x i16> %p) +define void @test_i64_v4i16(<4 x i16>* %p, i64* %q) { +; CHECK: rev64 v{{[0-9]+}}.4h + %1 = load <4 x i16>* %p + %2 = add <4 x i16> %1, %1 + %3 = call i64 @test_i64_v4i16_helper(<4 x i16> %2) + %4 = add i64 %3, %3 + store i64 %4, i64* %q + ret void +} + +; CHECK-LABEL: test_i64_v8i8: +declare i64 @test_i64_v8i8_helper(<8 x i8> %p) +define void @test_i64_v8i8(<8 x i8>* %p, i64* %q) { +; CHECK: rev64 v{{[0-9]+}}.8b + %1 = load <8 x i8>* %p + %2 = add <8 x i8> %1, %1 + %3 = call i64 @test_i64_v8i8_helper(<8 x i8> %2) + %4 = add i64 %3, %3 + store i64 %4, i64* %q + ret void +} + +; CHECK-LABEL: test_f64_i64: +declare double @test_f64_i64_helper(i64 %p) +define void @test_f64_i64(i64* %p, double* %q) { +; CHECK-NOT: rev + %1 = load i64* %p + %2 = add i64 %1, %1 + %3 = call double @test_f64_i64_helper(i64 %2) + %4 = fadd double %3, %3 + store double %4, double* %q + ret void +} + +; CHECK-LABEL: test_f64_v1i64: +declare double @test_f64_v1i64_helper(<1 x i64> %p) +define void @test_f64_v1i64(<1 x i64>* %p, double* %q) { +; CHECK-NOT: rev + %1 = load <1 x i64>* %p + %2 = add <1 x i64> %1, %1 + %3 = call double @test_f64_v1i64_helper(<1 x i64> %2) + %4 = fadd double %3, %3 + store double %4, double* %q + ret void +} + +; CHECK-LABEL: test_f64_v2f32: +declare double @test_f64_v2f32_helper(<2 x float> %p) +define void @test_f64_v2f32(<2 x float>* %p, double* %q) { +; CHECK: rev64 v{{[0-9]+}}.2s + %1 = load <2 x float>* %p + %2 = fadd <2 x float> %1, %1 + %3 = call double @test_f64_v2f32_helper(<2 x float> %2) + %4 = fadd double %3, %3 + store double %4, double* %q + ret void +} + +; CHECK-LABEL: test_f64_v2i32: +declare double @test_f64_v2i32_helper(<2 x i32> %p) +define void @test_f64_v2i32(<2 x i32>* %p, double* %q) { +; CHECK: rev64 v{{[0-9]+}}.2s + %1 = load <2 x i32>* %p + %2 = add <2 x i32> %1, %1 + %3 = call double @test_f64_v2i32_helper(<2 x i32> %2) + %4 = fadd double %3, %3 + store double %4, double* %q + ret void +} + +; CHECK-LABEL: test_f64_v4i16: +declare double @test_f64_v4i16_helper(<4 x i16> %p) +define void @test_f64_v4i16(<4 x i16>* %p, double* %q) { +; CHECK: rev64 v{{[0-9]+}}.4h + %1 = load <4 x i16>* %p + %2 = add <4 x i16> %1, %1 + %3 = call double @test_f64_v4i16_helper(<4 x i16> %2) + %4 = fadd double %3, %3 + store double %4, double* %q + ret void +} + +; CHECK-LABEL: test_f64_v8i8: +declare double @test_f64_v8i8_helper(<8 x i8> %p) +define void @test_f64_v8i8(<8 x i8>* %p, double* %q) { +; CHECK: rev64 v{{[0-9]+}}.8b + %1 = load <8 x i8>* %p + %2 = add <8 x i8> %1, %1 + %3 = call double @test_f64_v8i8_helper(<8 x i8> %2) + %4 = fadd double %3, %3 + store double %4, double* %q + ret void +} + +; CHECK-LABEL: test_v1i64_i64: +declare <1 x i64> @test_v1i64_i64_helper(i64 %p) +define void @test_v1i64_i64(i64* %p, <1 x i64>* %q) { +; CHECK-NOT: rev + %1 = load i64* %p + %2 = add i64 %1, %1 + %3 = call <1 x i64> @test_v1i64_i64_helper(i64 %2) + %4 = add <1 x i64> %3, %3 + store <1 x i64> %4, <1 x i64>* %q + ret void +} + +; CHECK-LABEL: test_v1i64_f64: +declare <1 x i64> @test_v1i64_f64_helper(double %p) +define void @test_v1i64_f64(double* %p, <1 x i64>* %q) { +; CHECK-NOT: rev + %1 = load double* %p + %2 = fadd double %1, %1 + %3 = call <1 x i64> @test_v1i64_f64_helper(double %2) + %4 = add <1 x i64> %3, %3 + store <1 x i64> %4, <1 x i64>* %q + ret void +} + +; CHECK-LABEL: test_v1i64_v2f32: +declare <1 x i64> @test_v1i64_v2f32_helper(<2 x float> %p) +define void 
@test_v1i64_v2f32(<2 x float>* %p, <1 x i64>* %q) { +; CHECK: rev64 v{{[0-9]+}}.2s + %1 = load <2 x float>* %p + %2 = fadd <2 x float> %1, %1 + %3 = call <1 x i64> @test_v1i64_v2f32_helper(<2 x float> %2) + %4 = add <1 x i64> %3, %3 + store <1 x i64> %4, <1 x i64>* %q + ret void +} + +; CHECK-LABEL: test_v1i64_v2i32: +declare <1 x i64> @test_v1i64_v2i32_helper(<2 x i32> %p) +define void @test_v1i64_v2i32(<2 x i32>* %p, <1 x i64>* %q) { +; CHECK: rev64 v{{[0-9]+}}.2s + %1 = load <2 x i32>* %p + %2 = add <2 x i32> %1, %1 + %3 = call <1 x i64> @test_v1i64_v2i32_helper(<2 x i32> %2) + %4 = add <1 x i64> %3, %3 + store <1 x i64> %4, <1 x i64>* %q + ret void +} + +; CHECK-LABEL: test_v1i64_v4i16: +declare <1 x i64> @test_v1i64_v4i16_helper(<4 x i16> %p) +define void @test_v1i64_v4i16(<4 x i16>* %p, <1 x i64>* %q) { +; CHECK: rev64 v{{[0-9]+}}.4h + %1 = load <4 x i16>* %p + %2 = add <4 x i16> %1, %1 + %3 = call <1 x i64> @test_v1i64_v4i16_helper(<4 x i16> %2) + %4 = add <1 x i64> %3, %3 + store <1 x i64> %4, <1 x i64>* %q + ret void +} + +; CHECK-LABEL: test_v1i64_v8i8: +declare <1 x i64> @test_v1i64_v8i8_helper(<8 x i8> %p) +define void @test_v1i64_v8i8(<8 x i8>* %p, <1 x i64>* %q) { +; CHECK: rev64 v{{[0-9]+}}.8b + %1 = load <8 x i8>* %p + %2 = add <8 x i8> %1, %1 + %3 = call <1 x i64> @test_v1i64_v8i8_helper(<8 x i8> %2) + %4 = add <1 x i64> %3, %3 + store <1 x i64> %4, <1 x i64>* %q + ret void +} + +; CHECK-LABEL: test_v2f32_i64: +declare <2 x float> @test_v2f32_i64_helper(i64 %p) +define void @test_v2f32_i64(i64* %p, <2 x float>* %q) { +; CHECK: rev64 v{{[0-9]+}}.2s + %1 = load i64* %p + %2 = add i64 %1, %1 + %3 = call <2 x float> @test_v2f32_i64_helper(i64 %2) + %4 = fadd <2 x float> %3, %3 + store <2 x float> %4, <2 x float>* %q + ret void +} + +; CHECK-LABEL: test_v2f32_f64: +declare <2 x float> @test_v2f32_f64_helper(double %p) +define void @test_v2f32_f64(double* %p, <2 x float>* %q) { +; CHECK: rev64 v{{[0-9]+}}.2s + %1 = load double* %p + %2 = fadd double %1, %1 + %3 = call <2 x float> @test_v2f32_f64_helper(double %2) + %4 = fadd <2 x float> %3, %3 + store <2 x float> %4, <2 x float>* %q + ret void +} + +; CHECK-LABEL: test_v2f32_v1i64: +declare <2 x float> @test_v2f32_v1i64_helper(<1 x i64> %p) +define void @test_v2f32_v1i64(<1 x i64>* %p, <2 x float>* %q) { +; CHECK: rev64 v{{[0-9]+}}.2s + %1 = load <1 x i64>* %p + %2 = add <1 x i64> %1, %1 + %3 = call <2 x float> @test_v2f32_v1i64_helper(<1 x i64> %2) + %4 = fadd <2 x float> %3, %3 + store <2 x float> %4, <2 x float>* %q + ret void +} + +; CHECK-LABEL: test_v2f32_v2i32: +declare <2 x float> @test_v2f32_v2i32_helper(<2 x i32> %p) +define void @test_v2f32_v2i32(<2 x i32>* %p, <2 x float>* %q) { +; CHECK: rev64 v{{[0-9]+}}.2s +; CHECK: rev64 v{{[0-9]+}}.2s + %1 = load <2 x i32>* %p + %2 = add <2 x i32> %1, %1 + %3 = call <2 x float> @test_v2f32_v2i32_helper(<2 x i32> %2) + %4 = fadd <2 x float> %3, %3 + store <2 x float> %4, <2 x float>* %q + ret void +} + +; CHECK-LABEL: test_v2f32_v4i16: +declare <2 x float> @test_v2f32_v4i16_helper(<4 x i16> %p) +define void @test_v2f32_v4i16(<4 x i16>* %p, <2 x float>* %q) { +; CHECK: rev64 v{{[0-9]+}}.4h +; CHECK: rev64 v{{[0-9]+}}.2s + %1 = load <4 x i16>* %p + %2 = add <4 x i16> %1, %1 + %3 = call <2 x float> @test_v2f32_v4i16_helper(<4 x i16> %2) + %4 = fadd <2 x float> %3, %3 + store <2 x float> %4, <2 x float>* %q + ret void +} + +; CHECK-LABEL: test_v2f32_v8i8: +declare <2 x float> @test_v2f32_v8i8_helper(<8 x i8> %p) +define void @test_v2f32_v8i8(<8 x i8>* %p, <2 x float>* %q) { +; CHECK: 
rev64 v{{[0-9]+}}.8b +; CHECK: rev64 v{{[0-9]+}}.2s + %1 = load <8 x i8>* %p + %2 = add <8 x i8> %1, %1 + %3 = call <2 x float> @test_v2f32_v8i8_helper(<8 x i8> %2) + %4 = fadd <2 x float> %3, %3 + store <2 x float> %4, <2 x float>* %q + ret void +} + +; CHECK-LABEL: test_v2i32_i64: +declare <2 x i32> @test_v2i32_i64_helper(i64 %p) +define void @test_v2i32_i64(i64* %p, <2 x i32>* %q) { +; CHECK: rev64 v{{[0-9]+}}.2s + %1 = load i64* %p + %2 = add i64 %1, %1 + %3 = call <2 x i32> @test_v2i32_i64_helper(i64 %2) + %4 = add <2 x i32> %3, %3 + store <2 x i32> %4, <2 x i32>* %q + ret void +} + +; CHECK-LABEL: test_v2i32_f64: +declare <2 x i32> @test_v2i32_f64_helper(double %p) +define void @test_v2i32_f64(double* %p, <2 x i32>* %q) { +; CHECK: rev64 v{{[0-9]+}}.2s + %1 = load double* %p + %2 = fadd double %1, %1 + %3 = call <2 x i32> @test_v2i32_f64_helper(double %2) + %4 = add <2 x i32> %3, %3 + store <2 x i32> %4, <2 x i32>* %q + ret void +} + +; CHECK-LABEL: test_v2i32_v1i64: +declare <2 x i32> @test_v2i32_v1i64_helper(<1 x i64> %p) +define void @test_v2i32_v1i64(<1 x i64>* %p, <2 x i32>* %q) { +; CHECK: rev64 v{{[0-9]+}}.2s + %1 = load <1 x i64>* %p + %2 = add <1 x i64> %1, %1 + %3 = call <2 x i32> @test_v2i32_v1i64_helper(<1 x i64> %2) + %4 = add <2 x i32> %3, %3 + store <2 x i32> %4, <2 x i32>* %q + ret void +} + +; CHECK-LABEL: test_v2i32_v2f32: +declare <2 x i32> @test_v2i32_v2f32_helper(<2 x float> %p) +define void @test_v2i32_v2f32(<2 x float>* %p, <2 x i32>* %q) { +; CHECK: rev64 v{{[0-9]+}}.2s +; CHECK: rev64 v{{[0-9]+}}.2s + %1 = load <2 x float>* %p + %2 = fadd <2 x float> %1, %1 + %3 = call <2 x i32> @test_v2i32_v2f32_helper(<2 x float> %2) + %4 = add <2 x i32> %3, %3 + store <2 x i32> %4, <2 x i32>* %q + ret void +} + +; CHECK-LABEL: test_v2i32_v4i16: +declare <2 x i32> @test_v2i32_v4i16_helper(<4 x i16> %p) +define void @test_v2i32_v4i16(<4 x i16>* %p, <2 x i32>* %q) { +; CHECK: rev64 v{{[0-9]+}}.4h +; CHECK: rev64 v{{[0-9]+}}.2s + %1 = load <4 x i16>* %p + %2 = add <4 x i16> %1, %1 + %3 = call <2 x i32> @test_v2i32_v4i16_helper(<4 x i16> %2) + %4 = add <2 x i32> %3, %3 + store <2 x i32> %4, <2 x i32>* %q + ret void +} + +; CHECK-LABEL: test_v2i32_v8i8: +declare <2 x i32> @test_v2i32_v8i8_helper(<8 x i8> %p) +define void @test_v2i32_v8i8(<8 x i8>* %p, <2 x i32>* %q) { +; CHECK: rev64 v{{[0-9]+}}.8b +; CHECK: rev64 v{{[0-9]+}}.2s + %1 = load <8 x i8>* %p + %2 = add <8 x i8> %1, %1 + %3 = call <2 x i32> @test_v2i32_v8i8_helper(<8 x i8> %2) + %4 = add <2 x i32> %3, %3 + store <2 x i32> %4, <2 x i32>* %q + ret void +} + +; CHECK-LABEL: test_v4i16_i64: +declare <4 x i16> @test_v4i16_i64_helper(i64 %p) +define void @test_v4i16_i64(i64* %p, <4 x i16>* %q) { +; CHECK: rev64 v{{[0-9]+}}.4h + %1 = load i64* %p + %2 = add i64 %1, %1 + %3 = call <4 x i16> @test_v4i16_i64_helper(i64 %2) + %4 = add <4 x i16> %3, %3 + store <4 x i16> %4, <4 x i16>* %q + ret void +} + +; CHECK-LABEL: test_v4i16_f64: +declare <4 x i16> @test_v4i16_f64_helper(double %p) +define void @test_v4i16_f64(double* %p, <4 x i16>* %q) { +; CHECK: rev64 v{{[0-9]+}}.4h + %1 = load double* %p + %2 = fadd double %1, %1 + %3 = call <4 x i16> @test_v4i16_f64_helper(double %2) + %4 = add <4 x i16> %3, %3 + store <4 x i16> %4, <4 x i16>* %q + ret void +} + +; CHECK-LABEL: test_v4i16_v1i64: +declare <4 x i16> @test_v4i16_v1i64_helper(<1 x i64> %p) +define void @test_v4i16_v1i64(<1 x i64>* %p, <4 x i16>* %q) { +; CHECK: rev64 v{{[0-9]+}}.4h + %1 = load <1 x i64>* %p + %2 = add <1 x i64> %1, %1 + %3 = call <4 x i16> 
@test_v4i16_v1i64_helper(<1 x i64> %2) + %4 = add <4 x i16> %3, %3 + store <4 x i16> %4, <4 x i16>* %q + ret void +} + +; CHECK-LABEL: test_v4i16_v2f32: +declare <4 x i16> @test_v4i16_v2f32_helper(<2 x float> %p) +define void @test_v4i16_v2f32(<2 x float>* %p, <4 x i16>* %q) { +; CHECK: rev64 v{{[0-9]+}}.2s +; CHECK: rev64 v{{[0-9]+}}.4h + %1 = load <2 x float>* %p + %2 = fadd <2 x float> %1, %1 + %3 = call <4 x i16> @test_v4i16_v2f32_helper(<2 x float> %2) + %4 = add <4 x i16> %3, %3 + store <4 x i16> %4, <4 x i16>* %q + ret void +} + +; CHECK-LABEL: test_v4i16_v2i32: +declare <4 x i16> @test_v4i16_v2i32_helper(<2 x i32> %p) +define void @test_v4i16_v2i32(<2 x i32>* %p, <4 x i16>* %q) { +; CHECK: rev64 v{{[0-9]+}}.2s +; CHECK: rev64 v{{[0-9]+}}.4h + %1 = load <2 x i32>* %p + %2 = add <2 x i32> %1, %1 + %3 = call <4 x i16> @test_v4i16_v2i32_helper(<2 x i32> %2) + %4 = add <4 x i16> %3, %3 + store <4 x i16> %4, <4 x i16>* %q + ret void +} + +; CHECK-LABEL: test_v4i16_v8i8: +declare <4 x i16> @test_v4i16_v8i8_helper(<8 x i8> %p) +define void @test_v4i16_v8i8(<8 x i8>* %p, <4 x i16>* %q) { +; CHECK: rev64 v{{[0-9]+}}.8b +; CHECK: rev64 v{{[0-9]+}}.4h + %1 = load <8 x i8>* %p + %2 = add <8 x i8> %1, %1 + %3 = call <4 x i16> @test_v4i16_v8i8_helper(<8 x i8> %2) + %4 = add <4 x i16> %3, %3 + store <4 x i16> %4, <4 x i16>* %q + ret void +} + +; CHECK-LABEL: test_v8i8_i64: +declare <8 x i8> @test_v8i8_i64_helper(i64 %p) +define void @test_v8i8_i64(i64* %p, <8 x i8>* %q) { +; CHECK: rev64 v{{[0-9]+}}.8b + %1 = load i64* %p + %2 = add i64 %1, %1 + %3 = call <8 x i8> @test_v8i8_i64_helper(i64 %2) + %4 = add <8 x i8> %3, %3 + store <8 x i8> %4, <8 x i8>* %q + ret void +} + +; CHECK-LABEL: test_v8i8_f64: +declare <8 x i8> @test_v8i8_f64_helper(double %p) +define void @test_v8i8_f64(double* %p, <8 x i8>* %q) { +; CHECK: rev64 v{{[0-9]+}}.8b + %1 = load double* %p + %2 = fadd double %1, %1 + %3 = call <8 x i8> @test_v8i8_f64_helper(double %2) + %4 = add <8 x i8> %3, %3 + store <8 x i8> %4, <8 x i8>* %q + ret void +} + +; CHECK-LABEL: test_v8i8_v1i64: +declare <8 x i8> @test_v8i8_v1i64_helper(<1 x i64> %p) +define void @test_v8i8_v1i64(<1 x i64>* %p, <8 x i8>* %q) { +; CHECK: rev64 v{{[0-9]+}}.8b + %1 = load <1 x i64>* %p + %2 = add <1 x i64> %1, %1 + %3 = call <8 x i8> @test_v8i8_v1i64_helper(<1 x i64> %2) + %4 = add <8 x i8> %3, %3 + store <8 x i8> %4, <8 x i8>* %q + ret void +} + +; CHECK-LABEL: test_v8i8_v2f32: +declare <8 x i8> @test_v8i8_v2f32_helper(<2 x float> %p) +define void @test_v8i8_v2f32(<2 x float>* %p, <8 x i8>* %q) { +; CHECK: rev64 v{{[0-9]+}}.2s +; CHECK: rev64 v{{[0-9]+}}.8b + %1 = load <2 x float>* %p + %2 = fadd <2 x float> %1, %1 + %3 = call <8 x i8> @test_v8i8_v2f32_helper(<2 x float> %2) + %4 = add <8 x i8> %3, %3 + store <8 x i8> %4, <8 x i8>* %q + ret void +} + +; CHECK-LABEL: test_v8i8_v2i32: +declare <8 x i8> @test_v8i8_v2i32_helper(<2 x i32> %p) +define void @test_v8i8_v2i32(<2 x i32>* %p, <8 x i8>* %q) { +; CHECK: rev64 v{{[0-9]+}}.2s +; CHECK: rev64 v{{[0-9]+}}.8b + %1 = load <2 x i32>* %p + %2 = add <2 x i32> %1, %1 + %3 = call <8 x i8> @test_v8i8_v2i32_helper(<2 x i32> %2) + %4 = add <8 x i8> %3, %3 + store <8 x i8> %4, <8 x i8>* %q + ret void +} + +; CHECK-LABEL: test_v8i8_v4i16: +declare <8 x i8> @test_v8i8_v4i16_helper(<4 x i16> %p) +define void @test_v8i8_v4i16(<4 x i16>* %p, <8 x i8>* %q) { +; CHECK: rev64 v{{[0-9]+}}.4h +; CHECK: rev64 v{{[0-9]+}}.8b + %1 = load <4 x i16>* %p + %2 = add <4 x i16> %1, %1 + %3 = call <8 x i8> @test_v8i8_v4i16_helper(<4 x i16> %2) + %4 
= add <8 x i8> %3, %3 + store <8 x i8> %4, <8 x i8>* %q + ret void +} + +; CHECK-LABEL: test_f128_v2f64: +declare fp128 @test_f128_v2f64_helper(<2 x double> %p) +define void @test_f128_v2f64(<2 x double>* %p, fp128* %q) { +; CHECK: ext + %1 = load <2 x double>* %p + %2 = fadd <2 x double> %1, %1 + %3 = call fp128 @test_f128_v2f64_helper(<2 x double> %2) + %4 = fadd fp128 %3, %3 + store fp128 %4, fp128* %q + ret void +} + +; CHECK-LABEL: test_f128_v2i64: +declare fp128 @test_f128_v2i64_helper(<2 x i64> %p) +define void @test_f128_v2i64(<2 x i64>* %p, fp128* %q) { +; CHECK: ext + %1 = load <2 x i64>* %p + %2 = add <2 x i64> %1, %1 + %3 = call fp128 @test_f128_v2i64_helper(<2 x i64> %2) + %4 = fadd fp128 %3, %3 + store fp128 %4, fp128* %q + ret void +} + +; CHECK-LABEL: test_f128_v4f32: +declare fp128 @test_f128_v4f32_helper(<4 x float> %p) +define void @test_f128_v4f32(<4 x float>* %p, fp128* %q) { +; CHECK: rev64 v{{[0-9]+}}.4s +; CHECK: ext + %1 = load <4 x float>* %p + %2 = fadd <4 x float> %1, %1 + %3 = call fp128 @test_f128_v4f32_helper(<4 x float> %2) + %4 = fadd fp128 %3, %3 + store fp128 %4, fp128* %q + ret void +} + +; CHECK-LABEL: test_f128_v4i32: +declare fp128 @test_f128_v4i32_helper(<4 x i32> %p) +define void @test_f128_v4i32(<4 x i32>* %p, fp128* %q) { +; CHECK: rev64 v{{[0-9]+}}.4s +; CHECK: ext + %1 = load <4 x i32>* %p + %2 = add <4 x i32> %1, %1 + %3 = call fp128 @test_f128_v4i32_helper(<4 x i32> %2) + %4 = fadd fp128 %3, %3 + store fp128 %4, fp128* %q + ret void +} + +; CHECK-LABEL: test_f128_v8i16: +declare fp128 @test_f128_v8i16_helper(<8 x i16> %p) +define void @test_f128_v8i16(<8 x i16>* %p, fp128* %q) { +; CHECK: rev64 v{{[0-9]+}}.8h +; CHECK: ext + %1 = load <8 x i16>* %p + %2 = add <8 x i16> %1, %1 + %3 = call fp128 @test_f128_v8i16_helper(<8 x i16> %2) + %4 = fadd fp128 %3, %3 + store fp128 %4, fp128* %q + ret void +} + +; CHECK-LABEL: test_f128_v16i8: +declare fp128 @test_f128_v16i8_helper(<16 x i8> %p) +define void @test_f128_v16i8(<16 x i8>* %p, fp128* %q) { +; CHECK: rev64 v{{[0-9]+}}.16b +; CHECK: ext + %1 = load <16 x i8>* %p + %2 = add <16 x i8> %1, %1 + %3 = call fp128 @test_f128_v16i8_helper(<16 x i8> %2) + %4 = fadd fp128 %3, %3 + store fp128 %4, fp128* %q + ret void +} + +; CHECK-LABEL: test_v2f64_f128: +declare <2 x double> @test_v2f64_f128_helper(fp128 %p) +define void @test_v2f64_f128(fp128* %p, <2 x double>* %q) { +; CHECK: ext + %1 = load fp128* %p + %2 = fadd fp128 %1, %1 + %3 = call <2 x double> @test_v2f64_f128_helper(fp128 %2) + %4 = fadd <2 x double> %3, %3 + store <2 x double> %4, <2 x double>* %q + ret void +} + +; CHECK-LABEL: test_v2f64_v2i64: +declare <2 x double> @test_v2f64_v2i64_helper(<2 x i64> %p) +define void @test_v2f64_v2i64(<2 x i64>* %p, <2 x double>* %q) { +; CHECK: ext +; CHECK: ext + %1 = load <2 x i64>* %p + %2 = add <2 x i64> %1, %1 + %3 = call <2 x double> @test_v2f64_v2i64_helper(<2 x i64> %2) + %4 = fadd <2 x double> %3, %3 + store <2 x double> %4, <2 x double>* %q + ret void +} + +; CHECK-LABEL: test_v2f64_v4f32: +declare <2 x double> @test_v2f64_v4f32_helper(<4 x float> %p) +define void @test_v2f64_v4f32(<4 x float>* %p, <2 x double>* %q) { +; CHECK: rev64 v{{[0-9]+}}.4s +; CHECK: ext +; CHECK: ext + %1 = load <4 x float>* %p + %2 = fadd <4 x float> %1, %1 + %3 = call <2 x double> @test_v2f64_v4f32_helper(<4 x float> %2) + %4 = fadd <2 x double> %3, %3 + store <2 x double> %4, <2 x double>* %q + ret void +} + +; CHECK-LABEL: test_v2f64_v4i32: +declare <2 x double> @test_v2f64_v4i32_helper(<4 x i32> %p) +define void 
@test_v2f64_v4i32(<4 x i32>* %p, <2 x double>* %q) { +; CHECK: rev64 v{{[0-9]+}}.4s +; CHECK: ext +; CHECK: ext + %1 = load <4 x i32>* %p + %2 = add <4 x i32> %1, %1 + %3 = call <2 x double> @test_v2f64_v4i32_helper(<4 x i32> %2) + %4 = fadd <2 x double> %3, %3 + store <2 x double> %4, <2 x double>* %q + ret void +} + +; CHECK-LABEL: test_v2f64_v8i16: +declare <2 x double> @test_v2f64_v8i16_helper(<8 x i16> %p) +define void @test_v2f64_v8i16(<8 x i16>* %p, <2 x double>* %q) { +; CHECK: rev64 v{{[0-9]+}}.8h +; CHECK: ext +; CHECK: ext + %1 = load <8 x i16>* %p + %2 = add <8 x i16> %1, %1 + %3 = call <2 x double> @test_v2f64_v8i16_helper(<8 x i16> %2) + %4 = fadd <2 x double> %3, %3 + store <2 x double> %4, <2 x double>* %q + ret void +} + +; CHECK-LABEL: test_v2f64_v16i8: +declare <2 x double> @test_v2f64_v16i8_helper(<16 x i8> %p) +define void @test_v2f64_v16i8(<16 x i8>* %p, <2 x double>* %q) { +; CHECK: rev64 v{{[0-9]+}}.16b +; CHECK: ext +; CHECK: ext + %1 = load <16 x i8>* %p + %2 = add <16 x i8> %1, %1 + %3 = call <2 x double> @test_v2f64_v16i8_helper(<16 x i8> %2) + %4 = fadd <2 x double> %3, %3 + store <2 x double> %4, <2 x double>* %q + ret void +} + +; CHECK-LABEL: test_v2i64_f128: +declare <2 x i64> @test_v2i64_f128_helper(fp128 %p) +define void @test_v2i64_f128(fp128* %p, <2 x i64>* %q) { +; CHECK: ext + %1 = load fp128* %p + %2 = fadd fp128 %1, %1 + %3 = call <2 x i64> @test_v2i64_f128_helper(fp128 %2) + %4 = add <2 x i64> %3, %3 + store <2 x i64> %4, <2 x i64>* %q + ret void +} + +; CHECK-LABEL: test_v2i64_v2f64: +declare <2 x i64> @test_v2i64_v2f64_helper(<2 x double> %p) +define void @test_v2i64_v2f64(<2 x double>* %p, <2 x i64>* %q) { +; CHECK: ext +; CHECK: ext + %1 = load <2 x double>* %p + %2 = fadd <2 x double> %1, %1 + %3 = call <2 x i64> @test_v2i64_v2f64_helper(<2 x double> %2) + %4 = add <2 x i64> %3, %3 + store <2 x i64> %4, <2 x i64>* %q + ret void +} + +; CHECK-LABEL: test_v2i64_v4f32: +declare <2 x i64> @test_v2i64_v4f32_helper(<4 x float> %p) +define void @test_v2i64_v4f32(<4 x float>* %p, <2 x i64>* %q) { +; CHECK: rev64 v{{[0-9]+}}.4s +; CHECK: ext +; CHECK: ext + %1 = load <4 x float>* %p + %2 = fadd <4 x float> %1, %1 + %3 = call <2 x i64> @test_v2i64_v4f32_helper(<4 x float> %2) + %4 = add <2 x i64> %3, %3 + store <2 x i64> %4, <2 x i64>* %q + ret void +} + +; CHECK-LABEL: test_v2i64_v4i32: +declare <2 x i64> @test_v2i64_v4i32_helper(<4 x i32> %p) +define void @test_v2i64_v4i32(<4 x i32>* %p, <2 x i64>* %q) { +; CHECK: rev64 v{{[0-9]+}}.4s +; CHECK: ext +; CHECK: ext + %1 = load <4 x i32>* %p + %2 = add <4 x i32> %1, %1 + %3 = call <2 x i64> @test_v2i64_v4i32_helper(<4 x i32> %2) + %4 = add <2 x i64> %3, %3 + store <2 x i64> %4, <2 x i64>* %q + ret void +} + +; CHECK-LABEL: test_v2i64_v8i16: +declare <2 x i64> @test_v2i64_v8i16_helper(<8 x i16> %p) +define void @test_v2i64_v8i16(<8 x i16>* %p, <2 x i64>* %q) { +; CHECK: rev64 v{{[0-9]+}}.8h +; CHECK: ext +; CHECK: ext + %1 = load <8 x i16>* %p + %2 = add <8 x i16> %1, %1 + %3 = call <2 x i64> @test_v2i64_v8i16_helper(<8 x i16> %2) + %4 = add <2 x i64> %3, %3 + store <2 x i64> %4, <2 x i64>* %q + ret void +} + +; CHECK-LABEL: test_v2i64_v16i8: +declare <2 x i64> @test_v2i64_v16i8_helper(<16 x i8> %p) +define void @test_v2i64_v16i8(<16 x i8>* %p, <2 x i64>* %q) { +; CHECK: rev64 v{{[0-9]+}}.16b +; CHECK: ext +; CHECK: ext + %1 = load <16 x i8>* %p + %2 = add <16 x i8> %1, %1 + %3 = call <2 x i64> @test_v2i64_v16i8_helper(<16 x i8> %2) + %4 = add <2 x i64> %3, %3 + store <2 x i64> %4, <2 x i64>* %q + ret void 
+} + +; CHECK-LABEL: test_v4f32_f128: +declare <4 x float> @test_v4f32_f128_helper(fp128 %p) +define void @test_v4f32_f128(fp128* %p, <4 x float>* %q) { +; CHECK: rev64 v{{[0-9]+}}.4s +; CHECK: ext + %1 = load fp128* %p + %2 = fadd fp128 %1, %1 + %3 = call <4 x float> @test_v4f32_f128_helper(fp128 %2) + %4 = fadd <4 x float> %3, %3 + store <4 x float> %4, <4 x float>* %q + ret void +} + +; CHECK-LABEL: test_v4f32_v2f64: +declare <4 x float> @test_v4f32_v2f64_helper(<2 x double> %p) +define void @test_v4f32_v2f64(<2 x double>* %p, <4 x float>* %q) { +; CHECK: ext +; CHECK: rev64 v{{[0-9]+}}.4s +; CHECK: ext + %1 = load <2 x double>* %p + %2 = fadd <2 x double> %1, %1 + %3 = call <4 x float> @test_v4f32_v2f64_helper(<2 x double> %2) + %4 = fadd <4 x float> %3, %3 + store <4 x float> %4, <4 x float>* %q + ret void +} + +; CHECK-LABEL: test_v4f32_v2i64: +declare <4 x float> @test_v4f32_v2i64_helper(<2 x i64> %p) +define void @test_v4f32_v2i64(<2 x i64>* %p, <4 x float>* %q) { +; CHECK: ext +; CHECK: rev64 v{{[0-9]+}}.4s +; CHECK: ext + %1 = load <2 x i64>* %p + %2 = add <2 x i64> %1, %1 + %3 = call <4 x float> @test_v4f32_v2i64_helper(<2 x i64> %2) + %4 = fadd <4 x float> %3, %3 + store <4 x float> %4, <4 x float>* %q + ret void +} + +; CHECK-LABEL: test_v4f32_v4i32: +declare <4 x float> @test_v4f32_v4i32_helper(<4 x i32> %p) +define void @test_v4f32_v4i32(<4 x i32>* %p, <4 x float>* %q) { +; CHECK: rev64 v{{[0-9]+}}.4s +; CHECK: ext +; CHECK: rev64 v{{[0-9]+}}.4s +; CHECK: ext + %1 = load <4 x i32>* %p + %2 = add <4 x i32> %1, %1 + %3 = call <4 x float> @test_v4f32_v4i32_helper(<4 x i32> %2) + %4 = fadd <4 x float> %3, %3 + store <4 x float> %4, <4 x float>* %q + ret void +} + +; CHECK-LABEL: test_v4f32_v8i16: +declare <4 x float> @test_v4f32_v8i16_helper(<8 x i16> %p) +define void @test_v4f32_v8i16(<8 x i16>* %p, <4 x float>* %q) { +; CHECK: rev64 v{{[0-9]+}}.8h +; CHECK: ext +; CHECK: rev64 v{{[0-9]+}}.4s +; CHECK: ext + %1 = load <8 x i16>* %p + %2 = add <8 x i16> %1, %1 + %3 = call <4 x float> @test_v4f32_v8i16_helper(<8 x i16> %2) + %4 = fadd <4 x float> %3, %3 + store <4 x float> %4, <4 x float>* %q + ret void +} + +; CHECK-LABEL: test_v4f32_v16i8: +declare <4 x float> @test_v4f32_v16i8_helper(<16 x i8> %p) +define void @test_v4f32_v16i8(<16 x i8>* %p, <4 x float>* %q) { +; CHECK: rev64 v{{[0-9]+}}.16b +; CHECK: ext +; CHECK: rev64 v{{[0-9]+}}.4s +; CHECK: ext + %1 = load <16 x i8>* %p + %2 = add <16 x i8> %1, %1 + %3 = call <4 x float> @test_v4f32_v16i8_helper(<16 x i8> %2) + %4 = fadd <4 x float> %3, %3 + store <4 x float> %4, <4 x float>* %q + ret void +} + +; CHECK-LABEL: test_v4i32_f128: +declare <4 x i32> @test_v4i32_f128_helper(fp128 %p) +define void @test_v4i32_f128(fp128* %p, <4 x i32>* %q) { +; CHECK: rev64 v{{[0-9]+}}.4s +; CHECK: ext + %1 = load fp128* %p + %2 = fadd fp128 %1, %1 + %3 = call <4 x i32> @test_v4i32_f128_helper(fp128 %2) + %4 = add <4 x i32> %3, %3 + store <4 x i32> %4, <4 x i32>* %q + ret void +} + +; CHECK-LABEL: test_v4i32_v2f64: +declare <4 x i32> @test_v4i32_v2f64_helper(<2 x double> %p) +define void @test_v4i32_v2f64(<2 x double>* %p, <4 x i32>* %q) { +; CHECK: ext +; CHECK: rev64 v{{[0-9]+}}.4s +; CHECK: ext + %1 = load <2 x double>* %p + %2 = fadd <2 x double> %1, %1 + %3 = call <4 x i32> @test_v4i32_v2f64_helper(<2 x double> %2) + %4 = add <4 x i32> %3, %3 + store <4 x i32> %4, <4 x i32>* %q + ret void +} + +; CHECK-LABEL: test_v4i32_v2i64: +declare <4 x i32> @test_v4i32_v2i64_helper(<2 x i64> %p) +define void @test_v4i32_v2i64(<2 x i64>* %p, <4 x 
i32>* %q) { +; CHECK: ext +; CHECK: rev64 v{{[0-9]+}}.4s +; CHECK: ext + %1 = load <2 x i64>* %p + %2 = add <2 x i64> %1, %1 + %3 = call <4 x i32> @test_v4i32_v2i64_helper(<2 x i64> %2) + %4 = add <4 x i32> %3, %3 + store <4 x i32> %4, <4 x i32>* %q + ret void +} + +; CHECK-LABEL: test_v4i32_v4f32: +declare <4 x i32> @test_v4i32_v4f32_helper(<4 x float> %p) +define void @test_v4i32_v4f32(<4 x float>* %p, <4 x i32>* %q) { +; CHECK: rev64 v{{[0-9]+}}.4s +; CHECK: ext +; CHECK: rev64 v{{[0-9]+}}.4s +; CHECK: ext + %1 = load <4 x float>* %p + %2 = fadd <4 x float> %1, %1 + %3 = call <4 x i32> @test_v4i32_v4f32_helper(<4 x float> %2) + %4 = add <4 x i32> %3, %3 + store <4 x i32> %4, <4 x i32>* %q + ret void +} + +; CHECK-LABEL: test_v4i32_v8i16: +declare <4 x i32> @test_v4i32_v8i16_helper(<8 x i16> %p) +define void @test_v4i32_v8i16(<8 x i16>* %p, <4 x i32>* %q) { +; CHECK: rev64 v{{[0-9]+}}.8h +; CHECK: ext +; CHECK: rev64 v{{[0-9]+}}.4s +; CHECK: ext + %1 = load <8 x i16>* %p + %2 = add <8 x i16> %1, %1 + %3 = call <4 x i32> @test_v4i32_v8i16_helper(<8 x i16> %2) + %4 = add <4 x i32> %3, %3 + store <4 x i32> %4, <4 x i32>* %q + ret void +} + +; CHECK-LABEL: test_v4i32_v16i8: +declare <4 x i32> @test_v4i32_v16i8_helper(<16 x i8> %p) +define void @test_v4i32_v16i8(<16 x i8>* %p, <4 x i32>* %q) { +; CHECK: rev64 v{{[0-9]+}}.16b +; CHECK: ext +; CHECK: rev64 v{{[0-9]+}}.4s +; CHECK: ext + %1 = load <16 x i8>* %p + %2 = add <16 x i8> %1, %1 + %3 = call <4 x i32> @test_v4i32_v16i8_helper(<16 x i8> %2) + %4 = add <4 x i32> %3, %3 + store <4 x i32> %4, <4 x i32>* %q + ret void +} + +; CHECK-LABEL: test_v8i16_f128: +declare <8 x i16> @test_v8i16_f128_helper(fp128 %p) +define void @test_v8i16_f128(fp128* %p, <8 x i16>* %q) { +; CHECK: rev64 v{{[0-9]+}}.8h +; CHECK: ext + %1 = load fp128* %p + %2 = fadd fp128 %1, %1 + %3 = call <8 x i16> @test_v8i16_f128_helper(fp128 %2) + %4 = add <8 x i16> %3, %3 + store <8 x i16> %4, <8 x i16>* %q + ret void +} + +; CHECK-LABEL: test_v8i16_v2f64: +declare <8 x i16> @test_v8i16_v2f64_helper(<2 x double> %p) +define void @test_v8i16_v2f64(<2 x double>* %p, <8 x i16>* %q) { +; CHECK: ext +; CHECK: rev64 v{{[0-9]+}}.8h +; CHECK: ext + %1 = load <2 x double>* %p + %2 = fadd <2 x double> %1, %1 + %3 = call <8 x i16> @test_v8i16_v2f64_helper(<2 x double> %2) + %4 = add <8 x i16> %3, %3 + store <8 x i16> %4, <8 x i16>* %q + ret void +} + +; CHECK-LABEL: test_v8i16_v2i64: +declare <8 x i16> @test_v8i16_v2i64_helper(<2 x i64> %p) +define void @test_v8i16_v2i64(<2 x i64>* %p, <8 x i16>* %q) { +; CHECK: ext +; CHECK: rev64 v{{[0-9]+}}.8h +; CHECK: ext + %1 = load <2 x i64>* %p + %2 = add <2 x i64> %1, %1 + %3 = call <8 x i16> @test_v8i16_v2i64_helper(<2 x i64> %2) + %4 = add <8 x i16> %3, %3 + store <8 x i16> %4, <8 x i16>* %q + ret void +} + +; CHECK-LABEL: test_v8i16_v4f32: +declare <8 x i16> @test_v8i16_v4f32_helper(<4 x float> %p) +define void @test_v8i16_v4f32(<4 x float>* %p, <8 x i16>* %q) { +; CHECK: rev64 v{{[0-9]+}}.4s +; CHECK: ext +; CHECK: rev64 v{{[0-9]+}}.8h +; CHECK: ext + %1 = load <4 x float>* %p + %2 = fadd <4 x float> %1, %1 + %3 = call <8 x i16> @test_v8i16_v4f32_helper(<4 x float> %2) + %4 = add <8 x i16> %3, %3 + store <8 x i16> %4, <8 x i16>* %q + ret void +} + +; CHECK-LABEL: test_v8i16_v4i32: +declare <8 x i16> @test_v8i16_v4i32_helper(<4 x i32> %p) +define void @test_v8i16_v4i32(<4 x i32>* %p, <8 x i16>* %q) { +; CHECK: rev64 v{{[0-9]+}}.4s +; CHECK: ext +; CHECK: rev64 v{{[0-9]+}}.8h +; CHECK: ext + %1 = load <4 x i32>* %p + %2 = add <4 x i32> %1, %1 
+ %3 = call <8 x i16> @test_v8i16_v4i32_helper(<4 x i32> %2) + %4 = add <8 x i16> %3, %3 + store <8 x i16> %4, <8 x i16>* %q + ret void +} + +; CHECK-LABEL: test_v8i16_v16i8: +declare <8 x i16> @test_v8i16_v16i8_helper(<16 x i8> %p) +define void @test_v8i16_v16i8(<16 x i8>* %p, <8 x i16>* %q) { +; CHECK: rev64 v{{[0-9]+}}.16b +; CHECK: ext +; CHECK: rev64 v{{[0-9]+}}.8h +; CHECK: ext + %1 = load <16 x i8>* %p + %2 = add <16 x i8> %1, %1 + %3 = call <8 x i16> @test_v8i16_v16i8_helper(<16 x i8> %2) + %4 = add <8 x i16> %3, %3 + store <8 x i16> %4, <8 x i16>* %q + ret void +} + +; CHECK-LABEL: test_v16i8_f128: +declare <16 x i8> @test_v16i8_f128_helper(fp128 %p) +define void @test_v16i8_f128(fp128* %p, <16 x i8>* %q) { +; CHECK: rev64 v{{[0-9]+}}.16b +; CHECK: ext + %1 = load fp128* %p + %2 = fadd fp128 %1, %1 + %3 = call <16 x i8> @test_v16i8_f128_helper(fp128 %2) + %4 = add <16 x i8> %3, %3 + store <16 x i8> %4, <16 x i8>* %q + ret void +} + +; CHECK-LABEL: test_v16i8_v2f64: +declare <16 x i8> @test_v16i8_v2f64_helper(<2 x double> %p) +define void @test_v16i8_v2f64(<2 x double>* %p, <16 x i8>* %q) { +; CHECK: ext +; CHECK: rev64 v{{[0-9]+}}.16b +; CHECK: ext + %1 = load <2 x double>* %p + %2 = fadd <2 x double> %1, %1 + %3 = call <16 x i8> @test_v16i8_v2f64_helper(<2 x double> %2) + %4 = add <16 x i8> %3, %3 + store <16 x i8> %4, <16 x i8>* %q + ret void +} + +; CHECK-LABEL: test_v16i8_v2i64: +declare <16 x i8> @test_v16i8_v2i64_helper(<2 x i64> %p) +define void @test_v16i8_v2i64(<2 x i64>* %p, <16 x i8>* %q) { +; CHECK: ext +; CHECK: rev64 v{{[0-9]+}}.16b +; CHECK: ext + %1 = load <2 x i64>* %p + %2 = add <2 x i64> %1, %1 + %3 = call <16 x i8> @test_v16i8_v2i64_helper(<2 x i64> %2) + %4 = add <16 x i8> %3, %3 + store <16 x i8> %4, <16 x i8>* %q + ret void +} + +; CHECK-LABEL: test_v16i8_v4f32: +declare <16 x i8> @test_v16i8_v4f32_helper(<4 x float> %p) +define void @test_v16i8_v4f32(<4 x float>* %p, <16 x i8>* %q) { +; CHECK: rev64 v{{[0-9]+}}.4s +; CHECK: ext +; CHECK: rev64 v{{[0-9]+}}.16b +; CHECK: ext + %1 = load <4 x float>* %p + %2 = fadd <4 x float> %1, %1 + %3 = call <16 x i8> @test_v16i8_v4f32_helper(<4 x float> %2) + %4 = add <16 x i8> %3, %3 + store <16 x i8> %4, <16 x i8>* %q + ret void +} + +; CHECK-LABEL: test_v16i8_v4i32: +declare <16 x i8> @test_v16i8_v4i32_helper(<4 x i32> %p) +define void @test_v16i8_v4i32(<4 x i32>* %p, <16 x i8>* %q) { +; CHECK: rev64 v{{[0-9]+}}.4s +; CHECK: ext +; CHECK: rev64 v{{[0-9]+}}.16b +; CHECK: ext + %1 = load <4 x i32>* %p + %2 = add <4 x i32> %1, %1 + %3 = call <16 x i8> @test_v16i8_v4i32_helper(<4 x i32> %2) + %4 = add <16 x i8> %3, %3 + store <16 x i8> %4, <16 x i8>* %q + ret void +} + +; CHECK-LABEL: test_v16i8_v8i16: +declare <16 x i8> @test_v16i8_v8i16_helper(<8 x i16> %p) +define void @test_v16i8_v8i16(<8 x i16>* %p, <16 x i8>* %q) { +; CHECK: rev64 v{{[0-9]+}}.8h +; CHECK: ext +; CHECK: rev64 v{{[0-9]+}}.16b +; CHECK: ext + %1 = load <8 x i16>* %p + %2 = add <8 x i16> %1, %1 + %3 = call <16 x i8> @test_v16i8_v8i16_helper(<8 x i16> %2) + %4 = add <16 x i8> %3, %3 + store <16 x i8> %4, <16 x i8>* %q + ret void +} diff --git a/test/CodeGen/AArch64/arm64-big-imm-offsets.ll b/test/CodeGen/AArch64/arm64-big-imm-offsets.ll new file mode 100644 index 00000000000..a56df07a49a --- /dev/null +++ b/test/CodeGen/AArch64/arm64-big-imm-offsets.ll @@ -0,0 +1,14 @@ +; RUN: llc -march=arm64 < %s + + +; Make sure large offsets aren't mistaken for valid immediate offsets. 
+; +define void @f(i32* nocapture %p) { +entry: + %a = ptrtoint i32* %p to i64 + %ao = add i64 %a, 25769803792 + %b = inttoptr i64 %ao to i32* + store volatile i32 0, i32* %b, align 4 + store volatile i32 0, i32* %b, align 4 + ret void +} diff --git a/test/CodeGen/AArch64/arm64-big-stack.ll b/test/CodeGen/AArch64/arm64-big-stack.ll new file mode 100644 index 00000000000..3f91bb3c248 --- /dev/null +++ b/test/CodeGen/AArch64/arm64-big-stack.ll @@ -0,0 +1,21 @@ +; RUN: llc < %s | FileCheck %s +target triple = "arm64-apple-macosx10" + +; Check that big stacks are generated correctly. +; Currently, this is done by a sequence of sub instructions, +; which can encode immediate with a 12 bits mask an optionally +; shift left (up to 12). I.e., 16773120 is the biggest value. +; +; CHECK-LABEL: foo: +; CHECK: sub sp, sp, #4095, lsl #12 +; CHECK: sub sp, sp, #4095, lsl #12 +; CHECK: sub sp, sp, #2, lsl #12 +define void @foo() nounwind ssp { +entry: + %buffer = alloca [33554432 x i8], align 1 + %arraydecay = getelementptr inbounds [33554432 x i8]* %buffer, i64 0, i64 0 + call void @doit(i8* %arraydecay) nounwind + ret void +} + +declare void @doit(i8*) diff --git a/test/CodeGen/AArch64/arm64-bitfield-extract.ll b/test/CodeGen/AArch64/arm64-bitfield-extract.ll new file mode 100644 index 00000000000..112efddd4fa --- /dev/null +++ b/test/CodeGen/AArch64/arm64-bitfield-extract.ll @@ -0,0 +1,532 @@ +; RUN: opt -codegenprepare -mtriple=arm64-apple=ios -S -o - %s | FileCheck --check-prefix=OPT %s +; RUN: llc < %s -march=arm64 | FileCheck %s +%struct.X = type { i8, i8, [2 x i8] } +%struct.Y = type { i32, i8 } +%struct.Z = type { i8, i8, [2 x i8], i16 } +%struct.A = type { i64, i8 } + +define void @foo(%struct.X* nocapture %x, %struct.Y* nocapture %y) nounwind optsize ssp { +; CHECK-LABEL: foo: +; CHECK: ubfx +; CHECK-NOT: and +; CHECK: ret + + %tmp = bitcast %struct.X* %x to i32* + %tmp1 = load i32* %tmp, align 4 + %b = getelementptr inbounds %struct.Y* %y, i64 0, i32 1 + %bf.clear = lshr i32 %tmp1, 3 + %bf.clear.lobit = and i32 %bf.clear, 1 + %frombool = trunc i32 %bf.clear.lobit to i8 + store i8 %frombool, i8* %b, align 1 + ret void +} + +define i32 @baz(i64 %cav1.coerce) nounwind { +; CHECK-LABEL: baz: +; CHECK: sbfx w0, w0, #0, #4 + %tmp = trunc i64 %cav1.coerce to i32 + %tmp1 = shl i32 %tmp, 28 + %bf.val.sext = ashr exact i32 %tmp1, 28 + ret i32 %bf.val.sext +} + +define i32 @bar(i64 %cav1.coerce) nounwind { +; CHECK-LABEL: bar: +; CHECK: sbfx w0, w0, #4, #6 + %tmp = trunc i64 %cav1.coerce to i32 + %cav1.sroa.0.1.insert = shl i32 %tmp, 22 + %tmp1 = ashr i32 %cav1.sroa.0.1.insert, 26 + ret i32 %tmp1 +} + +define void @fct1(%struct.Z* nocapture %x, %struct.A* nocapture %y) nounwind optsize ssp { +; CHECK-LABEL: fct1: +; CHECK: ubfx +; CHECK-NOT: and +; CHECK: ret + + %tmp = bitcast %struct.Z* %x to i64* + %tmp1 = load i64* %tmp, align 4 + %b = getelementptr inbounds %struct.A* %y, i64 0, i32 0 + %bf.clear = lshr i64 %tmp1, 3 + %bf.clear.lobit = and i64 %bf.clear, 1 + store i64 %bf.clear.lobit, i64* %b, align 8 + ret void +} + +define i64 @fct2(i64 %cav1.coerce) nounwind { +; CHECK-LABEL: fct2: +; CHECK: sbfx x0, x0, #0, #36 + %tmp = shl i64 %cav1.coerce, 28 + %bf.val.sext = ashr exact i64 %tmp, 28 + ret i64 %bf.val.sext +} + +define i64 @fct3(i64 %cav1.coerce) nounwind { +; CHECK-LABEL: fct3: +; CHECK: sbfx x0, x0, #4, #38 + %cav1.sroa.0.1.insert = shl i64 %cav1.coerce, 22 + %tmp1 = ashr i64 %cav1.sroa.0.1.insert, 26 + ret i64 %tmp1 +} + +define void @fct4(i64* nocapture %y, i64 %x) nounwind optsize 
inlinehint ssp { +entry: +; CHECK-LABEL: fct4: +; CHECK: ldr [[REG1:x[0-9]+]], +; CHECK-NEXT: bfxil [[REG1]], x1, #16, #24 +; CHECK-NEXT: str [[REG1]], +; CHECK-NEXT: ret + %0 = load i64* %y, align 8 + %and = and i64 %0, -16777216 + %shr = lshr i64 %x, 16 + %and1 = and i64 %shr, 16777215 + %or = or i64 %and, %and1 + store i64 %or, i64* %y, align 8 + ret void +} + +define void @fct5(i32* nocapture %y, i32 %x) nounwind optsize inlinehint ssp { +entry: +; CHECK-LABEL: fct5: +; CHECK: ldr [[REG1:w[0-9]+]], +; CHECK-NEXT: bfxil [[REG1]], w1, #16, #3 +; CHECK-NEXT: str [[REG1]], +; CHECK-NEXT: ret + %0 = load i32* %y, align 8 + %and = and i32 %0, -8 + %shr = lshr i32 %x, 16 + %and1 = and i32 %shr, 7 + %or = or i32 %and, %and1 + store i32 %or, i32* %y, align 8 + ret void +} + +; Check if we can still catch bfm instruction when we drop some low bits +define void @fct6(i32* nocapture %y, i32 %x) nounwind optsize inlinehint ssp { +entry: +; CHECK-LABEL: fct6: +; CHECK: ldr [[REG1:w[0-9]+]], +; CHECK-NEXT: bfxil [[REG1]], w1, #16, #3 +; lsr is an alias of ubfm +; CHECK-NEXT: lsr [[REG2:w[0-9]+]], [[REG1]], #2 +; CHECK-NEXT: str [[REG2]], +; CHECK-NEXT: ret + %0 = load i32* %y, align 8 + %and = and i32 %0, -8 + %shr = lshr i32 %x, 16 + %and1 = and i32 %shr, 7 + %or = or i32 %and, %and1 + %shr1 = lshr i32 %or, 2 + store i32 %shr1, i32* %y, align 8 + ret void +} + + +; Check if we can still catch bfm instruction when we drop some high bits +define void @fct7(i32* nocapture %y, i32 %x) nounwind optsize inlinehint ssp { +entry: +; CHECK-LABEL: fct7: +; CHECK: ldr [[REG1:w[0-9]+]], +; CHECK-NEXT: bfxil [[REG1]], w1, #16, #3 +; lsl is an alias of ubfm +; CHECK-NEXT: lsl [[REG2:w[0-9]+]], [[REG1]], #2 +; CHECK-NEXT: str [[REG2]], +; CHECK-NEXT: ret + %0 = load i32* %y, align 8 + %and = and i32 %0, -8 + %shr = lshr i32 %x, 16 + %and1 = and i32 %shr, 7 + %or = or i32 %and, %and1 + %shl = shl i32 %or, 2 + store i32 %shl, i32* %y, align 8 + ret void +} + + +; Check if we can still catch bfm instruction when we drop some low bits +; (i64 version) +define void @fct8(i64* nocapture %y, i64 %x) nounwind optsize inlinehint ssp { +entry: +; CHECK-LABEL: fct8: +; CHECK: ldr [[REG1:x[0-9]+]], +; CHECK-NEXT: bfxil [[REG1]], x1, #16, #3 +; lsr is an alias of ubfm +; CHECK-NEXT: lsr [[REG2:x[0-9]+]], [[REG1]], #2 +; CHECK-NEXT: str [[REG2]], +; CHECK-NEXT: ret + %0 = load i64* %y, align 8 + %and = and i64 %0, -8 + %shr = lshr i64 %x, 16 + %and1 = and i64 %shr, 7 + %or = or i64 %and, %and1 + %shr1 = lshr i64 %or, 2 + store i64 %shr1, i64* %y, align 8 + ret void +} + + +; Check if we can still catch bfm instruction when we drop some high bits +; (i64 version) +define void @fct9(i64* nocapture %y, i64 %x) nounwind optsize inlinehint ssp { +entry: +; CHECK-LABEL: fct9: +; CHECK: ldr [[REG1:x[0-9]+]], +; CHECK-NEXT: bfxil [[REG1]], x1, #16, #3 +; lsr is an alias of ubfm +; CHECK-NEXT: lsl [[REG2:x[0-9]+]], [[REG1]], #2 +; CHECK-NEXT: str [[REG2]], +; CHECK-NEXT: ret + %0 = load i64* %y, align 8 + %and = and i64 %0, -8 + %shr = lshr i64 %x, 16 + %and1 = and i64 %shr, 7 + %or = or i64 %and, %and1 + %shl = shl i64 %or, 2 + store i64 %shl, i64* %y, align 8 + ret void +} + +; Check if we can catch bfm instruction when lsb is 0 (i.e., no lshr) +; (i32 version) +define void @fct10(i32* nocapture %y, i32 %x) nounwind optsize inlinehint ssp { +entry: +; CHECK-LABEL: fct10: +; CHECK: ldr [[REG1:w[0-9]+]], +; CHECK-NEXT: bfxil [[REG1]], w1, #0, #3 +; lsl is an alias of ubfm +; CHECK-NEXT: lsl [[REG2:w[0-9]+]], [[REG1]], #2 +; CHECK-NEXT: 
str [[REG2]], +; CHECK-NEXT: ret + %0 = load i32* %y, align 8 + %and = and i32 %0, -8 + %and1 = and i32 %x, 7 + %or = or i32 %and, %and1 + %shl = shl i32 %or, 2 + store i32 %shl, i32* %y, align 8 + ret void +} + +; Check if we can catch bfm instruction when lsb is 0 (i.e., no lshr) +; (i64 version) +define void @fct11(i64* nocapture %y, i64 %x) nounwind optsize inlinehint ssp { +entry: +; CHECK-LABEL: fct11: +; CHECK: ldr [[REG1:x[0-9]+]], +; CHECK-NEXT: bfxil [[REG1]], x1, #0, #3 +; lsl is an alias of ubfm +; CHECK-NEXT: lsl [[REG2:x[0-9]+]], [[REG1]], #2 +; CHECK-NEXT: str [[REG2]], +; CHECK-NEXT: ret + %0 = load i64* %y, align 8 + %and = and i64 %0, -8 + %and1 = and i64 %x, 7 + %or = or i64 %and, %and1 + %shl = shl i64 %or, 2 + store i64 %shl, i64* %y, align 8 + ret void +} + +define zeroext i1 @fct12bis(i32 %tmp2) unnamed_addr nounwind ssp align 2 { +; CHECK-LABEL: fct12bis: +; CHECK-NOT: and +; CHECK: ubfx w0, w0, #11, #1 + %and.i.i = and i32 %tmp2, 2048 + %tobool.i.i = icmp ne i32 %and.i.i, 0 + ret i1 %tobool.i.i +} + +; Check if we can still catch bfm instruction when we drop some high bits +; and some low bits +define void @fct12(i32* nocapture %y, i32 %x) nounwind optsize inlinehint ssp { +entry: +; CHECK-LABEL: fct12: +; CHECK: ldr [[REG1:w[0-9]+]], +; CHECK-NEXT: bfxil [[REG1]], w1, #16, #3 +; lsr is an alias of ubfm +; CHECK-NEXT: ubfx [[REG2:w[0-9]+]], [[REG1]], #2, #28 +; CHECK-NEXT: str [[REG2]], +; CHECK-NEXT: ret + %0 = load i32* %y, align 8 + %and = and i32 %0, -8 + %shr = lshr i32 %x, 16 + %and1 = and i32 %shr, 7 + %or = or i32 %and, %and1 + %shl = shl i32 %or, 2 + %shr2 = lshr i32 %shl, 4 + store i32 %shr2, i32* %y, align 8 + ret void +} + +; Check if we can still catch bfm instruction when we drop some high bits +; and some low bits +; (i64 version) +define void @fct13(i64* nocapture %y, i64 %x) nounwind optsize inlinehint ssp { +entry: +; CHECK-LABEL: fct13: +; CHECK: ldr [[REG1:x[0-9]+]], +; CHECK-NEXT: bfxil [[REG1]], x1, #16, #3 +; lsr is an alias of ubfm +; CHECK-NEXT: ubfx [[REG2:x[0-9]+]], [[REG1]], #2, #60 +; CHECK-NEXT: str [[REG2]], +; CHECK-NEXT: ret + %0 = load i64* %y, align 8 + %and = and i64 %0, -8 + %shr = lshr i64 %x, 16 + %and1 = and i64 %shr, 7 + %or = or i64 %and, %and1 + %shl = shl i64 %or, 2 + %shr2 = lshr i64 %shl, 4 + store i64 %shr2, i64* %y, align 8 + ret void +} + + +; Check if we can still catch bfm instruction when we drop some high bits +; and some low bits +define void @fct14(i32* nocapture %y, i32 %x, i32 %x1) nounwind optsize inlinehint ssp { +entry: +; CHECK-LABEL: fct14: +; CHECK: ldr [[REG1:w[0-9]+]], +; CHECK-NEXT: bfxil [[REG1]], w1, #16, #8 +; lsr is an alias of ubfm +; CHECK-NEXT: lsr [[REG2:w[0-9]+]], [[REG1]], #4 +; CHECK-NEXT: bfxil [[REG2]], w2, #5, #3 +; lsl is an alias of ubfm +; CHECK-NEXT: lsl [[REG3:w[0-9]+]], [[REG2]], #2 +; CHECK-NEXT: str [[REG3]], +; CHECK-NEXT: ret + %0 = load i32* %y, align 8 + %and = and i32 %0, -256 + %shr = lshr i32 %x, 16 + %and1 = and i32 %shr, 255 + %or = or i32 %and, %and1 + %shl = lshr i32 %or, 4 + %and2 = and i32 %shl, -8 + %shr1 = lshr i32 %x1, 5 + %and3 = and i32 %shr1, 7 + %or1 = or i32 %and2, %and3 + %shl1 = shl i32 %or1, 2 + store i32 %shl1, i32* %y, align 8 + ret void +} + +; Check if we can still catch bfm instruction when we drop some high bits +; and some low bits +; (i64 version) +define void @fct15(i64* nocapture %y, i64 %x, i64 %x1) nounwind optsize inlinehint ssp { +entry: +; CHECK-LABEL: fct15: +; CHECK: ldr [[REG1:x[0-9]+]], +; CHECK-NEXT: bfxil [[REG1]], x1, #16, #8 +; lsr 
is an alias of ubfm +; CHECK-NEXT: lsr [[REG2:x[0-9]+]], [[REG1]], #4 +; CHECK-NEXT: bfxil [[REG2]], x2, #5, #3 +; lsl is an alias of ubfm +; CHECK-NEXT: lsl [[REG3:x[0-9]+]], [[REG2]], #2 +; CHECK-NEXT: str [[REG3]], +; CHECK-NEXT: ret + %0 = load i64* %y, align 8 + %and = and i64 %0, -256 + %shr = lshr i64 %x, 16 + %and1 = and i64 %shr, 255 + %or = or i64 %and, %and1 + %shl = lshr i64 %or, 4 + %and2 = and i64 %shl, -8 + %shr1 = lshr i64 %x1, 5 + %and3 = and i64 %shr1, 7 + %or1 = or i64 %and2, %and3 + %shl1 = shl i64 %or1, 2 + store i64 %shl1, i64* %y, align 8 + ret void +} + +; Check if we can still catch bfm instruction when we drop some high bits +; and some low bits and a masking operation has to be kept +define void @fct16(i32* nocapture %y, i32 %x) nounwind optsize inlinehint ssp { +entry: +; CHECK-LABEL: fct16: +; CHECK: ldr [[REG1:w[0-9]+]], +; Create the constant +; CHECK: movz [[REGCST:w[0-9]+]], #0x1a, lsl #16 +; CHECK: movk [[REGCST]], #0x8160 +; Do the masking +; CHECK: and [[REG2:w[0-9]+]], [[REG1]], [[REGCST]] +; CHECK-NEXT: bfxil [[REG2]], w1, #16, #3 +; lsr is an alias of ubfm +; CHECK-NEXT: ubfx [[REG3:w[0-9]+]], [[REG2]], #2, #28 +; CHECK-NEXT: str [[REG3]], +; CHECK-NEXT: ret + %0 = load i32* %y, align 8 + %and = and i32 %0, 1737056 + %shr = lshr i32 %x, 16 + %and1 = and i32 %shr, 7 + %or = or i32 %and, %and1 + %shl = shl i32 %or, 2 + %shr2 = lshr i32 %shl, 4 + store i32 %shr2, i32* %y, align 8 + ret void +} + + +; Check if we can still catch bfm instruction when we drop some high bits +; and some low bits and a masking operation has to be kept +; (i64 version) +define void @fct17(i64* nocapture %y, i64 %x) nounwind optsize inlinehint ssp { +entry: +; CHECK-LABEL: fct17: +; CHECK: ldr [[REG1:x[0-9]+]], +; Create the constant +; CHECK: movz w[[REGCST:[0-9]+]], #0x1a, lsl #16 +; CHECK: movk w[[REGCST]], #0x8160 +; Do the masking +; CHECK: and [[REG2:x[0-9]+]], [[REG1]], x[[REGCST]] +; CHECK-NEXT: bfxil [[REG2]], x1, #16, #3 +; lsr is an alias of ubfm +; CHECK-NEXT: ubfx [[REG3:x[0-9]+]], [[REG2]], #2, #60 +; CHECK-NEXT: str [[REG3]], +; CHECK-NEXT: ret + %0 = load i64* %y, align 8 + %and = and i64 %0, 1737056 + %shr = lshr i64 %x, 16 + %and1 = and i64 %shr, 7 + %or = or i64 %and, %and1 + %shl = shl i64 %or, 2 + %shr2 = lshr i64 %shl, 4 + store i64 %shr2, i64* %y, align 8 + ret void +} + +define i64 @fct18(i32 %xor72) nounwind ssp { +; CHECK-LABEL: fct18: +; CHECK: ubfx x0, x0, #9, #8 + %shr81 = lshr i32 %xor72, 9 + %conv82 = zext i32 %shr81 to i64 + %result = and i64 %conv82, 255 + ret i64 %result +} + +; Using the access to the global array to keep the instruction and control flow. 
+@first_ones = external global [65536 x i8] + +; Function Attrs: nounwind readonly ssp +define i32 @fct19(i64 %arg1) nounwind readonly ssp { +; CHECK-LABEL: fct19: +entry: + %x.sroa.1.0.extract.shift = lshr i64 %arg1, 16 + %x.sroa.1.0.extract.trunc = trunc i64 %x.sroa.1.0.extract.shift to i16 + %x.sroa.3.0.extract.shift = lshr i64 %arg1, 32 + %x.sroa.5.0.extract.shift = lshr i64 %arg1, 48 + %tobool = icmp eq i64 %x.sroa.5.0.extract.shift, 0 + br i1 %tobool, label %if.end, label %if.then + +if.then: ; preds = %entry + %arrayidx3 = getelementptr inbounds [65536 x i8]* @first_ones, i64 0, i64 %x.sroa.5.0.extract.shift + %0 = load i8* %arrayidx3, align 1 + %conv = zext i8 %0 to i32 + br label %return + +; OPT-LABEL: if.end +if.end: ; preds = %entry +; OPT: lshr +; CHECK: ubfx [[REG1:x[0-9]+]], [[REG2:x[0-9]+]], #32, #16 + %x.sroa.3.0.extract.trunc = trunc i64 %x.sroa.3.0.extract.shift to i16 + %tobool6 = icmp eq i16 %x.sroa.3.0.extract.trunc, 0 +; CHECK: cbz + br i1 %tobool6, label %if.end13, label %if.then7 + +; OPT-LABEL: if.then7 +if.then7: ; preds = %if.end +; OPT: lshr +; "and" should be combined to "ubfm" while "ubfm" should be removed by cse. +; So neither of them should be in the assemble code. +; CHECK-NOT: and +; CHECK-NOT: ubfm + %idxprom10 = and i64 %x.sroa.3.0.extract.shift, 65535 + %arrayidx11 = getelementptr inbounds [65536 x i8]* @first_ones, i64 0, i64 %idxprom10 + %1 = load i8* %arrayidx11, align 1 + %conv12 = zext i8 %1 to i32 + %add = add nsw i32 %conv12, 16 + br label %return + +; OPT-LABEL: if.end13 +if.end13: ; preds = %if.end +; OPT: lshr +; OPT: trunc +; CHECK: ubfx [[REG3:x[0-9]+]], [[REG4:x[0-9]+]], #16, #16 + %tobool16 = icmp eq i16 %x.sroa.1.0.extract.trunc, 0 +; CHECK: cbz + br i1 %tobool16, label %return, label %if.then17 + +; OPT-LABEL: if.then17 +if.then17: ; preds = %if.end13 +; OPT: lshr +; "and" should be combined to "ubfm" while "ubfm" should be removed by cse. +; So neither of them should be in the assemble code. +; CHECK-NOT: and +; CHECK-NOT: ubfm + %idxprom20 = and i64 %x.sroa.1.0.extract.shift, 65535 + %arrayidx21 = getelementptr inbounds [65536 x i8]* @first_ones, i64 0, i64 %idxprom20 + %2 = load i8* %arrayidx21, align 1 + %conv22 = zext i8 %2 to i32 + %add23 = add nsw i32 %conv22, 32 + br label %return + +return: ; preds = %if.end13, %if.then17, %if.then7, %if.then +; CHECK: ret + %retval.0 = phi i32 [ %conv, %if.then ], [ %add, %if.then7 ], [ %add23, %if.then17 ], [ 64, %if.end13 ] + ret i32 %retval.0 +} + +; Make sure we do not assert if the immediate in and is bigger than i64. +; PR19503. +; OPT-LABEL: @fct20 +; OPT: lshr +; OPT-NOT: lshr +; OPT: ret +; CHECK-LABEL: fct20: +; CHECK: ret +define i80 @fct20(i128 %a, i128 %b) { +entry: + %shr = lshr i128 %a, 18 + %conv = trunc i128 %shr to i80 + %tobool = icmp eq i128 %b, 0 + br i1 %tobool, label %then, label %end +then: + %and = and i128 %shr, 483673642326615442599424 + %conv2 = trunc i128 %and to i80 + br label %end +end: + %conv3 = phi i80 [%conv, %entry], [%conv2, %then] + ret i80 %conv3 +} + +; Check if we can still catch UBFX when "AND" is used by SHL. 
+; CHECK-LABEL: fct21: +; CHECK: ubfx +@arr = external global [8 x [64 x i64]] +define i64 @fct21(i64 %x) { +entry: + %shr = lshr i64 %x, 4 + %and = and i64 %shr, 15 + %arrayidx = getelementptr inbounds [8 x [64 x i64]]* @arr, i64 0, i64 0, i64 %and + %0 = load i64* %arrayidx, align 8 + ret i64 %0 +} + +define i16 @test_ignored_rightbits(i32 %dst, i32 %in) { +; CHECK-LABEL: test_ignored_rightbits: + + %positioned_field = shl i32 %in, 3 + %positioned_masked_field = and i32 %positioned_field, 120 + %masked_dst = and i32 %dst, 7 + %insertion = or i32 %masked_dst, %positioned_masked_field +; CHECK: {{bfm|bfi|bfxil}} + + %shl16 = shl i32 %insertion, 8 + %or18 = or i32 %shl16, %insertion + %conv19 = trunc i32 %or18 to i16 +; CHECK: bfi {{w[0-9]+}}, {{w[0-9]+}}, #8, #7 + + ret i16 %conv19 +} diff --git a/test/CodeGen/AArch64/arm64-blockaddress.ll b/test/CodeGen/AArch64/arm64-blockaddress.ll new file mode 100644 index 00000000000..ac4f19e65df --- /dev/null +++ b/test/CodeGen/AArch64/arm64-blockaddress.ll @@ -0,0 +1,30 @@ +; RUN: llc < %s -mtriple=arm64-apple-ios | FileCheck %s +; RUN: llc < %s -mtriple=arm64-linux-gnu | FileCheck %s --check-prefix=CHECK-LINUX +; RUN: llc < %s -mtriple=arm64-linux-gnu -code-model=large| FileCheck %s --check-prefix=CHECK-LARGE + +; rdar://9188695 + +define i64 @t() nounwind ssp { +entry: +; CHECK-LABEL: t: +; CHECK: adrp [[REG:x[0-9]+]], Ltmp1@PAGE +; CHECK: add {{x[0-9]+}}, [[REG]], Ltmp1@PAGEOFF + +; CHECK-LINUX-LABEL: t: +; CHECK-LINUX: adrp [[REG:x[0-9]+]], .Ltmp1 +; CHECK-LINUX: add {{x[0-9]+}}, [[REG]], :lo12:.Ltmp1 + +; CHECK-LARGE-LABEL: t: +; CHECK-LARGE: movz [[ADDR_REG:x[0-9]+]], #:abs_g3:[[DEST_LBL:.Ltmp[0-9]+]] +; CHECK-LARGE: movk [[ADDR_REG]], #:abs_g2_nc:[[DEST_LBL]] +; CHECK-LARGE: movk [[ADDR_REG]], #:abs_g1_nc:[[DEST_LBL]] +; CHECK-LARGE: movk [[ADDR_REG]], #:abs_g0_nc:[[DEST_LBL]] + + %recover = alloca i64, align 8 + store volatile i64 ptrtoint (i8* blockaddress(@t, %mylabel) to i64), i64* %recover, align 8 + br label %mylabel + +mylabel: + %tmp = load volatile i64* %recover, align 8 + ret i64 %tmp +} diff --git a/test/CodeGen/AArch64/arm64-build-vector.ll b/test/CodeGen/AArch64/arm64-build-vector.ll new file mode 100644 index 00000000000..c109263cedb --- /dev/null +++ b/test/CodeGen/AArch64/arm64-build-vector.ll @@ -0,0 +1,35 @@ +; RUN: llc < %s -march=arm64 -aarch64-neon-syntax=apple | FileCheck %s + +; Check that building up a vector w/ only one non-zero lane initializes +; intelligently. +define void @one_lane(i32* nocapture %out_int, i32 %skip0) nounwind { +; CHECK-LABEL: one_lane: +; CHECK: dup.16b v[[REG:[0-9]+]], wzr +; CHECK-NEXT: ins.b v[[REG]][0], w1 +; v and q are aliases, and str is preferred against st.16b when possible +; rdar://11246289 +; CHECK: str q[[REG]], [x0] +; CHECK: ret + %conv = trunc i32 %skip0 to i8 + %vset_lane = insertelement <16 x i8> , i8 %conv, i32 0 + %tmp = bitcast i32* %out_int to <4 x i32>* + %tmp1 = bitcast <16 x i8> %vset_lane to <4 x i32> + store <4 x i32> %tmp1, <4 x i32>* %tmp, align 16 + ret void +} + +; Check that building a vector from floats doesn't insert an unnecessary +; copy for lane zero. 
+define <4 x float> @foo(float %a, float %b, float %c, float %d) nounwind { +; CHECK-LABEL: foo: +; CHECK-NOT: ins.s v0[0], v0[0] +; CHECK: ins.s v0[1], v1[0] +; CHECK: ins.s v0[2], v2[0] +; CHECK: ins.s v0[3], v3[0] +; CHECK: ret + %1 = insertelement <4 x float> undef, float %a, i32 0 + %2 = insertelement <4 x float> %1, float %b, i32 1 + %3 = insertelement <4 x float> %2, float %c, i32 2 + %4 = insertelement <4 x float> %3, float %d, i32 3 + ret <4 x float> %4 +} diff --git a/test/CodeGen/AArch64/arm64-call-tailcalls.ll b/test/CodeGen/AArch64/arm64-call-tailcalls.ll new file mode 100644 index 00000000000..487c1d9bec3 --- /dev/null +++ b/test/CodeGen/AArch64/arm64-call-tailcalls.ll @@ -0,0 +1,91 @@ +; RUN: llc < %s -mtriple=arm64-apple-ios7.0 | FileCheck %s + +@t = weak global i32 ()* null +@x = external global i32, align 4 + +define void @t2() { +; CHECK-LABEL: t2: +; CHECK: adrp x[[GOTADDR:[0-9]+]], _t@GOTPAGE +; CHECK: ldr x[[ADDR:[0-9]+]], [x[[GOTADDR]], _t@GOTPAGEOFF] +; CHECK: ldr x[[DEST:[0-9]+]], [x[[ADDR]]] +; CHECK: br x[[DEST]] + %tmp = load i32 ()** @t + %tmp.upgrd.2 = tail call i32 %tmp() + ret void +} + +define void @t3() { +; CHECK-LABEL: t3: +; CHECK: b _t2 + tail call void @t2() + ret void +} + +define double @t4(double %a) nounwind readonly ssp { +; CHECK-LABEL: t4: +; CHECK: b _sin + %tmp = tail call double @sin(double %a) nounwind readonly + ret double %tmp +} + +define float @t5(float %a) nounwind readonly ssp { +; CHECK-LABEL: t5: +; CHECK: b _sinf + %tmp = tail call float @sinf(float %a) nounwind readonly + ret float %tmp +} + +define void @t7() nounwind { +; CHECK-LABEL: t7: +; CHECK: b _foo +; CHECK: b _bar + + br i1 undef, label %bb, label %bb1.lr.ph + +bb1.lr.ph: ; preds = %entry + tail call void @bar() nounwind + ret void + +bb: ; preds = %entry + tail call void @foo() nounwind + ret void +} + +define i32 @t8(i32 %x) nounwind ssp { +; CHECK-LABEL: t8: +; CHECK: b _a +; CHECK: b _b +; CHECK: b _c + %and = and i32 %x, 1 + %tobool = icmp eq i32 %and, 0 + br i1 %tobool, label %if.end, label %if.then + +if.then: ; preds = %entry + %call = tail call i32 @a(i32 %x) nounwind + br label %return + +if.end: ; preds = %entry + %and1 = and i32 %x, 2 + %tobool2 = icmp eq i32 %and1, 0 + br i1 %tobool2, label %if.end5, label %if.then3 + +if.then3: ; preds = %if.end + %call4 = tail call i32 @b(i32 %x) nounwind + br label %return + +if.end5: ; preds = %if.end + %call6 = tail call i32 @c(i32 %x) nounwind + br label %return + +return: ; preds = %if.end5, %if.then3, %if.then + %retval.0 = phi i32 [ %call, %if.then ], [ %call4, %if.then3 ], [ %call6, %if.end5 ] + ret i32 %retval.0 +} + +declare float @sinf(float) nounwind readonly +declare double @sin(double) nounwind readonly +declare void @bar() nounwind +declare void @foo() nounwind +declare i32 @a(i32) +declare i32 @b(i32) +declare i32 @c(i32) diff --git a/test/CodeGen/AArch64/arm64-cast-opt.ll b/test/CodeGen/AArch64/arm64-cast-opt.ll new file mode 100644 index 00000000000..65a871d4368 --- /dev/null +++ b/test/CodeGen/AArch64/arm64-cast-opt.ll @@ -0,0 +1,31 @@ +; RUN: llc -O3 -march=arm64 -mtriple arm64-apple-ios5.0.0 < %s | FileCheck %s +; +; Zero truncation is not necessary when the values are extended properly +; already. 
+ +@block = common global i8* null, align 8 + +define zeroext i8 @foo(i32 %i1, i32 %i2) { +; CHECK-LABEL: foo: +; CHECK: cset +; CHECK-NOT: and +entry: + %idxprom = sext i32 %i1 to i64 + %0 = load i8** @block, align 8 + %arrayidx = getelementptr inbounds i8* %0, i64 %idxprom + %1 = load i8* %arrayidx, align 1 + %idxprom1 = sext i32 %i2 to i64 + %arrayidx2 = getelementptr inbounds i8* %0, i64 %idxprom1 + %2 = load i8* %arrayidx2, align 1 + %cmp = icmp eq i8 %1, %2 + br i1 %cmp, label %return, label %if.then + +if.then: ; preds = %entry + %cmp7 = icmp ugt i8 %1, %2 + %conv9 = zext i1 %cmp7 to i8 + br label %return + +return: ; preds = %entry, %if.then + %retval.0 = phi i8 [ %conv9, %if.then ], [ 1, %entry ] + ret i8 %retval.0 +} diff --git a/test/CodeGen/AArch64/arm64-ccmp-heuristics.ll b/test/CodeGen/AArch64/arm64-ccmp-heuristics.ll new file mode 100644 index 00000000000..664a26cafe4 --- /dev/null +++ b/test/CodeGen/AArch64/arm64-ccmp-heuristics.ll @@ -0,0 +1,190 @@ +; RUN: llc < %s -mcpu=cyclone -verify-machineinstrs -aarch64-ccmp | FileCheck %s +target triple = "arm64-apple-ios7.0.0" + +@channelColumns = external global i64 +@channelTracks = external global i64 +@mazeRoute = external hidden unnamed_addr global i8*, align 8 +@TOP = external global i64* +@BOT = external global i64* +@netsAssign = external global i64* + +; Function from yacr2/maze.c +; The branch at the end of %if.then is driven by %cmp5 and %cmp6. +; Isel converts the and i1 into two branches, and arm64-ccmp should not convert +; it back again. %cmp6 has much higher latency than %cmp5. +; CHECK: Maze1 +; CHECK: %if.then +; CHECK: cmp x{{[0-9]+}}, #2 +; CHECK-NEXT b.cc +; CHECK: %if.then +; CHECK: cmp x{{[0-9]+}}, #2 +; CHECK-NEXT b.cc +define i32 @Maze1() nounwind ssp { +entry: + %0 = load i64* @channelColumns, align 8, !tbaa !0 + %cmp90 = icmp eq i64 %0, 0 + br i1 %cmp90, label %for.end, label %for.body + +for.body: ; preds = %for.inc, %entry + %1 = phi i64 [ %0, %entry ], [ %37, %for.inc ] + %i.092 = phi i64 [ 1, %entry ], [ %inc53, %for.inc ] + %numLeft.091 = phi i32 [ 0, %entry ], [ %numLeft.1, %for.inc ] + %2 = load i8** @mazeRoute, align 8, !tbaa !3 + %arrayidx = getelementptr inbounds i8* %2, i64 %i.092 + %3 = load i8* %arrayidx, align 1, !tbaa !1 + %tobool = icmp eq i8 %3, 0 + br i1 %tobool, label %for.inc, label %if.then + +if.then: ; preds = %for.body + %4 = load i64** @TOP, align 8, !tbaa !3 + %arrayidx1 = getelementptr inbounds i64* %4, i64 %i.092 + %5 = load i64* %arrayidx1, align 8, !tbaa !0 + %6 = load i64** @netsAssign, align 8, !tbaa !3 + %arrayidx2 = getelementptr inbounds i64* %6, i64 %5 + %7 = load i64* %arrayidx2, align 8, !tbaa !0 + %8 = load i64** @BOT, align 8, !tbaa !3 + %arrayidx3 = getelementptr inbounds i64* %8, i64 %i.092 + %9 = load i64* %arrayidx3, align 8, !tbaa !0 + %arrayidx4 = getelementptr inbounds i64* %6, i64 %9 + %10 = load i64* %arrayidx4, align 8, !tbaa !0 + %cmp5 = icmp ugt i64 %i.092, 1 + %cmp6 = icmp ugt i64 %10, 1 + %or.cond = and i1 %cmp5, %cmp6 + br i1 %or.cond, label %land.lhs.true7, label %if.else + +land.lhs.true7: ; preds = %if.then + %11 = load i64* @channelTracks, align 8, !tbaa !0 + %add = add i64 %11, 1 + %call = tail call fastcc i32 @Maze1Mech(i64 %i.092, i64 %add, i64 %10, i64 0, i64 %7, i32 -1, i32 -1) + %tobool8 = icmp eq i32 %call, 0 + br i1 %tobool8, label %land.lhs.true7.if.else_crit_edge, label %if.then9 + +land.lhs.true7.if.else_crit_edge: ; preds = %land.lhs.true7 + %.pre = load i64* @channelColumns, align 8, !tbaa !0 + br label %if.else + +if.then9: ; preds = 
%land.lhs.true7 + %12 = load i8** @mazeRoute, align 8, !tbaa !3 + %arrayidx10 = getelementptr inbounds i8* %12, i64 %i.092 + store i8 0, i8* %arrayidx10, align 1, !tbaa !1 + %13 = load i64** @TOP, align 8, !tbaa !3 + %arrayidx11 = getelementptr inbounds i64* %13, i64 %i.092 + %14 = load i64* %arrayidx11, align 8, !tbaa !0 + tail call fastcc void @CleanNet(i64 %14) + %15 = load i64** @BOT, align 8, !tbaa !3 + %arrayidx12 = getelementptr inbounds i64* %15, i64 %i.092 + %16 = load i64* %arrayidx12, align 8, !tbaa !0 + tail call fastcc void @CleanNet(i64 %16) + br label %for.inc + +if.else: ; preds = %land.lhs.true7.if.else_crit_edge, %if.then + %17 = phi i64 [ %.pre, %land.lhs.true7.if.else_crit_edge ], [ %1, %if.then ] + %cmp13 = icmp ult i64 %i.092, %17 + %or.cond89 = and i1 %cmp13, %cmp6 + br i1 %or.cond89, label %land.lhs.true16, label %if.else24 + +land.lhs.true16: ; preds = %if.else + %18 = load i64* @channelTracks, align 8, !tbaa !0 + %add17 = add i64 %18, 1 + %call18 = tail call fastcc i32 @Maze1Mech(i64 %i.092, i64 %add17, i64 %10, i64 0, i64 %7, i32 1, i32 -1) + %tobool19 = icmp eq i32 %call18, 0 + br i1 %tobool19, label %if.else24, label %if.then20 + +if.then20: ; preds = %land.lhs.true16 + %19 = load i8** @mazeRoute, align 8, !tbaa !3 + %arrayidx21 = getelementptr inbounds i8* %19, i64 %i.092 + store i8 0, i8* %arrayidx21, align 1, !tbaa !1 + %20 = load i64** @TOP, align 8, !tbaa !3 + %arrayidx22 = getelementptr inbounds i64* %20, i64 %i.092 + %21 = load i64* %arrayidx22, align 8, !tbaa !0 + tail call fastcc void @CleanNet(i64 %21) + %22 = load i64** @BOT, align 8, !tbaa !3 + %arrayidx23 = getelementptr inbounds i64* %22, i64 %i.092 + %23 = load i64* %arrayidx23, align 8, !tbaa !0 + tail call fastcc void @CleanNet(i64 %23) + br label %for.inc + +if.else24: ; preds = %land.lhs.true16, %if.else + br i1 %cmp5, label %land.lhs.true26, label %if.else36 + +land.lhs.true26: ; preds = %if.else24 + %24 = load i64* @channelTracks, align 8, !tbaa !0 + %cmp27 = icmp ult i64 %7, %24 + br i1 %cmp27, label %land.lhs.true28, label %if.else36 + +land.lhs.true28: ; preds = %land.lhs.true26 + %add29 = add i64 %24, 1 + %call30 = tail call fastcc i32 @Maze1Mech(i64 %i.092, i64 0, i64 %7, i64 %add29, i64 %10, i32 -1, i32 1) + %tobool31 = icmp eq i32 %call30, 0 + br i1 %tobool31, label %if.else36, label %if.then32 + +if.then32: ; preds = %land.lhs.true28 + %25 = load i8** @mazeRoute, align 8, !tbaa !3 + %arrayidx33 = getelementptr inbounds i8* %25, i64 %i.092 + store i8 0, i8* %arrayidx33, align 1, !tbaa !1 + %26 = load i64** @TOP, align 8, !tbaa !3 + %arrayidx34 = getelementptr inbounds i64* %26, i64 %i.092 + %27 = load i64* %arrayidx34, align 8, !tbaa !0 + tail call fastcc void @CleanNet(i64 %27) + %28 = load i64** @BOT, align 8, !tbaa !3 + %arrayidx35 = getelementptr inbounds i64* %28, i64 %i.092 + %29 = load i64* %arrayidx35, align 8, !tbaa !0 + tail call fastcc void @CleanNet(i64 %29) + br label %for.inc + +if.else36: ; preds = %land.lhs.true28, %land.lhs.true26, %if.else24 + %30 = load i64* @channelColumns, align 8, !tbaa !0 + %cmp37 = icmp ult i64 %i.092, %30 + br i1 %cmp37, label %land.lhs.true38, label %if.else48 + +land.lhs.true38: ; preds = %if.else36 + %31 = load i64* @channelTracks, align 8, !tbaa !0 + %cmp39 = icmp ult i64 %7, %31 + br i1 %cmp39, label %land.lhs.true40, label %if.else48 + +land.lhs.true40: ; preds = %land.lhs.true38 + %add41 = add i64 %31, 1 + %call42 = tail call fastcc i32 @Maze1Mech(i64 %i.092, i64 0, i64 %7, i64 %add41, i64 %10, i32 1, i32 1) + %tobool43 = icmp eq i32 
%call42, 0 + br i1 %tobool43, label %if.else48, label %if.then44 + +if.then44: ; preds = %land.lhs.true40 + %32 = load i8** @mazeRoute, align 8, !tbaa !3 + %arrayidx45 = getelementptr inbounds i8* %32, i64 %i.092 + store i8 0, i8* %arrayidx45, align 1, !tbaa !1 + %33 = load i64** @TOP, align 8, !tbaa !3 + %arrayidx46 = getelementptr inbounds i64* %33, i64 %i.092 + %34 = load i64* %arrayidx46, align 8, !tbaa !0 + tail call fastcc void @CleanNet(i64 %34) + %35 = load i64** @BOT, align 8, !tbaa !3 + %arrayidx47 = getelementptr inbounds i64* %35, i64 %i.092 + %36 = load i64* %arrayidx47, align 8, !tbaa !0 + tail call fastcc void @CleanNet(i64 %36) + br label %for.inc + +if.else48: ; preds = %land.lhs.true40, %land.lhs.true38, %if.else36 + %inc = add nsw i32 %numLeft.091, 1 + br label %for.inc + +for.inc: ; preds = %if.else48, %if.then44, %if.then32, %if.then20, %if.then9, %for.body + %numLeft.1 = phi i32 [ %numLeft.091, %if.then9 ], [ %numLeft.091, %if.then20 ], [ %numLeft.091, %if.then32 ], [ %numLeft.091, %if.then44 ], [ %inc, %if.else48 ], [ %numLeft.091, %for.body ] + %inc53 = add i64 %i.092, 1 + %37 = load i64* @channelColumns, align 8, !tbaa !0 + %cmp = icmp ugt i64 %inc53, %37 + br i1 %cmp, label %for.end, label %for.body + +for.end: ; preds = %for.inc, %entry + %numLeft.0.lcssa = phi i32 [ 0, %entry ], [ %numLeft.1, %for.inc ] + ret i32 %numLeft.0.lcssa +} + +; Materializable +declare hidden fastcc i32 @Maze1Mech(i64, i64, i64, i64, i64, i32, i32) nounwind ssp + +; Materializable +declare hidden fastcc void @CleanNet(i64) nounwind ssp + +!0 = metadata !{metadata !"long", metadata !1} +!1 = metadata !{metadata !"omnipotent char", metadata !2} +!2 = metadata !{metadata !"Simple C/C++ TBAA"} +!3 = metadata !{metadata !"any pointer", metadata !1} diff --git a/test/CodeGen/AArch64/arm64-ccmp.ll b/test/CodeGen/AArch64/arm64-ccmp.ll new file mode 100644 index 00000000000..63965f9538b --- /dev/null +++ b/test/CodeGen/AArch64/arm64-ccmp.ll @@ -0,0 +1,289 @@ +; RUN: llc < %s -mcpu=cyclone -verify-machineinstrs -aarch64-ccmp -aarch64-stress-ccmp | FileCheck %s +target triple = "arm64-apple-ios" + +; CHECK: single_same +; CHECK: cmp w0, #5 +; CHECK-NEXT: ccmp w1, #17, #4, ne +; CHECK-NEXT: b.ne +; CHECK: %if.then +; CHECK: bl _foo +; CHECK: %if.end +define i32 @single_same(i32 %a, i32 %b) nounwind ssp { +entry: + %cmp = icmp eq i32 %a, 5 + %cmp1 = icmp eq i32 %b, 17 + %or.cond = or i1 %cmp, %cmp1 + br i1 %or.cond, label %if.then, label %if.end + +if.then: + %call = tail call i32 @foo() nounwind + br label %if.end + +if.end: + ret i32 7 +} + +; Different condition codes for the two compares. +; CHECK: single_different +; CHECK: cmp w0, #6 +; CHECK-NEXT: ccmp w1, #17, #0, ge +; CHECK-NEXT: b.eq +; CHECK: %if.then +; CHECK: bl _foo +; CHECK: %if.end +define i32 @single_different(i32 %a, i32 %b) nounwind ssp { +entry: + %cmp = icmp sle i32 %a, 5 + %cmp1 = icmp ne i32 %b, 17 + %or.cond = or i1 %cmp, %cmp1 + br i1 %or.cond, label %if.then, label %if.end + +if.then: + %call = tail call i32 @foo() nounwind + br label %if.end + +if.end: + ret i32 7 +} + +; Second block clobbers the flags, can't convert (easily). 
+; CHECK: single_flagclobber +; CHECK: cmp +; CHECK: b.eq +; CHECK: cmp +; CHECK: b.gt +define i32 @single_flagclobber(i32 %a, i32 %b) nounwind ssp { +entry: + %cmp = icmp eq i32 %a, 5 + br i1 %cmp, label %if.then, label %lor.lhs.false + +lor.lhs.false: ; preds = %entry + %cmp1 = icmp slt i32 %b, 7 + %mul = shl nsw i32 %b, 1 + %add = add nsw i32 %b, 1 + %cond = select i1 %cmp1, i32 %mul, i32 %add + %cmp2 = icmp slt i32 %cond, 17 + br i1 %cmp2, label %if.then, label %if.end + +if.then: ; preds = %lor.lhs.false, %entry + %call = tail call i32 @foo() nounwind + br label %if.end + +if.end: ; preds = %if.then, %lor.lhs.false + ret i32 7 +} + +; Second block clobbers the flags and ends with a tbz terminator. +; CHECK: single_flagclobber_tbz +; CHECK: cmp +; CHECK: b.eq +; CHECK: cmp +; CHECK: tbz +define i32 @single_flagclobber_tbz(i32 %a, i32 %b) nounwind ssp { +entry: + %cmp = icmp eq i32 %a, 5 + br i1 %cmp, label %if.then, label %lor.lhs.false + +lor.lhs.false: ; preds = %entry + %cmp1 = icmp slt i32 %b, 7 + %mul = shl nsw i32 %b, 1 + %add = add nsw i32 %b, 1 + %cond = select i1 %cmp1, i32 %mul, i32 %add + %and = and i32 %cond, 8 + %cmp2 = icmp ne i32 %and, 0 + br i1 %cmp2, label %if.then, label %if.end + +if.then: ; preds = %lor.lhs.false, %entry + %call = tail call i32 @foo() nounwind + br label %if.end + +if.end: ; preds = %if.then, %lor.lhs.false + ret i32 7 +} + +; Speculatively execute division by zero. +; The sdiv/udiv instructions do not trap when the divisor is zero, so they are +; safe to speculate. +; CHECK: speculate_division +; CHECK-NOT: cmp +; CHECK: sdiv +; CHECK: cmp +; CHECK-NEXT: ccmp +define i32 @speculate_division(i32 %a, i32 %b) nounwind ssp { +entry: + %cmp = icmp sgt i32 %a, 0 + br i1 %cmp, label %land.lhs.true, label %if.end + +land.lhs.true: + %div = sdiv i32 %b, %a + %cmp1 = icmp slt i32 %div, 17 + br i1 %cmp1, label %if.then, label %if.end + +if.then: + %call = tail call i32 @foo() nounwind + br label %if.end + +if.end: + ret i32 7 +} + +; Floating point compare. +; CHECK: single_fcmp +; CHECK: cmp +; CHECK-NOT: b. +; CHECK: fccmp {{.*}}, #8, ge +; CHECK: b.lt +define i32 @single_fcmp(i32 %a, float %b) nounwind ssp { +entry: + %cmp = icmp sgt i32 %a, 0 + br i1 %cmp, label %land.lhs.true, label %if.end + +land.lhs.true: + %conv = sitofp i32 %a to float + %div = fdiv float %b, %conv + %cmp1 = fcmp oge float %div, 1.700000e+01 + br i1 %cmp1, label %if.then, label %if.end + +if.then: + %call = tail call i32 @foo() nounwind + br label %if.end + +if.end: + ret i32 7 +} + +; Chain multiple compares. +; CHECK: multi_different +; CHECK: cmp +; CHECK: ccmp +; CHECK: ccmp +; CHECK: b. +define void @multi_different(i32 %a, i32 %b, i32 %c) nounwind ssp { +entry: + %cmp = icmp sgt i32 %a, %b + br i1 %cmp, label %land.lhs.true, label %if.end + +land.lhs.true: + %div = sdiv i32 %b, %a + %cmp1 = icmp eq i32 %div, 5 + %cmp4 = icmp sgt i32 %div, %c + %or.cond = and i1 %cmp1, %cmp4 + br i1 %or.cond, label %if.then, label %if.end + +if.then: + %call = tail call i32 @foo() nounwind + br label %if.end + +if.end: + ret void +} + +; Convert a cbz in the head block. +; CHECK: cbz_head +; CHECK: cmp w0, #0 +; CHECK: ccmp +define i32 @cbz_head(i32 %a, i32 %b) nounwind ssp { +entry: + %cmp = icmp eq i32 %a, 0 + %cmp1 = icmp ne i32 %b, 17 + %or.cond = or i1 %cmp, %cmp1 + br i1 %or.cond, label %if.then, label %if.end + +if.then: + %call = tail call i32 @foo() nounwind + br label %if.end + +if.end: + ret i32 7 +} + +; Check that the immediate operand is in range. 
The ccmp instruction encodes a +; smaller range of immediates than subs/adds. +; The ccmp immediates must be in the range 0-31. +; CHECK: immediate_range +; CHECK-NOT: ccmp +define i32 @immediate_range(i32 %a, i32 %b) nounwind ssp { +entry: + %cmp = icmp eq i32 %a, 5 + %cmp1 = icmp eq i32 %b, 32 + %or.cond = or i1 %cmp, %cmp1 + br i1 %or.cond, label %if.then, label %if.end + +if.then: + %call = tail call i32 @foo() nounwind + br label %if.end + +if.end: + ret i32 7 +} + +; Convert a cbz in the second block. +; CHECK: cbz_second +; CHECK: cmp w0, #0 +; CHECK: ccmp w1, #0, #0, ne +; CHECK: b.eq +define i32 @cbz_second(i32 %a, i32 %b) nounwind ssp { +entry: + %cmp = icmp eq i32 %a, 0 + %cmp1 = icmp ne i32 %b, 0 + %or.cond = or i1 %cmp, %cmp1 + br i1 %or.cond, label %if.then, label %if.end + +if.then: + %call = tail call i32 @foo() nounwind + br label %if.end + +if.end: + ret i32 7 +} + +; Convert a cbnz in the second block. +; CHECK: cbnz_second +; CHECK: cmp w0, #0 +; CHECK: ccmp w1, #0, #4, ne +; CHECK: b.ne +define i32 @cbnz_second(i32 %a, i32 %b) nounwind ssp { +entry: + %cmp = icmp eq i32 %a, 0 + %cmp1 = icmp eq i32 %b, 0 + %or.cond = or i1 %cmp, %cmp1 + br i1 %or.cond, label %if.then, label %if.end + +if.then: + %call = tail call i32 @foo() nounwind + br label %if.end + +if.end: + ret i32 7 +} +declare i32 @foo() + +%str1 = type { %str2 } +%str2 = type { [24 x i8], i8*, i32, %str1*, i32, [4 x i8], %str1*, %str1*, %str1*, %str1*, %str1*, %str1*, %str1*, %str1*, %str1*, i8*, i8, i8*, %str1*, i8* } + +; Test case distilled from 126.gcc. +; The phi in sw.bb.i.i gets multiple operands for the %entry predecessor. +; CHECK: build_modify_expr +define void @build_modify_expr() nounwind ssp { +entry: + switch i32 undef, label %sw.bb.i.i [ + i32 69, label %if.end85 + i32 70, label %if.end85 + i32 71, label %if.end85 + i32 72, label %if.end85 + i32 73, label %if.end85 + i32 105, label %if.end85 + i32 106, label %if.end85 + ] + +if.end85: + ret void + +sw.bb.i.i: + %ref.tr.i.i = phi %str1* [ %0, %sw.bb.i.i ], [ undef, %entry ] + %operands.i.i = getelementptr inbounds %str1* %ref.tr.i.i, i64 0, i32 0, i32 2 + %arrayidx.i.i = bitcast i32* %operands.i.i to %str1** + %0 = load %str1** %arrayidx.i.i, align 8 + %code1.i.i.phi.trans.insert = getelementptr inbounds %str1* %0, i64 0, i32 0, i32 0, i64 16 + br label %sw.bb.i.i +} diff --git a/test/CodeGen/AArch64/arm64-clrsb.ll b/test/CodeGen/AArch64/arm64-clrsb.ll new file mode 100644 index 00000000000..042e52e5e78 --- /dev/null +++ b/test/CodeGen/AArch64/arm64-clrsb.ll @@ -0,0 +1,36 @@ +; RUN: llc < %s -march=arm64 | FileCheck %s + +target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128" +target triple = "arm64-apple-ios7.0.0" + +; Function Attrs: nounwind readnone +declare i32 @llvm.ctlz.i32(i32, i1) #0 +declare i64 @llvm.ctlz.i64(i64, i1) #1 + +; Function Attrs: nounwind ssp +define i32 @clrsb32(i32 %x) #2 { +entry: + %shr = ashr i32 %x, 31 + %xor = xor i32 %shr, %x + %mul = shl i32 %xor, 1 + %add = or i32 %mul, 1 + %0 = tail call i32 @llvm.ctlz.i32(i32 %add, i1 false) + + ret i32 %0 +; CHECK-LABEL: clrsb32 +; CHECK: cls [[TEMP:w[0-9]+]], [[TEMP]] +} + +; Function Attrs: nounwind ssp +define i64 @clrsb64(i64 %x) #3 { +entry: + %shr = ashr i64 %x, 63 + %xor = xor i64 %shr, %x + %mul = shl nsw i64 %xor, 1 + %add = or i64 %mul, 1 + %0 = tail call i64 @llvm.ctlz.i64(i64 %add, i1 false) + + ret i64 %0 +; CHECK-LABEL: clrsb64 +; CHECK: cls [[TEMP:x[0-9]+]], [[TEMP]] +} diff --git a/test/CodeGen/AArch64/arm64-coalesce-ext.ll 
b/test/CodeGen/AArch64/arm64-coalesce-ext.ll new file mode 100644 index 00000000000..9420bf3bb59 --- /dev/null +++ b/test/CodeGen/AArch64/arm64-coalesce-ext.ll @@ -0,0 +1,17 @@ +; RUN: llc -march=arm64 -mtriple=arm64-apple-darwin < %s | FileCheck %s +; Check that the peephole optimizer knows about sext and zext instructions. +; CHECK: test1sext +define i32 @test1sext(i64 %A, i64 %B, i32* %P, i64 *%P2) nounwind { + %C = add i64 %A, %B + ; CHECK: add x[[SUM:[0-9]+]], x0, x1 + %D = trunc i64 %C to i32 + %E = shl i64 %C, 32 + %F = ashr i64 %E, 32 + ; CHECK: sxtw x[[EXT:[0-9]+]], w[[SUM]] + store volatile i64 %F, i64 *%P2 + ; CHECK: str x[[EXT]] + store volatile i32 %D, i32* %P + ; Reuse low bits of extended register, don't extend live range of SUM. + ; CHECK: str w[[SUM]] + ret i32 %D +} diff --git a/test/CodeGen/AArch64/arm64-code-model-large-abs.ll b/test/CodeGen/AArch64/arm64-code-model-large-abs.ll new file mode 100644 index 00000000000..264da2da25b --- /dev/null +++ b/test/CodeGen/AArch64/arm64-code-model-large-abs.ll @@ -0,0 +1,72 @@ +; RUN: llc -mtriple=arm64-none-linux-gnu -code-model=large < %s | FileCheck %s + +@var8 = global i8 0 +@var16 = global i16 0 +@var32 = global i32 0 +@var64 = global i64 0 + +define i8* @global_addr() { +; CHECK-LABEL: global_addr: + ret i8* @var8 + ; The movz/movk calculation should end up returned directly in x0. +; CHECK: movz x0, #:abs_g3:var8 +; CHECK: movk x0, #:abs_g2_nc:var8 +; CHECK: movk x0, #:abs_g1_nc:var8 +; CHECK: movk x0, #:abs_g0_nc:var8 +; CHECK-NEXT: ret +} + +define i8 @global_i8() { +; CHECK-LABEL: global_i8: + %val = load i8* @var8 + ret i8 %val +; CHECK: movz x[[ADDR_REG:[0-9]+]], #:abs_g3:var8 +; CHECK: movk x[[ADDR_REG]], #:abs_g2_nc:var8 +; CHECK: movk x[[ADDR_REG]], #:abs_g1_nc:var8 +; CHECK: movk x[[ADDR_REG]], #:abs_g0_nc:var8 +; CHECK: ldrb w0, [x[[ADDR_REG]]] +} + +define i16 @global_i16() { +; CHECK-LABEL: global_i16: + %val = load i16* @var16 + ret i16 %val +; CHECK: movz x[[ADDR_REG:[0-9]+]], #:abs_g3:var16 +; CHECK: movk x[[ADDR_REG]], #:abs_g2_nc:var16 +; CHECK: movk x[[ADDR_REG]], #:abs_g1_nc:var16 +; CHECK: movk x[[ADDR_REG]], #:abs_g0_nc:var16 +; CHECK: ldrh w0, [x[[ADDR_REG]]] +} + +define i32 @global_i32() { +; CHECK-LABEL: global_i32: + %val = load i32* @var32 + ret i32 %val +; CHECK: movz x[[ADDR_REG:[0-9]+]], #:abs_g3:var32 +; CHECK: movk x[[ADDR_REG]], #:abs_g2_nc:var32 +; CHECK: movk x[[ADDR_REG]], #:abs_g1_nc:var32 +; CHECK: movk x[[ADDR_REG]], #:abs_g0_nc:var32 +; CHECK: ldr w0, [x[[ADDR_REG]]] +} + +define i64 @global_i64() { +; CHECK-LABEL: global_i64: + %val = load i64* @var64 + ret i64 %val +; CHECK: movz x[[ADDR_REG:[0-9]+]], #:abs_g3:var64 +; CHECK: movk x[[ADDR_REG]], #:abs_g2_nc:var64 +; CHECK: movk x[[ADDR_REG]], #:abs_g1_nc:var64 +; CHECK: movk x[[ADDR_REG]], #:abs_g0_nc:var64 +; CHECK: ldr x0, [x[[ADDR_REG]]] +} + +define <2 x i64> @constpool() { +; CHECK-LABEL: constpool: + ret <2 x i64> + +; CHECK: movz x[[ADDR_REG:[0-9]+]], #:abs_g3:[[CPADDR:.LCPI[0-9]+_[0-9]+]] +; CHECK: movk x[[ADDR_REG]], #:abs_g2_nc:[[CPADDR]] +; CHECK: movk x[[ADDR_REG]], #:abs_g1_nc:[[CPADDR]] +; CHECK: movk x[[ADDR_REG]], #:abs_g0_nc:[[CPADDR]] +; CHECK: ldr q0, [x[[ADDR_REG]]] +} diff --git a/test/CodeGen/AArch64/arm64-collect-loh-garbage-crash.ll b/test/CodeGen/AArch64/arm64-collect-loh-garbage-crash.ll new file mode 100644 index 00000000000..81cee38420a --- /dev/null +++ b/test/CodeGen/AArch64/arm64-collect-loh-garbage-crash.ll @@ -0,0 +1,37 @@ +; RUN: llc -mtriple=arm64-apple-ios -O3 -aarch64-collect-loh 
-aarch64-collect-loh-bb-only=true -aarch64-collect-loh-pre-collect-register=false < %s -o - | FileCheck %s +; Check that the LOH analysis does not crash when the analysed chain +; contains instructions that are filtered out. +; +; Before the fix for , these cases were removed +; from the main container. Now, the deterministic container does not allow +; to remove arbitrary values, so we have to live with garbage values. +; + +%"class.H4ISP::H4ISPDevice" = type { i32 (%"class.H4ISP::H4ISPDevice"*, i32, i8*, i8*)*, i8*, i32*, %"class.H4ISP::H4ISPCameraManager"* } + +%"class.H4ISP::H4ISPCameraManager" = type opaque + +declare i32 @_ZN5H4ISP11H4ISPDevice32ISP_SelectBestMIPIFrequencyIndexEjPj(%"class.H4ISP::H4ISPDevice"*) + +@pH4ISPDevice = hidden global %"class.H4ISP::H4ISPDevice"* null, align 8 + +; CHECK-LABEL: _foo: +; CHECK: ret +; CHECK-NOT: .loh AdrpLdrGotLdr +define void @foo() { +entry: + br label %if.then83 +if.then83: ; preds = %if.end81 + %tmp = load %"class.H4ISP::H4ISPDevice"** @pH4ISPDevice, align 8 + %call84 = call i32 @_ZN5H4ISP11H4ISPDevice32ISP_SelectBestMIPIFrequencyIndexEjPj(%"class.H4ISP::H4ISPDevice"* %tmp) #19 + tail call void asm sideeffect "", "~{x19},~{x20},~{x21},~{x22},~{x23},~{x24},~{x25},~{x26},~{x27}"() + %tmp2 = load %"class.H4ISP::H4ISPDevice"** @pH4ISPDevice, align 8 + tail call void asm sideeffect "", "~{x19},~{x20},~{x21},~{x22},~{x23},~{x24},~{x25},~{x26},~{x28}"() + %pCameraManager.i268 = getelementptr inbounds %"class.H4ISP::H4ISPDevice"* %tmp2, i64 0, i32 3 + %tmp3 = load %"class.H4ISP::H4ISPCameraManager"** %pCameraManager.i268, align 8 + %tobool.i269 = icmp eq %"class.H4ISP::H4ISPCameraManager"* %tmp3, null + br i1 %tobool.i269, label %if.then83, label %end +end: + ret void +} + diff --git a/test/CodeGen/AArch64/arm64-collect-loh-str.ll b/test/CodeGen/AArch64/arm64-collect-loh-str.ll new file mode 100644 index 00000000000..d7bc00e318f --- /dev/null +++ b/test/CodeGen/AArch64/arm64-collect-loh-str.ll @@ -0,0 +1,23 @@ +; RUN: llc -mtriple=arm64-apple-ios -O2 -aarch64-collect-loh -aarch64-collect-loh-bb-only=false < %s -o - | FileCheck %s +; Test case for . +; AdrpAddStr cannot be used when the store uses the same +; register as address and value. Indeed, the related +; if applied, may completely remove the definition or +; at least provide a wrong one (with the offset folded +; into the definition). 
+ +%struct.anon = type { i32*, i32** } + +@pptp_wan_head = internal global %struct.anon zeroinitializer, align 8 + +; CHECK-LABEL: _pptp_wan_init +; CHECK: ret +; CHECK-NOT: AdrpAddStr +define i32 @pptp_wan_init() { +entry: + store i32* null, i32** getelementptr inbounds (%struct.anon* @pptp_wan_head, i64 0, i32 0), align 8 + store i32** getelementptr inbounds (%struct.anon* @pptp_wan_head, i64 0, i32 0), i32*** getelementptr inbounds (%struct.anon* @pptp_wan_head, i64 0, i32 1), align 8 + ret i32 0 +} + + diff --git a/test/CodeGen/AArch64/arm64-collect-loh.ll b/test/CodeGen/AArch64/arm64-collect-loh.ll new file mode 100644 index 00000000000..6d73daac620 --- /dev/null +++ b/test/CodeGen/AArch64/arm64-collect-loh.ll @@ -0,0 +1,53 @@ +; RUN: llc -mtriple=arm64-apple-ios -O2 -aarch64-collect-loh -aarch64-collect-loh-bb-only=false < %s -o - | FileCheck %s +; RUN: llc -mtriple=arm64-linux-gnu -O2 -aarch64-collect-loh -aarch64-collect-loh-bb-only=false < %s -o - | FileCheck %s --check-prefix=CHECK-ELF + +; CHECK-ELF-NOT: .loh +; CHECK-ELF-NOT: AdrpAdrp +; CHECK-ELF-NOT: AdrpAdd +; CHECK-ELF-NOT: AdrpLdrGot + +@a = internal unnamed_addr global i32 0, align 4 +@b = external global i32 + +; Function Attrs: noinline nounwind ssp +define void @foo(i32 %t) { +entry: + %tmp = load i32* @a, align 4 + %add = add nsw i32 %tmp, %t + store i32 %add, i32* @a, align 4 + ret void +} + +; Function Attrs: nounwind ssp +; Testcase for , AdrpAdrp reuse is valid only when the first adrp +; dominates the second. +; The first adrp comes from the loading of 'a' and the second from the loading of 'b'. +; 'a' is loaded in if.then, 'b' in if.end4, if.then does not dominate if.end4. +; CHECK-LABEL: _test +; CHECK: ret +; CHECK-NOT: .loh AdrpAdrp +define i32 @test(i32 %t) { +entry: + %cmp = icmp sgt i32 %t, 5 + br i1 %cmp, label %if.then, label %if.end4 + +if.then: ; preds = %entry + %tmp = load i32* @a, align 4 + %add = add nsw i32 %tmp, %t + %cmp1 = icmp sgt i32 %add, 12 + br i1 %cmp1, label %if.then2, label %if.end4 + +if.then2: ; preds = %if.then + tail call void @foo(i32 %add) + %tmp1 = load i32* @a, align 4 + br label %if.end4 + +if.end4: ; preds = %if.then2, %if.then, %entry + %t.addr.0 = phi i32 [ %tmp1, %if.then2 ], [ %t, %if.then ], [ %t, %entry ] + %tmp2 = load i32* @b, align 4 + %add5 = add nsw i32 %tmp2, %t.addr.0 + tail call void @foo(i32 %add5) + %tmp3 = load i32* @b, align 4 + %add6 = add nsw i32 %tmp3, %t.addr.0 + ret i32 %add6 +} diff --git a/test/CodeGen/AArch64/arm64-complex-copy-noneon.ll b/test/CodeGen/AArch64/arm64-complex-copy-noneon.ll new file mode 100644 index 00000000000..f65b1161282 --- /dev/null +++ b/test/CodeGen/AArch64/arm64-complex-copy-noneon.ll @@ -0,0 +1,21 @@ +; RUN: llc -mtriple=arm64-none-linux-gnu -mattr=-neon < %s + +; The DAG combiner decided to use a vector load/store for this struct copy +; previously. This probably shouldn't happen without NEON, but the most +; important thing is that it compiles. 
+ +define void @store_combine() nounwind { + %src = alloca { double, double }, align 8 + %dst = alloca { double, double }, align 8 + + %src.realp = getelementptr inbounds { double, double }* %src, i32 0, i32 0 + %src.real = load double* %src.realp + %src.imagp = getelementptr inbounds { double, double }* %src, i32 0, i32 1 + %src.imag = load double* %src.imagp + + %dst.realp = getelementptr inbounds { double, double }* %dst, i32 0, i32 0 + %dst.imagp = getelementptr inbounds { double, double }* %dst, i32 0, i32 1 + store double %src.real, double* %dst.realp + store double %src.imag, double* %dst.imagp + ret void +} diff --git a/test/CodeGen/AArch64/arm64-complex-ret.ll b/test/CodeGen/AArch64/arm64-complex-ret.ll new file mode 100644 index 00000000000..93d50a59861 --- /dev/null +++ b/test/CodeGen/AArch64/arm64-complex-ret.ll @@ -0,0 +1,7 @@ +; RUN: llc -march=arm64 -o - %s | FileCheck %s + +define { i192, i192, i21, i192 } @foo(i192) { +; CHECK-LABEL: foo: +; CHECK: stp xzr, xzr, [x8] + ret { i192, i192, i21, i192 } {i192 0, i192 1, i21 2, i192 3} +} diff --git a/test/CodeGen/AArch64/arm64-const-addr.ll b/test/CodeGen/AArch64/arm64-const-addr.ll new file mode 100644 index 00000000000..c55a9226cc7 --- /dev/null +++ b/test/CodeGen/AArch64/arm64-const-addr.ll @@ -0,0 +1,23 @@ +; RUN: llc -mtriple=arm64-darwin-unknown < %s | FileCheck %s + +%T = type { i32, i32, i32, i32 } + +; Test if the constant base address gets only materialized once. +define i32 @test1() nounwind { +; CHECK-LABEL: test1 +; CHECK: movz w8, #0x40f, lsl #16 +; CHECK-NEXT: movk w8, #0xc000 +; CHECK-NEXT: ldp w9, w10, [x8, #4] +; CHECK: ldr w8, [x8, #12] + %at = inttoptr i64 68141056 to %T* + %o1 = getelementptr %T* %at, i32 0, i32 1 + %t1 = load i32* %o1 + %o2 = getelementptr %T* %at, i32 0, i32 2 + %t2 = load i32* %o2 + %a1 = add i32 %t1, %t2 + %o3 = getelementptr %T* %at, i32 0, i32 3 + %t3 = load i32* %o3 + %a2 = add i32 %a1, %t3 + ret i32 %a2 +} + diff --git a/test/CodeGen/AArch64/arm64-convert-v2f64-v2i32.ll b/test/CodeGen/AArch64/arm64-convert-v2f64-v2i32.ll new file mode 100644 index 00000000000..d862b1e1943 --- /dev/null +++ b/test/CodeGen/AArch64/arm64-convert-v2f64-v2i32.ll @@ -0,0 +1,24 @@ +; RUN: llc < %s -march=arm64 -aarch64-neon-syntax=apple | FileCheck %s + +; CHECK: fptosi_1 +; CHECK: fcvtzs.2d +; CHECK: xtn.2s +; CHECK: ret +define void @fptosi_1() nounwind noinline ssp { +entry: + %0 = fptosi <2 x double> undef to <2 x i32> + store <2 x i32> %0, <2 x i32>* undef, align 8 + ret void +} + +; CHECK: fptoui_1 +; CHECK: fcvtzu.2d +; CHECK: xtn.2s +; CHECK: ret +define void @fptoui_1() nounwind noinline ssp { +entry: + %0 = fptoui <2 x double> undef to <2 x i32> + store <2 x i32> %0, <2 x i32>* undef, align 8 + ret void +} + diff --git a/test/CodeGen/AArch64/arm64-convert-v2i32-v2f64.ll b/test/CodeGen/AArch64/arm64-convert-v2i32-v2f64.ll new file mode 100644 index 00000000000..daaf1e0f87d --- /dev/null +++ b/test/CodeGen/AArch64/arm64-convert-v2i32-v2f64.ll @@ -0,0 +1,29 @@ +; RUN: llc < %s -march=arm64 -aarch64-neon-syntax=apple | FileCheck %s + +define <2 x double> @f1(<2 x i32> %v) nounwind readnone { +; CHECK-LABEL: f1: +; CHECK: sshll.2d v0, v0, #0 +; CHECK-NEXT: scvtf.2d v0, v0 +; CHECK-NEXT: ret + %conv = sitofp <2 x i32> %v to <2 x double> + ret <2 x double> %conv +} +define <2 x double> @f2(<2 x i32> %v) nounwind readnone { +; CHECK-LABEL: f2: +; CHECK: ushll.2d v0, v0, #0 +; CHECK-NEXT: ucvtf.2d v0, v0 +; CHECK-NEXT: ret + %conv = uitofp <2 x i32> %v to <2 x double> + ret <2 x double> %conv +} + +; 
CHECK: autogen_SD19655 +; CHECK: scvtf +; CHECK: ret +define void @autogen_SD19655() { + %T = load <2 x i64>* undef + %F = sitofp <2 x i64> undef to <2 x float> + store <2 x float> %F, <2 x float>* undef + ret void +} + diff --git a/test/CodeGen/AArch64/arm64-copy-tuple.ll b/test/CodeGen/AArch64/arm64-copy-tuple.ll new file mode 100644 index 00000000000..1803787d729 --- /dev/null +++ b/test/CodeGen/AArch64/arm64-copy-tuple.ll @@ -0,0 +1,146 @@ +; RUN: llc -mtriple=arm64-apple-ios -o - %s | FileCheck %s + +; The main purpose of this test is to find out whether copyPhysReg can deal with +; the memmove-like situation arising in tuples, where an early copy can clobber +; the value needed by a later one if the tuples overlap. + +; We use dummy inline asm to force LLVM to generate a COPY between the registers +; we want by clobbering all the others. + +define void @test_D1D2_from_D0D1(i8* %addr) #0 { +; CHECK-LABEL: test_D1D2_from_D0D1: +; CHECK: mov.8b v2, v1 +; CHECK: mov.8b v1, v0 +entry: + %addr_v8i8 = bitcast i8* %addr to <8 x i8>* + %vec = tail call { <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld2.v8i8.p0v8i8(<8 x i8>* %addr_v8i8) + %vec0 = extractvalue { <8 x i8>, <8 x i8> } %vec, 0 + %vec1 = extractvalue { <8 x i8>, <8 x i8> } %vec, 1 + tail call void asm sideeffect "", "~{v2},~{v3},~{v4},~{v5},~{v6},~{v7},~{v8},~{v9},~{v10},~{v11},~{v12},~{v13},~{v14},~{v15},~{v16},~{v17},~{v18},~{v19},~{v20},~{v21},~{v22},~{v23},~{v24},~{v25},~{v26},~{v27},~{v28},~{v29},~{v30},~{v31}"() + tail call void @llvm.aarch64.neon.st2.v8i8.p0i8(<8 x i8> %vec0, <8 x i8> %vec1, i8* %addr) + + tail call void asm sideeffect "", "~{v0},~{v3},~{v4},~{v5},~{v6},~{v7},~{v8},~{v9},~{v10},~{v11},~{v12},~{v13},~{v14},~{v15},~{v16},~{v17},~{v18},~{v19},~{v20},~{v21},~{v22},~{v23},~{v24},~{v25},~{v26},~{v27},~{v28},~{v29},~{v30},~{v31}"() + tail call void @llvm.aarch64.neon.st2.v8i8.p0i8(<8 x i8> %vec0, <8 x i8> %vec1, i8* %addr) + ret void +} + +define void @test_D0D1_from_D1D2(i8* %addr) #0 { +; CHECK-LABEL: test_D0D1_from_D1D2: +; CHECK: mov.8b v0, v1 +; CHECK: mov.8b v1, v2 +entry: + %addr_v8i8 = bitcast i8* %addr to <8 x i8>* + %vec = tail call { <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld2.v8i8.p0v8i8(<8 x i8>* %addr_v8i8) + %vec0 = extractvalue { <8 x i8>, <8 x i8> } %vec, 0 + %vec1 = extractvalue { <8 x i8>, <8 x i8> } %vec, 1 + tail call void asm sideeffect "", "~{v0},~{v3},~{v4},~{v5},~{v6},~{v7},~{v8},~{v9},~{v10},~{v11},~{v12},~{v13},~{v14},~{v15},~{v16},~{v17},~{v18},~{v19},~{v20},~{v21},~{v22},~{v23},~{v24},~{v25},~{v26},~{v27},~{v28},~{v29},~{v30},~{v31}"() + tail call void @llvm.aarch64.neon.st2.v8i8.p0i8(<8 x i8> %vec0, <8 x i8> %vec1, i8* %addr) + + tail call void asm sideeffect "", "~{v2},~{v3},~{v4},~{v5},~{v6},~{v7},~{v8},~{v9},~{v10},~{v11},~{v12},~{v13},~{v14},~{v15},~{v16},~{v17},~{v18},~{v19},~{v20},~{v21},~{v22},~{v23},~{v24},~{v25},~{v26},~{v27},~{v28},~{v29},~{v30},~{v31}"() + tail call void @llvm.aarch64.neon.st2.v8i8.p0i8(<8 x i8> %vec0, <8 x i8> %vec1, i8* %addr) + ret void +} + +define void @test_D0D1_from_D31D0(i8* %addr) #0 { +; CHECK-LABEL: test_D0D1_from_D31D0: +; CHECK: mov.8b v1, v0 +; CHECK: mov.8b v0, v31 +entry: + %addr_v8i8 = bitcast i8* %addr to <8 x i8>* + %vec = tail call { <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld2.v8i8.p0v8i8(<8 x i8>* %addr_v8i8) + %vec0 = extractvalue { <8 x i8>, <8 x i8> } %vec, 0 + %vec1 = extractvalue { <8 x i8>, <8 x i8> } %vec, 1 + tail call void asm sideeffect "", 
"~{v1},~{v2},~{v3},~{v4},~{v5},~{v6},~{v7},~{v8},~{v9},~{v10},~{v11},~{v12},~{v13},~{v14},~{v15},~{v16},~{v17},~{v18},~{v19},~{v20},~{v21},~{v22},~{v23},~{v24},~{v25},~{v26},~{v27},~{v28},~{v29},~{v30}"() + tail call void @llvm.aarch64.neon.st2.v8i8.p0i8(<8 x i8> %vec0, <8 x i8> %vec1, i8* %addr) + + tail call void asm sideeffect "", "~{v2},~{v3},~{v4},~{v5},~{v6},~{v7},~{v8},~{v9},~{v10},~{v11},~{v12},~{v13},~{v14},~{v15},~{v16},~{v17},~{v18},~{v19},~{v20},~{v21},~{v22},~{v23},~{v24},~{v25},~{v26},~{v27},~{v28},~{v29},~{v30},~{v31}"() + tail call void @llvm.aarch64.neon.st2.v8i8.p0i8(<8 x i8> %vec0, <8 x i8> %vec1, i8* %addr) + ret void +} + +define void @test_D31D0_from_D0D1(i8* %addr) #0 { +; CHECK-LABEL: test_D31D0_from_D0D1: +; CHECK: mov.8b v31, v0 +; CHECK: mov.8b v0, v1 +entry: + %addr_v8i8 = bitcast i8* %addr to <8 x i8>* + %vec = tail call { <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld2.v8i8.p0v8i8(<8 x i8>* %addr_v8i8) + %vec0 = extractvalue { <8 x i8>, <8 x i8> } %vec, 0 + %vec1 = extractvalue { <8 x i8>, <8 x i8> } %vec, 1 + tail call void asm sideeffect "", "~{v2},~{v3},~{v4},~{v5},~{v6},~{v7},~{v8},~{v9},~{v10},~{v11},~{v12},~{v13},~{v14},~{v15},~{v16},~{v17},~{v18},~{v19},~{v20},~{v21},~{v22},~{v23},~{v24},~{v25},~{v26},~{v27},~{v28},~{v29},~{v30},~{v31}"() + tail call void @llvm.aarch64.neon.st2.v8i8.p0i8(<8 x i8> %vec0, <8 x i8> %vec1, i8* %addr) + + tail call void asm sideeffect "", "~{v1},~{v2},~{v3},~{v4},~{v5},~{v6},~{v7},~{v8},~{v9},~{v10},~{v11},~{v12},~{v13},~{v14},~{v15},~{v16},~{v17},~{v18},~{v19},~{v20},~{v21},~{v22},~{v23},~{v24},~{v25},~{v26},~{v27},~{v28},~{v29},~{v30}"() + tail call void @llvm.aarch64.neon.st2.v8i8.p0i8(<8 x i8> %vec0, <8 x i8> %vec1, i8* %addr) + ret void +} + +define void @test_D2D3D4_from_D0D1D2(i8* %addr) #0 { +; CHECK-LABEL: test_D2D3D4_from_D0D1D2: +; CHECK: mov.8b v4, v2 +; CHECK: mov.8b v3, v1 +; CHECK: mov.8b v2, v0 +entry: + %addr_v8i8 = bitcast i8* %addr to <8 x i8>* + %vec = tail call { <8 x i8>, <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld3.v8i8.p0v8i8(<8 x i8>* %addr_v8i8) + %vec0 = extractvalue { <8 x i8>, <8 x i8>, <8 x i8> } %vec, 0 + %vec1 = extractvalue { <8 x i8>, <8 x i8>, <8 x i8> } %vec, 1 + %vec2 = extractvalue { <8 x i8>, <8 x i8>, <8 x i8> } %vec, 2 + + tail call void asm sideeffect "", "~{v3},~{v4},~{v5},~{v6},~{v7},~{v8},~{v9},~{v10},~{v11},~{v12},~{v13},~{v14},~{v15},~{v16},~{v17},~{v18},~{v19},~{v20},~{v21},~{v22},~{v23},~{v24},~{v25},~{v26},~{v27},~{v28},~{v29},~{v30},~{v31}"() + tail call void @llvm.aarch64.neon.st3.v8i8.p0i8(<8 x i8> %vec0, <8 x i8> %vec1, <8 x i8> %vec2, i8* %addr) + + tail call void asm sideeffect "", "~{v0},~{v1},~{v5},~{v6},~{v7},~{v8},~{v9},~{v10},~{v11},~{v12},~{v13},~{v14},~{v15},~{v16},~{v17},~{v18},~{v19},~{v20},~{v21},~{v22},~{v23},~{v24},~{v25},~{v26},~{v27},~{v28},~{v29},~{v30},~{v31}"() + tail call void @llvm.aarch64.neon.st3.v8i8.p0i8(<8 x i8> %vec0, <8 x i8> %vec1, <8 x i8> %vec2, i8* %addr) + ret void +} + +define void @test_Q0Q1Q2_from_Q1Q2Q3(i8* %addr) #0 { +; CHECK-LABEL: test_Q0Q1Q2_from_Q1Q2Q3: +; CHECK: mov.16b v0, v1 +; CHECK: mov.16b v1, v2 +; CHECK: mov.16b v2, v3 +entry: + %addr_v16i8 = bitcast i8* %addr to <16 x i8>* + %vec = tail call { <16 x i8>, <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld3.v16i8.p0v16i8(<16 x i8>* %addr_v16i8) + %vec0 = extractvalue { <16 x i8>, <16 x i8>, <16 x i8> } %vec, 0 + %vec1 = extractvalue { <16 x i8>, <16 x i8>, <16 x i8> } %vec, 1 + %vec2 = extractvalue { <16 x i8>, <16 x i8>, <16 x i8> } %vec, 2 + tail call void asm sideeffect "", 
"~{v0},~{v4},~{v5},~{v6},~{v7},~{v8},~{v9},~{v10},~{v11},~{v12},~{v13},~{v14},~{v15},~{v16},~{v17},~{v18},~{v19},~{v20},~{v21},~{v22},~{v23},~{v24},~{v25},~{v26},~{v27},~{v28},~{v29},~{v30},~{v31}"() + tail call void @llvm.aarch64.neon.st3.v16i8.p0i8(<16 x i8> %vec0, <16 x i8> %vec1, <16 x i8> %vec2, i8* %addr) + + tail call void asm sideeffect "", "~{v3},~{v4},~{v5},~{v6},~{v7},~{v8},~{v9},~{v10},~{v11},~{v12},~{v13},~{v14},~{v15},~{v16},~{v17},~{v18},~{v19},~{v20},~{v21},~{v22},~{v23},~{v24},~{v25},~{v26},~{v27},~{v28},~{v29},~{v30},~{v31}"() + tail call void @llvm.aarch64.neon.st3.v16i8.p0i8(<16 x i8> %vec0, <16 x i8> %vec1, <16 x i8> %vec2, i8* %addr) + ret void +} + +define void @test_Q1Q2Q3Q4_from_Q30Q31Q0Q1(i8* %addr) #0 { +; CHECK-LABEL: test_Q1Q2Q3Q4_from_Q30Q31Q0Q1: +; CHECK: mov.16b v4, v1 +; CHECK: mov.16b v3, v0 +; CHECK: mov.16b v2, v31 +; CHECK: mov.16b v1, v30 + %addr_v16i8 = bitcast i8* %addr to <16 x i8>* + %vec = tail call { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld4.v16i8.p0v16i8(<16 x i8>* %addr_v16i8) + %vec0 = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } %vec, 0 + %vec1 = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } %vec, 1 + %vec2 = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } %vec, 2 + %vec3 = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } %vec, 3 + + tail call void asm sideeffect "", "~{v2},~{v3},~{v4},~{v5},~{v6},~{v7},~{v8},~{v9},~{v10},~{v11},~{v12},~{v13},~{v14},~{v15},~{v16},~{v17},~{v18},~{v19},~{v20},~{v21},~{v22},~{v23},~{v24},~{v25},~{v26},~{v27},~{v28},~{v29}"() + tail call void @llvm.aarch64.neon.st4.v16i8.p0i8(<16 x i8> %vec0, <16 x i8> %vec1, <16 x i8> %vec2, <16 x i8> %vec3, i8* %addr) + + tail call void asm sideeffect "", "~{v0},~{v5},~{v6},~{v7},~{v8},~{v9},~{v10},~{v11},~{v12},~{v13},~{v14},~{v15},~{v16},~{v17},~{v18},~{v19},~{v20},~{v21},~{v22},~{v23},~{v24},~{v25},~{v26},~{v27},~{v28},~{v29},~{v30},~{v31}"() + tail call void @llvm.aarch64.neon.st4.v16i8.p0i8(<16 x i8> %vec0, <16 x i8> %vec1, <16 x i8> %vec2, <16 x i8> %vec3, i8* %addr) + ret void +} + +declare { <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld2.v8i8.p0v8i8(<8 x i8>*) +declare { <8 x i8>, <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld3.v8i8.p0v8i8(<8 x i8>*) +declare { <16 x i8>, <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld3.v16i8.p0v16i8(<16 x i8>*) +declare { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld4.v16i8.p0v16i8(<16 x i8>*) + +declare void @llvm.aarch64.neon.st2.v8i8.p0i8(<8 x i8>, <8 x i8>, i8*) +declare void @llvm.aarch64.neon.st3.v8i8.p0i8(<8 x i8>, <8 x i8>, <8 x i8>, i8*) +declare void @llvm.aarch64.neon.st3.v16i8.p0i8(<16 x i8>, <16 x i8>, <16 x i8>, i8*) +declare void @llvm.aarch64.neon.st4.v16i8.p0i8(<16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>, i8*) diff --git a/test/CodeGen/AArch64/arm64-crc32.ll b/test/CodeGen/AArch64/arm64-crc32.ll new file mode 100644 index 00000000000..d3099e6bb13 --- /dev/null +++ b/test/CodeGen/AArch64/arm64-crc32.ll @@ -0,0 +1,71 @@ +; RUN: llc -march=arm64 -mattr=+crc -o - %s | FileCheck %s + +define i32 @test_crc32b(i32 %cur, i8 %next) { +; CHECK-LABEL: test_crc32b: +; CHECK: crc32b w0, w0, w1 + %bits = zext i8 %next to i32 + %val = call i32 @llvm.aarch64.crc32b(i32 %cur, i32 %bits) + ret i32 %val +} + +define i32 @test_crc32h(i32 %cur, i16 %next) { +; CHECK-LABEL: test_crc32h: +; CHECK: crc32h w0, w0, w1 + %bits = zext i16 %next to i32 + %val = call i32 @llvm.aarch64.crc32h(i32 %cur, i32 %bits) + ret i32 %val +} + +define i32 @test_crc32w(i32 
%cur, i32 %next) { +; CHECK-LABEL: test_crc32w: +; CHECK: crc32w w0, w0, w1 + %val = call i32 @llvm.aarch64.crc32w(i32 %cur, i32 %next) + ret i32 %val +} + +define i32 @test_crc32x(i32 %cur, i64 %next) { +; CHECK-LABEL: test_crc32x: +; CHECK: crc32x w0, w0, x1 + %val = call i32 @llvm.aarch64.crc32x(i32 %cur, i64 %next) + ret i32 %val +} + +define i32 @test_crc32cb(i32 %cur, i8 %next) { +; CHECK-LABEL: test_crc32cb: +; CHECK: crc32cb w0, w0, w1 + %bits = zext i8 %next to i32 + %val = call i32 @llvm.aarch64.crc32cb(i32 %cur, i32 %bits) + ret i32 %val +} + +define i32 @test_crc32ch(i32 %cur, i16 %next) { +; CHECK-LABEL: test_crc32ch: +; CHECK: crc32ch w0, w0, w1 + %bits = zext i16 %next to i32 + %val = call i32 @llvm.aarch64.crc32ch(i32 %cur, i32 %bits) + ret i32 %val +} + +define i32 @test_crc32cw(i32 %cur, i32 %next) { +; CHECK-LABEL: test_crc32cw: +; CHECK: crc32cw w0, w0, w1 + %val = call i32 @llvm.aarch64.crc32cw(i32 %cur, i32 %next) + ret i32 %val +} + +define i32 @test_crc32cx(i32 %cur, i64 %next) { +; CHECK-LABEL: test_crc32cx: +; CHECK: crc32cx w0, w0, x1 + %val = call i32 @llvm.aarch64.crc32cx(i32 %cur, i64 %next) + ret i32 %val +} + +declare i32 @llvm.aarch64.crc32b(i32, i32) +declare i32 @llvm.aarch64.crc32h(i32, i32) +declare i32 @llvm.aarch64.crc32w(i32, i32) +declare i32 @llvm.aarch64.crc32x(i32, i64) + +declare i32 @llvm.aarch64.crc32cb(i32, i32) +declare i32 @llvm.aarch64.crc32ch(i32, i32) +declare i32 @llvm.aarch64.crc32cw(i32, i32) +declare i32 @llvm.aarch64.crc32cx(i32, i64) diff --git a/test/CodeGen/AArch64/arm64-crypto.ll b/test/CodeGen/AArch64/arm64-crypto.ll new file mode 100644 index 00000000000..2908b336b1b --- /dev/null +++ b/test/CodeGen/AArch64/arm64-crypto.ll @@ -0,0 +1,135 @@ +; RUN: llc -march=arm64 -mattr=crypto -aarch64-neon-syntax=apple -o - %s | FileCheck %s + +declare <16 x i8> @llvm.aarch64.crypto.aese(<16 x i8> %data, <16 x i8> %key) +declare <16 x i8> @llvm.aarch64.crypto.aesd(<16 x i8> %data, <16 x i8> %key) +declare <16 x i8> @llvm.aarch64.crypto.aesmc(<16 x i8> %data) +declare <16 x i8> @llvm.aarch64.crypto.aesimc(<16 x i8> %data) + +define <16 x i8> @test_aese(<16 x i8> %data, <16 x i8> %key) { +; CHECK-LABEL: test_aese: +; CHECK: aese.16b v0, v1 + %res = call <16 x i8> @llvm.aarch64.crypto.aese(<16 x i8> %data, <16 x i8> %key) + ret <16 x i8> %res +} + +define <16 x i8> @test_aesd(<16 x i8> %data, <16 x i8> %key) { +; CHECK-LABEL: test_aesd: +; CHECK: aesd.16b v0, v1 + %res = call <16 x i8> @llvm.aarch64.crypto.aesd(<16 x i8> %data, <16 x i8> %key) + ret <16 x i8> %res +} + +define <16 x i8> @test_aesmc(<16 x i8> %data) { +; CHECK-LABEL: test_aesmc: +; CHECK: aesmc.16b v0, v0 + %res = call <16 x i8> @llvm.aarch64.crypto.aesmc(<16 x i8> %data) + ret <16 x i8> %res +} + +define <16 x i8> @test_aesimc(<16 x i8> %data) { +; CHECK-LABEL: test_aesimc: +; CHECK: aesimc.16b v0, v0 + %res = call <16 x i8> @llvm.aarch64.crypto.aesimc(<16 x i8> %data) + ret <16 x i8> %res +} + +declare <4 x i32> @llvm.aarch64.crypto.sha1c(<4 x i32> %hash_abcd, i32 %hash_e, <4 x i32> %wk) +declare <4 x i32> @llvm.aarch64.crypto.sha1p(<4 x i32> %hash_abcd, i32 %hash_e, <4 x i32> %wk) +declare <4 x i32> @llvm.aarch64.crypto.sha1m(<4 x i32> %hash_abcd, i32 %hash_e, <4 x i32> %wk) +declare i32 @llvm.aarch64.crypto.sha1h(i32 %hash_e) +declare <4 x i32> @llvm.aarch64.crypto.sha1su0(<4 x i32> %wk0_3, <4 x i32> %wk4_7, <4 x i32> %wk8_11) +declare <4 x i32> @llvm.aarch64.crypto.sha1su1(<4 x i32> %wk0_3, <4 x i32> %wk12_15) + +define <4 x i32> @test_sha1c(<4 x i32> %hash_abcd, i32 
%hash_e, <4 x i32> %wk) { +; CHECK-LABEL: test_sha1c: +; CHECK: fmov [[HASH_E:s[0-9]+]], w0 +; CHECK: sha1c.4s q0, [[HASH_E]], v1 + %res = call <4 x i32> @llvm.aarch64.crypto.sha1c(<4 x i32> %hash_abcd, i32 %hash_e, <4 x i32> %wk) + ret <4 x i32> %res +} + +; Incomplete removal of unnecessary FMOV instructions in intrinsic SHA1 +define <4 x i32> @test_sha1c_in_a_row(<4 x i32> %hash_abcd, i32 %hash_e, <4 x i32> %wk) { +; CHECK-LABEL: test_sha1c_in_a_row: +; CHECK: fmov [[HASH_E:s[0-9]+]], w0 +; CHECK: sha1c.4s q[[SHA1RES:[0-9]+]], [[HASH_E]], v1 +; CHECK-NOT: fmov +; CHECK: sha1c.4s q0, s[[SHA1RES]], v1 + %res = call <4 x i32> @llvm.aarch64.crypto.sha1c(<4 x i32> %hash_abcd, i32 %hash_e, <4 x i32> %wk) + %extract = extractelement <4 x i32> %res, i32 0 + %res2 = call <4 x i32> @llvm.aarch64.crypto.sha1c(<4 x i32> %hash_abcd, i32 %extract, <4 x i32> %wk) + ret <4 x i32> %res2 +} + +define <4 x i32> @test_sha1p(<4 x i32> %hash_abcd, i32 %hash_e, <4 x i32> %wk) { +; CHECK-LABEL: test_sha1p: +; CHECK: fmov [[HASH_E:s[0-9]+]], w0 +; CHECK: sha1p.4s q0, [[HASH_E]], v1 + %res = call <4 x i32> @llvm.aarch64.crypto.sha1p(<4 x i32> %hash_abcd, i32 %hash_e, <4 x i32> %wk) + ret <4 x i32> %res +} + +define <4 x i32> @test_sha1m(<4 x i32> %hash_abcd, i32 %hash_e, <4 x i32> %wk) { +; CHECK-LABEL: test_sha1m: +; CHECK: fmov [[HASH_E:s[0-9]+]], w0 +; CHECK: sha1m.4s q0, [[HASH_E]], v1 + %res = call <4 x i32> @llvm.aarch64.crypto.sha1m(<4 x i32> %hash_abcd, i32 %hash_e, <4 x i32> %wk) + ret <4 x i32> %res +} + +define i32 @test_sha1h(i32 %hash_e) { +; CHECK-LABEL: test_sha1h: +; CHECK: fmov [[HASH_E:s[0-9]+]], w0 +; CHECK: sha1h [[RES:s[0-9]+]], [[HASH_E]] +; CHECK: fmov w0, [[RES]] + %res = call i32 @llvm.aarch64.crypto.sha1h(i32 %hash_e) + ret i32 %res +} + +define <4 x i32> @test_sha1su0(<4 x i32> %wk0_3, <4 x i32> %wk4_7, <4 x i32> %wk8_11) { +; CHECK-LABEL: test_sha1su0: +; CHECK: sha1su0.4s v0, v1, v2 + %res = call <4 x i32> @llvm.aarch64.crypto.sha1su0(<4 x i32> %wk0_3, <4 x i32> %wk4_7, <4 x i32> %wk8_11) + ret <4 x i32> %res +} + +define <4 x i32> @test_sha1su1(<4 x i32> %wk0_3, <4 x i32> %wk12_15) { +; CHECK-LABEL: test_sha1su1: +; CHECK: sha1su1.4s v0, v1 + %res = call <4 x i32> @llvm.aarch64.crypto.sha1su1(<4 x i32> %wk0_3, <4 x i32> %wk12_15) + ret <4 x i32> %res +} + +declare <4 x i32> @llvm.aarch64.crypto.sha256h(<4 x i32> %hash_abcd, <4 x i32> %hash_efgh, <4 x i32> %wk) +declare <4 x i32> @llvm.aarch64.crypto.sha256h2(<4 x i32> %hash_efgh, <4 x i32> %hash_abcd, <4 x i32> %wk) +declare <4 x i32> @llvm.aarch64.crypto.sha256su0(<4 x i32> %w0_3, <4 x i32> %w4_7) +declare <4 x i32> @llvm.aarch64.crypto.sha256su1(<4 x i32> %w0_3, <4 x i32> %w8_11, <4 x i32> %w12_15) + +define <4 x i32> @test_sha256h(<4 x i32> %hash_abcd, <4 x i32> %hash_efgh, <4 x i32> %wk) { +; CHECK-LABEL: test_sha256h: +; CHECK: sha256h.4s q0, q1, v2 + %res = call <4 x i32> @llvm.aarch64.crypto.sha256h(<4 x i32> %hash_abcd, <4 x i32> %hash_efgh, <4 x i32> %wk) + ret <4 x i32> %res +} + +define <4 x i32> @test_sha256h2(<4 x i32> %hash_efgh, <4 x i32> %hash_abcd, <4 x i32> %wk) { +; CHECK-LABEL: test_sha256h2: +; CHECK: sha256h2.4s q0, q1, v2 + + %res = call <4 x i32> @llvm.aarch64.crypto.sha256h2(<4 x i32> %hash_efgh, <4 x i32> %hash_abcd, <4 x i32> %wk) + ret <4 x i32> %res +} + +define <4 x i32> @test_sha256su0(<4 x i32> %w0_3, <4 x i32> %w4_7) { +; CHECK-LABEL: test_sha256su0: +; CHECK: sha256su0.4s v0, v1 + %res = call <4 x i32> @llvm.aarch64.crypto.sha256su0(<4 x i32> %w0_3, <4 x i32> %w4_7) + ret <4 x i32> %res +} + +define 
<4 x i32> @test_sha256su1(<4 x i32> %w0_3, <4 x i32> %w8_11, <4 x i32> %w12_15) { +; CHECK-LABEL: test_sha256su1: +; CHECK: sha256su1.4s v0, v1, v2 + %res = call <4 x i32> @llvm.aarch64.crypto.sha256su1(<4 x i32> %w0_3, <4 x i32> %w8_11, <4 x i32> %w12_15) + ret <4 x i32> %res +} diff --git a/test/CodeGen/AArch64/arm64-cse.ll b/test/CodeGen/AArch64/arm64-cse.ll new file mode 100644 index 00000000000..bb14c895504 --- /dev/null +++ b/test/CodeGen/AArch64/arm64-cse.ll @@ -0,0 +1,59 @@ +; RUN: llc -O3 < %s | FileCheck %s +target triple = "arm64-apple-ios" + +; rdar://12462006 +; CSE between "icmp reg reg" and "sub reg reg". +; Both can be in the same basic block or in different basic blocks. +define i8* @t1(i8* %base, i32* nocapture %offset, i32 %size) nounwind { +entry: +; CHECK-LABEL: t1: +; CHECK: subs +; CHECK-NOT: cmp +; CHECK-NOT: sub +; CHECK: b.ge +; CHECK: sub +; CHECK: sub +; CHECK-NOT: sub +; CHECK: ret + %0 = load i32* %offset, align 4 + %cmp = icmp slt i32 %0, %size + %s = sub nsw i32 %0, %size + br i1 %cmp, label %return, label %if.end + +if.end: + %sub = sub nsw i32 %0, %size + %s2 = sub nsw i32 %s, %size + %s3 = sub nsw i32 %sub, %s2 + store i32 %s3, i32* %offset, align 4 + %add.ptr = getelementptr inbounds i8* %base, i32 %sub + br label %return + +return: + %retval.0 = phi i8* [ %add.ptr, %if.end ], [ null, %entry ] + ret i8* %retval.0 +} + +; CSE between "icmp reg imm" and "sub reg imm". +define i8* @t2(i8* %base, i32* nocapture %offset) nounwind { +entry: +; CHECK-LABEL: t2: +; CHECK: subs +; CHECK-NOT: cmp +; CHECK-NOT: sub +; CHECK: b.lt +; CHECK-NOT: sub +; CHECK: ret + %0 = load i32* %offset, align 4 + %cmp = icmp slt i32 %0, 1 + br i1 %cmp, label %return, label %if.end + +if.end: + %sub = sub nsw i32 %0, 1 + store i32 %sub, i32* %offset, align 4 + %add.ptr = getelementptr inbounds i8* %base, i32 %sub + br label %return + +return: + %retval.0 = phi i8* [ %add.ptr, %if.end ], [ null, %entry ] + ret i8* %retval.0 +} diff --git a/test/CodeGen/AArch64/arm64-csel.ll b/test/CodeGen/AArch64/arm64-csel.ll new file mode 100644 index 00000000000..98eba30f119 --- /dev/null +++ b/test/CodeGen/AArch64/arm64-csel.ll @@ -0,0 +1,230 @@ +; RUN: llc -O3 < %s | FileCheck %s +target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-n32:64" +target triple = "arm64-unknown-unknown" + +; CHECK-LABEL: foo1 +; CHECK: cinc w{{[0-9]+}}, w{{[0-9]+}}, ne +define i32 @foo1(i32 %b, i32 %c) nounwind readnone ssp { +entry: + %not.tobool = icmp ne i32 %c, 0 + %add = zext i1 %not.tobool to i32 + %b.add = add i32 %c, %b + %add1 = add i32 %b.add, %add + ret i32 %add1 +} + +; CHECK-LABEL: foo2 +; CHECK: cneg w{{[0-9]+}}, w{{[0-9]+}}, ne +define i32 @foo2(i32 %b, i32 %c) nounwind readnone ssp { +entry: + %mul = sub i32 0, %b + %tobool = icmp eq i32 %c, 0 + %b.mul = select i1 %tobool, i32 %b, i32 %mul + %add = add nsw i32 %b.mul, %c + ret i32 %add +} + +; CHECK-LABEL: foo3 +; CHECK: cinv w{{[0-9]+}}, w{{[0-9]+}}, ne +define i32 @foo3(i32 %b, i32 %c) nounwind readnone ssp { +entry: + %not.tobool = icmp ne i32 %c, 0 + %xor = sext i1 %not.tobool to i32 + %b.xor = xor i32 %xor, %b + %add = add nsw i32 %b.xor, %c + ret i32 %add +} + +; rdar://11632325 +define i32@foo4(i32 %a) nounwind ssp { +; CHECK-LABEL: foo4 +; CHECK: cneg +; CHECK-NEXT: ret + %cmp = icmp sgt i32 %a, -1 + %neg = sub nsw i32 0, %a + %cond = select i1 %cmp, i32 %a, i32 %neg + ret i32 %cond +} + +define i32@foo5(i32 %a, i32 %b) nounwind ssp { +entry: +; CHECK-LABEL: foo5 +; 
CHECK: subs +; CHECK-NEXT: cneg +; CHECK-NEXT: ret + %sub = sub nsw i32 %a, %b + %cmp = icmp sgt i32 %sub, -1 + %sub3 = sub nsw i32 0, %sub + %cond = select i1 %cmp, i32 %sub, i32 %sub3 + ret i32 %cond +} + +; make sure we can handle branch instruction in optimizeCompare. +define i32@foo6(i32 %a, i32 %b) nounwind ssp { +; CHECK-LABEL: foo6 +; CHECK: b + %sub = sub nsw i32 %a, %b + %cmp = icmp sgt i32 %sub, 0 + br i1 %cmp, label %l.if, label %l.else + +l.if: + ret i32 1 + +l.else: + ret i32 %sub +} + +; If CPSR is used multiple times and V flag is used, we don't remove cmp. +define i32 @foo7(i32 %a, i32 %b) nounwind { +entry: +; CHECK-LABEL: foo7: +; CHECK: sub +; CHECK-next: adds +; CHECK-next: csneg +; CHECK-next: b + %sub = sub nsw i32 %a, %b + %cmp = icmp sgt i32 %sub, -1 + %sub3 = sub nsw i32 0, %sub + %cond = select i1 %cmp, i32 %sub, i32 %sub3 + br i1 %cmp, label %if.then, label %if.else + +if.then: + %cmp2 = icmp slt i32 %sub, -1 + %sel = select i1 %cmp2, i32 %cond, i32 %a + ret i32 %sel + +if.else: + ret i32 %cond +} + +define i32 @foo8(i32 %v, i32 %a, i32 %b) nounwind readnone ssp { +entry: +; CHECK-LABEL: foo8: +; CHECK: cmp w0, #0 +; CHECK: csinv w0, w1, w2, ne + %tobool = icmp eq i32 %v, 0 + %neg = xor i32 -1, %b + %cond = select i1 %tobool, i32 %neg, i32 %a + ret i32 %cond +} + +define i32 @foo9(i32 %v) nounwind readnone optsize ssp { +entry: +; CHECK-LABEL: foo9: +; CHECK: cmp w0, #0 +; CHECK: orr w[[REG:[0-9]+]], wzr, #0x4 +; CHECK: cinv w0, w[[REG]], eq + %tobool = icmp ne i32 %v, 0 + %cond = select i1 %tobool, i32 4, i32 -5 + ret i32 %cond +} + +define i64 @foo10(i64 %v) nounwind readnone optsize ssp { +entry: +; CHECK-LABEL: foo10: +; CHECK: cmp x0, #0 +; CHECK: orr w[[REG:[0-9]+]], wzr, #0x4 +; CHECK: cinv x0, x[[REG]], eq + %tobool = icmp ne i64 %v, 0 + %cond = select i1 %tobool, i64 4, i64 -5 + ret i64 %cond +} + +define i32 @foo11(i32 %v) nounwind readnone optsize ssp { +entry: +; CHECK-LABEL: foo11: +; CHECK: cmp w0, #0 +; CHECK: orr w[[REG:[0-9]+]], wzr, #0x4 +; CHECK: cneg w0, w[[REG]], eq + %tobool = icmp ne i32 %v, 0 + %cond = select i1 %tobool, i32 4, i32 -4 + ret i32 %cond +} + +define i64 @foo12(i64 %v) nounwind readnone optsize ssp { +entry: +; CHECK-LABEL: foo12: +; CHECK: cmp x0, #0 +; CHECK: orr w[[REG:[0-9]+]], wzr, #0x4 +; CHECK: cneg x0, x[[REG]], eq + %tobool = icmp ne i64 %v, 0 + %cond = select i1 %tobool, i64 4, i64 -4 + ret i64 %cond +} + +define i32 @foo13(i32 %v, i32 %a, i32 %b) nounwind readnone optsize ssp { +entry: +; CHECK-LABEL: foo13: +; CHECK: cmp w0, #0 +; CHECK: csneg w0, w1, w2, ne + %tobool = icmp eq i32 %v, 0 + %sub = sub i32 0, %b + %cond = select i1 %tobool, i32 %sub, i32 %a + ret i32 %cond +} + +define i64 @foo14(i64 %v, i64 %a, i64 %b) nounwind readnone optsize ssp { +entry: +; CHECK-LABEL: foo14: +; CHECK: cmp x0, #0 +; CHECK: csneg x0, x1, x2, ne + %tobool = icmp eq i64 %v, 0 + %sub = sub i64 0, %b + %cond = select i1 %tobool, i64 %sub, i64 %a + ret i64 %cond +} + +define i32 @foo15(i32 %a, i32 %b) nounwind readnone optsize ssp { +entry: +; CHECK-LABEL: foo15: +; CHECK: cmp w0, w1 +; CHECK: orr w[[REG:[0-9]+]], wzr, #0x1 +; CHECK: cinc w0, w[[REG]], gt + %cmp = icmp sgt i32 %a, %b + %. = select i1 %cmp, i32 2, i32 1 + ret i32 %. +} + +define i32 @foo16(i32 %a, i32 %b) nounwind readnone optsize ssp { +entry: +; CHECK-LABEL: foo16: +; CHECK: cmp w0, w1 +; CHECK: orr w[[REG:[0-9]+]], wzr, #0x1 +; CHECK: cinc w0, w[[REG]], le + %cmp = icmp sgt i32 %a, %b + %. = select i1 %cmp, i32 1, i32 2 + ret i32 %. 
+} + +define i64 @foo17(i64 %a, i64 %b) nounwind readnone optsize ssp { +entry: +; CHECK-LABEL: foo17: +; CHECK: cmp x0, x1 +; CHECK: orr w[[REG:[0-9]+]], wzr, #0x1 +; CHECK: cinc x0, x[[REG]], gt + %cmp = icmp sgt i64 %a, %b + %. = select i1 %cmp, i64 2, i64 1 + ret i64 %. +} + +define i64 @foo18(i64 %a, i64 %b) nounwind readnone optsize ssp { +entry: +; CHECK-LABEL: foo18: +; CHECK: cmp x0, x1 +; CHECK: orr w[[REG:[0-9]+]], wzr, #0x1 +; CHECK: cinc x0, x[[REG]], le + %cmp = icmp sgt i64 %a, %b + %. = select i1 %cmp, i64 1, i64 2 + ret i64 %. +} + +define i64 @foo19(i64 %a, i64 %b, i64 %c) { +entry: +; CHECK-LABEL: foo19: +; CHECK: cinc x0, x2 +; CHECK-NOT: add + %cmp = icmp ult i64 %a, %b + %inc = zext i1 %cmp to i64 + %inc.c = add i64 %inc, %c + ret i64 %inc.c +} diff --git a/test/CodeGen/AArch64/arm64-cvt.ll b/test/CodeGen/AArch64/arm64-cvt.ll new file mode 100644 index 00000000000..420a8bc0483 --- /dev/null +++ b/test/CodeGen/AArch64/arm64-cvt.ll @@ -0,0 +1,401 @@ +; RUN: llc < %s -march=arm64 -aarch64-neon-syntax=apple | FileCheck %s + +; +; Floating-point scalar convert to signed integer (to nearest with ties to away) +; +define i32 @fcvtas_1w1s(float %A) nounwind { +;CHECK-LABEL: fcvtas_1w1s: +;CHECK: fcvtas w0, s0 +;CHECK-NEXT: ret + %tmp3 = call i32 @llvm.aarch64.neon.fcvtas.i32.f32(float %A) + ret i32 %tmp3 +} + +define i64 @fcvtas_1x1s(float %A) nounwind { +;CHECK-LABEL: fcvtas_1x1s: +;CHECK: fcvtas x0, s0 +;CHECK-NEXT: ret + %tmp3 = call i64 @llvm.aarch64.neon.fcvtas.i64.f32(float %A) + ret i64 %tmp3 +} + +define i32 @fcvtas_1w1d(double %A) nounwind { +;CHECK-LABEL: fcvtas_1w1d: +;CHECK: fcvtas w0, d0 +;CHECK-NEXT: ret + %tmp3 = call i32 @llvm.aarch64.neon.fcvtas.i32.f64(double %A) + ret i32 %tmp3 +} + +define i64 @fcvtas_1x1d(double %A) nounwind { +;CHECK-LABEL: fcvtas_1x1d: +;CHECK: fcvtas x0, d0 +;CHECK-NEXT: ret + %tmp3 = call i64 @llvm.aarch64.neon.fcvtas.i64.f64(double %A) + ret i64 %tmp3 +} + +declare i32 @llvm.aarch64.neon.fcvtas.i32.f32(float) nounwind readnone +declare i64 @llvm.aarch64.neon.fcvtas.i64.f32(float) nounwind readnone +declare i32 @llvm.aarch64.neon.fcvtas.i32.f64(double) nounwind readnone +declare i64 @llvm.aarch64.neon.fcvtas.i64.f64(double) nounwind readnone + +; +; Floating-point scalar convert to unsigned integer +; +define i32 @fcvtau_1w1s(float %A) nounwind { +;CHECK-LABEL: fcvtau_1w1s: +;CHECK: fcvtau w0, s0 +;CHECK-NEXT: ret + %tmp3 = call i32 @llvm.aarch64.neon.fcvtau.i32.f32(float %A) + ret i32 %tmp3 +} + +define i64 @fcvtau_1x1s(float %A) nounwind { +;CHECK-LABEL: fcvtau_1x1s: +;CHECK: fcvtau x0, s0 +;CHECK-NEXT: ret + %tmp3 = call i64 @llvm.aarch64.neon.fcvtau.i64.f32(float %A) + ret i64 %tmp3 +} + +define i32 @fcvtau_1w1d(double %A) nounwind { +;CHECK-LABEL: fcvtau_1w1d: +;CHECK: fcvtau w0, d0 +;CHECK-NEXT: ret + %tmp3 = call i32 @llvm.aarch64.neon.fcvtau.i32.f64(double %A) + ret i32 %tmp3 +} + +define i64 @fcvtau_1x1d(double %A) nounwind { +;CHECK-LABEL: fcvtau_1x1d: +;CHECK: fcvtau x0, d0 +;CHECK-NEXT: ret + %tmp3 = call i64 @llvm.aarch64.neon.fcvtau.i64.f64(double %A) + ret i64 %tmp3 +} + +declare i32 @llvm.aarch64.neon.fcvtau.i32.f32(float) nounwind readnone +declare i64 @llvm.aarch64.neon.fcvtau.i64.f32(float) nounwind readnone +declare i32 @llvm.aarch64.neon.fcvtau.i32.f64(double) nounwind readnone +declare i64 @llvm.aarch64.neon.fcvtau.i64.f64(double) nounwind readnone + +; +; Floating-point scalar convert to signed integer (toward -Inf) +; +define i32 @fcvtms_1w1s(float %A) nounwind { +;CHECK-LABEL: fcvtms_1w1s: +;CHECK: fcvtms w0, 
s0 +;CHECK-NEXT: ret + %tmp3 = call i32 @llvm.aarch64.neon.fcvtms.i32.f32(float %A) + ret i32 %tmp3 +} + +define i64 @fcvtms_1x1s(float %A) nounwind { +;CHECK-LABEL: fcvtms_1x1s: +;CHECK: fcvtms x0, s0 +;CHECK-NEXT: ret + %tmp3 = call i64 @llvm.aarch64.neon.fcvtms.i64.f32(float %A) + ret i64 %tmp3 +} + +define i32 @fcvtms_1w1d(double %A) nounwind { +;CHECK-LABEL: fcvtms_1w1d: +;CHECK: fcvtms w0, d0 +;CHECK-NEXT: ret + %tmp3 = call i32 @llvm.aarch64.neon.fcvtms.i32.f64(double %A) + ret i32 %tmp3 +} + +define i64 @fcvtms_1x1d(double %A) nounwind { +;CHECK-LABEL: fcvtms_1x1d: +;CHECK: fcvtms x0, d0 +;CHECK-NEXT: ret + %tmp3 = call i64 @llvm.aarch64.neon.fcvtms.i64.f64(double %A) + ret i64 %tmp3 +} + +declare i32 @llvm.aarch64.neon.fcvtms.i32.f32(float) nounwind readnone +declare i64 @llvm.aarch64.neon.fcvtms.i64.f32(float) nounwind readnone +declare i32 @llvm.aarch64.neon.fcvtms.i32.f64(double) nounwind readnone +declare i64 @llvm.aarch64.neon.fcvtms.i64.f64(double) nounwind readnone + +; +; Floating-point scalar convert to unsigned integer (toward -Inf) +; +define i32 @fcvtmu_1w1s(float %A) nounwind { +;CHECK-LABEL: fcvtmu_1w1s: +;CHECK: fcvtmu w0, s0 +;CHECK-NEXT: ret + %tmp3 = call i32 @llvm.aarch64.neon.fcvtmu.i32.f32(float %A) + ret i32 %tmp3 +} + +define i64 @fcvtmu_1x1s(float %A) nounwind { +;CHECK-LABEL: fcvtmu_1x1s: +;CHECK: fcvtmu x0, s0 +;CHECK-NEXT: ret + %tmp3 = call i64 @llvm.aarch64.neon.fcvtmu.i64.f32(float %A) + ret i64 %tmp3 +} + +define i32 @fcvtmu_1w1d(double %A) nounwind { +;CHECK-LABEL: fcvtmu_1w1d: +;CHECK: fcvtmu w0, d0 +;CHECK-NEXT: ret + %tmp3 = call i32 @llvm.aarch64.neon.fcvtmu.i32.f64(double %A) + ret i32 %tmp3 +} + +define i64 @fcvtmu_1x1d(double %A) nounwind { +;CHECK-LABEL: fcvtmu_1x1d: +;CHECK: fcvtmu x0, d0 +;CHECK-NEXT: ret + %tmp3 = call i64 @llvm.aarch64.neon.fcvtmu.i64.f64(double %A) + ret i64 %tmp3 +} + +declare i32 @llvm.aarch64.neon.fcvtmu.i32.f32(float) nounwind readnone +declare i64 @llvm.aarch64.neon.fcvtmu.i64.f32(float) nounwind readnone +declare i32 @llvm.aarch64.neon.fcvtmu.i32.f64(double) nounwind readnone +declare i64 @llvm.aarch64.neon.fcvtmu.i64.f64(double) nounwind readnone + +; +; Floating-point scalar convert to signed integer (to nearest with ties to even) +; +define i32 @fcvtns_1w1s(float %A) nounwind { +;CHECK-LABEL: fcvtns_1w1s: +;CHECK: fcvtns w0, s0 +;CHECK-NEXT: ret + %tmp3 = call i32 @llvm.aarch64.neon.fcvtns.i32.f32(float %A) + ret i32 %tmp3 +} + +define i64 @fcvtns_1x1s(float %A) nounwind { +;CHECK-LABEL: fcvtns_1x1s: +;CHECK: fcvtns x0, s0 +;CHECK-NEXT: ret + %tmp3 = call i64 @llvm.aarch64.neon.fcvtns.i64.f32(float %A) + ret i64 %tmp3 +} + +define i32 @fcvtns_1w1d(double %A) nounwind { +;CHECK-LABEL: fcvtns_1w1d: +;CHECK: fcvtns w0, d0 +;CHECK-NEXT: ret + %tmp3 = call i32 @llvm.aarch64.neon.fcvtns.i32.f64(double %A) + ret i32 %tmp3 +} + +define i64 @fcvtns_1x1d(double %A) nounwind { +;CHECK-LABEL: fcvtns_1x1d: +;CHECK: fcvtns x0, d0 +;CHECK-NEXT: ret + %tmp3 = call i64 @llvm.aarch64.neon.fcvtns.i64.f64(double %A) + ret i64 %tmp3 +} + +declare i32 @llvm.aarch64.neon.fcvtns.i32.f32(float) nounwind readnone +declare i64 @llvm.aarch64.neon.fcvtns.i64.f32(float) nounwind readnone +declare i32 @llvm.aarch64.neon.fcvtns.i32.f64(double) nounwind readnone +declare i64 @llvm.aarch64.neon.fcvtns.i64.f64(double) nounwind readnone + +; +; Floating-point scalar convert to unsigned integer (to nearest with ties to even) +; +define i32 @fcvtnu_1w1s(float %A) nounwind { +;CHECK-LABEL: fcvtnu_1w1s: +;CHECK: fcvtnu w0, s0 +;CHECK-NEXT: ret + 
%tmp3 = call i32 @llvm.aarch64.neon.fcvtnu.i32.f32(float %A) + ret i32 %tmp3 +} + +define i64 @fcvtnu_1x1s(float %A) nounwind { +;CHECK-LABEL: fcvtnu_1x1s: +;CHECK: fcvtnu x0, s0 +;CHECK-NEXT: ret + %tmp3 = call i64 @llvm.aarch64.neon.fcvtnu.i64.f32(float %A) + ret i64 %tmp3 +} + +define i32 @fcvtnu_1w1d(double %A) nounwind { +;CHECK-LABEL: fcvtnu_1w1d: +;CHECK: fcvtnu w0, d0 +;CHECK-NEXT: ret + %tmp3 = call i32 @llvm.aarch64.neon.fcvtnu.i32.f64(double %A) + ret i32 %tmp3 +} + +define i64 @fcvtnu_1x1d(double %A) nounwind { +;CHECK-LABEL: fcvtnu_1x1d: +;CHECK: fcvtnu x0, d0 +;CHECK-NEXT: ret + %tmp3 = call i64 @llvm.aarch64.neon.fcvtnu.i64.f64(double %A) + ret i64 %tmp3 +} + +declare i32 @llvm.aarch64.neon.fcvtnu.i32.f32(float) nounwind readnone +declare i64 @llvm.aarch64.neon.fcvtnu.i64.f32(float) nounwind readnone +declare i32 @llvm.aarch64.neon.fcvtnu.i32.f64(double) nounwind readnone +declare i64 @llvm.aarch64.neon.fcvtnu.i64.f64(double) nounwind readnone + +; +; Floating-point scalar convert to signed integer (toward +Inf) +; +define i32 @fcvtps_1w1s(float %A) nounwind { +;CHECK-LABEL: fcvtps_1w1s: +;CHECK: fcvtps w0, s0 +;CHECK-NEXT: ret + %tmp3 = call i32 @llvm.aarch64.neon.fcvtps.i32.f32(float %A) + ret i32 %tmp3 +} + +define i64 @fcvtps_1x1s(float %A) nounwind { +;CHECK-LABEL: fcvtps_1x1s: +;CHECK: fcvtps x0, s0 +;CHECK-NEXT: ret + %tmp3 = call i64 @llvm.aarch64.neon.fcvtps.i64.f32(float %A) + ret i64 %tmp3 +} + +define i32 @fcvtps_1w1d(double %A) nounwind { +;CHECK-LABEL: fcvtps_1w1d: +;CHECK: fcvtps w0, d0 +;CHECK-NEXT: ret + %tmp3 = call i32 @llvm.aarch64.neon.fcvtps.i32.f64(double %A) + ret i32 %tmp3 +} + +define i64 @fcvtps_1x1d(double %A) nounwind { +;CHECK-LABEL: fcvtps_1x1d: +;CHECK: fcvtps x0, d0 +;CHECK-NEXT: ret + %tmp3 = call i64 @llvm.aarch64.neon.fcvtps.i64.f64(double %A) + ret i64 %tmp3 +} + +declare i32 @llvm.aarch64.neon.fcvtps.i32.f32(float) nounwind readnone +declare i64 @llvm.aarch64.neon.fcvtps.i64.f32(float) nounwind readnone +declare i32 @llvm.aarch64.neon.fcvtps.i32.f64(double) nounwind readnone +declare i64 @llvm.aarch64.neon.fcvtps.i64.f64(double) nounwind readnone + +; +; Floating-point scalar convert to unsigned integer (toward +Inf) +; +define i32 @fcvtpu_1w1s(float %A) nounwind { +;CHECK-LABEL: fcvtpu_1w1s: +;CHECK: fcvtpu w0, s0 +;CHECK-NEXT: ret + %tmp3 = call i32 @llvm.aarch64.neon.fcvtpu.i32.f32(float %A) + ret i32 %tmp3 +} + +define i64 @fcvtpu_1x1s(float %A) nounwind { +;CHECK-LABEL: fcvtpu_1x1s: +;CHECK: fcvtpu x0, s0 +;CHECK-NEXT: ret + %tmp3 = call i64 @llvm.aarch64.neon.fcvtpu.i64.f32(float %A) + ret i64 %tmp3 +} + +define i32 @fcvtpu_1w1d(double %A) nounwind { +;CHECK-LABEL: fcvtpu_1w1d: +;CHECK: fcvtpu w0, d0 +;CHECK-NEXT: ret + %tmp3 = call i32 @llvm.aarch64.neon.fcvtpu.i32.f64(double %A) + ret i32 %tmp3 +} + +define i64 @fcvtpu_1x1d(double %A) nounwind { +;CHECK-LABEL: fcvtpu_1x1d: +;CHECK: fcvtpu x0, d0 +;CHECK-NEXT: ret + %tmp3 = call i64 @llvm.aarch64.neon.fcvtpu.i64.f64(double %A) + ret i64 %tmp3 +} + +declare i32 @llvm.aarch64.neon.fcvtpu.i32.f32(float) nounwind readnone +declare i64 @llvm.aarch64.neon.fcvtpu.i64.f32(float) nounwind readnone +declare i32 @llvm.aarch64.neon.fcvtpu.i32.f64(double) nounwind readnone +declare i64 @llvm.aarch64.neon.fcvtpu.i64.f64(double) nounwind readnone + +; +; Floating-point scalar convert to signed integer (toward zero) +; +define i32 @fcvtzs_1w1s(float %A) nounwind { +;CHECK-LABEL: fcvtzs_1w1s: +;CHECK: fcvtzs w0, s0 +;CHECK-NEXT: ret + %tmp3 = call i32 @llvm.aarch64.neon.fcvtzs.i32.f32(float %A) + 
ret i32 %tmp3 +} + +define i64 @fcvtzs_1x1s(float %A) nounwind { +;CHECK-LABEL: fcvtzs_1x1s: +;CHECK: fcvtzs x0, s0 +;CHECK-NEXT: ret + %tmp3 = call i64 @llvm.aarch64.neon.fcvtzs.i64.f32(float %A) + ret i64 %tmp3 +} + +define i32 @fcvtzs_1w1d(double %A) nounwind { +;CHECK-LABEL: fcvtzs_1w1d: +;CHECK: fcvtzs w0, d0 +;CHECK-NEXT: ret + %tmp3 = call i32 @llvm.aarch64.neon.fcvtzs.i32.f64(double %A) + ret i32 %tmp3 +} + +define i64 @fcvtzs_1x1d(double %A) nounwind { +;CHECK-LABEL: fcvtzs_1x1d: +;CHECK: fcvtzs x0, d0 +;CHECK-NEXT: ret + %tmp3 = call i64 @llvm.aarch64.neon.fcvtzs.i64.f64(double %A) + ret i64 %tmp3 +} + +declare i32 @llvm.aarch64.neon.fcvtzs.i32.f32(float) nounwind readnone +declare i64 @llvm.aarch64.neon.fcvtzs.i64.f32(float) nounwind readnone +declare i32 @llvm.aarch64.neon.fcvtzs.i32.f64(double) nounwind readnone +declare i64 @llvm.aarch64.neon.fcvtzs.i64.f64(double) nounwind readnone + +; +; Floating-point scalar convert to unsigned integer (toward zero) +; +define i32 @fcvtzu_1w1s(float %A) nounwind { +;CHECK-LABEL: fcvtzu_1w1s: +;CHECK: fcvtzu w0, s0 +;CHECK-NEXT: ret + %tmp3 = call i32 @llvm.aarch64.neon.fcvtzu.i32.f32(float %A) + ret i32 %tmp3 +} + +define i64 @fcvtzu_1x1s(float %A) nounwind { +;CHECK-LABEL: fcvtzu_1x1s: +;CHECK: fcvtzu x0, s0 +;CHECK-NEXT: ret + %tmp3 = call i64 @llvm.aarch64.neon.fcvtzu.i64.f32(float %A) + ret i64 %tmp3 +} + +define i32 @fcvtzu_1w1d(double %A) nounwind { +;CHECK-LABEL: fcvtzu_1w1d: +;CHECK: fcvtzu w0, d0 +;CHECK-NEXT: ret + %tmp3 = call i32 @llvm.aarch64.neon.fcvtzu.i32.f64(double %A) + ret i32 %tmp3 +} + +define i64 @fcvtzu_1x1d(double %A) nounwind { +;CHECK-LABEL: fcvtzu_1x1d: +;CHECK: fcvtzu x0, d0 +;CHECK-NEXT: ret + %tmp3 = call i64 @llvm.aarch64.neon.fcvtzu.i64.f64(double %A) + ret i64 %tmp3 +} + +declare i32 @llvm.aarch64.neon.fcvtzu.i32.f32(float) nounwind readnone +declare i64 @llvm.aarch64.neon.fcvtzu.i64.f32(float) nounwind readnone +declare i32 @llvm.aarch64.neon.fcvtzu.i32.f64(double) nounwind readnone +declare i64 @llvm.aarch64.neon.fcvtzu.i64.f64(double) nounwind readnone diff --git a/test/CodeGen/AArch64/arm64-dagcombiner-convergence.ll b/test/CodeGen/AArch64/arm64-dagcombiner-convergence.ll new file mode 100644 index 00000000000..a45e31320de --- /dev/null +++ b/test/CodeGen/AArch64/arm64-dagcombiner-convergence.ll @@ -0,0 +1,19 @@ +; RUN: llc < %s -o /dev/null +; rdar://10795250 +; DAGCombiner should converge. 
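+; There is no FileCheck here: the RUN line sends the output to /dev/null, so
+; the test passes as long as llc terminates on the i128 shift/trunc/mask
+; arithmetic below rather than looping in the combiner.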
+ +target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-n32:64" +target triple = "arm64-apple-macosx10.8.0" + +define i64 @foo(i128 %Params.coerce, i128 %SelLocs.coerce) { +entry: + %tmp = lshr i128 %Params.coerce, 61 + %.tr38.i = trunc i128 %tmp to i64 + %mul.i = and i64 %.tr38.i, 4294967288 + %tmp1 = lshr i128 %SelLocs.coerce, 62 + %.tr.i = trunc i128 %tmp1 to i64 + %mul7.i = and i64 %.tr.i, 4294967292 + %add.i = add i64 %mul7.i, %mul.i + %conv.i.i = and i64 %add.i, 4294967292 + ret i64 %conv.i.i +} diff --git a/test/CodeGen/AArch64/arm64-dagcombiner-dead-indexed-load.ll b/test/CodeGen/AArch64/arm64-dagcombiner-dead-indexed-load.ll new file mode 100644 index 00000000000..2cf01357324 --- /dev/null +++ b/test/CodeGen/AArch64/arm64-dagcombiner-dead-indexed-load.ll @@ -0,0 +1,29 @@ +; RUN: llc -mcpu=cyclone < %s | FileCheck %s + +target datalayout = "e-i64:64-n32:64-S128" +target triple = "arm64-apple-ios" + +%"struct.SU" = type { i32, %"struct.SU"*, i32*, i32, i32, %"struct.BO", i32, [5 x i8] } +%"struct.BO" = type { %"struct.RE" } + +%"struct.RE" = type { i32, i32, i32, i32 } + +; This is a read-modify-write of some bifields combined into an i48. It gets +; legalized into i32 and i16 accesses. Only a single store of zero to the low +; i32 part should be live. + +; CHECK-LABEL: test: +; CHECK-NOT: ldr +; CHECK: str wzr +; CHECK-NOT: str +define void @test(%"struct.SU"* nocapture %su) { +entry: + %r1 = getelementptr inbounds %"struct.SU"* %su, i64 1, i32 5 + %r2 = bitcast %"struct.BO"* %r1 to i48* + %r3 = load i48* %r2, align 8 + %r4 = and i48 %r3, -4294967296 + %r5 = or i48 0, %r4 + store i48 %r5, i48* %r2, align 8 + + ret void +} diff --git a/test/CodeGen/AArch64/arm64-dagcombiner-indexed-load.ll b/test/CodeGen/AArch64/arm64-dagcombiner-indexed-load.ll new file mode 100644 index 00000000000..2e4b658f1c9 --- /dev/null +++ b/test/CodeGen/AArch64/arm64-dagcombiner-indexed-load.ll @@ -0,0 +1,46 @@ +; RUN: llc -O3 < %s | FileCheck %s +; RUN: llc -O3 -addr-sink-using-gep=1 < %s | FileCheck %s +; Test case for a DAG combiner bug where we combined an indexed load +; with an extension (sext, zext, or any) into a regular extended load, +; i.e., dropping the indexed value. 
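+; (An indexed load produces two results, the loaded value and the updated
+; base pointer; folding the extension into the load has to preserve that
+; second result.)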
+; + +target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128" +target triple = "arm64-apple-ios" + +%class.A = type { i64, i64 } +%class.C = type { i64 } + +; CHECK-LABEL: XX: +; CHECK: ldr +define void @XX(%class.A* %K) { +entry: + br i1 undef, label %if.then, label %lor.rhs.i + +lor.rhs.i: ; preds = %entry + %tmp = load i32* undef, align 4 + %y.i.i.i = getelementptr inbounds %class.A* %K, i64 0, i32 1 + %tmp1 = load i64* %y.i.i.i, align 8 + %U.sroa.3.8.extract.trunc.i = trunc i64 %tmp1 to i32 + %div11.i = sdiv i32 %U.sroa.3.8.extract.trunc.i, 17 + %add12.i = add nsw i32 0, %div11.i + %U.sroa.3.12.extract.shift.i = lshr i64 %tmp1, 32 + %U.sroa.3.12.extract.trunc.i = trunc i64 %U.sroa.3.12.extract.shift.i to i32 + %div15.i = sdiv i32 %U.sroa.3.12.extract.trunc.i, 13 + %add16.i = add nsw i32 %add12.i, %div15.i + %rem.i.i = srem i32 %add16.i, %tmp + %idxprom = sext i32 %rem.i.i to i64 + %arrayidx = getelementptr inbounds %class.C** undef, i64 %idxprom + %tobool533 = icmp eq %class.C* undef, null + br i1 %tobool533, label %while.end, label %while.body + +if.then: ; preds = %entry + unreachable + +while.body: ; preds = %lor.rhs.i + unreachable + +while.end: ; preds = %lor.rhs.i + %tmp3 = load %class.C** %arrayidx, align 8 + unreachable +} diff --git a/test/CodeGen/AArch64/arm64-dagcombiner-load-slicing.ll b/test/CodeGen/AArch64/arm64-dagcombiner-load-slicing.ll new file mode 100644 index 00000000000..0679014e59a --- /dev/null +++ b/test/CodeGen/AArch64/arm64-dagcombiner-load-slicing.ll @@ -0,0 +1,102 @@ +; RUN: llc -mtriple arm64-apple-ios -O3 -o - < %s | FileCheck %s +; + +%class.Complex = type { float, float } +%class.Complex_int = type { i32, i32 } +%class.Complex_long = type { i64, i64 } + +; CHECK-LABEL: @test +; CHECK: add [[BASE:x[0-9]+]], x0, x1, lsl #3 +; CHECK: ldp [[CPLX1_I:s[0-9]+]], [[CPLX1_R:s[0-9]+]], {{\[}}[[BASE]]] +; CHECK: ldp [[CPLX2_I:s[0-9]+]], [[CPLX2_R:s[0-9]+]], {{\[}}[[BASE]], #64] +; CHECK: fadd {{s[0-9]+}}, [[CPLX2_I]], [[CPLX1_I]] +; CHECK: fadd {{s[0-9]+}}, [[CPLX2_R]], [[CPLX1_R]] +; CHECK: ret +define void @test(%class.Complex* nocapture %out, i64 %out_start) { +entry: + %arrayidx = getelementptr inbounds %class.Complex* %out, i64 %out_start + %0 = bitcast %class.Complex* %arrayidx to i64* + %1 = load i64* %0, align 4 + %t0.sroa.0.0.extract.trunc = trunc i64 %1 to i32 + %2 = bitcast i32 %t0.sroa.0.0.extract.trunc to float + %t0.sroa.2.0.extract.shift = lshr i64 %1, 32 + %t0.sroa.2.0.extract.trunc = trunc i64 %t0.sroa.2.0.extract.shift to i32 + %3 = bitcast i32 %t0.sroa.2.0.extract.trunc to float + %add = add i64 %out_start, 8 + %arrayidx2 = getelementptr inbounds %class.Complex* %out, i64 %add + %i.i = getelementptr inbounds %class.Complex* %arrayidx2, i64 0, i32 0 + %4 = load float* %i.i, align 4 + %add.i = fadd float %4, %2 + %retval.sroa.0.0.vec.insert.i = insertelement <2 x float> undef, float %add.i, i32 0 + %r.i = getelementptr inbounds %class.Complex* %arrayidx2, i64 0, i32 1 + %5 = load float* %r.i, align 4 + %add5.i = fadd float %5, %3 + %retval.sroa.0.4.vec.insert.i = insertelement <2 x float> %retval.sroa.0.0.vec.insert.i, float %add5.i, i32 1 + %ref.tmp.sroa.0.0.cast = bitcast %class.Complex* %arrayidx to <2 x float>* + store <2 x float> %retval.sroa.0.4.vec.insert.i, <2 x float>* %ref.tmp.sroa.0.0.cast, align 4 + ret void +} + +; CHECK-LABEL: @test_int +; CHECK: add [[BASE:x[0-9]+]], x0, x1, lsl #3 +; CHECK: ldp [[CPLX1_I:w[0-9]+]], [[CPLX1_R:w[0-9]+]], {{\[}}[[BASE]]] +; CHECK: ldp [[CPLX2_I:w[0-9]+]], [[CPLX2_R:w[0-9]+]], {{\[}}[[BASE]], #64] +; 
CHECK: add {{w[0-9]+}}, [[CPLX2_I]], [[CPLX1_I]] +; CHECK: add {{w[0-9]+}}, [[CPLX2_R]], [[CPLX1_R]] +; CHECK: ret +define void @test_int(%class.Complex_int* nocapture %out, i64 %out_start) { +entry: + %arrayidx = getelementptr inbounds %class.Complex_int* %out, i64 %out_start + %0 = bitcast %class.Complex_int* %arrayidx to i64* + %1 = load i64* %0, align 4 + %t0.sroa.0.0.extract.trunc = trunc i64 %1 to i32 + %2 = bitcast i32 %t0.sroa.0.0.extract.trunc to i32 + %t0.sroa.2.0.extract.shift = lshr i64 %1, 32 + %t0.sroa.2.0.extract.trunc = trunc i64 %t0.sroa.2.0.extract.shift to i32 + %3 = bitcast i32 %t0.sroa.2.0.extract.trunc to i32 + %add = add i64 %out_start, 8 + %arrayidx2 = getelementptr inbounds %class.Complex_int* %out, i64 %add + %i.i = getelementptr inbounds %class.Complex_int* %arrayidx2, i64 0, i32 0 + %4 = load i32* %i.i, align 4 + %add.i = add i32 %4, %2 + %retval.sroa.0.0.vec.insert.i = insertelement <2 x i32> undef, i32 %add.i, i32 0 + %r.i = getelementptr inbounds %class.Complex_int* %arrayidx2, i64 0, i32 1 + %5 = load i32* %r.i, align 4 + %add5.i = add i32 %5, %3 + %retval.sroa.0.4.vec.insert.i = insertelement <2 x i32> %retval.sroa.0.0.vec.insert.i, i32 %add5.i, i32 1 + %ref.tmp.sroa.0.0.cast = bitcast %class.Complex_int* %arrayidx to <2 x i32>* + store <2 x i32> %retval.sroa.0.4.vec.insert.i, <2 x i32>* %ref.tmp.sroa.0.0.cast, align 4 + ret void +} + +; CHECK-LABEL: @test_long +; CHECK: add [[BASE:x[0-9]+]], x0, x1, lsl #4 +; CHECK: ldp [[CPLX1_I:x[0-9]+]], [[CPLX1_R:x[0-9]+]], {{\[}}[[BASE]]] +; CHECK: ldp [[CPLX2_I:x[0-9]+]], [[CPLX2_R:x[0-9]+]], {{\[}}[[BASE]], #128] +; CHECK: add {{x[0-9]+}}, [[CPLX2_I]], [[CPLX1_I]] +; CHECK: add {{x[0-9]+}}, [[CPLX2_R]], [[CPLX1_R]] +; CHECK: ret +define void @test_long(%class.Complex_long* nocapture %out, i64 %out_start) { +entry: + %arrayidx = getelementptr inbounds %class.Complex_long* %out, i64 %out_start + %0 = bitcast %class.Complex_long* %arrayidx to i128* + %1 = load i128* %0, align 4 + %t0.sroa.0.0.extract.trunc = trunc i128 %1 to i64 + %2 = bitcast i64 %t0.sroa.0.0.extract.trunc to i64 + %t0.sroa.2.0.extract.shift = lshr i128 %1, 64 + %t0.sroa.2.0.extract.trunc = trunc i128 %t0.sroa.2.0.extract.shift to i64 + %3 = bitcast i64 %t0.sroa.2.0.extract.trunc to i64 + %add = add i64 %out_start, 8 + %arrayidx2 = getelementptr inbounds %class.Complex_long* %out, i64 %add + %i.i = getelementptr inbounds %class.Complex_long* %arrayidx2, i32 0, i32 0 + %4 = load i64* %i.i, align 4 + %add.i = add i64 %4, %2 + %retval.sroa.0.0.vec.insert.i = insertelement <2 x i64> undef, i64 %add.i, i32 0 + %r.i = getelementptr inbounds %class.Complex_long* %arrayidx2, i32 0, i32 1 + %5 = load i64* %r.i, align 4 + %add5.i = add i64 %5, %3 + %retval.sroa.0.4.vec.insert.i = insertelement <2 x i64> %retval.sroa.0.0.vec.insert.i, i64 %add5.i, i32 1 + %ref.tmp.sroa.0.0.cast = bitcast %class.Complex_long* %arrayidx to <2 x i64>* + store <2 x i64> %retval.sroa.0.4.vec.insert.i, <2 x i64>* %ref.tmp.sroa.0.0.cast, align 4 + ret void +} diff --git a/test/CodeGen/AArch64/arm64-dead-def-frame-index.ll b/test/CodeGen/AArch64/arm64-dead-def-frame-index.ll new file mode 100644 index 00000000000..9bb4b712076 --- /dev/null +++ b/test/CodeGen/AArch64/arm64-dead-def-frame-index.ll @@ -0,0 +1,18 @@ +; RUN: llc -march=arm64 < %s | FileCheck %s + +target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128" +target triple = "arm64-apple-ios7.0.0" + +; Function Attrs: nounwind ssp uwtable +define i32 @test1() #0 { + %tmp1 = alloca i8 + %tmp2 = alloca i32, i32 4096 + %tmp3 = icmp 
eq i8* %tmp1, null + %tmp4 = zext i1 %tmp3 to i32 + + ret i32 %tmp4 + + ; CHECK-LABEL: test1 + ; CHECK: adds [[TEMP:[a-z0-9]+]], sp, #4, lsl #12 + ; CHECK: adds [[TEMP]], [[TEMP]], #15 +} diff --git a/test/CodeGen/AArch64/arm64-dead-register-def-bug.ll b/test/CodeGen/AArch64/arm64-dead-register-def-bug.ll new file mode 100644 index 00000000000..1bbcf50ba73 --- /dev/null +++ b/test/CodeGen/AArch64/arm64-dead-register-def-bug.ll @@ -0,0 +1,32 @@ +; RUN: llc -mtriple="arm64-apple-ios" < %s | FileCheck %s +; +; Check that the dead register definition pass is considering implicit defs. +; When rematerializing through truncates, the coalescer may produce instructions +; with dead defs, but live implicit-defs of subregs: +; E.g. %X1 = MOVi64imm 2, %W1; %X1:GPR64, %W1:GPR32 +; These instructions are live, and their definitions should not be rewritten. +; +; + +define void @testcase() { +; CHECK: testcase: +; CHECK-NOT: orr xzr, xzr, #0x2 + +bb1: + %tmp1 = tail call float @ceilf(float 2.000000e+00) + %tmp2 = fptoui float %tmp1 to i64 + br i1 undef, label %bb2, label %bb3 + +bb2: + tail call void @foo() + br label %bb3 + +bb3: + %tmp3 = trunc i64 %tmp2 to i32 + tail call void @bar(i32 %tmp3) + ret void +} + +declare void @foo() +declare void @bar(i32) +declare float @ceilf(float) nounwind readnone diff --git a/test/CodeGen/AArch64/arm64-dup.ll b/test/CodeGen/AArch64/arm64-dup.ll new file mode 100644 index 00000000000..0c56b46c417 --- /dev/null +++ b/test/CodeGen/AArch64/arm64-dup.ll @@ -0,0 +1,323 @@ +; RUN: llc < %s -march=arm64 -aarch64-neon-syntax=apple -asm-verbose=false | FileCheck %s + +define <8 x i8> @v_dup8(i8 %A) nounwind { +;CHECK-LABEL: v_dup8: +;CHECK: dup.8b + %tmp1 = insertelement <8 x i8> zeroinitializer, i8 %A, i32 0 + %tmp2 = insertelement <8 x i8> %tmp1, i8 %A, i32 1 + %tmp3 = insertelement <8 x i8> %tmp2, i8 %A, i32 2 + %tmp4 = insertelement <8 x i8> %tmp3, i8 %A, i32 3 + %tmp5 = insertelement <8 x i8> %tmp4, i8 %A, i32 4 + %tmp6 = insertelement <8 x i8> %tmp5, i8 %A, i32 5 + %tmp7 = insertelement <8 x i8> %tmp6, i8 %A, i32 6 + %tmp8 = insertelement <8 x i8> %tmp7, i8 %A, i32 7 + ret <8 x i8> %tmp8 +} + +define <4 x i16> @v_dup16(i16 %A) nounwind { +;CHECK-LABEL: v_dup16: +;CHECK: dup.4h + %tmp1 = insertelement <4 x i16> zeroinitializer, i16 %A, i32 0 + %tmp2 = insertelement <4 x i16> %tmp1, i16 %A, i32 1 + %tmp3 = insertelement <4 x i16> %tmp2, i16 %A, i32 2 + %tmp4 = insertelement <4 x i16> %tmp3, i16 %A, i32 3 + ret <4 x i16> %tmp4 +} + +define <2 x i32> @v_dup32(i32 %A) nounwind { +;CHECK-LABEL: v_dup32: +;CHECK: dup.2s + %tmp1 = insertelement <2 x i32> zeroinitializer, i32 %A, i32 0 + %tmp2 = insertelement <2 x i32> %tmp1, i32 %A, i32 1 + ret <2 x i32> %tmp2 +} + +define <2 x float> @v_dupfloat(float %A) nounwind { +;CHECK-LABEL: v_dupfloat: +;CHECK: dup.2s + %tmp1 = insertelement <2 x float> zeroinitializer, float %A, i32 0 + %tmp2 = insertelement <2 x float> %tmp1, float %A, i32 1 + ret <2 x float> %tmp2 +} + +define <16 x i8> @v_dupQ8(i8 %A) nounwind { +;CHECK-LABEL: v_dupQ8: +;CHECK: dup.16b + %tmp1 = insertelement <16 x i8> zeroinitializer, i8 %A, i32 0 + %tmp2 = insertelement <16 x i8> %tmp1, i8 %A, i32 1 + %tmp3 = insertelement <16 x i8> %tmp2, i8 %A, i32 2 + %tmp4 = insertelement <16 x i8> %tmp3, i8 %A, i32 3 + %tmp5 = insertelement <16 x i8> %tmp4, i8 %A, i32 4 + %tmp6 = insertelement <16 x i8> %tmp5, i8 %A, i32 5 + %tmp7 = insertelement <16 x i8> %tmp6, i8 %A, i32 6 + %tmp8 = insertelement <16 x i8> %tmp7, i8 %A, i32 7 + %tmp9 = insertelement <16 x i8> %tmp8, i8 
%A, i32 8 + %tmp10 = insertelement <16 x i8> %tmp9, i8 %A, i32 9 + %tmp11 = insertelement <16 x i8> %tmp10, i8 %A, i32 10 + %tmp12 = insertelement <16 x i8> %tmp11, i8 %A, i32 11 + %tmp13 = insertelement <16 x i8> %tmp12, i8 %A, i32 12 + %tmp14 = insertelement <16 x i8> %tmp13, i8 %A, i32 13 + %tmp15 = insertelement <16 x i8> %tmp14, i8 %A, i32 14 + %tmp16 = insertelement <16 x i8> %tmp15, i8 %A, i32 15 + ret <16 x i8> %tmp16 +} + +define <8 x i16> @v_dupQ16(i16 %A) nounwind { +;CHECK-LABEL: v_dupQ16: +;CHECK: dup.8h + %tmp1 = insertelement <8 x i16> zeroinitializer, i16 %A, i32 0 + %tmp2 = insertelement <8 x i16> %tmp1, i16 %A, i32 1 + %tmp3 = insertelement <8 x i16> %tmp2, i16 %A, i32 2 + %tmp4 = insertelement <8 x i16> %tmp3, i16 %A, i32 3 + %tmp5 = insertelement <8 x i16> %tmp4, i16 %A, i32 4 + %tmp6 = insertelement <8 x i16> %tmp5, i16 %A, i32 5 + %tmp7 = insertelement <8 x i16> %tmp6, i16 %A, i32 6 + %tmp8 = insertelement <8 x i16> %tmp7, i16 %A, i32 7 + ret <8 x i16> %tmp8 +} + +define <4 x i32> @v_dupQ32(i32 %A) nounwind { +;CHECK-LABEL: v_dupQ32: +;CHECK: dup.4s + %tmp1 = insertelement <4 x i32> zeroinitializer, i32 %A, i32 0 + %tmp2 = insertelement <4 x i32> %tmp1, i32 %A, i32 1 + %tmp3 = insertelement <4 x i32> %tmp2, i32 %A, i32 2 + %tmp4 = insertelement <4 x i32> %tmp3, i32 %A, i32 3 + ret <4 x i32> %tmp4 +} + +define <4 x float> @v_dupQfloat(float %A) nounwind { +;CHECK-LABEL: v_dupQfloat: +;CHECK: dup.4s + %tmp1 = insertelement <4 x float> zeroinitializer, float %A, i32 0 + %tmp2 = insertelement <4 x float> %tmp1, float %A, i32 1 + %tmp3 = insertelement <4 x float> %tmp2, float %A, i32 2 + %tmp4 = insertelement <4 x float> %tmp3, float %A, i32 3 + ret <4 x float> %tmp4 +} + +; Check to make sure it works with shuffles, too. + +define <8 x i8> @v_shuffledup8(i8 %A) nounwind { +;CHECK-LABEL: v_shuffledup8: +;CHECK: dup.8b + %tmp1 = insertelement <8 x i8> undef, i8 %A, i32 0 + %tmp2 = shufflevector <8 x i8> %tmp1, <8 x i8> undef, <8 x i32> zeroinitializer + ret <8 x i8> %tmp2 +} + +define <4 x i16> @v_shuffledup16(i16 %A) nounwind { +;CHECK-LABEL: v_shuffledup16: +;CHECK: dup.4h + %tmp1 = insertelement <4 x i16> undef, i16 %A, i32 0 + %tmp2 = shufflevector <4 x i16> %tmp1, <4 x i16> undef, <4 x i32> zeroinitializer + ret <4 x i16> %tmp2 +} + +define <2 x i32> @v_shuffledup32(i32 %A) nounwind { +;CHECK-LABEL: v_shuffledup32: +;CHECK: dup.2s + %tmp1 = insertelement <2 x i32> undef, i32 %A, i32 0 + %tmp2 = shufflevector <2 x i32> %tmp1, <2 x i32> undef, <2 x i32> zeroinitializer + ret <2 x i32> %tmp2 +} + +define <2 x float> @v_shuffledupfloat(float %A) nounwind { +;CHECK-LABEL: v_shuffledupfloat: +;CHECK: dup.2s + %tmp1 = insertelement <2 x float> undef, float %A, i32 0 + %tmp2 = shufflevector <2 x float> %tmp1, <2 x float> undef, <2 x i32> zeroinitializer + ret <2 x float> %tmp2 +} + +define <16 x i8> @v_shuffledupQ8(i8 %A) nounwind { +;CHECK-LABEL: v_shuffledupQ8: +;CHECK: dup.16b + %tmp1 = insertelement <16 x i8> undef, i8 %A, i32 0 + %tmp2 = shufflevector <16 x i8> %tmp1, <16 x i8> undef, <16 x i32> zeroinitializer + ret <16 x i8> %tmp2 +} + +define <8 x i16> @v_shuffledupQ16(i16 %A) nounwind { +;CHECK-LABEL: v_shuffledupQ16: +;CHECK: dup.8h + %tmp1 = insertelement <8 x i16> undef, i16 %A, i32 0 + %tmp2 = shufflevector <8 x i16> %tmp1, <8 x i16> undef, <8 x i32> zeroinitializer + ret <8 x i16> %tmp2 +} + +define <4 x i32> @v_shuffledupQ32(i32 %A) nounwind { +;CHECK-LABEL: v_shuffledupQ32: +;CHECK: dup.4s + %tmp1 = insertelement <4 x i32> undef, i32 %A, i32 0 + %tmp2 = 
shufflevector <4 x i32> %tmp1, <4 x i32> undef, <4 x i32> zeroinitializer + ret <4 x i32> %tmp2 +} + +define <4 x float> @v_shuffledupQfloat(float %A) nounwind { +;CHECK-LABEL: v_shuffledupQfloat: +;CHECK: dup.4s + %tmp1 = insertelement <4 x float> undef, float %A, i32 0 + %tmp2 = shufflevector <4 x float> %tmp1, <4 x float> undef, <4 x i32> zeroinitializer + ret <4 x float> %tmp2 +} + +define <8 x i8> @vduplane8(<8 x i8>* %A) nounwind { +;CHECK-LABEL: vduplane8: +;CHECK: dup.8b + %tmp1 = load <8 x i8>* %A + %tmp2 = shufflevector <8 x i8> %tmp1, <8 x i8> undef, <8 x i32> < i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1 > + ret <8 x i8> %tmp2 +} + +define <4 x i16> @vduplane16(<4 x i16>* %A) nounwind { +;CHECK-LABEL: vduplane16: +;CHECK: dup.4h + %tmp1 = load <4 x i16>* %A + %tmp2 = shufflevector <4 x i16> %tmp1, <4 x i16> undef, <4 x i32> < i32 1, i32 1, i32 1, i32 1 > + ret <4 x i16> %tmp2 +} + +define <2 x i32> @vduplane32(<2 x i32>* %A) nounwind { +;CHECK-LABEL: vduplane32: +;CHECK: dup.2s + %tmp1 = load <2 x i32>* %A + %tmp2 = shufflevector <2 x i32> %tmp1, <2 x i32> undef, <2 x i32> < i32 1, i32 1 > + ret <2 x i32> %tmp2 +} + +define <2 x float> @vduplanefloat(<2 x float>* %A) nounwind { +;CHECK-LABEL: vduplanefloat: +;CHECK: dup.2s + %tmp1 = load <2 x float>* %A + %tmp2 = shufflevector <2 x float> %tmp1, <2 x float> undef, <2 x i32> < i32 1, i32 1 > + ret <2 x float> %tmp2 +} + +define <16 x i8> @vduplaneQ8(<8 x i8>* %A) nounwind { +;CHECK-LABEL: vduplaneQ8: +;CHECK: dup.16b + %tmp1 = load <8 x i8>* %A + %tmp2 = shufflevector <8 x i8> %tmp1, <8 x i8> undef, <16 x i32> < i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1 > + ret <16 x i8> %tmp2 +} + +define <8 x i16> @vduplaneQ16(<4 x i16>* %A) nounwind { +;CHECK-LABEL: vduplaneQ16: +;CHECK: dup.8h + %tmp1 = load <4 x i16>* %A + %tmp2 = shufflevector <4 x i16> %tmp1, <4 x i16> undef, <8 x i32> < i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1 > + ret <8 x i16> %tmp2 +} + +define <4 x i32> @vduplaneQ32(<2 x i32>* %A) nounwind { +;CHECK-LABEL: vduplaneQ32: +;CHECK: dup.4s + %tmp1 = load <2 x i32>* %A + %tmp2 = shufflevector <2 x i32> %tmp1, <2 x i32> undef, <4 x i32> < i32 1, i32 1, i32 1, i32 1 > + ret <4 x i32> %tmp2 +} + +define <4 x float> @vduplaneQfloat(<2 x float>* %A) nounwind { +;CHECK-LABEL: vduplaneQfloat: +;CHECK: dup.4s + %tmp1 = load <2 x float>* %A + %tmp2 = shufflevector <2 x float> %tmp1, <2 x float> undef, <4 x i32> < i32 1, i32 1, i32 1, i32 1 > + ret <4 x float> %tmp2 +} + +define <2 x i64> @foo(<2 x i64> %arg0_int64x1_t) nounwind readnone { +;CHECK-LABEL: foo: +;CHECK: dup.2d +entry: + %0 = shufflevector <2 x i64> %arg0_int64x1_t, <2 x i64> undef, <2 x i32> + ret <2 x i64> %0 +} + +define <2 x i64> @bar(<2 x i64> %arg0_int64x1_t) nounwind readnone { +;CHECK-LABEL: bar: +;CHECK: dup.2d +entry: + %0 = shufflevector <2 x i64> %arg0_int64x1_t, <2 x i64> undef, <2 x i32> + ret <2 x i64> %0 +} + +define <2 x double> @baz(<2 x double> %arg0_int64x1_t) nounwind readnone { +;CHECK-LABEL: baz: +;CHECK: dup.2d +entry: + %0 = shufflevector <2 x double> %arg0_int64x1_t, <2 x double> undef, <2 x i32> + ret <2 x double> %0 +} + +define <2 x double> @qux(<2 x double> %arg0_int64x1_t) nounwind readnone { +;CHECK-LABEL: qux: +;CHECK: dup.2d +entry: + %0 = shufflevector <2 x double> %arg0_int64x1_t, <2 x double> undef, <2 x i32> + ret <2 x double> %0 +} + +define <2 x i32> @f(i32 %a, i32 %b) nounwind readnone { +; CHECK-LABEL: f: +; CHECK-NEXT: fmov s0, w0 +; 
CHECK-NEXT: ins.s v0[1], w1 +; CHECK-NEXT: ret + %vecinit = insertelement <2 x i32> undef, i32 %a, i32 0 + %vecinit1 = insertelement <2 x i32> %vecinit, i32 %b, i32 1 + ret <2 x i32> %vecinit1 +} + +define <4 x i32> @g(i32 %a, i32 %b) nounwind readnone { +; CHECK-LABEL: g: +; CHECK-NEXT: fmov s0, w0 +; CHECK-NEXT: ins.s v0[1], w1 +; CHECK-NEXT: ins.s v0[2], w1 +; CHECK-NEXT: ins.s v0[3], w0 +; CHECK-NEXT: ret + %vecinit = insertelement <4 x i32> undef, i32 %a, i32 0 + %vecinit1 = insertelement <4 x i32> %vecinit, i32 %b, i32 1 + %vecinit2 = insertelement <4 x i32> %vecinit1, i32 %b, i32 2 + %vecinit3 = insertelement <4 x i32> %vecinit2, i32 %a, i32 3 + ret <4 x i32> %vecinit3 +} + +define <2 x i64> @h(i64 %a, i64 %b) nounwind readnone { +; CHECK-LABEL: h: +; CHECK-NEXT: fmov d0, x0 +; CHECK-NEXT: ins.d v0[1], x1 +; CHECK-NEXT: ret + %vecinit = insertelement <2 x i64> undef, i64 %a, i32 0 + %vecinit1 = insertelement <2 x i64> %vecinit, i64 %b, i32 1 + ret <2 x i64> %vecinit1 +} + +; We used to spot this as a BUILD_VECTOR implementable by dup, but assume that +; the single value needed was of the same type as the vector. This is false if +; the scalar corresponding to the vector type is illegal (e.g. a <4 x i16> +; BUILD_VECTOR will have an i32 as its source). In that case, the operation is +; not a simple "dup vD.4h, vN.h[idx]" after all, and we crashed. +; +; *However*, it is a dup vD.4h, vN.h[2*idx]. +define <4 x i16> @test_build_illegal(<4 x i32> %in) { +; CHECK-LABEL: test_build_illegal: +; CHECK: dup.4h v0, v0[6] + %val = extractelement <4 x i32> %in, i32 3 + %smallval = trunc i32 %val to i16 + %vec = insertelement <4x i16> undef, i16 %smallval, i32 3 + + ret <4 x i16> %vec +} + +; We used to inherit an already extract_subvectored v4i16 from +; SelectionDAGBuilder here. We then added a DUPLANE on top of that, preventing +; the formation of an indexed-by-7 MLS. +define <4 x i16> @test_high_splat(<4 x i16> %a, <4 x i16> %b, <8 x i16> %v) #0 { +; CHECK-LABEL: test_high_splat: +; CHECK: mls.4h v0, v1, v2[7] +entry: + %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> + %mul = mul <4 x i16> %shuffle, %b + %sub = sub <4 x i16> %a, %mul + ret <4 x i16> %sub +} diff --git a/test/CodeGen/AArch64/arm64-early-ifcvt.ll b/test/CodeGen/AArch64/arm64-early-ifcvt.ll new file mode 100644 index 00000000000..17d783a488f --- /dev/null +++ b/test/CodeGen/AArch64/arm64-early-ifcvt.ll @@ -0,0 +1,423 @@ +; RUN: llc < %s -stress-early-ifcvt | FileCheck %s +target triple = "arm64-apple-macosx" + +; CHECK: mm2 +define i32 @mm2(i32* nocapture %p, i32 %n) nounwind uwtable readonly ssp { +entry: + br label %do.body + +; CHECK: do.body +; Loop body has no branches before the backedge. 
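+; Machine basic-block labels are printed with an LBB prefix on this target,
+; so the CHECK-NOT below verifies that no extra block labels (and hence no
+; branches) appear between here and the backedge.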
+; CHECK-NOT: LBB +do.body: + %max.0 = phi i32 [ 0, %entry ], [ %max.1, %do.cond ] + %min.0 = phi i32 [ 0, %entry ], [ %min.1, %do.cond ] + %n.addr.0 = phi i32 [ %n, %entry ], [ %dec, %do.cond ] + %p.addr.0 = phi i32* [ %p, %entry ], [ %incdec.ptr, %do.cond ] + %incdec.ptr = getelementptr inbounds i32* %p.addr.0, i64 1 + %0 = load i32* %p.addr.0, align 4 + %cmp = icmp sgt i32 %0, %max.0 + br i1 %cmp, label %do.cond, label %if.else + +if.else: + %cmp1 = icmp slt i32 %0, %min.0 + %.min.0 = select i1 %cmp1, i32 %0, i32 %min.0 + br label %do.cond + +do.cond: + %max.1 = phi i32 [ %0, %do.body ], [ %max.0, %if.else ] + %min.1 = phi i32 [ %min.0, %do.body ], [ %.min.0, %if.else ] +; CHECK: cbnz + %dec = add i32 %n.addr.0, -1 + %tobool = icmp eq i32 %dec, 0 + br i1 %tobool, label %do.end, label %do.body + +do.end: + %sub = sub nsw i32 %max.1, %min.1 + ret i32 %sub +} + +; CHECK-LABEL: fold_inc_true_32: +; CHECK: {{subs.*wzr,|cmp}} w2, #1 +; CHECK-NEXT: csinc w0, w1, w0, eq +; CHECK-NEXT: ret +define i32 @fold_inc_true_32(i32 %x, i32 %y, i32 %c) nounwind ssp { +entry: + %tobool = icmp eq i32 %c, 1 + %inc = add nsw i32 %x, 1 + br i1 %tobool, label %eq_bb, label %done + +eq_bb: + br label %done + +done: + %cond = phi i32 [ %y, %eq_bb ], [ %inc, %entry ] + ret i32 %cond +} + +; CHECK-LABEL: fold_inc_true_64: +; CHECK: {{subs.*xzr,|cmp}} x2, #1 +; CHECK-NEXT: csinc x0, x1, x0, eq +; CHECK-NEXT: ret +define i64 @fold_inc_true_64(i64 %x, i64 %y, i64 %c) nounwind ssp { +entry: + %tobool = icmp eq i64 %c, 1 + %inc = add nsw i64 %x, 1 + br i1 %tobool, label %eq_bb, label %done + +eq_bb: + br label %done + +done: + %cond = phi i64 [ %y, %eq_bb ], [ %inc, %entry ] + ret i64 %cond +} + +; CHECK-LABEL: fold_inc_false_32: +; CHECK: {{subs.*wzr,|cmp}} w2, #1 +; CHECK-NEXT: csinc w0, w1, w0, ne +; CHECK-NEXT: ret +define i32 @fold_inc_false_32(i32 %x, i32 %y, i32 %c) nounwind ssp { +entry: + %tobool = icmp eq i32 %c, 1 + %inc = add nsw i32 %x, 1 + br i1 %tobool, label %eq_bb, label %done + +eq_bb: + br label %done + +done: + %cond = phi i32 [ %inc, %eq_bb ], [ %y, %entry ] + ret i32 %cond +} + +; CHECK-LABEL: fold_inc_false_64: +; CHECK: {{subs.*xzr,|cmp}} x2, #1 +; CHECK-NEXT: csinc x0, x1, x0, ne +; CHECK-NEXT: ret +define i64 @fold_inc_false_64(i64 %x, i64 %y, i64 %c) nounwind ssp { +entry: + %tobool = icmp eq i64 %c, 1 + %inc = add nsw i64 %x, 1 + br i1 %tobool, label %eq_bb, label %done + +eq_bb: + br label %done + +done: + %cond = phi i64 [ %inc, %eq_bb ], [ %y, %entry ] + ret i64 %cond +} + +; CHECK-LABEL: fold_inv_true_32: +; CHECK: {{subs.*wzr,|cmp}} w2, #1 +; CHECK-NEXT: csinv w0, w1, w0, eq +; CHECK-NEXT: ret +define i32 @fold_inv_true_32(i32 %x, i32 %y, i32 %c) nounwind ssp { +entry: + %tobool = icmp eq i32 %c, 1 + %inv = xor i32 %x, -1 + br i1 %tobool, label %eq_bb, label %done + +eq_bb: + br label %done + +done: + %cond = phi i32 [ %y, %eq_bb ], [ %inv, %entry ] + ret i32 %cond +} + +; CHECK-LABEL: fold_inv_true_64: +; CHECK: {{subs.*xzr,|cmp}} x2, #1 +; CHECK-NEXT: csinv x0, x1, x0, eq +; CHECK-NEXT: ret +define i64 @fold_inv_true_64(i64 %x, i64 %y, i64 %c) nounwind ssp { +entry: + %tobool = icmp eq i64 %c, 1 + %inv = xor i64 %x, -1 + br i1 %tobool, label %eq_bb, label %done + +eq_bb: + br label %done + +done: + %cond = phi i64 [ %y, %eq_bb ], [ %inv, %entry ] + ret i64 %cond +} + +; CHECK-LABEL: fold_inv_false_32: +; CHECK: {{subs.*wzr,|cmp}} w2, #1 +; CHECK-NEXT: csinv w0, w1, w0, ne +; CHECK-NEXT: ret +define i32 @fold_inv_false_32(i32 %x, i32 %y, i32 %c) nounwind ssp { +entry: + %tobool = icmp 
eq i32 %c, 1 + %inv = xor i32 %x, -1 + br i1 %tobool, label %eq_bb, label %done + +eq_bb: + br label %done + +done: + %cond = phi i32 [ %inv, %eq_bb ], [ %y, %entry ] + ret i32 %cond +} + +; CHECK-LABEL: fold_inv_false_64: +; CHECK: {{subs.*xzr,|cmp}} x2, #1 +; CHECK-NEXT: csinv x0, x1, x0, ne +; CHECK-NEXT: ret +define i64 @fold_inv_false_64(i64 %x, i64 %y, i64 %c) nounwind ssp { +entry: + %tobool = icmp eq i64 %c, 1 + %inv = xor i64 %x, -1 + br i1 %tobool, label %eq_bb, label %done + +eq_bb: + br label %done + +done: + %cond = phi i64 [ %inv, %eq_bb ], [ %y, %entry ] + ret i64 %cond +} + +; CHECK-LABEL: fold_neg_true_32: +; CHECK: {{subs.*wzr,|cmp}} w2, #1 +; CHECK-NEXT: csneg w0, w1, w0, eq +; CHECK-NEXT: ret +define i32 @fold_neg_true_32(i32 %x, i32 %y, i32 %c) nounwind ssp { +entry: + %tobool = icmp eq i32 %c, 1 + %neg = sub nsw i32 0, %x + br i1 %tobool, label %eq_bb, label %done + +eq_bb: + br label %done + +done: + %cond = phi i32 [ %y, %eq_bb ], [ %neg, %entry ] + ret i32 %cond +} + +; CHECK-LABEL: fold_neg_true_64: +; CHECK: {{subs.*xzr,|cmp}} x2, #1 +; CHECK-NEXT: csneg x0, x1, x0, eq +; CHECK-NEXT: ret +define i64 @fold_neg_true_64(i64 %x, i64 %y, i64 %c) nounwind ssp { +entry: + %tobool = icmp eq i64 %c, 1 + %neg = sub nsw i64 0, %x + br i1 %tobool, label %eq_bb, label %done + +eq_bb: + br label %done + +done: + %cond = phi i64 [ %y, %eq_bb ], [ %neg, %entry ] + ret i64 %cond +} + +; CHECK-LABEL: fold_neg_false_32: +; CHECK: {{subs.*wzr,|cmp}} w2, #1 +; CHECK-NEXT: csneg w0, w1, w0, ne +; CHECK-NEXT: ret +define i32 @fold_neg_false_32(i32 %x, i32 %y, i32 %c) nounwind ssp { +entry: + %tobool = icmp eq i32 %c, 1 + %neg = sub nsw i32 0, %x + br i1 %tobool, label %eq_bb, label %done + +eq_bb: + br label %done + +done: + %cond = phi i32 [ %neg, %eq_bb ], [ %y, %entry ] + ret i32 %cond +} + +; CHECK-LABEL: fold_neg_false_64: +; CHECK: {{subs.*xzr,|cmp}} x2, #1 +; CHECK-NEXT: csneg x0, x1, x0, ne +; CHECK-NEXT: ret +define i64 @fold_neg_false_64(i64 %x, i64 %y, i64 %c) nounwind ssp { +entry: + %tobool = icmp eq i64 %c, 1 + %neg = sub nsw i64 0, %x + br i1 %tobool, label %eq_bb, label %done + +eq_bb: + br label %done + +done: + %cond = phi i64 [ %neg, %eq_bb ], [ %y, %entry ] + ret i64 %cond +} + +; CHECK: cbnz_32 +; CHECK: {{subs.*wzr,|cmp}} w2, #0 +; CHECK-NEXT: csel w0, w1, w0, ne +; CHECK-NEXT: ret +define i32 @cbnz_32(i32 %x, i32 %y, i32 %c) nounwind ssp { +entry: + %tobool = icmp eq i32 %c, 0 + br i1 %tobool, label %eq_bb, label %done + +eq_bb: + br label %done + +done: + %cond = phi i32 [ %x, %eq_bb ], [ %y, %entry ] + ret i32 %cond +} + +; CHECK: cbnz_64 +; CHECK: {{subs.*xzr,|cmp}} x2, #0 +; CHECK-NEXT: csel x0, x1, x0, ne +; CHECK-NEXT: ret +define i64 @cbnz_64(i64 %x, i64 %y, i64 %c) nounwind ssp { +entry: + %tobool = icmp eq i64 %c, 0 + br i1 %tobool, label %eq_bb, label %done + +eq_bb: + br label %done + +done: + %cond = phi i64 [ %x, %eq_bb ], [ %y, %entry ] + ret i64 %cond +} + +; CHECK: cbz_32 +; CHECK: {{subs.*wzr,|cmp}} w2, #0 +; CHECK-NEXT: csel w0, w1, w0, eq +; CHECK-NEXT: ret +define i32 @cbz_32(i32 %x, i32 %y, i32 %c) nounwind ssp { +entry: + %tobool = icmp ne i32 %c, 0 + br i1 %tobool, label %ne_bb, label %done + +ne_bb: + br label %done + +done: + %cond = phi i32 [ %x, %ne_bb ], [ %y, %entry ] + ret i32 %cond +} + +; CHECK: cbz_64 +; CHECK: {{subs.*xzr,|cmp}} x2, #0 +; CHECK-NEXT: csel x0, x1, x0, eq +; CHECK-NEXT: ret +define i64 @cbz_64(i64 %x, i64 %y, i64 %c) nounwind ssp { +entry: + %tobool = icmp ne i64 %c, 0 + br i1 %tobool, label %ne_bb, label %done + 
+ne_bb: + br label %done + +done: + %cond = phi i64 [ %x, %ne_bb ], [ %y, %entry ] + ret i64 %cond +} + +; CHECK: tbnz_32 +; CHECK: {{ands.*xzr,|tst}} w2, #0x80 +; CHECK-NEXT: csel w0, w1, w0, ne +; CHECK-NEXT: ret +define i32 @tbnz_32(i32 %x, i32 %y, i32 %c) nounwind ssp { +entry: + %mask = and i32 %c, 128 + %tobool = icmp eq i32 %mask, 0 + br i1 %tobool, label %eq_bb, label %done + +eq_bb: + br label %done + +done: + %cond = phi i32 [ %x, %eq_bb ], [ %y, %entry ] + ret i32 %cond +} + +; CHECK: tbnz_64 +; CHECK: {{ands.*xzr,|tst}} x2, #0x8000000000000000 +; CHECK-NEXT: csel x0, x1, x0, ne +; CHECK-NEXT: ret +define i64 @tbnz_64(i64 %x, i64 %y, i64 %c) nounwind ssp { +entry: + %mask = and i64 %c, 9223372036854775808 + %tobool = icmp eq i64 %mask, 0 + br i1 %tobool, label %eq_bb, label %done + +eq_bb: + br label %done + +done: + %cond = phi i64 [ %x, %eq_bb ], [ %y, %entry ] + ret i64 %cond +} + +; CHECK: tbz_32 +; CHECK: {{ands.*xzr,|tst}} w2, #0x80 +; CHECK-NEXT: csel w0, w1, w0, eq +; CHECK-NEXT: ret +define i32 @tbz_32(i32 %x, i32 %y, i32 %c) nounwind ssp { +entry: + %mask = and i32 %c, 128 + %tobool = icmp ne i32 %mask, 0 + br i1 %tobool, label %ne_bb, label %done + +ne_bb: + br label %done + +done: + %cond = phi i32 [ %x, %ne_bb ], [ %y, %entry ] + ret i32 %cond +} + +; CHECK: tbz_64 +; CHECK: {{ands.*xzr,|tst}} x2, #0x8000000000000000 +; CHECK-NEXT: csel x0, x1, x0, eq +; CHECK-NEXT: ret +define i64 @tbz_64(i64 %x, i64 %y, i64 %c) nounwind ssp { +entry: + %mask = and i64 %c, 9223372036854775808 + %tobool = icmp ne i64 %mask, 0 + br i1 %tobool, label %ne_bb, label %done + +ne_bb: + br label %done + +done: + %cond = phi i64 [ %x, %ne_bb ], [ %y, %entry ] + ret i64 %cond +} + +; This function from 175.vpr folds an ADDWri into a CSINC. +; Remember to clear the kill flag on the ADDWri. 
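+; (After the fold the add's source register is still read by the csinc, so a
+; stale kill flag on that operand would mislead liveness tracking in later
+; passes.)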
+define i32 @get_ytrack_to_xtracks() nounwind ssp { +entry: + br label %for.body + +for.body: + %x0 = load i32* undef, align 4 + br i1 undef, label %if.then.i146, label %is_sbox.exit155 + +if.then.i146: + %add8.i143 = add nsw i32 0, %x0 + %rem.i144 = srem i32 %add8.i143, %x0 + %add9.i145 = add i32 %rem.i144, 1 + br label %is_sbox.exit155 + +is_sbox.exit155: ; preds = %if.then.i146, %for.body + %seg_offset.0.i151 = phi i32 [ %add9.i145, %if.then.i146 ], [ undef, %for.body ] + %idxprom15.i152 = sext i32 %seg_offset.0.i151 to i64 + %arrayidx18.i154 = getelementptr inbounds i32* null, i64 %idxprom15.i152 + %x1 = load i32* %arrayidx18.i154, align 4 + br i1 undef, label %for.body51, label %for.body + +for.body51: ; preds = %is_sbox.exit155 + call fastcc void @get_switch_type(i32 %x1, i32 undef, i16 signext undef, i16 signext undef, i16* undef) + unreachable +} +declare fastcc void @get_switch_type(i32, i32, i16 signext, i16 signext, i16* nocapture) nounwind ssp diff --git a/test/CodeGen/AArch64/arm64-elf-calls.ll b/test/CodeGen/AArch64/arm64-elf-calls.ll new file mode 100644 index 00000000000..8c4020327b9 --- /dev/null +++ b/test/CodeGen/AArch64/arm64-elf-calls.ll @@ -0,0 +1,20 @@ +; RUN: llc -mtriple=arm64-linux-gnu -o - %s | FileCheck %s +; RUN: llc -mtriple=arm64-linux-gnu -filetype=obj -o - %s | llvm-objdump -triple=arm64-linux-gnu - -r | FileCheck %s --check-prefix=CHECK-OBJ + +declare void @callee() + +define void @caller() { + call void @callee() + ret void +; CHECK-LABEL: caller: +; CHECK: bl callee +; CHECK-OBJ: R_AARCH64_CALL26 callee +} + +define void @tail_caller() { + tail call void @callee() + ret void +; CHECK-LABEL: tail_caller: +; CHECK: b callee +; CHECK-OBJ: R_AARCH64_JUMP26 callee +} diff --git a/test/CodeGen/AArch64/arm64-elf-constpool.ll b/test/CodeGen/AArch64/arm64-elf-constpool.ll new file mode 100644 index 00000000000..95d334376b7 --- /dev/null +++ b/test/CodeGen/AArch64/arm64-elf-constpool.ll @@ -0,0 +1,13 @@ +; RUN: llc -mtriple=arm64-linux-gnu -o - %s | FileCheck %s +; RUN: llc -mtriple=arm64-linux-gnu -O0 -o - %s | FileCheck %s + +; O0 checked for fastisel purposes. It has a separate path which +; creates a constpool entry for floating values. 
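+; Both RUN lines share the same CHECK prefix: whichever path materialises the
+; constant, the expected pattern is an adrp of the .LCPI label followed by an
+; ldr from its :lo12: offset.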
+ +define double @needs_const() { + ret double 3.14159 +; CHECK: .LCPI0_0: + +; CHECK: adrp {{x[0-9]+}}, .LCPI0_0 +; CHECK: ldr d0, [{{x[0-9]+}}, :lo12:.LCPI0_0] +} diff --git a/test/CodeGen/AArch64/arm64-elf-globals.ll b/test/CodeGen/AArch64/arm64-elf-globals.ll new file mode 100644 index 00000000000..4ed44e7c17a --- /dev/null +++ b/test/CodeGen/AArch64/arm64-elf-globals.ll @@ -0,0 +1,115 @@ +; RUN: llc -mtriple=arm64-linux-gnu -o - %s -mcpu=cyclone | FileCheck %s +; RUN: llc -mtriple=arm64-linux-gnu -o - %s -O0 -mcpu=cyclone | FileCheck %s --check-prefix=CHECK-FAST +; RUN: llc -mtriple=arm64-linux-gnu -relocation-model=pic -o - %s -mcpu=cyclone | FileCheck %s --check-prefix=CHECK-PIC +; RUN: llc -mtriple=arm64-linux-gnu -O0 -relocation-model=pic -o - %s -mcpu=cyclone | FileCheck %s --check-prefix=CHECK-FAST-PIC + +@var8 = external global i8, align 1 +@var16 = external global i16, align 2 +@var32 = external global i32, align 4 +@var64 = external global i64, align 8 + +define i8 @test_i8(i8 %new) { + %val = load i8* @var8, align 1 + store i8 %new, i8* @var8 + ret i8 %val +; CHECK-LABEL: test_i8: +; CHECK: adrp x[[HIREG:[0-9]+]], var8 +; CHECK: ldrb {{w[0-9]+}}, [x[[HIREG]], :lo12:var8] +; CHECK: strb {{w[0-9]+}}, [x[[HIREG]], :lo12:var8] + +; CHECK-PIC-LABEL: test_i8: +; CHECK-PIC: adrp x[[HIREG:[0-9]+]], :got:var8 +; CHECK-PIC: ldr x[[VAR_ADDR:[0-9]+]], [x[[HIREG]], :got_lo12:var8] +; CHECK-PIC: ldrb {{w[0-9]+}}, [x[[VAR_ADDR]]] + +; CHECK-FAST: adrp x[[HIREG:[0-9]+]], var8 +; CHECK-FAST: ldrb {{w[0-9]+}}, [x[[HIREG]], :lo12:var8] + +; CHECK-FAST-PIC: adrp x[[HIREG:[0-9]+]], :got:var8 +; CHECK-FAST-PIC: ldr x[[VARADDR:[0-9]+]], [x[[HIREG]], :got_lo12:var8] +; CHECK-FAST-PIC: ldr {{w[0-9]+}}, [x[[VARADDR]]] +} + +define i16 @test_i16(i16 %new) { + %val = load i16* @var16, align 2 + store i16 %new, i16* @var16 + ret i16 %val +; CHECK-LABEL: test_i16: +; CHECK: adrp x[[HIREG:[0-9]+]], var16 +; CHECK: ldrh {{w[0-9]+}}, [x[[HIREG]], :lo12:var16] +; CHECK: strh {{w[0-9]+}}, [x[[HIREG]], :lo12:var16] + +; CHECK-FAST: adrp x[[HIREG:[0-9]+]], var16 +; CHECK-FAST: ldrh {{w[0-9]+}}, [x[[HIREG]], :lo12:var16] +} + +define i32 @test_i32(i32 %new) { + %val = load i32* @var32, align 4 + store i32 %new, i32* @var32 + ret i32 %val +; CHECK-LABEL: test_i32: +; CHECK: adrp x[[HIREG:[0-9]+]], var32 +; CHECK: ldr {{w[0-9]+}}, [x[[HIREG]], :lo12:var32] +; CHECK: str {{w[0-9]+}}, [x[[HIREG]], :lo12:var32] + +; CHECK-FAST: adrp x[[HIREG:[0-9]+]], var32 +; CHECK-FAST: add {{x[0-9]+}}, x[[HIREG]], :lo12:var32 +} + +define i64 @test_i64(i64 %new) { + %val = load i64* @var64, align 8 + store i64 %new, i64* @var64 + ret i64 %val +; CHECK-LABEL: test_i64: +; CHECK: adrp x[[HIREG:[0-9]+]], var64 +; CHECK: ldr {{x[0-9]+}}, [x[[HIREG]], :lo12:var64] +; CHECK: str {{x[0-9]+}}, [x[[HIREG]], :lo12:var64] + +; CHECK-FAST: adrp x[[HIREG:[0-9]+]], var64 +; CHECK-FAST: add {{x[0-9]+}}, x[[HIREG]], :lo12:var64 +} + +define i64* @test_addr() { + ret i64* @var64 +; CHECK-LABEL: test_addr: +; CHECK: adrp [[HIREG:x[0-9]+]], var64 +; CHECK: add x0, [[HIREG]], :lo12:var64 + +; CHECK-FAST: adrp [[HIREG:x[0-9]+]], var64 +; CHECK-FAST: add x0, [[HIREG]], :lo12:var64 +} + +@hiddenvar = hidden global i32 0, align 4 +@protectedvar = protected global i32 0, align 4 + +define i32 @test_vis() { + %lhs = load i32* @hiddenvar, align 4 + %rhs = load i32* @protectedvar, align 4 + %ret = add i32 %lhs, %rhs + ret i32 %ret +; CHECK-PIC: adrp {{x[0-9]+}}, hiddenvar +; CHECK-PIC: ldr {{w[0-9]+}}, [{{x[0-9]+}}, :lo12:hiddenvar] +; CHECK-PIC: adrp 
{{x[0-9]+}}, protectedvar +; CHECK-PIC: ldr {{w[0-9]+}}, [{{x[0-9]+}}, :lo12:protectedvar] +} + +@var_default = external global [2 x i32] + +define i32 @test_default_align() { + %addr = getelementptr [2 x i32]* @var_default, i32 0, i32 0 + %val = load i32* %addr + ret i32 %val +; CHECK-LABEL: test_default_align: +; CHECK: adrp x[[HIREG:[0-9]+]], var_default +; CHECK: ldr w0, [x[[HIREG]], :lo12:var_default] +} + +define i64 @test_default_unaligned() { + %addr = bitcast [2 x i32]* @var_default to i64* + %val = load i64* %addr + ret i64 %val +; CHECK-LABEL: test_default_unaligned: +; CHECK: adrp [[HIREG:x[0-9]+]], var_default +; CHECK: add x[[ADDR:[0-9]+]], [[HIREG]], :lo12:var_default +; CHECK: ldr x0, [x[[ADDR]]] +} diff --git a/test/CodeGen/AArch64/arm64-ext.ll b/test/CodeGen/AArch64/arm64-ext.ll new file mode 100644 index 00000000000..67860de51b0 --- /dev/null +++ b/test/CodeGen/AArch64/arm64-ext.ll @@ -0,0 +1,118 @@ +; RUN: llc < %s -march=arm64 -aarch64-neon-syntax=apple | FileCheck %s + +define <8 x i8> @test_vextd(<8 x i8>* %A, <8 x i8>* %B) nounwind { +;CHECK-LABEL: test_vextd: +;CHECK: {{ext.8b.*#3}} + %tmp1 = load <8 x i8>* %A + %tmp2 = load <8 x i8>* %B + %tmp3 = shufflevector <8 x i8> %tmp1, <8 x i8> %tmp2, <8 x i32> + ret <8 x i8> %tmp3 +} + +define <8 x i8> @test_vextRd(<8 x i8>* %A, <8 x i8>* %B) nounwind { +;CHECK-LABEL: test_vextRd: +;CHECK: {{ext.8b.*#5}} + %tmp1 = load <8 x i8>* %A + %tmp2 = load <8 x i8>* %B + %tmp3 = shufflevector <8 x i8> %tmp1, <8 x i8> %tmp2, <8 x i32> + ret <8 x i8> %tmp3 +} + +define <16 x i8> @test_vextq(<16 x i8>* %A, <16 x i8>* %B) nounwind { +;CHECK-LABEL: test_vextq: +;CHECK: {{ext.16b.*3}} + %tmp1 = load <16 x i8>* %A + %tmp2 = load <16 x i8>* %B + %tmp3 = shufflevector <16 x i8> %tmp1, <16 x i8> %tmp2, <16 x i32> + ret <16 x i8> %tmp3 +} + +define <16 x i8> @test_vextRq(<16 x i8>* %A, <16 x i8>* %B) nounwind { +;CHECK-LABEL: test_vextRq: +;CHECK: {{ext.16b.*7}} + %tmp1 = load <16 x i8>* %A + %tmp2 = load <16 x i8>* %B + %tmp3 = shufflevector <16 x i8> %tmp1, <16 x i8> %tmp2, <16 x i32> + ret <16 x i8> %tmp3 +} + +define <4 x i16> @test_vextd16(<4 x i16>* %A, <4 x i16>* %B) nounwind { +;CHECK-LABEL: test_vextd16: +;CHECK: {{ext.8b.*#6}} + %tmp1 = load <4 x i16>* %A + %tmp2 = load <4 x i16>* %B + %tmp3 = shufflevector <4 x i16> %tmp1, <4 x i16> %tmp2, <4 x i32> + ret <4 x i16> %tmp3 +} + +define <4 x i32> @test_vextq32(<4 x i32>* %A, <4 x i32>* %B) nounwind { +;CHECK-LABEL: test_vextq32: +;CHECK: {{ext.16b.*12}} + %tmp1 = load <4 x i32>* %A + %tmp2 = load <4 x i32>* %B + %tmp3 = shufflevector <4 x i32> %tmp1, <4 x i32> %tmp2, <4 x i32> + ret <4 x i32> %tmp3 +} + +; Undef shuffle indices should not prevent matching to VEXT: + +define <8 x i8> @test_vextd_undef(<8 x i8>* %A, <8 x i8>* %B) nounwind { +;CHECK-LABEL: test_vextd_undef: +;CHECK: {{ext.8b.*}} + %tmp1 = load <8 x i8>* %A + %tmp2 = load <8 x i8>* %B + %tmp3 = shufflevector <8 x i8> %tmp1, <8 x i8> %tmp2, <8 x i32> + ret <8 x i8> %tmp3 +} + +define <8 x i8> @test_vextd_undef2(<8 x i8>* %A, <8 x i8>* %B) nounwind { +;CHECK-LABEL: test_vextd_undef2: +;CHECK: {{ext.8b.*#6}} + %tmp1 = load <8 x i8>* %A + %tmp2 = load <8 x i8>* %B + %tmp3 = shufflevector <8 x i8> %tmp1, <8 x i8> %tmp2, <8 x i32> + ret <8 x i8> %tmp3 +} + +define <16 x i8> @test_vextRq_undef(<16 x i8>* %A, <16 x i8>* %B) nounwind { +;CHECK-LABEL: test_vextRq_undef: +;CHECK: {{ext.16b.*#7}} + %tmp1 = load <16 x i8>* %A + %tmp2 = load <16 x i8>* %B + %tmp3 = shufflevector <16 x i8> %tmp1, <16 x i8> %tmp2, <16 x i32> + ret <16 x 
+}
+
+define <8 x i16> @test_vextRq_undef2(<8 x i16>* %A) nounwind {
+;CHECK-LABEL: test_vextRq_undef2:
+;CHECK: {{ext.16b.*#10}}
+ %tmp1 = load <8 x i16>* %A
+ %vext = shufflevector <8 x i16> %tmp1, <8 x i16> undef, <8 x i32>
+ ret <8 x i16> %vext;
+}
+
+; Tests for ReconstructShuffle function. Indices have to be carefully
+; chosen to reach lowering phase as a BUILD_VECTOR.
+
+; One vector needs vext, the other can be handled by extract_subvector
+; Also checks interleaving of sources is handled correctly.
+; Essence: a vext is used on %A and something saner than stack load/store for final result.
+define <4 x i16> @test_interleaved(<8 x i16>* %A, <8 x i16>* %B) nounwind {
+;CHECK-LABEL: test_interleaved:
+;CHECK: ext.8b
+;CHECK: zip1.4h
+ %tmp1 = load <8 x i16>* %A
+ %tmp2 = load <8 x i16>* %B
+ %tmp3 = shufflevector <8 x i16> %tmp1, <8 x i16> %tmp2, <4 x i32>
+ ret <4 x i16> %tmp3
+}
+
+; An undef in the shuffle list should still be optimizable
+define <4 x i16> @test_undef(<8 x i16>* %A, <8 x i16>* %B) nounwind {
+;CHECK-LABEL: test_undef:
+;CHECK: zip1.4h
+ %tmp1 = load <8 x i16>* %A
+ %tmp2 = load <8 x i16>* %B
+ %tmp3 = shufflevector <8 x i16> %tmp1, <8 x i16> %tmp2, <4 x i32>
+ ret <4 x i16> %tmp3
+}
diff --git a/test/CodeGen/AArch64/arm64-extend-int-to-fp.ll b/test/CodeGen/AArch64/arm64-extend-int-to-fp.ll
new file mode 100644
index 00000000000..048fdb083a4
--- /dev/null
+++ b/test/CodeGen/AArch64/arm64-extend-int-to-fp.ll
@@ -0,0 +1,19 @@
+; RUN: llc < %s -march=arm64 -aarch64-neon-syntax=apple | FileCheck %s
+
+define <4 x float> @foo(<4 x i16> %a) nounwind {
+; CHECK-LABEL: foo:
+; CHECK: ushll.4s v0, v0, #0
+; CHECK-NEXT: ucvtf.4s v0, v0
+; CHECK-NEXT: ret
+ %vcvt.i = uitofp <4 x i16> %a to <4 x float>
+ ret <4 x float> %vcvt.i
+}
+
+define <4 x float> @bar(<4 x i16> %a) nounwind {
+; CHECK-LABEL: bar:
+; CHECK: sshll.4s v0, v0, #0
+; CHECK-NEXT: scvtf.4s v0, v0
+; CHECK-NEXT: ret
+ %vcvt.i = sitofp <4 x i16> %a to <4 x float>
+ ret <4 x float> %vcvt.i
+}
diff --git a/test/CodeGen/AArch64/arm64-extend.ll b/test/CodeGen/AArch64/arm64-extend.ll
new file mode 100644
index 00000000000..afcaca2c492
--- /dev/null
+++ b/test/CodeGen/AArch64/arm64-extend.ll
@@ -0,0 +1,15 @@
+; RUN: llc < %s -mtriple=arm64-apple-ios | FileCheck %s
+@array = external global [0 x i32]
+
+define i64 @foo(i32 %i) {
+; CHECK: foo
+; CHECK: adrp x[[REG:[0-9]+]], _array@GOTPAGE
+; CHECK: ldr x[[REG1:[0-9]+]], [x[[REG]], _array@GOTPAGEOFF]
+; CHECK: ldrsw x0, [x[[REG1]], w0, sxtw #2]
+; CHECK: ret
+ %idxprom = sext i32 %i to i64
+ %arrayidx = getelementptr inbounds [0 x i32]* @array, i64 0, i64 %idxprom
+ %tmp1 = load i32* %arrayidx, align 4
+ %conv = sext i32 %tmp1 to i64
+ ret i64 %conv
+}
diff --git a/test/CodeGen/AArch64/arm64-extern-weak.ll b/test/CodeGen/AArch64/arm64-extern-weak.ll
new file mode 100644
index 00000000000..a239403befa
--- /dev/null
+++ b/test/CodeGen/AArch64/arm64-extern-weak.ll
@@ -0,0 +1,51 @@
+; RUN: llc -mtriple=arm64-none-linux-gnu -o - < %s | FileCheck %s
+; RUN: llc -mtriple=arm64-none-linux-gnu -code-model=large -o - < %s | FileCheck --check-prefix=CHECK-LARGE %s
+
+declare extern_weak i32 @var()
+
+define i32()* @foo() {
+; The usual ADRP/ADD pair can't be used for a weak reference because it must
+; evaluate to 0 if the symbol is undefined. We use a GOT entry instead.
+ ret i32()* @var
+
+; CHECK: adrp x[[VAR:[0-9]+]], :got:var
+; CHECK: ldr x0, [x[[VAR]], :got_lo12:var]
+
+ ; In the large model, the usual relocations are absolute and can
+ ; materialise 0.
+; CHECK-LARGE: movz x0, #:abs_g3:var
+; CHECK-LARGE: movk x0, #:abs_g2_nc:var
+; CHECK-LARGE: movk x0, #:abs_g1_nc:var
+; CHECK-LARGE: movk x0, #:abs_g0_nc:var
+}
+
+
+@arr_var = extern_weak global [10 x i32]
+
+define i32* @bar() {
+ %addr = getelementptr [10 x i32]* @arr_var, i32 0, i32 5
+; CHECK: adrp x[[ARR_VAR_HI:[0-9]+]], :got:arr_var
+; CHECK: ldr [[ARR_VAR:x[0-9]+]], [x[[ARR_VAR_HI]], :got_lo12:arr_var]
+; CHECK: add x0, [[ARR_VAR]], #20
+ ret i32* %addr
+
+ ; In the large model, the usual relocations are absolute and can
+ ; materialise 0.
+; CHECK-LARGE: movz [[ARR_VAR:x[0-9]+]], #:abs_g3:arr_var
+; CHECK-LARGE: movk [[ARR_VAR]], #:abs_g2_nc:arr_var
+; CHECK-LARGE: movk [[ARR_VAR]], #:abs_g1_nc:arr_var
+; CHECK-LARGE: movk [[ARR_VAR]], #:abs_g0_nc:arr_var
+}
+
+@defined_weak_var = internal unnamed_addr global i32 0
+
+define i32* @wibble() {
+ ret i32* @defined_weak_var
+; CHECK: adrp [[BASE:x[0-9]+]], defined_weak_var
+; CHECK: add x0, [[BASE]], :lo12:defined_weak_var
+
+; CHECK-LARGE: movz x0, #:abs_g3:defined_weak_var
+; CHECK-LARGE: movk x0, #:abs_g2_nc:defined_weak_var
+; CHECK-LARGE: movk x0, #:abs_g1_nc:defined_weak_var
+; CHECK-LARGE: movk x0, #:abs_g0_nc:defined_weak_var
+}
diff --git a/test/CodeGen/AArch64/arm64-extload-knownzero.ll b/test/CodeGen/AArch64/arm64-extload-knownzero.ll
new file mode 100644
index 00000000000..14e5fd310d7
--- /dev/null
+++ b/test/CodeGen/AArch64/arm64-extload-knownzero.ll
@@ -0,0 +1,28 @@
+; RUN: llc < %s -march=arm64 | FileCheck %s
+; rdar://12771555
+
+define void @foo(i16* %ptr, i32 %a) nounwind {
+entry:
+; CHECK-LABEL: foo:
+ %tmp1 = icmp ult i32 %a, 100
+ br i1 %tmp1, label %bb1, label %bb2
+bb1:
+; CHECK: %bb1
+; CHECK: ldrh [[REG:w[0-9]+]]
+ %tmp2 = load i16* %ptr, align 2
+ br label %bb2
+bb2:
+; CHECK: %bb2
+; CHECK-NOT: and {{w[0-9]+}}, [[REG]], #0xffff
+; CHECK: cmp [[REG]], #23
+ %tmp3 = phi i16 [ 0, %entry ], [ %tmp2, %bb1 ]
+ %cmp = icmp ult i16 %tmp3, 24
+ br i1 %cmp, label %bb3, label %exit
+bb3:
+ call void @bar() nounwind
+ br label %exit
+exit:
+ ret void
+}
+
+declare void @bar ()
diff --git a/test/CodeGen/AArch64/arm64-extract.ll b/test/CodeGen/AArch64/arm64-extract.ll
new file mode 100644
index 00000000000..01984662d23
--- /dev/null
+++ b/test/CodeGen/AArch64/arm64-extract.ll
@@ -0,0 +1,58 @@
+; RUN: llc -aarch64-extr-generation=true -verify-machineinstrs < %s \
+; RUN: -march=arm64 | FileCheck %s
+
+define i64 @ror_i64(i64 %in) {
+; CHECK-LABEL: ror_i64:
+ %left = shl i64 %in, 19
+ %right = lshr i64 %in, 45
+ %val5 = or i64 %left, %right
+; CHECK: ror {{x[0-9]+}}, x0, #45
+ ret i64 %val5
+}
+
+define i32 @ror_i32(i32 %in) {
+; CHECK-LABEL: ror_i32:
+ %left = shl i32 %in, 9
+ %right = lshr i32 %in, 23
+ %val5 = or i32 %left, %right
+; CHECK: ror {{w[0-9]+}}, w0, #23
+ ret i32 %val5
+}
+
+define i32 @extr_i32(i32 %lhs, i32 %rhs) {
+; CHECK-LABEL: extr_i32:
+ %left = shl i32 %lhs, 6
+ %right = lshr i32 %rhs, 26
+ %val = or i32 %left, %right
+ ; Order of lhs and rhs matters here. Regalloc would have to be very odd to use
+ ; something other than w0 and w1.
+; CHECK: extr {{w[0-9]+}}, w0, w1, #26
+
+ ret i32 %val
+}
+
+define i64 @extr_i64(i64 %lhs, i64 %rhs) {
+; CHECK-LABEL: extr_i64:
+ %right = lshr i64 %rhs, 40
+ %left = shl i64 %lhs, 24
+ %val = or i64 %right, %left
+ ; Order of lhs and rhs matters here. Regalloc would have to be very odd to use
+ ; something other than w0 and w1.
+; CHECK: extr {{x[0-9]+}}, x0, x1, #40
+
+ ret i64 %val
+}
+
+; Regression test: a bad experimental pattern crept into git which optimised
+; this pattern to a single EXTR.
+define i32 @extr_regress(i32 %a, i32 %b) {
+; CHECK-LABEL: extr_regress:
+
+ %sh1 = shl i32 %a, 14
+ %sh2 = lshr i32 %b, 14
+ %val = or i32 %sh2, %sh1
+; CHECK-NOT: extr {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, #{{[0-9]+}}
+
+ ret i32 %val
+; CHECK: ret
+}
diff --git a/test/CodeGen/AArch64/arm64-extract_subvector.ll b/test/CodeGen/AArch64/arm64-extract_subvector.ll
new file mode 100644
index 00000000000..8b15a6453b2
--- /dev/null
+++ b/test/CodeGen/AArch64/arm64-extract_subvector.ll
@@ -0,0 +1,51 @@
+; RUN: llc -march=arm64 -aarch64-neon-syntax=apple < %s | FileCheck %s
+
+; Extract of an upper half of a vector is an "ext.16b v0, v0, v0, #8" insn.
+
+define <8 x i8> @v8i8(<16 x i8> %a) nounwind {
+; CHECK: v8i8
+; CHECK: ext.16b v0, v0, v0, #8
+; CHECK: ret
+ %ret = shufflevector <16 x i8> %a, <16 x i8> %a, <8 x i32>
+ ret <8 x i8> %ret
+}
+
+define <4 x i16> @v4i16(<8 x i16> %a) nounwind {
+; CHECK-LABEL: v4i16:
+; CHECK: ext.16b v0, v0, v0, #8
+; CHECK: ret
+ %ret = shufflevector <8 x i16> %a, <8 x i16> %a, <4 x i32>
+ ret <4 x i16> %ret
+}
+
+define <2 x i32> @v2i32(<4 x i32> %a) nounwind {
+; CHECK-LABEL: v2i32:
+; CHECK: ext.16b v0, v0, v0, #8
+; CHECK: ret
+ %ret = shufflevector <4 x i32> %a, <4 x i32> %a, <2 x i32>
+ ret <2 x i32> %ret
+}
+
+define <1 x i64> @v1i64(<2 x i64> %a) nounwind {
+; CHECK-LABEL: v1i64:
+; CHECK: ext.16b v0, v0, v0, #8
+; CHECK: ret
+ %ret = shufflevector <2 x i64> %a, <2 x i64> %a, <1 x i32>
+ ret <1 x i64> %ret
+}
+
+define <2 x float> @v2f32(<4 x float> %a) nounwind {
+; CHECK-LABEL: v2f32:
+; CHECK: ext.16b v0, v0, v0, #8
+; CHECK: ret
+ %ret = shufflevector <4 x float> %a, <4 x float> %a, <2 x i32>
+ ret <2 x float> %ret
+}
+
+define <1 x double> @v1f64(<2 x double> %a) nounwind {
+; CHECK-LABEL: v1f64:
+; CHECK: ext.16b v0, v0, v0, #8
+; CHECK: ret
+ %ret = shufflevector <2 x double> %a, <2 x double> %a, <1 x i32>
+ ret <1 x double> %ret
+}
diff --git a/test/CodeGen/AArch64/arm64-fast-isel-addr-offset.ll b/test/CodeGen/AArch64/arm64-fast-isel-addr-offset.ll
new file mode 100644
index 00000000000..ebd847e0f72
--- /dev/null
+++ b/test/CodeGen/AArch64/arm64-fast-isel-addr-offset.ll
@@ -0,0 +1,47 @@
+; RUN: llc < %s -O0 -fast-isel-abort -mtriple=arm64-apple-darwin | FileCheck %s
+
+@sortlist = common global [5001 x i32] zeroinitializer, align 16
+@sortlist2 = common global [5001 x i64] zeroinitializer, align 16
+
+; Load an address with an offset larger than LDR imm can handle
+define i32 @foo() nounwind {
+entry:
+; CHECK: @foo
+; CHECK: adrp x[[REG:[0-9]+]], _sortlist@GOTPAGE
+; CHECK: ldr x[[REG1:[0-9]+]], [x[[REG]], _sortlist@GOTPAGEOFF]
+; CHECK: movz x[[REG2:[0-9]+]], #0x4e20
+; CHECK: add x[[REG3:[0-9]+]], x[[REG1]], x[[REG2]]
+; CHECK: ldr w0, [x[[REG3]]]
+; CHECK: ret
+ %0 = load i32* getelementptr inbounds ([5001 x i32]* @sortlist, i32 0, i64 5000), align 4
+ ret i32 %0
+}
+
+define i64 @foo2() nounwind {
+entry:
+; CHECK: @foo2
+; CHECK: adrp x[[REG:[0-9]+]], _sortlist2@GOTPAGE
+; CHECK: ldr x[[REG1:[0-9]+]], [x[[REG]], _sortlist2@GOTPAGEOFF]
+; CHECK: movz x[[REG2:[0-9]+]], #0x9c40
+; CHECK: add x[[REG3:[0-9]+]], x[[REG1]], x[[REG2]]
+; CHECK: ldr x0, [x[[REG3]]]
+; CHECK: ret
+ %0 = load i64* getelementptr inbounds ([5001 x i64]* @sortlist2, i32 0, i64 5000), align 4
+ ret i64 %0
+}
+
+; Load an address with a ridiculously large offset.
+; rdar://12505553
+@pd2 = common global i8* null, align 8
+
+define signext i8 @foo3() nounwind ssp {
+entry:
+; CHECK: @foo3
+; CHECK: movz x[[REG:[0-9]+]], #0xb3a, lsl #32
+; CHECK: movk x[[REG]], #0x73ce, lsl #16
+; CHECK: movk x[[REG]], #0x2ff2
+ %0 = load i8** @pd2, align 8
+ %arrayidx = getelementptr inbounds i8* %0, i64 12345678901234
+ %1 = load i8* %arrayidx, align 1
+ ret i8 %1
+}
diff --git a/test/CodeGen/AArch64/arm64-fast-isel-alloca.ll b/test/CodeGen/AArch64/arm64-fast-isel-alloca.ll
new file mode 100644
index 00000000000..1706e9eba2b
--- /dev/null
+++ b/test/CodeGen/AArch64/arm64-fast-isel-alloca.ll
@@ -0,0 +1,25 @@
+; This test should cause the TargetMaterializeAlloca to be invoked
+; RUN: llc < %s -O0 -fast-isel-abort -mtriple=arm64-apple-darwin | FileCheck %s
+
+%struct.S1Ty = type { i64 }
+%struct.S2Ty = type { %struct.S1Ty, %struct.S1Ty }
+
+define void @takeS1(%struct.S1Ty* %V) nounwind {
+entry:
+ %V.addr = alloca %struct.S1Ty*, align 8
+ store %struct.S1Ty* %V, %struct.S1Ty** %V.addr, align 8
+ ret void
+}
+
+define void @main() nounwind {
+entry:
+; CHECK: main
+; CHECK: mov x29, sp
+; CHECK: mov x[[REG:[0-9]+]], sp
+; CHECK-NEXT: orr x[[REG1:[0-9]+]], xzr, #0x8
+; CHECK-NEXT: add x0, x[[REG]], x[[REG1]]
+ %E = alloca %struct.S2Ty, align 4
+ %B = getelementptr inbounds %struct.S2Ty* %E, i32 0, i32 1
+ call void @takeS1(%struct.S1Ty* %B)
+ ret void
+}
diff --git a/test/CodeGen/AArch64/arm64-fast-isel-br.ll b/test/CodeGen/AArch64/arm64-fast-isel-br.ll
new file mode 100644
index 00000000000..37a8295c893
--- /dev/null
+++ b/test/CodeGen/AArch64/arm64-fast-isel-br.ll
@@ -0,0 +1,155 @@
+; RUN: llc < %s -O0 -fast-isel-abort -mtriple=arm64-apple-darwin -mcpu=cyclone | FileCheck %s
+
+define void @branch1() nounwind uwtable ssp {
+ %x = alloca i32, align 4
+ store i32 0, i32* %x, align 4
+ %1 = load i32* %x, align 4
+ %2 = icmp ne i32 %1, 0
+ br i1 %2, label %3, label %4
+
+;