//
//===----------------------------------------------------------------------===//
-#define DEBUG_TYPE "arm64-lower"
-
#include "ARM64ISelLowering.h"
#include "ARM64PerfectShuffle.h"
#include "ARM64Subtarget.h"
#include "llvm/Target/TargetOptions.h"
using namespace llvm;
+#define DEBUG_TYPE "arm64-lower"
+
STATISTIC(NumTailCalls, "Number of tail calls");
STATISTIC(NumShiftInserts, "Number of vector shift inserts");
cl::desc("Generate ARM64 tail calls (TEMPORARY OPTION)."),
cl::init(true));
-static cl::opt<bool>
-StrictAlign("arm64-strict-align", cl::Hidden,
- cl::desc("Disallow all unaligned memory accesses"));
+enum AlignMode {
+ StrictAlign,
+ NoStrictAlign
+};
+
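+// This replaces the old boolean -arm64-strict-align flag so that each
+// behaviour can be requested explicitly on the command line.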
+static cl::opt<AlignMode>
+Align(cl::desc("Load/store alignment support"),
+ cl::Hidden, cl::init(NoStrictAlign),
+ cl::values(
+ clEnumValN(StrictAlign, "arm64-strict-align",
+ "Disallow all unaligned memory accesses"),
+ clEnumValN(NoStrictAlign, "arm64-no-strict-align",
+ "Allow unaligned memory accesses"),
+ clEnumValEnd));
// Place holder until extr generation is tested fully.
static cl::opt<bool>
// Set up the register classes.
addRegisterClass(MVT::i32, &ARM64::GPR32allRegClass);
addRegisterClass(MVT::i64, &ARM64::GPR64allRegClass);
- addRegisterClass(MVT::f32, &ARM64::FPR32RegClass);
- addRegisterClass(MVT::f64, &ARM64::FPR64RegClass);
- addRegisterClass(MVT::f128, &ARM64::FPR128RegClass);
- addRegisterClass(MVT::v16i8, &ARM64::FPR8RegClass);
- addRegisterClass(MVT::v8i16, &ARM64::FPR16RegClass);
-
- // Someone set us up the NEON.
- addDRTypeForNEON(MVT::v2f32);
- addDRTypeForNEON(MVT::v8i8);
- addDRTypeForNEON(MVT::v4i16);
- addDRTypeForNEON(MVT::v2i32);
- addDRTypeForNEON(MVT::v1i64);
- addDRTypeForNEON(MVT::v1f64);
-
- addQRTypeForNEON(MVT::v4f32);
- addQRTypeForNEON(MVT::v2f64);
- addQRTypeForNEON(MVT::v16i8);
- addQRTypeForNEON(MVT::v8i16);
- addQRTypeForNEON(MVT::v4i32);
- addQRTypeForNEON(MVT::v2i64);
+
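+  // Scalar FP types are only legal when the subtarget actually provides the
+  // FP/SIMD register file.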
+ if (Subtarget->hasFPARMv8()) {
+ addRegisterClass(MVT::f16, &ARM64::FPR16RegClass);
+ addRegisterClass(MVT::f32, &ARM64::FPR32RegClass);
+ addRegisterClass(MVT::f64, &ARM64::FPR64RegClass);
+ addRegisterClass(MVT::f128, &ARM64::FPR128RegClass);
+ }
+
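+  // Likewise, vector types are only legal with NEON.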
+ if (Subtarget->hasNEON()) {
+ addRegisterClass(MVT::v16i8, &ARM64::FPR8RegClass);
+ addRegisterClass(MVT::v8i16, &ARM64::FPR16RegClass);
+ // Someone set us up the NEON.
+ addDRTypeForNEON(MVT::v2f32);
+ addDRTypeForNEON(MVT::v8i8);
+ addDRTypeForNEON(MVT::v4i16);
+ addDRTypeForNEON(MVT::v2i32);
+ addDRTypeForNEON(MVT::v1i64);
+ addDRTypeForNEON(MVT::v1f64);
+
+ addQRTypeForNEON(MVT::v4f32);
+ addQRTypeForNEON(MVT::v2f64);
+ addQRTypeForNEON(MVT::v16i8);
+ addQRTypeForNEON(MVT::v8i16);
+ addQRTypeForNEON(MVT::v4i32);
+ addQRTypeForNEON(MVT::v2i64);
+ }
// Compute derived properties from the register classes
computeRegisterProperties();
setOperationAction(ISD::FREM, MVT::f64, Expand);
setOperationAction(ISD::FREM, MVT::f80, Expand);
- // FIXME: v1f64 shouldn't be legal if we can avoid it, because it leads to
- // silliness like this:
- setOperationAction(ISD::FABS, MVT::v1f64, Expand);
- setOperationAction(ISD::FADD, MVT::v1f64, Expand);
- setOperationAction(ISD::FCEIL, MVT::v1f64, Expand);
- setOperationAction(ISD::FCOPYSIGN, MVT::v1f64, Expand);
- setOperationAction(ISD::FCOS, MVT::v1f64, Expand);
- setOperationAction(ISD::FDIV, MVT::v1f64, Expand);
- setOperationAction(ISD::FFLOOR, MVT::v1f64, Expand);
- setOperationAction(ISD::FMA, MVT::v1f64, Expand);
- setOperationAction(ISD::FMUL, MVT::v1f64, Expand);
- setOperationAction(ISD::FNEARBYINT, MVT::v1f64, Expand);
- setOperationAction(ISD::FNEG, MVT::v1f64, Expand);
- setOperationAction(ISD::FPOW, MVT::v1f64, Expand);
- setOperationAction(ISD::FREM, MVT::v1f64, Expand);
- setOperationAction(ISD::FROUND, MVT::v1f64, Expand);
- setOperationAction(ISD::FRINT, MVT::v1f64, Expand);
- setOperationAction(ISD::FSIN, MVT::v1f64, Expand);
- setOperationAction(ISD::FSINCOS, MVT::v1f64, Expand);
- setOperationAction(ISD::FSQRT, MVT::v1f64, Expand);
- setOperationAction(ISD::FSUB, MVT::v1f64, Expand);
- setOperationAction(ISD::FTRUNC, MVT::v1f64, Expand);
- setOperationAction(ISD::SETCC, MVT::v1f64, Expand);
- setOperationAction(ISD::BR_CC, MVT::v1f64, Expand);
- setOperationAction(ISD::SELECT, MVT::v1f64, Expand);
- setOperationAction(ISD::SELECT_CC, MVT::v1f64, Expand);
- setOperationAction(ISD::FP_EXTEND, MVT::v1f64, Expand);
-
- setOperationAction(ISD::FP_TO_SINT, MVT::v1i64, Expand);
- setOperationAction(ISD::FP_TO_UINT, MVT::v1i64, Expand);
- setOperationAction(ISD::SINT_TO_FP, MVT::v1i64, Expand);
- setOperationAction(ISD::UINT_TO_FP, MVT::v1i64, Expand);
- setOperationAction(ISD::FP_ROUND, MVT::v1f64, Expand);
-
// Custom lowering hooks are needed for XOR
// to fold it into CSINC/CSINV.
setOperationAction(ISD::XOR, MVT::i32, Custom);
setOperationAction(ISD::FP_ROUND, MVT::f32, Custom);
setOperationAction(ISD::FP_ROUND, MVT::f64, Custom);
- // 128-bit atomics
- setOperationAction(ISD::ATOMIC_SWAP, MVT::i128, Custom);
- setOperationAction(ISD::ATOMIC_LOAD_OR, MVT::i128, Custom);
- setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i128, Custom);
- setOperationAction(ISD::ATOMIC_LOAD_ADD, MVT::i128, Custom);
- setOperationAction(ISD::ATOMIC_LOAD_SUB, MVT::i128, Custom);
- setOperationAction(ISD::ATOMIC_LOAD_AND, MVT::i128, Custom);
- setOperationAction(ISD::ATOMIC_LOAD_XOR, MVT::i128, Custom);
- setOperationAction(ISD::ATOMIC_LOAD_MIN, MVT::i128, Custom);
- setOperationAction(ISD::ATOMIC_LOAD_MAX, MVT::i128, Custom);
- setOperationAction(ISD::ATOMIC_LOAD_NAND, MVT::i128, Custom);
- setOperationAction(ISD::ATOMIC_LOAD_UMIN, MVT::i128, Custom);
- setOperationAction(ISD::ATOMIC_LOAD_UMAX, MVT::i128, Custom);
- // These are surprisingly difficult. The only single-copy atomic 128-bit
- // instruction on AArch64 is stxp (when it succeeds). So a store can safely
- // become a simple swap, but a load can only be determined to have been atomic
- // if storing the same value back succeeds.
- setOperationAction(ISD::ATOMIC_LOAD, MVT::i128, Custom);
- setOperationAction(ISD::ATOMIC_STORE, MVT::i128, Expand);
-
// Variable arguments.
setOperationAction(ISD::VASTART, MVT::Other, Custom);
setOperationAction(ISD::VAARG, MVT::Other, Custom);
// BlockAddress
setOperationAction(ISD::BlockAddress, MVT::i64, Custom);
- // Add/Sub overflow ops with MVT::Glues are lowered to CPSR dependences.
+ // Add/Sub overflow ops with MVT::Glues are lowered to NZCV dependences.
setOperationAction(ISD::ADDC, MVT::i32, Custom);
setOperationAction(ISD::ADDE, MVT::i32, Custom);
setOperationAction(ISD::SUBC, MVT::i32, Custom);
setOperationAction(ISD::ROTL, MVT::i32, Expand);
setOperationAction(ISD::ROTL, MVT::i64, Expand);
- // ARM64 doesn't have a direct vector ->f32 conversion instructions for
- // elements smaller than i32, so promote the input to i32 first.
- setOperationAction(ISD::UINT_TO_FP, MVT::v4i8, Promote);
- setOperationAction(ISD::SINT_TO_FP, MVT::v4i8, Promote);
- setOperationAction(ISD::UINT_TO_FP, MVT::v4i16, Promote);
- setOperationAction(ISD::SINT_TO_FP, MVT::v4i16, Promote);
- // Similarly, there is no direct i32 -> f64 vector conversion instruction.
- setOperationAction(ISD::SINT_TO_FP, MVT::v2i32, Custom);
- setOperationAction(ISD::UINT_TO_FP, MVT::v2i32, Custom);
- setOperationAction(ISD::SINT_TO_FP, MVT::v2i64, Custom);
- setOperationAction(ISD::UINT_TO_FP, MVT::v2i64, Custom);
-
// ARM64 doesn't have {U|S}MUL_LOHI.
setOperationAction(ISD::UMUL_LOHI, MVT::i64, Expand);
setOperationAction(ISD::SMUL_LOHI, MVT::i64, Expand);
- // ARM64 doesn't have MUL.2d:
- setOperationAction(ISD::MUL, MVT::v2i64, Expand);
// Expand the undefined-at-zero variants to cttz/ctlz to their defined-at-zero
// counterparts, which ARM64 supports directly.
setOperationAction(ISD::FCOPYSIGN, MVT::f32, Custom);
// ARM64 has implementations of a lot of rounding-like FP operations.
- static MVT RoundingTypes[] = { MVT::f32, MVT::f64, MVT::v2f32,
- MVT::v4f32, MVT::v2f64 };
+  static MVT RoundingTypes[] = { MVT::f32, MVT::f64 };
for (unsigned I = 0; I < array_lengthof(RoundingTypes); ++I) {
MVT Ty = RoundingTypes[I];
setOperationAction(ISD::FFLOOR, Ty, Legal);
setLoadExtAction(ISD::EXTLOAD, MVT::f64, Expand);
setLoadExtAction(ISD::EXTLOAD, MVT::f80, Expand);
setLoadExtAction(ISD::SEXTLOAD, MVT::i1, Expand);
+ setTruncStoreAction(MVT::f32, MVT::f16, Expand);
setTruncStoreAction(MVT::f64, MVT::f32, Expand);
+ setTruncStoreAction(MVT::f64, MVT::f16, Expand);
setTruncStoreAction(MVT::f128, MVT::f80, Expand);
setTruncStoreAction(MVT::f128, MVT::f64, Expand);
setTruncStoreAction(MVT::f128, MVT::f32, Expand);
- setTruncStoreAction(MVT::v2i32, MVT::v2i16, Expand);
+ setTruncStoreAction(MVT::f128, MVT::f16, Expand);
// Indexed loads and stores are supported.
for (unsigned im = (unsigned)ISD::PRE_INC;
im != (unsigned)ISD::LAST_INDEXED_MODE; ++im) {
setIndexedStoreAction(im, MVT::f32, Legal);
}
- // Likewise, narrowing and extending vector loads/stores aren't handled
- // directly.
- for (unsigned VT = (unsigned)MVT::FIRST_VECTOR_VALUETYPE;
- VT <= (unsigned)MVT::LAST_VECTOR_VALUETYPE; ++VT) {
-
- setOperationAction(ISD::SIGN_EXTEND_INREG, (MVT::SimpleValueType)VT,
- Expand);
-
- for (unsigned InnerVT = (unsigned)MVT::FIRST_VECTOR_VALUETYPE;
- InnerVT <= (unsigned)MVT::LAST_VECTOR_VALUETYPE; ++InnerVT)
- setTruncStoreAction((MVT::SimpleValueType)VT,
- (MVT::SimpleValueType)InnerVT, Expand);
- setLoadExtAction(ISD::SEXTLOAD, (MVT::SimpleValueType)VT, Expand);
- setLoadExtAction(ISD::ZEXTLOAD, (MVT::SimpleValueType)VT, Expand);
- setLoadExtAction(ISD::EXTLOAD, (MVT::SimpleValueType)VT, Expand);
- }
-
// Trap.
setOperationAction(ISD::TRAP, MVT::Other, Legal);
- setOperationAction(ISD::ANY_EXTEND, MVT::v4i32, Legal);
// We combine OR nodes for bitfield operations.
setTargetDAGCombine(ISD::OR);
setTargetDAGCombine(ISD::MUL);
+ setTargetDAGCombine(ISD::SELECT);
setTargetDAGCombine(ISD::VSELECT);
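+  // Combining on the intrinsic nodes is what forms the post-incremented NEON
+  // structure load/store nodes (LD2post, ST2post, etc.) introduced below.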
+ setTargetDAGCombine(ISD::INTRINSIC_VOID);
+ setTargetDAGCombine(ISD::INTRINSIC_W_CHAIN);
+
MaxStoresPerMemset = MaxStoresPerMemsetOptSize = 8;
MaxStoresPerMemcpy = MaxStoresPerMemcpyOptSize = 4;
MaxStoresPerMemmove = MaxStoresPerMemmoveOptSize = 4;
setMinFunctionAlignment(2);
- RequireStrictAlign = StrictAlign;
+ RequireStrictAlign = (Align == StrictAlign);
+
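+  // ARM64 has dedicated bitfield-extract instructions (UBFM/SBFM), so tell
+  // the combiner that extract-bits patterns are cheap to form.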
+ setHasExtractBitsInsn(true);
+
+ if (Subtarget->hasNEON()) {
+ // FIXME: v1f64 shouldn't be legal if we can avoid it, because it leads to
+ // silliness like this:
+ setOperationAction(ISD::FABS, MVT::v1f64, Expand);
+ setOperationAction(ISD::FADD, MVT::v1f64, Expand);
+ setOperationAction(ISD::FCEIL, MVT::v1f64, Expand);
+ setOperationAction(ISD::FCOPYSIGN, MVT::v1f64, Expand);
+ setOperationAction(ISD::FCOS, MVT::v1f64, Expand);
+ setOperationAction(ISD::FDIV, MVT::v1f64, Expand);
+ setOperationAction(ISD::FFLOOR, MVT::v1f64, Expand);
+ setOperationAction(ISD::FMA, MVT::v1f64, Expand);
+ setOperationAction(ISD::FMUL, MVT::v1f64, Expand);
+ setOperationAction(ISD::FNEARBYINT, MVT::v1f64, Expand);
+ setOperationAction(ISD::FNEG, MVT::v1f64, Expand);
+ setOperationAction(ISD::FPOW, MVT::v1f64, Expand);
+ setOperationAction(ISD::FREM, MVT::v1f64, Expand);
+ setOperationAction(ISD::FROUND, MVT::v1f64, Expand);
+ setOperationAction(ISD::FRINT, MVT::v1f64, Expand);
+ setOperationAction(ISD::FSIN, MVT::v1f64, Expand);
+ setOperationAction(ISD::FSINCOS, MVT::v1f64, Expand);
+ setOperationAction(ISD::FSQRT, MVT::v1f64, Expand);
+ setOperationAction(ISD::FSUB, MVT::v1f64, Expand);
+ setOperationAction(ISD::FTRUNC, MVT::v1f64, Expand);
+ setOperationAction(ISD::SETCC, MVT::v1f64, Expand);
+ setOperationAction(ISD::BR_CC, MVT::v1f64, Expand);
+ setOperationAction(ISD::SELECT, MVT::v1f64, Expand);
+ setOperationAction(ISD::SELECT_CC, MVT::v1f64, Expand);
+ setOperationAction(ISD::FP_EXTEND, MVT::v1f64, Expand);
+
+ setOperationAction(ISD::FP_TO_SINT, MVT::v1i64, Expand);
+ setOperationAction(ISD::FP_TO_UINT, MVT::v1i64, Expand);
+ setOperationAction(ISD::SINT_TO_FP, MVT::v1i64, Expand);
+ setOperationAction(ISD::UINT_TO_FP, MVT::v1i64, Expand);
+ setOperationAction(ISD::FP_ROUND, MVT::v1f64, Expand);
+
+ setOperationAction(ISD::MUL, MVT::v1i64, Expand);
+
+    // ARM64 doesn't have direct vector ->f32 conversion instructions for
+ // elements smaller than i32, so promote the input to i32 first.
+ setOperationAction(ISD::UINT_TO_FP, MVT::v4i8, Promote);
+ setOperationAction(ISD::SINT_TO_FP, MVT::v4i8, Promote);
+ setOperationAction(ISD::UINT_TO_FP, MVT::v4i16, Promote);
+ setOperationAction(ISD::SINT_TO_FP, MVT::v4i16, Promote);
+ // Similarly, there is no direct i32 -> f64 vector conversion instruction.
+ setOperationAction(ISD::SINT_TO_FP, MVT::v2i32, Custom);
+ setOperationAction(ISD::UINT_TO_FP, MVT::v2i32, Custom);
+ setOperationAction(ISD::SINT_TO_FP, MVT::v2i64, Custom);
+ setOperationAction(ISD::UINT_TO_FP, MVT::v2i64, Custom);
+
+ // ARM64 doesn't have MUL.2d:
+ setOperationAction(ISD::MUL, MVT::v2i64, Expand);
+ setOperationAction(ISD::ANY_EXTEND, MVT::v4i32, Legal);
+ setTruncStoreAction(MVT::v2i32, MVT::v2i16, Expand);
+ // Likewise, narrowing and extending vector loads/stores aren't handled
+ // directly.
+ for (unsigned VT = (unsigned)MVT::FIRST_VECTOR_VALUETYPE;
+ VT <= (unsigned)MVT::LAST_VECTOR_VALUETYPE; ++VT) {
+
+ setOperationAction(ISD::SIGN_EXTEND_INREG, (MVT::SimpleValueType)VT,
+ Expand);
+
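+      // There are no vector equivalents of the high-half multiply nodes, so
+      // expand them for every vector type.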
+ setOperationAction(ISD::MULHS, (MVT::SimpleValueType)VT, Expand);
+ setOperationAction(ISD::SMUL_LOHI, (MVT::SimpleValueType)VT, Expand);
+ setOperationAction(ISD::MULHU, (MVT::SimpleValueType)VT, Expand);
+ setOperationAction(ISD::UMUL_LOHI, (MVT::SimpleValueType)VT, Expand);
+
+ for (unsigned InnerVT = (unsigned)MVT::FIRST_VECTOR_VALUETYPE;
+ InnerVT <= (unsigned)MVT::LAST_VECTOR_VALUETYPE; ++InnerVT)
+ setTruncStoreAction((MVT::SimpleValueType)VT,
+ (MVT::SimpleValueType)InnerVT, Expand);
+ setLoadExtAction(ISD::SEXTLOAD, (MVT::SimpleValueType)VT, Expand);
+ setLoadExtAction(ISD::ZEXTLOAD, (MVT::SimpleValueType)VT, Expand);
+ setLoadExtAction(ISD::EXTLOAD, (MVT::SimpleValueType)VT, Expand);
+ }
+
+ // ARM64 has implementations of a lot of rounding-like FP operations.
+    static MVT RoundingVecTypes[] = { MVT::v2f32, MVT::v4f32, MVT::v2f64 };
+ for (unsigned I = 0; I < array_lengthof(RoundingVecTypes); ++I) {
+ MVT Ty = RoundingVecTypes[I];
+ setOperationAction(ISD::FFLOOR, Ty, Legal);
+ setOperationAction(ISD::FNEARBYINT, Ty, Legal);
+ setOperationAction(ISD::FCEIL, Ty, Legal);
+ setOperationAction(ISD::FRINT, Ty, Legal);
+ setOperationAction(ISD::FTRUNC, Ty, Legal);
+ setOperationAction(ISD::FROUND, Ty, Legal);
+ }
+ }
}
void ARM64TargetLowering::addTypeForNEON(EVT VT, EVT PromotedBitwiseVT) {
setOperationAction(ISD::VSELECT, VT.getSimpleVT(), Expand);
setLoadExtAction(ISD::EXTLOAD, VT.getSimpleVT(), Expand);
+ // CNT supports only B element sizes.
+ if (VT != MVT::v8i8 && VT != MVT::v16i8)
+ setOperationAction(ISD::CTPOP, VT.getSimpleVT(), Expand);
+
setOperationAction(ISD::UDIV, VT.getSimpleVT(), Expand);
setOperationAction(ISD::SDIV, VT.getSimpleVT(), Expand);
setOperationAction(ISD::UREM, VT.getSimpleVT(), Expand);
setOperationAction(ISD::FP_TO_SINT, VT.getSimpleVT(), Custom);
setOperationAction(ISD::FP_TO_UINT, VT.getSimpleVT(), Custom);
+
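+  // Indexed (pre/post-increment) vector loads and stores are only advertised
+  // on little-endian targets; big-endian lane ordering isn't handled here.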
+ if (Subtarget->isLittleEndian()) {
+ for (unsigned im = (unsigned)ISD::PRE_INC;
+ im != (unsigned)ISD::LAST_INDEXED_MODE; ++im) {
+ setIndexedLoadAction(im, VT.getSimpleVT(), Legal);
+ setIndexedStoreAction(im, VT.getSimpleVT(), Legal);
+ }
+ }
}
void ARM64TargetLowering::addDRTypeForNEON(MVT VT) {
KnownOne &= KnownOne2;
break;
}
- case ISD::INTRINSIC_W_CHAIN:
+ case ISD::INTRINSIC_W_CHAIN: {
+ ConstantSDNode *CN = cast<ConstantSDNode>(Op->getOperand(1));
+ Intrinsic::ID IntID = static_cast<Intrinsic::ID>(CN->getZExtValue());
+ switch (IntID) {
+ default: return;
+ case Intrinsic::arm64_ldaxr:
+ case Intrinsic::arm64_ldxr: {
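+    // These loads zero-extend from the memory width, so every bit above it
+    // is known to be zero.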
+ unsigned BitWidth = KnownOne.getBitWidth();
+ EVT VT = cast<MemIntrinsicSDNode>(Op)->getMemoryVT();
+ unsigned MemBits = VT.getScalarType().getSizeInBits();
+ KnownZero |= APInt::getHighBitsSet(BitWidth, BitWidth - MemBits);
+ return;
+ }
+ }
break;
+ }
case ISD::INTRINSIC_WO_CHAIN:
case ISD::INTRINSIC_VOID: {
unsigned IntNo = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
const char *ARM64TargetLowering::getTargetNodeName(unsigned Opcode) const {
switch (Opcode) {
default:
- return 0;
+ return nullptr;
case ARM64ISD::CALL: return "ARM64ISD::CALL";
case ARM64ISD::ADRP: return "ARM64ISD::ADRP";
case ARM64ISD::ADDlow: return "ARM64ISD::ADDlow";
case ARM64ISD::MVNImsl: return "ARM64ISD::MVNImsl";
case ARM64ISD::BICi: return "ARM64ISD::BICi";
case ARM64ISD::ORRi: return "ARM64ISD::ORRi";
+ case ARM64ISD::BSL: return "ARM64ISD::BSL";
case ARM64ISD::NEG: return "ARM64ISD::NEG";
case ARM64ISD::EXTR: return "ARM64ISD::EXTR";
case ARM64ISD::ZIP1: return "ARM64ISD::ZIP1";
case ARM64ISD::URSHR_I: return "ARM64ISD::URSHR_I";
case ARM64ISD::SQSHLU_I: return "ARM64ISD::SQSHLU_I";
case ARM64ISD::WrapperLarge: return "ARM64ISD::WrapperLarge";
+ case ARM64ISD::LD2post: return "ARM64ISD::LD2post";
+ case ARM64ISD::LD3post: return "ARM64ISD::LD3post";
+ case ARM64ISD::LD4post: return "ARM64ISD::LD4post";
+ case ARM64ISD::ST2post: return "ARM64ISD::ST2post";
+ case ARM64ISD::ST3post: return "ARM64ISD::ST3post";
+ case ARM64ISD::ST4post: return "ARM64ISD::ST4post";
+ case ARM64ISD::LD1x2post: return "ARM64ISD::LD1x2post";
+ case ARM64ISD::LD1x3post: return "ARM64ISD::LD1x3post";
+ case ARM64ISD::LD1x4post: return "ARM64ISD::LD1x4post";
+ case ARM64ISD::ST1x2post: return "ARM64ISD::ST1x2post";
+ case ARM64ISD::ST1x3post: return "ARM64ISD::ST1x3post";
+ case ARM64ISD::ST1x4post: return "ARM64ISD::ST1x4post";
+ case ARM64ISD::LD2DUPpost: return "ARM64ISD::LD2DUPpost";
+ case ARM64ISD::LD3DUPpost: return "ARM64ISD::LD3DUPpost";
+ case ARM64ISD::LD4DUPpost: return "ARM64ISD::LD4DUPpost";
+ case ARM64ISD::LD2LANEpost: return "ARM64ISD::LD2LANEpost";
+ case ARM64ISD::LD3LANEpost: return "ARM64ISD::LD3LANEpost";
+ case ARM64ISD::LD4LANEpost: return "ARM64ISD::LD4LANEpost";
+ case ARM64ISD::ST2LANEpost: return "ARM64ISD::ST2LANEpost";
+ case ARM64ISD::ST3LANEpost: return "ARM64ISD::ST3LANEpost";
+ case ARM64ISD::ST4LANEpost: return "ARM64ISD::ST4LANEpost";
}
}
-static void getExclusiveOperation(unsigned Size, AtomicOrdering Ord,
- unsigned &LdrOpc, unsigned &StrOpc) {
- static unsigned LoadBares[] = { ARM64::LDXRB, ARM64::LDXRH, ARM64::LDXRW,
- ARM64::LDXRX, ARM64::LDXPX };
- static unsigned LoadAcqs[] = { ARM64::LDAXRB, ARM64::LDAXRH, ARM64::LDAXRW,
- ARM64::LDAXRX, ARM64::LDAXPX };
- static unsigned StoreBares[] = { ARM64::STXRB, ARM64::STXRH, ARM64::STXRW,
- ARM64::STXRX, ARM64::STXPX };
- static unsigned StoreRels[] = { ARM64::STLXRB, ARM64::STLXRH, ARM64::STLXRW,
- ARM64::STLXRX, ARM64::STLXPX };
-
- unsigned *LoadOps, *StoreOps;
- if (Ord == Acquire || Ord == AcquireRelease || Ord == SequentiallyConsistent)
- LoadOps = LoadAcqs;
- else
- LoadOps = LoadBares;
-
- if (Ord == Release || Ord == AcquireRelease || Ord == SequentiallyConsistent)
- StoreOps = StoreRels;
- else
- StoreOps = StoreBares;
-
- assert(isPowerOf2_32(Size) && Size <= 16 &&
- "unsupported size for atomic binary op!");
-
- LdrOpc = LoadOps[Log2_32(Size)];
- StrOpc = StoreOps[Log2_32(Size)];
-}
-
-MachineBasicBlock *ARM64TargetLowering::EmitAtomicCmpSwap(MachineInstr *MI,
- MachineBasicBlock *BB,
- unsigned Size) const {
- unsigned dest = MI->getOperand(0).getReg();
- unsigned ptr = MI->getOperand(1).getReg();
- unsigned oldval = MI->getOperand(2).getReg();
- unsigned newval = MI->getOperand(3).getReg();
- AtomicOrdering Ord = static_cast<AtomicOrdering>(MI->getOperand(4).getImm());
- unsigned scratch = BB->getParent()->getRegInfo().createVirtualRegister(
- &ARM64::GPR32RegClass);
- const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
- DebugLoc dl = MI->getDebugLoc();
-
- // FIXME: We currently always generate a seq_cst operation; we should
- // be able to relax this in some cases.
- unsigned ldrOpc, strOpc;
- getExclusiveOperation(Size, Ord, ldrOpc, strOpc);
-
- MachineFunction *MF = BB->getParent();
- const BasicBlock *LLVM_BB = BB->getBasicBlock();
- MachineFunction::iterator It = BB;
- ++It; // insert the new blocks after the current block
-
- MachineBasicBlock *loop1MBB = MF->CreateMachineBasicBlock(LLVM_BB);
- MachineBasicBlock *loop2MBB = MF->CreateMachineBasicBlock(LLVM_BB);
- MachineBasicBlock *exitMBB = MF->CreateMachineBasicBlock(LLVM_BB);
- MF->insert(It, loop1MBB);
- MF->insert(It, loop2MBB);
- MF->insert(It, exitMBB);
-
- // Transfer the remainder of BB and its successor edges to exitMBB.
- exitMBB->splice(exitMBB->begin(), BB,
- std::next(MachineBasicBlock::iterator(MI)), BB->end());
- exitMBB->transferSuccessorsAndUpdatePHIs(BB);
-
- // thisMBB:
- // ...
- // fallthrough --> loop1MBB
- BB->addSuccessor(loop1MBB);
-
- // loop1MBB:
- // ldrex dest, [ptr]
- // cmp dest, oldval
- // bne exitMBB
- BB = loop1MBB;
- BuildMI(BB, dl, TII->get(ldrOpc), dest).addReg(ptr);
- BuildMI(BB, dl, TII->get(Size == 8 ? ARM64::SUBSXrr : ARM64::SUBSWrr))
- .addReg(Size == 8 ? ARM64::XZR : ARM64::WZR, RegState::Define)
- .addReg(dest)
- .addReg(oldval);
- BuildMI(BB, dl, TII->get(ARM64::Bcc)).addImm(ARM64CC::NE).addMBB(exitMBB);
- BB->addSuccessor(loop2MBB);
- BB->addSuccessor(exitMBB);
-
- // loop2MBB:
- // strex scratch, newval, [ptr]
- // cmp scratch, #0
- // bne loop1MBB
- BB = loop2MBB;
- BuildMI(BB, dl, TII->get(strOpc), scratch).addReg(newval).addReg(ptr);
- BuildMI(BB, dl, TII->get(ARM64::CBNZW)).addReg(scratch).addMBB(loop1MBB);
- BB->addSuccessor(loop1MBB);
- BB->addSuccessor(exitMBB);
-
- // exitMBB:
- // ...
- BB = exitMBB;
-
- MI->eraseFromParent(); // The instruction is gone now.
-
- return BB;
-}
-
-MachineBasicBlock *
-ARM64TargetLowering::EmitAtomicBinary(MachineInstr *MI, MachineBasicBlock *BB,
- unsigned Size, unsigned BinOpcode) const {
- // This also handles ATOMIC_SWAP, indicated by BinOpcode==0.
- const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
-
- const BasicBlock *LLVM_BB = BB->getBasicBlock();
- MachineFunction *MF = BB->getParent();
- MachineFunction::iterator It = BB;
- ++It;
-
- unsigned dest = MI->getOperand(0).getReg();
- unsigned ptr = MI->getOperand(1).getReg();
- unsigned incr = MI->getOperand(2).getReg();
- AtomicOrdering Ord = static_cast<AtomicOrdering>(MI->getOperand(3).getImm());
- DebugLoc dl = MI->getDebugLoc();
-
- unsigned ldrOpc, strOpc;
- getExclusiveOperation(Size, Ord, ldrOpc, strOpc);
-
- MachineBasicBlock *loopMBB = MF->CreateMachineBasicBlock(LLVM_BB);
- MachineBasicBlock *exitMBB = MF->CreateMachineBasicBlock(LLVM_BB);
- MF->insert(It, loopMBB);
- MF->insert(It, exitMBB);
-
- // Transfer the remainder of BB and its successor edges to exitMBB.
- exitMBB->splice(exitMBB->begin(), BB,
- std::next(MachineBasicBlock::iterator(MI)), BB->end());
- exitMBB->transferSuccessorsAndUpdatePHIs(BB);
-
- MachineRegisterInfo &RegInfo = MF->getRegInfo();
- unsigned scratch = RegInfo.createVirtualRegister(&ARM64::GPR32RegClass);
- unsigned scratch2 =
- (!BinOpcode)
- ? incr
- : RegInfo.createVirtualRegister(Size == 8 ? &ARM64::GPR64RegClass
- : &ARM64::GPR32RegClass);
-
- // thisMBB:
- // ...
- // fallthrough --> loopMBB
- BB->addSuccessor(loopMBB);
-
- // loopMBB:
- // ldxr dest, ptr
- // <binop> scratch2, dest, incr
- // stxr scratch, scratch2, ptr
- // cbnz scratch, loopMBB
- // fallthrough --> exitMBB
- BB = loopMBB;
- BuildMI(BB, dl, TII->get(ldrOpc), dest).addReg(ptr);
- if (BinOpcode) {
- // operand order needs to go the other way for NAND
- if (BinOpcode == ARM64::BICWrr || BinOpcode == ARM64::BICXrr)
- BuildMI(BB, dl, TII->get(BinOpcode), scratch2).addReg(incr).addReg(dest);
- else
- BuildMI(BB, dl, TII->get(BinOpcode), scratch2).addReg(dest).addReg(incr);
- }
-
- BuildMI(BB, dl, TII->get(strOpc), scratch).addReg(scratch2).addReg(ptr);
- BuildMI(BB, dl, TII->get(ARM64::CBNZW)).addReg(scratch).addMBB(loopMBB);
-
- BB->addSuccessor(loopMBB);
- BB->addSuccessor(exitMBB);
-
- // exitMBB:
- // ...
- BB = exitMBB;
-
- MI->eraseFromParent(); // The instruction is gone now.
-
- return BB;
-}
-
-MachineBasicBlock *ARM64TargetLowering::EmitAtomicBinary128(
- MachineInstr *MI, MachineBasicBlock *BB, unsigned BinOpcodeLo,
- unsigned BinOpcodeHi) const {
- // This also handles ATOMIC_SWAP, indicated by BinOpcode==0.
- const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
-
- const BasicBlock *LLVM_BB = BB->getBasicBlock();
- MachineFunction *MF = BB->getParent();
- MachineFunction::iterator It = BB;
- ++It;
-
- unsigned DestLo = MI->getOperand(0).getReg();
- unsigned DestHi = MI->getOperand(1).getReg();
- unsigned Ptr = MI->getOperand(2).getReg();
- unsigned IncrLo = MI->getOperand(3).getReg();
- unsigned IncrHi = MI->getOperand(4).getReg();
- AtomicOrdering Ord = static_cast<AtomicOrdering>(MI->getOperand(5).getImm());
- DebugLoc DL = MI->getDebugLoc();
-
- unsigned LdrOpc, StrOpc;
- getExclusiveOperation(16, Ord, LdrOpc, StrOpc);
-
- MachineBasicBlock *LoopMBB = MF->CreateMachineBasicBlock(LLVM_BB);
- MachineBasicBlock *ExitMBB = MF->CreateMachineBasicBlock(LLVM_BB);
- MF->insert(It, LoopMBB);
- MF->insert(It, ExitMBB);
-
- // Transfer the remainder of BB and its successor edges to exitMBB.
- ExitMBB->splice(ExitMBB->begin(), BB,
- std::next(MachineBasicBlock::iterator(MI)), BB->end());
- ExitMBB->transferSuccessorsAndUpdatePHIs(BB);
-
- MachineRegisterInfo &RegInfo = MF->getRegInfo();
- unsigned ScratchRes = RegInfo.createVirtualRegister(&ARM64::GPR32RegClass);
- unsigned ScratchLo = IncrLo, ScratchHi = IncrHi;
- if (BinOpcodeLo) {
- assert(BinOpcodeHi && "Expect neither or both opcodes to be defined");
- ScratchLo = RegInfo.createVirtualRegister(&ARM64::GPR64RegClass);
- ScratchHi = RegInfo.createVirtualRegister(&ARM64::GPR64RegClass);
- }
-
- // ThisMBB:
- // ...
- // fallthrough --> LoopMBB
- BB->addSuccessor(LoopMBB);
-
- // LoopMBB:
- // ldxp DestLo, DestHi, Ptr
- // <binoplo> ScratchLo, DestLo, IncrLo
- // <binophi> ScratchHi, DestHi, IncrHi
- // stxp ScratchRes, ScratchLo, ScratchHi, ptr
- // cbnz ScratchRes, LoopMBB
- // fallthrough --> ExitMBB
- BB = LoopMBB;
- BuildMI(BB, DL, TII->get(LdrOpc), DestLo)
- .addReg(DestHi, RegState::Define)
- .addReg(Ptr);
- if (BinOpcodeLo) {
- // operand order needs to go the other way for NAND
- if (BinOpcodeLo == ARM64::BICXrr) {
- std::swap(IncrLo, DestLo);
- std::swap(IncrHi, DestHi);
- }
-
- BuildMI(BB, DL, TII->get(BinOpcodeLo), ScratchLo).addReg(DestLo).addReg(
- IncrLo);
- BuildMI(BB, DL, TII->get(BinOpcodeHi), ScratchHi).addReg(DestHi).addReg(
- IncrHi);
- }
-
- BuildMI(BB, DL, TII->get(StrOpc), ScratchRes)
- .addReg(ScratchLo)
- .addReg(ScratchHi)
- .addReg(Ptr);
- BuildMI(BB, DL, TII->get(ARM64::CBNZW)).addReg(ScratchRes).addMBB(LoopMBB);
-
- BB->addSuccessor(LoopMBB);
- BB->addSuccessor(ExitMBB);
-
- // ExitMBB:
- // ...
- BB = ExitMBB;
-
- MI->eraseFromParent(); // The instruction is gone now.
-
- return BB;
-}
-
-MachineBasicBlock *
-ARM64TargetLowering::EmitAtomicCmpSwap128(MachineInstr *MI,
- MachineBasicBlock *BB) const {
- unsigned DestLo = MI->getOperand(0).getReg();
- unsigned DestHi = MI->getOperand(1).getReg();
- unsigned Ptr = MI->getOperand(2).getReg();
- unsigned OldValLo = MI->getOperand(3).getReg();
- unsigned OldValHi = MI->getOperand(4).getReg();
- unsigned NewValLo = MI->getOperand(5).getReg();
- unsigned NewValHi = MI->getOperand(6).getReg();
- AtomicOrdering Ord = static_cast<AtomicOrdering>(MI->getOperand(7).getImm());
- unsigned ScratchRes = BB->getParent()->getRegInfo().createVirtualRegister(
- &ARM64::GPR32RegClass);
- const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
- DebugLoc DL = MI->getDebugLoc();
-
- unsigned LdrOpc, StrOpc;
- getExclusiveOperation(16, Ord, LdrOpc, StrOpc);
-
- MachineFunction *MF = BB->getParent();
- const BasicBlock *LLVM_BB = BB->getBasicBlock();
- MachineFunction::iterator It = BB;
- ++It; // insert the new blocks after the current block
-
- MachineBasicBlock *Loop1MBB = MF->CreateMachineBasicBlock(LLVM_BB);
- MachineBasicBlock *Loop2MBB = MF->CreateMachineBasicBlock(LLVM_BB);
- MachineBasicBlock *ExitMBB = MF->CreateMachineBasicBlock(LLVM_BB);
- MF->insert(It, Loop1MBB);
- MF->insert(It, Loop2MBB);
- MF->insert(It, ExitMBB);
-
- // Transfer the remainder of BB and its successor edges to exitMBB.
- ExitMBB->splice(ExitMBB->begin(), BB,
- std::next(MachineBasicBlock::iterator(MI)), BB->end());
- ExitMBB->transferSuccessorsAndUpdatePHIs(BB);
-
- // ThisMBB:
- // ...
- // fallthrough --> Loop1MBB
- BB->addSuccessor(Loop1MBB);
-
- // Loop1MBB:
- // ldxp DestLo, DestHi, [Ptr]
- // cmp DestLo, OldValLo
- // sbc xzr, DestHi, OldValHi
- // bne ExitMBB
- BB = Loop1MBB;
- BuildMI(BB, DL, TII->get(LdrOpc), DestLo)
- .addReg(DestHi, RegState::Define)
- .addReg(Ptr);
- BuildMI(BB, DL, TII->get(ARM64::SUBSXrr), ARM64::XZR).addReg(DestLo).addReg(
- OldValLo);
- BuildMI(BB, DL, TII->get(ARM64::SBCXr), ARM64::XZR).addReg(DestHi).addReg(
- OldValHi);
-
- BuildMI(BB, DL, TII->get(ARM64::Bcc)).addImm(ARM64CC::NE).addMBB(ExitMBB);
- BB->addSuccessor(Loop2MBB);
- BB->addSuccessor(ExitMBB);
-
- // Loop2MBB:
- // stxp ScratchRes, NewValLo, NewValHi, [Ptr]
- // cbnz ScratchRes, Loop1MBB
- BB = Loop2MBB;
- BuildMI(BB, DL, TII->get(StrOpc), ScratchRes)
- .addReg(NewValLo)
- .addReg(NewValHi)
- .addReg(Ptr);
- BuildMI(BB, DL, TII->get(ARM64::CBNZW)).addReg(ScratchRes).addMBB(Loop1MBB);
- BB->addSuccessor(Loop1MBB);
- BB->addSuccessor(ExitMBB);
-
- // ExitMBB:
- // ...
- BB = ExitMBB;
-
- MI->eraseFromParent(); // The instruction is gone now.
-
- return BB;
-}
-
-MachineBasicBlock *ARM64TargetLowering::EmitAtomicMinMax128(
- MachineInstr *MI, MachineBasicBlock *BB, unsigned CondCode) const {
- // This also handles ATOMIC_SWAP, indicated by BinOpcode==0.
- const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
-
- const BasicBlock *LLVM_BB = BB->getBasicBlock();
- MachineFunction *MF = BB->getParent();
- MachineFunction::iterator It = BB;
- ++It;
-
- unsigned DestLo = MI->getOperand(0).getReg();
- unsigned DestHi = MI->getOperand(1).getReg();
- unsigned Ptr = MI->getOperand(2).getReg();
- unsigned IncrLo = MI->getOperand(3).getReg();
- unsigned IncrHi = MI->getOperand(4).getReg();
- AtomicOrdering Ord = static_cast<AtomicOrdering>(MI->getOperand(5).getImm());
- DebugLoc DL = MI->getDebugLoc();
-
- unsigned LdrOpc, StrOpc;
- getExclusiveOperation(16, Ord, LdrOpc, StrOpc);
-
- MachineBasicBlock *LoopMBB = MF->CreateMachineBasicBlock(LLVM_BB);
- MachineBasicBlock *ExitMBB = MF->CreateMachineBasicBlock(LLVM_BB);
- MF->insert(It, LoopMBB);
- MF->insert(It, ExitMBB);
-
- // Transfer the remainder of BB and its successor edges to exitMBB.
- ExitMBB->splice(ExitMBB->begin(), BB,
- std::next(MachineBasicBlock::iterator(MI)), BB->end());
- ExitMBB->transferSuccessorsAndUpdatePHIs(BB);
-
- MachineRegisterInfo &RegInfo = MF->getRegInfo();
- unsigned ScratchRes = RegInfo.createVirtualRegister(&ARM64::GPR32RegClass);
- unsigned ScratchLo = RegInfo.createVirtualRegister(&ARM64::GPR64RegClass);
- unsigned ScratchHi = RegInfo.createVirtualRegister(&ARM64::GPR64RegClass);
-
- // ThisMBB:
- // ...
- // fallthrough --> LoopMBB
- BB->addSuccessor(LoopMBB);
-
- // LoopMBB:
- // ldxp DestLo, DestHi, Ptr
- // cmp ScratchLo, DestLo, IncrLo
- // sbc xzr, ScratchHi, DestHi, IncrHi
- // csel ScratchLo, DestLo, IncrLo, <cmp-op>
- // csel ScratchHi, DestHi, IncrHi, <cmp-op>
- // stxp ScratchRes, ScratchLo, ScratchHi, ptr
- // cbnz ScratchRes, LoopMBB
- // fallthrough --> ExitMBB
- BB = LoopMBB;
- BuildMI(BB, DL, TII->get(LdrOpc), DestLo)
- .addReg(DestHi, RegState::Define)
- .addReg(Ptr);
-
- BuildMI(BB, DL, TII->get(ARM64::SUBSXrr), ARM64::XZR).addReg(DestLo).addReg(
- IncrLo);
- BuildMI(BB, DL, TII->get(ARM64::SBCXr), ARM64::XZR).addReg(DestHi).addReg(
- IncrHi);
-
- BuildMI(BB, DL, TII->get(ARM64::CSELXr), ScratchLo)
- .addReg(DestLo)
- .addReg(IncrLo)
- .addImm(CondCode);
- BuildMI(BB, DL, TII->get(ARM64::CSELXr), ScratchHi)
- .addReg(DestHi)
- .addReg(IncrHi)
- .addImm(CondCode);
-
- BuildMI(BB, DL, TII->get(StrOpc), ScratchRes)
- .addReg(ScratchLo)
- .addReg(ScratchHi)
- .addReg(Ptr);
- BuildMI(BB, DL, TII->get(ARM64::CBNZW)).addReg(ScratchRes).addMBB(LoopMBB);
-
- BB->addSuccessor(LoopMBB);
- BB->addSuccessor(ExitMBB);
-
- // ExitMBB:
- // ...
- BB = ExitMBB;
-
- MI->eraseFromParent(); // The instruction is gone now.
-
- return BB;
-}
-
MachineBasicBlock *
ARM64TargetLowering::EmitF128CSEL(MachineInstr *MI,
MachineBasicBlock *MBB) const {
unsigned IfTrueReg = MI->getOperand(1).getReg();
unsigned IfFalseReg = MI->getOperand(2).getReg();
unsigned CondCode = MI->getOperand(3).getImm();
- bool CPSRKilled = MI->getOperand(4).isKill();
+ bool NZCVKilled = MI->getOperand(4).isKill();
MachineBasicBlock *TrueBB = MF->CreateMachineBasicBlock(LLVM_BB);
MachineBasicBlock *EndBB = MF->CreateMachineBasicBlock(LLVM_BB);
// TrueBB falls through to the end.
TrueBB->addSuccessor(EndBB);
- if (!CPSRKilled) {
- TrueBB->addLiveIn(ARM64::CPSR);
- EndBB->addLiveIn(ARM64::CPSR);
+ if (!NZCVKilled) {
+ TrueBB->addLiveIn(ARM64::NZCV);
+ EndBB->addLiveIn(ARM64::NZCV);
}
BuildMI(*EndBB, EndBB->begin(), DL, TII->get(ARM64::PHI), DestReg)
assert(0 && "Unexpected instruction for custom inserter!");
break;
- case ARM64::ATOMIC_LOAD_ADD_I8:
- return EmitAtomicBinary(MI, BB, 1, ARM64::ADDWrr);
- case ARM64::ATOMIC_LOAD_ADD_I16:
- return EmitAtomicBinary(MI, BB, 2, ARM64::ADDWrr);
- case ARM64::ATOMIC_LOAD_ADD_I32:
- return EmitAtomicBinary(MI, BB, 4, ARM64::ADDWrr);
- case ARM64::ATOMIC_LOAD_ADD_I64:
- return EmitAtomicBinary(MI, BB, 8, ARM64::ADDXrr);
- case ARM64::ATOMIC_LOAD_ADD_I128:
- return EmitAtomicBinary128(MI, BB, ARM64::ADDSXrr, ARM64::ADCXr);
-
- case ARM64::ATOMIC_LOAD_AND_I8:
- return EmitAtomicBinary(MI, BB, 1, ARM64::ANDWrr);
- case ARM64::ATOMIC_LOAD_AND_I16:
- return EmitAtomicBinary(MI, BB, 2, ARM64::ANDWrr);
- case ARM64::ATOMIC_LOAD_AND_I32:
- return EmitAtomicBinary(MI, BB, 4, ARM64::ANDWrr);
- case ARM64::ATOMIC_LOAD_AND_I64:
- return EmitAtomicBinary(MI, BB, 8, ARM64::ANDXrr);
- case ARM64::ATOMIC_LOAD_AND_I128:
- return EmitAtomicBinary128(MI, BB, ARM64::ANDXrr, ARM64::ANDXrr);
-
- case ARM64::ATOMIC_LOAD_OR_I8:
- return EmitAtomicBinary(MI, BB, 1, ARM64::ORRWrr);
- case ARM64::ATOMIC_LOAD_OR_I16:
- return EmitAtomicBinary(MI, BB, 2, ARM64::ORRWrr);
- case ARM64::ATOMIC_LOAD_OR_I32:
- return EmitAtomicBinary(MI, BB, 4, ARM64::ORRWrr);
- case ARM64::ATOMIC_LOAD_OR_I64:
- return EmitAtomicBinary(MI, BB, 8, ARM64::ORRXrr);
- case ARM64::ATOMIC_LOAD_OR_I128:
- return EmitAtomicBinary128(MI, BB, ARM64::ORRXrr, ARM64::ORRXrr);
-
- case ARM64::ATOMIC_LOAD_XOR_I8:
- return EmitAtomicBinary(MI, BB, 1, ARM64::EORWrr);
- case ARM64::ATOMIC_LOAD_XOR_I16:
- return EmitAtomicBinary(MI, BB, 2, ARM64::EORWrr);
- case ARM64::ATOMIC_LOAD_XOR_I32:
- return EmitAtomicBinary(MI, BB, 4, ARM64::EORWrr);
- case ARM64::ATOMIC_LOAD_XOR_I64:
- return EmitAtomicBinary(MI, BB, 8, ARM64::EORXrr);
- case ARM64::ATOMIC_LOAD_XOR_I128:
- return EmitAtomicBinary128(MI, BB, ARM64::EORXrr, ARM64::EORXrr);
-
- case ARM64::ATOMIC_LOAD_NAND_I8:
- return EmitAtomicBinary(MI, BB, 1, ARM64::BICWrr);
- case ARM64::ATOMIC_LOAD_NAND_I16:
- return EmitAtomicBinary(MI, BB, 2, ARM64::BICWrr);
- case ARM64::ATOMIC_LOAD_NAND_I32:
- return EmitAtomicBinary(MI, BB, 4, ARM64::BICWrr);
- case ARM64::ATOMIC_LOAD_NAND_I64:
- return EmitAtomicBinary(MI, BB, 8, ARM64::BICXrr);
- case ARM64::ATOMIC_LOAD_NAND_I128:
- return EmitAtomicBinary128(MI, BB, ARM64::BICXrr, ARM64::BICXrr);
-
- case ARM64::ATOMIC_LOAD_SUB_I8:
- return EmitAtomicBinary(MI, BB, 1, ARM64::SUBWrr);
- case ARM64::ATOMIC_LOAD_SUB_I16:
- return EmitAtomicBinary(MI, BB, 2, ARM64::SUBWrr);
- case ARM64::ATOMIC_LOAD_SUB_I32:
- return EmitAtomicBinary(MI, BB, 4, ARM64::SUBWrr);
- case ARM64::ATOMIC_LOAD_SUB_I64:
- return EmitAtomicBinary(MI, BB, 8, ARM64::SUBXrr);
- case ARM64::ATOMIC_LOAD_SUB_I128:
- return EmitAtomicBinary128(MI, BB, ARM64::SUBSXrr, ARM64::SBCXr);
-
- case ARM64::ATOMIC_LOAD_MIN_I128:
- return EmitAtomicMinMax128(MI, BB, ARM64CC::LT);
-
- case ARM64::ATOMIC_LOAD_MAX_I128:
- return EmitAtomicMinMax128(MI, BB, ARM64CC::GT);
-
- case ARM64::ATOMIC_LOAD_UMIN_I128:
- return EmitAtomicMinMax128(MI, BB, ARM64CC::CC);
-
- case ARM64::ATOMIC_LOAD_UMAX_I128:
- return EmitAtomicMinMax128(MI, BB, ARM64CC::HI);
-
- case ARM64::ATOMIC_SWAP_I8:
- return EmitAtomicBinary(MI, BB, 1, 0);
- case ARM64::ATOMIC_SWAP_I16:
- return EmitAtomicBinary(MI, BB, 2, 0);
- case ARM64::ATOMIC_SWAP_I32:
- return EmitAtomicBinary(MI, BB, 4, 0);
- case ARM64::ATOMIC_SWAP_I64:
- return EmitAtomicBinary(MI, BB, 8, 0);
- case ARM64::ATOMIC_SWAP_I128:
- return EmitAtomicBinary128(MI, BB, 0, 0);
-
- case ARM64::ATOMIC_CMP_SWAP_I8:
- return EmitAtomicCmpSwap(MI, BB, 1);
- case ARM64::ATOMIC_CMP_SWAP_I16:
- return EmitAtomicCmpSwap(MI, BB, 2);
- case ARM64::ATOMIC_CMP_SWAP_I32:
- return EmitAtomicCmpSwap(MI, BB, 4);
- case ARM64::ATOMIC_CMP_SWAP_I64:
- return EmitAtomicCmpSwap(MI, BB, 8);
- case ARM64::ATOMIC_CMP_SWAP_I128:
- return EmitAtomicCmpSwap128(MI, BB);
-
case ARM64::F128CSEL:
return EmitF128CSEL(MI, BB);
case ISD::SETUGT:
return ARM64CC::HI;
case ISD::SETUGE:
- return ARM64CC::CS;
+ return ARM64CC::HS;
case ISD::SETULT:
- return ARM64CC::CC;
+ return ARM64CC::LO;
case ISD::SETULE:
return ARM64CC::LS;
}
}
}
+/// changeVectorFPCCToARM64CC - Convert a DAG fp condition code to an ARM64 CC
+/// usable with the vector instructions. Fewer operations are available without
+/// a real NZCV register, so we have to use less efficient combinations to get
+/// the same effect.
+static void changeVectorFPCCToARM64CC(ISD::CondCode CC,
+ ARM64CC::CondCode &CondCode,
+ ARM64CC::CondCode &CondCode2,
+ bool &Invert) {
+ Invert = false;
+ switch (CC) {
+ default:
+ // Mostly the scalar mappings work fine.
+ changeFPCCToARM64CC(CC, CondCode, CondCode2);
+ break;
+ case ISD::SETUO:
+ Invert = true; // Fallthrough
+ case ISD::SETO:
+ CondCode = ARM64CC::MI;
+ CondCode2 = ARM64CC::GE;
+ break;
+ case ISD::SETUEQ:
+ case ISD::SETULT:
+ case ISD::SETULE:
+ case ISD::SETUGT:
+ case ISD::SETUGE:
+ // All of the compare-mask comparisons are ordered, but we can switch
+ // between the two by a double inversion. E.g. ULE == !OGT.
+ Invert = true;
+ changeFPCCToARM64CC(getSetCCInverse(CC, false), CondCode, CondCode2);
+ break;
+ }
+}
+
static bool isLegalArithImmed(uint64_t C) {
// Matches ARM64DAGToDAGISel::SelectArithImmed().
return (C >> 12 == 0) || ((C & 0xFFFULL) == 0 && C >> 24 == 0);
}
-static SDValue emitComparison(SDValue LHS, SDValue RHS, SDLoc dl,
- SelectionDAG &DAG) {
+static SDValue emitComparison(SDValue LHS, SDValue RHS, ISD::CondCode CC,
+ SDLoc dl, SelectionDAG &DAG) {
EVT VT = LHS.getValueType();
if (VT.isFloatingPoint())
// SUBS means that it's possible to get CSE with subtract operations.
// A later phase can perform the optimization of setting the destination
// register to WZR/XZR if it ends up being unused.
- return DAG.getNode(ARM64ISD::SUBS, dl, DAG.getVTList(VT, MVT::i32), LHS, RHS)
+ unsigned Opcode = ARM64ISD::SUBS;
+
+ if (RHS.getOpcode() == ISD::SUB && isa<ConstantSDNode>(RHS.getOperand(0)) &&
+ cast<ConstantSDNode>(RHS.getOperand(0))->getZExtValue() == 0 &&
+ (CC == ISD::SETEQ || CC == ISD::SETNE)) {
+    // We'd like to combine a (CMP op1, (sub 0, op2)) into a CMN instruction
+    // on
+ // the grounds that "op1 - (-op2) == op1 + op2". However, the C and V flags
+ // can be set differently by this operation. It comes down to whether
+ // "SInt(~op2)+1 == SInt(~op2+1)" (and the same for UInt). If they are then
+ // everything is fine. If not then the optimization is wrong. Thus general
+ // comparisons are only valid if op2 != 0.
+
+ // So, finally, the only LLVM-native comparisons that don't mention C and V
+ // are SETEQ and SETNE. They're the only ones we can safely use CMN for in
+ // the absence of information about op2.
+ Opcode = ARM64ISD::ADDS;
+ RHS = RHS.getOperand(1);
+ } else if (LHS.getOpcode() == ISD::AND && isa<ConstantSDNode>(RHS) &&
+ cast<ConstantSDNode>(RHS)->getZExtValue() == 0 &&
+ !isUnsignedIntSetCC(CC)) {
+ // Similarly, (CMP (and X, Y), 0) can be implemented with a TST
+ // (a.k.a. ANDS) except that the flags are only guaranteed to work for one
+ // of the signed comparisons.
+ Opcode = ARM64ISD::ANDS;
+ RHS = LHS.getOperand(1);
+ LHS = LHS.getOperand(0);
+ }
+
+ return DAG.getNode(Opcode, dl, DAG.getVTList(VT, MVT::i32), LHS, RHS)
.getValue(1);
}
}
}
- SDValue Cmp = emitComparison(LHS, RHS, dl, DAG);
+ SDValue Cmp = emitComparison(LHS, RHS, CC, dl, DAG);
ARM64CC::CondCode ARM64CC = changeIntCCToARM64CC(CC);
ARM64cc = DAG.getConstant(ARM64CC, MVT::i32);
return Cmp;
break;
case ISD::UADDO:
Opc = ARM64ISD::ADDS;
- CC = ARM64CC::CS;
+ CC = ARM64CC::HS;
break;
case ISD::SSUBO:
Opc = ARM64ISD::SUBS;
break;
case ISD::USUBO:
Opc = ARM64ISD::SUBS;
- CC = ARM64CC::CC;
+ CC = ARM64CC::LO;
break;
// Multiply needs a little bit extra work.
case ISD::SMULO:
if (VT.getSizeInBits() == InVT.getSizeInBits())
return Op;
- if (InVT == MVT::v2f64) {
+ if (InVT == MVT::v2f64 || InVT == MVT::v4f32) {
SDLoc dl(Op);
- SDValue Cv = DAG.getNode(Op.getOpcode(), dl, MVT::v2i64, Op.getOperand(0));
+ SDValue Cv =
+ DAG.getNode(Op.getOpcode(), dl, InVT.changeVectorElementTypeToInteger(),
+ Op.getOperand(0));
return DAG.getNode(ISD::TRUNCATE, dl, VT, Cv);
+ } else if (InVT == MVT::v2f32) {
+ SDLoc dl(Op);
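+    // Conversions from v2f32 to a wider integer type have no direct
+    // instruction, so extend to v2f64 first and convert from there.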
+ SDValue Ext = DAG.getNode(ISD::FP_EXTEND, dl, MVT::v2f64, Op.getOperand(0));
+ return DAG.getNode(Op.getOpcode(), dl, VT, Ext);
}
// Type changing conversions are illegal.
BuildVectorOps.push_back(Sclr);
}
- return DAG.getNode(ISD::BUILD_VECTOR, dl, VT, &BuildVectorOps[0],
- BuildVectorOps.size());
+ return DAG.getNode(ISD::BUILD_VECTOR, dl, VT, BuildVectorOps);
}
SDValue ARM64TargetLowering::LowerINT_TO_FP(SDValue Op,
assert(!Res && "Call operand has unhandled type");
(void)Res;
}
-
+ assert(ArgLocs.size() == Ins.size());
SmallVector<SDValue, 16> ArgValues;
for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
CCValAssign &VA = ArgLocs[i];
- // Arguments stored in registers.
- if (VA.isRegLoc()) {
+ if (Ins[i].Flags.isByVal()) {
+ // Byval is used for HFAs in the PCS, but the system should work in a
+ // non-compliant manner for larger structs.
+ EVT PtrTy = getPointerTy();
+ int Size = Ins[i].Flags.getByValSize();
+ unsigned NumRegs = (Size + 7) / 8;
+
+      // FIXME: This works on big-endian for composite byvals, which are the
+      // common case. It should also work for fundamental types too.
+ unsigned FrameIdx =
+ MFI->CreateFixedObject(8 * NumRegs, VA.getLocMemOffset(), false);
+ SDValue FrameIdxN = DAG.getFrameIndex(FrameIdx, PtrTy);
+ InVals.push_back(FrameIdxN);
+
+ continue;
+    }
+
+    if (VA.isRegLoc()) {
+ // Arguments stored in registers.
EVT RegVT = VA.getLocVT();
SDValue ArgValue;
RC = &ARM64::GPR64RegClass;
else if (RegVT == MVT::f32)
RC = &ARM64::FPR32RegClass;
- else if (RegVT == MVT::f64 || RegVT == MVT::v1i64 ||
- RegVT == MVT::v1f64 || RegVT == MVT::v2i32 ||
- RegVT == MVT::v4i16 || RegVT == MVT::v8i8)
+ else if (RegVT == MVT::f64 || RegVT.is64BitVector())
RC = &ARM64::FPR64RegClass;
- else if (RegVT == MVT::v2i64 || RegVT == MVT::v4i32 ||
- RegVT == MVT::v8i16 || RegVT == MVT::v16i8)
+ else if (RegVT == MVT::f128 || RegVT.is128BitVector())
RC = &ARM64::FPR128RegClass;
else
llvm_unreachable("RegVT not supported by FORMAL_ARGUMENTS Lowering");
assert(VA.isMemLoc() && "CCValAssign is neither reg nor mem");
unsigned ArgOffset = VA.getLocMemOffset();
unsigned ArgSize = VA.getLocVT().getSizeInBits() / 8;
- int FI = MFI->CreateFixedObject(ArgSize, ArgOffset, true);
+
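+      // On big-endian targets a small argument lives at the far (high) end
+      // of its 8-byte stack slot, so bump the offset to point at the value.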
+ uint32_t BEAlign = 0;
+ if (ArgSize < 8 && !Subtarget->isLittleEndian())
+ BEAlign = 8 - ArgSize;
+
+ int FI = MFI->CreateFixedObject(ArgSize, ArgOffset + BEAlign, true);
// Create load nodes to retrieve arguments from the stack.
SDValue FIN = DAG.getFrameIndex(FI, getPointerTy());
- InVals.push_back(DAG.getLoad(VA.getValVT(), DL, Chain, FIN,
- MachinePointerInfo::getFixedStack(FI), false,
- false, false, 0));
+ SDValue ArgValue;
+
+ // If the loc type and val type are not the same, create an anyext load.
+ if (VA.getLocVT().getSizeInBits() != VA.getValVT().getSizeInBits()) {
+ // We should only get here if this is a pure integer.
+ assert(!VA.getValVT().isVector() && VA.getValVT().isInteger() &&
+ "Only integer extension supported!");
+ ArgValue = DAG.getExtLoad(ISD::EXTLOAD, DL, VA.getValVT(), Chain, FIN,
+ MachinePointerInfo::getFixedStack(FI),
+ VA.getLocVT(),
+ false, false, false, 0);
+ } else {
+ ArgValue = DAG.getLoad(VA.getValVT(), DL, Chain, FIN,
+ MachinePointerInfo::getFixedStack(FI), false,
+ false, false, 0);
+ }
+
+ InVals.push_back(ArgValue);
}
}
unsigned FirstVariadicGPR =
CCInfo.getFirstUnallocated(GPRArgRegs, NumGPRArgRegs);
- static const MCPhysReg FPRArgRegs[] = { ARM64::Q0, ARM64::Q1, ARM64::Q2,
- ARM64::Q3, ARM64::Q4, ARM64::Q5,
- ARM64::Q6, ARM64::Q7 };
- static const unsigned NumFPRArgRegs = array_lengthof(FPRArgRegs);
- unsigned FirstVariadicFPR =
- CCInfo.getFirstUnallocated(FPRArgRegs, NumFPRArgRegs);
-
unsigned GPRSaveSize = 8 * (NumGPRArgRegs - FirstVariadicGPR);
int GPRIdx = 0;
if (GPRSaveSize != 0) {
DAG.getConstant(8, getPointerTy()));
}
}
+ FuncInfo->setVarArgsGPRIndex(GPRIdx);
+ FuncInfo->setVarArgsGPRSize(GPRSaveSize);
- unsigned FPRSaveSize = 16 * (NumFPRArgRegs - FirstVariadicFPR);
- int FPRIdx = 0;
- if (FPRSaveSize != 0) {
- FPRIdx = MFI->CreateStackObject(FPRSaveSize, 16, false);
-
- SDValue FIN = DAG.getFrameIndex(FPRIdx, getPointerTy());
-
- for (unsigned i = FirstVariadicFPR; i < NumFPRArgRegs; ++i) {
- unsigned VReg = MF.addLiveIn(FPRArgRegs[i], &ARM64::FPR128RegClass);
- SDValue Val = DAG.getCopyFromReg(Chain, DL, VReg, MVT::v2i64);
- SDValue Store =
- DAG.getStore(Val.getValue(1), DL, Val, FIN,
- MachinePointerInfo::getStack(i * 16), false, false, 0);
- MemOps.push_back(Store);
- FIN = DAG.getNode(ISD::ADD, DL, getPointerTy(), FIN,
- DAG.getConstant(16, getPointerTy()));
+ if (Subtarget->hasFPARMv8()) {
+ static const MCPhysReg FPRArgRegs[] = { ARM64::Q0, ARM64::Q1, ARM64::Q2,
+ ARM64::Q3, ARM64::Q4, ARM64::Q5,
+ ARM64::Q6, ARM64::Q7 };
+ static const unsigned NumFPRArgRegs = array_lengthof(FPRArgRegs);
+ unsigned FirstVariadicFPR =
+ CCInfo.getFirstUnallocated(FPRArgRegs, NumFPRArgRegs);
+
+ unsigned FPRSaveSize = 16 * (NumFPRArgRegs - FirstVariadicFPR);
+ int FPRIdx = 0;
+ if (FPRSaveSize != 0) {
+ FPRIdx = MFI->CreateStackObject(FPRSaveSize, 16, false);
+
+ SDValue FIN = DAG.getFrameIndex(FPRIdx, getPointerTy());
+
+ for (unsigned i = FirstVariadicFPR; i < NumFPRArgRegs; ++i) {
+ unsigned VReg = MF.addLiveIn(FPRArgRegs[i], &ARM64::FPR128RegClass);
+ SDValue Val = DAG.getCopyFromReg(Chain, DL, VReg, MVT::f128);
+
+ SDValue Store =
+ DAG.getStore(Val.getValue(1), DL, Val, FIN,
+ MachinePointerInfo::getStack(i * 16), false, false, 0);
+ MemOps.push_back(Store);
+ FIN = DAG.getNode(ISD::ADD, DL, getPointerTy(), FIN,
+ DAG.getConstant(16, getPointerTy()));
+ }
}
+ FuncInfo->setVarArgsFPRIndex(FPRIdx);
+ FuncInfo->setVarArgsFPRSize(FPRSaveSize);
}
- FuncInfo->setVarArgsGPRIndex(GPRIdx);
- FuncInfo->setVarArgsGPRSize(GPRSaveSize);
- FuncInfo->setVarArgsFPRIndex(FPRIdx);
- FuncInfo->setVarArgsFPRSize(FPRSaveSize);
-
if (!MemOps.empty()) {
- Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, &MemOps[0],
- MemOps.size());
+ Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOps);
}
}
IsTailCall = isEligibleForTailCallOptimization(
Callee, CallConv, IsVarArg, IsStructRet,
MF.getFunction()->hasStructRetAttr(), Outs, OutVals, Ins, DAG);
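+  // musttail call sites must be lowered as tail calls; silently emitting a
+  // normal call would miscompile them, so fail loudly instead.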
+ if (!IsTailCall && CLI.CS && CLI.CS->isMustTailCall())
+ report_fatal_error("failed to perform tail call elimination on a call "
+ "site marked musttail");
// We don't support GuaranteedTailCallOpt, only automatically
// detected sibcalls.
// FIXME: Re-evaluate. Is this true? Should it be true?
// There's no reason we can't support stack args w/ tailcall, but
// we currently don't, so assert if we see one.
assert(!IsTailCall && "stack argument with tail call!?");
+
+    // FIXME: This works on big-endian for composite byvals, which are the
+    // common case. It should also work for fundamental types too.
+ uint32_t BEAlign = 0;
+ if (!Subtarget->isLittleEndian() && !Flags.isByVal()) {
+ unsigned OpSize = (VA.getLocVT().getSizeInBits() + 7) / 8;
+ if (OpSize < 8)
+ BEAlign = 8 - OpSize;
+ }
+
unsigned LocMemOffset = VA.getLocMemOffset();
- SDValue PtrOff = DAG.getIntPtrConstant(LocMemOffset);
+ SDValue PtrOff = DAG.getIntPtrConstant(LocMemOffset + BEAlign);
PtrOff = DAG.getNode(ISD::ADD, DL, getPointerTy(), StackPtr, PtrOff);
- // Since we pass i1/i8/i16 as i1/i8/i16 on stack and Arg is already
- // promoted to a legal register type i32, we should truncate Arg back to
- // i1/i8/i16.
- if (Arg.getValueType().isSimple() &&
- Arg.getValueType().getSimpleVT() == MVT::i32 &&
- (VA.getLocVT() == MVT::i1 || VA.getLocVT() == MVT::i8 ||
- VA.getLocVT() == MVT::i16))
- Arg = DAG.getNode(ISD::TRUNCATE, DL, VA.getLocVT(), Arg);
-
- SDValue Store = DAG.getStore(Chain, DL, Arg, PtrOff,
- MachinePointerInfo::getStack(LocMemOffset),
- false, false, 0);
- MemOpChains.push_back(Store);
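+      // Byval arguments are copied onto their stack slot with a memcpy
+      // rather than an ordinary store.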
+ if (Outs[i].Flags.isByVal()) {
+ SDValue SizeNode =
+ DAG.getConstant(Outs[i].Flags.getByValSize(), MVT::i64);
+ SDValue Cpy = DAG.getMemcpy(
+ Chain, DL, PtrOff, Arg, SizeNode, Outs[i].Flags.getByValAlign(),
+ /*isVolatile = */ false,
+ /*alwaysInline = */ false,
+ MachinePointerInfo::getStack(LocMemOffset), MachinePointerInfo());
+
+ MemOpChains.push_back(Cpy);
+ } else {
+ // Since we pass i1/i8/i16 as i1/i8/i16 on stack and Arg is already
+ // promoted to a legal register type i32, we should truncate Arg back to
+ // i1/i8/i16.
+ if (Arg.getValueType().isSimple() &&
+ Arg.getValueType().getSimpleVT() == MVT::i32 &&
+ (VA.getLocVT() == MVT::i1 || VA.getLocVT() == MVT::i8 ||
+ VA.getLocVT() == MVT::i16))
+ Arg = DAG.getNode(ISD::TRUNCATE, DL, VA.getLocVT(), Arg);
+
+ SDValue Store = DAG.getStore(Chain, DL, Arg, PtrOff,
+ MachinePointerInfo::getStack(LocMemOffset),
+ false, false, 0);
+ MemOpChains.push_back(Store);
+ }
}
}
if (!MemOpChains.empty())
- Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, &MemOpChains[0],
- MemOpChains.size());
+ Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOpChains);
// Build a sequence of copy-to-reg nodes chained together with token chain
// and flag operands which copy the outgoing args into the appropriate regs.
// If we're doing a tall call, use a TC_RETURN here rather than an
// actual call instruction.
if (IsTailCall)
- return DAG.getNode(ARM64ISD::TC_RETURN, DL, NodeTys, &Ops[0], Ops.size());
+ return DAG.getNode(ARM64ISD::TC_RETURN, DL, NodeTys, Ops);
// Returns a chain and a flag for retval copy to use.
- Chain = DAG.getNode(ARM64ISD::CALL, DL, NodeTys, &Ops[0], Ops.size());
+ Chain = DAG.getNode(ARM64ISD::CALL, DL, NodeTys, Ops);
InFlag = Chain.getValue(1);
Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(NumBytes, true),
if (Flag.getNode())
RetOps.push_back(Flag);
- return DAG.getNode(ARM64ISD::RET_FLAG, DL, MVT::Other, &RetOps[0],
- RetOps.size());
+ return DAG.getNode(ARM64ISD::RET_FLAG, DL, MVT::Other, RetOps);
}
//===----------------------------------------------------------------------===//
MFI->setAdjustsStack(true);
// TLS calls preserve all registers except those that absolutely must be
- // trashed: X0 (it takes an argument), LR (it's a call) and CPSR (let's not be
+ // trashed: X0 (it takes an argument), LR (it's a call) and NZCV (let's not be
// silly).
const TargetRegisterInfo *TRI = getTargetMachine().getRegisterInfo();
const ARM64RegisterInfo *ARI = static_cast<const ARM64RegisterInfo *>(TRI);
/// have a descriptor, accessible via a PC-relative ADRP, and whose first entry
/// is a function pointer to carry out the resolution. This function takes the
/// address of the descriptor in X0 and returns the TPIDR_EL0 offset in X0. All
-/// other registers (except LR, CPSR) are preserved.
+/// other registers (except LR, NZCV) are preserved.
///
/// Thus, the ideal call sequence on AArch64 is:
///
SDValue Func = DAG.getNode(ARM64ISD::LOADgot, DL, PtrVT, SymAddr);
// TLS calls preserve all registers except those that absolutely must be
- // trashed: X0 (it takes an argument), LR (it's a call) and CPSR (let's not be
+ // trashed: X0 (it takes an argument), LR (it's a call) and NZCV (let's not be
// silly).
const TargetRegisterInfo *TRI = getTargetMachine().getRegisterInfo();
const ARM64RegisterInfo *ARI = static_cast<const ARM64RegisterInfo *>(TRI);
Ops.push_back(Glue);
SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
- Chain = DAG.getNode(ARM64ISD::TLSDESC_CALL, DL, NodeTys, &Ops[0], Ops.size());
+ Chain = DAG.getNode(ARM64ISD::TLSDESC_CALL, DL, NodeTys, Ops);
Glue = Chain.getValue(1);
return DAG.getCopyFromReg(Chain, DL, ARM64::X0, PtrVT, Glue);
// If softenSetCCOperands returned a scalar, we need to compare the result
// against zero to select between true and false values.
- if (RHS.getNode() == 0) {
+ if (!RHS.getNode()) {
RHS = DAG.getConstant(0, LHS.getValueType());
CC = ISD::SETNE;
}
// Unfortunately, the mapping of LLVM FP CC's onto ARM64 CC's isn't totally
// clean. Some of them require two branches to implement.
- SDValue Cmp = emitComparison(LHS, RHS, dl, DAG);
+ SDValue Cmp = emitComparison(LHS, RHS, CC, dl, DAG);
ARM64CC::CondCode CC1, CC2;
changeFPCCToARM64CC(CC, CC1, CC2);
SDValue CC1Val = DAG.getConstant(CC1, MVT::i32);
for (unsigned i = 0; i < VecVT.getVectorNumElements(); ++i)
BuildVectorOps.push_back(EltMask);
- SDValue BuildVec = DAG.getNode(ISD::BUILD_VECTOR, DL, VecVT,
- &BuildVectorOps[0], BuildVectorOps.size());
+ SDValue BuildVec = DAG.getNode(ISD::BUILD_VECTOR, DL, VecVT, BuildVectorOps);
// If we couldn't materialize the mask above, then the mask vector will be
// the zero vector, and we need to negate it here.
softenSetCCOperands(DAG, MVT::f128, LHS, RHS, CC, dl);
// If softenSetCCOperands returned a scalar, use it.
- if (RHS.getNode() == 0) {
+ if (!RHS.getNode()) {
assert(LHS.getValueType() == Op.getValueType() &&
"Unexpected setcc expansion!");
return LHS;
// If that fails, we'll need to perform an FCMP + CSEL sequence. Go ahead
// and do the comparison.
- SDValue Cmp = emitComparison(LHS, RHS, dl, DAG);
+ SDValue Cmp = emitComparison(LHS, RHS, CC, dl, DAG);
ARM64CC::CondCode CC1, CC2;
changeFPCCToARM64CC(CC, CC1, CC2);
// If softenSetCCOperands returned a scalar, we need to compare the result
// against zero to select between true and false values.
- if (RHS.getNode() == 0) {
+ if (!RHS.getNode()) {
RHS = DAG.getConstant(0, LHS.getValueType());
CC = ISD::SETNE;
}
// If that fails, we'll need to perform an FCMP + CSEL sequence. Go ahead
// and do the comparison.
- SDValue Cmp = emitComparison(LHS, RHS, dl, DAG);
+ SDValue Cmp = emitComparison(LHS, RHS, CC, dl, DAG);
// Unfortunately, the mapping of LLVM FP CC's onto ARM64 CC's isn't totally
// clean. Some of them require two CSELs to implement.
EVT PtrVT = getPointerTy();
SDLoc DL(Op);
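+  // Under the large code model, ADRP-based addressing can't be assumed to
+  // reach the jump table on non-MachO targets, so build the full 64-bit
+  // address from MOVZ/MOVK pieces instead.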
+ if (getTargetMachine().getCodeModel() == CodeModel::Large &&
+ !Subtarget->isTargetMachO()) {
+ const unsigned char MO_NC = ARM64II::MO_NC;
+ return DAG.getNode(
+ ARM64ISD::WrapperLarge, DL, PtrVT,
+ DAG.getTargetJumpTable(JT->getIndex(), PtrVT, ARM64II::MO_G3),
+ DAG.getTargetJumpTable(JT->getIndex(), PtrVT, ARM64II::MO_G2 | MO_NC),
+ DAG.getTargetJumpTable(JT->getIndex(), PtrVT, ARM64II::MO_G1 | MO_NC),
+ DAG.getTargetJumpTable(JT->getIndex(), PtrVT, ARM64II::MO_G0 | MO_NC));
+ }
+
SDValue Hi = DAG.getTargetJumpTable(JT->getIndex(), PtrVT, ARM64II::MO_PAGE);
SDValue Lo = DAG.getTargetJumpTable(JT->getIndex(), PtrVT,
ARM64II::MO_PAGEOFF | ARM64II::MO_NC);
VROffsAddr, MachinePointerInfo(SV, 28), false,
false, 4));
- return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, &MemOps[0],
- MemOps.size());
+ return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOps);
}
SDValue ARM64TargetLowering::LowerVASTART(SDValue Op, SelectionDAG &DAG) const {
DAG.getIntPtrConstant(1));
SDValue Ops[] = { NarrowFP, WideFP.getValue(1) };
// Merge the rounded value with the chain output of the load.
- return DAG.getMergeValues(Ops, 2, DL);
+ return DAG.getMergeValues(Ops, DL);
}
return DAG.getLoad(VT, DL, APStore, VAList, MachinePointerInfo(), false,
return FrameAddr;
}
+// FIXME? Maybe this could be a TableGen attribute on some registers and
+// this table could be generated automatically from RegInfo.
+unsigned ARM64TargetLowering::getRegisterByName(const char* RegName) const {
+ unsigned Reg = StringSwitch<unsigned>(RegName)
+ .Case("sp", ARM64::SP)
+ .Default(0);
+ if (Reg)
+ return Reg;
+ report_fatal_error("Invalid register name global variable");
+}
+
SDValue ARM64TargetLowering::LowerRETURNADDR(SDValue Op,
SelectionDAG &DAG) const {
MachineFunction &MF = DAG.getMachineFunction();
DAG.getConstant(VTBits, MVT::i64));
SDValue Tmp2 = DAG.getNode(ISD::SHL, dl, VT, ShOpHi, RevShAmt);
- SDValue Cmp =
- emitComparison(ExtraShAmt, DAG.getConstant(0, MVT::i64), dl, DAG);
+ SDValue Cmp = emitComparison(ExtraShAmt, DAG.getConstant(0, MVT::i64),
+ ISD::SETGE, dl, DAG);
SDValue CCVal = DAG.getConstant(ARM64CC::GE, MVT::i32);
SDValue FalseValLo = DAG.getNode(ISD::OR, dl, VT, Tmp1, Tmp2);
DAG.getNode(ARM64ISD::CSEL, dl, VT, TrueValHi, FalseValHi, CCVal, Cmp);
SDValue Ops[2] = { Lo, Hi };
- return DAG.getMergeValues(Ops, 2, dl);
+ return DAG.getMergeValues(Ops, dl);
}
/// LowerShiftLeftParts - Lower SHL_PARTS, which returns two
SDValue FalseVal = DAG.getNode(ISD::OR, dl, VT, Tmp1, Tmp2);
- SDValue Cmp =
- emitComparison(ExtraShAmt, DAG.getConstant(0, MVT::i64), dl, DAG);
+ SDValue Cmp = emitComparison(ExtraShAmt, DAG.getConstant(0, MVT::i64),
+ ISD::SETGE, dl, DAG);
SDValue CCVal = DAG.getConstant(ARM64CC::GE, MVT::i32);
SDValue Hi = DAG.getNode(ARM64ISD::CSEL, dl, VT, Tmp3, FalseVal, CCVal, Cmp);
DAG.getNode(ARM64ISD::CSEL, dl, VT, TrueValLo, FalseValLo, CCVal, Cmp);
SDValue Ops[2] = { Lo, Hi };
- return DAG.getMergeValues(Ops, 2, dl);
+ return DAG.getMergeValues(Ops, dl);
}
bool
Value *CallOperandVal = info.CallOperandVal;
// If we don't have a value, we can't do a match,
// but allow it at the lowest weight.
- if (CallOperandVal == NULL)
+ if (!CallOperandVal)
return CW_Default;
Type *type = CallOperandVal->getType();
// Look at the constraint type.
}
}
if (StringRef("{cc}").equals_lower(Constraint))
- return std::make_pair(unsigned(ARM64::CPSR), &ARM64::CCRRegClass);
+ return std::make_pair(unsigned(ARM64::NZCV), &ARM64::CCRRegClass);
// Use the default implementation in TargetLowering to convert the register
// constraint into a member of a register class.
Res = TargetLowering::getRegForInlineAsmConstraint(Constraint, VT);
// Not found as a standard register?
- if (Res.second == 0) {
+ if (!Res.second) {
unsigned Size = Constraint.size();
if ((Size == 4 || Size == 5) && Constraint[0] == '{' &&
tolower(Constraint[1]) == 'v' && Constraint[Size - 1] == '}') {
void ARM64TargetLowering::LowerAsmOperandForConstraint(
SDValue Op, std::string &Constraint, std::vector<SDValue> &Ops,
SelectionDAG &DAG) const {
- SDValue Result(0, 0);
+ SDValue Result;
// Currently only support length 1 constraints.
if (Constraint.length() != 1)
VEXTOffsets[i] = 0;
continue;
} else if (SourceVecs[i].getValueType().getVectorNumElements() < NumElts) {
- // It probably isn't worth padding out a smaller vector just to
- // break it down again in a shuffle.
- return SDValue();
+      // We can pad out the smaller vector with UNDEF for free, so do that and
+      // carry on with the shuffle.
+ ShuffleSrcs[i] = DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, SourceVecs[i],
+ DAG.getUNDEF(SourceVecs[i].getValueType()));
+ continue;
}
// Don't attempt to extract subvectors from BUILD_VECTOR sources
// vector sources of the shuffle are different.
static bool isEXTMask(ArrayRef<int> M, EVT VT, bool &ReverseEXT,
unsigned &Imm) {
- unsigned NumElts = VT.getVectorNumElements();
- ReverseEXT = false;
+ // Look for the first non-undef element.
+ const int *FirstRealElt = std::find_if(M.begin(), M.end(),
+ [](int Elt) {return Elt >= 0;});
- // Assume that the first shuffle index is not UNDEF. Fail if it is.
- if (M[0] < 0)
+  // Benefit from APInt to handle overflow when calculating the expected
+  // element.
+ unsigned NumElts = VT.getVectorNumElements();
+ unsigned MaskBits = APInt(32, NumElts * 2).logBase2();
+ APInt ExpectedElt = APInt(MaskBits, *FirstRealElt + 1);
+ // The following shuffle indices must be the successive elements after the
+ // first real element.
+ const int *FirstWrongElt = std::find_if(FirstRealElt + 1, M.end(),
+ [&](int Elt) {return Elt != ExpectedElt++ && Elt != -1;});
+ if (FirstWrongElt != M.end())
return false;
- Imm = M[0];
-
- // If this is a VEXT shuffle, the immediate value is the index of the first
- // element. The other shuffle indices must be the successive elements after
- // the first one.
- unsigned ExpectedElt = Imm;
- for (unsigned i = 1; i < NumElts; ++i) {
- // Increment the expected index. If it wraps around, it may still be
- // a VEXT but the source vectors must be swapped.
- ExpectedElt += 1;
- if (ExpectedElt == NumElts * 2) {
- ExpectedElt = 0;
+ // The index of an EXT is the first element if it is not UNDEF.
+ // Watch out for the beginning UNDEFs. The EXT index should be the expected
+ // value of the first element.
+ // E.g. <-1, -1, 3, ...> is treated as <1, 2, 3, ...>.
+  // <-1, -1, 0, 1, ...> is treated as <IDX, IDX+1, 0, 1, ...>, where IDX is
+  // equal to ExpectedElt.
+ Imm = (M[0] >= 0) ? static_cast<unsigned>(M[0]) : ExpectedElt.getZExtValue();
+
+ // If no beginning UNDEFs, do swap when M[0] >= NumElts.
+ if (M[0] >= 0 && Imm >= NumElts) {
+ ReverseEXT = true;
+ Imm -= NumElts;
+ } else if (M[0] < 0) {
+    // Only swap the sources when the leading UNDEFs outnumber the value of
+    // the first real element, i.e. the expected index wrapped around.
+ if (*FirstRealElt < FirstRealElt - M.begin())
ReverseEXT = true;
- }
-
- if (M[i] < 0)
- continue; // ignore UNDEF indices
- if (ExpectedElt != static_cast<unsigned>(M[i]))
- return false;
+ if (Imm >= NumElts)
+ Imm -= NumElts;
}
- // Adjust the index value if the source operands will be swapped.
- if (ReverseEXT)
- Imm -= NumElts;
-
return true;
}
return true;
}
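+/// isINSMask - Check whether the shuffle mask represents a single-lane
+/// insertion: all lanes but one are identity copies from one input (the
+/// destination), while the remaining lane (the Anomaly) may come from
+/// anywhere. DstIsLeft reports whether the destination is the first operand.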
+static bool isINSMask(ArrayRef<int> M, int NumInputElements,
+ bool &DstIsLeft, int &Anomaly) {
+ if (M.size() != static_cast<size_t>(NumInputElements))
+ return false;
+
+ int NumLHSMatch = 0, NumRHSMatch = 0;
+ int LastLHSMismatch = -1, LastRHSMismatch = -1;
+
+ for (int i = 0; i < NumInputElements; ++i) {
+ if (M[i] == -1) {
+ ++NumLHSMatch;
+ ++NumRHSMatch;
+ continue;
+ }
+
+ if (M[i] == i)
+ ++NumLHSMatch;
+ else
+ LastLHSMismatch = i;
+
+ if (M[i] == i + NumInputElements)
+ ++NumRHSMatch;
+ else
+ LastRHSMismatch = i;
+ }
+
+ if (NumLHSMatch == NumInputElements - 1) {
+ DstIsLeft = true;
+ Anomaly = LastLHSMismatch;
+ return true;
+ } else if (NumRHSMatch == NumInputElements - 1) {
+ DstIsLeft = false;
+ Anomaly = LastRHSMismatch;
+ return true;
+ }
+
+ return false;
+}
+
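+/// isConcatMask - Check whether the mask is equivalent to concatenating the
+/// low halves of the sources: the first half must be the identity, and the
+/// second half must select the low lanes of the second source (or continue
+/// the identity when SplitLHS is false).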
+static bool isConcatMask(ArrayRef<int> Mask, EVT VT, bool SplitLHS) {
+ if (VT.getSizeInBits() != 128)
+ return false;
+
+ unsigned NumElts = VT.getVectorNumElements();
+
+ for (int I = 0, E = NumElts / 2; I != E; I++) {
+ if (Mask[I] != I)
+ return false;
+ }
+
+ int Offset = NumElts / 2;
+ for (int I = NumElts / 2, E = NumElts; I != E; I++) {
+ if (Mask[I] != I + SplitLHS * Offset)
+ return false;
+ }
+
+ return true;
+}
+
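+/// tryFormConcatFromShuffle - If the shuffle just concatenates the low
+/// halves of its inputs, emit a CONCAT_VECTORS instead, first extracting the
+/// low subvector of any 128-bit input.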
+static SDValue tryFormConcatFromShuffle(SDValue Op, SelectionDAG &DAG) {
+ SDLoc DL(Op);
+ EVT VT = Op.getValueType();
+ SDValue V0 = Op.getOperand(0);
+ SDValue V1 = Op.getOperand(1);
+ ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(Op)->getMask();
+
+ if (VT.getVectorElementType() != V0.getValueType().getVectorElementType() ||
+ VT.getVectorElementType() != V1.getValueType().getVectorElementType())
+ return SDValue();
+
+ bool SplitV0 = V0.getValueType().getSizeInBits() == 128;
+
+ if (!isConcatMask(Mask, VT, SplitV0))
+ return SDValue();
+
+ EVT CastVT = EVT::getVectorVT(*DAG.getContext(), VT.getVectorElementType(),
+ VT.getVectorNumElements() / 2);
+ if (SplitV0) {
+ V0 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, CastVT, V0,
+ DAG.getConstant(0, MVT::i64));
+ }
+ if (V1.getValueType().getSizeInBits() == 128) {
+ V1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, CastVT, V1,
+ DAG.getConstant(0, MVT::i64));
+ }
+ return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, V0, V1);
+}
+
/// GeneratePerfectShuffle - Given an entry in the perfect-shuffle table, emit
/// the specified operations to build the shuffle.
static SDValue GeneratePerfectShuffle(unsigned PFEntry, SDValue LHS,
Shuffle = DAG.getNode(
ISD::INTRINSIC_WO_CHAIN, DL, IndexVT,
DAG.getConstant(Intrinsic::arm64_neon_tbl1, MVT::i32), V1Cst,
- DAG.getNode(ISD::BUILD_VECTOR, DL, IndexVT, &TBLMask[0], IndexLen));
+ DAG.getNode(ISD::BUILD_VECTOR, DL, IndexVT,
+ makeArrayRef(TBLMask.data(), IndexLen)));
} else {
if (IndexLen == 8) {
V1Cst = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v16i8, V1Cst, V2Cst);
Shuffle = DAG.getNode(
ISD::INTRINSIC_WO_CHAIN, DL, IndexVT,
DAG.getConstant(Intrinsic::arm64_neon_tbl1, MVT::i32), V1Cst,
- DAG.getNode(ISD::BUILD_VECTOR, DL, IndexVT, &TBLMask[0], IndexLen));
+ DAG.getNode(ISD::BUILD_VECTOR, DL, IndexVT,
+ makeArrayRef(TBLMask.data(), IndexLen)));
} else {
// FIXME: We cannot, for the moment, emit a TBL2 instruction because we
// cannot currently represent the register constraints on the input
Shuffle = DAG.getNode(
ISD::INTRINSIC_WO_CHAIN, DL, IndexVT,
DAG.getConstant(Intrinsic::arm64_neon_tbl2, MVT::i32), V1Cst, V2Cst,
- DAG.getNode(ISD::BUILD_VECTOR, DL, IndexVT, &TBLMask[0], IndexLen));
+ DAG.getNode(ISD::BUILD_VECTOR, DL, IndexVT,
+ makeArrayRef(TBLMask.data(), IndexLen)));
}
}
return DAG.getNode(ISD::BITCAST, DL, Op.getValueType(), Shuffle);
return DAG.getNode(Opc, dl, V1.getValueType(), V1, V1);
}
+ SDValue Concat = tryFormConcatFromShuffle(Op, DAG);
+ if (Concat.getNode())
+ return Concat;
+
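+  // A shuffle that leaves one input intact except for a single lane can be
+  // lowered to INS: keep that input and insert the one mismatched lane from
+  // the other source.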
+ bool DstIsLeft;
+ int Anomaly;
+ int NumInputElements = V1.getValueType().getVectorNumElements();
+ if (isINSMask(ShuffleMask, NumInputElements, DstIsLeft, Anomaly)) {
+ SDValue DstVec = DstIsLeft ? V1 : V2;
+ SDValue DstLaneV = DAG.getConstant(Anomaly, MVT::i64);
+
+ SDValue SrcVec = V1;
+ int SrcLane = ShuffleMask[Anomaly];
+ if (SrcLane >= NumInputElements) {
+ SrcVec = V2;
+ SrcLane -= VT.getVectorNumElements();
+ }
+ SDValue SrcLaneV = DAG.getConstant(SrcLane, MVT::i64);
+
+ EVT ScalarVT = VT.getVectorElementType();
+ if (ScalarVT.getSizeInBits() < 32)
+ ScalarVT = MVT::i32;
+
+ return DAG.getNode(
+ ISD::INSERT_VECTOR_ELT, dl, VT, DstVec,
+ DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, ScalarVT, SrcVec, SrcLaneV),
+ DstLaneV);
+ }
+
// If the shuffle is not directly supported and it has 4 elements, use
// the PerfectShuffle-generated table to synthesize it from other shuffles.
unsigned NumElts = VT.getVectorNumElements();
if (!isa<ConstantFPSDNode>(V) && !isa<ConstantSDNode>(V))
isConstant = false;
- if (isa<ConstantSDNode>(V)) {
+ if (isa<ConstantSDNode>(V) || isa<ConstantFPSDNode>(V)) {
++NumConstantLanes;
if (!ConstantValue.getNode())
ConstantValue = V;
for (unsigned i = 0; i < NumElts; ++i)
Ops.push_back(DAG.getNode(ISD::BITCAST, dl, NewType, Op.getOperand(i)));
EVT VecVT = EVT::getVectorVT(*DAG.getContext(), NewType, NumElts);
- SDValue Val = DAG.getNode(ISD::BUILD_VECTOR, dl, VecVT, &Ops[0], NumElts);
+ SDValue Val = DAG.getNode(ISD::BUILD_VECTOR, dl, VecVT, Ops);
Val = LowerBUILD_VECTOR(Val, DAG);
if (Val.getNode())
return DAG.getNode(ISD::BITCAST, dl, VT, Val);
for (unsigned i = 0; i < NumElts; ++i) {
SDValue V = Op.getOperand(i);
SDValue LaneIdx = DAG.getConstant(i, MVT::i64);
- if (!isa<ConstantSDNode>(V)) {
+ if (!isa<ConstantSDNode>(V) && !isa<ConstantFPSDNode>(V)) {
// Note that type legalization likely mucked about with the VT of the
// source operand, so we may have to convert it here before inserting.
Val = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Val, V, LaneIdx);
return true;
}
- bool ReverseVEXT;
- unsigned Imm, WhichResult;
+ bool DummyBool;
+ int DummyInt;
+ unsigned DummyUnsigned;
return (ShuffleVectorSDNode::isSplatMask(&M[0], VT) || isREVMask(M, VT, 64) ||
isREVMask(M, VT, 32) || isREVMask(M, VT, 16) ||
- isEXTMask(M, VT, ReverseVEXT, Imm) ||
+ isEXTMask(M, VT, DummyBool, DummyUnsigned) ||
// isTBLMask(M, VT) || // FIXME: Port TBL support from ARM.
- isTRNMask(M, VT, WhichResult) || isUZPMask(M, VT, WhichResult) ||
- isZIPMask(M, VT, WhichResult) ||
- isTRN_v_undef_Mask(M, VT, WhichResult) ||
- isUZP_v_undef_Mask(M, VT, WhichResult) ||
- isZIP_v_undef_Mask(M, VT, WhichResult));
+ isTRNMask(M, VT, DummyUnsigned) || isUZPMask(M, VT, DummyUnsigned) ||
+ isZIPMask(M, VT, DummyUnsigned) ||
+ isTRN_v_undef_Mask(M, VT, DummyUnsigned) ||
+ isUZP_v_undef_Mask(M, VT, DummyUnsigned) ||
+ isZIP_v_undef_Mask(M, VT, DummyUnsigned) ||
+ isINSMask(M, VT.getVectorNumElements(), DummyBool, DummyInt) ||
+ isConcatMask(M, VT, VT.getSizeInBits() == 128));
}
/// getVShiftImm - Check if this is a valid build_vector for the immediate
return DAG.getNode(ARM64ISD::CMGE, dl, VT, RHS, LHS);
case ARM64CC::LS:
return DAG.getNode(ARM64ISD::CMHS, dl, VT, RHS, LHS);
- case ARM64CC::CC:
+ case ARM64CC::LO:
return DAG.getNode(ARM64ISD::CMHI, dl, VT, RHS, LHS);
case ARM64CC::LT:
if (IsZero)
return DAG.getNode(ARM64ISD::CMGT, dl, VT, RHS, LHS);
case ARM64CC::HI:
return DAG.getNode(ARM64ISD::CMHI, dl, VT, LHS, RHS);
- case ARM64CC::CS:
+ case ARM64CC::HS:
return DAG.getNode(ARM64ISD::CMHS, dl, VT, LHS, RHS);
}
}
// Unfortunately, the mapping of LLVM FP CC's onto ARM64 CC's isn't totally
// clean. Some of them require two branches to implement.
ARM64CC::CondCode CC1, CC2;
- changeFPCCToARM64CC(CC, CC1, CC2);
+ bool ShouldInvert;
+ changeVectorFPCCToARM64CC(CC, CC1, CC2, ShouldInvert);
bool NoNaNs = getTargetMachine().Options.NoNaNsFPMath;
- SDValue Cmp1 =
+ SDValue Cmp =
EmitVectorComparison(LHS, RHS, CC1, NoNaNs, Op.getValueType(), dl, DAG);
- if (!Cmp1.getNode())
+ if (!Cmp.getNode())
return SDValue();
if (CC2 != ARM64CC::AL) {
if (!Cmp2.getNode())
return SDValue();
- return DAG.getNode(ISD::OR, dl, Cmp1.getValueType(), Cmp1, Cmp2);
+ Cmp = DAG.getNode(ISD::OR, dl, Cmp.getValueType(), Cmp, Cmp2);
}
- return Cmp1;
+  if (ShouldInvert)
+    Cmp = DAG.getNOT(dl, Cmp, Cmp.getValueType());
+
+ return Cmp;
}
/// getTgtMemIntrinsic - Represent NEON load and store intrinsics as
case Intrinsic::arm64_neon_ld2:
case Intrinsic::arm64_neon_ld3:
case Intrinsic::arm64_neon_ld4:
+ case Intrinsic::arm64_neon_ld1x2:
+ case Intrinsic::arm64_neon_ld1x3:
+ case Intrinsic::arm64_neon_ld1x4:
case Intrinsic::arm64_neon_ld2lane:
case Intrinsic::arm64_neon_ld3lane:
case Intrinsic::arm64_neon_ld4lane:
case Intrinsic::arm64_neon_st2:
case Intrinsic::arm64_neon_st3:
case Intrinsic::arm64_neon_st4:
+ case Intrinsic::arm64_neon_st1x2:
+ case Intrinsic::arm64_neon_st1x3:
+ case Intrinsic::arm64_neon_st1x4:
case Intrinsic::arm64_neon_st2lane:
case Intrinsic::arm64_neon_st3lane:
case Intrinsic::arm64_neon_st4lane: {
Info.writeMem = true;
return true;
}
+ case Intrinsic::arm64_ldaxr:
case Intrinsic::arm64_ldxr: {
PointerType *PtrTy = cast<PointerType>(I.getArgOperand(0)->getType());
Info.opc = ISD::INTRINSIC_W_CHAIN;
Info.writeMem = false;
return true;
}
+ case Intrinsic::arm64_stlxr:
case Intrinsic::arm64_stxr: {
PointerType *PtrTy = cast<PointerType>(I.getArgOperand(1)->getType());
Info.opc = ISD::INTRINSIC_W_CHAIN;
Info.writeMem = true;
return true;
}
+ case Intrinsic::arm64_ldaxp:
case Intrinsic::arm64_ldxp: {
Info.opc = ISD::INTRINSIC_W_CHAIN;
Info.memVT = MVT::i128;
Info.writeMem = false;
return true;
}
+ case Intrinsic::arm64_stlxp:
case Intrinsic::arm64_stxp: {
Info.opc = ISD::INTRINSIC_W_CHAIN;
Info.memVT = MVT::i128;
// addressing mode). Just do two i64 stores of zero-registers.
bool Fast;
const Function *F = MF.getFunction();
- if (!IsMemset && Size >= 16 &&
+ if (Subtarget->hasFPARMv8() && !IsMemset && Size >= 16 &&
!F->getAttributes().hasAttribute(AttributeSet::FunctionIndex,
Attribute::NoImplicitFloat) &&
(memOpAlign(SrcAlign, DstAlign, 16) ||
- (allowsUnalignedMemoryAccesses(MVT::v2i64, 0, &Fast) && Fast)))
- return MVT::v2i64;
+ (allowsUnalignedMemoryAccesses(MVT::f128, 0, &Fast) && Fast)))
+ return MVT::f128;
return Size >= 8 ? MVT::i64 : MVT::i32;
}
return ScratchRegs;
}
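+/// isDesirableToCommuteWithShift - Block the generic DAG combine that
+/// commutes a shift with node N when doing so would break up a pattern that
+/// can be selected as a single UBFX.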
+bool ARM64TargetLowering::isDesirableToCommuteWithShift(const SDNode *N) const {
+ EVT VT = N->getValueType(0);
+  // If N is an unsigned bit extraction, ((x >> C) & mask), do not combine it
+  // with the shift, so that it can still be lowered to a single UBFX.
+ if (N->getOpcode() == ISD::AND && (VT == MVT::i32 || VT == MVT::i64) &&
+ isa<ConstantSDNode>(N->getOperand(1))) {
+ uint64_t TruncMask = N->getConstantOperandVal(1);
+ if (isMask_64(TruncMask) &&
+ N->getOperand(0).getOpcode() == ISD::SRL &&
+ isa<ConstantSDNode>(N->getOperand(0)->getOperand(1)))
+ return false;
+ }
+ return true;
+}
+
bool ARM64TargetLowering::shouldConvertConstantLoadToIntImm(const APInt &Imm,
Type *Ty) const {
assert(Ty->isIntegerTy());
DAG.getConstant(ShiftRHS, MVT::i64));
}
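+/// tryCombineToBSL - Fold (or (and X, CMask), (and Y, ~CMask)) with
+/// constant-vector masks into an ARM64ISD::BSL, i.e. a bitwise select of X
+/// and Y under CMask.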
+static SDValue tryCombineToBSL(SDNode *N,
+ TargetLowering::DAGCombinerInfo &DCI) {
+ EVT VT = N->getValueType(0);
+ SelectionDAG &DAG = DCI.DAG;
+ SDLoc DL(N);
+
+ if (!VT.isVector())
+ return SDValue();
+
+ SDValue N0 = N->getOperand(0);
+ if (N0.getOpcode() != ISD::AND)
+ return SDValue();
+
+ SDValue N1 = N->getOperand(1);
+ if (N1.getOpcode() != ISD::AND)
+ return SDValue();
+
+ // We only have to look for constant vectors here since the general, variable
+ // case can be handled in TableGen.
+ unsigned Bits = VT.getVectorElementType().getSizeInBits();
+ uint64_t BitMask = Bits == 64 ? -1ULL : ((1ULL << Bits) - 1);
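+  // The constant masks may sit in either operand of either AND, so try all
+  // four operand positions.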
+ for (int i = 1; i >= 0; --i)
+ for (int j = 1; j >= 0; --j) {
+ BuildVectorSDNode *BVN0 = dyn_cast<BuildVectorSDNode>(N0->getOperand(i));
+ BuildVectorSDNode *BVN1 = dyn_cast<BuildVectorSDNode>(N1->getOperand(j));
+ if (!BVN0 || !BVN1)
+ continue;
+
+ bool FoundMatch = true;
+ for (unsigned k = 0; k < VT.getVectorNumElements(); ++k) {
+ ConstantSDNode *CN0 = dyn_cast<ConstantSDNode>(BVN0->getOperand(k));
+ ConstantSDNode *CN1 = dyn_cast<ConstantSDNode>(BVN1->getOperand(k));
+ if (!CN0 || !CN1 ||
+ CN0->getZExtValue() != (BitMask & ~CN1->getZExtValue())) {
+ FoundMatch = false;
+ break;
+ }
+ }
+
+ if (FoundMatch)
+ return DAG.getNode(ARM64ISD::BSL, DL, VT, SDValue(BVN0, 0),
+ N0->getOperand(1 - i), N1->getOperand(1 - j));
+ }
+
+ return SDValue();
+}
+
static SDValue performORCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
const ARM64Subtarget *Subtarget) {
// Attempt to form an EXTR from (or (shl VAL1, #N), (srl VAL2, #RegWidth-N))
if (Res.getNode())
return Res;
+ Res = tryCombineToBSL(N, DCI);
+ if (Res.getNode())
+ return Res;
+
return SDValue();
}
S->getAlignment());
}
+/// Target-specific DAG combine function for NEON load/store intrinsics
+/// to merge base address updates.
+static SDValue performNEONPostLDSTCombine(SDNode *N,
+ TargetLowering::DAGCombinerInfo &DCI,
+ SelectionDAG &DAG) {
+ if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer())
+ return SDValue();
+
+ unsigned AddrOpIdx = N->getNumOperands() - 1;
+ SDValue Addr = N->getOperand(AddrOpIdx);
+
+ // Search for a use of the address operand that is an increment.
+ for (SDNode::use_iterator UI = Addr.getNode()->use_begin(),
+ UE = Addr.getNode()->use_end(); UI != UE; ++UI) {
+ SDNode *User = *UI;
+ if (User->getOpcode() != ISD::ADD ||
+ UI.getUse().getResNo() != Addr.getResNo())
+ continue;
+
+ // Check that the add is independent of the load/store. Otherwise, folding
+ // it would create a cycle.
+ if (User->isPredecessorOf(N) || N->isPredecessorOf(User))
+ continue;
+
+ // Find the new opcode for the updating load/store.
+ bool IsStore = false;
+ bool IsLaneOp = false;
+ bool IsDupOp = false;
+ unsigned NewOpc = 0;
+ unsigned NumVecs = 0;
+ unsigned IntNo = cast<ConstantSDNode>(N->getOperand(1))->getZExtValue();
+ switch (IntNo) {
+ default: llvm_unreachable("unexpected intrinsic for Neon base update");
+ case Intrinsic::arm64_neon_ld2: NewOpc = ARM64ISD::LD2post;
+ NumVecs = 2; break;
+ case Intrinsic::arm64_neon_ld3: NewOpc = ARM64ISD::LD3post;
+ NumVecs = 3; break;
+ case Intrinsic::arm64_neon_ld4: NewOpc = ARM64ISD::LD4post;
+ NumVecs = 4; break;
+ case Intrinsic::arm64_neon_st2: NewOpc = ARM64ISD::ST2post;
+ NumVecs = 2; IsStore = true; break;
+ case Intrinsic::arm64_neon_st3: NewOpc = ARM64ISD::ST3post;
+ NumVecs = 3; IsStore = true; break;
+ case Intrinsic::arm64_neon_st4: NewOpc = ARM64ISD::ST4post;
+ NumVecs = 4; IsStore = true; break;
+ case Intrinsic::arm64_neon_ld1x2: NewOpc = ARM64ISD::LD1x2post;
+ NumVecs = 2; break;
+ case Intrinsic::arm64_neon_ld1x3: NewOpc = ARM64ISD::LD1x3post;
+ NumVecs = 3; break;
+ case Intrinsic::arm64_neon_ld1x4: NewOpc = ARM64ISD::LD1x4post;
+ NumVecs = 4; break;
+ case Intrinsic::arm64_neon_st1x2: NewOpc = ARM64ISD::ST1x2post;
+ NumVecs = 2; IsStore = true; break;
+ case Intrinsic::arm64_neon_st1x3: NewOpc = ARM64ISD::ST1x3post;
+ NumVecs = 3; IsStore = true; break;
+ case Intrinsic::arm64_neon_st1x4: NewOpc = ARM64ISD::ST1x4post;
+ NumVecs = 4; IsStore = true; break;
+ case Intrinsic::arm64_neon_ld2r: NewOpc = ARM64ISD::LD2DUPpost;
+ NumVecs = 2; IsDupOp = true; break;
+ case Intrinsic::arm64_neon_ld3r: NewOpc = ARM64ISD::LD3DUPpost;
+ NumVecs = 3; IsDupOp = true; break;
+ case Intrinsic::arm64_neon_ld4r: NewOpc = ARM64ISD::LD4DUPpost;
+ NumVecs = 4; IsDupOp = true; break;
+ case Intrinsic::arm64_neon_ld2lane: NewOpc = ARM64ISD::LD2LANEpost;
+ NumVecs = 2; IsLaneOp = true; break;
+ case Intrinsic::arm64_neon_ld3lane: NewOpc = ARM64ISD::LD3LANEpost;
+ NumVecs = 3; IsLaneOp = true; break;
+ case Intrinsic::arm64_neon_ld4lane: NewOpc = ARM64ISD::LD4LANEpost;
+ NumVecs = 4; IsLaneOp = true; break;
+ case Intrinsic::arm64_neon_st2lane: NewOpc = ARM64ISD::ST2LANEpost;
+ NumVecs = 2; IsStore = true; IsLaneOp = true; break;
+ case Intrinsic::arm64_neon_st3lane: NewOpc = ARM64ISD::ST3LANEpost;
+ NumVecs = 3; IsStore = true; IsLaneOp = true; break;
+ case Intrinsic::arm64_neon_st4lane: NewOpc = ARM64ISD::ST4LANEpost;
+ NumVecs = 4; IsStore = true; IsLaneOp = true; break;
+ }
+
+ EVT VecTy;
+ if (IsStore)
+ VecTy = N->getOperand(2).getValueType();
+ else
+ VecTy = N->getValueType(0);
+
+ // If the increment is a constant, it must match the memory ref size.
+ SDValue Inc = User->getOperand(User->getOperand(0) == Addr ? 1 : 0);
+ if (ConstantSDNode *CInc = dyn_cast<ConstantSDNode>(Inc.getNode())) {
+ uint32_t IncVal = CInc->getZExtValue();
+ unsigned NumBytes = NumVecs * VecTy.getSizeInBits() / 8;
+ if (IsLaneOp || IsDupOp)
+ NumBytes /= VecTy.getVectorNumElements();
+ if (IncVal != NumBytes)
+ continue;
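+      // The backend represents a post-increment by the transfer size with an
+      // XZR offset operand, which selects the immediate post-indexed form.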
+ Inc = DAG.getRegister(ARM64::XZR, MVT::i64);
+ }
+ SmallVector<SDValue, 8> Ops;
+ Ops.push_back(N->getOperand(0)); // Incoming chain
+ // Load lane and store have vector list as input.
+ if (IsLaneOp || IsStore)
+ for (unsigned i = 2; i < AddrOpIdx; ++i)
+ Ops.push_back(N->getOperand(i));
+ Ops.push_back(N->getOperand(AddrOpIdx)); // Base register
+ Ops.push_back(Inc);
+
+ // Return Types.
+ EVT Tys[6];
+ unsigned NumResultVecs = (IsStore ? 0 : NumVecs);
+ unsigned n;
+ for (n = 0; n < NumResultVecs; ++n)
+ Tys[n] = VecTy;
+ Tys[n++] = MVT::i64; // Type of write back register
+ Tys[n] = MVT::Other; // Type of the chain
+ SDVTList SDTys = DAG.getVTList(ArrayRef<EVT>(Tys, NumResultVecs + 2));
+
+ MemIntrinsicSDNode *MemInt = cast<MemIntrinsicSDNode>(N);
+ SDValue UpdN = DAG.getMemIntrinsicNode(NewOpc, SDLoc(N), SDTys, Ops,
+ MemInt->getMemoryVT(),
+ MemInt->getMemOperand());
+
+ // Update the uses.
+ std::vector<SDValue> NewResults;
+ for (unsigned i = 0; i < NumResultVecs; ++i) {
+ NewResults.push_back(SDValue(UpdN.getNode(), i));
+ }
+ NewResults.push_back(SDValue(UpdN.getNode(), NumResultVecs + 1));
+ DCI.CombineTo(N, NewResults);
+ DCI.CombineTo(User, SDValue(UpdN.getNode(), NumResultVecs));
+
+ break;
+ }
+ return SDValue();
+}
+
// Optimize compare with zero and branch.
static SDValue performBRCONDCombine(SDNode *N,
TargetLowering::DAGCombinerInfo &DCI,
IfTrue, IfFalse);
}
+/// A vector select: "(select vL, vR, (setcc LHS, RHS))" is best performed with
+/// the compare-mask instructions rather than going via NZCV, even if LHS and
+/// RHS are really scalar. This replaces any scalar setcc in the above pattern
+/// with a vector one followed by a DUP shuffle on the result.
+static SDValue performSelectCombine(SDNode *N, SelectionDAG &DAG) {
+ SDValue N0 = N->getOperand(0);
+ EVT ResVT = N->getValueType(0);
+
+ if (!N->getOperand(1).getValueType().isVector())
+ return SDValue();
+
+ if (N0.getOpcode() != ISD::SETCC || N0.getValueType() != MVT::i1)
+ return SDValue();
+
+ SDLoc DL(N0);
+
+ EVT SrcVT = N0.getOperand(0).getValueType();
+ SrcVT = EVT::getVectorVT(*DAG.getContext(), SrcVT,
+ ResVT.getSizeInBits() / SrcVT.getSizeInBits());
+ EVT CCVT = SrcVT.changeVectorElementTypeToInteger();
+
+ // First perform a vector comparison, where lane 0 is the one we're interested
+ // in.
+ SDValue LHS =
+ DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, SrcVT, N0.getOperand(0));
+ SDValue RHS =
+ DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, SrcVT, N0.getOperand(1));
+ SDValue SetCC = DAG.getNode(ISD::SETCC, DL, CCVT, LHS, RHS, N0.getOperand(2));
+
+ // Now duplicate the comparison mask we want across all other lanes.
+ SmallVector<int, 8> DUPMask(CCVT.getVectorNumElements(), 0);
+ SDValue Mask = DAG.getVectorShuffle(CCVT, DL, SetCC, SetCC, DUPMask.data());
+ Mask = DAG.getNode(ISD::BITCAST, DL, ResVT.changeVectorElementTypeToInteger(),
+ Mask);
+
+ return DAG.getSelect(DL, ResVT, Mask, N->getOperand(1), N->getOperand(2));
+}
+
SDValue ARM64TargetLowering::PerformDAGCombine(SDNode *N,
DAGCombinerInfo &DCI) const {
SelectionDAG &DAG = DCI.DAG;
return performBitcastCombine(N, DCI, DAG);
case ISD::CONCAT_VECTORS:
return performConcatVectorsCombine(N, DCI, DAG);
+ case ISD::SELECT:
+ return performSelectCombine(N, DAG);
case ISD::VSELECT:
return performVSelectCombine(N, DCI.DAG);
case ISD::STORE:
return performSTORECombine(N, DCI, DAG, Subtarget);
case ARM64ISD::BRCOND:
return performBRCONDCombine(N, DCI, DAG);
+ case ISD::INTRINSIC_VOID:
+ case ISD::INTRINSIC_W_CHAIN:
+ switch (cast<ConstantSDNode>(N->getOperand(1))->getZExtValue()) {
+ case Intrinsic::arm64_neon_ld2:
+ case Intrinsic::arm64_neon_ld3:
+ case Intrinsic::arm64_neon_ld4:
+ case Intrinsic::arm64_neon_ld1x2:
+ case Intrinsic::arm64_neon_ld1x3:
+ case Intrinsic::arm64_neon_ld1x4:
+ case Intrinsic::arm64_neon_ld2lane:
+ case Intrinsic::arm64_neon_ld3lane:
+ case Intrinsic::arm64_neon_ld4lane:
+ case Intrinsic::arm64_neon_ld2r:
+ case Intrinsic::arm64_neon_ld3r:
+ case Intrinsic::arm64_neon_ld4r:
+ case Intrinsic::arm64_neon_st2:
+ case Intrinsic::arm64_neon_st3:
+ case Intrinsic::arm64_neon_st4:
+ case Intrinsic::arm64_neon_st1x2:
+ case Intrinsic::arm64_neon_st1x3:
+ case Intrinsic::arm64_neon_st1x4:
+ case Intrinsic::arm64_neon_st2lane:
+ case Intrinsic::arm64_neon_st3lane:
+ case Intrinsic::arm64_neon_st4lane:
+ return performNEONPostLDSTCombine(N, DCI, DAG);
+ default:
+ break;
+ }
}
return SDValue();
}
return false;
bool HasRet = false;
- for (SDNode::use_iterator UI = Copy->use_begin(), UE = Copy->use_end();
- UI != UE; ++UI) {
- if (UI->getOpcode() != ARM64ISD::RET_FLAG)
+ for (SDNode *Node : Copy->uses()) {
+ if (Node->getOpcode() != ARM64ISD::RET_FLAG)
return false;
HasRet = true;
}
return true;
}
-/// The only 128-bit atomic operation is an stxp that succeeds. In particular
-/// neither ldp nor ldxp are atomic. So the canonical sequence for an atomic
-/// load is:
-/// loop:
-/// ldxp x0, x1, [x8]
-/// stxp w2, x0, x1, [x8]
-/// cbnz w2, loop
-/// If the stxp succeeds then the ldxp managed to get both halves without an
-/// intervening stxp from a different thread and the read was atomic.
-static void ReplaceATOMIC_LOAD_128(SDNode *N, SmallVectorImpl<SDValue> &Results,
- SelectionDAG &DAG) {
- SDLoc DL(N);
- AtomicSDNode *AN = cast<AtomicSDNode>(N);
- EVT VT = AN->getMemoryVT();
- SDValue Zero = DAG.getConstant(0, VT);
-
- // FIXME: Really want ATOMIC_LOAD_NOP but that doesn't fit into the existing
- // scheme very well. Given the complexity of what we're already generating, an
- // extra couple of ORRs probably won't make much difference.
- SDValue Result = DAG.getAtomic(ISD::ATOMIC_LOAD_OR, DL, AN->getMemoryVT(),
- N->getOperand(0), N->getOperand(1), Zero,
- AN->getMemOperand(), AN->getOrdering(),
- AN->getSynchScope());
-
- Results.push_back(Result.getValue(0)); // Value
- Results.push_back(Result.getValue(1)); // Chain
-}
-
-static void ReplaceATOMIC_OP_128(SDNode *N, SmallVectorImpl<SDValue> &Results,
- SelectionDAG &DAG, unsigned NewOp) {
- SDLoc DL(N);
- AtomicOrdering Ordering = cast<AtomicSDNode>(N)->getOrdering();
- assert(N->getValueType(0) == MVT::i128 &&
- "Only know how to expand i128 atomics");
-
- SmallVector<SDValue, 6> Ops;
- Ops.push_back(N->getOperand(1)); // Ptr
- // Low part of Val1
- Ops.push_back(DAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i64,
- N->getOperand(2), DAG.getIntPtrConstant(0)));
- // High part of Val1
- Ops.push_back(DAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i64,
- N->getOperand(2), DAG.getIntPtrConstant(1)));
- if (NewOp == ARM64::ATOMIC_CMP_SWAP_I128) {
- // Low part of Val2
- Ops.push_back(DAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i64,
- N->getOperand(3), DAG.getIntPtrConstant(0)));
- // High part of Val2
- Ops.push_back(DAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i64,
- N->getOperand(3), DAG.getIntPtrConstant(1)));
- }
-
- Ops.push_back(DAG.getTargetConstant(Ordering, MVT::i32));
- Ops.push_back(N->getOperand(0)); // Chain
-
- SDVTList Tys = DAG.getVTList(MVT::i64, MVT::i64, MVT::Other);
- SDNode *Result = DAG.getMachineNode(NewOp, DL, Tys, Ops);
- SDValue OpsF[] = { SDValue(Result, 0), SDValue(Result, 1) };
- Results.push_back(DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i128, OpsF, 2));
- Results.push_back(SDValue(Result, 2));
-}
-
void ARM64TargetLowering::ReplaceNodeResults(SDNode *N,
SmallVectorImpl<SDValue> &Results,
SelectionDAG &DAG) const {
switch (N->getOpcode()) {
default:
llvm_unreachable("Don't know how to custom expand this");
- case ISD::ATOMIC_LOAD:
- ReplaceATOMIC_LOAD_128(N, Results, DAG);
- return;
- case ISD::ATOMIC_LOAD_ADD:
- ReplaceATOMIC_OP_128(N, Results, DAG, ARM64::ATOMIC_LOAD_ADD_I128);
- return;
- case ISD::ATOMIC_LOAD_SUB:
- ReplaceATOMIC_OP_128(N, Results, DAG, ARM64::ATOMIC_LOAD_SUB_I128);
- return;
- case ISD::ATOMIC_LOAD_AND:
- ReplaceATOMIC_OP_128(N, Results, DAG, ARM64::ATOMIC_LOAD_AND_I128);
- return;
- case ISD::ATOMIC_LOAD_OR:
- ReplaceATOMIC_OP_128(N, Results, DAG, ARM64::ATOMIC_LOAD_OR_I128);
- return;
- case ISD::ATOMIC_LOAD_XOR:
- ReplaceATOMIC_OP_128(N, Results, DAG, ARM64::ATOMIC_LOAD_XOR_I128);
- return;
- case ISD::ATOMIC_LOAD_NAND:
- ReplaceATOMIC_OP_128(N, Results, DAG, ARM64::ATOMIC_LOAD_NAND_I128);
- return;
- case ISD::ATOMIC_SWAP:
- ReplaceATOMIC_OP_128(N, Results, DAG, ARM64::ATOMIC_SWAP_I128);
- return;
- case ISD::ATOMIC_LOAD_MIN:
- ReplaceATOMIC_OP_128(N, Results, DAG, ARM64::ATOMIC_LOAD_MIN_I128);
- return;
- case ISD::ATOMIC_LOAD_MAX:
- ReplaceATOMIC_OP_128(N, Results, DAG, ARM64::ATOMIC_LOAD_MAX_I128);
- return;
- case ISD::ATOMIC_LOAD_UMIN:
- ReplaceATOMIC_OP_128(N, Results, DAG, ARM64::ATOMIC_LOAD_UMIN_I128);
- return;
- case ISD::ATOMIC_LOAD_UMAX:
- ReplaceATOMIC_OP_128(N, Results, DAG, ARM64::ATOMIC_LOAD_UMAX_I128);
- return;
- case ISD::ATOMIC_CMP_SWAP:
- ReplaceATOMIC_OP_128(N, Results, DAG, ARM64::ATOMIC_CMP_SWAP_I128);
- return;
case ISD::FP_TO_UINT:
case ISD::FP_TO_SINT:
assert(N->getValueType(0) == MVT::i128 && "unexpected illegal conversion");
return;
}
}
+
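+/// shouldExpandAtomicInIR - Tell the IR-level atomic expansion pass which
+/// instructions to rewrite into explicit load-linked/store-conditional
+/// loops.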
+bool ARM64TargetLowering::shouldExpandAtomicInIR(Instruction *Inst) const {
+ // Loads and stores less than 128-bits are already atomic; ones above that
+ // are doomed anyway, so defer to the default libcall and blame the OS when
+ // things go wrong:
+ if (StoreInst *SI = dyn_cast<StoreInst>(Inst))
+ return SI->getValueOperand()->getType()->getPrimitiveSizeInBits() == 128;
+ else if (LoadInst *LI = dyn_cast<LoadInst>(Inst))
+ return LI->getType()->getPrimitiveSizeInBits() == 128;
+
+ // For the real atomic operations, we have ldxr/stxr up to 128 bits.
+ return Inst->getType()->getPrimitiveSizeInBits() <= 128;
+}
+
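+/// emitLoadLinked - Emit a load-exclusive of Addr for IR-level atomic
+/// expansion: ldxr normally, ldaxr when acquire semantics are required, and
+/// the paired ldxp/ldaxp forms for 128-bit values.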
+Value *ARM64TargetLowering::emitLoadLinked(IRBuilder<> &Builder, Value *Addr,
+ AtomicOrdering Ord) const {
+ Module *M = Builder.GetInsertBlock()->getParent()->getParent();
+ Type *ValTy = cast<PointerType>(Addr->getType())->getElementType();
+ bool IsAcquire =
+ Ord == Acquire || Ord == AcquireRelease || Ord == SequentiallyConsistent;
+
+  // Since i128 isn't legal and intrinsics don't get type-lowered, the ldxp
+  // intrinsic must return {i64, i64} and we have to recombine the halves into
+  // a single i128 here.
+ if (ValTy->getPrimitiveSizeInBits() == 128) {
+ Intrinsic::ID Int =
+ IsAcquire ? Intrinsic::arm64_ldaxp : Intrinsic::arm64_ldxp;
+ Function *Ldxr = llvm::Intrinsic::getDeclaration(M, Int);
+
+ Addr = Builder.CreateBitCast(Addr, Type::getInt8PtrTy(M->getContext()));
+ Value *LoHi = Builder.CreateCall(Ldxr, Addr, "lohi");
+
+ Value *Lo = Builder.CreateExtractValue(LoHi, 0, "lo");
+ Value *Hi = Builder.CreateExtractValue(LoHi, 1, "hi");
+ Lo = Builder.CreateZExt(Lo, ValTy, "lo64");
+ Hi = Builder.CreateZExt(Hi, ValTy, "hi64");
+ return Builder.CreateOr(
+ Lo, Builder.CreateShl(Hi, ConstantInt::get(ValTy, 64)), "val64");
+ }
+
+ Type *Tys[] = { Addr->getType() };
+ Intrinsic::ID Int =
+ IsAcquire ? Intrinsic::arm64_ldaxr : Intrinsic::arm64_ldxr;
+ Function *Ldxr = llvm::Intrinsic::getDeclaration(M, Int, Tys);
+
+  return Builder.CreateTruncOrBitCast(Builder.CreateCall(Ldxr, Addr), ValTy);
+}
+
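+/// emitStoreConditional - Emit the matching store-exclusive: stxr normally,
+/// stlxr when release semantics are required, and the paired stxp/stlxp
+/// forms for 128-bit values. The i32 result is 0 on success, 1 on failure.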
+Value *ARM64TargetLowering::emitStoreConditional(IRBuilder<> &Builder,
+ Value *Val, Value *Addr,
+ AtomicOrdering Ord) const {
+ Module *M = Builder.GetInsertBlock()->getParent()->getParent();
+ bool IsRelease =
+ Ord == Release || Ord == AcquireRelease || Ord == SequentiallyConsistent;
+
+ // Since the intrinsics must have legal type, the i128 intrinsics take two
+ // parameters: "i64, i64". We must marshal Val into the appropriate form
+ // before the call.
+ if (Val->getType()->getPrimitiveSizeInBits() == 128) {
+ Intrinsic::ID Int =
+ IsRelease ? Intrinsic::arm64_stlxp : Intrinsic::arm64_stxp;
+ Function *Stxr = Intrinsic::getDeclaration(M, Int);
+ Type *Int64Ty = Type::getInt64Ty(M->getContext());
+
+ Value *Lo = Builder.CreateTrunc(Val, Int64Ty, "lo");
+ Value *Hi = Builder.CreateTrunc(Builder.CreateLShr(Val, 64), Int64Ty, "hi");
+ Addr = Builder.CreateBitCast(Addr, Type::getInt8PtrTy(M->getContext()));
+ return Builder.CreateCall3(Stxr, Lo, Hi, Addr);
+ }
+
+ Intrinsic::ID Int =
+ IsRelease ? Intrinsic::arm64_stlxr : Intrinsic::arm64_stxr;
+ Type *Tys[] = { Addr->getType() };
+ Function *Stxr = Intrinsic::getDeclaration(M, Int, Tys);
+
+ return Builder.CreateCall2(
+ Stxr, Builder.CreateZExtOrBitCast(
+ Val, Stxr->getFunctionType()->getParamType(0)),
+ Addr);
+}