X-Git-Url: http://plrg.eecs.uci.edu/git/?a=blobdiff_plain;f=lib%2FTarget%2FAArch64%2FAArch64InstrNEON.td;h=42eb868c10b49308897abdf6fdabdd889f313675;hb=1296bb3ba673268c0a69e45b758548625d3f4026;hp=355de5376868f978ea3878f377f6bedc9f8330f6;hpb=6a5a667517160ca1b557002a29d08868ae029451;p=oota-llvm.git diff --git a/lib/Target/AArch64/AArch64InstrNEON.td b/lib/Target/AArch64/AArch64InstrNEON.td index 355de537686..42eb868c10b 100644 --- a/lib/Target/AArch64/AArch64InstrNEON.td +++ b/lib/Target/AArch64/AArch64InstrNEON.td @@ -41,16 +41,31 @@ def Neon_cmpz : SDNode<"AArch64ISD::NEON_CMPZ", SDTypeProfile<1, 3, def Neon_tst : SDNode<"AArch64ISD::NEON_TST", SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisSameAs<1, 2>]>>; -def Neon_dupImm : SDNode<"AArch64ISD::NEON_DUPIMM", SDTypeProfile<1, 1, - [SDTCisVec<0>, SDTCisVT<1, i32>]>>; - def SDTARMVSH : SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisSameAs<0, 1>, SDTCisVT<2, i32>]>; def Neon_sqrshlImm : SDNode<"AArch64ISD::NEON_QSHLs", SDTARMVSH>; def Neon_uqrshlImm : SDNode<"AArch64ISD::NEON_QSHLu", SDTARMVSH>; +def SDTPERMUTE : SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisSameAs<0, 1>, + SDTCisSameAs<0, 2>]>; +def Neon_uzp1 : SDNode<"AArch64ISD::NEON_UZP1", SDTPERMUTE>; +def Neon_uzp2 : SDNode<"AArch64ISD::NEON_UZP2", SDTPERMUTE>; +def Neon_zip1 : SDNode<"AArch64ISD::NEON_ZIP1", SDTPERMUTE>; +def Neon_zip2 : SDNode<"AArch64ISD::NEON_ZIP2", SDTPERMUTE>; +def Neon_trn1 : SDNode<"AArch64ISD::NEON_TRN1", SDTPERMUTE>; +def Neon_trn2 : SDNode<"AArch64ISD::NEON_TRN2", SDTPERMUTE>; + +def SDTVSHUF : SDTypeProfile<1, 1, [SDTCisVec<0>, SDTCisSameAs<0, 1>]>; +def Neon_rev64 : SDNode<"AArch64ISD::NEON_REV64", SDTVSHUF>; +def Neon_rev32 : SDNode<"AArch64ISD::NEON_REV32", SDTVSHUF>; +def Neon_rev16 : SDNode<"AArch64ISD::NEON_REV16", SDTVSHUF>; +def Neon_vdup : SDNode<"AArch64ISD::NEON_VDUP", SDTypeProfile<1, 1, + [SDTCisVec<0>]>>; def Neon_vduplane : SDNode<"AArch64ISD::NEON_VDUPLANE", SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisVec<1>, SDTCisVT<2, i64>]>>; +def Neon_vextract : SDNode<"AArch64ISD::NEON_VEXTRACT", SDTypeProfile<1, 3, + [SDTCisVec<0>, SDTCisSameAs<0, 1>, + SDTCisSameAs<0, 2>, SDTCisVT<3, i64>]>>; //===----------------------------------------------------------------------===// // Multiclasses @@ -59,8 +74,7 @@ def Neon_vduplane : SDNode<"AArch64ISD::NEON_VDUPLANE", SDTypeProfile<1, 2, multiclass NeonI_3VSame_B_sizes size, bits<5> opcode, string asmop, SDPatternOperator opnode8B, SDPatternOperator opnode16B, - bit Commutable = 0> -{ + bit Commutable = 0> { let isCommutable = Commutable in { def _8B : NeonI_3VSame<0b0, u, size, opcode, (outs VPR64:$Rd), (ins VPR64:$Rn, VPR64:$Rm), @@ -81,8 +95,7 @@ multiclass NeonI_3VSame_B_sizes size, bits<5> opcode, multiclass NeonI_3VSame_HS_sizes opcode, string asmop, SDPatternOperator opnode, - bit Commutable = 0> -{ + bit Commutable = 0> { let isCommutable = Commutable in { def _4H : NeonI_3VSame<0b0, u, 0b01, opcode, (outs VPR64:$Rd), (ins VPR64:$Rn, VPR64:$Rm), @@ -116,8 +129,7 @@ multiclass NeonI_3VSame_HS_sizes opcode, multiclass NeonI_3VSame_BHS_sizes opcode, string asmop, SDPatternOperator opnode, bit Commutable = 0> - : NeonI_3VSame_HS_sizes -{ + : NeonI_3VSame_HS_sizes { let isCommutable = Commutable in { def _8B : NeonI_3VSame<0b0, u, 0b00, opcode, (outs VPR64:$Rd), (ins VPR64:$Rn, VPR64:$Rm), @@ -138,8 +150,7 @@ multiclass NeonI_3VSame_BHS_sizes opcode, multiclass NeonI_3VSame_BHSD_sizes opcode, string asmop, SDPatternOperator opnode, bit Commutable = 0> - : NeonI_3VSame_BHS_sizes -{ + : NeonI_3VSame_BHS_sizes { let isCommutable 
= Commutable in { def _2D : NeonI_3VSame<0b1, u, 0b11, opcode, (outs VPR128:$Rd), (ins VPR128:$Rn, VPR128:$Rm), @@ -157,8 +168,7 @@ multiclass NeonI_3VSame_SD_sizes opcode, SDPatternOperator opnode4S, SDPatternOperator opnode2D, ValueType ResTy2S, ValueType ResTy4S, - ValueType ResTy2D, bit Commutable = 0> -{ + ValueType ResTy2D, bit Commutable = 0> { let isCommutable = Commutable in { def _2S : NeonI_3VSame<0b0, u, {size, 0b0}, opcode, (outs VPR64:$Rd), (ins VPR64:$Rn, VPR64:$Rm), @@ -217,7 +227,7 @@ defm PMULvvv : NeonI_3VSame_B_sizes<0b1, 0b00, 0b10011, "pmul", // class NeonI_3VSame_Constraint_impl: NeonI_3VSame with no data type and // two operands constraints. class NeonI_3VSame_Constraint_impl size, + RegisterOperand VPRC, ValueType OpTy, bit q, bit u, bits<2> size, bits<5> opcode, SDPatternOperator opnode> : NeonI_3VSame; -def Neon_immAllOnes: PatLeaf<(Neon_movi (i32 timm), (i32 imm)), [{ - ConstantSDNode *ImmConstVal = cast(N->getOperand(0)); - ConstantSDNode *OpCmodeConstVal = cast(N->getOperand(1)); - unsigned EltBits; - uint64_t EltVal = A64Imms::decodeNeonModImm(ImmConstVal->getZExtValue(), - OpCmodeConstVal->getZExtValue(), EltBits); - return (EltBits == 8 && EltVal == 0xff); -}]>; - +// The MOVI instruction takes two immediate operands. The first is the +// immediate encoding, while the second is the cmode. A cmode of 14, or +// 0b1110, produces a MOVI operation, rather than a MVNI, ORR, or BIC. +def Neon_AllZero : PatFrag<(ops), (Neon_movi (i32 0), (i32 14))>; +def Neon_AllOne : PatFrag<(ops), (Neon_movi (i32 255), (i32 14))>; def Neon_not8B : PatFrag<(ops node:$in), - (xor node:$in, (bitconvert (v8i8 Neon_immAllOnes)))>; + (xor node:$in, (bitconvert (v8i8 Neon_AllOne)))>; def Neon_not16B : PatFrag<(ops node:$in), - (xor node:$in, (bitconvert (v16i8 Neon_immAllOnes)))>; + (xor node:$in, (bitconvert (v16i8 Neon_AllOne)))>; def Neon_orn8B : PatFrag<(ops node:$Rn, node:$Rm), (or node:$Rn, (Neon_not8B node:$Rm))>; @@ -460,6 +466,9 @@ multiclass Neon_bitwise3V_patterns; + def : Pat<(v1f64 (int_arm_neon_vbsl (v1f64 VPR64:$src), + (v1f64 VPR64:$Rn), (v1f64 VPR64:$Rm))), + (INST8B VPR64:$src, VPR64:$Rn, VPR64:$Rm)>; def : Pat<(v16i8 (int_arm_neon_vbsl (v16i8 VPR128:$src), (v16i8 VPR128:$Rn), (v16i8 VPR128:$Rm))), (INST16B VPR128:$src, VPR128:$Rn, VPR128:$Rm)>; @@ -1059,7 +1068,7 @@ def neon_uimm8_asmoperand : AsmOperandClass def neon_uimm8 : Operand, ImmLeaf { let ParserMatchClass = neon_uimm8_asmoperand; - let PrintMethod = "printNeonUImm8Operand"; + let PrintMethod = "printUImmHexOperand"; } def neon_uimm64_mask_asmoperand : AsmOperandClass @@ -1084,7 +1093,7 @@ multiclass NeonI_mov_imm_lsl_sizes; class NeonI_mov_imm_lsl_aliases - : NeonInstAlias; // Aliases for Vector Move Immediate Shifted @@ -1436,20 +1445,25 @@ def FMOVvi_4S : NeonI_FMOV_impl<".4s", VPR128, v4f32, fmov32_operand, 0b1, 0b0>; def FMOVvi_2D : NeonI_FMOV_impl<".2d", VPR128, v2f64, fmov64_operand, 0b1, 0b1>; } -// Vector Shift (Immediate) +// Vector Shift (Immediate) // Immediate in [0, 63] def imm0_63 : Operand { let ParserMatchClass = uimm6_asmoperand; } -// Shift Right Immediate - A shift right immediate is encoded differently from -// other shift immediates. 
The immh:immb field is encoded like so: +// Shift Right/Left Immediate - The immh:immb field of these shifts are encoded +// as follows: // // Offset Encoding // 8 immh:immb<6:3> = '0001xxx', is encoded in immh:immb<2:0> // 16 immh:immb<6:4> = '001xxxx', is encoded in immh:immb<3:0> // 32 immh:immb<6:5> = '01xxxxx', is encoded in immh:immb<4:0> // 64 immh:immb<6> = '1xxxxxx', is encoded in immh:immb<5:0> +// +// The shift right immediate amount, in the range 1 to element bits, is computed +// as Offset - UInt(immh:immb). The shift left immediate amount, in the range 0 +// to element bits - 1, is computed as UInt(immh:immb) - Offset. + class shr_imm_asmoperands : AsmOperandClass { let Name = "ShrImm" # OFFSET; let RenderMethod = "addImmOperands"; @@ -1459,7 +1473,7 @@ class shr_imm_asmoperands : AsmOperandClass { class shr_imm : Operand { let EncoderMethod = "getShiftRightImm" # OFFSET; let DecoderMethod = "DecodeShiftRightImm" # OFFSET; - let ParserMatchClass = + let ParserMatchClass = !cast("shr_imm" # OFFSET # "_asmoperand"); } @@ -1468,10 +1482,33 @@ def shr_imm16_asmoperand : shr_imm_asmoperands<"16">; def shr_imm32_asmoperand : shr_imm_asmoperands<"32">; def shr_imm64_asmoperand : shr_imm_asmoperands<"64">; -def shr_imm8 : shr_imm<"8">; -def shr_imm16 : shr_imm<"16">; -def shr_imm32 : shr_imm<"32">; -def shr_imm64 : shr_imm<"64">; +def shr_imm8 : shr_imm<"8">, ImmLeaf 0 && Imm <= 8;}]>; +def shr_imm16 : shr_imm<"16">, ImmLeaf 0 && Imm <= 16;}]>; +def shr_imm32 : shr_imm<"32">, ImmLeaf 0 && Imm <= 32;}]>; +def shr_imm64 : shr_imm<"64">, ImmLeaf 0 && Imm <= 64;}]>; + +class shl_imm_asmoperands : AsmOperandClass { + let Name = "ShlImm" # OFFSET; + let RenderMethod = "addImmOperands"; + let DiagnosticType = "ShlImm" # OFFSET; +} + +class shl_imm : Operand { + let EncoderMethod = "getShiftLeftImm" # OFFSET; + let DecoderMethod = "DecodeShiftLeftImm" # OFFSET; + let ParserMatchClass = + !cast("shl_imm" # OFFSET # "_asmoperand"); +} + +def shl_imm8_asmoperand : shl_imm_asmoperands<"8">; +def shl_imm16_asmoperand : shl_imm_asmoperands<"16">; +def shl_imm32_asmoperand : shl_imm_asmoperands<"32">; +def shl_imm64_asmoperand : shl_imm_asmoperands<"64">; + +def shl_imm8 : shl_imm<"8">, ImmLeaf= 0 && Imm < 8;}]>; +def shl_imm16 : shl_imm<"16">, ImmLeaf= 0 && Imm < 16;}]>; +def shl_imm32 : shl_imm<"32">, ImmLeaf= 0 && Imm < 32;}]>; +def shl_imm64 : shl_imm<"64">, ImmLeaf= 0 && Imm < 64;}]>; class N2VShift opcode, string asmop, string T, RegisterOperand VPRC, ValueType Ty, Operand ImmTy, SDNode OpNode> @@ -1480,37 +1517,37 @@ class N2VShift opcode, string asmop, string T, asmop # "\t$Rd." # T # ", $Rn." # T # ", $Imm", [(set (Ty VPRC:$Rd), (Ty (OpNode (Ty VPRC:$Rn), - (Ty (Neon_dupImm (i32 imm:$Imm))))))], + (Ty (Neon_vdup (i32 ImmTy:$Imm))))))], NoItinerary>; multiclass NeonI_N2VShL opcode, string asmop> { // 64-bit vector types. - def _8B : N2VShift<0b0, u, opcode, asmop, "8b", VPR64, v8i8, uimm3, shl> { + def _8B : N2VShift<0b0, u, opcode, asmop, "8b", VPR64, v8i8, shl_imm8, shl> { let Inst{22-19} = 0b0001; // immh:immb = 0001xxx } - def _4H : N2VShift<0b0, u, opcode, asmop, "4h", VPR64, v4i16, uimm4, shl> { + def _4H : N2VShift<0b0, u, opcode, asmop, "4h", VPR64, v4i16, shl_imm16, shl> { let Inst{22-20} = 0b001; // immh:immb = 001xxxx } - def _2S : N2VShift<0b0, u, opcode, asmop, "2s", VPR64, v2i32, uimm5, shl> { + def _2S : N2VShift<0b0, u, opcode, asmop, "2s", VPR64, v2i32, shl_imm32, shl> { let Inst{22-21} = 0b01; // immh:immb = 01xxxxx } // 128-bit vector types. 
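// Worked example of the encoding described above (with 16-bit elements,
// Offset 16): "shl v0.4h, v1.4h, #3" encodes UInt(immh:immb) = 16 + 3 =
// 0b0010011, while "sshr v0.4h, v1.4h, #3" encodes 16 - 3 = 0b1101 into
// immh:immb<3:0>; both match the '001xxxx' pattern in the table above.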
- def _16B : N2VShift<0b1, u, opcode, asmop, "16b", VPR128, v16i8, uimm3, shl> { + def _16B : N2VShift<0b1, u, opcode, asmop, "16b", VPR128, v16i8, shl_imm8, shl> { let Inst{22-19} = 0b0001; // immh:immb = 0001xxx } - def _8H : N2VShift<0b1, u, opcode, asmop, "8h", VPR128, v8i16, uimm4, shl> { + def _8H : N2VShift<0b1, u, opcode, asmop, "8h", VPR128, v8i16, shl_imm16, shl> { let Inst{22-20} = 0b001; // immh:immb = 001xxxx } - def _4S : N2VShift<0b1, u, opcode, asmop, "4s", VPR128, v4i32, uimm5, shl> { + def _4S : N2VShift<0b1, u, opcode, asmop, "4s", VPR128, v4i32, shl_imm32, shl> { let Inst{22-21} = 0b01; // immh:immb = 01xxxxx } - def _2D : N2VShift<0b1, u, opcode, asmop, "2d", VPR128, v2i64, imm0_63, shl> { + def _2D : N2VShift<0b1, u, opcode, asmop, "2d", VPR128, v2i64, shl_imm64, shl> { let Inst{22} = 0b1; // immh:immb = 1xxxxxx } } @@ -1565,16 +1602,31 @@ def Neon_High8H : PatFrag<(ops node:$in), (extract_subvector (v8i16 node:$in), (iPTR 4))>; def Neon_High4S : PatFrag<(ops node:$in), (extract_subvector (v4i32 node:$in), (iPTR 2))>; - -def Neon_low8H : PatFrag<(ops node:$in), +def Neon_High2D : PatFrag<(ops node:$in), + (extract_subvector (v2i64 node:$in), (iPTR 1))>; +def Neon_High4float : PatFrag<(ops node:$in), + (extract_subvector (v4f32 node:$in), (iPTR 2))>; +def Neon_High2double : PatFrag<(ops node:$in), + (extract_subvector (v2f64 node:$in), (iPTR 1))>; + +def Neon_Low16B : PatFrag<(ops node:$in), + (v8i8 (extract_subvector (v16i8 node:$in), + (iPTR 0)))>; +def Neon_Low8H : PatFrag<(ops node:$in), (v4i16 (extract_subvector (v8i16 node:$in), (iPTR 0)))>; -def Neon_low4S : PatFrag<(ops node:$in), +def Neon_Low4S : PatFrag<(ops node:$in), (v2i32 (extract_subvector (v4i32 node:$in), (iPTR 0)))>; -def Neon_low4f : PatFrag<(ops node:$in), - (v2f32 (extract_subvector (v4f32 node:$in), +def Neon_Low2D : PatFrag<(ops node:$in), + (v1i64 (extract_subvector (v2i64 node:$in), (iPTR 0)))>; +def Neon_Low4float : PatFrag<(ops node:$in), + (v2f32 (extract_subvector (v4f32 node:$in), + (iPTR 0)))>; +def Neon_Low2double : PatFrag<(ops node:$in), + (v1f64 (extract_subvector (v2f64 node:$in), + (iPTR 0)))>; class N2VShiftLong opcode, string asmop, string DestT, string SrcT, ValueType DestTy, ValueType SrcTy, @@ -1585,7 +1637,7 @@ class N2VShiftLong opcode, string asmop, string DestT, [(set (DestTy VPR128:$Rd), (DestTy (shl (DestTy (ExtOp (SrcTy VPR64:$Rn))), - (DestTy (Neon_dupImm (i32 imm:$Imm))))))], + (DestTy (Neon_vdup (i32 ImmTy:$Imm))))))], NoItinerary>; class N2VShiftLongHigh opcode, string asmop, string DestT, @@ -1599,40 +1651,40 @@ class N2VShiftLongHigh opcode, string asmop, string DestT, (DestTy (shl (DestTy (ExtOp (SrcTy (getTop VPR128:$Rn)))), - (DestTy (Neon_dupImm (i32 imm:$Imm))))))], + (DestTy (Neon_vdup (i32 ImmTy:$Imm))))))], NoItinerary>; multiclass NeonI_N2VShLL opcode, string asmop, SDNode ExtOp> { // 64-bit vector types. 
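// Note: N2VShiftLong matches a widening left shift, (shl (ExtOp Rn),
// (splat Imm)) with ExtOp = sext/zext (the sshll/ushll forms), while
// N2VShiftLongHigh applies the same pattern to the top half of a 128-bit
// source for the high (...2) variants.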
def _8B : N2VShiftLong<0b0, u, opcode, asmop, "8h", "8b", v8i16, v8i8, - uimm3, ExtOp> { + shl_imm8, ExtOp> { let Inst{22-19} = 0b0001; // immh:immb = 0001xxx } def _4H : N2VShiftLong<0b0, u, opcode, asmop, "4s", "4h", v4i32, v4i16, - uimm4, ExtOp> { + shl_imm16, ExtOp> { let Inst{22-20} = 0b001; // immh:immb = 001xxxx } def _2S : N2VShiftLong<0b0, u, opcode, asmop, "2d", "2s", v2i64, v2i32, - uimm5, ExtOp> { + shl_imm32, ExtOp> { let Inst{22-21} = 0b01; // immh:immb = 01xxxxx } // 128-bit vector types - def _16B : N2VShiftLongHigh<0b1, u, opcode, asmop, "8h", "16b", - v8i16, v8i8, 8, uimm3, ExtOp, Neon_High16B> { + def _16B : N2VShiftLongHigh<0b1, u, opcode, asmop, "8h", "16b", v8i16, v8i8, + 8, shl_imm8, ExtOp, Neon_High16B> { let Inst{22-19} = 0b0001; // immh:immb = 0001xxx } - def _8H : N2VShiftLongHigh<0b1, u, opcode, asmop, "4s", "8h", - v4i32, v4i16, 4, uimm4, ExtOp, Neon_High8H> { + def _8H : N2VShiftLongHigh<0b1, u, opcode, asmop, "4s", "8h", v4i32, v4i16, + 4, shl_imm16, ExtOp, Neon_High8H> { let Inst{22-20} = 0b001; // immh:immb = 001xxxx } - def _4S : N2VShiftLongHigh<0b1, u, opcode, asmop, "2d", "4s", - v2i64, v2i32, 2, uimm5, ExtOp, Neon_High4S> { + def _4S : N2VShiftLongHigh<0b1, u, opcode, asmop, "2d", "4s", v2i64, v2i32, + 2, shl_imm32, ExtOp, Neon_High4S> { let Inst{22-21} = 0b01; // immh:immb = 01xxxxx } @@ -1668,7 +1720,7 @@ class N2VShift_RQ opcode, string asmop, string T, (outs VPRC:$Rd), (ins VPRC:$Rn, ImmTy:$Imm), asmop # "\t$Rd." # T # ", $Rn." # T # ", $Imm", [(set (Ty VPRC:$Rd), (Ty (OpNode (Ty VPRC:$Rn), - (i32 imm:$Imm))))], + (i32 ImmTy:$Imm))))], NoItinerary>; // shift right (vector by immediate) @@ -1713,38 +1765,38 @@ multiclass NeonI_N2VShR_RQ opcode, string asmop, multiclass NeonI_N2VShL_Q opcode, string asmop, SDPatternOperator OpNode> { // 64-bit vector types. - def _8B : N2VShift_RQ<0b0, u, opcode, asmop, "8b", VPR64, v8i8, uimm3, + def _8B : N2VShift_RQ<0b0, u, opcode, asmop, "8b", VPR64, v8i8, shl_imm8, OpNode> { let Inst{22-19} = 0b0001; } - def _4H : N2VShift_RQ<0b0, u, opcode, asmop, "4h", VPR64, v4i16, uimm4, + def _4H : N2VShift_RQ<0b0, u, opcode, asmop, "4h", VPR64, v4i16, shl_imm16, OpNode> { let Inst{22-20} = 0b001; } - def _2S : N2VShift_RQ<0b0, u, opcode, asmop, "2s", VPR64, v2i32, uimm5, + def _2S : N2VShift_RQ<0b0, u, opcode, asmop, "2s", VPR64, v2i32, shl_imm32, OpNode> { let Inst{22-21} = 0b01; } // 128-bit vector types. - def _16B : N2VShift_RQ<0b1, u, opcode, asmop, "16b", VPR128, v16i8, uimm3, + def _16B : N2VShift_RQ<0b1, u, opcode, asmop, "16b", VPR128, v16i8, shl_imm8, OpNode> { let Inst{22-19} = 0b0001; } - def _8H : N2VShift_RQ<0b1, u, opcode, asmop, "8h", VPR128, v8i16, uimm4, + def _8H : N2VShift_RQ<0b1, u, opcode, asmop, "8h", VPR128, v8i16, shl_imm16, OpNode> { let Inst{22-20} = 0b001; } - def _4S : N2VShift_RQ<0b1, u, opcode, asmop, "4s", VPR128, v4i32, uimm5, + def _4S : N2VShift_RQ<0b1, u, opcode, asmop, "4s", VPR128, v4i32, shl_imm32, OpNode> { let Inst{22-21} = 0b01; } - def _2D : N2VShift_RQ<0b1, u, opcode, asmop, "2d", VPR128, v2i64, imm0_63, + def _2D : N2VShift_RQ<0b1, u, opcode, asmop, "2d", VPR128, v2i64, shl_imm64, OpNode> { let Inst{22} = 0b1; } @@ -1771,7 +1823,7 @@ class N2VShiftAdd opcode, string asmop, string T, asmop # "\t$Rd." # T # ", $Rn." 
# T # ", $Imm", [(set (Ty VPRC:$Rd), (Ty (add (Ty VPRC:$src), (Ty (OpNode (Ty VPRC:$Rn), - (Ty (Neon_dupImm (i32 imm:$Imm))))))))], + (Ty (Neon_vdup (i32 ImmTy:$Imm))))))))], NoItinerary> { let Constraints = "$src = $Rd"; } @@ -1826,7 +1878,7 @@ class N2VShiftAdd_R opcode, string asmop, string T, (outs VPRC:$Rd), (ins VPRC:$src, VPRC:$Rn, ImmTy:$Imm), asmop # "\t$Rd." # T # ", $Rn." # T # ", $Imm", [(set (Ty VPRC:$Rd), (Ty (add (Ty VPRC:$src), - (Ty (OpNode (Ty VPRC:$Rn), (i32 imm:$Imm))))))], + (Ty (OpNode (Ty VPRC:$Rn), (i32 ImmTy:$Imm))))))], NoItinerary> { let Constraints = "$src = $Rd"; } @@ -1881,45 +1933,45 @@ class N2VShiftIns opcode, string asmop, string T, (outs VPRC:$Rd), (ins VPRC:$src, VPRC:$Rn, ImmTy:$Imm), asmop # "\t$Rd." # T # ", $Rn." # T # ", $Imm", [(set (Ty VPRC:$Rd), (Ty (OpNode (Ty VPRC:$src), (Ty VPRC:$Rn), - (i32 imm:$Imm))))], + (i32 ImmTy:$Imm))))], NoItinerary> { let Constraints = "$src = $Rd"; } // shift left insert (vector by immediate) multiclass NeonI_N2VShLIns opcode, string asmop> { - def _8B : N2VShiftIns<0b0, u, opcode, asmop, "8b", VPR64, v8i8, uimm3, + def _8B : N2VShiftIns<0b0, u, opcode, asmop, "8b", VPR64, v8i8, shl_imm8, int_aarch64_neon_vsli> { let Inst{22-19} = 0b0001; } - def _4H : N2VShiftIns<0b0, u, opcode, asmop, "4h", VPR64, v4i16, uimm4, + def _4H : N2VShiftIns<0b0, u, opcode, asmop, "4h", VPR64, v4i16, shl_imm16, int_aarch64_neon_vsli> { let Inst{22-20} = 0b001; } - def _2S : N2VShiftIns<0b0, u, opcode, asmop, "2s", VPR64, v2i32, uimm5, + def _2S : N2VShiftIns<0b0, u, opcode, asmop, "2s", VPR64, v2i32, shl_imm32, int_aarch64_neon_vsli> { let Inst{22-21} = 0b01; } // 128-bit vector types - def _16B : N2VShiftIns<0b1, u, opcode, asmop, "16b", VPR128, v16i8, uimm3, + def _16B : N2VShiftIns<0b1, u, opcode, asmop, "16b", VPR128, v16i8, shl_imm8, int_aarch64_neon_vsli> { let Inst{22-19} = 0b0001; } - def _8H : N2VShiftIns<0b1, u, opcode, asmop, "8h", VPR128, v8i16, uimm4, + def _8H : N2VShiftIns<0b1, u, opcode, asmop, "8h", VPR128, v8i16, shl_imm16, int_aarch64_neon_vsli> { let Inst{22-20} = 0b001; } - def _4S : N2VShiftIns<0b1, u, opcode, asmop, "4s", VPR128, v4i32, uimm5, + def _4S : N2VShiftIns<0b1, u, opcode, asmop, "4s", VPR128, v4i32, shl_imm32, int_aarch64_neon_vsli> { let Inst{22-21} = 0b01; } - def _2D : N2VShiftIns<0b1, u, opcode, asmop, "2d", VPR128, v2i64, imm0_63, + def _2D : N2VShiftIns<0b1, u, opcode, asmop, "2d", VPR128, v2i64, shl_imm64, int_aarch64_neon_vsli> { let Inst{22} = 0b1; } @@ -2048,72 +2100,75 @@ def Neon_combine_2d : PatFrag<(ops node:$Rm, node:$Rn), def Neon_lshrImm8H : PatFrag<(ops node:$lhs, node:$rhs), (v8i16 (srl (v8i16 node:$lhs), - (v8i16 (Neon_dupImm (i32 node:$rhs)))))>; + (v8i16 (Neon_vdup (i32 node:$rhs)))))>; def Neon_lshrImm4S : PatFrag<(ops node:$lhs, node:$rhs), (v4i32 (srl (v4i32 node:$lhs), - (v4i32 (Neon_dupImm (i32 node:$rhs)))))>; + (v4i32 (Neon_vdup (i32 node:$rhs)))))>; def Neon_lshrImm2D : PatFrag<(ops node:$lhs, node:$rhs), (v2i64 (srl (v2i64 node:$lhs), - (v2i64 (Neon_dupImm (i32 node:$rhs)))))>; + (v2i64 (Neon_vdup (i32 node:$rhs)))))>; def Neon_ashrImm8H : PatFrag<(ops node:$lhs, node:$rhs), (v8i16 (sra (v8i16 node:$lhs), - (v8i16 (Neon_dupImm (i32 node:$rhs)))))>; + (v8i16 (Neon_vdup (i32 node:$rhs)))))>; def Neon_ashrImm4S : PatFrag<(ops node:$lhs, node:$rhs), (v4i32 (sra (v4i32 node:$lhs), - (v4i32 (Neon_dupImm (i32 node:$rhs)))))>; + (v4i32 (Neon_vdup (i32 node:$rhs)))))>; def Neon_ashrImm2D : PatFrag<(ops node:$lhs, node:$rhs), (v2i64 (sra (v2i64 node:$lhs), - (v2i64 (Neon_dupImm (i32 
node:$rhs)))))>; + (v2i64 (Neon_vdup (i32 node:$rhs)))))>; // Normal shift right narrow is matched by IR (srl/sra, trunc, concat_vectors) multiclass Neon_shiftNarrow_patterns { def : Pat<(v8i8 (trunc (!cast("Neon_" # shr # "Imm8H") VPR128:$Rn, - imm:$Imm))), + (i32 shr_imm8:$Imm)))), (SHRNvvi_8B VPR128:$Rn, imm:$Imm)>; def : Pat<(v4i16 (trunc (!cast("Neon_" # shr # "Imm4S") VPR128:$Rn, - imm:$Imm))), + (i32 shr_imm16:$Imm)))), (SHRNvvi_4H VPR128:$Rn, imm:$Imm)>; def : Pat<(v2i32 (trunc (!cast("Neon_" # shr # "Imm2D") VPR128:$Rn, - imm:$Imm))), + (i32 shr_imm32:$Imm)))), (SHRNvvi_2S VPR128:$Rn, imm:$Imm)>; def : Pat<(Neon_combine_2D (v1i64 VPR64:$src), (v1i64 (bitconvert (v8i8 (trunc (!cast("Neon_" # shr # "Imm8H") - VPR128:$Rn, imm:$Imm)))))), - (SHRNvvi_16B (SUBREG_TO_REG (i64 0), VPR64:$src, sub_64), + VPR128:$Rn, (i32 shr_imm8:$Imm))))))), + (SHRNvvi_16B (v2i64 (SUBREG_TO_REG (i64 0), VPR64:$src, sub_64)), VPR128:$Rn, imm:$Imm)>; def : Pat<(Neon_combine_2D (v1i64 VPR64:$src), (v1i64 (bitconvert (v4i16 (trunc (!cast("Neon_" # shr # "Imm4S") - VPR128:$Rn, imm:$Imm)))))), + VPR128:$Rn, (i32 shr_imm16:$Imm))))))), (SHRNvvi_8H (SUBREG_TO_REG (i64 0), VPR64:$src, sub_64), VPR128:$Rn, imm:$Imm)>; def : Pat<(Neon_combine_2D (v1i64 VPR64:$src), (v1i64 (bitconvert (v2i32 (trunc (!cast("Neon_" # shr # "Imm2D") - VPR128:$Rn, imm:$Imm)))))), + VPR128:$Rn, (i32 shr_imm32:$Imm))))))), (SHRNvvi_4S (SUBREG_TO_REG (i64 0), VPR64:$src, sub_64), VPR128:$Rn, imm:$Imm)>; } multiclass Neon_shiftNarrow_QR_patterns { - def : Pat<(v8i8 (op (v8i16 VPR128:$Rn), imm:$Imm)), + def : Pat<(v8i8 (op (v8i16 VPR128:$Rn), shr_imm8:$Imm)), (!cast(prefix # "_8B") VPR128:$Rn, imm:$Imm)>; - def : Pat<(v4i16 (op (v4i32 VPR128:$Rn), imm:$Imm)), + def : Pat<(v4i16 (op (v4i32 VPR128:$Rn), shr_imm16:$Imm)), (!cast(prefix # "_4H") VPR128:$Rn, imm:$Imm)>; - def : Pat<(v2i32 (op (v2i64 VPR128:$Rn), imm:$Imm)), + def : Pat<(v2i32 (op (v2i64 VPR128:$Rn), shr_imm32:$Imm)), (!cast(prefix # "_2S") VPR128:$Rn, imm:$Imm)>; def : Pat<(Neon_combine_2D (v1i64 VPR64:$src), - (v1i64 (bitconvert (v8i8 (op (v8i16 VPR128:$Rn), imm:$Imm))))), + (v1i64 (bitconvert (v8i8 + (op (v8i16 VPR128:$Rn), shr_imm8:$Imm))))), (!cast(prefix # "_16B") (SUBREG_TO_REG (i64 0), VPR64:$src, sub_64), VPR128:$Rn, imm:$Imm)>; def : Pat<(Neon_combine_2D (v1i64 VPR64:$src), - (v1i64 (bitconvert (v4i16 (op (v4i32 VPR128:$Rn), imm:$Imm))))), + (v1i64 (bitconvert (v4i16 + (op (v4i32 VPR128:$Rn), shr_imm16:$Imm))))), (!cast(prefix # "_8H") (SUBREG_TO_REG (i64 0), VPR64:$src, sub_64), VPR128:$Rn, imm:$Imm)>; def : Pat<(Neon_combine_2D (v1i64 VPR64:$src), - (v1i64 (bitconvert (v2i32 (op (v2i64 VPR128:$Rn), imm:$Imm))))), + (v1i64 (bitconvert (v2i32 + (op (v2i64 VPR128:$Rn), shr_imm32:$Imm))))), (!cast(prefix # "_4S") (SUBREG_TO_REG (i64 0), VPR64:$src, sub_64), VPR128:$Rn, imm:$Imm)>; @@ -2138,7 +2193,7 @@ class N2VCvt_Fx opcode, string asmop, string T, (outs VPRC:$Rd), (ins VPRC:$Rn, ImmTy:$Imm), asmop # "\t$Rd." # T # ", $Rn." 
# T # ", $Imm", [(set (DestTy VPRC:$Rd), (DestTy (IntOp (SrcTy VPRC:$Rn), - (i32 imm:$Imm))))], + (i32 ImmTy:$Imm))))], NoItinerary>; multiclass NeonI_N2VCvt_Fx2fp opcode, string asmop, @@ -2307,8 +2362,7 @@ defm ADDV : NeonI_2VAcross_2<0b0, 0b11011, "addv", int_aarch64_neon_vaddv>; // Variant 3 multiclass NeonI_2VAcross_3 opcode, bits<2> size, - string asmop, SDPatternOperator opnode> -{ + string asmop, SDPatternOperator opnode> { def _1s4s: NeonI_2VAcross<0b1, u, size, opcode, (outs FPR32:$Rd), (ins VPR128:$Rn), asmop # "\t$Rd, $Rn.4s", @@ -2327,6 +2381,61 @@ defm FMAXV : NeonI_2VAcross_3<0b1, 0b01111, 0b00, "fmaxv", defm FMINV : NeonI_2VAcross_3<0b1, 0b01111, 0b10, "fminv", int_aarch64_neon_vminv>; +// The followings are for instruction class (Perm) + +class NeonI_Permute size, bits<3> opcode, + string asmop, RegisterOperand OpVPR, string OpS, + SDPatternOperator opnode, ValueType Ty> + : NeonI_Perm; + +multiclass NeonI_Perm_pat opcode, string asmop, + SDPatternOperator opnode> { + def _8b : NeonI_Permute<0b0, 0b00, opcode, asmop, + VPR64, "8b", opnode, v8i8>; + def _16b : NeonI_Permute<0b1, 0b00, opcode, asmop, + VPR128, "16b",opnode, v16i8>; + def _4h : NeonI_Permute<0b0, 0b01, opcode, asmop, + VPR64, "4h", opnode, v4i16>; + def _8h : NeonI_Permute<0b1, 0b01, opcode, asmop, + VPR128, "8h", opnode, v8i16>; + def _2s : NeonI_Permute<0b0, 0b10, opcode, asmop, + VPR64, "2s", opnode, v2i32>; + def _4s : NeonI_Permute<0b1, 0b10, opcode, asmop, + VPR128, "4s", opnode, v4i32>; + def _2d : NeonI_Permute<0b1, 0b11, opcode, asmop, + VPR128, "2d", opnode, v2i64>; +} + +defm UZP1vvv : NeonI_Perm_pat<0b001, "uzp1", Neon_uzp1>; +defm TRN1vvv : NeonI_Perm_pat<0b010, "trn1", Neon_trn1>; +defm ZIP1vvv : NeonI_Perm_pat<0b011, "zip1", Neon_zip1>; +defm UZP2vvv : NeonI_Perm_pat<0b101, "uzp2", Neon_uzp2>; +defm TRN2vvv : NeonI_Perm_pat<0b110, "trn2", Neon_trn2>; +defm ZIP2vvv : NeonI_Perm_pat<0b111, "zip2", Neon_zip2>; + +multiclass NeonI_Perm_float_pat { + def : Pat<(v2f32 (opnode (v2f32 VPR64:$Rn), (v2f32 VPR64:$Rm))), + (!cast(INS # "_2s") VPR64:$Rn, VPR64:$Rm)>; + + def : Pat<(v4f32 (opnode (v4f32 VPR128:$Rn), (v4f32 VPR128:$Rm))), + (!cast(INS # "_4s") VPR128:$Rn, VPR128:$Rm)>; + + def : Pat<(v2f64 (opnode (v2f64 VPR128:$Rn), (v2f64 VPR128:$Rm))), + (!cast(INS # "_2d") VPR128:$Rn, VPR128:$Rm)>; +} + +defm : NeonI_Perm_float_pat<"UZP1vvv", Neon_uzp1>; +defm : NeonI_Perm_float_pat<"UZP2vvv", Neon_uzp2>; +defm : NeonI_Perm_float_pat<"ZIP1vvv", Neon_zip1>; +defm : NeonI_Perm_float_pat<"ZIP2vvv", Neon_zip2>; +defm : NeonI_Perm_float_pat<"TRN1vvv", Neon_trn1>; +defm : NeonI_Perm_float_pat<"TRN2vvv", Neon_trn2>; + // The followings are for instruction class (3V Diff) // normal long/long2 pattern @@ -2345,8 +2454,7 @@ class NeonI_3VDL size, bits<4> opcode, multiclass NeonI_3VDL_s opcode, string asmop, SDPatternOperator opnode, - bit Commutable = 0> -{ + bit Commutable = 0> { let isCommutable = Commutable in { def _8h8b : NeonI_3VDL<0b0, u, 0b00, opcode, asmop, "8h", "8b", opnode, sext, VPR64, v8i16, v8i8>; @@ -2357,10 +2465,8 @@ multiclass NeonI_3VDL_s opcode, } } -multiclass NeonI_3VDL2_s opcode, - string asmop, SDPatternOperator opnode, - bit Commutable = 0> -{ +multiclass NeonI_3VDL2_s opcode, string asmop, + SDPatternOperator opnode, bit Commutable = 0> { let isCommutable = Commutable in { def _8h16b : NeonI_3VDL<0b1, u, 0b00, opcode, asmop, "8h", "16b", opnode, NI_sext_high_v8i8, VPR128, v8i16, v16i8>; @@ -2371,10 +2477,8 @@ multiclass NeonI_3VDL2_s opcode, } } -multiclass NeonI_3VDL_u opcode, - string 
asmop, SDPatternOperator opnode, - bit Commutable = 0> -{ +multiclass NeonI_3VDL_u opcode, string asmop, + SDPatternOperator opnode, bit Commutable = 0> { let isCommutable = Commutable in { def _8h8b : NeonI_3VDL<0b0, u, 0b00, opcode, asmop, "8h", "8b", opnode, zext, VPR64, v8i16, v8i8>; @@ -2385,10 +2489,8 @@ multiclass NeonI_3VDL_u opcode, } } -multiclass NeonI_3VDL2_u opcode, - string asmop, SDPatternOperator opnode, - bit Commutable = 0> -{ +multiclass NeonI_3VDL2_u opcode, string asmop, + SDPatternOperator opnode, bit Commutable = 0> { let isCommutable = Commutable in { def _8h16b : NeonI_3VDL<0b1, u, 0b00, opcode, asmop, "8h", "16b", opnode, NI_zext_high_v8i8, VPR128, v8i16, v16i8>; @@ -2425,9 +2527,8 @@ class NeonI_3VDW size, bits<4> opcode, (ResTy (ext (OpTy OpVPR:$Rm))))))], NoItinerary>; -multiclass NeonI_3VDW_s opcode, - string asmop, SDPatternOperator opnode> -{ +multiclass NeonI_3VDW_s opcode, string asmop, + SDPatternOperator opnode> { def _8h8b : NeonI_3VDW<0b0, u, 0b00, opcode, asmop, "8h", "8b", opnode, sext, VPR64, v8i16, v8i8>; def _4s4h : NeonI_3VDW<0b0, u, 0b01, opcode, asmop, "4s", "4h", @@ -2439,9 +2540,8 @@ multiclass NeonI_3VDW_s opcode, defm SADDWvvv : NeonI_3VDW_s<0b0, 0b0001, "saddw", add>; defm SSUBWvvv : NeonI_3VDW_s<0b0, 0b0011, "ssubw", sub>; -multiclass NeonI_3VDW2_s opcode, - string asmop, SDPatternOperator opnode> -{ +multiclass NeonI_3VDW2_s opcode, string asmop, + SDPatternOperator opnode> { def _8h16b : NeonI_3VDW<0b1, u, 0b00, opcode, asmop, "8h", "16b", opnode, NI_sext_high_v8i8, VPR128, v8i16, v16i8>; def _4s8h : NeonI_3VDW<0b1, u, 0b01, opcode, asmop, "4s", "8h", @@ -2453,9 +2553,8 @@ multiclass NeonI_3VDW2_s opcode, defm SADDW2vvv : NeonI_3VDW2_s<0b0, 0b0001, "saddw2", add>; defm SSUBW2vvv : NeonI_3VDW2_s<0b0, 0b0011, "ssubw2", sub>; -multiclass NeonI_3VDW_u opcode, - string asmop, SDPatternOperator opnode> -{ +multiclass NeonI_3VDW_u opcode, string asmop, + SDPatternOperator opnode> { def _8h8b : NeonI_3VDW<0b0, u, 0b00, opcode, asmop, "8h", "8b", opnode, zext, VPR64, v8i16, v8i8>; def _4s4h : NeonI_3VDW<0b0, u, 0b01, opcode, asmop, "4s", "4h", @@ -2467,9 +2566,8 @@ multiclass NeonI_3VDW_u opcode, defm UADDWvvv : NeonI_3VDW_u<0b1, 0b0001, "uaddw", add>; defm USUBWvvv : NeonI_3VDW_u<0b1, 0b0011, "usubw", sub>; -multiclass NeonI_3VDW2_u opcode, - string asmop, SDPatternOperator opnode> -{ +multiclass NeonI_3VDW2_u opcode, string asmop, + SDPatternOperator opnode> { def _8h16b : NeonI_3VDW<0b1, u, 0b00, opcode, asmop, "8h", "16b", opnode, NI_zext_high_v8i8, VPR128, v8i16, v16i8>; def _4s8h : NeonI_3VDW<0b1, u, 0b01, opcode, asmop, "4s", "8h", @@ -2482,17 +2580,16 @@ defm UADDW2vvv : NeonI_3VDW2_u<0b1, 0b0001, "uaddw2", add>; defm USUBW2vvv : NeonI_3VDW2_u<0b1, 0b0011, "usubw2", sub>; // Get the high half part of the vector element. 
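// For example, NI_get_hi_8h below reduces to (trunc (srl Rn, (splat 8))),
// i.e. the high byte of each 16-bit lane; the NeonI_3VDN_addhn_2Op patterns
// that follow add two vectors and then select this high half (addhn and
// friends).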
-multiclass NeonI_get_high -{ +multiclass NeonI_get_high { def _8h : PatFrag<(ops node:$Rn), (v8i8 (trunc (v8i16 (srl (v8i16 node:$Rn), - (v8i16 (Neon_dupImm 8))))))>; + (v8i16 (Neon_vdup (i32 8)))))))>; def _4s : PatFrag<(ops node:$Rn), (v4i16 (trunc (v4i32 (srl (v4i32 node:$Rn), - (v4i32 (Neon_dupImm 16))))))>; + (v4i32 (Neon_vdup (i32 16)))))))>; def _2d : PatFrag<(ops node:$Rn), (v2i32 (trunc (v2i64 (srl (v2i64 node:$Rn), - (v2i64 (Neon_dupImm 32))))))>; + (v2i64 (Neon_vdup (i32 32)))))))>; } defm NI_get_hi : NeonI_get_high; @@ -2511,10 +2608,8 @@ class NeonI_3VDN_addhn_2Op size, bits<4> opcode, (OpTy VPR128:$Rm))))))], NoItinerary>; -multiclass NeonI_3VDN_addhn_2Op opcode, - string asmop, SDPatternOperator opnode, - bit Commutable = 0> -{ +multiclass NeonI_3VDN_addhn_2Op opcode, string asmop, + SDPatternOperator opnode, bit Commutable = 0> { let isCommutable = Commutable in { def _8b8h : NeonI_3VDN_addhn_2Op<0b0, u, 0b00, opcode, asmop, "8b", "8h", opnode, NI_get_hi_8h, v8i8, v8i16>; @@ -2542,10 +2637,8 @@ class NeonI_3VD_2Op size, bits<4> opcode, NoItinerary>; // normal narrow pattern -multiclass NeonI_3VDN_2Op opcode, - string asmop, SDPatternOperator opnode, - bit Commutable = 0> -{ +multiclass NeonI_3VDN_2Op opcode, string asmop, + SDPatternOperator opnode, bit Commutable = 0> { let isCommutable = Commutable in { def _8b8h : NeonI_3VD_2Op<0b0, u, 0b00, opcode, asmop, "8b", "8h", opnode, VPR64, VPR128, v8i8, v8i16>; @@ -2570,8 +2663,7 @@ class NeonI_3VDN_3Op size, bits<4> opcode, let neverHasSideEffects = 1; } -multiclass NeonI_3VDN_3Op_v1 opcode, - string asmop> { +multiclass NeonI_3VDN_3Op_v1 opcode, string asmop> { def _16b8h : NeonI_3VDN_3Op<0b1, u, 0b00, opcode, asmop, "16b", "8h">; def _8h4s : NeonI_3VDN_3Op<0b1, u, 0b01, opcode, asmop, "8h", "4s">; def _4s2d : NeonI_3VDN_3Op<0b1, u, 0b10, opcode, asmop, "4s", "2d">; @@ -2633,10 +2725,8 @@ class NeonI_3VDL_Ext size, bits<4> opcode, (OpTy OpVPR:$Rm))))))], NoItinerary>; -multiclass NeonI_3VDL_zext opcode, - string asmop, SDPatternOperator opnode, - bit Commutable = 0> -{ +multiclass NeonI_3VDL_zext opcode, string asmop, + SDPatternOperator opnode, bit Commutable = 0> { let isCommutable = Commutable in { def _8h8b : NeonI_3VDL_Ext<0b0, u, 0b00, opcode, asmop, "8h", "8b", opnode, VPR64, v8i16, v8i8, v8i8>; @@ -2650,15 +2740,16 @@ multiclass NeonI_3VDL_zext opcode, defm SABDLvvv : NeonI_3VDL_zext<0b0, 0b0111, "sabdl", int_arm_neon_vabds, 1>; defm UABDLvvv : NeonI_3VDL_zext<0b1, 0b0111, "uabdl", int_arm_neon_vabdu, 1>; -multiclass NeonI_Op_High -{ +multiclass NeonI_Op_High { def _16B : PatFrag<(ops node:$Rn, node:$Rm), - (op (v8i8 (Neon_High16B node:$Rn)), (v8i8 (Neon_High16B node:$Rm)))>; + (op (v8i8 (Neon_High16B node:$Rn)), + (v8i8 (Neon_High16B node:$Rm)))>; def _8H : PatFrag<(ops node:$Rn, node:$Rm), - (op (v4i16 (Neon_High8H node:$Rn)), (v4i16 (Neon_High8H node:$Rm)))>; + (op (v4i16 (Neon_High8H node:$Rn)), + (v4i16 (Neon_High8H node:$Rm)))>; def _4S : PatFrag<(ops node:$Rn, node:$Rm), - (op (v2i32 (Neon_High4S node:$Rn)), (v2i32 (Neon_High4S node:$Rm)))>; - + (op (v2i32 (Neon_High4S node:$Rn)), + (v2i32 (Neon_High4S node:$Rm)))>; } defm NI_sabdl_hi : NeonI_Op_High; @@ -2668,10 +2759,8 @@ defm NI_umull_hi : NeonI_Op_High; defm NI_qdmull_hi : NeonI_Op_High; defm NI_pmull_hi : NeonI_Op_High; -multiclass NeonI_3VDL_Abd_u opcode, - string asmop, string opnode, - bit Commutable = 0> -{ +multiclass NeonI_3VDL_Abd_u opcode, string asmop, string opnode, + bit Commutable = 0> { let isCommutable = Commutable in { def _8h8b : 
NeonI_3VDL_Ext<0b1, u, 0b00, opcode, asmop, "8h", "16b", !cast(opnode # "_16B"), @@ -2690,26 +2779,24 @@ defm UABDL2vvv : NeonI_3VDL_Abd_u<0b1, 0b0111, "uabdl2", "NI_uabdl_hi", 1>; // For pattern that need two operators being chained. class NeonI_3VDL_Aba size, bits<4> opcode, - string asmop, string ResS, string OpS, + string asmop, string ResS, string OpS, SDPatternOperator opnode, SDPatternOperator subop, RegisterOperand OpVPR, ValueType ResTy, ValueType OpTy, ValueType OpSTy> : NeonI_3VDiff { let Constraints = "$src = $Rd"; } -multiclass NeonI_3VDL_Aba_v1 opcode, - string asmop, SDPatternOperator opnode, - SDPatternOperator subop> -{ +multiclass NeonI_3VDL_Aba_v1 opcode, string asmop, + SDPatternOperator opnode, SDPatternOperator subop>{ def _8h8b : NeonI_3VDL_Aba<0b0, u, 0b00, opcode, asmop, "8h", "8b", opnode, subop, VPR64, v8i16, v8i8, v8i8>; def _4s4h : NeonI_3VDL_Aba<0b0, u, 0b01, opcode, asmop, "4s", "4h", @@ -2723,18 +2810,16 @@ defm SABALvvv : NeonI_3VDL_Aba_v1<0b0, 0b0101, "sabal", defm UABALvvv : NeonI_3VDL_Aba_v1<0b1, 0b0101, "uabal", add, int_arm_neon_vabdu>; -multiclass NeonI_3VDL2_Aba_v1 opcode, - string asmop, SDPatternOperator opnode, - string subop> -{ +multiclass NeonI_3VDL2_Aba_v1 opcode, string asmop, + SDPatternOperator opnode, string subop> { def _8h8b : NeonI_3VDL_Aba<0b1, u, 0b00, opcode, asmop, "8h", "16b", - opnode, !cast(subop # "_16B"), + opnode, !cast(subop # "_16B"), VPR128, v8i16, v16i8, v8i8>; def _4s4h : NeonI_3VDL_Aba<0b1, u, 0b01, opcode, asmop, "4s", "8h", - opnode, !cast(subop # "_8H"), + opnode, !cast(subop # "_8H"), VPR128, v4i32, v8i16, v4i16>; def _2d2s : NeonI_3VDL_Aba<0b1, u, 0b10, opcode, asmop, "2d", "4s", - opnode, !cast(subop # "_4S"), + opnode, !cast(subop # "_4S"), VPR128, v2i64, v4i32, v2i32>; } @@ -2744,10 +2829,8 @@ defm UABAL2vvv : NeonI_3VDL2_Aba_v1<0b1, 0b0101, "uabal2", add, "NI_uabdl_hi">; // Long pattern with 2 operands -multiclass NeonI_3VDL_2Op opcode, - string asmop, SDPatternOperator opnode, - bit Commutable = 0> -{ +multiclass NeonI_3VDL_2Op opcode, string asmop, + SDPatternOperator opnode, bit Commutable = 0> { let isCommutable = Commutable in { def _8h8b : NeonI_3VD_2Op<0b0, u, 0b00, opcode, asmop, "8h", "8b", opnode, VPR128, VPR64, v8i16, v8i8>; @@ -2772,12 +2855,8 @@ class NeonI_3VDL2_2Op_mull size, bits<4> opcode, (ResTy (opnode (OpTy VPR128:$Rn), (OpTy VPR128:$Rm))))], NoItinerary>; - -multiclass NeonI_3VDL2_2Op_mull_v1 opcode, - string asmop, - string opnode, - bit Commutable = 0> -{ +multiclass NeonI_3VDL2_2Op_mull_v1 opcode, string asmop, + string opnode, bit Commutable = 0> { let isCommutable = Commutable in { def _8h16b : NeonI_3VDL2_2Op_mull<0b1, u, 0b00, opcode, asmop, "8h", "16b", !cast(opnode # "_16B"), @@ -2812,9 +2891,8 @@ class NeonI_3VDL_3Op size, bits<4> opcode, let Constraints = "$src = $Rd"; } -multiclass NeonI_3VDL_3Op_v1 opcode, - string asmop, SDPatternOperator opnode> -{ +multiclass NeonI_3VDL_3Op_v1 opcode, string asmop, + SDPatternOperator opnode> { def _8h8b : NeonI_3VDL_3Op<0b0, u, 0b00, opcode, asmop, "8h", "8b", opnode, v8i16, v8i8>; def _4s4h : NeonI_3VDL_3Op<0b0, u, 0b01, opcode, asmop, "4s", "4h", @@ -2861,16 +2939,13 @@ class NeonI_3VDL2_3Op_mlas size, bits<4> opcode, let Constraints = "$src = $Rd"; } -multiclass NeonI_3VDL2_3Op_mlas_v1 opcode, - string asmop, - SDPatternOperator subop, - string opnode> -{ +multiclass NeonI_3VDL2_3Op_mlas_v1 opcode, string asmop, + SDPatternOperator subop, string opnode> { def _8h16b : NeonI_3VDL2_3Op_mlas<0b1, u, 0b00, opcode, asmop, "8h", "16b", subop, 
!cast(opnode # "_16B"), VPR128, v8i16, v16i8>; def _4s8h : NeonI_3VDL2_3Op_mlas<0b1, u, 0b01, opcode, asmop, "4s", "8h", - subop, !cast(opnode # "_8H"), + subop, !cast(opnode # "_8H"), VPR128, v4i32, v8i16>; def _2d4s : NeonI_3VDL2_3Op_mlas<0b1, u, 0b10, opcode, asmop, "2d", "4s", subop, !cast(opnode # "_4S"), @@ -2887,9 +2962,8 @@ defm SMLSL2vvv : NeonI_3VDL2_3Op_mlas_v1<0b0, 0b1010, "smlsl2", defm UMLSL2vvv : NeonI_3VDL2_3Op_mlas_v1<0b1, 0b1010, "umlsl2", sub, "NI_umull_hi">; -multiclass NeonI_3VDL_qdmlal_3Op_v2 opcode, - string asmop, SDPatternOperator opnode> -{ +multiclass NeonI_3VDL_qdmlal_3Op_v2 opcode, string asmop, + SDPatternOperator opnode> { def _4s4h : NeonI_3VDL2_3Op_mlas<0b0, u, 0b01, opcode, asmop, "4s", "4h", opnode, int_arm_neon_vqdmull, VPR64, v4i32, v4i16>; @@ -2903,10 +2977,8 @@ defm SQDMLALvvv : NeonI_3VDL_qdmlal_3Op_v2<0b0, 0b1001, "sqdmlal", defm SQDMLSLvvv : NeonI_3VDL_qdmlal_3Op_v2<0b0, 0b1011, "sqdmlsl", int_arm_neon_vqsubs>; -multiclass NeonI_3VDL_v2 opcode, - string asmop, SDPatternOperator opnode, - bit Commutable = 0> -{ +multiclass NeonI_3VDL_v2 opcode, string asmop, + SDPatternOperator opnode, bit Commutable = 0> { let isCommutable = Commutable in { def _4s4h : NeonI_3VD_2Op<0b0, u, 0b01, opcode, asmop, "4s", "4h", opnode, VPR128, VPR64, v4i32, v4i16>; @@ -2918,11 +2990,8 @@ multiclass NeonI_3VDL_v2 opcode, defm SQDMULLvvv : NeonI_3VDL_v2<0b0, 0b1101, "sqdmull", int_arm_neon_vqdmull, 1>; -multiclass NeonI_3VDL2_2Op_mull_v2 opcode, - string asmop, - string opnode, - bit Commutable = 0> -{ +multiclass NeonI_3VDL2_2Op_mull_v2 opcode, string asmop, + string opnode, bit Commutable = 0> { let isCommutable = Commutable in { def _4s8h : NeonI_3VDL2_2Op_mull<0b1, u, 0b01, opcode, asmop, "4s", "8h", !cast(opnode # "_8H"), @@ -2933,13 +3002,11 @@ multiclass NeonI_3VDL2_2Op_mull_v2 opcode, } } -defm SQDMULL2vvv : NeonI_3VDL2_2Op_mull_v2<0b0, 0b1101, "sqdmull2", +defm SQDMULL2vvv : NeonI_3VDL2_2Op_mull_v2<0b0, 0b1101, "sqdmull2", "NI_qdmull_hi", 1>; -multiclass NeonI_3VDL2_3Op_qdmlal_v2 opcode, - string asmop, - SDPatternOperator opnode> -{ +multiclass NeonI_3VDL2_3Op_qdmlal_v2 opcode, string asmop, + SDPatternOperator opnode> { def _4s8h : NeonI_3VDL2_3Op_mlas<0b1, u, 0b01, opcode, asmop, "4s", "8h", opnode, NI_qdmull_hi_8H, VPR128, v4i32, v8i16>; @@ -2953,32 +3020,37 @@ defm SQDMLAL2vvv : NeonI_3VDL2_3Op_qdmlal_v2<0b0, 0b1001, "sqdmlal2", defm SQDMLSL2vvv : NeonI_3VDL2_3Op_qdmlal_v2<0b0, 0b1011, "sqdmlsl2", int_arm_neon_vqsubs>; -multiclass NeonI_3VDL_v3 opcode, - string asmop, SDPatternOperator opnode, - bit Commutable = 0> -{ +multiclass NeonI_3VDL_v3 opcode, string asmop, + SDPatternOperator opnode, bit Commutable = 0> { let isCommutable = Commutable in { def _8h8b : NeonI_3VD_2Op<0b0, u, 0b00, opcode, asmop, "8h", "8b", opnode, VPR128, VPR64, v8i16, v8i8>; + + def _1q1d : NeonI_3VDiff<0b0, u, 0b11, opcode, + (outs VPR128:$Rd), (ins VPR64:$Rn, VPR64:$Rm), + asmop # "\t$Rd.1q, $Rn.1d, $Rm.1d", + [], NoItinerary>; } } defm PMULLvvv : NeonI_3VDL_v3<0b0, 0b1110, "pmull", int_arm_neon_vmullp, 1>; -multiclass NeonI_3VDL2_2Op_mull_v3 opcode, - string asmop, - string opnode, - bit Commutable = 0> -{ +multiclass NeonI_3VDL2_2Op_mull_v3 opcode, string asmop, + string opnode, bit Commutable = 0> { let isCommutable = Commutable in { def _8h16b : NeonI_3VDL2_2Op_mull<0b1, u, 0b00, opcode, asmop, "8h", "16b", !cast(opnode # "_16B"), v8i16, v16i8>; + + def _1q2d : NeonI_3VDiff<0b1, u, 0b11, opcode, + (outs VPR128:$Rd), (ins VPR128:$Rn, VPR128:$Rm), + asmop # "\t$Rd.1q, $Rn.2d, 
$Rm.2d", + [], NoItinerary>; } } -defm PMULL2vvv : NeonI_3VDL2_2Op_mull_v3<0b0, 0b1110, "pmull2", - "NI_pmull_hi", 1>; +defm PMULL2vvv : NeonI_3VDL2_2Op_mull_v3<0b0, 0b1110, "pmull2", "NI_pmull_hi", + 1>; // End of implementation for instruction class (3V Diff) @@ -2990,7 +3062,7 @@ defm PMULL2vvv : NeonI_3VDL2_2Op_mull_v3<0b0, 0b1110, "pmull2", // The structure consists of a sequence of sets of N values. // The first element of the structure is placed in the first lane // of the first first vector, the second element in the first lane -// of the second vector, and so on. +// of the second vector, and so on. // E.g. LD1_3V_2S will load 32-bit elements {A, B, C, D, E, F} sequentially into // the three 64-bit vectors list {BA, DC, FE}. // E.g. LD3_2S will load 32-bit elements {A, B, C, D, E, F} into the three @@ -3043,21 +3115,21 @@ defm LD3 : LDVList_BHSD<0b0100, "VTriple", "ld3">; defm LD4 : LDVList_BHSD<0b0000, "VQuad", "ld4">; // Load multiple 1-element structure to N consecutive registers (N = 2,3,4) -defm LD1_2V : LDVList_BHSD<0b1010, "VPair", "ld1">; -def LD1_2V_1D : NeonI_LDVList<0, 0b1010, 0b11, VPair1D_operand, "ld1">; +defm LD1x2 : LDVList_BHSD<0b1010, "VPair", "ld1">; +def LD1x2_1D : NeonI_LDVList<0, 0b1010, 0b11, VPair1D_operand, "ld1">; -defm LD1_3V : LDVList_BHSD<0b0110, "VTriple", "ld1">; -def LD1_3V_1D : NeonI_LDVList<0, 0b0110, 0b11, VTriple1D_operand, "ld1">; +defm LD1x3 : LDVList_BHSD<0b0110, "VTriple", "ld1">; +def LD1x3_1D : NeonI_LDVList<0, 0b0110, 0b11, VTriple1D_operand, "ld1">; -defm LD1_4V : LDVList_BHSD<0b0010, "VQuad", "ld1">; -def LD1_4V_1D : NeonI_LDVList<0, 0b0010, 0b11, VQuad1D_operand, "ld1">; +defm LD1x4 : LDVList_BHSD<0b0010, "VQuad", "ld1">; +def LD1x4_1D : NeonI_LDVList<0, 0b0010, 0b11, VQuad1D_operand, "ld1">; class NeonI_STVList opcode, bits<2> size, RegisterOperand VecList, string asmop> : NeonI_LdStMult { let mayStore = 1; let neverHasSideEffects = 1; @@ -3097,1026 +3169,4096 @@ defm ST3 : STVList_BHSD<0b0100, "VTriple", "st3">; defm ST4 : STVList_BHSD<0b0000, "VQuad", "st4">; // Store multiple 1-element structures from N consecutive registers (N = 2,3,4) -defm ST1_2V : STVList_BHSD<0b1010, "VPair", "st1">; -def ST1_2V_1D : NeonI_STVList<0, 0b1010, 0b11, VPair1D_operand, "st1">; +defm ST1x2 : STVList_BHSD<0b1010, "VPair", "st1">; +def ST1x2_1D : NeonI_STVList<0, 0b1010, 0b11, VPair1D_operand, "st1">; -defm ST1_3V : STVList_BHSD<0b0110, "VTriple", "st1">; -def ST1_3V_1D : NeonI_STVList<0, 0b0110, 0b11, VTriple1D_operand, "st1">; +defm ST1x3 : STVList_BHSD<0b0110, "VTriple", "st1">; +def ST1x3_1D : NeonI_STVList<0, 0b0110, 0b11, VTriple1D_operand, "st1">; -defm ST1_4V : STVList_BHSD<0b0010, "VQuad", "st1">; -def ST1_4V_1D : NeonI_STVList<0, 0b0010, 0b11, VQuad1D_operand, "st1">; +defm ST1x4 : STVList_BHSD<0b0010, "VQuad", "st1">; +def ST1x4_1D : NeonI_STVList<0, 0b0010, 0b11, VQuad1D_operand, "st1">; -// End of vector load/store multiple N-element structure(class SIMD lselem) +def : Pat<(v2f64 (load GPR64xsp:$addr)), (LD1_2D GPR64xsp:$addr)>; +def : Pat<(v2i64 (load GPR64xsp:$addr)), (LD1_2D GPR64xsp:$addr)>; -// Scalar Arithmetic +def : Pat<(v4f32 (load GPR64xsp:$addr)), (LD1_4S GPR64xsp:$addr)>; +def : Pat<(v4i32 (load GPR64xsp:$addr)), (LD1_4S GPR64xsp:$addr)>; -class NeonI_Scalar3Same_D_size opcode, string asmop> - : NeonI_Scalar3Same; +def : Pat<(v8i16 (load GPR64xsp:$addr)), (LD1_8H GPR64xsp:$addr)>; +def : Pat<(v16i8 (load GPR64xsp:$addr)), (LD1_16B GPR64xsp:$addr)>; -multiclass NeonI_Scalar3Same_HS_sizes opcode, - string asmop, bit Commutable = 
0> -{ - let isCommutable = Commutable in { - def hhh : NeonI_Scalar3Same; - def sss : NeonI_Scalar3Same; - } -} +def : Pat<(v1f64 (load GPR64xsp:$addr)), (LD1_1D GPR64xsp:$addr)>; +def : Pat<(v1i64 (load GPR64xsp:$addr)), (LD1_1D GPR64xsp:$addr)>; -multiclass NeonI_Scalar3Same_SD_sizes opcode, - string asmop, bit Commutable = 0> -{ - let isCommutable = Commutable in { - def sss : NeonI_Scalar3Same; - def ddd : NeonI_Scalar3Same; - } +def : Pat<(v2f32 (load GPR64xsp:$addr)), (LD1_2S GPR64xsp:$addr)>; +def : Pat<(v2i32 (load GPR64xsp:$addr)), (LD1_2S GPR64xsp:$addr)>; + +def : Pat<(v4i16 (load GPR64xsp:$addr)), (LD1_4H GPR64xsp:$addr)>; +def : Pat<(v8i8 (load GPR64xsp:$addr)), (LD1_8B GPR64xsp:$addr)>; + +def : Pat<(store (v2i64 VPR128:$value), GPR64xsp:$addr), + (ST1_2D GPR64xsp:$addr, VPR128:$value)>; +def : Pat<(store (v2f64 VPR128:$value), GPR64xsp:$addr), + (ST1_2D GPR64xsp:$addr, VPR128:$value)>; + +def : Pat<(store (v4i32 VPR128:$value), GPR64xsp:$addr), + (ST1_4S GPR64xsp:$addr, VPR128:$value)>; +def : Pat<(store (v4f32 VPR128:$value), GPR64xsp:$addr), + (ST1_4S GPR64xsp:$addr, VPR128:$value)>; + +def : Pat<(store (v8i16 VPR128:$value), GPR64xsp:$addr), + (ST1_8H GPR64xsp:$addr, VPR128:$value)>; +def : Pat<(store (v16i8 VPR128:$value), GPR64xsp:$addr), + (ST1_16B GPR64xsp:$addr, VPR128:$value)>; + +def : Pat<(store (v1i64 VPR64:$value), GPR64xsp:$addr), + (ST1_1D GPR64xsp:$addr, VPR64:$value)>; +def : Pat<(store (v1f64 VPR64:$value), GPR64xsp:$addr), + (ST1_1D GPR64xsp:$addr, VPR64:$value)>; + +def : Pat<(store (v2i32 VPR64:$value), GPR64xsp:$addr), + (ST1_2S GPR64xsp:$addr, VPR64:$value)>; +def : Pat<(store (v2f32 VPR64:$value), GPR64xsp:$addr), + (ST1_2S GPR64xsp:$addr, VPR64:$value)>; + +def : Pat<(store (v4i16 VPR64:$value), GPR64xsp:$addr), + (ST1_4H GPR64xsp:$addr, VPR64:$value)>; +def : Pat<(store (v8i8 VPR64:$value), GPR64xsp:$addr), + (ST1_8B GPR64xsp:$addr, VPR64:$value)>; + +// End of vector load/store multiple N-element structure(class SIMD lselem) + +// The followings are post-index vector load/store multiple N-element +// structure(class SIMD lselem-post) +def exact1_asmoperand : AsmOperandClass { + let Name = "Exact1"; + let PredicateMethod = "isExactImm<1>"; + let RenderMethod = "addImmOperands"; +} +def uimm_exact1 : Operand, ImmLeaf { + let ParserMatchClass = exact1_asmoperand; } -multiclass NeonI_Scalar3Same_BHSD_sizes opcode, - string asmop, bit Commutable = 0> -{ - let isCommutable = Commutable in { - def bbb : NeonI_Scalar3Same; - def hhh : NeonI_Scalar3Same; - def sss : NeonI_Scalar3Same; - def ddd : NeonI_Scalar3Same; - } +def exact2_asmoperand : AsmOperandClass { + let Name = "Exact2"; + let PredicateMethod = "isExactImm<2>"; + let RenderMethod = "addImmOperands"; +} +def uimm_exact2 : Operand, ImmLeaf { + let ParserMatchClass = exact2_asmoperand; } -multiclass Neon_Scalar3Same_D_size_patterns { - def : Pat<(v1i64 (opnode (v1i64 FPR64:$Rn), (v1i64 FPR64:$Rm))), - (INSTD FPR64:$Rn, FPR64:$Rm)>; +def exact3_asmoperand : AsmOperandClass { + let Name = "Exact3"; + let PredicateMethod = "isExactImm<3>"; + let RenderMethod = "addImmOperands"; +} +def uimm_exact3 : Operand, ImmLeaf { + let ParserMatchClass = exact3_asmoperand; } -multiclass Neon_Scalar3Same_BHSD_size_patterns - : Neon_Scalar3Same_D_size_patterns { - def: Pat<(v1i8 (opnode (v1i8 FPR8:$Rn), (v1i8 FPR8:$Rm))), - (INSTB FPR8:$Rn, FPR8:$Rm)>; +def exact4_asmoperand : AsmOperandClass { + let Name = "Exact4"; + let PredicateMethod = "isExactImm<4>"; + let RenderMethod = "addImmOperands"; +} +def 
uimm_exact4 : Operand, ImmLeaf { + let ParserMatchClass = exact4_asmoperand; +} - def: Pat<(v1i16 (opnode (v1i16 FPR16:$Rn), (v1i16 FPR16:$Rm))), - (INSTH FPR16:$Rn, FPR16:$Rm)>; +def exact6_asmoperand : AsmOperandClass { + let Name = "Exact6"; + let PredicateMethod = "isExactImm<6>"; + let RenderMethod = "addImmOperands"; +} +def uimm_exact6 : Operand, ImmLeaf { + let ParserMatchClass = exact6_asmoperand; +} - def: Pat<(v1i32 (opnode (v1i32 FPR32:$Rn), (v1i32 FPR32:$Rm))), - (INSTS FPR32:$Rn, FPR32:$Rm)>; +def exact8_asmoperand : AsmOperandClass { + let Name = "Exact8"; + let PredicateMethod = "isExactImm<8>"; + let RenderMethod = "addImmOperands"; +} +def uimm_exact8 : Operand, ImmLeaf { + let ParserMatchClass = exact8_asmoperand; } -multiclass Neon_Scalar3Same_HS_size_patterns { - def : Pat<(v1i16 (opnode (v1i16 FPR16:$Rn), (v1i16 FPR16:$Rm))), - (INSTH FPR16:$Rn, FPR16:$Rm)>; - def : Pat<(v1i32 (opnode (v1i32 FPR32:$Rn), (v1i32 FPR32:$Rm))), - (INSTS FPR32:$Rn, FPR32:$Rm)>; +def exact12_asmoperand : AsmOperandClass { + let Name = "Exact12"; + let PredicateMethod = "isExactImm<12>"; + let RenderMethod = "addImmOperands"; +} +def uimm_exact12 : Operand, ImmLeaf { + let ParserMatchClass = exact12_asmoperand; } -multiclass Neon_Scalar3Same_SD_size_patterns { - def : Pat<(v1f32 (opnode (v1f32 FPR32:$Rn), (v1f32 FPR32:$Rm))), - (INSTS FPR32:$Rn, FPR32:$Rm)>; - def : Pat<(v1f64 (opnode (v1f64 FPR64:$Rn), (v1f64 FPR64:$Rm))), - (INSTD FPR64:$Rn, FPR64:$Rm)>; +def exact16_asmoperand : AsmOperandClass { + let Name = "Exact16"; + let PredicateMethod = "isExactImm<16>"; + let RenderMethod = "addImmOperands"; +} +def uimm_exact16 : Operand, ImmLeaf { + let ParserMatchClass = exact16_asmoperand; } -// Scalar Two Registers Miscellaneous - -multiclass NeonI_Scalar2SameMisc_SD_size opcode, - string asmop> { - def ss : NeonI_Scalar2SameMisc; - def dd : NeonI_Scalar2SameMisc; +def exact24_asmoperand : AsmOperandClass { + let Name = "Exact24"; + let PredicateMethod = "isExactImm<24>"; + let RenderMethod = "addImmOperands"; +} +def uimm_exact24 : Operand, ImmLeaf { + let ParserMatchClass = exact24_asmoperand; } -multiclass Neon_Scalar2SameMisc_cvt_SD_size_patterns { - def : Pat<(v1f32 (Sopnode (v1i32 FPR32:$Rn))), - (INSTS FPR32:$Rn)>; - def : Pat<(v1f64 (Dopnode (v1i64 FPR64:$Rn))), - (INSTD FPR64:$Rn)>; +def exact32_asmoperand : AsmOperandClass { + let Name = "Exact32"; + let PredicateMethod = "isExactImm<32>"; + let RenderMethod = "addImmOperands"; +} +def uimm_exact32 : Operand, ImmLeaf { + let ParserMatchClass = exact32_asmoperand; } -multiclass Neon_Scalar2SameMisc_SD_size_patterns { - def : Pat<(v1f32 (opnode (v1f32 FPR32:$Rn))), - (INSTS FPR32:$Rn)>; - def : Pat<(v1f64 (opnode (v1f64 FPR64:$Rn))), - (INSTD FPR64:$Rn)>; +def exact48_asmoperand : AsmOperandClass { + let Name = "Exact48"; + let PredicateMethod = "isExactImm<48>"; + let RenderMethod = "addImmOperands"; +} +def uimm_exact48 : Operand, ImmLeaf { + let ParserMatchClass = exact48_asmoperand; } -// Scalar Integer Add -let isCommutable = 1 in { -def ADDddd : NeonI_Scalar3Same_D_size<0b0, 0b10000, "add">; +def exact64_asmoperand : AsmOperandClass { + let Name = "Exact64"; + let PredicateMethod = "isExactImm<64>"; + let RenderMethod = "addImmOperands"; +} +def uimm_exact64 : Operand, ImmLeaf { + let ParserMatchClass = exact64_asmoperand; } -// Scalar Integer Sub -def SUBddd : NeonI_Scalar3Same_D_size<0b1, 0b10000, "sub">; +multiclass NeonI_LDWB_VList opcode, bits<2> size, + RegisterOperand VecList, Operand ImmTy, + string asmop> { + let 
Constraints = "$Rn = $wb", mayLoad = 1, neverHasSideEffects = 1, + DecoderMethod = "DecodeVLDSTPostInstruction" in { + def _fixed : NeonI_LdStMult_Post { + let Rm = 0b11111; + } -// Pattern for Scalar Integer Add and Sub with D register only -defm : Neon_Scalar3Same_D_size_patterns; -defm : Neon_Scalar3Same_D_size_patterns; + def _register : NeonI_LdStMult_Post; + } +} -// Patterns to match llvm.aarch64.* intrinsic for Scalar Add, Sub -defm : Neon_Scalar3Same_D_size_patterns; -defm : Neon_Scalar3Same_D_size_patterns; -defm : Neon_Scalar3Same_D_size_patterns; -defm : Neon_Scalar3Same_D_size_patterns; +multiclass LDWB_VList_BHSD opcode, string List, Operand ImmTy, + Operand ImmTy2, string asmop> { + defm _8B : NeonI_LDWB_VList<0, opcode, 0b00, + !cast(List # "8B_operand"), + ImmTy, asmop>; + + defm _4H : NeonI_LDWB_VList<0, opcode, 0b01, + !cast(List # "4H_operand"), + ImmTy, asmop>; + + defm _2S : NeonI_LDWB_VList<0, opcode, 0b10, + !cast(List # "2S_operand"), + ImmTy, asmop>; + + defm _16B : NeonI_LDWB_VList<1, opcode, 0b00, + !cast(List # "16B_operand"), + ImmTy2, asmop>; + + defm _8H : NeonI_LDWB_VList<1, opcode, 0b01, + !cast(List # "8H_operand"), + ImmTy2, asmop>; + + defm _4S : NeonI_LDWB_VList<1, opcode, 0b10, + !cast(List # "4S_operand"), + ImmTy2, asmop>; + + defm _2D : NeonI_LDWB_VList<1, opcode, 0b11, + !cast(List # "2D_operand"), + ImmTy2, asmop>; +} + +// Post-index load multiple N-element structures from N registers (N = 1,2,3,4) +defm LD1WB : LDWB_VList_BHSD<0b0111, "VOne", uimm_exact8, uimm_exact16, "ld1">; +defm LD1WB_1D : NeonI_LDWB_VList<0, 0b0111, 0b11, VOne1D_operand, uimm_exact8, + "ld1">; + +defm LD2WB : LDWB_VList_BHSD<0b1000, "VPair", uimm_exact16, uimm_exact32, "ld2">; + +defm LD3WB : LDWB_VList_BHSD<0b0100, "VTriple", uimm_exact24, uimm_exact48, + "ld3">; + +defm LD4WB : LDWB_VList_BHSD<0b0000, "VQuad", uimm_exact32, uimm_exact64, "ld4">; + +// Post-index load multiple 1-element structures from N consecutive registers +// (N = 2,3,4) +defm LD1x2WB : LDWB_VList_BHSD<0b1010, "VPair", uimm_exact16, uimm_exact32, + "ld1">; +defm LD1x2WB_1D : NeonI_LDWB_VList<0, 0b1010, 0b11, VPair1D_operand, + uimm_exact16, "ld1">; + +defm LD1x3WB : LDWB_VList_BHSD<0b0110, "VTriple", uimm_exact24, uimm_exact48, + "ld1">; +defm LD1x3WB_1D : NeonI_LDWB_VList<0, 0b0110, 0b11, VTriple1D_operand, + uimm_exact24, "ld1">; + +defm LD1x4WB : LDWB_VList_BHSD<0b0010, "VQuad", uimm_exact32, uimm_exact64, + "ld1">; +defm LD1x4WB_1D : NeonI_LDWB_VList<0, 0b0010, 0b11, VQuad1D_operand, + uimm_exact32, "ld1">; + +multiclass NeonI_STWB_VList opcode, bits<2> size, + RegisterOperand VecList, Operand ImmTy, + string asmop> { + let Constraints = "$Rn = $wb", mayStore = 1, neverHasSideEffects = 1, + DecoderMethod = "DecodeVLDSTPostInstruction" in { + def _fixed : NeonI_LdStMult_Post { + let Rm = 0b11111; + } -// Scalar Integer Saturating Add (Signed, Unsigned) -defm SQADD : NeonI_Scalar3Same_BHSD_sizes<0b0, 0b00001, "sqadd", 1>; -defm UQADD : NeonI_Scalar3Same_BHSD_sizes<0b1, 0b00001, "uqadd", 1>; + def _register : NeonI_LdStMult_Post; + } +} -// Scalar Integer Saturating Sub (Signed, Unsigned) -defm SQSUB : NeonI_Scalar3Same_BHSD_sizes<0b0, 0b00101, "sqsub", 0>; -defm UQSUB : NeonI_Scalar3Same_BHSD_sizes<0b1, 0b00101, "uqsub", 0>; +multiclass STWB_VList_BHSD opcode, string List, Operand ImmTy, + Operand ImmTy2, string asmop> { + defm _8B : NeonI_STWB_VList<0, opcode, 0b00, + !cast(List # "8B_operand"), ImmTy, asmop>; -// Patterns to match llvm.arm.* intrinsic for -// Scalar Integer Saturating Add, Sub 
(Signed, Unsigned) -defm : Neon_Scalar3Same_D_size_patterns; -defm : Neon_Scalar3Same_D_size_patterns; -defm : Neon_Scalar3Same_D_size_patterns; -defm : Neon_Scalar3Same_D_size_patterns; + defm _4H : NeonI_STWB_VList<0, opcode, 0b01, + !cast(List # "4H_operand"), + ImmTy, asmop>; -// Patterns to match llvm.aarch64.* intrinsic for -// Scalar Integer Saturating Add, Sub (Signed, Unsigned) -defm : Neon_Scalar3Same_BHSD_size_patterns; -defm : Neon_Scalar3Same_BHSD_size_patterns; -defm : Neon_Scalar3Same_BHSD_size_patterns; -defm : Neon_Scalar3Same_BHSD_size_patterns; + defm _2S : NeonI_STWB_VList<0, opcode, 0b10, + !cast(List # "2S_operand"), + ImmTy, asmop>; -// Scalar Integer Saturating Doubling Multiply Half High -defm SQDMULH : NeonI_Scalar3Same_HS_sizes<0b0, 0b10110, "sqdmulh", 1>; + defm _16B : NeonI_STWB_VList<1, opcode, 0b00, + !cast(List # "16B_operand"), + ImmTy2, asmop>; -// Scalar Integer Saturating Rounding Doubling Multiply Half High -defm SQRDMULH : NeonI_Scalar3Same_HS_sizes<0b1, 0b10110, "sqrdmulh", 1>; + defm _8H : NeonI_STWB_VList<1, opcode, 0b01, + !cast(List # "8H_operand"), + ImmTy2, asmop>; -// Patterns to match llvm.arm.* intrinsic for -// Scalar Integer Saturating Doubling Multiply Half High and -// Scalar Integer Saturating Rounding Doubling Multiply Half High -defm : Neon_Scalar3Same_HS_size_patterns; -defm : Neon_Scalar3Same_HS_size_patterns; + defm _4S : NeonI_STWB_VList<1, opcode, 0b10, + !cast(List # "4S_operand"), + ImmTy2, asmop>; -// Scalar Floating-point Multiply Extended -defm FMULX : NeonI_Scalar3Same_SD_sizes<0b0, 0b0, 0b11011, "fmulx", 1>; + defm _2D : NeonI_STWB_VList<1, opcode, 0b11, + !cast(List # "2D_operand"), + ImmTy2, asmop>; +} -// Scalar Floating-point Reciprocal Step -defm FRECPS : NeonI_Scalar3Same_SD_sizes<0b0, 0b0, 0b11111, "frecps", 0>; +// Post-index load multiple N-element structures from N registers (N = 1,2,3,4) +defm ST1WB : STWB_VList_BHSD<0b0111, "VOne", uimm_exact8, uimm_exact16, "st1">; +defm ST1WB_1D : NeonI_STWB_VList<0, 0b0111, 0b11, VOne1D_operand, uimm_exact8, + "st1">; -// Scalar Floating-point Reciprocal Square Root Step -defm FRSQRTS : NeonI_Scalar3Same_SD_sizes<0b0, 0b1, 0b11111, "frsqrts", 0>; +defm ST2WB : STWB_VList_BHSD<0b1000, "VPair", uimm_exact16, uimm_exact32, "st2">; -// Patterns to match llvm.arm.* intrinsic for -// Scalar Floating-point Reciprocal Step and -// Scalar Floating-point Reciprocal Square Root Step -defm : Neon_Scalar3Same_SD_size_patterns; -defm : Neon_Scalar3Same_SD_size_patterns; +defm ST3WB : STWB_VList_BHSD<0b0100, "VTriple", uimm_exact24, uimm_exact48, + "st3">; -// Patterns to match llvm.aarch64.* intrinsic for -// Scalar Floating-point Multiply Extended, -defm : Neon_Scalar3Same_SD_size_patterns; +defm ST4WB : STWB_VList_BHSD<0b0000, "VQuad", uimm_exact32, uimm_exact64, "st4">; -// Scalar Integer Shift Left (Signed, Unsigned) -def SSHLddd : NeonI_Scalar3Same_D_size<0b0, 0b01000, "sshl">; -def USHLddd : NeonI_Scalar3Same_D_size<0b1, 0b01000, "ushl">; +// Post-index load multiple 1-element structures from N consecutive registers +// (N = 2,3,4) +defm ST1x2WB : STWB_VList_BHSD<0b1010, "VPair", uimm_exact16, uimm_exact32, + "st1">; +defm ST1x2WB_1D : NeonI_STWB_VList<0, 0b1010, 0b11, VPair1D_operand, + uimm_exact16, "st1">; -// Patterns to match llvm.arm.* intrinsic for -// Scalar Integer Shift Left (Signed, Unsigned) -defm : Neon_Scalar3Same_D_size_patterns; -defm : Neon_Scalar3Same_D_size_patterns; +defm ST1x3WB : STWB_VList_BHSD<0b0110, "VTriple", uimm_exact24, uimm_exact48, + "st1">; +defm 
ST1x3WB_1D : NeonI_STWB_VList<0, 0b0110, 0b11, VTriple1D_operand,
+                              uimm_exact24, "st1">;
-// Patterns to match llvm.aarch64.* intrinsic for
-// Scalar Integer Shift Left (Signed, Unsigned)
-defm : Neon_Scalar3Same_D_size_patterns;
-defm : Neon_Scalar3Same_D_size_patterns;
+defm ST1x4WB : STWB_VList_BHSD<0b0010, "VQuad", uimm_exact32, uimm_exact64,
+                               "st1">;
+defm ST1x4WB_1D : NeonI_STWB_VList<0, 0b0010, 0b11, VQuad1D_operand,
+                                   uimm_exact32, "st1">;
-// Scalar Integer Saturating Shift Left (Signed, Unsigned)
-defm SQSHL: NeonI_Scalar3Same_BHSD_sizes<0b0, 0b01001, "sqshl", 0>;
-defm UQSHL: NeonI_Scalar3Same_BHSD_sizes<0b1, 0b01001, "uqshl", 0>;
+// End of post-index vector load/store multiple N-element structure
+// (class SIMD lselem-post)
-// Patterns to match llvm.aarch64.* intrinsic for
-// Scalar Integer Saturating Shift Letf (Signed, Unsigned)
-defm : Neon_Scalar3Same_BHSD_size_patterns;
-defm : Neon_Scalar3Same_BHSD_size_patterns;
-
-// Patterns to match llvm.arm.* intrinsic for
-// Scalar Integer Saturating Shift Letf (Signed, Unsigned)
-defm : Neon_Scalar3Same_D_size_patterns;
-defm : Neon_Scalar3Same_D_size_patterns;
-
-// Scalar Integer Rounding Shift Left (Signed, Unsigned)
-def SRSHLddd: NeonI_Scalar3Same_D_size<0b0, 0b01010, "srshl">;
-def URSHLddd: NeonI_Scalar3Same_D_size<0b1, 0b01010, "urshl">;
+// The following are vector load/store single N-element structure
+// (class SIMD lsone).
+def neon_uimm0_bare : Operand,
+                      ImmLeaf {
+  let ParserMatchClass = neon_uimm0_asmoperand;
+  let PrintMethod = "printUImmBareOperand";
+}
-// Patterns to match llvm.aarch64.* intrinsic for
-// Scalar Integer Rounding Shift Left (Signed, Unsigned)
-defm : Neon_Scalar3Same_D_size_patterns;
-defm : Neon_Scalar3Same_D_size_patterns;
+def neon_uimm1_bare : Operand,
+                      ImmLeaf {
+  let ParserMatchClass = neon_uimm1_asmoperand;
+  let PrintMethod = "printUImmBareOperand";
+}
-// Patterns to match llvm.arm.* intrinsic for
-// Scalar Integer Rounding Shift Left (Signed, Unsigned)
-defm : Neon_Scalar3Same_D_size_patterns;
-defm : Neon_Scalar3Same_D_size_patterns;
+def neon_uimm2_bare : Operand,
+                      ImmLeaf {
+  let ParserMatchClass = neon_uimm2_asmoperand;
+  let PrintMethod = "printUImmBareOperand";
+}
-// Scalar Integer Saturating Rounding Shift Left (Signed, Unsigned)
-defm SQRSHL: NeonI_Scalar3Same_BHSD_sizes<0b0, 0b01011, "sqrshl", 0>;
-defm UQRSHL: NeonI_Scalar3Same_BHSD_sizes<0b1, 0b01011, "uqrshl", 0>;
+def neon_uimm3_bare : Operand,
+                      ImmLeaf {
+  let ParserMatchClass = uimm3_asmoperand;
+  let PrintMethod = "printUImmBareOperand";
+}
-// Patterns to match llvm.aarch64.* intrinsic for
-// Scalar Integer Saturating Rounding Shift Left (Signed, Unsigned)
-defm : Neon_Scalar3Same_BHSD_size_patterns;
-defm : Neon_Scalar3Same_BHSD_size_patterns;
+def neon_uimm4_bare : Operand,
+                      ImmLeaf {
+  let ParserMatchClass = uimm4_asmoperand;
+  let PrintMethod = "printUImmBareOperand";
+}
-// Patterns to match llvm.arm.* intrinsic for
-// Scalar Integer Saturating Rounding Shift Left (Signed, Unsigned)
-defm : Neon_Scalar3Same_D_size_patterns;
-defm : Neon_Scalar3Same_D_size_patterns;
+class NeonI_LDN_Dup opcode, bits<2> size,
+                    RegisterOperand VecList, string asmop>
+  : NeonI_LdOne_Dup {
+  let mayLoad = 1;
+  let neverHasSideEffects = 1;
+}
-// Scalar Signed Integer Convert To Floating-point
-defm SCVTF : NeonI_Scalar2SameMisc_SD_size<0b0, 0b0, 0b11101, "scvtf">;
-defm : Neon_Scalar2SameMisc_cvt_SD_size_patterns;
+multiclass LDN_Dup_BHSD opcode, string List, string asmop> {
+  def _8B : NeonI_LDN_Dup<0, r, opcode,
0b00, + !cast(List # "8B_operand"), asmop>; -// Scalar Unsigned Integer Convert To Floating-point -defm UCVTF : NeonI_Scalar2SameMisc_SD_size<0b1, 0b0, 0b11101, "ucvtf">; -defm : Neon_Scalar2SameMisc_cvt_SD_size_patterns; + def _4H : NeonI_LDN_Dup<0, r, opcode, 0b01, + !cast(List # "4H_operand"), asmop>; -// Scalar Floating-point Reciprocal Estimate -defm FRECPE : NeonI_Scalar2SameMisc_SD_size<0b0, 0b1, 0b11101, "frecpe">; -defm : Neon_Scalar2SameMisc_SD_size_patterns; + def _2S : NeonI_LDN_Dup<0, r, opcode, 0b10, + !cast(List # "2S_operand"), asmop>; -// Scalar Floating-point Reciprocal Exponent -defm FRECPX : NeonI_Scalar2SameMisc_SD_size<0b0, 0b1, 0b11111, "frecpx">; -defm : Neon_Scalar2SameMisc_SD_size_patterns; + def _1D : NeonI_LDN_Dup<0, r, opcode, 0b11, + !cast(List # "1D_operand"), asmop>; -// Scalar Floating-point Reciprocal Square Root Estimate -defm FRSQRTE: NeonI_Scalar2SameMisc_SD_size<0b1, 0b1, 0b11101, "frsqrte">; -defm : Neon_Scalar2SameMisc_SD_size_patterns; + def _16B : NeonI_LDN_Dup<1, r, opcode, 0b00, + !cast(List # "16B_operand"), asmop>; -// Scalar Reduce Pairwise + def _8H : NeonI_LDN_Dup<1, r, opcode, 0b01, + !cast(List # "8H_operand"), asmop>; -multiclass NeonI_ScalarPair_D_sizes opcode, - string asmop, bit Commutable = 0> { - let isCommutable = Commutable in { - def _D_2D : NeonI_ScalarPair; - } -} + def _4S : NeonI_LDN_Dup<1, r, opcode, 0b10, + !cast(List # "4S_operand"), asmop>; -multiclass NeonI_ScalarPair_SD_sizes opcode, - string asmop, bit Commutable = 0> - : NeonI_ScalarPair_D_sizes { - let isCommutable = Commutable in { - def _S_2S : NeonI_ScalarPair; - } + def _2D : NeonI_LDN_Dup<1, r, opcode, 0b11, + !cast(List # "2D_operand"), asmop>; } -// Scalar Reduce Addition Pairwise (Integer) with -// Pattern to match llvm.arm.* intrinsic -defm ADDPvv : NeonI_ScalarPair_D_sizes<0b0, 0b1, 0b11011, "addp", 0>; +// Load single 1-element structure to all lanes of 1 register +defm LD1R : LDN_Dup_BHSD<0b0, 0b110, "VOne", "ld1r">; -// Pattern to match llvm.aarch64.* intrinsic for -// Scalar Reduce Addition Pairwise (Integer) -def : Pat<(v1i64 (int_aarch64_neon_vpadd (v2i64 VPR128:$Rn))), - (ADDPvv_D_2D VPR128:$Rn)>; +// Load single N-element structure to all lanes of N consecutive +// registers (N = 2,3,4) +defm LD2R : LDN_Dup_BHSD<0b1, 0b110, "VPair", "ld2r">; +defm LD3R : LDN_Dup_BHSD<0b0, 0b111, "VTriple", "ld3r">; +defm LD4R : LDN_Dup_BHSD<0b1, 0b111, "VQuad", "ld4r">; -// Scalar Reduce Addition Pairwise (Floating Point) -defm FADDPvv : NeonI_ScalarPair_SD_sizes<0b1, 0b0, 0b01101, "faddp", 0>; -// Scalar Reduce Maximum Pairwise (Floating Point) -defm FMAXPvv : NeonI_ScalarPair_SD_sizes<0b1, 0b0, 0b01111, "fmaxp", 0>; +class LD1R_pattern + : Pat<(VTy (Neon_vdup (DTy (LoadOp GPR64xsp:$Rn)))), + (VTy (INST GPR64xsp:$Rn))>; -// Scalar Reduce Minimum Pairwise (Floating Point) -defm FMINPvv : NeonI_ScalarPair_SD_sizes<0b1, 0b1, 0b01111, "fminp", 0>; +// Match all LD1R instructions +def : LD1R_pattern; -// Scalar Reduce maxNum Pairwise (Floating Point) -defm FMAXNMPvv : NeonI_ScalarPair_SD_sizes<0b1, 0b0, 0b01100, "fmaxnmp", 0>; +def : LD1R_pattern; -// Scalar Reduce minNum Pairwise (Floating Point) -defm FMINNMPvv : NeonI_ScalarPair_SD_sizes<0b1, 0b1, 0b01100, "fminnmp", 0>; +def : LD1R_pattern; -multiclass Neon_ScalarPair_SD_size_patterns { - def : Pat<(v1f32 (opnodeS (v2f32 VPR64:$Rn))), - (INSTS VPR64:$Rn)>; - def : Pat<(v1f64 (opnodeD (v2f64 VPR128:$Rn))), - (INSTD VPR128:$Rn)>; -} +def : LD1R_pattern; -// Patterns to match llvm.aarch64.* intrinsic for -// Scalar 
Reduce Add, Max, Min, MaxiNum, MinNum Pairwise (Floating Point) -defm : Neon_ScalarPair_SD_size_patterns; +def : LD1R_pattern; +def : LD1R_pattern; -defm : Neon_ScalarPair_SD_size_patterns; +def : LD1R_pattern; +def : LD1R_pattern; -defm : Neon_ScalarPair_SD_size_patterns; +def : LD1R_pattern; +def : LD1R_pattern; -defm : Neon_ScalarPair_SD_size_patterns; +def : LD1R_pattern; +def : LD1R_pattern; -defm : Neon_ScalarPair_SD_size_patterns; +multiclass VectorList_Bare_BHSD { + defm B : VectorList_operands; + defm H : VectorList_operands; + defm S : VectorList_operands; + defm D : VectorList_operands; +} +// Special vector list operand of 128-bit vectors with bare layout. +// i.e. only show ".b", ".h", ".s", ".d" +defm VOne : VectorList_Bare_BHSD<"VOne", 1, FPR128>; +defm VPair : VectorList_Bare_BHSD<"VPair", 2, QPair>; +defm VTriple : VectorList_Bare_BHSD<"VTriple", 3, QTriple>; +defm VQuad : VectorList_Bare_BHSD<"VQuad", 4, QQuad>; -//===----------------------------------------------------------------------===// -// Non-Instruction Patterns -//===----------------------------------------------------------------------===// +class NeonI_LDN_Lane op2_1, bit op0, RegisterOperand VList, + Operand ImmOp, string asmop> + : NeonI_LdStOne_Lane<1, r, op2_1, op0, + (outs VList:$Rt), + (ins GPR64xsp:$Rn, VList:$src, ImmOp:$lane), + asmop # "\t$Rt[$lane], [$Rn]", + [], + NoItinerary> { + let mayLoad = 1; + let neverHasSideEffects = 1; + let hasExtraDefRegAllocReq = 1; + let Constraints = "$src = $Rt"; +} -// 64-bit vector bitcasts... +multiclass LDN_Lane_BHSD { + def _B : NeonI_LDN_Lane(List # "B_operand"), + neon_uimm4_bare, asmop> { + let Inst{12-10} = lane{2-0}; + let Inst{30} = lane{3}; + } -def : Pat<(v1i64 (bitconvert (v8i8 VPR64:$src))), (v1i64 VPR64:$src)>; -def : Pat<(v2f32 (bitconvert (v8i8 VPR64:$src))), (v2f32 VPR64:$src)>; -def : Pat<(v2i32 (bitconvert (v8i8 VPR64:$src))), (v2i32 VPR64:$src)>; -def : Pat<(v4i16 (bitconvert (v8i8 VPR64:$src))), (v4i16 VPR64:$src)>; + def _H : NeonI_LDN_Lane(List # "H_operand"), + neon_uimm3_bare, asmop> { + let Inst{12-10} = {lane{1}, lane{0}, 0b0}; + let Inst{30} = lane{2}; + } -def : Pat<(v1i64 (bitconvert (v4i16 VPR64:$src))), (v1i64 VPR64:$src)>; -def : Pat<(v2i32 (bitconvert (v4i16 VPR64:$src))), (v2i32 VPR64:$src)>; -def : Pat<(v2f32 (bitconvert (v4i16 VPR64:$src))), (v2f32 VPR64:$src)>; -def : Pat<(v8i8 (bitconvert (v4i16 VPR64:$src))), (v8i8 VPR64:$src)>; + def _S : NeonI_LDN_Lane(List # "S_operand"), + neon_uimm2_bare, asmop> { + let Inst{12-10} = {lane{0}, 0b0, 0b0}; + let Inst{30} = lane{1}; + } -def : Pat<(v1i64 (bitconvert (v2i32 VPR64:$src))), (v1i64 VPR64:$src)>; -def : Pat<(v2f32 (bitconvert (v2i32 VPR64:$src))), (v2f32 VPR64:$src)>; -def : Pat<(v4i16 (bitconvert (v2i32 VPR64:$src))), (v4i16 VPR64:$src)>; -def : Pat<(v8i8 (bitconvert (v2i32 VPR64:$src))), (v8i8 VPR64:$src)>; + def _D : NeonI_LDN_Lane(List # "D_operand"), + neon_uimm1_bare, asmop> { + let Inst{12-10} = 0b001; + let Inst{30} = lane{0}; + } +} -def : Pat<(v1i64 (bitconvert (v2f32 VPR64:$src))), (v1i64 VPR64:$src)>; -def : Pat<(v2i32 (bitconvert (v2f32 VPR64:$src))), (v2i32 VPR64:$src)>; -def : Pat<(v4i16 (bitconvert (v2f32 VPR64:$src))), (v4i16 VPR64:$src)>; -def : Pat<(v8i8 (bitconvert (v2f32 VPR64:$src))), (v8i8 VPR64:$src)>; +// Load single 1-element structure to one lane of 1 register. 
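// (Editorial sketch, not part of the patch itself: in LDN_Lane_BHSD above,
// a byte lane index needs four bits, so it is split across the encoding --
// lane{3} goes to Inst{30} (the Q position) and lane{2-0} to Inst{12-10};
// wider elements use fewer lane bits and pad the low bits with zeros.  The
// defm below therefore expands to one def per element size, conceptually:
//
//   def LD1LN_B : NeonI_LDN_Lane<..., VOneB_operand, neon_uimm4_bare, "ld1">;
//   def LD1LN_S : NeonI_LDN_Lane<..., VOneS_operand, neon_uimm2_bare, "ld1">;
//
// which assemble as, e.g., "ld1 { v0.s }[1], [x0]".)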
+defm LD1LN : LDN_Lane_BHSD<0b0, 0b0, "VOne", "ld1">; + +// Load single N-element structure to one lane of N consecutive registers +// (N = 2,3,4) +defm LD2LN : LDN_Lane_BHSD<0b1, 0b0, "VPair", "ld2">; +defm LD3LN : LDN_Lane_BHSD<0b0, 0b1, "VTriple", "ld3">; +defm LD4LN : LDN_Lane_BHSD<0b1, 0b1, "VQuad", "ld4">; + +multiclass LD1LN_patterns { + def : Pat<(VTy (vector_insert (VTy VPR64:$src), + (DTy (LoadOp GPR64xsp:$Rn)), (ImmOp:$lane))), + (VTy (EXTRACT_SUBREG + (INST GPR64xsp:$Rn, + (SUBREG_TO_REG (i64 0), VPR64:$src, sub_64), + ImmOp:$lane), + sub_64))>; + + def : Pat<(VTy2 (vector_insert (VTy2 VPR128:$src), + (DTy (LoadOp GPR64xsp:$Rn)), (ImmOp2:$lane))), + (VTy2 (INST GPR64xsp:$Rn, VPR128:$src, ImmOp2:$lane))>; +} + +// Match all LD1LN instructions +defm : LD1LN_patterns; + +defm : LD1LN_patterns; + +defm : LD1LN_patterns; +defm : LD1LN_patterns; + +defm : LD1LN_patterns; +defm : LD1LN_patterns; + +class NeonI_STN_Lane op2_1, bit op0, RegisterOperand VList, + Operand ImmOp, string asmop> + : NeonI_LdStOne_Lane<0, r, op2_1, op0, + (outs), (ins GPR64xsp:$Rn, VList:$Rt, ImmOp:$lane), + asmop # "\t$Rt[$lane], [$Rn]", + [], + NoItinerary> { + let mayStore = 1; + let neverHasSideEffects = 1; + let hasExtraDefRegAllocReq = 1; +} -def : Pat<(v2f32 (bitconvert (v1i64 VPR64:$src))), (v2f32 VPR64:$src)>; -def : Pat<(v2i32 (bitconvert (v1i64 VPR64:$src))), (v2i32 VPR64:$src)>; -def : Pat<(v4i16 (bitconvert (v1i64 VPR64:$src))), (v4i16 VPR64:$src)>; -def : Pat<(v8i8 (bitconvert (v1i64 VPR64:$src))), (v8i8 VPR64:$src)>; +multiclass STN_Lane_BHSD { + def _B : NeonI_STN_Lane(List # "B_operand"), + neon_uimm4_bare, asmop> { + let Inst{12-10} = lane{2-0}; + let Inst{30} = lane{3}; + } -// ..and 128-bit vector bitcasts... + def _H : NeonI_STN_Lane(List # "H_operand"), + neon_uimm3_bare, asmop> { + let Inst{12-10} = {lane{1}, lane{0}, 0b0}; + let Inst{30} = lane{2}; + } -def : Pat<(v2f64 (bitconvert (v16i8 VPR128:$src))), (v2f64 VPR128:$src)>; -def : Pat<(v2i64 (bitconvert (v16i8 VPR128:$src))), (v2i64 VPR128:$src)>; -def : Pat<(v4f32 (bitconvert (v16i8 VPR128:$src))), (v4f32 VPR128:$src)>; -def : Pat<(v4i32 (bitconvert (v16i8 VPR128:$src))), (v4i32 VPR128:$src)>; -def : Pat<(v8i16 (bitconvert (v16i8 VPR128:$src))), (v8i16 VPR128:$src)>; + def _S : NeonI_STN_Lane(List # "S_operand"), + neon_uimm2_bare, asmop> { + let Inst{12-10} = {lane{0}, 0b0, 0b0}; + let Inst{30} = lane{1}; + } -def : Pat<(v2f64 (bitconvert (v8i16 VPR128:$src))), (v2f64 VPR128:$src)>; -def : Pat<(v2i64 (bitconvert (v8i16 VPR128:$src))), (v2i64 VPR128:$src)>; -def : Pat<(v4i32 (bitconvert (v8i16 VPR128:$src))), (v4i32 VPR128:$src)>; -def : Pat<(v4f32 (bitconvert (v8i16 VPR128:$src))), (v4f32 VPR128:$src)>; -def : Pat<(v16i8 (bitconvert (v8i16 VPR128:$src))), (v16i8 VPR128:$src)>; + def _D : NeonI_STN_Lane(List # "D_operand"), + neon_uimm1_bare, asmop>{ + let Inst{12-10} = 0b001; + let Inst{30} = lane{0}; + } +} -def : Pat<(v2f64 (bitconvert (v4i32 VPR128:$src))), (v2f64 VPR128:$src)>; -def : Pat<(v2i64 (bitconvert (v4i32 VPR128:$src))), (v2i64 VPR128:$src)>; -def : Pat<(v4f32 (bitconvert (v4i32 VPR128:$src))), (v4f32 VPR128:$src)>; -def : Pat<(v8i16 (bitconvert (v4i32 VPR128:$src))), (v8i16 VPR128:$src)>; -def : Pat<(v16i8 (bitconvert (v4i32 VPR128:$src))), (v16i8 VPR128:$src)>; +// Store single 1-element structure from one lane of 1 register. 
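// (Editorial sketch: the lane instructions only take 128-bit register
// lists, so the 64-bit-vector patterns widen the operand first.  An assumed
// v2i32/i32 instance of the ST1LN_patterns multiclass defined below would
// read:
//
//   def : Pat<(store (i32 (vector_extract (v2i32 VPR64:$Rt),
//                                         neon_uimm1_bare:$lane)),
//                    GPR64xsp:$Rn),
//             (ST1LN_S GPR64xsp:$Rn,
//                      (SUBREG_TO_REG (i64 0), VPR64:$Rt, sub_64),
//                      neon_uimm1_bare:$lane)>;
//
// i.e. the VPR64 value is placed in the low 64 bits of a 128-bit register
// with SUBREG_TO_REG before the lane store is emitted.)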
+defm ST1LN : STN_Lane_BHSD<0b0, 0b0, "VOne", "st1">; + +// Store single N-element structure from one lane of N consecutive registers +// (N = 2,3,4) +defm ST2LN : STN_Lane_BHSD<0b1, 0b0, "VPair", "st2">; +defm ST3LN : STN_Lane_BHSD<0b0, 0b1, "VTriple", "st3">; +defm ST4LN : STN_Lane_BHSD<0b1, 0b1, "VQuad", "st4">; + +multiclass ST1LN_patterns { + def : Pat<(StoreOp (DTy (vector_extract (VTy VPR64:$Rt), ImmOp:$lane)), + GPR64xsp:$Rn), + (INST GPR64xsp:$Rn, + (SUBREG_TO_REG (i64 0), VPR64:$Rt, sub_64), + ImmOp:$lane)>; + + def : Pat<(StoreOp (DTy (vector_extract (VTy2 VPR128:$Rt), ImmOp2:$lane)), + GPR64xsp:$Rn), + (INST GPR64xsp:$Rn, VPR128:$Rt, ImmOp2:$lane)>; +} + +// Match all ST1LN instructions +defm : ST1LN_patterns; + +defm : ST1LN_patterns; + +defm : ST1LN_patterns; +defm : ST1LN_patterns; + +defm : ST1LN_patterns; +defm : ST1LN_patterns; + +// End of vector load/store single N-element structure (class SIMD lsone). + + +// The following are post-index load/store single N-element instructions +// (class SIMD lsone-post) + +multiclass NeonI_LDN_WB_Dup opcode, bits<2> size, + RegisterOperand VecList, Operand ImmTy, + string asmop> { + let mayLoad = 1, neverHasSideEffects = 1, Constraints = "$wb = $Rn", + DecoderMethod = "DecodeVLDSTLanePostInstruction" in { + def _fixed : NeonI_LdOne_Dup_Post { + let Rm = 0b11111; + } + + def _register : NeonI_LdOne_Dup_Post; + } +} -def : Pat<(v2f64 (bitconvert (v4f32 VPR128:$src))), (v2f64 VPR128:$src)>; -def : Pat<(v2i64 (bitconvert (v4f32 VPR128:$src))), (v2i64 VPR128:$src)>; -def : Pat<(v4i32 (bitconvert (v4f32 VPR128:$src))), (v4i32 VPR128:$src)>; -def : Pat<(v8i16 (bitconvert (v4f32 VPR128:$src))), (v8i16 VPR128:$src)>; -def : Pat<(v16i8 (bitconvert (v4f32 VPR128:$src))), (v16i8 VPR128:$src)>; +multiclass LDWB_Dup_BHSD opcode, string List, string asmop, + Operand uimm_b, Operand uimm_h, + Operand uimm_s, Operand uimm_d> { + defm _8B : NeonI_LDN_WB_Dup<0, r, opcode, 0b00, + !cast(List # "8B_operand"), + uimm_b, asmop>; + + defm _4H : NeonI_LDN_WB_Dup<0, r, opcode, 0b01, + !cast(List # "4H_operand"), + uimm_h, asmop>; + + defm _2S : NeonI_LDN_WB_Dup<0, r, opcode, 0b10, + !cast(List # "2S_operand"), + uimm_s, asmop>; + + defm _1D : NeonI_LDN_WB_Dup<0, r, opcode, 0b11, + !cast(List # "1D_operand"), + uimm_d, asmop>; + + defm _16B : NeonI_LDN_WB_Dup<1, r, opcode, 0b00, + !cast(List # "16B_operand"), + uimm_b, asmop>; + + defm _8H : NeonI_LDN_WB_Dup<1, r, opcode, 0b01, + !cast(List # "8H_operand"), + uimm_h, asmop>; + + defm _4S : NeonI_LDN_WB_Dup<1, r, opcode, 0b10, + !cast(List # "4S_operand"), + uimm_s, asmop>; + + defm _2D : NeonI_LDN_WB_Dup<1, r, opcode, 0b11, + !cast(List # "2D_operand"), + uimm_d, asmop>; +} + +// Post-index load single 1-element structure to all lanes of 1 register +defm LD1R_WB : LDWB_Dup_BHSD<0b0, 0b110, "VOne", "ld1r", uimm_exact1, + uimm_exact2, uimm_exact4, uimm_exact8>; + +// Post-index load single N-element structure to all lanes of N consecutive +// registers (N = 2,3,4) +defm LD2R_WB : LDWB_Dup_BHSD<0b1, 0b110, "VPair", "ld2r", uimm_exact2, + uimm_exact4, uimm_exact8, uimm_exact16>; +defm LD3R_WB : LDWB_Dup_BHSD<0b0, 0b111, "VTriple", "ld3r", uimm_exact3, + uimm_exact6, uimm_exact12, uimm_exact24>; +defm LD4R_WB : LDWB_Dup_BHSD<0b1, 0b111, "VQuad", "ld4r", uimm_exact4, + uimm_exact8, uimm_exact16, uimm_exact32>; + +let mayLoad = 1, neverHasSideEffects = 1, hasExtraDefRegAllocReq = 1, + Constraints = "$Rn = $wb, $Rt = $src", + DecoderMethod = "DecodeVLDSTLanePostInstruction" in { + class LDN_WBFx_Lane op2_1, bit op0, 
RegisterOperand VList, + Operand ImmTy, Operand ImmOp, string asmop> + : NeonI_LdStOne_Lane_Post<1, r, op2_1, op0, + (outs VList:$Rt, GPR64xsp:$wb), + (ins GPR64xsp:$Rn, ImmTy:$amt, + VList:$src, ImmOp:$lane), + asmop # "\t$Rt[$lane], [$Rn], $amt", + [], + NoItinerary> { + let Rm = 0b11111; + } -def : Pat<(v2f64 (bitconvert (v2i64 VPR128:$src))), (v2f64 VPR128:$src)>; -def : Pat<(v4f32 (bitconvert (v2i64 VPR128:$src))), (v4f32 VPR128:$src)>; -def : Pat<(v4i32 (bitconvert (v2i64 VPR128:$src))), (v4i32 VPR128:$src)>; -def : Pat<(v8i16 (bitconvert (v2i64 VPR128:$src))), (v8i16 VPR128:$src)>; -def : Pat<(v16i8 (bitconvert (v2i64 VPR128:$src))), (v16i8 VPR128:$src)>; + class LDN_WBReg_Lane op2_1, bit op0, RegisterOperand VList, + Operand ImmTy, Operand ImmOp, string asmop> + : NeonI_LdStOne_Lane_Post<1, r, op2_1, op0, + (outs VList:$Rt, GPR64xsp:$wb), + (ins GPR64xsp:$Rn, GPR64noxzr:$Rm, + VList:$src, ImmOp:$lane), + asmop # "\t$Rt[$lane], [$Rn], $Rm", + [], + NoItinerary>; +} -def : Pat<(v2i64 (bitconvert (v2f64 VPR128:$src))), (v2i64 VPR128:$src)>; -def : Pat<(v4f32 (bitconvert (v2f64 VPR128:$src))), (v4f32 VPR128:$src)>; -def : Pat<(v4i32 (bitconvert (v2f64 VPR128:$src))), (v4i32 VPR128:$src)>; -def : Pat<(v8i16 (bitconvert (v2f64 VPR128:$src))), (v8i16 VPR128:$src)>; -def : Pat<(v16i8 (bitconvert (v2f64 VPR128:$src))), (v16i8 VPR128:$src)>; +multiclass LD_Lane_WB_BHSD { + def _B_fixed : LDN_WBFx_Lane(List # "B_operand"), + uimm_b, neon_uimm4_bare, asmop> { + let Inst{12-10} = lane{2-0}; + let Inst{30} = lane{3}; + } + def _B_register : LDN_WBReg_Lane(List # "B_operand"), + uimm_b, neon_uimm4_bare, asmop> { + let Inst{12-10} = lane{2-0}; + let Inst{30} = lane{3}; + } -// ...and scalar bitcasts... -def : Pat<(f16 (bitconvert (v1i16 FPR16:$src))), (f16 FPR16:$src)>; -def : Pat<(f32 (bitconvert (v1i32 FPR32:$src))), (f32 FPR32:$src)>; -def : Pat<(f64 (bitconvert (v1i64 FPR64:$src))), (f64 FPR64:$src)>; -def : Pat<(f32 (bitconvert (v1f32 FPR32:$src))), (f32 FPR32:$src)>; -def : Pat<(f64 (bitconvert (v1f64 FPR64:$src))), (f64 FPR64:$src)>; + def _H_fixed : LDN_WBFx_Lane(List # "H_operand"), + uimm_h, neon_uimm3_bare, asmop> { + let Inst{12-10} = {lane{1}, lane{0}, 0b0}; + let Inst{30} = lane{2}; + } -def : Pat<(i64 (bitconvert (v1i64 FPR64:$src))), (FMOVxd $src)>; -def : Pat<(i32 (bitconvert (v1i32 FPR32:$src))), (FMOVws $src)>; + def _H_register : LDN_WBReg_Lane(List # "H_operand"), + uimm_h, neon_uimm3_bare, asmop> { + let Inst{12-10} = {lane{1}, lane{0}, 0b0}; + let Inst{30} = lane{2}; + } -def : Pat<(v8i8 (bitconvert (v1i64 VPR64:$src))), (v8i8 VPR64:$src)>; -def : Pat<(v4i16 (bitconvert (v1i64 VPR64:$src))), (v4i16 VPR64:$src)>; -def : Pat<(v2i32 (bitconvert (v1i64 VPR64:$src))), (v2i32 VPR64:$src)>; + def _S_fixed : LDN_WBFx_Lane(List # "S_operand"), + uimm_s, neon_uimm2_bare, asmop> { + let Inst{12-10} = {lane{0}, 0b0, 0b0}; + let Inst{30} = lane{1}; + } -def : Pat<(f64 (bitconvert (v8i8 VPR64:$src))), (f64 VPR64:$src)>; -def : Pat<(f64 (bitconvert (v4i16 VPR64:$src))), (f64 VPR64:$src)>; -def : Pat<(f64 (bitconvert (v2i32 VPR64:$src))), (f64 VPR64:$src)>; -def : Pat<(f64 (bitconvert (v2f32 VPR64:$src))), (f64 VPR64:$src)>; -def : Pat<(f64 (bitconvert (v1i64 VPR64:$src))), (f64 VPR64:$src)>; + def _S_register : LDN_WBReg_Lane(List # "S_operand"), + uimm_s, neon_uimm2_bare, asmop> { + let Inst{12-10} = {lane{0}, 0b0, 0b0}; + let Inst{30} = lane{1}; + } -def : Pat<(f128 (bitconvert (v16i8 VPR128:$src))), (f128 VPR128:$src)>; -def : Pat<(f128 (bitconvert (v8i16 VPR128:$src))), (f128 
VPR128:$src)>; -def : Pat<(f128 (bitconvert (v4i32 VPR128:$src))), (f128 VPR128:$src)>; -def : Pat<(f128 (bitconvert (v2i64 VPR128:$src))), (f128 VPR128:$src)>; -def : Pat<(f128 (bitconvert (v4f32 VPR128:$src))), (f128 VPR128:$src)>; -def : Pat<(f128 (bitconvert (v2f64 VPR128:$src))), (f128 VPR128:$src)>; + def _D_fixed : LDN_WBFx_Lane(List # "D_operand"), + uimm_d, neon_uimm1_bare, asmop> { + let Inst{12-10} = 0b001; + let Inst{30} = lane{0}; + } -def : Pat<(v1i16 (bitconvert (f16 FPR16:$src))), (v1i16 FPR16:$src)>; -def : Pat<(v1i32 (bitconvert (f32 FPR32:$src))), (v1i32 FPR32:$src)>; -def : Pat<(v1i64 (bitconvert (f64 FPR64:$src))), (v1i64 FPR64:$src)>; -def : Pat<(v1f32 (bitconvert (f32 FPR32:$src))), (v1f32 FPR32:$src)>; -def : Pat<(v1f64 (bitconvert (f64 FPR64:$src))), (v1f64 FPR64:$src)>; + def _D_register : LDN_WBReg_Lane(List # "D_operand"), + uimm_d, neon_uimm1_bare, asmop> { + let Inst{12-10} = 0b001; + let Inst{30} = lane{0}; + } +} -def : Pat<(v1i64 (bitconvert (i64 GPR64:$src))), (FMOVdx $src)>; -def : Pat<(v1i32 (bitconvert (i32 GPR32:$src))), (FMOVsw $src)>; - -def : Pat<(v8i8 (bitconvert (f64 FPR64:$src))), (v8i8 FPR64:$src)>; -def : Pat<(v4i16 (bitconvert (f64 FPR64:$src))), (v4i16 FPR64:$src)>; -def : Pat<(v2i32 (bitconvert (f64 FPR64:$src))), (v2i32 FPR64:$src)>; -def : Pat<(v2f32 (bitconvert (f64 FPR64:$src))), (v2f32 FPR64:$src)>; -def : Pat<(v1i64 (bitconvert (f64 FPR64:$src))), (v1i64 FPR64:$src)>; - -def : Pat<(v16i8 (bitconvert (f128 FPR128:$src))), (v16i8 FPR128:$src)>; -def : Pat<(v8i16 (bitconvert (f128 FPR128:$src))), (v8i16 FPR128:$src)>; -def : Pat<(v4i32 (bitconvert (f128 FPR128:$src))), (v4i32 FPR128:$src)>; -def : Pat<(v2i64 (bitconvert (f128 FPR128:$src))), (v2i64 FPR128:$src)>; -def : Pat<(v4f32 (bitconvert (f128 FPR128:$src))), (v4f32 FPR128:$src)>; -def : Pat<(v2f64 (bitconvert (f128 FPR128:$src))), (v2f64 FPR128:$src)>; - -def neon_uimm0_bare : Operand, - ImmLeaf { - let ParserMatchClass = neon_uimm0_asmoperand; - let PrintMethod = "printNeonUImm8OperandBare"; -} - -def neon_uimm1_bare : Operand, - ImmLeaf { - let ParserMatchClass = neon_uimm1_asmoperand; - let PrintMethod = "printNeonUImm8OperandBare"; -} - -def neon_uimm2_bare : Operand, - ImmLeaf { - let ParserMatchClass = neon_uimm2_asmoperand; - let PrintMethod = "printNeonUImm8OperandBare"; -} - -def neon_uimm3_bare : Operand, - ImmLeaf { - let ParserMatchClass = uimm3_asmoperand; - let PrintMethod = "printNeonUImm8OperandBare"; -} - -def neon_uimm4_bare : Operand, - ImmLeaf { - let ParserMatchClass = uimm4_asmoperand; - let PrintMethod = "printNeonUImm8OperandBare"; -} +// Post-index load single 1-element structure to one lane of 1 register. 
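// (Editorial note: every post-index (writeback) variant in this section
// comes in two forms -- a _fixed form, whose Rm field is hard-wired to
// 0b11111 and which advances the base register by the exact transfer size
// (the uimm_exact* operand), and a _register form, which adds $Rm instead.
// For the S-sized single-lane ld1, for example:
//
//   ld1 { v0.s }[1], [x0], #4    // _fixed:    x0 += 4
//   ld1 { v0.s }[1], [x0], x2    // _register: x0 += x2
// )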
+defm LD1LN_WB : LD_Lane_WB_BHSD<0b0, 0b0, "VOne", "ld1", uimm_exact1, + uimm_exact2, uimm_exact4, uimm_exact8>; + +// Post-index load single N-element structure to one lane of N consecutive +// registers +// (N = 2,3,4) +defm LD2LN_WB : LD_Lane_WB_BHSD<0b1, 0b0, "VPair", "ld2", uimm_exact2, + uimm_exact4, uimm_exact8, uimm_exact16>; +defm LD3LN_WB : LD_Lane_WB_BHSD<0b0, 0b1, "VTriple", "ld3", uimm_exact3, + uimm_exact6, uimm_exact12, uimm_exact24>; +defm LD4LN_WB : LD_Lane_WB_BHSD<0b1, 0b1, "VQuad", "ld4", uimm_exact4, + uimm_exact8, uimm_exact16, uimm_exact32>; + +let mayStore = 1, neverHasSideEffects = 1, + hasExtraDefRegAllocReq = 1, Constraints = "$Rn = $wb", + DecoderMethod = "DecodeVLDSTLanePostInstruction" in { + class STN_WBFx_Lane op2_1, bit op0, RegisterOperand VList, + Operand ImmTy, Operand ImmOp, string asmop> + : NeonI_LdStOne_Lane_Post<0, r, op2_1, op0, + (outs GPR64xsp:$wb), + (ins GPR64xsp:$Rn, ImmTy:$amt, + VList:$Rt, ImmOp:$lane), + asmop # "\t$Rt[$lane], [$Rn], $amt", + [], + NoItinerary> { + let Rm = 0b11111; + } -class NeonI_INS_main - : NeonI_copy<0b1, 0b0, 0b0011, - (outs VPR128:$Rd), (ins VPR128:$src, OpGPR:$Rn, OpImm:$Imm), - asmop # "\t$Rd." # Res # "[$Imm], $Rn", - [(set (ResTy VPR128:$Rd), - (ResTy (vector_insert - (ResTy VPR128:$src), - (OpTy OpGPR:$Rn), - (OpImm:$Imm))))], - NoItinerary> { - bits<4> Imm; - let Constraints = "$src = $Rd"; + class STN_WBReg_Lane op2_1, bit op0, RegisterOperand VList, + Operand ImmTy, Operand ImmOp, string asmop> + : NeonI_LdStOne_Lane_Post<0, r, op2_1, op0, + (outs GPR64xsp:$wb), + (ins GPR64xsp:$Rn, GPR64noxzr:$Rm, VList:$Rt, + ImmOp:$lane), + asmop # "\t$Rt[$lane], [$Rn], $Rm", + [], + NoItinerary>; } -// The followings are for instruction class (3V Elem) +multiclass ST_Lane_WB_BHSD { + def _B_fixed : STN_WBFx_Lane(List # "B_operand"), + uimm_b, neon_uimm4_bare, asmop> { + let Inst{12-10} = lane{2-0}; + let Inst{30} = lane{3}; + } -// Variant 1 + def _B_register : STN_WBReg_Lane(List # "B_operand"), + uimm_b, neon_uimm4_bare, asmop> { + let Inst{12-10} = lane{2-0}; + let Inst{30} = lane{3}; + } -class NI_2VE size, bits<4> opcode, - string asmop, string ResS, string OpS, string EleOpS, - Operand OpImm, RegisterOperand ResVPR, - RegisterOperand OpVPR, RegisterOperand EleOpVPR> - : NeonI_2VElem { - bits<3> Index; - bits<5> Re; + def _H_fixed : STN_WBFx_Lane(List # "H_operand"), + uimm_h, neon_uimm3_bare, asmop> { + let Inst{12-10} = {lane{1}, lane{0}, 0b0}; + let Inst{30} = lane{2}; + } - let Constraints = "$src = $Rd"; -} + def _H_register : STN_WBReg_Lane(List # "H_operand"), + uimm_h, neon_uimm3_bare, asmop> { + let Inst{12-10} = {lane{1}, lane{0}, 0b0}; + let Inst{30} = lane{2}; + } -multiclass NI_2VE_v1 opcode, string asmop> -{ - // vector register class for element is always 128-bit to cover the max index - def _2s4s : NI_2VE<0b0, u, 0b10, opcode, asmop, "2s", "2s", "s", - neon_uimm2_bare, VPR64, VPR64, VPR128> { - let Inst{11} = {Index{1}}; - let Inst{21} = {Index{0}}; - let Inst{20-16} = Re; + def _S_fixed : STN_WBFx_Lane(List # "S_operand"), + uimm_s, neon_uimm2_bare, asmop> { + let Inst{12-10} = {lane{0}, 0b0, 0b0}; + let Inst{30} = lane{1}; } - def _4s4s : NI_2VE<0b1, u, 0b10, opcode, asmop, "4s", "4s", "s", - neon_uimm2_bare, VPR128, VPR128, VPR128> { - let Inst{11} = {Index{1}}; - let Inst{21} = {Index{0}}; - let Inst{20-16} = Re; + def _S_register : STN_WBReg_Lane(List # "S_operand"), + uimm_s, neon_uimm2_bare, asmop> { + let Inst{12-10} = {lane{0}, 0b0, 0b0}; + let Inst{30} = lane{1}; } - // Index operations on 
16-bit(H) elements are restricted to using v0-v15. - def _4h8h : NI_2VE<0b0, u, 0b01, opcode, asmop, "4h", "4h", "h", - neon_uimm3_bare, VPR64, VPR64, VPR128Lo> { - let Inst{11} = {Index{2}}; - let Inst{21} = {Index{1}}; - let Inst{20} = {Index{0}}; - let Inst{19-16} = Re{3-0}; + def _D_fixed : STN_WBFx_Lane(List # "D_operand"), + uimm_d, neon_uimm1_bare, asmop> { + let Inst{12-10} = 0b001; + let Inst{30} = lane{0}; } - def _8h8h : NI_2VE<0b1, u, 0b01, opcode, asmop, "8h", "8h", "h", - neon_uimm3_bare, VPR128, VPR128, VPR128Lo> { - let Inst{11} = {Index{2}}; - let Inst{21} = {Index{1}}; - let Inst{20} = {Index{0}}; - let Inst{19-16} = Re{3-0}; + def _D_register : STN_WBReg_Lane(List # "D_operand"), + uimm_d, neon_uimm1_bare, asmop> { + let Inst{12-10} = 0b001; + let Inst{30} = lane{0}; } } -defm MLAvve : NI_2VE_v1<0b1, 0b0000, "mla">; -defm MLSvve : NI_2VE_v1<0b1, 0b0100, "mls">; +// Post-index store single 1-element structure from one lane of 1 register. +defm ST1LN_WB : ST_Lane_WB_BHSD<0b0, 0b0, "VOne", "st1", uimm_exact1, + uimm_exact2, uimm_exact4, uimm_exact8>; -// Pattern for lane in 128-bit vector -class NI_2VE_laneq - : Pat<(ResTy (op (ResTy ResVPR:$src), (OpTy OpVPR:$Rn), - (OpTy (coreop (EleOpTy EleOpVPR:$Re), (i64 OpImm:$Index))))), - (INST ResVPR:$src, OpVPR:$Rn, EleOpVPR:$Re, OpImm:$Index)>; +// Post-index store single N-element structure from one lane of N consecutive +// registers (N = 2,3,4) +defm ST2LN_WB : ST_Lane_WB_BHSD<0b1, 0b0, "VPair", "st2", uimm_exact2, + uimm_exact4, uimm_exact8, uimm_exact16>; +defm ST3LN_WB : ST_Lane_WB_BHSD<0b0, 0b1, "VTriple", "st3", uimm_exact3, + uimm_exact6, uimm_exact12, uimm_exact24>; +defm ST4LN_WB : ST_Lane_WB_BHSD<0b1, 0b1, "VQuad", "st4", uimm_exact4, + uimm_exact8, uimm_exact16, uimm_exact32>; -// Pattern for lane in 64-bit vector -class NI_2VE_lane - : Pat<(ResTy (op (ResTy ResVPR:$src), (OpTy OpVPR:$Rn), - (OpTy (coreop (EleOpTy EleOpVPR:$Re), (i64 OpImm:$Index))))), - (INST ResVPR:$src, OpVPR:$Rn, - (SUBREG_TO_REG (i64 0), EleOpVPR:$Re, sub_64), OpImm:$Index)>; +// End of post-index load/store single N-element instructions +// (class SIMD lsone-post) -multiclass NI_2VE_v1_pat -{ - def : NI_2VE_laneq(subop # "_2s4s"), neon_uimm2_bare, - op, VPR64, VPR64, VPR128, v2i32, v2i32, v4i32, - BinOpFrag<(Neon_vduplane - (Neon_low4S node:$LHS), node:$RHS)>>; +// Neon Scalar instructions implementation +// Scalar Three Same - def : NI_2VE_laneq(subop # "_4s4s"), neon_uimm2_bare, - op, VPR128, VPR128, VPR128, v4i32, v4i32, v4i32, - BinOpFrag<(Neon_vduplane node:$LHS, node:$RHS)>>; +class NeonI_Scalar3Same_size size, bits<5> opcode, string asmop, + RegisterClass FPRC> + : NeonI_Scalar3Same; - def : NI_2VE_laneq(subop # "_4h8h"), neon_uimm3_bare, - op, VPR64, VPR64, VPR128Lo, v4i16, v4i16, v8i16, - BinOpFrag<(Neon_vduplane - (Neon_low8H node:$LHS), node:$RHS)>>; +class NeonI_Scalar3Same_D_size opcode, string asmop> + : NeonI_Scalar3Same_size; - def : NI_2VE_laneq(subop # "_8h8h"), neon_uimm3_bare, - op, VPR128, VPR128, VPR128Lo, v8i16, v8i16, v8i16, - BinOpFrag<(Neon_vduplane node:$LHS, node:$RHS)>>; +multiclass NeonI_Scalar3Same_HS_sizes opcode, string asmop, + bit Commutable = 0> { + let isCommutable = Commutable in { + def hhh : NeonI_Scalar3Same_size; + def sss : NeonI_Scalar3Same_size; + } +} - // Index can only be half of the max value for lane in 64-bit vector +multiclass NeonI_Scalar3Same_SD_sizes opcode, + string asmop, bit Commutable = 0> { + let isCommutable = Commutable in { + def sss : NeonI_Scalar3Same_size; + def ddd : 
NeonI_Scalar3Same_size; + } +} - def : NI_2VE_lane(subop # "_2s4s"), neon_uimm1_bare, - op, VPR64, VPR64, VPR64, v2i32, v2i32, v2i32, - BinOpFrag<(Neon_vduplane node:$LHS, node:$RHS)>>; +multiclass NeonI_Scalar3Same_BHSD_sizes opcode, + string asmop, bit Commutable = 0> { + let isCommutable = Commutable in { + def bbb : NeonI_Scalar3Same_size; + def hhh : NeonI_Scalar3Same_size; + def sss : NeonI_Scalar3Same_size; + def ddd : NeonI_Scalar3Same_size; + } +} - def : NI_2VE_lane(subop # "_4s4s"), neon_uimm1_bare, - op, VPR128, VPR128, VPR64, v4i32, v4i32, v2i32, - BinOpFrag<(Neon_vduplane - (Neon_combine_4S node:$LHS, undef), - node:$RHS)>>; +multiclass Neon_Scalar3Same_D_size_patterns { + def : Pat<(v1i64 (opnode (v1i64 FPR64:$Rn), (v1i64 FPR64:$Rm))), + (INSTD FPR64:$Rn, FPR64:$Rm)>; +} - def : NI_2VE_lane(subop # "_4h8h"), neon_uimm2_bare, - op, VPR64, VPR64, VPR64Lo, v4i16, v4i16, v4i16, - BinOpFrag<(Neon_vduplane node:$LHS, node:$RHS)>>; +multiclass Neon_Scalar3Same_BHSD_size_patterns + : Neon_Scalar3Same_D_size_patterns { + def: Pat<(v1i8 (opnode (v1i8 FPR8:$Rn), (v1i8 FPR8:$Rm))), + (INSTB FPR8:$Rn, FPR8:$Rm)>; + + def: Pat<(v1i16 (opnode (v1i16 FPR16:$Rn), (v1i16 FPR16:$Rm))), + (INSTH FPR16:$Rn, FPR16:$Rm)>; - def : NI_2VE_lane(subop # "_8h8h"), neon_uimm2_bare, - op, VPR128, VPR128, VPR64Lo, v8i16, v8i16, v4i16, - BinOpFrag<(Neon_vduplane - (Neon_combine_8H node:$LHS, undef), - node:$RHS)>>; + def: Pat<(v1i32 (opnode (v1i32 FPR32:$Rn), (v1i32 FPR32:$Rm))), + (INSTS FPR32:$Rn, FPR32:$Rm)>; } -defm MLA_lane_v1 : NI_2VE_v1_pat<"MLAvve", Neon_mla>; -defm MLS_lane_v1 : NI_2VE_v1_pat<"MLSvve", Neon_mls>; +class Neon_Scalar3Same_cmp_D_size_patterns + : Pat<(v1i64 (opnode (v1i64 FPR64:$Rn), (v1i64 FPR64:$Rm))), + (INSTD FPR64:$Rn, FPR64:$Rm)>; -class NI_2VE_2op size, bits<4> opcode, - string asmop, string ResS, string OpS, string EleOpS, - Operand OpImm, RegisterOperand ResVPR, - RegisterOperand OpVPR, RegisterOperand EleOpVPR> - : NeonI_2VElem { - bits<3> Index; - bits<5> Re; +multiclass Neon_Scalar3Same_HS_size_patterns { + def : Pat<(v1i16 (opnode (v1i16 FPR16:$Rn), (v1i16 FPR16:$Rm))), + (INSTH FPR16:$Rn, FPR16:$Rm)>; + def : Pat<(v1i32 (opnode (v1i32 FPR32:$Rn), (v1i32 FPR32:$Rm))), + (INSTS FPR32:$Rn, FPR32:$Rm)>; } -multiclass NI_2VE_v1_2op opcode, string asmop> -{ - // vector register class for element is always 128-bit to cover the max index - def _2s4s : NI_2VE_2op<0b0, u, 0b10, opcode, asmop, "2s", "2s", "s", - neon_uimm2_bare, VPR64, VPR64, VPR128> { - let Inst{11} = {Index{1}}; - let Inst{21} = {Index{0}}; - let Inst{20-16} = Re; - } +multiclass Neon_Scalar3Same_SD_size_patterns { + def : Pat<(v1f32 (opnode (v1f32 FPR32:$Rn), (v1f32 FPR32:$Rm))), + (INSTS FPR32:$Rn, FPR32:$Rm)>; + def : Pat<(v1f64 (opnode (v1f64 FPR64:$Rn), (v1f64 FPR64:$Rm))), + (INSTD FPR64:$Rn, FPR64:$Rm)>; +} - def _4s4s : NI_2VE_2op<0b1, u, 0b10, opcode, asmop, "4s", "4s", "s", - neon_uimm2_bare, VPR128, VPR128, VPR128> { - let Inst{11} = {Index{1}}; - let Inst{21} = {Index{0}}; - let Inst{20-16} = Re; - } +multiclass Neon_Scalar3Same_cmp_SD_size_patterns { + def : Pat<(v1i32 (opnode (v1f32 FPR32:$Rn), (v1f32 FPR32:$Rm))), + (INSTS FPR32:$Rn, FPR32:$Rm)>; + def : Pat<(v1i64 (opnode (v1f64 FPR64:$Rn), (v1f64 FPR64:$Rm))), + (INSTD FPR64:$Rn, FPR64:$Rm)>; +} - // Index operations on 16-bit(H) elements are restricted to using v0-v15. 
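// (The v0-v15 restriction noted above follows from the encoding: the
// by-element forms on 16-bit data only have a four-bit register field,
// Inst{19-16} = Re{3-0}, and the VPR128Lo operand class enforces it.)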
- def _4h8h : NI_2VE_2op<0b0, u, 0b01, opcode, asmop, "4h", "4h", "h", - neon_uimm3_bare, VPR64, VPR64, VPR128Lo> { - let Inst{11} = {Index{2}}; - let Inst{21} = {Index{1}}; - let Inst{20} = {Index{0}}; - let Inst{19-16} = Re{3-0}; +// Scalar Three Different + +class NeonI_Scalar3Diff_size size, bits<4> opcode, string asmop, + RegisterClass FPRCD, RegisterClass FPRCS> + : NeonI_Scalar3Diff; + +multiclass NeonI_Scalar3Diff_HS_size opcode, string asmop> { + def shh : NeonI_Scalar3Diff_size; + def dss : NeonI_Scalar3Diff_size; +} + +multiclass NeonI_Scalar3Diff_ml_HS_size opcode, string asmop> { + let Constraints = "$Src = $Rd" in { + def shh : NeonI_Scalar3Diff; + def dss : NeonI_Scalar3Diff; } +} - def _8h8h : NI_2VE_2op<0b1, u, 0b01, opcode, asmop, "8h", "8h", "h", - neon_uimm3_bare, VPR128, VPR128, VPR128Lo> { - let Inst{11} = {Index{2}}; - let Inst{21} = {Index{1}}; - let Inst{20} = {Index{0}}; - let Inst{19-16} = Re{3-0}; - } +multiclass Neon_Scalar3Diff_HS_size_patterns { + def : Pat<(v1i32 (opnode (v1i16 FPR16:$Rn), (v1i16 FPR16:$Rm))), + (INSTH FPR16:$Rn, FPR16:$Rm)>; + def : Pat<(v1i64 (opnode (v1i32 FPR32:$Rn), (v1i32 FPR32:$Rm))), + (INSTS FPR32:$Rn, FPR32:$Rm)>; } -defm MULve : NI_2VE_v1_2op<0b0, 0b1000, "mul">; -defm SQDMULHve : NI_2VE_v1_2op<0b0, 0b1100, "sqdmulh">; -defm SQRDMULHve : NI_2VE_v1_2op<0b0, 0b1101, "sqrdmulh">; +multiclass Neon_Scalar3Diff_ml_HS_size_patterns { + def : Pat<(v1i32 (opnode (v1i32 FPR32:$Src), (v1i16 FPR16:$Rn), (v1i16 FPR16:$Rm))), + (INSTH FPR32:$Src, FPR16:$Rn, FPR16:$Rm)>; + def : Pat<(v1i64 (opnode (v1i64 FPR64:$Src), (v1i32 FPR32:$Rn), (v1i32 FPR32:$Rm))), + (INSTS FPR64:$Src, FPR32:$Rn, FPR32:$Rm)>; +} -// Pattern for lane in 128-bit vector -class NI_2VE_mul_laneq - : Pat<(ResTy (op (OpTy OpVPR:$Rn), - (OpTy (coreop (EleOpTy EleOpVPR:$Re), (i64 OpImm:$Index))))), - (INST OpVPR:$Rn, EleOpVPR:$Re, OpImm:$Index)>; +// Scalar Two Registers Miscellaneous -// Pattern for lane in 64-bit vector -class NI_2VE_mul_lane - : Pat<(ResTy (op (OpTy OpVPR:$Rn), - (OpTy (coreop (EleOpTy EleOpVPR:$Re), (i64 OpImm:$Index))))), - (INST OpVPR:$Rn, - (SUBREG_TO_REG (i64 0), EleOpVPR:$Re, sub_64), OpImm:$Index)>; +class NeonI_Scalar2SameMisc_size size, bits<5> opcode, string asmop, + RegisterClass FPRCD, RegisterClass FPRCS> + : NeonI_Scalar2SameMisc; -multiclass NI_2VE_mul_v1_pat -{ - def : NI_2VE_mul_laneq(subop # "_2s4s"), neon_uimm2_bare, - op, VPR64, VPR128, v2i32, v2i32, v4i32, - BinOpFrag<(Neon_vduplane - (Neon_low4S node:$LHS), node:$RHS)>>; +multiclass NeonI_Scalar2SameMisc_SD_size opcode, + string asmop> { + def ss : NeonI_Scalar2SameMisc_size; + def dd : NeonI_Scalar2SameMisc_size; +} - def : NI_2VE_mul_laneq(subop # "_4s4s"), neon_uimm2_bare, - op, VPR128, VPR128, v4i32, v4i32, v4i32, - BinOpFrag<(Neon_vduplane node:$LHS, node:$RHS)>>; +multiclass NeonI_Scalar2SameMisc_D_size opcode, string asmop> { + def dd : NeonI_Scalar2SameMisc_size; +} - def : NI_2VE_mul_laneq(subop # "_4h8h"), neon_uimm3_bare, - op, VPR64, VPR128Lo, v4i16, v4i16, v8i16, - BinOpFrag<(Neon_vduplane - (Neon_low8H node:$LHS), node:$RHS)>>; +multiclass NeonI_Scalar2SameMisc_BHSD_size opcode, string asmop> + : NeonI_Scalar2SameMisc_D_size { + def bb : NeonI_Scalar2SameMisc_size; + def hh : NeonI_Scalar2SameMisc_size; + def ss : NeonI_Scalar2SameMisc_size; +} - def : NI_2VE_mul_laneq(subop # "_8h8h"), neon_uimm3_bare, - op, VPR128, VPR128Lo, v8i16, v8i16, v8i16, - BinOpFrag<(Neon_vduplane node:$LHS, node:$RHS)>>; +class NeonI_Scalar2SameMisc_fcvtxn_D_size opcode, string asmop> + : 
NeonI_Scalar2SameMisc_size; - // Index can only be half of the max value for lane in 64-bit vector +multiclass NeonI_Scalar2SameMisc_narrow_HSD_size opcode, + string asmop> { + def bh : NeonI_Scalar2SameMisc_size; + def hs : NeonI_Scalar2SameMisc_size; + def sd : NeonI_Scalar2SameMisc_size; +} - def : NI_2VE_mul_lane(subop # "_2s4s"), neon_uimm1_bare, - op, VPR64, VPR64, v2i32, v2i32, v2i32, - BinOpFrag<(Neon_vduplane node:$LHS, node:$RHS)>>; +class NeonI_Scalar2SameMisc_accum_size size, bits<5> opcode, + string asmop, RegisterClass FPRC> + : NeonI_Scalar2SameMisc; - def : NI_2VE_mul_lane(subop # "_4s4s"), neon_uimm1_bare, - op, VPR128, VPR64, v4i32, v4i32, v2i32, - BinOpFrag<(Neon_vduplane - (Neon_combine_4S node:$LHS, undef), - node:$RHS)>>; - - def : NI_2VE_mul_lane(subop # "_4h8h"), neon_uimm2_bare, - op, VPR64, VPR64Lo, v4i16, v4i16, v4i16, - BinOpFrag<(Neon_vduplane node:$LHS, node:$RHS)>>; - - def : NI_2VE_mul_lane(subop # "_8h8h"), neon_uimm2_bare, - op, VPR128, VPR64Lo, v8i16, v8i16, v4i16, - BinOpFrag<(Neon_vduplane - (Neon_combine_8H node:$LHS, undef), - node:$RHS)>>; -} - -defm MUL_lane_v1 : NI_2VE_mul_v1_pat<"MULve", mul>; -defm SQDMULH_lane_v1 : NI_2VE_mul_v1_pat<"SQDMULHve", int_arm_neon_vqdmulh>; -defm SQRDMULH_lane_v1 : NI_2VE_mul_v1_pat<"SQRDMULHve", int_arm_neon_vqrdmulh>; - -// Variant 2 - -multiclass NI_2VE_v2_2op opcode, string asmop> -{ - // vector register class for element is always 128-bit to cover the max index - def _2s4s : NI_2VE_2op<0b0, u, 0b10, opcode, asmop, "2s", "2s", "s", - neon_uimm2_bare, VPR64, VPR64, VPR128> { - let Inst{11} = {Index{1}}; - let Inst{21} = {Index{0}}; - let Inst{20-16} = Re; - } - - def _4s4s : NI_2VE_2op<0b1, u, 0b10, opcode, asmop, "4s", "4s", "s", - neon_uimm2_bare, VPR128, VPR128, VPR128> { - let Inst{11} = {Index{1}}; - let Inst{21} = {Index{0}}; - let Inst{20-16} = Re; - } - - // _1d2d doesn't exist! 
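// (Editorial sketch: the *_accum variants below both read and write the
// destination; the tied-operand constraint keeps the accumulator input and
// the result in one register, in the same way it is used elsewhere in this
// file:
//
//   let Constraints = "$Src = $Rd" in
//   def dd : NeonI_Scalar2SameMisc_accum_size<...>;  // D-sized form
// )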
+multiclass NeonI_Scalar2SameMisc_accum_BHSD_size opcode, + string asmop> { - def _2d2d : NI_2VE_2op<0b1, u, 0b11, opcode, asmop, "2d", "2d", "d", - neon_uimm1_bare, VPR128, VPR128, VPR128> { - let Inst{11} = {Index{0}}; - let Inst{21} = 0b0; - let Inst{20-16} = Re; + let Constraints = "$Src = $Rd" in { + def bb : NeonI_Scalar2SameMisc_accum_size; + def hh : NeonI_Scalar2SameMisc_accum_size; + def ss : NeonI_Scalar2SameMisc_accum_size; + def dd : NeonI_Scalar2SameMisc_accum_size; } } -defm FMULve : NI_2VE_v2_2op<0b0, 0b1001, "fmul">; -defm FMULXve : NI_2VE_v2_2op<0b1, 0b1001, "fmulx">; +class Neon_Scalar2SameMisc_fcvtxn_D_size_patterns + : Pat<(v1f32 (opnode (v1f64 FPR64:$Rn))), + (INSTD FPR64:$Rn)>; -class NI_2VE_mul_lane_2d - : Pat<(ResTy (op (OpTy OpVPR:$Rn), - (OpTy (coreop (EleOpTy EleOpVPR:$Re), (EleOpTy EleOpVPR:$Re))))), - (INST OpVPR:$Rn, - (SUBREG_TO_REG (i64 0), EleOpVPR:$Re, sub_64), 0)>; +multiclass Neon_Scalar2SameMisc_fcvt_SD_size_patterns { + def : Pat<(v1i32 (opnode (v1f32 FPR32:$Rn))), + (INSTS FPR32:$Rn)>; + def : Pat<(v1i64 (opnode (v1f64 FPR64:$Rn))), + (INSTD FPR64:$Rn)>; +} -multiclass NI_2VE_mul_v2_pat -{ - def : NI_2VE_mul_laneq(subop # "_2s4s"), neon_uimm2_bare, - op, VPR64, VPR128, v2f32, v2f32, v4f32, - BinOpFrag<(Neon_vduplane - (Neon_low4f node:$LHS), node:$RHS)>>; +multiclass Neon_Scalar2SameMisc_cvt_SD_size_patterns { + def : Pat<(f32 (Sopnode (v1i32 FPR32:$Rn))), + (INSTS FPR32:$Rn)>; + def : Pat<(f64 (Dopnode (v1i64 FPR64:$Rn))), + (INSTD FPR64:$Rn)>; +} - def : NI_2VE_mul_laneq(subop # "_4s4s"), neon_uimm2_bare, - op, VPR128, VPR128, v4f32, v4f32, v4f32, - BinOpFrag<(Neon_vduplane node:$LHS, node:$RHS)>>; +multiclass Neon_Scalar2SameMisc_SD_size_patterns { + def : Pat<(v1f32 (opnode (v1f32 FPR32:$Rn))), + (INSTS FPR32:$Rn)>; + def : Pat<(v1f64 (opnode (v1f64 FPR64:$Rn))), + (INSTD FPR64:$Rn)>; +} - def : NI_2VE_mul_laneq(subop # "_2d2d"), neon_uimm1_bare, - op, VPR128, VPR128, v2f64, v2f64, v2f64, - BinOpFrag<(Neon_vduplane node:$LHS, node:$RHS)>>; +class NeonI_Scalar2SameMisc_cmpz_D_size opcode, string asmop> + : NeonI_Scalar2SameMisc; + +multiclass NeonI_Scalar2SameMisc_cmpz_SD_size opcode, + string asmop> { + def ssi : NeonI_Scalar2SameMisc; + def ddi : NeonI_Scalar2SameMisc; +} + +class Neon_Scalar2SameMisc_cmpz_D_size_patterns + : Pat<(v1i64 (opnode (v1i64 FPR64:$Rn), + (v1i64 (bitconvert (v8i8 Neon_AllZero))))), + (INSTD FPR64:$Rn, 0)>; + +multiclass Neon_Scalar2SameMisc_cmpz_SD_size_patterns { + def : Pat<(v1i32 (opnode (v1f32 FPR32:$Rn), + (v1f32 (scalar_to_vector (f32 fpimm:$FPImm))))), + (INSTS FPR32:$Rn, fpimm:$FPImm)>; + def : Pat<(v1i64 (opnode (v1f64 FPR64:$Rn), + (v1f64 (bitconvert (v8i8 Neon_AllZero))))), + (INSTD FPR64:$Rn, 0)>; +} + +multiclass Neon_Scalar2SameMisc_D_size_patterns { + def : Pat<(v1i64 (opnode (v1i64 FPR64:$Rn))), + (INSTD FPR64:$Rn)>; +} - // Index can only be half of the max value for lane in 64-bit vector +multiclass Neon_Scalar2SameMisc_BHSD_size_patterns + : Neon_Scalar2SameMisc_D_size_patterns { + def : Pat<(v1i8 (opnode (v1i8 FPR8:$Rn))), + (INSTB FPR8:$Rn)>; + def : Pat<(v1i16 (opnode (v1i16 FPR16:$Rn))), + (INSTH FPR16:$Rn)>; + def : Pat<(v1i32 (opnode (v1i32 FPR32:$Rn))), + (INSTS FPR32:$Rn)>; +} - def : NI_2VE_mul_lane(subop # "_2s4s"), neon_uimm1_bare, - op, VPR64, VPR64, v2f32, v2f32, v2f32, - BinOpFrag<(Neon_vduplane node:$LHS, node:$RHS)>>; +multiclass Neon_Scalar2SameMisc_narrow_HSD_size_patterns< + SDPatternOperator opnode, + Instruction INSTH, + Instruction INSTS, + Instruction INSTD> { + def : Pat<(v1i8 
(opnode (v1i16 FPR16:$Rn))), + (INSTH FPR16:$Rn)>; + def : Pat<(v1i16 (opnode (v1i32 FPR32:$Rn))), + (INSTS FPR32:$Rn)>; + def : Pat<(v1i32 (opnode (v1i64 FPR64:$Rn))), + (INSTD FPR64:$Rn)>; - def : NI_2VE_mul_lane(subop # "_4s4s"), neon_uimm1_bare, - op, VPR128, VPR64, v4f32, v4f32, v2f32, - BinOpFrag<(Neon_vduplane - (Neon_combine_4f node:$LHS, undef), - node:$RHS)>>; +} - def : NI_2VE_mul_lane_2d(subop # "_2d2d"), neon_uimm1_bare, - op, VPR128, VPR64, v2f64, v2f64, v1f64, - BinOpFrag<(Neon_combine_2d node:$LHS, node:$RHS)>>; +multiclass Neon_Scalar2SameMisc_accum_BHSD_size_patterns< + SDPatternOperator opnode, + Instruction INSTB, + Instruction INSTH, + Instruction INSTS, + Instruction INSTD> { + def : Pat<(v1i8 (opnode (v1i8 FPR8:$Src), (v1i8 FPR8:$Rn))), + (INSTB FPR8:$Src, FPR8:$Rn)>; + def : Pat<(v1i16 (opnode (v1i16 FPR16:$Src), (v1i16 FPR16:$Rn))), + (INSTH FPR16:$Src, FPR16:$Rn)>; + def : Pat<(v1i32 (opnode (v1i32 FPR32:$Src), (v1i32 FPR32:$Rn))), + (INSTS FPR32:$Src, FPR32:$Rn)>; + def : Pat<(v1i64 (opnode (v1i64 FPR64:$Src), (v1i64 FPR64:$Rn))), + (INSTD FPR64:$Src, FPR64:$Rn)>; +} + +// Scalar Shift By Immediate + +class NeonI_ScalarShiftImm_size opcode, string asmop, + RegisterClass FPRC, Operand ImmTy> + : NeonI_ScalarShiftImm; + +multiclass NeonI_ScalarShiftRightImm_D_size opcode, + string asmop> { + def ddi : NeonI_ScalarShiftImm_size { + bits<6> Imm; + let Inst{22} = 0b1; // immh:immb = 1xxxxxx + let Inst{21-16} = Imm; + } } -defm FMUL_lane_v2 : NI_2VE_mul_v2_pat<"FMULve", fmul>; -defm FMULX_lane_v2 : NI_2VE_mul_v2_pat<"FMULXve", int_aarch64_neon_vmulx>; +multiclass NeonI_ScalarShiftRightImm_BHSD_size opcode, + string asmop> + : NeonI_ScalarShiftRightImm_D_size { + def bbi : NeonI_ScalarShiftImm_size { + bits<3> Imm; + let Inst{22-19} = 0b0001; // immh:immb = 0001xxx + let Inst{18-16} = Imm; + } + def hhi : NeonI_ScalarShiftImm_size { + bits<4> Imm; + let Inst{22-20} = 0b001; // immh:immb = 001xxxx + let Inst{19-16} = Imm; + } + def ssi : NeonI_ScalarShiftImm_size { + bits<5> Imm; + let Inst{22-21} = 0b01; // immh:immb = 01xxxxx + let Inst{20-16} = Imm; + } +} -// The followings are patterns using fma -// -ffp-contract=fast generates fma +multiclass NeonI_ScalarShiftLeftImm_D_size opcode, + string asmop> { + def ddi : NeonI_ScalarShiftImm_size { + bits<6> Imm; + let Inst{22} = 0b1; // immh:immb = 1xxxxxx + let Inst{21-16} = Imm; + } +} -multiclass NI_2VE_v2 opcode, string asmop> -{ - // vector register class for element is always 128-bit to cover the max index - def _2s4s : NI_2VE<0b0, u, 0b10, opcode, asmop, "2s", "2s", "s", - neon_uimm2_bare, VPR64, VPR64, VPR128> { - let Inst{11} = {Index{1}}; - let Inst{21} = {Index{0}}; - let Inst{20-16} = Re; +multiclass NeonI_ScalarShiftLeftImm_BHSD_size opcode, + string asmop> + : NeonI_ScalarShiftLeftImm_D_size { + def bbi : NeonI_ScalarShiftImm_size { + bits<3> Imm; + let Inst{22-19} = 0b0001; // immh:immb = 0001xxx + let Inst{18-16} = Imm; + } + def hhi : NeonI_ScalarShiftImm_size { + bits<4> Imm; + let Inst{22-20} = 0b001; // immh:immb = 001xxxx + let Inst{19-16} = Imm; + } + def ssi : NeonI_ScalarShiftImm_size { + bits<5> Imm; + let Inst{22-21} = 0b01; // immh:immb = 01xxxxx + let Inst{20-16} = Imm; } +} - def _4s4s : NI_2VE<0b1, u, 0b10, opcode, asmop, "4s", "4s", "s", - neon_uimm2_bare, VPR128, VPR128, VPR128> { - let Inst{11} = {Index{1}}; - let Inst{21} = {Index{0}}; - let Inst{20-16} = Re; +class NeonI_ScalarShiftRightImm_accum_D_size opcode, string asmop> + : NeonI_ScalarShiftImm { + bits<6> Imm; + let Inst{22} = 0b1; 
// immh:immb = 1xxxxxx + let Inst{21-16} = Imm; + let Constraints = "$Src = $Rd"; +} + +class NeonI_ScalarShiftLeftImm_accum_D_size opcode, string asmop> + : NeonI_ScalarShiftImm { + bits<6> Imm; + let Inst{22} = 0b1; // immh:immb = 1xxxxxx + let Inst{21-16} = Imm; + let Constraints = "$Src = $Rd"; +} + +class NeonI_ScalarShiftImm_narrow_size opcode, string asmop, + RegisterClass FPRCD, RegisterClass FPRCS, + Operand ImmTy> + : NeonI_ScalarShiftImm; + +multiclass NeonI_ScalarShiftImm_narrow_HSD_size opcode, + string asmop> { + def bhi : NeonI_ScalarShiftImm_narrow_size { + bits<3> Imm; + let Inst{22-19} = 0b0001; // immh:immb = 0001xxx + let Inst{18-16} = Imm; + } + def hsi : NeonI_ScalarShiftImm_narrow_size { + bits<4> Imm; + let Inst{22-20} = 0b001; // immh:immb = 001xxxx + let Inst{19-16} = Imm; + } + def sdi : NeonI_ScalarShiftImm_narrow_size { + bits<5> Imm; + let Inst{22-21} = 0b01; // immh:immb = 01xxxxx + let Inst{20-16} = Imm; } +} - // _1d2d doesn't exist! - - def _2d2d : NI_2VE<0b1, u, 0b11, opcode, asmop, "2d", "2d", "d", - neon_uimm1_bare, VPR128, VPR128, VPR128> { - let Inst{11} = {Index{0}}; - let Inst{21} = 0b0; - let Inst{20-16} = Re; +multiclass NeonI_ScalarShiftImm_cvt_SD_size opcode, string asmop> { + def ssi : NeonI_ScalarShiftImm_size { + bits<5> Imm; + let Inst{22-21} = 0b01; // immh:immb = 01xxxxx + let Inst{20-16} = Imm; + } + def ddi : NeonI_ScalarShiftImm_size { + bits<6> Imm; + let Inst{22} = 0b1; // immh:immb = 1xxxxxx + let Inst{21-16} = Imm; } } -defm FMLAvve : NI_2VE_v2<0b0, 0b0001, "fmla">; -defm FMLSvve : NI_2VE_v2<0b0, 0b0101, "fmls">; +multiclass Neon_ScalarShiftRImm_D_size_patterns { + def ddi : Pat<(v1i64 (opnode (v1i64 FPR64:$Rn), (i32 shr_imm64:$Imm))), + (INSTD FPR64:$Rn, imm:$Imm)>; +} + +multiclass Neon_ScalarShiftLImm_D_size_patterns { + def ddi : Pat<(v1i64 (opnode (v1i64 FPR64:$Rn), (i32 shl_imm64:$Imm))), + (INSTD FPR64:$Rn, imm:$Imm)>; +} + +class Neon_ScalarShiftImm_arm_D_size_patterns + : Pat<(v1i64 (opnode (v1i64 FPR64:$Rn), + (v1i64 (Neon_vdup (i32 shr_imm64:$Imm))))), + (INSTD FPR64:$Rn, imm:$Imm)>; + +multiclass Neon_ScalarShiftLImm_BHSD_size_patterns + : Neon_ScalarShiftLImm_D_size_patterns { + def bbi : Pat<(v1i8 (opnode (v1i8 FPR8:$Rn), (i32 shl_imm8:$Imm))), + (INSTB FPR8:$Rn, imm:$Imm)>; + def hhi : Pat<(v1i16 (opnode (v1i16 FPR16:$Rn), (i32 shl_imm16:$Imm))), + (INSTH FPR16:$Rn, imm:$Imm)>; + def ssi : Pat<(v1i32 (opnode (v1i32 FPR32:$Rn), (i32 shl_imm32:$Imm))), + (INSTS FPR32:$Rn, imm:$Imm)>; +} + +class Neon_ScalarShiftLImm_accum_D_size_patterns + : Pat<(v1i64 (opnode (v1i64 FPR64:$Src), (v1i64 FPR64:$Rn), + (i32 shl_imm64:$Imm))), + (INSTD FPR64:$Src, FPR64:$Rn, imm:$Imm)>; + +class Neon_ScalarShiftRImm_accum_D_size_patterns + : Pat<(v1i64 (opnode (v1i64 FPR64:$Src), (v1i64 FPR64:$Rn), + (i32 shr_imm64:$Imm))), + (INSTD FPR64:$Src, FPR64:$Rn, imm:$Imm)>; + +multiclass Neon_ScalarShiftImm_narrow_HSD_size_patterns< + SDPatternOperator opnode, + Instruction INSTH, + Instruction INSTS, + Instruction INSTD> { + def bhi : Pat<(v1i8 (opnode (v1i16 FPR16:$Rn), (i32 shr_imm16:$Imm))), + (INSTH FPR16:$Rn, imm:$Imm)>; + def hsi : Pat<(v1i16 (opnode (v1i32 FPR32:$Rn), (i32 shr_imm32:$Imm))), + (INSTS FPR32:$Rn, imm:$Imm)>; + def sdi : Pat<(v1i32 (opnode (v1i64 FPR64:$Rn), (i32 shr_imm64:$Imm))), + (INSTD FPR64:$Rn, imm:$Imm)>; +} + +multiclass Neon_ScalarShiftImm_scvtf_SD_size_patterns { + def ssi : Pat<(f32 (Sopnode (v1i32 FPR32:$Rn), (i32 shr_imm32:$Imm))), + (INSTS FPR32:$Rn, imm:$Imm)>; + def ddi : Pat<(f64 (Dopnode (v1i64 FPR64:$Rn), 
(i32 shr_imm64:$Imm))),
+                (INSTD FPR64:$Rn, imm:$Imm)>;
+}
+
+multiclass Neon_ScalarShiftImm_fcvts_SD_size_patterns {
+  def ssi : Pat<(v1i32 (Sopnode (v1f32 FPR32:$Rn), (i32 shr_imm32:$Imm))),
+                (INSTS FPR32:$Rn, imm:$Imm)>;
+  def ddi : Pat<(v1i64 (Dopnode (v1f64 FPR64:$Rn), (i32 shr_imm64:$Imm))),
+                (INSTD FPR64:$Rn, imm:$Imm)>;
+}
+
+// Scalar Signed Shift Right (Immediate)
+defm SSHR : NeonI_ScalarShiftRightImm_D_size<0b0, 0b00000, "sshr">;
+defm : Neon_ScalarShiftRImm_D_size_patterns;
+// Pattern to match llvm.arm.* intrinsic.
+def : Neon_ScalarShiftImm_arm_D_size_patterns;
+
+// Scalar Unsigned Shift Right (Immediate)
+defm USHR : NeonI_ScalarShiftRightImm_D_size<0b1, 0b00000, "ushr">;
+defm : Neon_ScalarShiftRImm_D_size_patterns;
+// Pattern to match llvm.arm.* intrinsic.
+def : Neon_ScalarShiftImm_arm_D_size_patterns;
+
+// Scalar Signed Rounding Shift Right (Immediate)
+defm SRSHR : NeonI_ScalarShiftRightImm_D_size<0b0, 0b00100, "srshr">;
+defm : Neon_ScalarShiftRImm_D_size_patterns;
+
+// Scalar Unsigned Rounding Shift Right (Immediate)
+defm URSHR : NeonI_ScalarShiftRightImm_D_size<0b1, 0b00100, "urshr">;
+defm : Neon_ScalarShiftRImm_D_size_patterns;
+
+// Scalar Signed Shift Right and Accumulate (Immediate)
+def SSRA : NeonI_ScalarShiftRightImm_accum_D_size<0b0, 0b00010, "ssra">;
+def : Neon_ScalarShiftRImm_accum_D_size_patterns
+        ;
+
+// Scalar Unsigned Shift Right and Accumulate (Immediate)
+def USRA : NeonI_ScalarShiftRightImm_accum_D_size<0b1, 0b00010, "usra">;
+def : Neon_ScalarShiftRImm_accum_D_size_patterns
+        ;
+
+// Scalar Signed Rounding Shift Right and Accumulate (Immediate)
+def SRSRA : NeonI_ScalarShiftRightImm_accum_D_size<0b0, 0b00110, "srsra">;
+def : Neon_ScalarShiftRImm_accum_D_size_patterns
+        ;
+
+// Scalar Unsigned Rounding Shift Right and Accumulate (Immediate)
+def URSRA : NeonI_ScalarShiftRightImm_accum_D_size<0b1, 0b00110, "ursra">;
+def : Neon_ScalarShiftRImm_accum_D_size_patterns
+        ;
+
+// Scalar Shift Left (Immediate)
+defm SHL : NeonI_ScalarShiftLeftImm_D_size<0b0, 0b01010, "shl">;
+defm : Neon_ScalarShiftLImm_D_size_patterns;
+// Pattern to match llvm.arm.* intrinsic.
+def : Neon_ScalarShiftImm_arm_D_size_patterns;
+
+// Signed Saturating Shift Left (Immediate)
+defm SQSHL : NeonI_ScalarShiftLeftImm_BHSD_size<0b0, 0b01110, "sqshl">;
+defm : Neon_ScalarShiftLImm_BHSD_size_patterns;
+// Pattern to match llvm.arm.* intrinsic.
+defm : Neon_ScalarShiftLImm_D_size_patterns;
+
+// Unsigned Saturating Shift Left (Immediate)
+defm UQSHL : NeonI_ScalarShiftLeftImm_BHSD_size<0b1, 0b01110, "uqshl">;
+defm : Neon_ScalarShiftLImm_BHSD_size_patterns;
+// Pattern to match llvm.arm.* intrinsic.
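// (Encoding note for the shift-immediate classes above: the leading one-bit
// of immh selects the element size, and the remaining immh:immb bits carry
// the shift field, as the inline comments indicate:
//
//   immh:immb = 0001xxx  -> B, 3-bit shift field in Inst{18-16}
//   immh:immb = 001xxxx  -> H, 4-bit shift field in Inst{19-16}
//   immh:immb = 01xxxxx  -> S, 5-bit shift field in Inst{20-16}
//   immh:immb = 1xxxxxx  -> D, 6-bit shift field in Inst{21-16}
// )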
+defm : Neon_ScalarShiftLImm_D_size_patterns; + +// Signed Saturating Shift Left Unsigned (Immediate) +defm SQSHLU : NeonI_ScalarShiftLeftImm_BHSD_size<0b1, 0b01100, "sqshlu">; +defm : Neon_ScalarShiftLImm_BHSD_size_patterns; + +// Shift Right And Insert (Immediate) +def SRI : NeonI_ScalarShiftRightImm_accum_D_size<0b1, 0b01000, "sri">; +def : Neon_ScalarShiftRImm_accum_D_size_patterns + ; + +// Shift Left And Insert (Immediate) +def SLI : NeonI_ScalarShiftLeftImm_accum_D_size<0b1, 0b01010, "sli">; +def : Neon_ScalarShiftLImm_accum_D_size_patterns + ; + +// Signed Saturating Shift Right Narrow (Immediate) +defm SQSHRN : NeonI_ScalarShiftImm_narrow_HSD_size<0b0, 0b10010, "sqshrn">; +defm : Neon_ScalarShiftImm_narrow_HSD_size_patterns; + +// Unsigned Saturating Shift Right Narrow (Immediate) +defm UQSHRN : NeonI_ScalarShiftImm_narrow_HSD_size<0b1, 0b10010, "uqshrn">; +defm : Neon_ScalarShiftImm_narrow_HSD_size_patterns; + +// Signed Saturating Rounded Shift Right Narrow (Immediate) +defm SQRSHRN : NeonI_ScalarShiftImm_narrow_HSD_size<0b0, 0b10011, "sqrshrn">; +defm : Neon_ScalarShiftImm_narrow_HSD_size_patterns; + +// Unsigned Saturating Rounded Shift Right Narrow (Immediate) +defm UQRSHRN : NeonI_ScalarShiftImm_narrow_HSD_size<0b1, 0b10011, "uqrshrn">; +defm : Neon_ScalarShiftImm_narrow_HSD_size_patterns; + +// Signed Saturating Shift Right Unsigned Narrow (Immediate) +defm SQSHRUN : NeonI_ScalarShiftImm_narrow_HSD_size<0b1, 0b10000, "sqshrun">; +defm : Neon_ScalarShiftImm_narrow_HSD_size_patterns; + +// Signed Saturating Rounded Shift Right Unsigned Narrow (Immediate) +defm SQRSHRUN : NeonI_ScalarShiftImm_narrow_HSD_size<0b1, 0b10001, "sqrshrun">; +defm : Neon_ScalarShiftImm_narrow_HSD_size_patterns; + +// Scalar Signed Fixed-point Convert To Floating-Point (Immediate) +defm SCVTF_N : NeonI_ScalarShiftImm_cvt_SD_size<0b0, 0b11100, "scvtf">; +defm : Neon_ScalarShiftImm_scvtf_SD_size_patterns; + +// Scalar Unsigned Fixed-point Convert To Floating-Point (Immediate) +defm UCVTF_N : NeonI_ScalarShiftImm_cvt_SD_size<0b1, 0b11100, "ucvtf">; +defm : Neon_ScalarShiftImm_scvtf_SD_size_patterns; + +// Scalar Floating-point Convert To Signed Fixed-point (Immediate) +defm FCVTZS_N : NeonI_ScalarShiftImm_cvt_SD_size<0b0, 0b11111, "fcvtzs">; +defm : Neon_ScalarShiftImm_fcvts_SD_size_patterns; + +// Scalar Floating-point Convert To Unsigned Fixed-point (Immediate) +defm FCVTZU_N : NeonI_ScalarShiftImm_cvt_SD_size<0b1, 0b11111, "fcvtzu">; +defm : Neon_ScalarShiftImm_fcvts_SD_size_patterns; + +// Patterns For Convert Instructions Between v1f64 and v1i64 +class Neon_ScalarShiftImm_cvtf_v1f64_pattern + : Pat<(v1f64 (opnode (v1i64 FPR64:$Rn), (i32 shr_imm64:$Imm))), + (INST FPR64:$Rn, imm:$Imm)>; + +class Neon_ScalarShiftImm_fcvt_v1f64_pattern + : Pat<(v1i64 (opnode (v1f64 FPR64:$Rn), (i32 shr_imm64:$Imm))), + (INST FPR64:$Rn, imm:$Imm)>; + +def : Neon_ScalarShiftImm_cvtf_v1f64_pattern; + +def : Neon_ScalarShiftImm_cvtf_v1f64_pattern; + +def : Neon_ScalarShiftImm_fcvt_v1f64_pattern; + +def : Neon_ScalarShiftImm_fcvt_v1f64_pattern; -// Pattern for lane in 128-bit vector -class NI_2VEswap_laneq - : Pat<(ResTy (op (ResTy (coreop (OpTy OpVPR:$Re), (i64 OpImm:$Index))), - (ResTy ResVPR:$src), (ResTy ResVPR:$Rn))), - (INST ResVPR:$src, ResVPR:$Rn, OpVPR:$Re, OpImm:$Index)>; +// Scalar Integer Add +let isCommutable = 1 in { +def ADDddd : NeonI_Scalar3Same_D_size<0b0, 0b10000, "add">; +} -// Pattern for lane in 64-bit vector -class NI_2VEswap_lane - : Pat<(ResTy (op (ResTy (coreop (OpTy OpVPR:$Re), (i64 
OpImm:$Index))), - (ResTy ResVPR:$Rn), (ResTy ResVPR:$src))), - (INST ResVPR:$src, ResVPR:$Rn, - (SUBREG_TO_REG (i64 0), OpVPR:$Re, sub_64), OpImm:$Index)>; +// Scalar Integer Sub +def SUBddd : NeonI_Scalar3Same_D_size<0b1, 0b10000, "sub">; -// Pattern for lane in 64-bit vector -class NI_2VEswap_lane_2d2d - : Pat<(ResTy (op (ResTy (coreop (OpTy OpVPR:$Re), (OpTy OpVPR:$Re))), - (ResTy ResVPR:$Rn), (ResTy ResVPR:$src))), - (INST ResVPR:$src, ResVPR:$Rn, - (SUBREG_TO_REG (i64 0), OpVPR:$Re, sub_64), 0)>; +// Pattern for Scalar Integer Add and Sub with D register only +defm : Neon_Scalar3Same_D_size_patterns; +defm : Neon_Scalar3Same_D_size_patterns; +// Patterns to match llvm.aarch64.* intrinsic for Scalar Add, Sub +defm : Neon_Scalar3Same_D_size_patterns; +defm : Neon_Scalar3Same_D_size_patterns; +defm : Neon_Scalar3Same_D_size_patterns; +defm : Neon_Scalar3Same_D_size_patterns; -multiclass NI_2VE_fma_v2_pat -{ - def : NI_2VEswap_laneq(subop # "_2s4s"), - neon_uimm2_bare, op, VPR64, VPR128, v2f32, v4f32, - BinOpFrag<(Neon_vduplane - (Neon_low4f node:$LHS), node:$RHS)>>; +// Scalar Integer Saturating Add (Signed, Unsigned) +defm SQADD : NeonI_Scalar3Same_BHSD_sizes<0b0, 0b00001, "sqadd", 1>; +defm UQADD : NeonI_Scalar3Same_BHSD_sizes<0b1, 0b00001, "uqadd", 1>; - def : NI_2VEswap_laneq(subop # "_4s4s"), - neon_uimm2_bare, op, VPR128, VPR128, v4f32, v4f32, - BinOpFrag<(Neon_vduplane node:$LHS, node:$RHS)>>; +// Scalar Integer Saturating Sub (Signed, Unsigned) +defm SQSUB : NeonI_Scalar3Same_BHSD_sizes<0b0, 0b00101, "sqsub", 0>; +defm UQSUB : NeonI_Scalar3Same_BHSD_sizes<0b1, 0b00101, "uqsub", 0>; - def : NI_2VEswap_laneq(subop # "_2d2d"), - neon_uimm1_bare, op, VPR128, VPR128, v2f64, v2f64, - BinOpFrag<(Neon_vduplane node:$LHS, node:$RHS)>>; - // Index can only be half of the max value for lane in 64-bit vector +// Patterns to match llvm.aarch64.* intrinsic for +// Scalar Integer Saturating Add, Sub (Signed, Unsigned) +defm : Neon_Scalar3Same_BHSD_size_patterns; +defm : Neon_Scalar3Same_BHSD_size_patterns; +defm : Neon_Scalar3Same_BHSD_size_patterns; +defm : Neon_Scalar3Same_BHSD_size_patterns; - def : NI_2VEswap_lane(subop # "_2s4s"), - neon_uimm1_bare, op, VPR64, VPR64, v2f32, v2f32, - BinOpFrag<(Neon_vduplane node:$LHS, node:$RHS)>>; +// Scalar Integer Saturating Doubling Multiply Half High +defm SQDMULH : NeonI_Scalar3Same_HS_sizes<0b0, 0b10110, "sqdmulh", 1>; - def : NI_2VEswap_lane(subop # "_4s4s"), - neon_uimm1_bare, op, VPR128, VPR64, v4f32, v2f32, - BinOpFrag<(Neon_vduplane - (Neon_combine_4f node:$LHS, undef), - node:$RHS)>>; +// Scalar Integer Saturating Rounding Doubling Multiply Half High +defm SQRDMULH : NeonI_Scalar3Same_HS_sizes<0b1, 0b10110, "sqrdmulh", 1>; - def : NI_2VEswap_lane_2d2d(subop # "_2d2d"), - neon_uimm1_bare, op, VPR128, VPR64, v2f64, v1f64, - BinOpFrag<(Neon_combine_2d node:$LHS, node:$RHS)>>; -} +// Patterns to match llvm.arm.* intrinsic for +// Scalar Integer Saturating Doubling Multiply Half High and +// Scalar Integer Saturating Rounding Doubling Multiply Half High +defm : Neon_Scalar3Same_HS_size_patterns; +defm : Neon_Scalar3Same_HS_size_patterns; -defm FMLA_lane_v2_s : NI_2VE_fma_v2_pat<"FMLAvve", fma>; +// Scalar Floating-point Multiply Extended +defm FMULX : NeonI_Scalar3Same_SD_sizes<0b0, 0b0, 0b11011, "fmulx", 1>; -multiclass NI_2VE_fms_v2_pat -{ - def : NI_2VEswap_laneq(subop # "_2s4s"), - neon_uimm2_bare, op, VPR64, VPR128, v2f32, v4f32, - BinOpFrag<(fneg (Neon_vduplane - (Neon_low4f node:$LHS), node:$RHS))>>; +// Scalar Floating-point Reciprocal 
Step
+defm FRECPS : NeonI_Scalar3Same_SD_sizes<0b0, 0b0, 0b11111, "frecps", 0>;
-  def : NI_2VEswap_laneq(subop # "_2s4s"),
-                         neon_uimm2_bare, op, VPR64, VPR128, v2f32, v4f32,
-                         BinOpFrag<(Neon_vduplane
-                                     (Neon_low4f (fneg node:$LHS)),
-                                     node:$RHS)>>;
+// Scalar Floating-point Reciprocal Square Root Step
+defm FRSQRTS : NeonI_Scalar3Same_SD_sizes<0b0, 0b1, 0b11111, "frsqrts", 0>;
-  def : NI_2VEswap_laneq(subop # "_4s4s"),
-                         neon_uimm2_bare, op, VPR128, VPR128, v4f32, v4f32,
-                         BinOpFrag<(fneg (Neon_vduplane
-                                     node:$LHS, node:$RHS))>>;
+// Patterns to match llvm.arm.* intrinsic for
+// Scalar Floating-point Reciprocal Step and
+// Scalar Floating-point Reciprocal Square Root Step
+defm : Neon_Scalar3Same_SD_size_patterns;
+defm : Neon_Scalar3Same_SD_size_patterns;
-  def : NI_2VEswap_laneq(subop # "_4s4s"),
-                         neon_uimm2_bare, op, VPR128, VPR128, v4f32, v4f32,
-                         BinOpFrag<(Neon_vduplane
-                                     (fneg node:$LHS), node:$RHS)>>;
+def : Pat<(v1f64 (fsqrt (v1f64 FPR64:$Rn))), (FSQRTdd FPR64:$Rn)>;
-  def : NI_2VEswap_laneq(subop # "_2d2d"),
-                         neon_uimm1_bare, op, VPR128, VPR128, v2f64, v2f64,
-                         BinOpFrag<(fneg (Neon_vduplane
-                                     node:$LHS, node:$RHS))>>;
+// Patterns to match llvm.aarch64.* intrinsic for
+// Scalar Floating-point Multiply Extended,
+multiclass Neon_Scalar3Same_MULX_SD_size_patterns {
+  def : Pat<(f32 (opnode (f32 FPR32:$Rn), (f32 FPR32:$Rm))),
+            (INSTS FPR32:$Rn, FPR32:$Rm)>;
+  def : Pat<(f64 (opnode (f64 FPR64:$Rn), (f64 FPR64:$Rm))),
+            (INSTD FPR64:$Rn, FPR64:$Rm)>;
+}
-  def : NI_2VEswap_laneq(subop # "_2d2d"),
-                         neon_uimm1_bare, op, VPR128, VPR128, v2f64, v2f64,
-                         BinOpFrag<(Neon_vduplane
-                                     (fneg node:$LHS), node:$RHS)>>;
+defm : Neon_Scalar3Same_MULX_SD_size_patterns;
-  // Index can only be half of the max value for lane in 64-bit vector
+// Scalar Integer Shift Left (Signed, Unsigned)
+def SSHLddd : NeonI_Scalar3Same_D_size<0b0, 0b01000, "sshl">;
+def USHLddd : NeonI_Scalar3Same_D_size<0b1, 0b01000, "ushl">;
-  def : NI_2VEswap_lane(subop # "_2s4s"),
-                        neon_uimm1_bare, op, VPR64, VPR64, v2f32, v2f32,
-                        BinOpFrag<(fneg (Neon_vduplane
-                                    node:$LHS, node:$RHS))>>;
+// Patterns to match llvm.arm.* intrinsic for
+// Scalar Integer Shift Left (Signed, Unsigned)
+defm : Neon_Scalar3Same_D_size_patterns;
+defm : Neon_Scalar3Same_D_size_patterns;
+
+// Patterns to match llvm.aarch64.* intrinsic for
+// Scalar Integer Shift Left (Signed, Unsigned)
+defm : Neon_Scalar3Same_D_size_patterns;
+defm : Neon_Scalar3Same_D_size_patterns;
+
+// Scalar Integer Saturating Shift Left (Signed, Unsigned)
+defm SQSHL: NeonI_Scalar3Same_BHSD_sizes<0b0, 0b01001, "sqshl", 0>;
+defm UQSHL: NeonI_Scalar3Same_BHSD_sizes<0b1, 0b01001, "uqshl", 0>;
+
+// Patterns to match llvm.aarch64.* intrinsic for
+// Scalar Integer Saturating Shift Left (Signed, Unsigned)
+defm : Neon_Scalar3Same_BHSD_size_patterns;
+defm : Neon_Scalar3Same_BHSD_size_patterns;
+
+// Patterns to match llvm.arm.* intrinsic for
+// Scalar Integer Saturating Shift Left (Signed, Unsigned)
+defm : Neon_Scalar3Same_D_size_patterns;
+defm : Neon_Scalar3Same_D_size_patterns;
+
+// Scalar Integer Rounding Shift Left (Signed, Unsigned)
+def SRSHLddd: NeonI_Scalar3Same_D_size<0b0, 0b01010, "srshl">;
+def URSHLddd: NeonI_Scalar3Same_D_size<0b1, 0b01010, "urshl">;
+
+// Patterns to match llvm.aarch64.* intrinsic for
+// Scalar Integer Rounding Shift Left (Signed, Unsigned)
+defm : Neon_Scalar3Same_D_size_patterns;
+defm : Neon_Scalar3Same_D_size_patterns;
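+
+// For reference, each pattern multiclass above is instantiated with one
+// intrinsic and the matching D-register instruction. An illustrative
+// instantiation (reconstructed sketch; the argument lists are not verbatim
+// from this patch) has the shape:
+//   defm : Neon_Scalar3Same_D_size_patterns<int_arm_neon_vrshifts, SRSHLddd>;
+//   defm : Neon_Scalar3Same_D_size_patterns<int_arm_neon_vrshiftu, URSHLddd>;
+
+// Patterns to match llvm.arm.* intrinsic for
+// Scalar Integer Rounding Shift Left (Signed, Unsigned)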
+defm : Neon_Scalar3Same_D_size_patterns; +defm : Neon_Scalar3Same_D_size_patterns; + +// Scalar Integer Saturating Rounding Shift Left (Signed, Unsigned) +defm SQRSHL: NeonI_Scalar3Same_BHSD_sizes<0b0, 0b01011, "sqrshl", 0>; +defm UQRSHL: NeonI_Scalar3Same_BHSD_sizes<0b1, 0b01011, "uqrshl", 0>; + +// Patterns to match llvm.aarch64.* intrinsic for +// Scalar Integer Saturating Rounding Shift Left (Signed, Unsigned) +defm : Neon_Scalar3Same_BHSD_size_patterns; +defm : Neon_Scalar3Same_BHSD_size_patterns; + +// Patterns to match llvm.arm.* intrinsic for +// Scalar Integer Saturating Rounding Shift Left (Signed, Unsigned) +defm : Neon_Scalar3Same_D_size_patterns; +defm : Neon_Scalar3Same_D_size_patterns; + +// Signed Saturating Doubling Multiply-Add Long +defm SQDMLAL : NeonI_Scalar3Diff_ml_HS_size<0b0, 0b1001, "sqdmlal">; +defm : Neon_Scalar3Diff_ml_HS_size_patterns; + +// Signed Saturating Doubling Multiply-Subtract Long +defm SQDMLSL : NeonI_Scalar3Diff_ml_HS_size<0b0, 0b1011, "sqdmlsl">; +defm : Neon_Scalar3Diff_ml_HS_size_patterns; + +// Signed Saturating Doubling Multiply Long +defm SQDMULL : NeonI_Scalar3Diff_HS_size<0b0, 0b1101, "sqdmull">; +defm : Neon_Scalar3Diff_HS_size_patterns; + +// Scalar Signed Integer Convert To Floating-point +defm SCVTF : NeonI_Scalar2SameMisc_SD_size<0b0, 0b0, 0b11101, "scvtf">; +defm : Neon_Scalar2SameMisc_cvt_SD_size_patterns; + +// Scalar Unsigned Integer Convert To Floating-point +defm UCVTF : NeonI_Scalar2SameMisc_SD_size<0b1, 0b0, 0b11101, "ucvtf">; +defm : Neon_Scalar2SameMisc_cvt_SD_size_patterns; + +// Scalar Floating-point Converts +def FCVTXN : NeonI_Scalar2SameMisc_fcvtxn_D_size<0b1, 0b10110, "fcvtxn">; +def : Neon_Scalar2SameMisc_fcvtxn_D_size_patterns; + +defm FCVTNS : NeonI_Scalar2SameMisc_SD_size<0b0, 0b0, 0b11010, "fcvtns">; +defm : Neon_Scalar2SameMisc_fcvt_SD_size_patterns; + +defm FCVTNU : NeonI_Scalar2SameMisc_SD_size<0b1, 0b0, 0b11010, "fcvtnu">; +defm : Neon_Scalar2SameMisc_fcvt_SD_size_patterns; + +defm FCVTMS : NeonI_Scalar2SameMisc_SD_size<0b0, 0b0, 0b11011, "fcvtms">; +defm : Neon_Scalar2SameMisc_fcvt_SD_size_patterns; + +defm FCVTMU : NeonI_Scalar2SameMisc_SD_size<0b1, 0b0, 0b11011, "fcvtmu">; +defm : Neon_Scalar2SameMisc_fcvt_SD_size_patterns; + +defm FCVTAS : NeonI_Scalar2SameMisc_SD_size<0b0, 0b0, 0b11100, "fcvtas">; +defm : Neon_Scalar2SameMisc_fcvt_SD_size_patterns; + +defm FCVTAU : NeonI_Scalar2SameMisc_SD_size<0b1, 0b0, 0b11100, "fcvtau">; +defm : Neon_Scalar2SameMisc_fcvt_SD_size_patterns; + +defm FCVTPS : NeonI_Scalar2SameMisc_SD_size<0b0, 0b1, 0b11010, "fcvtps">; +defm : Neon_Scalar2SameMisc_fcvt_SD_size_patterns; + +defm FCVTPU : NeonI_Scalar2SameMisc_SD_size<0b1, 0b1, 0b11010, "fcvtpu">; +defm : Neon_Scalar2SameMisc_fcvt_SD_size_patterns; + +defm FCVTZS : NeonI_Scalar2SameMisc_SD_size<0b0, 0b1, 0b11011, "fcvtzs">; +defm : Neon_Scalar2SameMisc_fcvt_SD_size_patterns; + +defm FCVTZU : NeonI_Scalar2SameMisc_SD_size<0b1, 0b1, 0b11011, "fcvtzu">; +defm : Neon_Scalar2SameMisc_fcvt_SD_size_patterns; + +// Patterns For Convert Instructions Between v1f64 and v1i64 +class Neon_Scalar2SameMisc_cvtf_v1f64_pattern + : Pat<(v1f64 (opnode (v1i64 FPR64:$Rn))), (INST FPR64:$Rn)>; + +class Neon_Scalar2SameMisc_fcvt_v1f64_pattern + : Pat<(v1i64 (opnode (v1f64 FPR64:$Rn))), (INST FPR64:$Rn)>; + +def : Neon_Scalar2SameMisc_cvtf_v1f64_pattern; +def : Neon_Scalar2SameMisc_cvtf_v1f64_pattern; + +def : Neon_Scalar2SameMisc_fcvt_v1f64_pattern; +def : Neon_Scalar2SameMisc_fcvt_v1f64_pattern; + +// Scalar Floating-point Reciprocal Estimate +defm 
FRECPE : NeonI_Scalar2SameMisc_SD_size<0b0, 0b1, 0b11101, "frecpe">;
+defm : Neon_Scalar2SameMisc_SD_size_patterns;
+
+// Scalar Floating-point Reciprocal Exponent
+defm FRECPX : NeonI_Scalar2SameMisc_SD_size<0b0, 0b1, 0b11111, "frecpx">;
+defm : Neon_Scalar2SameMisc_SD_size_patterns;
+
+// Scalar Floating-point Reciprocal Square Root Estimate
+defm FRSQRTE: NeonI_Scalar2SameMisc_SD_size<0b1, 0b1, 0b11101, "frsqrte">;
+defm : Neon_Scalar2SameMisc_SD_size_patterns;
+
+// Scalar Floating-point Round
+class Neon_ScalarFloatRound_pattern
+  : Pat<(v1f64 (opnode (v1f64 FPR64:$Rn))), (INST FPR64:$Rn)>;
+
+def : Neon_ScalarFloatRound_pattern;
+def : Neon_ScalarFloatRound_pattern;
+def : Neon_ScalarFloatRound_pattern;
+def : Neon_ScalarFloatRound_pattern;
+def : Neon_ScalarFloatRound_pattern;
+def : Neon_ScalarFloatRound_pattern;
+def : Neon_ScalarFloatRound_pattern;
+
+// Scalar Integer Compare
+
+// Scalar Compare Bitwise Equal
+def CMEQddd: NeonI_Scalar3Same_D_size<0b1, 0b10001, "cmeq">;
+def : Neon_Scalar3Same_cmp_D_size_patterns;
+
+class Neon_Scalar3Same_cmp_D_size_v1_patterns
+  : Pat<(v1i64 (opnode (v1i64 FPR64:$Rn), (v1i64 FPR64:$Rm), CC)),
+        (INSTD FPR64:$Rn, FPR64:$Rm)>;
+
+def : Neon_Scalar3Same_cmp_D_size_v1_patterns;
+
+// Scalar Compare Signed Greater Than Or Equal
+def CMGEddd: NeonI_Scalar3Same_D_size<0b0, 0b00111, "cmge">;
+def : Neon_Scalar3Same_cmp_D_size_patterns;
+
+// Scalar Compare Unsigned Higher Or Same
+def CMHSddd: NeonI_Scalar3Same_D_size<0b1, 0b00111, "cmhs">;
+def : Neon_Scalar3Same_cmp_D_size_patterns;
+
+// Scalar Compare Unsigned Higher
+def CMHIddd: NeonI_Scalar3Same_D_size<0b1, 0b00110, "cmhi">;
+def : Neon_Scalar3Same_cmp_D_size_patterns;
+
+// Scalar Compare Signed Greater Than
+def CMGTddd: NeonI_Scalar3Same_D_size<0b0, 0b00110, "cmgt">;
+def : Neon_Scalar3Same_cmp_D_size_patterns;
+
+// Scalar Compare Bitwise Test Bits
+def CMTSTddd: NeonI_Scalar3Same_D_size<0b0, 0b10001, "cmtst">;
+def : Neon_Scalar3Same_cmp_D_size_patterns;
+def : Neon_Scalar3Same_cmp_D_size_patterns;
+
+// Scalar Compare Bitwise Equal To Zero
+def CMEQddi: NeonI_Scalar2SameMisc_cmpz_D_size<0b0, 0b01001, "cmeq">;
+def : Neon_Scalar2SameMisc_cmpz_D_size_patterns;
+
+// Scalar Compare Signed Greater Than Or Equal To Zero
+def CMGEddi: NeonI_Scalar2SameMisc_cmpz_D_size<0b1, 0b01000, "cmge">;
+def : Neon_Scalar2SameMisc_cmpz_D_size_patterns;
+
+// Scalar Compare Signed Greater Than Zero
+def CMGTddi: NeonI_Scalar2SameMisc_cmpz_D_size<0b0, 0b01000, "cmgt">;
+def : Neon_Scalar2SameMisc_cmpz_D_size_patterns;
+
+// Scalar Compare Signed Less Than Or Equal To Zero
+def CMLEddi: NeonI_Scalar2SameMisc_cmpz_D_size<0b1, 0b01001, "cmle">;
+def : Neon_Scalar2SameMisc_cmpz_D_size_patterns;
+
+// Scalar Compare Less Than Zero
+def CMLTddi: NeonI_Scalar2SameMisc_cmpz_D_size<0b0, 0b01010, "cmlt">;
+def : Neon_Scalar2SameMisc_cmpz_D_size_patterns;
+
+// Scalar Floating-point Compare
+
+// Scalar Floating-point Compare Mask Equal
+defm FCMEQ: NeonI_Scalar3Same_SD_sizes<0b0, 0b0, 0b11100, "fcmeq">;
+defm : Neon_Scalar3Same_cmp_SD_size_patterns;
+
+// Scalar Floating-point Compare Mask Equal To Zero
+defm FCMEQZ: NeonI_Scalar2SameMisc_cmpz_SD_size<0b0, 0b01101, "fcmeq">;
+defm : Neon_Scalar2SameMisc_cmpz_SD_size_patterns;
+
+// Scalar Floating-point Compare Mask Greater Than Or Equal
+defm FCMGE: NeonI_Scalar3Same_SD_sizes<0b1, 0b0, 0b11100, "fcmge">;
+defm : Neon_Scalar3Same_cmp_SD_size_patterns;
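+
+// Note that the scalar compare-mask instructions produce an all-ones or
+// all-zeros mask in the destination register rather than setting flags:
+// for example, "fcmeq d0, d1, d2" sets d0 to 0xFFFFFFFFFFFFFFFF when
+// d1 == d2 and to 0 otherwise; the *Z variants below compare against #0.0.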
+
+// Scalar Floating-point Compare Mask Greater Than Or Equal To Zero
+defm FCMGEZ: NeonI_Scalar2SameMisc_cmpz_SD_size<0b1, 0b01100, "fcmge">;
+defm : Neon_Scalar2SameMisc_cmpz_SD_size_patterns;
+
+// Scalar Floating-point Compare Mask Greater Than
+defm FCMGT: NeonI_Scalar3Same_SD_sizes<0b1, 0b1, 0b11100, "fcmgt">;
+defm : Neon_Scalar3Same_cmp_SD_size_patterns;
+
+// Scalar Floating-point Compare Mask Greater Than Zero
+defm FCMGTZ: NeonI_Scalar2SameMisc_cmpz_SD_size<0b0, 0b01100, "fcmgt">;
+defm : Neon_Scalar2SameMisc_cmpz_SD_size_patterns;
+
+// Scalar Floating-point Compare Mask Less Than Or Equal To Zero
+defm FCMLEZ: NeonI_Scalar2SameMisc_cmpz_SD_size<0b1, 0b01101, "fcmle">;
+defm : Neon_Scalar2SameMisc_cmpz_SD_size_patterns;
+
+// Scalar Floating-point Compare Mask Less Than Zero
+defm FCMLTZ: NeonI_Scalar2SameMisc_cmpz_SD_size<0b0, 0b01110, "fcmlt">;
+defm : Neon_Scalar2SameMisc_cmpz_SD_size_patterns;
+
+// Scalar Floating-point Absolute Compare Mask Greater Than Or Equal
+defm FACGE: NeonI_Scalar3Same_SD_sizes<0b1, 0b0, 0b11101, "facge">;
+defm : Neon_Scalar3Same_cmp_SD_size_patterns;
+
+// Scalar Floating-point Absolute Compare Mask Greater Than
+defm FACGT: NeonI_Scalar3Same_SD_sizes<0b1, 0b1, 0b11101, "facgt">;
+defm : Neon_Scalar3Same_cmp_SD_size_patterns;
+
+// Scalar Floating-point Absolute Difference
+defm FABD: NeonI_Scalar3Same_SD_sizes<0b1, 0b1, 0b11010, "fabd">;
+defm : Neon_Scalar3Same_SD_size_patterns;
+
+// Scalar Absolute Value
+defm ABS : NeonI_Scalar2SameMisc_D_size<0b0, 0b01011, "abs">;
+defm : Neon_Scalar2SameMisc_D_size_patterns;
+
+// Scalar Signed Saturating Absolute Value
+defm SQABS : NeonI_Scalar2SameMisc_BHSD_size<0b0, 0b00111, "sqabs">;
+defm : Neon_Scalar2SameMisc_BHSD_size_patterns;
+
+// Scalar Negate
+defm NEG : NeonI_Scalar2SameMisc_D_size<0b1, 0b01011, "neg">;
+defm : Neon_Scalar2SameMisc_D_size_patterns;
+
+// Scalar Signed Saturating Negate
+defm SQNEG : NeonI_Scalar2SameMisc_BHSD_size<0b1, 0b00111, "sqneg">;
+defm : Neon_Scalar2SameMisc_BHSD_size_patterns;
+
+// Scalar Signed Saturating Accumulated of Unsigned Value
+defm SUQADD : NeonI_Scalar2SameMisc_accum_BHSD_size<0b0, 0b00011, "suqadd">;
+defm : Neon_Scalar2SameMisc_accum_BHSD_size_patterns;
+
+// Scalar Unsigned Saturating Accumulated of Signed Value
+defm USQADD : NeonI_Scalar2SameMisc_accum_BHSD_size<0b1, 0b00011, "usqadd">;
+defm : Neon_Scalar2SameMisc_accum_BHSD_size_patterns;
+
+def : Pat<(v1i64 (int_aarch64_neon_suqadd (v1i64 FPR64:$Src),
+                                          (v1i64 FPR64:$Rn))),
+          (SUQADDdd FPR64:$Src, FPR64:$Rn)>;
+
+def : Pat<(v1i64 (int_aarch64_neon_usqadd (v1i64 FPR64:$Src),
+                                          (v1i64 FPR64:$Rn))),
+          (USQADDdd FPR64:$Src, FPR64:$Rn)>;
+
+def : Pat<(v1i64 (int_arm_neon_vabs (v1i64 FPR64:$Rn))),
+          (ABSdd FPR64:$Rn)>;
+
+def : Pat<(v1i64 (int_arm_neon_vqabs (v1i64 FPR64:$Rn))),
+          (SQABSdd FPR64:$Rn)>;
+
+def : Pat<(v1i64 (int_arm_neon_vqneg (v1i64 FPR64:$Rn))),
+          (SQNEGdd FPR64:$Rn)>;
+
+def : Pat<(v1i64 (sub (v1i64 (bitconvert (v8i8 Neon_AllZero))),
+                      (v1i64 FPR64:$Rn))),
+          (NEGdd FPR64:$Rn)>;
+
+// Scalar Signed Saturating Extract Unsigned Narrow
+defm SQXTUN : NeonI_Scalar2SameMisc_narrow_HSD_size<0b1, 0b10010, "sqxtun">;
+defm : Neon_Scalar2SameMisc_narrow_HSD_size_patterns;
+
+// Scalar Signed Saturating Extract Narrow
+defm SQXTN : NeonI_Scalar2SameMisc_narrow_HSD_size<0b0, 0b10100, "sqxtn">;
+defm : Neon_Scalar2SameMisc_narrow_HSD_size_patterns;
+
+// Scalar Unsigned Saturating Extract Narrow
+defm UQXTN : NeonI_Scalar2SameMisc_narrow_HSD_size<0b1, 0b10100, "uqxtn">;
+defm : Neon_Scalar2SameMisc_narrow_HSD_size_patterns;
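+
+// The saturating extract-narrow instructions halve the element width and
+// clamp values that do not fit: for example, "sqxtn s0, d1" narrows a
+// 64-bit value to 32 bits with signed saturation, while sqxtun narrows a
+// signed input with unsigned saturation (negative values clamp to zero).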
+
+// Scalar Reduce Pairwise
+
+multiclass NeonI_ScalarPair_D_sizes opcode,
+                                    string asmop, bit Commutable = 0> {
+  let isCommutable = Commutable in {
+    def _D_2D : NeonI_ScalarPair;
+  }
+}
+
+multiclass NeonI_ScalarPair_SD_sizes opcode,
+                                     string asmop, bit Commutable = 0>
+  : NeonI_ScalarPair_D_sizes {
+  let isCommutable = Commutable in {
+    def _S_2S : NeonI_ScalarPair;
+  }
+}
+
+// Scalar Reduce Addition Pairwise (Integer) with
+// Pattern to match llvm.arm.* intrinsic
+defm ADDPvv : NeonI_ScalarPair_D_sizes<0b0, 0b1, 0b11011, "addp", 0>;
+
+// Pattern to match llvm.aarch64.* intrinsic for
+// Scalar Reduce Addition Pairwise (Integer)
+def : Pat<(v1i64 (int_aarch64_neon_vpadd (v2i64 VPR128:$Rn))),
+          (ADDPvv_D_2D VPR128:$Rn)>;
+def : Pat<(v1i64 (int_aarch64_neon_vaddv (v2i64 VPR128:$Rn))),
+          (ADDPvv_D_2D VPR128:$Rn)>;
+
+// Scalar Reduce Addition Pairwise (Floating Point)
+defm FADDPvv : NeonI_ScalarPair_SD_sizes<0b1, 0b0, 0b01101, "faddp", 0>;
+
+// Scalar Reduce Maximum Pairwise (Floating Point)
+defm FMAXPvv : NeonI_ScalarPair_SD_sizes<0b1, 0b0, 0b01111, "fmaxp", 0>;
+
+// Scalar Reduce Minimum Pairwise (Floating Point)
+defm FMINPvv : NeonI_ScalarPair_SD_sizes<0b1, 0b1, 0b01111, "fminp", 0>;
+
+// Scalar Reduce maxNum Pairwise (Floating Point)
+defm FMAXNMPvv : NeonI_ScalarPair_SD_sizes<0b1, 0b0, 0b01100, "fmaxnmp", 0>;
+
+// Scalar Reduce minNum Pairwise (Floating Point)
+defm FMINNMPvv : NeonI_ScalarPair_SD_sizes<0b1, 0b1, 0b01100, "fminnmp", 0>;
+
+multiclass Neon_ScalarPair_SD_size_patterns {
+  def : Pat<(v1f32 (opnodeS (v2f32 VPR64:$Rn))),
+            (INSTS VPR64:$Rn)>;
+  def : Pat<(v1f64 (opnodeD (v2f64 VPR128:$Rn))),
+            (INSTD VPR128:$Rn)>;
+}
+
+// Patterns to match llvm.aarch64.* intrinsic for
+// Scalar Reduce Add, Max, Min, MaxNum, MinNum Pairwise (Floating Point)
+defm : Neon_ScalarPair_SD_size_patterns;
+
+defm : Neon_ScalarPair_SD_size_patterns;
+
+defm : Neon_ScalarPair_SD_size_patterns;
+
+defm : Neon_ScalarPair_SD_size_patterns;
+
+defm : Neon_ScalarPair_SD_size_patterns;
+
+defm : Neon_ScalarPair_SD_size_patterns;
+
+def : Pat<(v1f32 (int_aarch64_neon_vaddv (v4f32 VPR128:$Rn))),
+          (FADDPvv_S_2S (v2f32
+            (EXTRACT_SUBREG
+              (v4f32 (FADDP_4S (v4f32 VPR128:$Rn), (v4f32 VPR128:$Rn))),
+              sub_64)))>;
+
+defm : Neon_ScalarPair_SD_size_patterns;
+
+defm : Neon_ScalarPair_SD_size_patterns;
+
+defm : Neon_ScalarPair_SD_size_patterns;
+
+defm : Neon_ScalarPair_SD_size_patterns;
+
+// Scalar by element Arithmetic
+
+class NeonI_ScalarXIndexedElemArith opcode,
+                                    string rmlane, bit u, bit szhi, bit szlo,
+                                    RegisterClass ResFPR, RegisterClass OpFPR,
+                                    RegisterOperand OpVPR, Operand OpImm>
+  : NeonI_ScalarXIndexedElem {
+  bits<3> Imm;
+  bits<5> MRm;
+}
+
+class NeonI_ScalarXIndexedElemArith_Constraint_Impl opcode,
+                                                    string rmlane,
+                                                    bit u, bit szhi, bit szlo,
+                                                    RegisterClass ResFPR,
+                                                    RegisterClass OpFPR,
+                                                    RegisterOperand OpVPR,
+                                                    Operand OpImm>
+  : NeonI_ScalarXIndexedElem {
+  let Constraints = "$src = $Rd";
+  bits<3> Imm;
+  bits<5> MRm;
+}
+
+// Scalar Floating Point multiply (scalar, by element)
+def FMULssv_4S : NeonI_ScalarXIndexedElemArith<"fmul",
+  0b1001, ".s", 0b0, 0b1, 0b0, FPR32, FPR32, VPR128, neon_uimm2_bare> {
+  let Inst{11} = Imm{1}; // h
+  let Inst{21} = Imm{0}; // l
+  let Inst{20-16} = MRm;
+}
+def FMULddv_2D : NeonI_ScalarXIndexedElemArith<"fmul",
+  0b1001, ".d", 0b0, 0b1, 0b1, FPR64, FPR64, VPR128, neon_uimm1_bare> {
+  let Inst{11} = Imm{0}; // h
+  let Inst{21} = 0b0;    // l
+  let Inst{20-16} = MRm;
+}
+
+// Scalar Floating Point multiply extended (scalar, by element)
+def FMULXssv_4S : 
NeonI_ScalarXIndexedElemArith<"fmulx", + 0b1001, ".s", 0b1, 0b1, 0b0, FPR32, FPR32, VPR128, neon_uimm2_bare> { + let Inst{11} = Imm{1}; // h + let Inst{21} = Imm{0}; // l + let Inst{20-16} = MRm; +} +def FMULXddv_2D : NeonI_ScalarXIndexedElemArith<"fmulx", + 0b1001, ".d", 0b1, 0b1, 0b1, FPR64, FPR64, VPR128, neon_uimm1_bare> { + let Inst{11} = Imm{0}; // h + let Inst{21} = 0b0; // l + let Inst{20-16} = MRm; +} + +multiclass Neon_ScalarXIndexedElem_MUL_MULX_Patterns< + SDPatternOperator opnode, + Instruction INST, + ValueType ResTy, RegisterClass FPRC, ValueType OpTy, Operand OpImm, + ValueType OpNTy, ValueType ExTy, Operand OpNImm> { + + def : Pat<(ResTy (opnode (ResTy FPRC:$Rn), + (ResTy (vector_extract (OpTy VPR128:$MRm), OpImm:$Imm)))), + (ResTy (INST (ResTy FPRC:$Rn), (OpTy VPR128:$MRm), OpImm:$Imm))>; + + def : Pat<(ResTy (opnode (ResTy FPRC:$Rn), + (ResTy (vector_extract (OpNTy VPR64:$MRm), OpNImm:$Imm)))), + (ResTy (INST (ResTy FPRC:$Rn), + (ExTy (SUBREG_TO_REG (i64 0), VPR64:$MRm, sub_64)), + OpNImm:$Imm))>; + + // swapped operands + def : Pat<(ResTy (opnode + (ResTy (vector_extract (OpTy VPR128:$MRm), OpImm:$Imm)), + (ResTy FPRC:$Rn))), + (ResTy (INST (ResTy FPRC:$Rn), (OpTy VPR128:$MRm), OpImm:$Imm))>; + + def : Pat<(ResTy (opnode + (ResTy (vector_extract (OpNTy VPR64:$MRm), OpNImm:$Imm)), + (ResTy FPRC:$Rn))), + (ResTy (INST (ResTy FPRC:$Rn), + (ExTy (SUBREG_TO_REG (i64 0), VPR64:$MRm, sub_64)), + OpNImm:$Imm))>; +} + +// Patterns for Scalar Floating Point multiply (scalar, by element) +defm : Neon_ScalarXIndexedElem_MUL_MULX_Patterns; +defm : Neon_ScalarXIndexedElem_MUL_MULX_Patterns; + +// Patterns for Scalar Floating Point multiply extended (scalar, by element) +defm : Neon_ScalarXIndexedElem_MUL_MULX_Patterns; +defm : Neon_ScalarXIndexedElem_MUL_MULX_Patterns; + + +// Scalar Floating Point fused multiply-add (scalar, by element) +def FMLAssv_4S : NeonI_ScalarXIndexedElemArith_Constraint_Impl<"fmla", + 0b0001, ".s", 0b0, 0b1, 0b0, FPR32, FPR32, VPR128, neon_uimm2_bare> { + let Inst{11} = Imm{1}; // h + let Inst{21} = Imm{0}; // l + let Inst{20-16} = MRm; +} +def FMLAddv_2D : NeonI_ScalarXIndexedElemArith_Constraint_Impl<"fmla", + 0b0001, ".d", 0b0, 0b1, 0b1, FPR64, FPR64, VPR128, neon_uimm1_bare> { + let Inst{11} = Imm{0}; // h + let Inst{21} = 0b0; // l + let Inst{20-16} = MRm; +} + +// Scalar Floating Point fused multiply-subtract (scalar, by element) +def FMLSssv_4S : NeonI_ScalarXIndexedElemArith_Constraint_Impl<"fmls", + 0b0101, ".s", 0b0, 0b1, 0b0, FPR32, FPR32, VPR128, neon_uimm2_bare> { + let Inst{11} = Imm{1}; // h + let Inst{21} = Imm{0}; // l + let Inst{20-16} = MRm; +} +def FMLSddv_2D : NeonI_ScalarXIndexedElemArith_Constraint_Impl<"fmls", + 0b0101, ".d", 0b0, 0b1, 0b1, FPR64, FPR64, VPR128, neon_uimm1_bare> { + let Inst{11} = Imm{0}; // h + let Inst{21} = 0b0; // l + let Inst{20-16} = MRm; +} +// We are allowed to match the fma instruction regardless of compile options. 
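+// (A scalar fmla/fmls performs a true fused multiply-add with a single
+// rounding step, which matches the semantics of the fma node exactly, so no
+// fast-math option is required. Illustrative selection, with hypothetical
+// register assignments:
+//   %e = extractelement <4 x float> %v, i32 1
+//   %r = call float @llvm.fma.f32(float %n, float %e, float %acc)
+// becomes "fmla s0, s1, v2.s[1]".)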
+multiclass Neon_ScalarXIndexedElem_FMA_Patterns< + Instruction FMLAI, Instruction FMLSI, + ValueType ResTy, RegisterClass FPRC, ValueType OpTy, Operand OpImm, + ValueType OpNTy, ValueType ExTy, Operand OpNImm> { + // fmla + def : Pat<(ResTy (fma (ResTy FPRC:$Rn), + (ResTy (vector_extract (OpTy VPR128:$MRm), OpImm:$Imm)), + (ResTy FPRC:$Ra))), + (ResTy (FMLAI (ResTy FPRC:$Ra), + (ResTy FPRC:$Rn), (OpTy VPR128:$MRm), OpImm:$Imm))>; + + def : Pat<(ResTy (fma (ResTy FPRC:$Rn), + (ResTy (vector_extract (OpNTy VPR64:$MRm), OpNImm:$Imm)), + (ResTy FPRC:$Ra))), + (ResTy (FMLAI (ResTy FPRC:$Ra), + (ResTy FPRC:$Rn), + (ExTy (SUBREG_TO_REG (i64 0), VPR64:$MRm, sub_64)), + OpNImm:$Imm))>; + + // swapped fmla operands + def : Pat<(ResTy (fma + (ResTy (vector_extract (OpTy VPR128:$MRm), OpImm:$Imm)), + (ResTy FPRC:$Rn), + (ResTy FPRC:$Ra))), + (ResTy (FMLAI (ResTy FPRC:$Ra), + (ResTy FPRC:$Rn), (OpTy VPR128:$MRm), OpImm:$Imm))>; + + def : Pat<(ResTy (fma + (ResTy (vector_extract (OpNTy VPR64:$MRm), OpNImm:$Imm)), + (ResTy FPRC:$Rn), + (ResTy FPRC:$Ra))), + (ResTy (FMLAI (ResTy FPRC:$Ra), + (ResTy FPRC:$Rn), + (ExTy (SUBREG_TO_REG (i64 0), VPR64:$MRm, sub_64)), + OpNImm:$Imm))>; + + // fmls + def : Pat<(ResTy (fma (ResTy FPRC:$Rn), + (fneg (ResTy (vector_extract (OpTy VPR128:$MRm), OpImm:$Imm))), + (ResTy FPRC:$Ra))), + (ResTy (FMLSI (ResTy FPRC:$Ra), + (ResTy FPRC:$Rn), (OpTy VPR128:$MRm), OpImm:$Imm))>; + + def : Pat<(ResTy (fma (ResTy FPRC:$Rn), + (fneg (ResTy (vector_extract (OpNTy VPR64:$MRm), OpNImm:$Imm))), + (ResTy FPRC:$Ra))), + (ResTy (FMLSI (ResTy FPRC:$Ra), + (ResTy FPRC:$Rn), + (ExTy (SUBREG_TO_REG (i64 0), VPR64:$MRm, sub_64)), + OpNImm:$Imm))>; + + // swapped fmls operands + def : Pat<(ResTy (fma + (fneg (ResTy (vector_extract (OpTy VPR128:$MRm), OpImm:$Imm))), + (ResTy FPRC:$Rn), + (ResTy FPRC:$Ra))), + (ResTy (FMLSI (ResTy FPRC:$Ra), + (ResTy FPRC:$Rn), (OpTy VPR128:$MRm), OpImm:$Imm))>; + + def : Pat<(ResTy (fma + (fneg (ResTy (vector_extract (OpNTy VPR64:$MRm), OpNImm:$Imm))), + (ResTy FPRC:$Rn), + (ResTy FPRC:$Ra))), + (ResTy (FMLSI (ResTy FPRC:$Ra), + (ResTy FPRC:$Rn), + (ExTy (SUBREG_TO_REG (i64 0), VPR64:$MRm, sub_64)), + OpNImm:$Imm))>; +} + +// Scalar Floating Point fused multiply-add and +// multiply-subtract (scalar, by element) +defm : Neon_ScalarXIndexedElem_FMA_Patterns; +defm : Neon_ScalarXIndexedElem_FMA_Patterns; +defm : Neon_ScalarXIndexedElem_FMA_Patterns; + +// Scalar Signed saturating doubling multiply long (scalar, by element) +def SQDMULLshv_4H : NeonI_ScalarXIndexedElemArith<"sqdmull", + 0b1011, ".h", 0b0, 0b0, 0b1, FPR32, FPR16, VPR64Lo, neon_uimm2_bare> { + let Inst{11} = 0b0; // h + let Inst{21} = Imm{1}; // l + let Inst{20} = Imm{0}; // m + let Inst{19-16} = MRm{3-0}; +} +def SQDMULLshv_8H : NeonI_ScalarXIndexedElemArith<"sqdmull", + 0b1011, ".h", 0b0, 0b0, 0b1, FPR32, FPR16, VPR128Lo, neon_uimm3_bare> { + let Inst{11} = Imm{2}; // h + let Inst{21} = Imm{1}; // l + let Inst{20} = Imm{0}; // m + let Inst{19-16} = MRm{3-0}; +} +def SQDMULLdsv_2S : NeonI_ScalarXIndexedElemArith<"sqdmull", + 0b1011, ".s", 0b0, 0b1, 0b0, FPR64, FPR32, VPR64, neon_uimm1_bare> { + let Inst{11} = 0b0; // h + let Inst{21} = Imm{0}; // l + let Inst{20-16} = MRm; +} +def SQDMULLdsv_4S : NeonI_ScalarXIndexedElemArith<"sqdmull", + 0b1011, ".s", 0b0, 0b1, 0b0, FPR64, FPR32, VPR128, neon_uimm2_bare> { + let Inst{11} = Imm{1}; // h + let Inst{21} = Imm{0}; // l + let Inst{20-16} = MRm; +} + +multiclass Neon_ScalarXIndexedElem_MUL_Patterns< + SDPatternOperator opnode, + Instruction INST, + 
ValueType ResTy, RegisterClass FPRC, + ValueType OpVTy, ValueType OpTy, + ValueType VecOpTy, ValueType ExTy, RegisterOperand VPRC, Operand OpImm> { + + def : Pat<(ResTy (opnode (OpVTy FPRC:$Rn), + (OpVTy (scalar_to_vector + (ExTy (vector_extract (VecOpTy VPRC:$MRm), OpImm:$Imm)))))), + (ResTy (INST (OpVTy FPRC:$Rn), (VecOpTy VPRC:$MRm), OpImm:$Imm))>; + + //swapped operands + def : Pat<(ResTy (opnode + (OpVTy (scalar_to_vector + (ExTy (vector_extract (VecOpTy VPRC:$MRm), OpImm:$Imm)))), + (OpVTy FPRC:$Rn))), + (ResTy (INST (OpVTy FPRC:$Rn), (VecOpTy VPRC:$MRm), OpImm:$Imm))>; +} + + +// Patterns for Scalar Signed saturating doubling +// multiply long (scalar, by element) +defm : Neon_ScalarXIndexedElem_MUL_Patterns; +defm : Neon_ScalarXIndexedElem_MUL_Patterns; +defm : Neon_ScalarXIndexedElem_MUL_Patterns; +defm : Neon_ScalarXIndexedElem_MUL_Patterns; + +// Scalar Signed saturating doubling multiply-add long (scalar, by element) +def SQDMLALshv_4H : NeonI_ScalarXIndexedElemArith_Constraint_Impl<"sqdmlal", + 0b0011, ".h", 0b0, 0b0, 0b1, FPR32, FPR16, VPR64Lo, neon_uimm2_bare> { + let Inst{11} = 0b0; // h + let Inst{21} = Imm{1}; // l + let Inst{20} = Imm{0}; // m + let Inst{19-16} = MRm{3-0}; +} +def SQDMLALshv_8H : NeonI_ScalarXIndexedElemArith_Constraint_Impl<"sqdmlal", + 0b0011, ".h", 0b0, 0b0, 0b1, FPR32, FPR16, VPR128Lo, neon_uimm3_bare> { + let Inst{11} = Imm{2}; // h + let Inst{21} = Imm{1}; // l + let Inst{20} = Imm{0}; // m + let Inst{19-16} = MRm{3-0}; +} +def SQDMLALdsv_2S : NeonI_ScalarXIndexedElemArith_Constraint_Impl<"sqdmlal", + 0b0011, ".s", 0b0, 0b1, 0b0, FPR64, FPR32, VPR64, neon_uimm1_bare> { + let Inst{11} = 0b0; // h + let Inst{21} = Imm{0}; // l + let Inst{20-16} = MRm; +} +def SQDMLALdsv_4S : NeonI_ScalarXIndexedElemArith_Constraint_Impl<"sqdmlal", + 0b0011, ".s", 0b0, 0b1, 0b0, FPR64, FPR32, VPR128, neon_uimm2_bare> { + let Inst{11} = Imm{1}; // h + let Inst{21} = Imm{0}; // l + let Inst{20-16} = MRm; +} + +// Scalar Signed saturating doubling +// multiply-subtract long (scalar, by element) +def SQDMLSLshv_4H : NeonI_ScalarXIndexedElemArith_Constraint_Impl<"sqdmlsl", + 0b0111, ".h", 0b0, 0b0, 0b1, FPR32, FPR16, VPR64Lo, neon_uimm2_bare> { + let Inst{11} = 0b0; // h + let Inst{21} = Imm{1}; // l + let Inst{20} = Imm{0}; // m + let Inst{19-16} = MRm{3-0}; +} +def SQDMLSLshv_8H : NeonI_ScalarXIndexedElemArith_Constraint_Impl<"sqdmlsl", + 0b0111, ".h", 0b0, 0b0, 0b1, FPR32, FPR16, VPR128Lo, neon_uimm3_bare> { + let Inst{11} = Imm{2}; // h + let Inst{21} = Imm{1}; // l + let Inst{20} = Imm{0}; // m + let Inst{19-16} = MRm{3-0}; +} +def SQDMLSLdsv_2S : NeonI_ScalarXIndexedElemArith_Constraint_Impl<"sqdmlsl", + 0b0111, ".s", 0b0, 0b1, 0b0, FPR64, FPR32, VPR64, neon_uimm1_bare> { + let Inst{11} = 0b0; // h + let Inst{21} = Imm{0}; // l + let Inst{20-16} = MRm; +} +def SQDMLSLdsv_4S : NeonI_ScalarXIndexedElemArith_Constraint_Impl<"sqdmlsl", + 0b0111, ".s", 0b0, 0b1, 0b0, FPR64, FPR32, VPR128, neon_uimm2_bare> { + let Inst{11} = Imm{1}; // h + let Inst{21} = Imm{0}; // l + let Inst{20-16} = MRm; +} + +multiclass Neon_ScalarXIndexedElem_MLAL_Patterns< + SDPatternOperator opnode, + SDPatternOperator coreopnode, + Instruction INST, + ValueType ResTy, RegisterClass ResFPRC, RegisterClass FPRC, + ValueType OpTy, + ValueType OpVTy, ValueType ExTy, RegisterOperand VPRC, Operand OpImm> { + + def : Pat<(ResTy (opnode + (ResTy ResFPRC:$Ra), + (ResTy (coreopnode (OpTy FPRC:$Rn), + (OpTy (scalar_to_vector + (ExTy (vector_extract (OpVTy VPRC:$MRm), OpImm:$Imm)))))))), + (ResTy (INST (ResTy 
ResFPRC:$Ra), + (OpTy FPRC:$Rn), (OpVTy VPRC:$MRm), OpImm:$Imm))>; + + // swapped operands + def : Pat<(ResTy (opnode + (ResTy ResFPRC:$Ra), + (ResTy (coreopnode + (OpTy (scalar_to_vector + (ExTy (vector_extract (OpVTy VPRC:$MRm), OpImm:$Imm)))), + (OpTy FPRC:$Rn))))), + (ResTy (INST (ResTy ResFPRC:$Ra), + (OpTy FPRC:$Rn), (OpVTy VPRC:$MRm), OpImm:$Imm))>; +} + +// Patterns for Scalar Signed saturating +// doubling multiply-add long (scalar, by element) +defm : Neon_ScalarXIndexedElem_MLAL_Patterns; +defm : Neon_ScalarXIndexedElem_MLAL_Patterns; +defm : Neon_ScalarXIndexedElem_MLAL_Patterns; +defm : Neon_ScalarXIndexedElem_MLAL_Patterns; + +// Patterns for Scalar Signed saturating +// doubling multiply-sub long (scalar, by element) +defm : Neon_ScalarXIndexedElem_MLAL_Patterns; +defm : Neon_ScalarXIndexedElem_MLAL_Patterns; +defm : Neon_ScalarXIndexedElem_MLAL_Patterns; +defm : Neon_ScalarXIndexedElem_MLAL_Patterns; + +// Scalar general arithmetic operation +class Neon_Scalar_GeneralMath2D_pattern + : Pat<(v1f64 (opnode (v1f64 FPR64:$Rn))), (INST FPR64:$Rn)>; + +class Neon_Scalar_GeneralMath3D_pattern + : Pat<(v1f64 (opnode (v1f64 FPR64:$Rn), (v1f64 FPR64:$Rm))), + (INST FPR64:$Rn, FPR64:$Rm)>; + +class Neon_Scalar_GeneralMath4D_pattern + : Pat<(v1f64 (opnode (v1f64 FPR64:$Rn), (v1f64 FPR64:$Rm), + (v1f64 FPR64:$Ra))), + (INST FPR64:$Rn, FPR64:$Rm, FPR64:$Ra)>; + +def : Neon_Scalar_GeneralMath3D_pattern; +def : Neon_Scalar_GeneralMath3D_pattern; +def : Neon_Scalar_GeneralMath3D_pattern; +def : Neon_Scalar_GeneralMath3D_pattern; +def : Neon_Scalar_GeneralMath3D_pattern; +def : Neon_Scalar_GeneralMath3D_pattern; +def : Neon_Scalar_GeneralMath3D_pattern; +def : Neon_Scalar_GeneralMath3D_pattern; +def : Neon_Scalar_GeneralMath3D_pattern; + +def : Neon_Scalar_GeneralMath2D_pattern; +def : Neon_Scalar_GeneralMath2D_pattern; + +def : Neon_Scalar_GeneralMath4D_pattern; +def : Neon_Scalar_GeneralMath4D_pattern; + +// Scalar Signed saturating doubling multiply returning +// high half (scalar, by element) +def SQDMULHhhv_4H : NeonI_ScalarXIndexedElemArith<"sqdmulh", + 0b1100, ".h", 0b0, 0b0, 0b1, FPR16, FPR16, VPR64Lo, neon_uimm2_bare> { + let Inst{11} = 0b0; // h + let Inst{21} = Imm{1}; // l + let Inst{20} = Imm{0}; // m + let Inst{19-16} = MRm{3-0}; +} +def SQDMULHhhv_8H : NeonI_ScalarXIndexedElemArith<"sqdmulh", + 0b1100, ".h", 0b0, 0b0, 0b1, FPR16, FPR16, VPR128Lo, neon_uimm3_bare> { + let Inst{11} = Imm{2}; // h + let Inst{21} = Imm{1}; // l + let Inst{20} = Imm{0}; // m + let Inst{19-16} = MRm{3-0}; +} +def SQDMULHssv_2S : NeonI_ScalarXIndexedElemArith<"sqdmulh", + 0b1100, ".s", 0b0, 0b1, 0b0, FPR32, FPR32, VPR64, neon_uimm1_bare> { + let Inst{11} = 0b0; // h + let Inst{21} = Imm{0}; // l + let Inst{20-16} = MRm; +} +def SQDMULHssv_4S : NeonI_ScalarXIndexedElemArith<"sqdmulh", + 0b1100, ".s", 0b0, 0b1, 0b0, FPR32, FPR32, VPR128, neon_uimm2_bare> { + let Inst{11} = Imm{1}; // h + let Inst{21} = Imm{0}; // l + let Inst{20-16} = MRm; +} + +// Patterns for Scalar Signed saturating doubling multiply returning +// high half (scalar, by element) +defm : Neon_ScalarXIndexedElem_MUL_Patterns; +defm : Neon_ScalarXIndexedElem_MUL_Patterns; +defm : Neon_ScalarXIndexedElem_MUL_Patterns; +defm : Neon_ScalarXIndexedElem_MUL_Patterns; + +// Scalar Signed saturating rounding doubling multiply +// returning high half (scalar, by element) +def SQRDMULHhhv_4H : NeonI_ScalarXIndexedElemArith<"sqrdmulh", + 0b1101, ".h", 0b0, 0b0, 0b1, FPR16, FPR16, VPR64Lo, neon_uimm2_bare> { + let Inst{11} = 0b0; // h + let 
Inst{21} = Imm{1}; // l + let Inst{20} = Imm{0}; // m + let Inst{19-16} = MRm{3-0}; +} +def SQRDMULHhhv_8H : NeonI_ScalarXIndexedElemArith<"sqrdmulh", + 0b1101, ".h", 0b0, 0b0, 0b1, FPR16, FPR16, VPR128Lo, neon_uimm3_bare> { + let Inst{11} = Imm{2}; // h + let Inst{21} = Imm{1}; // l + let Inst{20} = Imm{0}; // m + let Inst{19-16} = MRm{3-0}; +} +def SQRDMULHssv_2S : NeonI_ScalarXIndexedElemArith<"sqrdmulh", + 0b1101, ".s", 0b0, 0b1, 0b0, FPR32, FPR32, VPR64, neon_uimm1_bare> { + let Inst{11} = 0b0; // h + let Inst{21} = Imm{0}; // l + let Inst{20-16} = MRm; +} +def SQRDMULHssv_4S : NeonI_ScalarXIndexedElemArith<"sqrdmulh", + 0b1101, ".s", 0b0, 0b1, 0b0, FPR32, FPR32, VPR128, neon_uimm2_bare> { + let Inst{11} = Imm{1}; // h + let Inst{21} = Imm{0}; // l + let Inst{20-16} = MRm; +} + +defm : Neon_ScalarXIndexedElem_MUL_Patterns; +defm : Neon_ScalarXIndexedElem_MUL_Patterns; +defm : Neon_ScalarXIndexedElem_MUL_Patterns; +defm : Neon_ScalarXIndexedElem_MUL_Patterns; + +// Scalar Copy - DUP element to scalar +class NeonI_Scalar_DUP + : NeonI_ScalarCopy<(outs ResRC:$Rd), (ins VPRC:$Rn, OpImm:$Imm), + asmop # "\t$Rd, $Rn." # asmlane # "[$Imm]", + [], + NoItinerary> { + bits<4> Imm; +} + +def DUPbv_B : NeonI_Scalar_DUP<"dup", "b", FPR8, VPR128, neon_uimm4_bare> { + let Inst{20-16} = {Imm{3}, Imm{2}, Imm{1}, Imm{0}, 0b1}; +} +def DUPhv_H : NeonI_Scalar_DUP<"dup", "h", FPR16, VPR128, neon_uimm3_bare> { + let Inst{20-16} = {Imm{2}, Imm{1}, Imm{0}, 0b1, 0b0}; +} +def DUPsv_S : NeonI_Scalar_DUP<"dup", "s", FPR32, VPR128, neon_uimm2_bare> { + let Inst{20-16} = {Imm{1}, Imm{0}, 0b1, 0b0, 0b0}; +} +def DUPdv_D : NeonI_Scalar_DUP<"dup", "d", FPR64, VPR128, neon_uimm1_bare> { + let Inst{20-16} = {Imm, 0b1, 0b0, 0b0, 0b0}; +} + +multiclass NeonI_Scalar_DUP_Elt_pattern { + def : Pat<(ResTy (vector_extract (OpTy VPR128:$Rn), OpImm:$Imm)), + (ResTy (DUPI (OpTy VPR128:$Rn), OpImm:$Imm))>; + + def : Pat<(ResTy (vector_extract (OpNTy VPR64:$Rn), OpNImm:$Imm)), + (ResTy (DUPI + (ExTy (SUBREG_TO_REG (i64 0), VPR64:$Rn, sub_64)), + OpNImm:$Imm))>; +} + +// Patterns for vector extract of FP data using scalar DUP instructions +defm : NeonI_Scalar_DUP_Elt_pattern; +defm : NeonI_Scalar_DUP_Elt_pattern; + +multiclass NeonI_Scalar_DUP_Ext_Vec_pattern { + + def : Pat<(ResTy (extract_subvector (OpTy VPR128:$Rn), OpLImm:$Imm)), + (ResTy (DUPI VPR128:$Rn, OpLImm:$Imm))>; + + def : Pat<(ResTy (extract_subvector (NOpTy VPR64:$Rn), OpNImm:$Imm)), + (ResTy (DUPI + (ExTy (SUBREG_TO_REG (i64 0), VPR64:$Rn, sub_64)), + OpNImm:$Imm))>; +} + +// Patterns for extract subvectors of v1ix data using scalar DUP instructions. 
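+// (For example, extracting lane 3 of a v16i8 as a v1i8 selects to
+// "dup b0, v0.b[3]"; the v1i16 and v1i32 cases below differ only in the
+// element width.)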
+defm : NeonI_Scalar_DUP_Ext_Vec_pattern; +defm : NeonI_Scalar_DUP_Ext_Vec_pattern; +defm : NeonI_Scalar_DUP_Ext_Vec_pattern; + +multiclass NeonI_Scalar_DUP_Copy_pattern1 { + + def : Pat<(ResTy (vector_insert (ResTy undef), + (ElemTy (vector_extract (OpTy VPR128:$Rn), OpImm:$Imm)), + (neon_uimm0_bare:$Imm))), + (ResTy (DUPI (OpTy VPR128:$Rn), OpImm:$Imm))>; + + def : Pat<(ResTy (vector_insert (ResTy undef), + (ElemTy (vector_extract (OpNTy VPR64:$Rn), OpNImm:$Imm)), + (OpNImm:$Imm))), + (ResTy (DUPI + (ExTy (SUBREG_TO_REG (i64 0), VPR64:$Rn, sub_64)), + OpNImm:$Imm))>; +} + +multiclass NeonI_Scalar_DUP_Copy_pattern2 { + + def : Pat<(ResTy (scalar_to_vector + (ElemTy (vector_extract (OpTy VPR128:$Rn), OpImm:$Imm)))), + (ResTy (DUPI (OpTy VPR128:$Rn), OpImm:$Imm))>; + + def : Pat<(ResTy (scalar_to_vector + (ElemTy (vector_extract (OpNTy VPR64:$Rn), OpNImm:$Imm)))), + (ResTy (DUPI + (ExTy (SUBREG_TO_REG (i64 0), VPR64:$Rn, sub_64)), + OpNImm:$Imm))>; +} + +// Patterns for vector copy to v1ix and v1fx vectors using scalar DUP +// instructions. +defm : NeonI_Scalar_DUP_Copy_pattern1; +defm : NeonI_Scalar_DUP_Copy_pattern1; +defm : NeonI_Scalar_DUP_Copy_pattern1; +defm : NeonI_Scalar_DUP_Copy_pattern1; +defm : NeonI_Scalar_DUP_Copy_pattern1; +defm : NeonI_Scalar_DUP_Copy_pattern1; +defm : NeonI_Scalar_DUP_Copy_pattern2; +defm : NeonI_Scalar_DUP_Copy_pattern2; +defm : NeonI_Scalar_DUP_Copy_pattern2; +defm : NeonI_Scalar_DUP_Copy_pattern2; +defm : NeonI_Scalar_DUP_Copy_pattern2; +defm : NeonI_Scalar_DUP_Copy_pattern2; + +multiclass NeonI_Scalar_DUP_alias { + def : NeonInstAlias; +} + +// Aliases for Scalar copy - DUP element (scalar) +// FIXME: This is actually the preferred syntax but TableGen can't deal with +// custom printing of aliases. +defm : NeonI_Scalar_DUP_alias<"mov", ".b", DUPbv_B, neon_uimm4_bare, FPR8>; +defm : NeonI_Scalar_DUP_alias<"mov", ".h", DUPhv_H, neon_uimm3_bare, FPR16>; +defm : NeonI_Scalar_DUP_alias<"mov", ".s", DUPsv_S, neon_uimm2_bare, FPR32>; +defm : NeonI_Scalar_DUP_alias<"mov", ".d", DUPdv_D, neon_uimm1_bare, FPR64>; + +multiclass NeonI_SDUP { + def : Pat<(ResTy (GetLow VPR128:$Rn)), + (ResTy (EXTRACT_SUBREG (OpTy VPR128:$Rn), sub_64))>; + def : Pat<(ResTy (GetHigh VPR128:$Rn)), + (ResTy (DUPdv_D (OpTy VPR128:$Rn), 1))>; +} + +defm : NeonI_SDUP; +defm : NeonI_SDUP; +defm : NeonI_SDUP; +defm : NeonI_SDUP; +defm : NeonI_SDUP; +defm : NeonI_SDUP; + +//===----------------------------------------------------------------------===// +// Non-Instruction Patterns +//===----------------------------------------------------------------------===// + +// 64-bit vector bitcasts... 
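+//
+// All of the types below live in a single D register, so a bitconvert
+// between them is a pure reinterpretation: each Pat maps the cast to the
+// source register itself and no instruction is emitted.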
+ +def : Pat<(v1i64 (bitconvert (v8i8 VPR64:$src))), (v1i64 VPR64:$src)>; +def : Pat<(v2f32 (bitconvert (v8i8 VPR64:$src))), (v2f32 VPR64:$src)>; +def : Pat<(v2i32 (bitconvert (v8i8 VPR64:$src))), (v2i32 VPR64:$src)>; +def : Pat<(v4i16 (bitconvert (v8i8 VPR64:$src))), (v4i16 VPR64:$src)>; + +def : Pat<(v1i64 (bitconvert (v4i16 VPR64:$src))), (v1i64 VPR64:$src)>; +def : Pat<(v2i32 (bitconvert (v4i16 VPR64:$src))), (v2i32 VPR64:$src)>; +def : Pat<(v2f32 (bitconvert (v4i16 VPR64:$src))), (v2f32 VPR64:$src)>; +def : Pat<(v8i8 (bitconvert (v4i16 VPR64:$src))), (v8i8 VPR64:$src)>; + +def : Pat<(v1i64 (bitconvert (v2i32 VPR64:$src))), (v1i64 VPR64:$src)>; +def : Pat<(v2f32 (bitconvert (v2i32 VPR64:$src))), (v2f32 VPR64:$src)>; +def : Pat<(v4i16 (bitconvert (v2i32 VPR64:$src))), (v4i16 VPR64:$src)>; +def : Pat<(v8i8 (bitconvert (v2i32 VPR64:$src))), (v8i8 VPR64:$src)>; + +def : Pat<(v1i64 (bitconvert (v2f32 VPR64:$src))), (v1i64 VPR64:$src)>; +def : Pat<(v2i32 (bitconvert (v2f32 VPR64:$src))), (v2i32 VPR64:$src)>; +def : Pat<(v4i16 (bitconvert (v2f32 VPR64:$src))), (v4i16 VPR64:$src)>; +def : Pat<(v8i8 (bitconvert (v2f32 VPR64:$src))), (v8i8 VPR64:$src)>; + +def : Pat<(v2f32 (bitconvert (v1i64 VPR64:$src))), (v2f32 VPR64:$src)>; +def : Pat<(v2i32 (bitconvert (v1i64 VPR64:$src))), (v2i32 VPR64:$src)>; +def : Pat<(v4i16 (bitconvert (v1i64 VPR64:$src))), (v4i16 VPR64:$src)>; +def : Pat<(v8i8 (bitconvert (v1i64 VPR64:$src))), (v8i8 VPR64:$src)>; + +// ..and 128-bit vector bitcasts... + +def : Pat<(v2f64 (bitconvert (v16i8 VPR128:$src))), (v2f64 VPR128:$src)>; +def : Pat<(v2i64 (bitconvert (v16i8 VPR128:$src))), (v2i64 VPR128:$src)>; +def : Pat<(v4f32 (bitconvert (v16i8 VPR128:$src))), (v4f32 VPR128:$src)>; +def : Pat<(v4i32 (bitconvert (v16i8 VPR128:$src))), (v4i32 VPR128:$src)>; +def : Pat<(v8i16 (bitconvert (v16i8 VPR128:$src))), (v8i16 VPR128:$src)>; + +def : Pat<(v2f64 (bitconvert (v8i16 VPR128:$src))), (v2f64 VPR128:$src)>; +def : Pat<(v2i64 (bitconvert (v8i16 VPR128:$src))), (v2i64 VPR128:$src)>; +def : Pat<(v4i32 (bitconvert (v8i16 VPR128:$src))), (v4i32 VPR128:$src)>; +def : Pat<(v4f32 (bitconvert (v8i16 VPR128:$src))), (v4f32 VPR128:$src)>; +def : Pat<(v16i8 (bitconvert (v8i16 VPR128:$src))), (v16i8 VPR128:$src)>; + +def : Pat<(v2f64 (bitconvert (v4i32 VPR128:$src))), (v2f64 VPR128:$src)>; +def : Pat<(v2i64 (bitconvert (v4i32 VPR128:$src))), (v2i64 VPR128:$src)>; +def : Pat<(v4f32 (bitconvert (v4i32 VPR128:$src))), (v4f32 VPR128:$src)>; +def : Pat<(v8i16 (bitconvert (v4i32 VPR128:$src))), (v8i16 VPR128:$src)>; +def : Pat<(v16i8 (bitconvert (v4i32 VPR128:$src))), (v16i8 VPR128:$src)>; + +def : Pat<(v2f64 (bitconvert (v4f32 VPR128:$src))), (v2f64 VPR128:$src)>; +def : Pat<(v2i64 (bitconvert (v4f32 VPR128:$src))), (v2i64 VPR128:$src)>; +def : Pat<(v4i32 (bitconvert (v4f32 VPR128:$src))), (v4i32 VPR128:$src)>; +def : Pat<(v8i16 (bitconvert (v4f32 VPR128:$src))), (v8i16 VPR128:$src)>; +def : Pat<(v16i8 (bitconvert (v4f32 VPR128:$src))), (v16i8 VPR128:$src)>; + +def : Pat<(v2f64 (bitconvert (v2i64 VPR128:$src))), (v2f64 VPR128:$src)>; +def : Pat<(v4f32 (bitconvert (v2i64 VPR128:$src))), (v4f32 VPR128:$src)>; +def : Pat<(v4i32 (bitconvert (v2i64 VPR128:$src))), (v4i32 VPR128:$src)>; +def : Pat<(v8i16 (bitconvert (v2i64 VPR128:$src))), (v8i16 VPR128:$src)>; +def : Pat<(v16i8 (bitconvert (v2i64 VPR128:$src))), (v16i8 VPR128:$src)>; + +def : Pat<(v2i64 (bitconvert (v2f64 VPR128:$src))), (v2i64 VPR128:$src)>; +def : Pat<(v4f32 (bitconvert (v2f64 VPR128:$src))), (v4f32 VPR128:$src)>; +def : Pat<(v4i32 
(bitconvert (v2f64 VPR128:$src))), (v4i32 VPR128:$src)>; +def : Pat<(v8i16 (bitconvert (v2f64 VPR128:$src))), (v8i16 VPR128:$src)>; +def : Pat<(v16i8 (bitconvert (v2f64 VPR128:$src))), (v16i8 VPR128:$src)>; + + +// ...and scalar bitcasts... +def : Pat<(f16 (bitconvert (v1i16 FPR16:$src))), (f16 FPR16:$src)>; +def : Pat<(f32 (bitconvert (v1i32 FPR32:$src))), (f32 FPR32:$src)>; +def : Pat<(f64 (bitconvert (v1i64 FPR64:$src))), (f64 FPR64:$src)>; +def : Pat<(f32 (bitconvert (v1f32 FPR32:$src))), (f32 FPR32:$src)>; +def : Pat<(f64 (bitconvert (v1f64 FPR64:$src))), (f64 FPR64:$src)>; + +def : Pat<(i64 (bitconvert (v1i64 FPR64:$src))), (FMOVxd $src)>; +def : Pat<(i64 (bitconvert (v1f64 FPR64:$src))), (FMOVxd $src)>; +def : Pat<(i64 (bitconvert (v2i32 FPR64:$src))), (FMOVxd $src)>; +def : Pat<(i64 (bitconvert (v2f32 FPR64:$src))), (FMOVxd $src)>; +def : Pat<(i64 (bitconvert (v4i16 FPR64:$src))), (FMOVxd $src)>; +def : Pat<(i64 (bitconvert (v8i8 FPR64:$src))), (FMOVxd $src)>; + +def : Pat<(i32 (bitconvert (v1i32 FPR32:$src))), (FMOVws $src)>; + +def : Pat<(v8i8 (bitconvert (v1i64 VPR64:$src))), (v8i8 VPR64:$src)>; +def : Pat<(v4i16 (bitconvert (v1i64 VPR64:$src))), (v4i16 VPR64:$src)>; +def : Pat<(v2i32 (bitconvert (v1i64 VPR64:$src))), (v2i32 VPR64:$src)>; + +def : Pat<(f64 (bitconvert (v8i8 VPR64:$src))), (f64 VPR64:$src)>; +def : Pat<(f64 (bitconvert (v4i16 VPR64:$src))), (f64 VPR64:$src)>; +def : Pat<(f64 (bitconvert (v2i32 VPR64:$src))), (f64 VPR64:$src)>; +def : Pat<(f64 (bitconvert (v2f32 VPR64:$src))), (f64 VPR64:$src)>; +def : Pat<(f64 (bitconvert (v1i64 VPR64:$src))), (f64 VPR64:$src)>; + +def : Pat<(f128 (bitconvert (v16i8 VPR128:$src))), (f128 VPR128:$src)>; +def : Pat<(f128 (bitconvert (v8i16 VPR128:$src))), (f128 VPR128:$src)>; +def : Pat<(f128 (bitconvert (v4i32 VPR128:$src))), (f128 VPR128:$src)>; +def : Pat<(f128 (bitconvert (v2i64 VPR128:$src))), (f128 VPR128:$src)>; +def : Pat<(f128 (bitconvert (v4f32 VPR128:$src))), (f128 VPR128:$src)>; +def : Pat<(f128 (bitconvert (v2f64 VPR128:$src))), (f128 VPR128:$src)>; + +def : Pat<(v1i16 (bitconvert (f16 FPR16:$src))), (v1i16 FPR16:$src)>; +def : Pat<(v1i32 (bitconvert (f32 FPR32:$src))), (v1i32 FPR32:$src)>; +def : Pat<(v1i64 (bitconvert (f64 FPR64:$src))), (v1i64 FPR64:$src)>; +def : Pat<(v1f32 (bitconvert (f32 FPR32:$src))), (v1f32 FPR32:$src)>; +def : Pat<(v1f64 (bitconvert (f64 FPR64:$src))), (v1f64 FPR64:$src)>; + +def : Pat<(v1i64 (bitconvert (i64 GPR64:$src))), (FMOVdx $src)>; +def : Pat<(v1f64 (bitconvert (i64 GPR64:$src))), (FMOVdx $src)>; +def : Pat<(v2i32 (bitconvert (i64 GPR64:$src))), (FMOVdx $src)>; +def : Pat<(v2f32 (bitconvert (i64 GPR64:$src))), (FMOVdx $src)>; +def : Pat<(v4i16 (bitconvert (i64 GPR64:$src))), (FMOVdx $src)>; +def : Pat<(v8i8 (bitconvert (i64 GPR64:$src))), (FMOVdx $src)>; + +def : Pat<(v1i32 (bitconvert (i32 GPR32:$src))), (FMOVsw $src)>; + +def : Pat<(v8i8 (bitconvert (f64 FPR64:$src))), (v8i8 FPR64:$src)>; +def : Pat<(v4i16 (bitconvert (f64 FPR64:$src))), (v4i16 FPR64:$src)>; +def : Pat<(v2i32 (bitconvert (f64 FPR64:$src))), (v2i32 FPR64:$src)>; +def : Pat<(v2f32 (bitconvert (f64 FPR64:$src))), (v2f32 FPR64:$src)>; +def : Pat<(v1i64 (bitconvert (f64 FPR64:$src))), (v1i64 FPR64:$src)>; + +def : Pat<(v16i8 (bitconvert (f128 FPR128:$src))), (v16i8 FPR128:$src)>; +def : Pat<(v8i16 (bitconvert (f128 FPR128:$src))), (v8i16 FPR128:$src)>; +def : Pat<(v4i32 (bitconvert (f128 FPR128:$src))), (v4i32 FPR128:$src)>; +def : Pat<(v2i64 (bitconvert (f128 FPR128:$src))), (v2i64 FPR128:$src)>; +def : Pat<(v4f32 
(bitconvert (f128 FPR128:$src))), (v4f32 FPR128:$src)>;
+def : Pat<(v2f64 (bitconvert (f128 FPR128:$src))), (v2f64 FPR128:$src)>;
+
+// Scalar Three Same
+
+def neon_uimm3 : Operand,
+                 ImmLeaf {
+  let ParserMatchClass = uimm3_asmoperand;
+  let PrintMethod = "printUImmHexOperand";
+}
+
+def neon_uimm4 : Operand,
+                 ImmLeaf {
+  let ParserMatchClass = uimm4_asmoperand;
+  let PrintMethod = "printUImmHexOperand";
+}
+
+// Bitwise Extract
+class NeonI_Extract op2, string asmop,
+                    string OpS, RegisterOperand OpVPR, Operand OpImm>
+  : NeonI_BitExtract {
+  bits<4> Index;
+}
+
+def EXTvvvi_8b : NeonI_Extract<0b0, 0b00, "ext", "8b",
+                               VPR64, neon_uimm3> {
+  let Inst{14-11} = {0b0, Index{2}, Index{1}, Index{0}};
+}
+
+def EXTvvvi_16b: NeonI_Extract<0b1, 0b00, "ext", "16b",
+                               VPR128, neon_uimm4> {
+  let Inst{14-11} = Index;
+}
+
+class NI_Extract
+  : Pat<(OpTy (Neon_vextract (OpTy OpVPR:$Rn), (OpTy OpVPR:$Rm),
+         (i64 OpImm:$Imm))),
+        (INST OpVPR:$Rn, OpVPR:$Rm, OpImm:$Imm)>;
+
+def : NI_Extract;
+def : NI_Extract;
+def : NI_Extract;
+def : NI_Extract;
+def : NI_Extract;
+def : NI_Extract;
+def : NI_Extract;
+def : NI_Extract;
+def : NI_Extract;
+def : NI_Extract;
+def : NI_Extract;
+def : NI_Extract;
+
+// Table lookup
+class NI_TBL op2, bits<2> len, bit op,
+             string asmop, string OpS, RegisterOperand OpVPR,
+             RegisterOperand VecList>
+  : NeonI_TBL;
+
+// The vectors in the lookup table are always 16b
+multiclass NI_TBL_pat len, bit op, string asmop, string List> {
+  def _8b : NI_TBL<0, 0b00, len, op, asmop, "8b", VPR64,
+                   !cast(List # "16B_operand")>;
+
+  def _16b : NI_TBL<1, 0b00, len, op, asmop, "16b", VPR128,
+                    !cast(List # "16B_operand")>;
+}
+
+defm TBL1 : NI_TBL_pat<0b00, 0b0, "tbl", "VOne">;
+defm TBL2 : NI_TBL_pat<0b01, 0b0, "tbl", "VPair">;
+defm TBL3 : NI_TBL_pat<0b10, 0b0, "tbl", "VTriple">;
+defm TBL4 : NI_TBL_pat<0b11, 0b0, "tbl", "VQuad">;
+
+// Table lookup extension
+class NI_TBX op2, bits<2> len, bit op,
+             string asmop, string OpS, RegisterOperand OpVPR,
+             RegisterOperand VecList>
+  : NeonI_TBL {
+  let Constraints = "$src = $Rd";
+}
+
+// The vectors in the lookup table are always 16b
+multiclass NI_TBX_pat len, bit op, string asmop, string List> {
+  def _8b : NI_TBX<0, 0b00, len, op, asmop, "8b", VPR64,
+                   !cast(List # "16B_operand")>;
+
+  def _16b : NI_TBX<1, 0b00, len, op, asmop, "16b", VPR128,
+                    !cast(List # "16B_operand")>;
+}
+
+defm TBX1 : NI_TBX_pat<0b00, 0b1, "tbx", "VOne">;
+defm TBX2 : NI_TBX_pat<0b01, 0b1, "tbx", "VPair">;
+defm TBX3 : NI_TBX_pat<0b10, 0b1, "tbx", "VTriple">;
+defm TBX4 : NI_TBX_pat<0b11, 0b1, "tbx", "VQuad">;
+
+class NeonI_INS_main
+  : NeonI_copy<0b1, 0b0, 0b0011,
+               (outs VPR128:$Rd), (ins VPR128:$src, OpGPR:$Rn, OpImm:$Imm),
+               asmop # "\t$Rd." 
# Res # "[$Imm], $Rn", + [(set (ResTy VPR128:$Rd), + (ResTy (vector_insert + (ResTy VPR128:$src), + (OpTy OpGPR:$Rn), + (OpImm:$Imm))))], + NoItinerary> { + bits<4> Imm; + let Constraints = "$src = $Rd"; +} + +//Insert element (vector, from main) +def INSbw : NeonI_INS_main<"ins", "b", v16i8, GPR32, i32, + neon_uimm4_bare> { + let Inst{20-16} = {Imm{3}, Imm{2}, Imm{1}, Imm{0}, 0b1}; +} +def INShw : NeonI_INS_main<"ins", "h", v8i16, GPR32, i32, + neon_uimm3_bare> { + let Inst{20-16} = {Imm{2}, Imm{1}, Imm{0}, 0b1, 0b0}; +} +def INSsw : NeonI_INS_main<"ins", "s", v4i32, GPR32, i32, + neon_uimm2_bare> { + let Inst{20-16} = {Imm{1}, Imm{0}, 0b1, 0b0, 0b0}; +} +def INSdx : NeonI_INS_main<"ins", "d", v2i64, GPR64, i64, + neon_uimm1_bare> { + let Inst{20-16} = {Imm, 0b1, 0b0, 0b0, 0b0}; +} + +def : NeonInstAlias<"mov $Rd.b[$Imm], $Rn", + (INSbw VPR128:$Rd, GPR32:$Rn, neon_uimm4_bare:$Imm), 0>; +def : NeonInstAlias<"mov $Rd.h[$Imm], $Rn", + (INShw VPR128:$Rd, GPR32:$Rn, neon_uimm3_bare:$Imm), 0>; +def : NeonInstAlias<"mov $Rd.s[$Imm], $Rn", + (INSsw VPR128:$Rd, GPR32:$Rn, neon_uimm2_bare:$Imm), 0>; +def : NeonInstAlias<"mov $Rd.d[$Imm], $Rn", + (INSdx VPR128:$Rd, GPR64:$Rn, neon_uimm1_bare:$Imm), 0>; + +class Neon_INS_main_pattern + : Pat<(ResTy (vector_insert + (ResTy VPR64:$src), + (OpTy OpGPR:$Rn), + (OpImm:$Imm))), + (ResTy (EXTRACT_SUBREG + (ExtResTy (INS (ExtResTy (SUBREG_TO_REG (i64 0), VPR64:$src, sub_64)), + OpGPR:$Rn, OpImm:$Imm)), sub_64))>; + +def INSbw_pattern : Neon_INS_main_pattern; +def INShw_pattern : Neon_INS_main_pattern; +def INSsw_pattern : Neon_INS_main_pattern; +def INSdx_pattern : Neon_INS_main_pattern; + +class NeonI_INS_element + : NeonI_insert<0b1, 0b1, + (outs VPR128:$Rd), (ins VPR128:$src, VPR128:$Rn, + ResImm:$Immd, ResImm:$Immn), + asmop # "\t$Rd." # Res # "[$Immd], $Rn." # Res # "[$Immn]", + [], + NoItinerary> { + let Constraints = "$src = $Rd"; + bits<4> Immd; + bits<4> Immn; +} + +//Insert element (vector, from element) +def INSELb : NeonI_INS_element<"ins", "b", neon_uimm4_bare> { + let Inst{20-16} = {Immd{3}, Immd{2}, Immd{1}, Immd{0}, 0b1}; + let Inst{14-11} = {Immn{3}, Immn{2}, Immn{1}, Immn{0}}; +} +def INSELh : NeonI_INS_element<"ins", "h", neon_uimm3_bare> { + let Inst{20-16} = {Immd{2}, Immd{1}, Immd{0}, 0b1, 0b0}; + let Inst{14-11} = {Immn{2}, Immn{1}, Immn{0}, 0b0}; + // bit 11 is unspecified, but should be set to zero. +} +def INSELs : NeonI_INS_element<"ins", "s", neon_uimm2_bare> { + let Inst{20-16} = {Immd{1}, Immd{0}, 0b1, 0b0, 0b0}; + let Inst{14-11} = {Immn{1}, Immn{0}, 0b0, 0b0}; + // bits 11-12 are unspecified, but should be set to zero. +} +def INSELd : NeonI_INS_element<"ins", "d", neon_uimm1_bare> { + let Inst{20-16} = {Immd, 0b1, 0b0, 0b0, 0b0}; + let Inst{14-11} = {Immn{0}, 0b0, 0b0, 0b0}; + // bits 11-13 are unspecified, but should be set to zero. 
+} + +def : NeonInstAlias<"mov $Rd.b[$Immd], $Rn.b[$Immn]", + (INSELb VPR128:$Rd, VPR128:$Rn, + neon_uimm4_bare:$Immd, neon_uimm4_bare:$Immn), 0>; +def : NeonInstAlias<"mov $Rd.h[$Immd], $Rn.h[$Immn]", + (INSELh VPR128:$Rd, VPR128:$Rn, + neon_uimm3_bare:$Immd, neon_uimm3_bare:$Immn), 0>; +def : NeonInstAlias<"mov $Rd.s[$Immd], $Rn.s[$Immn]", + (INSELs VPR128:$Rd, VPR128:$Rn, + neon_uimm2_bare:$Immd, neon_uimm2_bare:$Immn), 0>; +def : NeonInstAlias<"mov $Rd.d[$Immd], $Rn.d[$Immn]", + (INSELd VPR128:$Rd, VPR128:$Rn, + neon_uimm1_bare:$Immd, neon_uimm1_bare:$Immn), 0>; + +multiclass Neon_INS_elt_pattern { +def : Pat<(ResTy (vector_insert + (ResTy VPR128:$src), + (MidTy (vector_extract + (ResTy VPR128:$Rn), + (StImm:$Immn))), + (StImm:$Immd))), + (INS (ResTy VPR128:$src), (ResTy VPR128:$Rn), + StImm:$Immd, StImm:$Immn)>; + +def : Pat <(ResTy (vector_insert + (ResTy VPR128:$src), + (MidTy (vector_extract + (NaTy VPR64:$Rn), + (NaImm:$Immn))), + (StImm:$Immd))), + (INS (ResTy VPR128:$src), + (ResTy (SUBREG_TO_REG (i64 0), (NaTy VPR64:$Rn), sub_64)), + StImm:$Immd, NaImm:$Immn)>; + +def : Pat <(NaTy (vector_insert + (NaTy VPR64:$src), + (MidTy (vector_extract + (ResTy VPR128:$Rn), + (StImm:$Immn))), + (NaImm:$Immd))), + (NaTy (EXTRACT_SUBREG + (ResTy (INS + (ResTy (SUBREG_TO_REG (i64 0), (NaTy VPR64:$src), sub_64)), + (ResTy VPR128:$Rn), + NaImm:$Immd, StImm:$Immn)), + sub_64))>; + +def : Pat <(NaTy (vector_insert + (NaTy VPR64:$src), + (MidTy (vector_extract + (NaTy VPR64:$Rn), + (NaImm:$Immn))), + (NaImm:$Immd))), + (NaTy (EXTRACT_SUBREG + (ResTy (INS + (ResTy (SUBREG_TO_REG (i64 0), (NaTy VPR64:$src), sub_64)), + (ResTy (SUBREG_TO_REG (i64 0), (NaTy VPR64:$Rn), sub_64)), + NaImm:$Immd, NaImm:$Immn)), + sub_64))>; +} + +defm : Neon_INS_elt_pattern; +defm : Neon_INS_elt_pattern; +defm : Neon_INS_elt_pattern; +defm : Neon_INS_elt_pattern; +defm : Neon_INS_elt_pattern; +defm : Neon_INS_elt_pattern; + +multiclass Neon_INS_elt_float_pattern { +def : Pat <(ResTy (vector_insert + (ResTy VPR128:$src), + (MidTy OpFPR:$Rn), + (ResImm:$Imm))), + (INS (ResTy VPR128:$src), + (ResTy (SUBREG_TO_REG (i64 0), OpFPR:$Rn, SubIndex)), + ResImm:$Imm, + (i64 0))>; + +def : Pat <(NaTy (vector_insert + (NaTy VPR64:$src), + (MidTy OpFPR:$Rn), + (ResImm:$Imm))), + (NaTy (EXTRACT_SUBREG + (ResTy (INS + (ResTy (SUBREG_TO_REG (i64 0), (NaTy VPR64:$src), sub_64)), + (ResTy (SUBREG_TO_REG (i64 0), (MidTy OpFPR:$Rn), SubIndex)), + ResImm:$Imm, + (i64 0))), + sub_64))>; +} + +defm : Neon_INS_elt_float_pattern; +defm : Neon_INS_elt_float_pattern; + +class NeonI_SMOV + : NeonI_copy { + bits<4> Imm; +} + +//Signed integer move (main, from element) +def SMOVwb : NeonI_SMOV<"smov", "b", 0b0, v16i8, i8, neon_uimm4_bare, + GPR32, i32> { + let Inst{20-16} = {Imm{3}, Imm{2}, Imm{1}, Imm{0}, 0b1}; +} +def SMOVwh : NeonI_SMOV<"smov", "h", 0b0, v8i16, i16, neon_uimm3_bare, + GPR32, i32> { + let Inst{20-16} = {Imm{2}, Imm{1}, Imm{0}, 0b1, 0b0}; +} +def SMOVxb : NeonI_SMOV<"smov", "b", 0b1, v16i8, i8, neon_uimm4_bare, + GPR64, i64> { + let Inst{20-16} = {Imm{3}, Imm{2}, Imm{1}, Imm{0}, 0b1}; +} +def SMOVxh : NeonI_SMOV<"smov", "h", 0b1, v8i16, i16, neon_uimm3_bare, + GPR64, i64> { + let Inst{20-16} = {Imm{2}, Imm{1}, Imm{0}, 0b1, 0b0}; +} +def SMOVxs : NeonI_SMOV<"smov", "s", 0b1, v4i32, i32, neon_uimm2_bare, + GPR64, i64> { + let Inst{20-16} = {Imm{1}, Imm{0}, 0b1, 0b0, 0b0}; +} + +multiclass Neon_SMOVx_pattern { + def : Pat<(i64 (sext_inreg + (i64 (anyext + (i32 (vector_extract + (StTy VPR128:$Rn), (StImm:$Imm))))), + eleTy)), + (SMOVI 
VPR128:$Rn, StImm:$Imm)>; + + def : Pat<(i64 (sext + (i32 (vector_extract + (StTy VPR128:$Rn), (StImm:$Imm))))), + (SMOVI VPR128:$Rn, StImm:$Imm)>; + + def : Pat<(i64 (sext_inreg + (i64 (vector_extract + (NaTy VPR64:$Rn), (NaImm:$Imm))), + eleTy)), + (SMOVI (StTy (SUBREG_TO_REG (i64 0), VPR64:$Rn, sub_64)), + NaImm:$Imm)>; + + def : Pat<(i64 (sext_inreg + (i64 (anyext + (i32 (vector_extract + (NaTy VPR64:$Rn), (NaImm:$Imm))))), + eleTy)), + (SMOVI (StTy (SUBREG_TO_REG (i64 0), VPR64:$Rn, sub_64)), + NaImm:$Imm)>; + + def : Pat<(i64 (sext + (i32 (vector_extract + (NaTy VPR64:$Rn), (NaImm:$Imm))))), + (SMOVI (StTy (SUBREG_TO_REG (i64 0), VPR64:$Rn, sub_64)), + NaImm:$Imm)>; +} + +defm : Neon_SMOVx_pattern; +defm : Neon_SMOVx_pattern; +defm : Neon_SMOVx_pattern; + +class Neon_SMOVw_pattern + : Pat<(i32 (sext_inreg + (i32 (vector_extract + (NaTy VPR64:$Rn), (NaImm:$Imm))), + eleTy)), + (SMOVI (StTy (SUBREG_TO_REG (i64 0), VPR64:$Rn, sub_64)), + NaImm:$Imm)>; + +def : Neon_SMOVw_pattern; +def : Neon_SMOVw_pattern; + +class NeonI_UMOV + : NeonI_copy { + bits<4> Imm; +} + +//Unsigned integer move (main, from element) +def UMOVwb : NeonI_UMOV<"umov", "b", 0b0, v16i8, neon_uimm4_bare, + GPR32, i32> { + let Inst{20-16} = {Imm{3}, Imm{2}, Imm{1}, Imm{0}, 0b1}; +} +def UMOVwh : NeonI_UMOV<"umov", "h", 0b0, v8i16, neon_uimm3_bare, + GPR32, i32> { + let Inst{20-16} = {Imm{2}, Imm{1}, Imm{0}, 0b1, 0b0}; +} +def UMOVws : NeonI_UMOV<"umov", "s", 0b0, v4i32, neon_uimm2_bare, + GPR32, i32> { + let Inst{20-16} = {Imm{1}, Imm{0}, 0b1, 0b0, 0b0}; +} +def UMOVxd : NeonI_UMOV<"umov", "d", 0b1, v2i64, neon_uimm1_bare, + GPR64, i64> { + let Inst{20-16} = {Imm, 0b1, 0b0, 0b0, 0b0}; +} + +def : NeonInstAlias<"mov $Rd, $Rn.s[$Imm]", + (UMOVws GPR32:$Rd, VPR128:$Rn, neon_uimm2_bare:$Imm), 0>; +def : NeonInstAlias<"mov $Rd, $Rn.d[$Imm]", + (UMOVxd GPR64:$Rd, VPR128:$Rn, neon_uimm1_bare:$Imm), 0>; + +class Neon_UMOV_pattern + : Pat<(ResTy (vector_extract + (NaTy VPR64:$Rn), NaImm:$Imm)), + (SMOVI (StTy (SUBREG_TO_REG (i64 0), VPR64:$Rn, sub_64)), + NaImm:$Imm)>; + +def : Neon_UMOV_pattern; +def : Neon_UMOV_pattern; +def : Neon_UMOV_pattern; + +def : Pat<(i32 (and + (i32 (vector_extract + (v16i8 VPR128:$Rn), (neon_uimm4_bare:$Imm))), + 255)), + (UMOVwb VPR128:$Rn, neon_uimm4_bare:$Imm)>; + +def : Pat<(i32 (and + (i32 (vector_extract + (v8i16 VPR128:$Rn), (neon_uimm3_bare:$Imm))), + 65535)), + (UMOVwh VPR128:$Rn, neon_uimm3_bare:$Imm)>; + +def : Pat<(i64 (zext + (i32 (vector_extract + (v2i64 VPR128:$Rn), (neon_uimm1_bare:$Imm))))), + (UMOVxd VPR128:$Rn, neon_uimm1_bare:$Imm)>; + +def : Pat<(i32 (and + (i32 (vector_extract + (v8i8 VPR64:$Rn), (neon_uimm3_bare:$Imm))), + 255)), + (UMOVwb (SUBREG_TO_REG (i64 0), VPR64:$Rn, sub_64), + neon_uimm3_bare:$Imm)>; + +def : Pat<(i32 (and + (i32 (vector_extract + (v4i16 VPR64:$Rn), (neon_uimm2_bare:$Imm))), + 65535)), + (UMOVwh (SUBREG_TO_REG (i64 0), VPR64:$Rn, sub_64), + neon_uimm2_bare:$Imm)>; + +def : Pat<(i64 (zext + (i32 (vector_extract + (v1i64 VPR64:$Rn), (neon_uimm0_bare:$Imm))))), + (UMOVxd (SUBREG_TO_REG (i64 0), VPR64:$Rn, sub_64), + neon_uimm0_bare:$Imm)>; + +// Additional copy patterns for scalar types +def : Pat<(i32 (vector_extract (v1i8 FPR8:$Rn), (i64 0))), + (UMOVwb (v16i8 + (SUBREG_TO_REG (i64 0), FPR8:$Rn, sub_8)), (i64 0))>; + +def : Pat<(i32 (vector_extract (v1i16 FPR16:$Rn), (i64 0))), + (UMOVwh (v8i16 + (SUBREG_TO_REG (i64 0), FPR16:$Rn, sub_16)), (i64 0))>; + +def : Pat<(i32 (vector_extract (v1i32 FPR32:$Rn), (i64 0))), + (FMOVws FPR32:$Rn)>; + +def : 
Pat<(i64 (vector_extract (v1i64 FPR64:$Rn), (i64 0))), + (FMOVxd FPR64:$Rn)>; + +def : Pat<(f64 (vector_extract (v1f64 FPR64:$Rn), (i64 0))), + (f64 FPR64:$Rn)>; + +def : Pat<(f32 (vector_extract (v1f32 FPR32:$Rn), (i64 0))), + (f32 FPR32:$Rn)>; + +def : Pat<(v1i8 (scalar_to_vector GPR32:$Rn)), + (v1i8 (EXTRACT_SUBREG (v16i8 + (INSbw (v16i8 (IMPLICIT_DEF)), $Rn, (i64 0))), + sub_8))>; + +def : Pat<(v1i16 (scalar_to_vector GPR32:$Rn)), + (v1i16 (EXTRACT_SUBREG (v8i16 + (INShw (v8i16 (IMPLICIT_DEF)), $Rn, (i64 0))), + sub_16))>; + +def : Pat<(v1i32 (scalar_to_vector GPR32:$src)), + (FMOVsw $src)>; + +def : Pat<(v1i64 (scalar_to_vector GPR64:$src)), + (FMOVdx $src)>; + +def : Pat<(v1f32 (scalar_to_vector (f32 FPR32:$Rn))), + (v1f32 FPR32:$Rn)>; +def : Pat<(v1f64 (scalar_to_vector (f64 FPR64:$Rn))), + (v1f64 FPR64:$Rn)>; + +def : Pat<(v1f64 (scalar_to_vector (f64 FPR64:$src))), + (FMOVdd $src)>; + +def : Pat<(v2f64 (scalar_to_vector (f64 FPR64:$src))), + (INSERT_SUBREG (v2f64 (IMPLICIT_DEF)), + (f64 FPR64:$src), sub_64)>; + +class NeonI_DUP_Elt + : NeonI_copy { + bits<4> Imm; +} + +def DUPELT16b : NeonI_DUP_Elt<0b1, "dup", ".16b", ".b", VPR128, + neon_uimm4_bare> { + let Inst{20-16} = {Imm{3}, Imm{2}, Imm{1}, Imm{0}, 0b1}; +} + +def DUPELT8h : NeonI_DUP_Elt<0b1, "dup", ".8h", ".h", VPR128, + neon_uimm3_bare> { + let Inst{20-16} = {Imm{2}, Imm{1}, Imm{0}, 0b1, 0b0}; +} + +def DUPELT4s : NeonI_DUP_Elt<0b1, "dup", ".4s", ".s", VPR128, + neon_uimm2_bare> { + let Inst{20-16} = {Imm{1}, Imm{0}, 0b1, 0b0, 0b0}; +} + +def DUPELT2d : NeonI_DUP_Elt<0b1, "dup", ".2d", ".d", VPR128, + neon_uimm1_bare> { + let Inst{20-16} = {Imm, 0b1, 0b0, 0b0, 0b0}; +} + +def DUPELT8b : NeonI_DUP_Elt<0b0, "dup", ".8b", ".b", VPR64, + neon_uimm4_bare> { + let Inst{20-16} = {Imm{3}, Imm{2}, Imm{1}, Imm{0}, 0b1}; +} + +def DUPELT4h : NeonI_DUP_Elt<0b0, "dup", ".4h", ".h", VPR64, + neon_uimm3_bare> { + let Inst{20-16} = {Imm{2}, Imm{1}, Imm{0}, 0b1, 0b0}; +} + +def DUPELT2s : NeonI_DUP_Elt<0b0, "dup", ".2s", ".s", VPR64, + neon_uimm2_bare> { + let Inst{20-16} = {Imm{1}, Imm{0}, 0b1, 0b0, 0b0}; +} + +multiclass NeonI_DUP_Elt_pattern { +def : Pat<(ResTy (Neon_vduplane (OpTy VPR128:$Rn), OpLImm:$Imm)), + (ResTy (DUPELT (OpTy VPR128:$Rn), OpLImm:$Imm))>; + +def : Pat<(ResTy (Neon_vduplane + (NaTy VPR64:$Rn), OpNImm:$Imm)), + (ResTy (DUPELT + (ExTy (SUBREG_TO_REG (i64 0), VPR64:$Rn, sub_64)), OpNImm:$Imm))>; +} +defm : NeonI_DUP_Elt_pattern; +defm : NeonI_DUP_Elt_pattern; +defm : NeonI_DUP_Elt_pattern; +defm : NeonI_DUP_Elt_pattern; +defm : NeonI_DUP_Elt_pattern; +defm : NeonI_DUP_Elt_pattern; +defm : NeonI_DUP_Elt_pattern; +defm : NeonI_DUP_Elt_pattern; +defm : NeonI_DUP_Elt_pattern; +defm : NeonI_DUP_Elt_pattern; + +def : Pat<(v2f32 (Neon_vdup (f32 FPR32:$Rn))), + (v2f32 (DUPELT2s + (SUBREG_TO_REG (i64 0), FPR32:$Rn, sub_32), + (i64 0)))>; +def : Pat<(v4f32 (Neon_vdup (f32 FPR32:$Rn))), + (v4f32 (DUPELT4s + (SUBREG_TO_REG (i64 0), FPR32:$Rn, sub_32), + (i64 0)))>; +def : Pat<(v2f64 (Neon_vdup (f64 FPR64:$Rn))), + (v2f64 (DUPELT2d + (SUBREG_TO_REG (i64 0), FPR64:$Rn, sub_64), + (i64 0)))>; + +class NeonI_DUP + : NeonI_copy; + +def DUP16b : NeonI_DUP<0b1, "dup", ".16b", VPR128, v16i8, GPR32, i32> { + let Inst{20-16} = 0b00001; + // bits 17-20 are unspecified, but should be set to zero. +} + +def DUP8h : NeonI_DUP<0b1, "dup", ".8h", VPR128, v8i16, GPR32, i32> { + let Inst{20-16} = 0b00010; + // bits 18-20 are unspecified, but should be set to zero. 
+} + +def DUP4s : NeonI_DUP<0b1, "dup", ".4s", VPR128, v4i32, GPR32, i32> { + let Inst{20-16} = 0b00100; + // bits 19-20 are unspecified, but should be set to zero. +} + +def DUP2d : NeonI_DUP<0b1, "dup", ".2d", VPR128, v2i64, GPR64, i64> { + let Inst{20-16} = 0b01000; + // bit 20 is unspecified, but should be set to zero. +} + +def DUP8b : NeonI_DUP<0b0, "dup", ".8b", VPR64, v8i8, GPR32, i32> { + let Inst{20-16} = 0b00001; + // bits 17-20 are unspecified, but should be set to zero. +} + +def DUP4h : NeonI_DUP<0b0, "dup", ".4h", VPR64, v4i16, GPR32, i32> { + let Inst{20-16} = 0b00010; + // bits 18-20 are unspecified, but should be set to zero. +} + +def DUP2s : NeonI_DUP<0b0, "dup", ".2s", VPR64, v2i32, GPR32, i32> { + let Inst{20-16} = 0b00100; + // bits 19-20 are unspecified, but should be set to zero. +} + +// patterns for CONCAT_VECTORS +multiclass Concat_Vector_Pattern { +def : Pat<(ResTy (concat_vectors (OpTy VPR64:$Rn), undef)), + (SUBREG_TO_REG (i64 0), VPR64:$Rn, sub_64)>; +def : Pat<(ResTy (concat_vectors (OpTy VPR64:$Rn), (OpTy VPR64:$Rm))), + (INSELd + (v2i64 (SUBREG_TO_REG (i64 0), VPR64:$Rn, sub_64)), + (v2i64 (SUBREG_TO_REG (i64 0), VPR64:$Rm, sub_64)), + (i64 1), + (i64 0))>; +def : Pat<(ResTy (concat_vectors (OpTy VPR64:$Rn), (OpTy VPR64:$Rn))), + (DUPELT2d + (v2i64 (SUBREG_TO_REG (i64 0), VPR64:$Rn, sub_64)), + (i64 0))> ; +} + +defm : Concat_Vector_Pattern; +defm : Concat_Vector_Pattern; +defm : Concat_Vector_Pattern; +defm : Concat_Vector_Pattern; +defm : Concat_Vector_Pattern; +defm : Concat_Vector_Pattern; + +//patterns for EXTRACT_SUBVECTOR +def : Pat<(v8i8 (extract_subvector (v16i8 VPR128:$Rn), (i64 0))), + (v8i8 (EXTRACT_SUBREG VPR128:$Rn, sub_64))>; +def : Pat<(v4i16 (extract_subvector (v8i16 VPR128:$Rn), (i64 0))), + (v4i16 (EXTRACT_SUBREG VPR128:$Rn, sub_64))>; +def : Pat<(v2i32 (extract_subvector (v4i32 VPR128:$Rn), (i64 0))), + (v2i32 (EXTRACT_SUBREG VPR128:$Rn, sub_64))>; +def : Pat<(v1i64 (extract_subvector (v2i64 VPR128:$Rn), (i64 0))), + (v1i64 (EXTRACT_SUBREG VPR128:$Rn, sub_64))>; +def : Pat<(v2f32 (extract_subvector (v4f32 VPR128:$Rn), (i64 0))), + (v2f32 (EXTRACT_SUBREG VPR128:$Rn, sub_64))>; +def : Pat<(v1f64 (extract_subvector (v2f64 VPR128:$Rn), (i64 0))), + (v1f64 (EXTRACT_SUBREG VPR128:$Rn, sub_64))>; + +// The followings are for instruction class (3V Elem) + +// Variant 1 + +class NI_2VE size, bits<4> opcode, + string asmop, string ResS, string OpS, string EleOpS, + Operand OpImm, RegisterOperand ResVPR, + RegisterOperand OpVPR, RegisterOperand EleOpVPR> + : NeonI_2VElem { + bits<3> Index; + bits<5> Re; + + let Constraints = "$src = $Rd"; +} + +multiclass NI_2VE_v1 opcode, string asmop> { + // vector register class for element is always 128-bit to cover the max index + def _2s4s : NI_2VE<0b0, u, 0b10, opcode, asmop, "2s", "2s", "s", + neon_uimm2_bare, VPR64, VPR64, VPR128> { + let Inst{11} = {Index{1}}; + let Inst{21} = {Index{0}}; + let Inst{20-16} = Re; + } + + def _4s4s : NI_2VE<0b1, u, 0b10, opcode, asmop, "4s", "4s", "s", + neon_uimm2_bare, VPR128, VPR128, VPR128> { + let Inst{11} = {Index{1}}; + let Inst{21} = {Index{0}}; + let Inst{20-16} = Re; + } + + // Index operations on 16-bit(H) elements are restricted to using v0-v15. 
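+  // (This follows from the encoding below: the element register of the H
+  // variants fits in only four bits, Re{3-0}, so something like
+  // "mla v0.8h, v1.8h, v16.h[2]" is not encodable; VPR128Lo enforces this.)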
+ def _4h8h : NI_2VE<0b0, u, 0b01, opcode, asmop, "4h", "4h", "h", + neon_uimm3_bare, VPR64, VPR64, VPR128Lo> { + let Inst{11} = {Index{2}}; + let Inst{21} = {Index{1}}; + let Inst{20} = {Index{0}}; + let Inst{19-16} = Re{3-0}; + } + + def _8h8h : NI_2VE<0b1, u, 0b01, opcode, asmop, "8h", "8h", "h", + neon_uimm3_bare, VPR128, VPR128, VPR128Lo> { + let Inst{11} = {Index{2}}; + let Inst{21} = {Index{1}}; + let Inst{20} = {Index{0}}; + let Inst{19-16} = Re{3-0}; + } +} + +defm MLAvve : NI_2VE_v1<0b1, 0b0000, "mla">; +defm MLSvve : NI_2VE_v1<0b1, 0b0100, "mls">; + +// Pattern for lane in 128-bit vector +class NI_2VE_laneq + : Pat<(ResTy (op (ResTy ResVPR:$src), (OpTy OpVPR:$Rn), + (OpTy (Neon_vduplane (EleOpTy EleOpVPR:$Re), (i64 OpImm:$Index))))), + (INST ResVPR:$src, OpVPR:$Rn, EleOpVPR:$Re, OpImm:$Index)>; + +// Pattern for lane in 64-bit vector +class NI_2VE_lane + : Pat<(ResTy (op (ResTy ResVPR:$src), (OpTy OpVPR:$Rn), + (OpTy (Neon_vduplane (EleOpTy EleOpVPR:$Re), (i64 OpImm:$Index))))), + (INST ResVPR:$src, OpVPR:$Rn, + (SUBREG_TO_REG (i64 0), EleOpVPR:$Re, sub_64), OpImm:$Index)>; + +multiclass NI_2VE_v1_pat +{ + def : NI_2VE_laneq(subop # "_2s4s"), neon_uimm2_bare, + op, VPR64, VPR64, VPR128, v2i32, v2i32, v4i32>; + + def : NI_2VE_laneq(subop # "_4s4s"), neon_uimm2_bare, + op, VPR128, VPR128, VPR128, v4i32, v4i32, v4i32>; + + def : NI_2VE_laneq(subop # "_4h8h"), neon_uimm3_bare, + op, VPR64, VPR64, VPR128Lo, v4i16, v4i16, v8i16>; + + def : NI_2VE_laneq(subop # "_8h8h"), neon_uimm3_bare, + op, VPR128, VPR128, VPR128Lo, v8i16, v8i16, v8i16>; + + // Index can only be half of the max value for lane in 64-bit vector + + def : NI_2VE_lane(subop # "_2s4s"), neon_uimm1_bare, + op, VPR64, VPR64, VPR64, v2i32, v2i32, v2i32>; + + def : NI_2VE_lane(subop # "_4h8h"), neon_uimm2_bare, + op, VPR64, VPR64, VPR64Lo, v4i16, v4i16, v4i16>; +} + +defm MLA_lane_v1 : NI_2VE_v1_pat<"MLAvve", Neon_mla>; +defm MLS_lane_v1 : NI_2VE_v1_pat<"MLSvve", Neon_mls>; + +class NI_2VE_2op size, bits<4> opcode, + string asmop, string ResS, string OpS, string EleOpS, + Operand OpImm, RegisterOperand ResVPR, + RegisterOperand OpVPR, RegisterOperand EleOpVPR> + : NeonI_2VElem { + bits<3> Index; + bits<5> Re; +} + +multiclass NI_2VE_v1_2op opcode, string asmop> { + // vector register class for element is always 128-bit to cover the max index + def _2s4s : NI_2VE_2op<0b0, u, 0b10, opcode, asmop, "2s", "2s", "s", + neon_uimm2_bare, VPR64, VPR64, VPR128> { + let Inst{11} = {Index{1}}; + let Inst{21} = {Index{0}}; + let Inst{20-16} = Re; + } + + def _4s4s : NI_2VE_2op<0b1, u, 0b10, opcode, asmop, "4s", "4s", "s", + neon_uimm2_bare, VPR128, VPR128, VPR128> { + let Inst{11} = {Index{1}}; + let Inst{21} = {Index{0}}; + let Inst{20-16} = Re; + } + + // Index operations on 16-bit(H) elements are restricted to using v0-v15. 
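+  // The same four-bit element-register limit applies to these two-operand
+  // forms; e.g. "mul v0.4h, v1.4h, v2.h[3]" may only name v0-v15 as the
+  // element operand.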
+ def _4h8h : NI_2VE_2op<0b0, u, 0b01, opcode, asmop, "4h", "4h", "h", + neon_uimm3_bare, VPR64, VPR64, VPR128Lo> { + let Inst{11} = {Index{2}}; + let Inst{21} = {Index{1}}; + let Inst{20} = {Index{0}}; + let Inst{19-16} = Re{3-0}; + } + + def _8h8h : NI_2VE_2op<0b1, u, 0b01, opcode, asmop, "8h", "8h", "h", + neon_uimm3_bare, VPR128, VPR128, VPR128Lo> { + let Inst{11} = {Index{2}}; + let Inst{21} = {Index{1}}; + let Inst{20} = {Index{0}}; + let Inst{19-16} = Re{3-0}; + } +} + +defm MULve : NI_2VE_v1_2op<0b0, 0b1000, "mul">; +defm SQDMULHve : NI_2VE_v1_2op<0b0, 0b1100, "sqdmulh">; +defm SQRDMULHve : NI_2VE_v1_2op<0b0, 0b1101, "sqrdmulh">; + +// Pattern for lane in 128-bit vector +class NI_2VE_mul_laneq + : Pat<(ResTy (op (OpTy OpVPR:$Rn), + (OpTy (Neon_vduplane (EleOpTy EleOpVPR:$Re), (i64 OpImm:$Index))))), + (INST OpVPR:$Rn, EleOpVPR:$Re, OpImm:$Index)>; + +// Pattern for lane in 64-bit vector +class NI_2VE_mul_lane + : Pat<(ResTy (op (OpTy OpVPR:$Rn), + (OpTy (Neon_vduplane (EleOpTy EleOpVPR:$Re), (i64 OpImm:$Index))))), + (INST OpVPR:$Rn, + (SUBREG_TO_REG (i64 0), EleOpVPR:$Re, sub_64), OpImm:$Index)>; + +multiclass NI_2VE_mul_v1_pat { + def : NI_2VE_mul_laneq(subop # "_2s4s"), neon_uimm2_bare, + op, VPR64, VPR128, v2i32, v2i32, v4i32>; + + def : NI_2VE_mul_laneq(subop # "_4s4s"), neon_uimm2_bare, + op, VPR128, VPR128, v4i32, v4i32, v4i32>; + + def : NI_2VE_mul_laneq(subop # "_4h8h"), neon_uimm3_bare, + op, VPR64, VPR128Lo, v4i16, v4i16, v8i16>; + + def : NI_2VE_mul_laneq(subop # "_8h8h"), neon_uimm3_bare, + op, VPR128, VPR128Lo, v8i16, v8i16, v8i16>; + + // Index can only be half of the max value for lane in 64-bit vector + + def : NI_2VE_mul_lane(subop # "_2s4s"), neon_uimm1_bare, + op, VPR64, VPR64, v2i32, v2i32, v2i32>; + + def : NI_2VE_mul_lane(subop # "_4h8h"), neon_uimm2_bare, + op, VPR64, VPR64Lo, v4i16, v4i16, v4i16>; +} + +defm MUL_lane_v1 : NI_2VE_mul_v1_pat<"MULve", mul>; +defm SQDMULH_lane_v1 : NI_2VE_mul_v1_pat<"SQDMULHve", int_arm_neon_vqdmulh>; +defm SQRDMULH_lane_v1 : NI_2VE_mul_v1_pat<"SQRDMULHve", int_arm_neon_vqrdmulh>; + +// Variant 2 + +multiclass NI_2VE_v2_2op opcode, string asmop> { + // vector register class for element is always 128-bit to cover the max index + def _2s4s : NI_2VE_2op<0b0, u, 0b10, opcode, asmop, "2s", "2s", "s", + neon_uimm2_bare, VPR64, VPR64, VPR128> { + let Inst{11} = {Index{1}}; + let Inst{21} = {Index{0}}; + let Inst{20-16} = Re; + } + + def _4s4s : NI_2VE_2op<0b1, u, 0b10, opcode, asmop, "4s", "4s", "s", + neon_uimm2_bare, VPR128, VPR128, VPR128> { + let Inst{11} = {Index{1}}; + let Inst{21} = {Index{0}}; + let Inst{20-16} = Re; + } + + // _1d2d doesn't exist! 
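+  // (The ISA covers the single d-lane case with the scalar by-element
+  // fmul/fmulx instructions instead, so only the 2d vector variant follows.)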
+ + def _2d2d : NI_2VE_2op<0b1, u, 0b11, opcode, asmop, "2d", "2d", "d", + neon_uimm1_bare, VPR128, VPR128, VPR128> { + let Inst{11} = {Index{0}}; + let Inst{21} = 0b0; + let Inst{20-16} = Re; + } +} + +defm FMULve : NI_2VE_v2_2op<0b0, 0b1001, "fmul">; +defm FMULXve : NI_2VE_v2_2op<0b1, 0b1001, "fmulx">; + +class NI_2VE_mul_lane_2d + : Pat<(ResTy (op (OpTy OpVPR:$Rn), + (OpTy (coreop (EleOpTy EleOpVPR:$Re), (EleOpTy EleOpVPR:$Re))))), + (INST OpVPR:$Rn, + (SUBREG_TO_REG (i64 0), EleOpVPR:$Re, sub_64), 0)>; + +multiclass NI_2VE_mul_v2_pat { + def : NI_2VE_mul_laneq(subop # "_2s4s"), neon_uimm2_bare, + op, VPR64, VPR128, v2f32, v2f32, v4f32>; + + def : NI_2VE_mul_laneq(subop # "_4s4s"), neon_uimm2_bare, + op, VPR128, VPR128, v4f32, v4f32, v4f32>; + + def : NI_2VE_mul_laneq(subop # "_2d2d"), neon_uimm1_bare, + op, VPR128, VPR128, v2f64, v2f64, v2f64>; + + // Index can only be half of the max value for lane in 64-bit vector + + def : NI_2VE_mul_lane(subop # "_2s4s"), neon_uimm1_bare, + op, VPR64, VPR64, v2f32, v2f32, v2f32>; + + def : NI_2VE_mul_lane_2d(subop # "_2d2d"), neon_uimm1_bare, + op, VPR128, VPR64, v2f64, v2f64, v1f64, + BinOpFrag<(Neon_combine_2d node:$LHS, node:$RHS)>>; +} + +defm FMUL_lane_v2 : NI_2VE_mul_v2_pat<"FMULve", fmul>; +defm FMULX_lane_v2 : NI_2VE_mul_v2_pat<"FMULXve", int_aarch64_neon_vmulx>; + +def : Pat<(v2f32 (fmul (v2f32 (Neon_vdup (f32 FPR32:$Re))), + (v2f32 VPR64:$Rn))), + (FMULve_2s4s VPR64:$Rn, (SUBREG_TO_REG (i32 0), $Re, sub_32), 0)>; + +def : Pat<(v4f32 (fmul (v4f32 (Neon_vdup (f32 FPR32:$Re))), + (v4f32 VPR128:$Rn))), + (FMULve_4s4s VPR128:$Rn, (SUBREG_TO_REG (i32 0), $Re, sub_32), 0)>; + +def : Pat<(v2f64 (fmul (v2f64 (Neon_vdup (f64 FPR64:$Re))), + (v2f64 VPR128:$Rn))), + (FMULve_2d2d VPR128:$Rn, (SUBREG_TO_REG (i64 0), $Re, sub_64), 0)>; + +// The followings are patterns using fma +// -ffp-contract=fast generates fma + +multiclass NI_2VE_v2 opcode, string asmop> { + // vector register class for element is always 128-bit to cover the max index + def _2s4s : NI_2VE<0b0, u, 0b10, opcode, asmop, "2s", "2s", "s", + neon_uimm2_bare, VPR64, VPR64, VPR128> { + let Inst{11} = {Index{1}}; + let Inst{21} = {Index{0}}; + let Inst{20-16} = Re; + } + + def _4s4s : NI_2VE<0b1, u, 0b10, opcode, asmop, "4s", "4s", "s", + neon_uimm2_bare, VPR128, VPR128, VPR128> { + let Inst{11} = {Index{1}}; + let Inst{21} = {Index{0}}; + let Inst{20-16} = Re; + } + + // _1d2d doesn't exist! 
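+  // For example, "fmla v0.4s, v1.4s, v2.s[3]" computes, for each lane i,
+  // v0[i] += v1[i] * v2[3] as a fused multiply-add with no intermediate
+  // rounding.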
+ + def _2d2d : NI_2VE<0b1, u, 0b11, opcode, asmop, "2d", "2d", "d", + neon_uimm1_bare, VPR128, VPR128, VPR128> { + let Inst{11} = {Index{0}}; + let Inst{21} = 0b0; + let Inst{20-16} = Re; + } +} + +defm FMLAvve : NI_2VE_v2<0b0, 0b0001, "fmla">; +defm FMLSvve : NI_2VE_v2<0b0, 0b0101, "fmls">; + +// Pattern for lane in 128-bit vector +class NI_2VEswap_laneq + : Pat<(ResTy (op (ResTy (coreop (OpTy OpVPR:$Re), (i64 OpImm:$Index))), + (ResTy ResVPR:$src), (ResTy ResVPR:$Rn))), + (INST ResVPR:$src, ResVPR:$Rn, OpVPR:$Re, OpImm:$Index)>; + +// Pattern for lane 0 +class NI_2VEfma_lane0 + : Pat<(ResTy (op (ResTy ResVPR:$Rn), + (ResTy (Neon_vdup (f32 FPR32:$Re))), + (ResTy ResVPR:$src))), + (INST ResVPR:$src, ResVPR:$Rn, + (SUBREG_TO_REG (i32 0), $Re, sub_32), 0)>; + +// Pattern for lane in 64-bit vector +class NI_2VEswap_lane + : Pat<(ResTy (op (ResTy (coreop (OpTy OpVPR:$Re), (i64 OpImm:$Index))), + (ResTy ResVPR:$Rn), (ResTy ResVPR:$src))), + (INST ResVPR:$src, ResVPR:$Rn, + (SUBREG_TO_REG (i64 0), OpVPR:$Re, sub_64), OpImm:$Index)>; + +// Pattern for lane in 64-bit vector +class NI_2VEswap_lane_2d2d + : Pat<(ResTy (op (ResTy (coreop (OpTy OpVPR:$Re), (OpTy OpVPR:$Re))), + (ResTy ResVPR:$Rn), (ResTy ResVPR:$src))), + (INST ResVPR:$src, ResVPR:$Rn, + (SUBREG_TO_REG (i64 0), OpVPR:$Re, sub_64), 0)>; + + +multiclass NI_2VE_fma_v2_pat { + def : NI_2VEswap_laneq(subop # "_2s4s"), + neon_uimm2_bare, op, VPR64, VPR128, v2f32, v4f32, + BinOpFrag<(Neon_vduplane node:$LHS, node:$RHS)>>; + + def : NI_2VEfma_lane0(subop # "_2s4s"), + op, VPR64, v2f32>; + + def : NI_2VEswap_laneq(subop # "_4s4s"), + neon_uimm2_bare, op, VPR128, VPR128, v4f32, v4f32, + BinOpFrag<(Neon_vduplane node:$LHS, node:$RHS)>>; + + def : NI_2VEfma_lane0(subop # "_4s4s"), + op, VPR128, v4f32>; + + def : NI_2VEswap_laneq(subop # "_2d2d"), + neon_uimm1_bare, op, VPR128, VPR128, v2f64, v2f64, + BinOpFrag<(Neon_vduplane node:$LHS, node:$RHS)>>; + + // Index can only be half of the max value for lane in 64-bit vector + + def : NI_2VEswap_lane(subop # "_2s4s"), + neon_uimm1_bare, op, VPR64, VPR64, v2f32, v2f32, + BinOpFrag<(Neon_vduplane node:$LHS, node:$RHS)>>; + + def : NI_2VEswap_lane_2d2d(subop # "_2d2d"), + neon_uimm1_bare, op, VPR128, VPR64, v2f64, v1f64, + BinOpFrag<(Neon_combine_2d node:$LHS, node:$RHS)>>; +} + +defm FMLA_lane_v2_s : NI_2VE_fma_v2_pat<"FMLAvve", fma>; + +// Pattern for lane 0 +class NI_2VEfms_lane0 + : Pat<(ResTy (op (ResTy (fneg ResVPR:$Rn)), + (ResTy (Neon_vdup (f32 FPR32:$Re))), + (ResTy ResVPR:$src))), + (INST ResVPR:$src, ResVPR:$Rn, + (SUBREG_TO_REG (i32 0), $Re, sub_32), 0)>; + +multiclass NI_2VE_fms_v2_pat +{ + def : NI_2VEswap_laneq(subop # "_2s4s"), + neon_uimm2_bare, op, VPR64, VPR128, v2f32, v4f32, + BinOpFrag<(fneg (Neon_vduplane node:$LHS, node:$RHS))>>; + + def : NI_2VEswap_laneq(subop # "_2s4s"), + neon_uimm2_bare, op, VPR64, VPR128, v2f32, v4f32, + BinOpFrag<(Neon_vduplane + (fneg node:$LHS), node:$RHS)>>; + + def : NI_2VEfms_lane0(subop # "_2s4s"), + op, VPR64, v2f32>; + + def : NI_2VEswap_laneq(subop # "_4s4s"), + neon_uimm2_bare, op, VPR128, VPR128, v4f32, v4f32, + BinOpFrag<(fneg (Neon_vduplane + node:$LHS, node:$RHS))>>; + + def : NI_2VEswap_laneq(subop # "_4s4s"), + neon_uimm2_bare, op, VPR128, VPR128, v4f32, v4f32, + BinOpFrag<(Neon_vduplane + (fneg node:$LHS), node:$RHS)>>; + + def : NI_2VEfms_lane0(subop # "_4s4s"), + op, VPR128, v4f32>; + + def : NI_2VEswap_laneq(subop # "_2d2d"), + neon_uimm1_bare, op, VPR128, VPR128, v2f64, v2f64, + BinOpFrag<(fneg (Neon_vduplane + node:$LHS, 
node:$RHS))>>; + + def : NI_2VEswap_laneq(subop # "_2d2d"), + neon_uimm1_bare, op, VPR128, VPR128, v2f64, v2f64, + BinOpFrag<(Neon_vduplane + (fneg node:$LHS), node:$RHS)>>; + + // Index can only be half of the max value for lane in 64-bit vector + + def : NI_2VEswap_lane(subop # "_2s4s"), + neon_uimm1_bare, op, VPR64, VPR64, v2f32, v2f32, + BinOpFrag<(fneg (Neon_vduplane + node:$LHS, node:$RHS))>>; def : NI_2VEswap_lane(subop # "_2s4s"), neon_uimm1_bare, op, VPR64, VPR64, v2f32, v2f32, @@ -4125,15 +7267,11 @@ multiclass NI_2VE_fms_v2_pat def : NI_2VEswap_lane(subop # "_4s4s"), neon_uimm1_bare, op, VPR128, VPR64, v4f32, v2f32, - BinOpFrag<(fneg (Neon_vduplane - (Neon_combine_4f node:$LHS, undef), - node:$RHS))>>; + BinOpFrag<(fneg (Neon_vduplane node:$LHS, node:$RHS))>>; def : NI_2VEswap_lane(subop # "_4s4s"), neon_uimm1_bare, op, VPR128, VPR64, v4f32, v2f32, - BinOpFrag<(Neon_vduplane - (Neon_combine_4f (fneg node:$LHS), undef), - node:$RHS)>>; + BinOpFrag<(Neon_vduplane (fneg node:$LHS), node:$RHS)>>; def : NI_2VEswap_lane_2d2d(subop # "_2d2d"), neon_uimm1_bare, op, VPR128, VPR64, v2f64, v1f64, @@ -4152,8 +7290,7 @@ defm FMLS_lane_v2_s : NI_2VE_fms_v2_pat<"FMLSvve", fma>; // E.g. SMLAL : 4S/4H/H (v0-v15), 2D/2S/S // SMLAL2: 4S/8H/H (v0-v15), 2D/4S/S -multiclass NI_2VE_v3 opcode, string asmop> -{ +multiclass NI_2VE_v3 opcode, string asmop> { // vector register class for element is always 128-bit to cover the max index def _2d2s : NI_2VE<0b0, u, 0b10, opcode, asmop, "2d", "2s", "s", neon_uimm2_bare, VPR128, VPR64, VPR128> { @@ -4161,7 +7298,7 @@ multiclass NI_2VE_v3 opcode, string asmop> let Inst{21} = {Index{0}}; let Inst{20-16} = Re; } - + def _2d4s : NI_2VE<0b1, u, 0b10, opcode, asmop # "2", "2d", "4s", "s", neon_uimm2_bare, VPR128, VPR128, VPR128> { let Inst{11} = {Index{1}}; @@ -4177,7 +7314,7 @@ multiclass NI_2VE_v3 opcode, string asmop> let Inst{20} = {Index{0}}; let Inst{19-16} = Re{3-0}; } - + def _4s4h : NI_2VE<0b0, u, 0b01, opcode, asmop, "4s", "4h", "h", neon_uimm3_bare, VPR128, VPR64, VPR128Lo> { let Inst{11} = {Index{2}}; @@ -4194,8 +7331,7 @@ defm UMLSLvve : NI_2VE_v3<0b1, 0b0110, "umlsl">; defm SQDMLALvve : NI_2VE_v3<0b0, 0b0011, "sqdmlal">; defm SQDMLSLvve : NI_2VE_v3<0b0, 0b0111, "sqdmlsl">; -multiclass NI_2VE_v3_2op opcode, string asmop> -{ +multiclass NI_2VE_v3_2op opcode, string asmop> { // vector register class for element is always 128-bit to cover the max index def _2d2s : NI_2VE_2op<0b0, u, 0b10, opcode, asmop, "2d", "2s", "s", neon_uimm2_bare, VPR128, VPR64, VPR128> { @@ -4203,7 +7339,7 @@ multiclass NI_2VE_v3_2op opcode, string asmop> let Inst{21} = {Index{0}}; let Inst{20-16} = Re; } - + def _2d4s : NI_2VE_2op<0b1, u, 0b10, opcode, asmop # "2", "2d", "4s", "s", neon_uimm2_bare, VPR128, VPR128, VPR128> { let Inst{11} = {Index{1}}; @@ -4219,7 +7355,7 @@ multiclass NI_2VE_v3_2op opcode, string asmop> let Inst{20} = {Index{0}}; let Inst{19-16} = Re{3-0}; } - + def _4s4h : NI_2VE_2op<0b0, u, 0b01, opcode, asmop, "4s", "4h", "h", neon_uimm3_bare, VPR128, VPR64, VPR128Lo> { let Inst{11} = {Index{2}}; @@ -4233,66 +7369,74 @@ defm SMULLve : NI_2VE_v3_2op<0b0, 0b1010, "smull">; defm UMULLve : NI_2VE_v3_2op<0b1, 0b1010, "umull">; defm SQDMULLve : NI_2VE_v3_2op<0b0, 0b1011, "sqdmull">; +def : Pat<(v1f64 (scalar_to_vector (f64 FPR64:$src))), + (FMOVdd $src)>; +def : Pat<(v1f32 (scalar_to_vector (f32 FPR32:$src))), + (FMOVss $src)>; + // Pattern for lane in 128-bit vector class NI_2VEL2_laneq + SDPatternOperator hiop> : Pat<(ResTy (op (ResTy VPR128:$src), (HalfOpTy (hiop 
(OpTy VPR128:$Rn))), - (HalfOpTy (coreop (EleOpTy EleOpVPR:$Re), (i64 OpImm:$Index))))), + (HalfOpTy (Neon_vduplane + (EleOpTy EleOpVPR:$Re), (i64 OpImm:$Index))))), (INST VPR128:$src, VPR128:$Rn, EleOpVPR:$Re, OpImm:$Index)>; // Pattern for lane in 64-bit vector class NI_2VEL2_lane + SDPatternOperator hiop> : Pat<(ResTy (op (ResTy VPR128:$src), (HalfOpTy (hiop (OpTy VPR128:$Rn))), - (HalfOpTy (coreop (EleOpTy EleOpVPR:$Re), (i64 OpImm:$Index))))), - (INST VPR128:$src, VPR128:$Rn, + (HalfOpTy (Neon_vduplane + (EleOpTy EleOpVPR:$Re), (i64 OpImm:$Index))))), + (INST VPR128:$src, VPR128:$Rn, (SUBREG_TO_REG (i64 0), EleOpVPR:$Re, sub_64), OpImm:$Index)>; -multiclass NI_2VEL_v3_pat -{ +class NI_2VEL2_lane0 + : Pat<(ResTy (op (ResTy VPR128:$src), + (HalfOpTy (hiop (OpTy VPR128:$Rn))), + (HalfOpTy (Neon_vdup (i32 GPR32:$Re))))), + (INST VPR128:$src, VPR128:$Rn, (DupInst $Re), 0)>; + +multiclass NI_2VEL_v3_pat { def : NI_2VE_laneq(subop # "_4s4h"), neon_uimm3_bare, - op, VPR128, VPR64, VPR128Lo, v4i32, v4i16, v8i16, - BinOpFrag<(Neon_vduplane - (Neon_low8H node:$LHS), node:$RHS)>>; - + op, VPR128, VPR64, VPR128Lo, v4i32, v4i16, v8i16>; + def : NI_2VE_laneq(subop # "_2d2s"), neon_uimm2_bare, - op, VPR128, VPR64, VPR128, v2i64, v2i32, v4i32, - BinOpFrag<(Neon_vduplane - (Neon_low4S node:$LHS), node:$RHS)>>; - + op, VPR128, VPR64, VPR128, v2i64, v2i32, v4i32>; + def : NI_2VEL2_laneq(subop # "_4s8h"), neon_uimm3_bare, - op, VPR128Lo, v4i32, v8i16, v8i16, v4i16, Neon_High8H, - BinOpFrag<(Neon_vduplane - (Neon_low8H node:$LHS), node:$RHS)>>; - + op, VPR128Lo, v4i32, v8i16, v8i16, v4i16, Neon_High8H>; + def : NI_2VEL2_laneq(subop # "_2d4s"), neon_uimm2_bare, - op, VPR128, v2i64, v4i32, v4i32, v2i32, Neon_High4S, - BinOpFrag<(Neon_vduplane - (Neon_low4S node:$LHS), node:$RHS)>>; - + op, VPR128, v2i64, v4i32, v4i32, v2i32, Neon_High4S>; + + def : NI_2VEL2_lane0(subop # "_4s8h"), + op, v4i32, v8i16, v4i16, Neon_High8H, DUP8h>; + + def : NI_2VEL2_lane0(subop # "_2d4s"), + op, v2i64, v4i32, v2i32, Neon_High4S, DUP4s>; + // Index can only be half of the max value for lane in 64-bit vector def : NI_2VE_lane(subop # "_4s4h"), neon_uimm2_bare, - op, VPR128, VPR64, VPR64Lo, v4i32, v4i16, v4i16, - BinOpFrag<(Neon_vduplane node:$LHS, node:$RHS)>>; - + op, VPR128, VPR64, VPR64Lo, v4i32, v4i16, v4i16>; + def : NI_2VE_lane(subop # "_2d2s"), neon_uimm1_bare, - op, VPR128, VPR64, VPR64, v2i64, v2i32, v2i32, - BinOpFrag<(Neon_vduplane node:$LHS, node:$RHS)>>; + op, VPR128, VPR64, VPR64, v2i64, v2i32, v2i32>; def : NI_2VEL2_lane(subop # "_4s8h"), neon_uimm2_bare, - op, VPR64Lo, v4i32, v8i16, v4i16, v4i16, Neon_High8H, - BinOpFrag<(Neon_vduplane node:$LHS, node:$RHS)>>; - + op, VPR64Lo, v4i32, v8i16, v4i16, v4i16, Neon_High8H>; + def : NI_2VEL2_lane(subop # "_2d4s"), neon_uimm1_bare, - op, VPR64, v2i64, v4i32, v2i32, v2i32, Neon_High4S, - BinOpFrag<(Neon_vduplane node:$LHS, node:$RHS)>>; + op, VPR64, v2i64, v4i32, v2i32, v2i32, Neon_High4S>; } defm SMLAL_lane_v3 : NI_2VEL_v3_pat<"SMLALvve", Neon_smlal>; @@ -4304,71 +7448,73 @@ defm UMLSL_lane_v3 : NI_2VEL_v3_pat<"UMLSLvve", Neon_umlsl>; class NI_2VEL2_mul_laneq - : Pat<(ResTy (op + SDPatternOperator hiop> + : Pat<(ResTy (op (HalfOpTy (hiop (OpTy VPR128:$Rn))), - (HalfOpTy (coreop (EleOpTy EleOpVPR:$Re), (i64 OpImm:$Index))))), + (HalfOpTy (Neon_vduplane + (EleOpTy EleOpVPR:$Re), (i64 OpImm:$Index))))), (INST VPR128:$Rn, EleOpVPR:$Re, OpImm:$Index)>; // Pattern for lane in 64-bit vector class NI_2VEL2_mul_lane + SDPatternOperator hiop> : Pat<(ResTy (op (HalfOpTy (hiop (OpTy 
VPR128:$Rn))), - (HalfOpTy (coreop (EleOpTy EleOpVPR:$Re), (i64 OpImm:$Index))))), - (INST VPR128:$Rn, + (HalfOpTy (Neon_vduplane + (EleOpTy EleOpVPR:$Re), (i64 OpImm:$Index))))), + (INST VPR128:$Rn, (SUBREG_TO_REG (i64 0), EleOpVPR:$Re, sub_64), OpImm:$Index)>; -multiclass NI_2VEL_mul_v3_pat -{ +// Pattern for fixed lane 0 +class NI_2VEL2_mul_lane0 + : Pat<(ResTy (op + (HalfOpTy (hiop (OpTy VPR128:$Rn))), + (HalfOpTy (Neon_vdup (i32 GPR32:$Re))))), + (INST VPR128:$Rn, (DupInst $Re), 0)>; + +multiclass NI_2VEL_mul_v3_pat { def : NI_2VE_mul_laneq(subop # "_4s4h"), neon_uimm3_bare, - op, VPR64, VPR128Lo, v4i32, v4i16, v8i16, - BinOpFrag<(Neon_vduplane - (Neon_low8H node:$LHS), node:$RHS)>>; + op, VPR64, VPR128Lo, v4i32, v4i16, v8i16>; def : NI_2VE_mul_laneq(subop # "_2d2s"), neon_uimm2_bare, - op, VPR64, VPR128, v2i64, v2i32, v4i32, - BinOpFrag<(Neon_vduplane - (Neon_low4S node:$LHS), node:$RHS)>>; + op, VPR64, VPR128, v2i64, v2i32, v4i32>; def : NI_2VEL2_mul_laneq(subop # "_4s8h"), neon_uimm3_bare, - op, VPR128Lo, v4i32, v8i16, v8i16, v4i16, - Neon_High8H, - BinOpFrag<(Neon_vduplane - (Neon_low8H node:$LHS), node:$RHS)>>; - + op, VPR128Lo, v4i32, v8i16, v8i16, v4i16, Neon_High8H>; + def : NI_2VEL2_mul_laneq(subop # "_2d4s"), neon_uimm2_bare, - op, VPR128, v2i64, v4i32, v4i32, v2i32, Neon_High4S, - BinOpFrag<(Neon_vduplane - (Neon_low4S node:$LHS), node:$RHS)>>; - + op, VPR128, v2i64, v4i32, v4i32, v2i32, Neon_High4S>; + + def : NI_2VEL2_mul_lane0(subop # "_4s8h"), + op, v4i32, v8i16, v4i16, Neon_High8H, DUP8h>; + + def : NI_2VEL2_mul_lane0(subop # "_2d4s"), + op, v2i64, v4i32, v2i32, Neon_High4S, DUP4s>; + // Index can only be half of the max value for lane in 64-bit vector def : NI_2VE_mul_lane(subop # "_4s4h"), neon_uimm2_bare, - op, VPR64, VPR64Lo, v4i32, v4i16, v4i16, - BinOpFrag<(Neon_vduplane node:$LHS, node:$RHS)>>; + op, VPR64, VPR64Lo, v4i32, v4i16, v4i16>; def : NI_2VE_mul_lane(subop # "_2d2s"), neon_uimm1_bare, - op, VPR64, VPR64, v2i64, v2i32, v2i32, - BinOpFrag<(Neon_vduplane node:$LHS, node:$RHS)>>; + op, VPR64, VPR64, v2i64, v2i32, v2i32>; def : NI_2VEL2_mul_lane(subop # "_4s8h"), neon_uimm2_bare, - op, VPR64Lo, v4i32, v8i16, v4i16, v4i16, Neon_High8H, - BinOpFrag<(Neon_vduplane node:$LHS, node:$RHS)>>; - + op, VPR64Lo, v4i32, v8i16, v4i16, v4i16, Neon_High8H>; + def : NI_2VEL2_mul_lane(subop # "_2d4s"), neon_uimm1_bare, - op, VPR64, v2i64, v4i32, v2i32, v2i32, Neon_High4S, - BinOpFrag<(Neon_vduplane node:$LHS, node:$RHS)>>; + op, VPR64, v2i64, v4i32, v2i32, v2i32, Neon_High4S>; } defm SMULL_lane_v3 : NI_2VEL_mul_v3_pat<"SMULLve", int_arm_neon_vmulls>; defm UMULL_lane_v3 : NI_2VEL_mul_v3_pat<"UMULLve", int_arm_neon_vmullu>; defm SQDMULL_lane_v3 : NI_2VEL_mul_v3_pat<"SQDMULLve", int_arm_neon_vqdmull>; -multiclass NI_qdma -{ +multiclass NI_qdma { def _4s : PatFrag<(ops node:$Ra, node:$Rn, node:$Rm), (op node:$Ra, (v4i32 (int_arm_neon_vqdmull node:$Rn, node:$Rm)))>; @@ -4381,409 +7527,1017 @@ multiclass NI_qdma defm Neon_qdmlal : NI_qdma; defm Neon_qdmlsl : NI_qdma; -multiclass NI_2VEL_v3_qdma_pat -{ +multiclass NI_2VEL_v3_qdma_pat { def : NI_2VE_laneq(subop # "_4s4h"), neon_uimm3_bare, !cast(op # "_4s"), VPR128, VPR64, VPR128Lo, - v4i32, v4i16, v8i16, - BinOpFrag<(Neon_vduplane - (Neon_low8H node:$LHS), node:$RHS)>>; - + v4i32, v4i16, v8i16>; + def : NI_2VE_laneq(subop # "_2d2s"), neon_uimm2_bare, !cast(op # "_2d"), VPR128, VPR64, VPR128, - v2i64, v2i32, v4i32, - BinOpFrag<(Neon_vduplane - (Neon_low4S node:$LHS), node:$RHS)>>; - + v2i64, v2i32, v4i32>; + def : 
NI_2VEL2_laneq(subop # "_4s8h"), neon_uimm3_bare, !cast(op # "_4s"), VPR128Lo, - v4i32, v8i16, v8i16, v4i16, Neon_High8H, - BinOpFrag<(Neon_vduplane - (Neon_low8H node:$LHS), node:$RHS)>>; - + v4i32, v8i16, v8i16, v4i16, Neon_High8H>; + def : NI_2VEL2_laneq(subop # "_2d4s"), neon_uimm2_bare, !cast(op # "_2d"), VPR128, - v2i64, v4i32, v4i32, v2i32, Neon_High4S, - BinOpFrag<(Neon_vduplane - (Neon_low4S node:$LHS), node:$RHS)>>; - + v2i64, v4i32, v4i32, v2i32, Neon_High4S>; + + def : NI_2VEL2_lane0(subop # "_4s8h"), + !cast(op # "_4s"), + v4i32, v8i16, v4i16, Neon_High8H, DUP8h>; + + def : NI_2VEL2_lane0(subop # "_2d4s"), + !cast(op # "_2d"), + v2i64, v4i32, v2i32, Neon_High4S, DUP4s>; + // Index can only be half of the max value for lane in 64-bit vector def : NI_2VE_lane(subop # "_4s4h"), neon_uimm2_bare, !cast(op # "_4s"), VPR128, VPR64, VPR64Lo, - v4i32, v4i16, v4i16, - BinOpFrag<(Neon_vduplane node:$LHS, node:$RHS)>>; - + v4i32, v4i16, v4i16>; + def : NI_2VE_lane(subop # "_2d2s"), neon_uimm1_bare, !cast(op # "_2d"), VPR128, VPR64, VPR64, - v2i64, v2i32, v2i32, - BinOpFrag<(Neon_vduplane node:$LHS, node:$RHS)>>; + v2i64, v2i32, v2i32>; def : NI_2VEL2_lane(subop # "_4s8h"), neon_uimm2_bare, !cast(op # "_4s"), VPR64Lo, - v4i32, v8i16, v4i16, v4i16, Neon_High8H, - BinOpFrag<(Neon_vduplane node:$LHS, node:$RHS)>>; - + v4i32, v8i16, v4i16, v4i16, Neon_High8H>; + def : NI_2VEL2_lane(subop # "_2d4s"), neon_uimm1_bare, !cast(op # "_2d"), VPR64, - v2i64, v4i32, v2i32, v2i32, Neon_High4S, - BinOpFrag<(Neon_vduplane node:$LHS, node:$RHS)>>; + v2i64, v4i32, v2i32, v2i32, Neon_High4S>; } defm SQDMLAL_lane_v3 : NI_2VEL_v3_qdma_pat<"SQDMLALvve", "Neon_qdmlal">; defm SQDMLSL_lane_v3 : NI_2VEL_v3_qdma_pat<"SQDMLSLvve", "Neon_qdmlsl">; // End of implementation for instruction class (3V Elem) - -//Insert element (vector, from main) -def INSbw : NeonI_INS_main<"ins", "b", v16i8, GPR32, i32, - neon_uimm4_bare> { - let Inst{20-16} = {Imm{3}, Imm{2}, Imm{1}, Imm{0}, 0b1}; -} -def INShw : NeonI_INS_main<"ins", "h", v8i16, GPR32, i32, - neon_uimm3_bare> { - let Inst{20-16} = {Imm{2}, Imm{1}, Imm{0}, 0b1, 0b0}; -} -def INSsw : NeonI_INS_main<"ins", "s", v4i32, GPR32, i32, - neon_uimm2_bare> { - let Inst{20-16} = {Imm{1}, Imm{0}, 0b1, 0b0, 0b0}; -} -def INSdx : NeonI_INS_main<"ins", "d", v2i64, GPR64, i64, - neon_uimm1_bare> { - let Inst{20-16} = {Imm, 0b1, 0b0, 0b0, 0b0}; + +class NeonI_REV size, bit Q, bit U, + bits<5> opcode, RegisterOperand ResVPR, ValueType ResTy, + SDPatternOperator Neon_Rev> + : NeonI_2VMisc ; + +def REV64_16b : NeonI_REV<"rev64", "16b", 0b00, 0b1, 0b0, 0b00000, VPR128, + v16i8, Neon_rev64>; +def REV64_8h : NeonI_REV<"rev64", "8h", 0b01, 0b1, 0b0, 0b00000, VPR128, + v8i16, Neon_rev64>; +def REV64_4s : NeonI_REV<"rev64", "4s", 0b10, 0b1, 0b0, 0b00000, VPR128, + v4i32, Neon_rev64>; +def REV64_8b : NeonI_REV<"rev64", "8b", 0b00, 0b0, 0b0, 0b00000, VPR64, + v8i8, Neon_rev64>; +def REV64_4h : NeonI_REV<"rev64", "4h", 0b01, 0b0, 0b0, 0b00000, VPR64, + v4i16, Neon_rev64>; +def REV64_2s : NeonI_REV<"rev64", "2s", 0b10, 0b0, 0b0, 0b00000, VPR64, + v2i32, Neon_rev64>; + +def : Pat<(v4f32 (Neon_rev64 (v4f32 VPR128:$Rn))), (REV64_4s VPR128:$Rn)>; +def : Pat<(v2f32 (Neon_rev64 (v2f32 VPR64:$Rn))), (REV64_2s VPR64:$Rn)>; + +def REV32_16b : NeonI_REV<"rev32", "16b", 0b00, 0b1, 0b1, 0b00000, VPR128, + v16i8, Neon_rev32>; +def REV32_8h : NeonI_REV<"rev32", "8h", 0b01, 0b1, 0b1, 0b00000, VPR128, + v8i16, Neon_rev32>; +def REV32_8b : NeonI_REV<"rev32", "8b", 0b00, 0b0, 0b1, 0b00000, VPR64, + v8i8, Neon_rev32>; 
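+// For example, "rev32 v0.8b, v1.8b" reverses the byte order within each
+// 32-bit word, so byte lanes {0,1,2,3} of each word become {3,2,1,0}.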
+def REV32_4h : NeonI_REV<"rev32", "4h", 0b01, 0b0, 0b1, 0b00000, VPR64, + v4i16, Neon_rev32>; + +def REV16_16b : NeonI_REV<"rev16", "16b", 0b00, 0b1, 0b0, 0b00001, VPR128, + v16i8, Neon_rev16>; +def REV16_8b : NeonI_REV<"rev16", "8b", 0b00, 0b0, 0b0, 0b00001, VPR64, + v8i8, Neon_rev16>; + +multiclass NeonI_PairwiseAdd opcode, + SDPatternOperator Neon_Padd> { + def 16b8h : NeonI_2VMisc<0b1, U, 0b00, opcode, + (outs VPR128:$Rd), (ins VPR128:$Rn), + asmop # "\t$Rd.8h, $Rn.16b", + [(set (v8i16 VPR128:$Rd), + (v8i16 (Neon_Padd (v16i8 VPR128:$Rn))))], + NoItinerary>; + + def 8b4h : NeonI_2VMisc<0b0, U, 0b00, opcode, + (outs VPR64:$Rd), (ins VPR64:$Rn), + asmop # "\t$Rd.4h, $Rn.8b", + [(set (v4i16 VPR64:$Rd), + (v4i16 (Neon_Padd (v8i8 VPR64:$Rn))))], + NoItinerary>; + + def 8h4s : NeonI_2VMisc<0b1, U, 0b01, opcode, + (outs VPR128:$Rd), (ins VPR128:$Rn), + asmop # "\t$Rd.4s, $Rn.8h", + [(set (v4i32 VPR128:$Rd), + (v4i32 (Neon_Padd (v8i16 VPR128:$Rn))))], + NoItinerary>; + + def 4h2s : NeonI_2VMisc<0b0, U, 0b01, opcode, + (outs VPR64:$Rd), (ins VPR64:$Rn), + asmop # "\t$Rd.2s, $Rn.4h", + [(set (v2i32 VPR64:$Rd), + (v2i32 (Neon_Padd (v4i16 VPR64:$Rn))))], + NoItinerary>; + + def 4s2d : NeonI_2VMisc<0b1, U, 0b10, opcode, + (outs VPR128:$Rd), (ins VPR128:$Rn), + asmop # "\t$Rd.2d, $Rn.4s", + [(set (v2i64 VPR128:$Rd), + (v2i64 (Neon_Padd (v4i32 VPR128:$Rn))))], + NoItinerary>; + + def 2s1d : NeonI_2VMisc<0b0, U, 0b10, opcode, + (outs VPR64:$Rd), (ins VPR64:$Rn), + asmop # "\t$Rd.1d, $Rn.2s", + [(set (v1i64 VPR64:$Rd), + (v1i64 (Neon_Padd (v2i32 VPR64:$Rn))))], + NoItinerary>; +} + +defm SADDLP : NeonI_PairwiseAdd<"saddlp", 0b0, 0b00010, + int_arm_neon_vpaddls>; +defm UADDLP : NeonI_PairwiseAdd<"uaddlp", 0b1, 0b00010, + int_arm_neon_vpaddlu>; + +multiclass NeonI_PairwiseAddAcc opcode, + SDPatternOperator Neon_Padd> { + let Constraints = "$src = $Rd" in { + def 16b8h : NeonI_2VMisc<0b1, U, 0b00, opcode, + (outs VPR128:$Rd), (ins VPR128:$src, VPR128:$Rn), + asmop # "\t$Rd.8h, $Rn.16b", + [(set (v8i16 VPR128:$Rd), + (v8i16 (Neon_Padd + (v8i16 VPR128:$src), (v16i8 VPR128:$Rn))))], + NoItinerary>; + + def 8b4h : NeonI_2VMisc<0b0, U, 0b00, opcode, + (outs VPR64:$Rd), (ins VPR64:$src, VPR64:$Rn), + asmop # "\t$Rd.4h, $Rn.8b", + [(set (v4i16 VPR64:$Rd), + (v4i16 (Neon_Padd + (v4i16 VPR64:$src), (v8i8 VPR64:$Rn))))], + NoItinerary>; + + def 8h4s : NeonI_2VMisc<0b1, U, 0b01, opcode, + (outs VPR128:$Rd), (ins VPR128:$src, VPR128:$Rn), + asmop # "\t$Rd.4s, $Rn.8h", + [(set (v4i32 VPR128:$Rd), + (v4i32 (Neon_Padd + (v4i32 VPR128:$src), (v8i16 VPR128:$Rn))))], + NoItinerary>; + + def 4h2s : NeonI_2VMisc<0b0, U, 0b01, opcode, + (outs VPR64:$Rd), (ins VPR64:$src, VPR64:$Rn), + asmop # "\t$Rd.2s, $Rn.4h", + [(set (v2i32 VPR64:$Rd), + (v2i32 (Neon_Padd + (v2i32 VPR64:$src), (v4i16 VPR64:$Rn))))], + NoItinerary>; + + def 4s2d : NeonI_2VMisc<0b1, U, 0b10, opcode, + (outs VPR128:$Rd), (ins VPR128:$src, VPR128:$Rn), + asmop # "\t$Rd.2d, $Rn.4s", + [(set (v2i64 VPR128:$Rd), + (v2i64 (Neon_Padd + (v2i64 VPR128:$src), (v4i32 VPR128:$Rn))))], + NoItinerary>; + + def 2s1d : NeonI_2VMisc<0b0, U, 0b10, opcode, + (outs VPR64:$Rd), (ins VPR64:$src, VPR64:$Rn), + asmop # "\t$Rd.1d, $Rn.2s", + [(set (v1i64 VPR64:$Rd), + (v1i64 (Neon_Padd + (v1i64 VPR64:$src), (v2i32 VPR64:$Rn))))], + NoItinerary>; + } } -class Neon_INS_main_pattern - : Pat<(ResTy (vector_insert - (ResTy VPR64:$src), - (OpTy OpGPR:$Rn), - (OpImm:$Imm))), - (ResTy (EXTRACT_SUBREG - (ExtResTy (INS (ExtResTy (SUBREG_TO_REG (i64 0), VPR64:$src, sub_64)), - OpGPR:$Rn, 
OpImm:$Imm)), sub_64))>; +defm SADALP : NeonI_PairwiseAddAcc<"sadalp", 0b0, 0b00110, + int_arm_neon_vpadals>; +defm UADALP : NeonI_PairwiseAddAcc<"uadalp", 0b1, 0b00110, + int_arm_neon_vpadalu>; + +multiclass NeonI_2VMisc_BHSDsize_1Arg opcode> { + def 16b : NeonI_2VMisc<0b1, U, 0b00, opcode, + (outs VPR128:$Rd), (ins VPR128:$Rn), + asmop # "\t$Rd.16b, $Rn.16b", + [], NoItinerary>; + + def 8h : NeonI_2VMisc<0b1, U, 0b01, opcode, + (outs VPR128:$Rd), (ins VPR128:$Rn), + asmop # "\t$Rd.8h, $Rn.8h", + [], NoItinerary>; + + def 4s : NeonI_2VMisc<0b1, U, 0b10, opcode, + (outs VPR128:$Rd), (ins VPR128:$Rn), + asmop # "\t$Rd.4s, $Rn.4s", + [], NoItinerary>; + + def 2d : NeonI_2VMisc<0b1, U, 0b11, opcode, + (outs VPR128:$Rd), (ins VPR128:$Rn), + asmop # "\t$Rd.2d, $Rn.2d", + [], NoItinerary>; + + def 8b : NeonI_2VMisc<0b0, U, 0b00, opcode, + (outs VPR64:$Rd), (ins VPR64:$Rn), + asmop # "\t$Rd.8b, $Rn.8b", + [], NoItinerary>; + + def 4h : NeonI_2VMisc<0b0, U, 0b01, opcode, + (outs VPR64:$Rd), (ins VPR64:$Rn), + asmop # "\t$Rd.4h, $Rn.4h", + [], NoItinerary>; + + def 2s : NeonI_2VMisc<0b0, U, 0b10, opcode, + (outs VPR64:$Rd), (ins VPR64:$Rn), + asmop # "\t$Rd.2s, $Rn.2s", + [], NoItinerary>; +} + +defm SQABS : NeonI_2VMisc_BHSDsize_1Arg<"sqabs", 0b0, 0b00111>; +defm SQNEG : NeonI_2VMisc_BHSDsize_1Arg<"sqneg", 0b1, 0b00111>; +defm ABS : NeonI_2VMisc_BHSDsize_1Arg<"abs", 0b0, 0b01011>; +defm NEG : NeonI_2VMisc_BHSDsize_1Arg<"neg", 0b1, 0b01011>; + +multiclass NeonI_2VMisc_BHSD_1Arg_Pattern { + def : Pat<(v16i8 (Neon_Op (v16i8 VPR128:$Rn))), + (v16i8 (!cast(Prefix # 16b) (v16i8 VPR128:$Rn)))>; + + def : Pat<(v8i16 (Neon_Op (v8i16 VPR128:$Rn))), + (v8i16 (!cast(Prefix # 8h) (v8i16 VPR128:$Rn)))>; + + def : Pat<(v4i32 (Neon_Op (v4i32 VPR128:$Rn))), + (v4i32 (!cast(Prefix # 4s) (v4i32 VPR128:$Rn)))>; + + def : Pat<(v2i64 (Neon_Op (v2i64 VPR128:$Rn))), + (v2i64 (!cast(Prefix # 2d) (v2i64 VPR128:$Rn)))>; + + def : Pat<(v8i8 (Neon_Op (v8i8 VPR64:$Rn))), + (v8i8 (!cast(Prefix # 8b) (v8i8 VPR64:$Rn)))>; + + def : Pat<(v4i16 (Neon_Op (v4i16 VPR64:$Rn))), + (v4i16 (!cast(Prefix # 4h) (v4i16 VPR64:$Rn)))>; + + def : Pat<(v2i32 (Neon_Op (v2i32 VPR64:$Rn))), + (v2i32 (!cast(Prefix # 2s) (v2i32 VPR64:$Rn)))>; +} + +defm : NeonI_2VMisc_BHSD_1Arg_Pattern<"SQABS", int_arm_neon_vqabs>; +defm : NeonI_2VMisc_BHSD_1Arg_Pattern<"SQNEG", int_arm_neon_vqneg>; +defm : NeonI_2VMisc_BHSD_1Arg_Pattern<"ABS", int_arm_neon_vabs>; + +def : Pat<(v16i8 (sub + (v16i8 Neon_AllZero), + (v16i8 VPR128:$Rn))), + (v16i8 (NEG16b (v16i8 VPR128:$Rn)))>; +def : Pat<(v8i8 (sub + (v8i8 Neon_AllZero), + (v8i8 VPR64:$Rn))), + (v8i8 (NEG8b (v8i8 VPR64:$Rn)))>; +def : Pat<(v8i16 (sub + (v8i16 (bitconvert (v16i8 Neon_AllZero))), + (v8i16 VPR128:$Rn))), + (v8i16 (NEG8h (v8i16 VPR128:$Rn)))>; +def : Pat<(v4i16 (sub + (v4i16 (bitconvert (v8i8 Neon_AllZero))), + (v4i16 VPR64:$Rn))), + (v4i16 (NEG4h (v4i16 VPR64:$Rn)))>; +def : Pat<(v4i32 (sub + (v4i32 (bitconvert (v16i8 Neon_AllZero))), + (v4i32 VPR128:$Rn))), + (v4i32 (NEG4s (v4i32 VPR128:$Rn)))>; +def : Pat<(v2i32 (sub + (v2i32 (bitconvert (v8i8 Neon_AllZero))), + (v2i32 VPR64:$Rn))), + (v2i32 (NEG2s (v2i32 VPR64:$Rn)))>; +def : Pat<(v2i64 (sub + (v2i64 (bitconvert (v16i8 Neon_AllZero))), + (v2i64 VPR128:$Rn))), + (v2i64 (NEG2d (v2i64 VPR128:$Rn)))>; + +multiclass NeonI_2VMisc_BHSDsize_2Args opcode> { + let Constraints = "$src = $Rd" in { + def 16b : NeonI_2VMisc<0b1, U, 0b00, opcode, + (outs VPR128:$Rd), (ins VPR128:$src, VPR128:$Rn), + asmop # "\t$Rd.16b, $Rn.16b", + [], NoItinerary>; + + def 8h : 
NeonI_2VMisc<0b1, U, 0b01, opcode, + (outs VPR128:$Rd), (ins VPR128:$src, VPR128:$Rn), + asmop # "\t$Rd.8h, $Rn.8h", + [], NoItinerary>; -def INSbw_pattern : Neon_INS_main_pattern; -def INShw_pattern : Neon_INS_main_pattern; -def INSsw_pattern : Neon_INS_main_pattern; -def INSdx_pattern : Neon_INS_main_pattern; + def 4s : NeonI_2VMisc<0b1, U, 0b10, opcode, + (outs VPR128:$Rd), (ins VPR128:$src, VPR128:$Rn), + asmop # "\t$Rd.4s, $Rn.4s", + [], NoItinerary>; -class NeonI_INS_element - : NeonI_insert<0b1, 0b1, - (outs VPR128:$Rd), (ins VPR128:$src, VPR128:$Rn, - ResImm:$Immd, ResImm:$Immn), - asmop # "\t$Rd." # Res # "[$Immd], $Rn." # Res # "[$Immn]", - [(set (ResTy VPR128:$Rd), - (ResTy (vector_insert - (ResTy VPR128:$src), - (MidTy (vector_extract - (ResTy VPR128:$Rn), - (ResImm:$Immn))), - (ResImm:$Immd))))], - NoItinerary> { - let Constraints = "$src = $Rd"; - bits<4> Immd; - bits<4> Immn; -} + def 2d : NeonI_2VMisc<0b1, U, 0b11, opcode, + (outs VPR128:$Rd), (ins VPR128:$src, VPR128:$Rn), + asmop # "\t$Rd.2d, $Rn.2d", + [], NoItinerary>; -//Insert element (vector, from element) -def INSELb : NeonI_INS_element<"ins", "b", v16i8, neon_uimm4_bare, i32> { - let Inst{20-16} = {Immd{3}, Immd{2}, Immd{1}, Immd{0}, 0b1}; - let Inst{14-11} = {Immn{3}, Immn{2}, Immn{1}, Immn{0}}; -} -def INSELh : NeonI_INS_element<"ins", "h", v8i16, neon_uimm3_bare, i32> { - let Inst{20-16} = {Immd{2}, Immd{1}, Immd{0}, 0b1, 0b0}; - let Inst{14-12} = {Immn{2}, Immn{1}, Immn{0}}; - // bit 11 is unspecified. -} -def INSELs : NeonI_INS_element<"ins", "s", v4i32, neon_uimm2_bare, i32> { - let Inst{20-16} = {Immd{1}, Immd{0}, 0b1, 0b0, 0b0}; - let Inst{14-13} = {Immn{1}, Immn{0}}; - // bits 11-12 are unspecified. -} -def INSELd : NeonI_INS_element<"ins", "d", v2i64, neon_uimm1_bare, i64> { - let Inst{20-16} = {Immd, 0b1, 0b0, 0b0, 0b0}; - let Inst{14} = Immn{0}; - // bits 11-13 are unspecified. 
-} + def 8b : NeonI_2VMisc<0b0, U, 0b00, opcode, + (outs VPR64:$Rd), (ins VPR64:$src, VPR64:$Rn), + asmop # "\t$Rd.8b, $Rn.8b", + [], NoItinerary>; -multiclass Neon_INS_elt_pattern { -def : Pat<(NaTy (vector_insert - (NaTy VPR64:$src), - (MidTy (vector_extract - (StTy VPR128:$Rn), - (StImm:$Immn))), - (NaImm:$Immd))), - (NaTy (EXTRACT_SUBREG - (StTy (INS - (StTy (SUBREG_TO_REG (i64 0), VPR64:$src, sub_64)), - (StTy VPR128:$Rn), - NaImm:$Immd, - StImm:$Immn)), - sub_64))>; - -def : Pat<(StTy (vector_insert - (StTy VPR128:$src), - (MidTy (vector_extract - (NaTy VPR64:$Rn), - (NaImm:$Immn))), - (StImm:$Immd))), - (StTy (INS - (StTy VPR128:$src), - (StTy (SUBREG_TO_REG (i64 0), VPR64:$Rn, sub_64)), - StImm:$Immd, - NaImm:$Immn))>; - -def : Pat<(NaTy (vector_insert - (NaTy VPR64:$src), - (MidTy (vector_extract - (NaTy VPR64:$Rn), - (NaImm:$Immn))), - (NaImm:$Immd))), - (NaTy (EXTRACT_SUBREG - (StTy (INS - (StTy (SUBREG_TO_REG (i64 0), VPR64:$src, sub_64)), - (StTy (SUBREG_TO_REG (i64 0), VPR64:$Rn, sub_64)), - NaImm:$Immd, - NaImm:$Immn)), - sub_64))>; -} - -defm INSb_pattern : Neon_INS_elt_pattern; -defm INSh_pattern : Neon_INS_elt_pattern; -defm INSs_pattern : Neon_INS_elt_pattern; -defm INSd_pattern : Neon_INS_elt_pattern; + def 4h : NeonI_2VMisc<0b0, U, 0b01, opcode, + (outs VPR64:$Rd), (ins VPR64:$src, VPR64:$Rn), + asmop # "\t$Rd.4h, $Rn.4h", + [], NoItinerary>; -class NeonI_SMOV - : NeonI_copy { - bits<4> Imm; + def 2s : NeonI_2VMisc<0b0, U, 0b10, opcode, + (outs VPR64:$Rd), (ins VPR64:$src, VPR64:$Rn), + asmop # "\t$Rd.2s, $Rn.2s", + [], NoItinerary>; + } } -//Signed integer move (main, from element) -def SMOVwb : NeonI_SMOV<"smov", "b", 0b0, v16i8, i8, neon_uimm4_bare, - GPR32, i32> { - let Inst{20-16} = {Imm{3}, Imm{2}, Imm{1}, Imm{0}, 0b1}; -} -def SMOVwh : NeonI_SMOV<"smov", "h", 0b0, v8i16, i16, neon_uimm3_bare, - GPR32, i32> { - let Inst{20-16} = {Imm{2}, Imm{1}, Imm{0}, 0b1, 0b0}; -} -def SMOVxb : NeonI_SMOV<"smov", "b", 0b1, v16i8, i8, neon_uimm4_bare, - GPR64, i64> { - let Inst{20-16} = {Imm{3}, Imm{2}, Imm{1}, Imm{0}, 0b1}; -} -def SMOVxh : NeonI_SMOV<"smov", "h", 0b1, v8i16, i16, neon_uimm3_bare, - GPR64, i64> { - let Inst{20-16} = {Imm{2}, Imm{1}, Imm{0}, 0b1, 0b0}; +defm SUQADD : NeonI_2VMisc_BHSDsize_2Args<"suqadd", 0b0, 0b00011>; +defm USQADD : NeonI_2VMisc_BHSDsize_2Args<"usqadd", 0b1, 0b00011>; + +multiclass NeonI_2VMisc_BHSD_2Args_Pattern { + def : Pat<(v16i8 (Neon_Op (v16i8 VPR128:$src), (v16i8 VPR128:$Rn))), + (v16i8 (!cast(Prefix # 16b) + (v16i8 VPR128:$src), (v16i8 VPR128:$Rn)))>; + + def : Pat<(v8i16 (Neon_Op (v8i16 VPR128:$src), (v8i16 VPR128:$Rn))), + (v8i16 (!cast(Prefix # 8h) + (v8i16 VPR128:$src), (v8i16 VPR128:$Rn)))>; + + def : Pat<(v4i32 (Neon_Op (v4i32 VPR128:$src), (v4i32 VPR128:$Rn))), + (v4i32 (!cast(Prefix # 4s) + (v4i32 VPR128:$src), (v4i32 VPR128:$Rn)))>; + + def : Pat<(v2i64 (Neon_Op (v2i64 VPR128:$src), (v2i64 VPR128:$Rn))), + (v2i64 (!cast(Prefix # 2d) + (v2i64 VPR128:$src), (v2i64 VPR128:$Rn)))>; + + def : Pat<(v8i8 (Neon_Op (v8i8 VPR64:$src), (v8i8 VPR64:$Rn))), + (v8i8 (!cast(Prefix # 8b) + (v8i8 VPR64:$src), (v8i8 VPR64:$Rn)))>; + + def : Pat<(v4i16 (Neon_Op (v4i16 VPR64:$src), (v4i16 VPR64:$Rn))), + (v4i16 (!cast(Prefix # 4h) + (v4i16 VPR64:$src), (v4i16 VPR64:$Rn)))>; + + def : Pat<(v2i32 (Neon_Op (v2i32 VPR64:$src), (v2i32 VPR64:$Rn))), + (v2i32 (!cast(Prefix # 2s) + (v2i32 VPR64:$src), (v2i32 VPR64:$Rn)))>; +} + +defm : NeonI_2VMisc_BHSD_2Args_Pattern<"SUQADD", int_aarch64_neon_suqadd>; +defm : NeonI_2VMisc_BHSD_2Args_Pattern<"USQADD", 
int_aarch64_neon_usqadd>; + +multiclass NeonI_2VMisc_BHSsizes { + def 16b : NeonI_2VMisc<0b1, U, 0b00, 0b00100, + (outs VPR128:$Rd), (ins VPR128:$Rn), + asmop # "\t$Rd.16b, $Rn.16b", + [(set (v16i8 VPR128:$Rd), + (v16i8 (Neon_Op (v16i8 VPR128:$Rn))))], + NoItinerary>; + + def 8h : NeonI_2VMisc<0b1, U, 0b01, 0b00100, + (outs VPR128:$Rd), (ins VPR128:$Rn), + asmop # "\t$Rd.8h, $Rn.8h", + [(set (v8i16 VPR128:$Rd), + (v8i16 (Neon_Op (v8i16 VPR128:$Rn))))], + NoItinerary>; + + def 4s : NeonI_2VMisc<0b1, U, 0b10, 0b00100, + (outs VPR128:$Rd), (ins VPR128:$Rn), + asmop # "\t$Rd.4s, $Rn.4s", + [(set (v4i32 VPR128:$Rd), + (v4i32 (Neon_Op (v4i32 VPR128:$Rn))))], + NoItinerary>; + + def 8b : NeonI_2VMisc<0b0, U, 0b00, 0b00100, + (outs VPR64:$Rd), (ins VPR64:$Rn), + asmop # "\t$Rd.8b, $Rn.8b", + [(set (v8i8 VPR64:$Rd), + (v8i8 (Neon_Op (v8i8 VPR64:$Rn))))], + NoItinerary>; + + def 4h : NeonI_2VMisc<0b0, U, 0b01, 0b00100, + (outs VPR64:$Rd), (ins VPR64:$Rn), + asmop # "\t$Rd.4h, $Rn.4h", + [(set (v4i16 VPR64:$Rd), + (v4i16 (Neon_Op (v4i16 VPR64:$Rn))))], + NoItinerary>; + + def 2s : NeonI_2VMisc<0b0, U, 0b10, 0b00100, + (outs VPR64:$Rd), (ins VPR64:$Rn), + asmop # "\t$Rd.2s, $Rn.2s", + [(set (v2i32 VPR64:$Rd), + (v2i32 (Neon_Op (v2i32 VPR64:$Rn))))], + NoItinerary>; +} + +defm CLS : NeonI_2VMisc_BHSsizes<"cls", 0b0, int_arm_neon_vcls>; +defm CLZ : NeonI_2VMisc_BHSsizes<"clz", 0b1, ctlz>; + +multiclass NeonI_2VMisc_Bsize size, + bits<5> Opcode> { + def 16b : NeonI_2VMisc<0b1, U, size, Opcode, + (outs VPR128:$Rd), (ins VPR128:$Rn), + asmop # "\t$Rd.16b, $Rn.16b", + [], NoItinerary>; + + def 8b : NeonI_2VMisc<0b0, U, size, Opcode, + (outs VPR64:$Rd), (ins VPR64:$Rn), + asmop # "\t$Rd.8b, $Rn.8b", + [], NoItinerary>; +} + +defm CNT : NeonI_2VMisc_Bsize<"cnt", 0b0, 0b00, 0b00101>; +defm NOT : NeonI_2VMisc_Bsize<"not", 0b1, 0b00, 0b00101>; +defm RBIT : NeonI_2VMisc_Bsize<"rbit", 0b1, 0b01, 0b00101>; + +def : NeonInstAlias<"mvn $Rd.16b, $Rn.16b", + (NOT16b VPR128:$Rd, VPR128:$Rn), 0>; +def : NeonInstAlias<"mvn $Rd.8b, $Rn.8b", + (NOT8b VPR64:$Rd, VPR64:$Rn), 0>; + +def : Pat<(v16i8 (ctpop (v16i8 VPR128:$Rn))), + (v16i8 (CNT16b (v16i8 VPR128:$Rn)))>; +def : Pat<(v8i8 (ctpop (v8i8 VPR64:$Rn))), + (v8i8 (CNT8b (v8i8 VPR64:$Rn)))>; + +def : Pat<(v16i8 (xor + (v16i8 VPR128:$Rn), + (v16i8 Neon_AllOne))), + (v16i8 (NOT16b (v16i8 VPR128:$Rn)))>; +def : Pat<(v8i8 (xor + (v8i8 VPR64:$Rn), + (v8i8 Neon_AllOne))), + (v8i8 (NOT8b (v8i8 VPR64:$Rn)))>; +def : Pat<(v8i16 (xor + (v8i16 VPR128:$Rn), + (v8i16 (bitconvert (v16i8 Neon_AllOne))))), + (NOT16b VPR128:$Rn)>; +def : Pat<(v4i16 (xor + (v4i16 VPR64:$Rn), + (v4i16 (bitconvert (v8i8 Neon_AllOne))))), + (NOT8b VPR64:$Rn)>; +def : Pat<(v4i32 (xor + (v4i32 VPR128:$Rn), + (v4i32 (bitconvert (v16i8 Neon_AllOne))))), + (NOT16b VPR128:$Rn)>; +def : Pat<(v2i32 (xor + (v2i32 VPR64:$Rn), + (v2i32 (bitconvert (v8i8 Neon_AllOne))))), + (NOT8b VPR64:$Rn)>; +def : Pat<(v2i64 (xor + (v2i64 VPR128:$Rn), + (v2i64 (bitconvert (v16i8 Neon_AllOne))))), + (NOT16b VPR128:$Rn)>; + +def : Pat<(v16i8 (int_aarch64_neon_rbit (v16i8 VPR128:$Rn))), + (v16i8 (RBIT16b (v16i8 VPR128:$Rn)))>; +def : Pat<(v8i8 (int_aarch64_neon_rbit (v8i8 VPR64:$Rn))), + (v8i8 (RBIT8b (v8i8 VPR64:$Rn)))>; + +multiclass NeonI_2VMisc_SDsizes opcode, + SDPatternOperator Neon_Op> { + def 4s : NeonI_2VMisc<0b1, U, 0b10, opcode, + (outs VPR128:$Rd), (ins VPR128:$Rn), + asmop # "\t$Rd.4s, $Rn.4s", + [(set (v4f32 VPR128:$Rd), + (v4f32 (Neon_Op (v4f32 VPR128:$Rn))))], + NoItinerary>; + + def 2d : NeonI_2VMisc<0b1, U, 0b11, opcode, 
+ (outs VPR128:$Rd), (ins VPR128:$Rn), + asmop # "\t$Rd.2d, $Rn.2d", + [(set (v2f64 VPR128:$Rd), + (v2f64 (Neon_Op (v2f64 VPR128:$Rn))))], + NoItinerary>; + + def 2s : NeonI_2VMisc<0b0, U, 0b10, opcode, + (outs VPR64:$Rd), (ins VPR64:$Rn), + asmop # "\t$Rd.2s, $Rn.2s", + [(set (v2f32 VPR64:$Rd), + (v2f32 (Neon_Op (v2f32 VPR64:$Rn))))], + NoItinerary>; +} + +defm FABS : NeonI_2VMisc_SDsizes<"fabs", 0b0, 0b01111, fabs>; +defm FNEG : NeonI_2VMisc_SDsizes<"fneg", 0b1, 0b01111, fneg>; + +multiclass NeonI_2VMisc_HSD_Narrow opcode> { + def 8h8b : NeonI_2VMisc<0b0, U, 0b00, opcode, + (outs VPR64:$Rd), (ins VPR128:$Rn), + asmop # "\t$Rd.8b, $Rn.8h", + [], NoItinerary>; + + def 4s4h : NeonI_2VMisc<0b0, U, 0b01, opcode, + (outs VPR64:$Rd), (ins VPR128:$Rn), + asmop # "\t$Rd.4h, $Rn.4s", + [], NoItinerary>; + + def 2d2s : NeonI_2VMisc<0b0, U, 0b10, opcode, + (outs VPR64:$Rd), (ins VPR128:$Rn), + asmop # "\t$Rd.2s, $Rn.2d", + [], NoItinerary>; + + let Constraints = "$Rd = $src" in { + def 8h16b : NeonI_2VMisc<0b1, U, 0b00, opcode, + (outs VPR128:$Rd), (ins VPR128:$src, VPR128:$Rn), + asmop # "2\t$Rd.16b, $Rn.8h", + [], NoItinerary>; + + def 4s8h : NeonI_2VMisc<0b1, U, 0b01, opcode, + (outs VPR128:$Rd), (ins VPR128:$src, VPR128:$Rn), + asmop # "2\t$Rd.8h, $Rn.4s", + [], NoItinerary>; + + def 2d4s : NeonI_2VMisc<0b1, U, 0b10, opcode, + (outs VPR128:$Rd), (ins VPR128:$src, VPR128:$Rn), + asmop # "2\t$Rd.4s, $Rn.2d", + [], NoItinerary>; + } } -def SMOVxs : NeonI_SMOV<"smov", "s", 0b1, v4i32, i32, neon_uimm2_bare, - GPR64, i64> { - let Inst{20-16} = {Imm{1}, Imm{0}, 0b1, 0b0, 0b0}; + +defm XTN : NeonI_2VMisc_HSD_Narrow<"xtn", 0b0, 0b10010>; +defm SQXTUN : NeonI_2VMisc_HSD_Narrow<"sqxtun", 0b1, 0b10010>; +defm SQXTN : NeonI_2VMisc_HSD_Narrow<"sqxtn", 0b0, 0b10100>; +defm UQXTN : NeonI_2VMisc_HSD_Narrow<"uqxtn", 0b1, 0b10100>; + +multiclass NeonI_2VMisc_Narrow_Patterns { + def : Pat<(v8i8 (Neon_Op (v8i16 VPR128:$Rn))), + (v8i8 (!cast(Prefix # 8h8b) (v8i16 VPR128:$Rn)))>; + + def : Pat<(v4i16 (Neon_Op (v4i32 VPR128:$Rn))), + (v4i16 (!cast(Prefix # 4s4h) (v4i32 VPR128:$Rn)))>; + + def : Pat<(v2i32 (Neon_Op (v2i64 VPR128:$Rn))), + (v2i32 (!cast(Prefix # 2d2s) (v2i64 VPR128:$Rn)))>; + + def : Pat<(v16i8 (concat_vectors + (v8i8 VPR64:$src), + (v8i8 (Neon_Op (v8i16 VPR128:$Rn))))), + (!cast(Prefix # 8h16b) + (SUBREG_TO_REG (i32 0), VPR64:$src, sub_64), + VPR128:$Rn)>; + + def : Pat<(v8i16 (concat_vectors + (v4i16 VPR64:$src), + (v4i16 (Neon_Op (v4i32 VPR128:$Rn))))), + (!cast(Prefix # 4s8h) + (SUBREG_TO_REG (i32 0), VPR64:$src, sub_64), + VPR128:$Rn)>; + + def : Pat<(v4i32 (concat_vectors + (v2i32 VPR64:$src), + (v2i32 (Neon_Op (v2i64 VPR128:$Rn))))), + (!cast(Prefix # 2d4s) + (SUBREG_TO_REG (i32 0), VPR64:$src, sub_64), + VPR128:$Rn)>; +} + +defm : NeonI_2VMisc_Narrow_Patterns<"XTN", trunc>; +defm : NeonI_2VMisc_Narrow_Patterns<"SQXTUN", int_arm_neon_vqmovnsu>; +defm : NeonI_2VMisc_Narrow_Patterns<"SQXTN", int_arm_neon_vqmovns>; +defm : NeonI_2VMisc_Narrow_Patterns<"UQXTN", int_arm_neon_vqmovnu>; + +multiclass NeonI_2VMisc_SHIFT opcode> { + let DecoderMethod = "DecodeSHLLInstruction" in { + def 8b8h : NeonI_2VMisc<0b0, U, 0b00, opcode, + (outs VPR128:$Rd), + (ins VPR64:$Rn, uimm_exact8:$Imm), + asmop # "\t$Rd.8h, $Rn.8b, $Imm", + [], NoItinerary>; + + def 4h4s : NeonI_2VMisc<0b0, U, 0b01, opcode, + (outs VPR128:$Rd), + (ins VPR64:$Rn, uimm_exact16:$Imm), + asmop # "\t$Rd.4s, $Rn.4h, $Imm", + [], NoItinerary>; + + def 2s2d : NeonI_2VMisc<0b0, U, 0b10, opcode, + (outs VPR128:$Rd), + (ins VPR64:$Rn, 
uimm_exact32:$Imm), + asmop # "\t$Rd.2d, $Rn.2s, $Imm", + [], NoItinerary>; + + def 16b8h : NeonI_2VMisc<0b1, U, 0b00, opcode, + (outs VPR128:$Rd), + (ins VPR128:$Rn, uimm_exact8:$Imm), + asmop # "2\t$Rd.8h, $Rn.16b, $Imm", + [], NoItinerary>; + + def 8h4s : NeonI_2VMisc<0b1, U, 0b01, opcode, + (outs VPR128:$Rd), + (ins VPR128:$Rn, uimm_exact16:$Imm), + asmop # "2\t$Rd.4s, $Rn.8h, $Imm", + [], NoItinerary>; + + def 4s2d : NeonI_2VMisc<0b1, U, 0b10, opcode, + (outs VPR128:$Rd), + (ins VPR128:$Rn, uimm_exact32:$Imm), + asmop # "2\t$Rd.2d, $Rn.4s, $Imm", + [], NoItinerary>; + } } -multiclass Neon_SMOVx_pattern { - def : Pat<(i64 (sext_inreg - (i64 (anyext - (i32 (vector_extract - (StTy VPR128:$Rn), (StImm:$Imm))))), - eleTy)), - (SMOVI VPR128:$Rn, StImm:$Imm)>; - - def : Pat<(i64 (sext - (i32 (vector_extract - (StTy VPR128:$Rn), (StImm:$Imm))))), - (SMOVI VPR128:$Rn, StImm:$Imm)>; - - def : Pat<(i64 (sext_inreg - (i64 (vector_extract - (NaTy VPR64:$Rn), (NaImm:$Imm))), - eleTy)), - (SMOVI (StTy (SUBREG_TO_REG (i64 0), VPR64:$Rn, sub_64)), - NaImm:$Imm)>; - - def : Pat<(i64 (sext_inreg - (i64 (anyext - (i32 (vector_extract - (NaTy VPR64:$Rn), (NaImm:$Imm))))), - eleTy)), - (SMOVI (StTy (SUBREG_TO_REG (i64 0), VPR64:$Rn, sub_64)), - NaImm:$Imm)>; - - def : Pat<(i64 (sext - (i32 (vector_extract - (NaTy VPR64:$Rn), (NaImm:$Imm))))), - (SMOVI (StTy (SUBREG_TO_REG (i64 0), VPR64:$Rn, sub_64)), - NaImm:$Imm)>; +defm SHLL : NeonI_2VMisc_SHIFT<"shll", 0b1, 0b10011>; + +class NeonI_SHLL_Patterns + : Pat<(DesTy (shl + (DesTy (ExtOp (OpTy VPR64:$Rn))), + (DesTy (Neon_vdup + (i32 Neon_Imm:$Imm))))), + (!cast("SHLL" # suffix) VPR64:$Rn, Neon_Imm:$Imm)>; + +class NeonI_SHLL_High_Patterns + : Pat<(DesTy (shl + (DesTy (ExtOp + (OpTy (GetHigh VPR128:$Rn)))), + (DesTy (Neon_vdup + (i32 Neon_Imm:$Imm))))), + (!cast("SHLL" # suffix) VPR128:$Rn, Neon_Imm:$Imm)>; + +def : NeonI_SHLL_Patterns; +def : NeonI_SHLL_Patterns; +def : NeonI_SHLL_Patterns; +def : NeonI_SHLL_Patterns; +def : NeonI_SHLL_Patterns; +def : NeonI_SHLL_Patterns; +def : NeonI_SHLL_High_Patterns; +def : NeonI_SHLL_High_Patterns; +def : NeonI_SHLL_High_Patterns; +def : NeonI_SHLL_High_Patterns; +def : NeonI_SHLL_High_Patterns; +def : NeonI_SHLL_High_Patterns; + +multiclass NeonI_2VMisc_SD_Narrow opcode> { + def 4s4h : NeonI_2VMisc<0b0, U, 0b00, opcode, + (outs VPR64:$Rd), (ins VPR128:$Rn), + asmop # "\t$Rd.4h, $Rn.4s", + [], NoItinerary>; + + def 2d2s : NeonI_2VMisc<0b0, U, 0b01, opcode, + (outs VPR64:$Rd), (ins VPR128:$Rn), + asmop # "\t$Rd.2s, $Rn.2d", + [], NoItinerary>; + + let Constraints = "$src = $Rd" in { + def 4s8h : NeonI_2VMisc<0b1, U, 0b00, opcode, + (outs VPR128:$Rd), (ins VPR128:$src, VPR128:$Rn), + asmop # "2\t$Rd.8h, $Rn.4s", + [], NoItinerary>; + + def 2d4s : NeonI_2VMisc<0b1, U, 0b01, opcode, + (outs VPR128:$Rd), (ins VPR128:$src, VPR128:$Rn), + asmop # "2\t$Rd.4s, $Rn.2d", + [], NoItinerary>; + } } -defm SMOVxb_pattern : Neon_SMOVx_pattern; -defm SMOVxh_pattern : Neon_SMOVx_pattern; -defm SMOVxs_pattern : Neon_SMOVx_pattern; +defm FCVTN : NeonI_2VMisc_SD_Narrow<"fcvtn", 0b0, 0b10110>; -class Neon_SMOVw_pattern - : Pat<(i32 (sext_inreg - (i32 (vector_extract - (NaTy VPR64:$Rn), (NaImm:$Imm))), - eleTy)), - (SMOVI (StTy (SUBREG_TO_REG (i64 0), VPR64:$Rn, sub_64)), - NaImm:$Imm)>; +multiclass NeonI_2VMisc_Narrow_Pattern { -def SMOVwb_pattern : Neon_SMOVw_pattern; -def SMOVwh_pattern : Neon_SMOVw_pattern; + def : Pat<(v4i16 (f32_to_f16_Op (v4f32 VPR128:$Rn))), + (!cast(prefix # "4s4h") (v4f32 VPR128:$Rn))>; + def : Pat<(v8i16 
(concat_vectors + (v4i16 VPR64:$src), + (v4i16 (f32_to_f16_Op (v4f32 VPR128:$Rn))))), + (!cast(prefix # "4s8h") + (v4f32 (SUBREG_TO_REG (i32 0), VPR64:$src, sub_64)), + (v4f32 VPR128:$Rn))>; -class NeonI_UMOV - : NeonI_copy { - bits<4> Imm; -} + def : Pat<(v2f32 (f64_to_f32_Op (v2f64 VPR128:$Rn))), + (!cast(prefix # "2d2s") (v2f64 VPR128:$Rn))>; -//Unsigned integer move (main, from element) -def UMOVwb : NeonI_UMOV<"umov", "b", 0b0, v16i8, neon_uimm4_bare, - GPR32, i32> { - let Inst{20-16} = {Imm{3}, Imm{2}, Imm{1}, Imm{0}, 0b1}; -} -def UMOVwh : NeonI_UMOV<"umov", "h", 0b0, v8i16, neon_uimm3_bare, - GPR32, i32> { - let Inst{20-16} = {Imm{2}, Imm{1}, Imm{0}, 0b1, 0b0}; -} -def UMOVws : NeonI_UMOV<"umov", "s", 0b0, v4i32, neon_uimm2_bare, - GPR32, i32> { - let Inst{20-16} = {Imm{1}, Imm{0}, 0b1, 0b0, 0b0}; -} -def UMOVxd : NeonI_UMOV<"umov", "d", 0b1, v2i64, neon_uimm1_bare, - GPR64, i64> { - let Inst{20-16} = {Imm, 0b1, 0b0, 0b0, 0b0}; + def : Pat<(v4f32 (concat_vectors + (v2f32 VPR64:$src), + (v2f32 (f64_to_f32_Op (v2f64 VPR128:$Rn))))), + (!cast(prefix # "2d4s") + (v4f32 (SUBREG_TO_REG (i32 0), VPR64:$src, sub_64)), + (v2f64 VPR128:$Rn))>; } -class Neon_UMOV_pattern - : Pat<(ResTy (vector_extract - (NaTy VPR64:$Rn), NaImm:$Imm)), - (SMOVI (StTy (SUBREG_TO_REG (i64 0), VPR64:$Rn, sub_64)), - NaImm:$Imm)>; +defm : NeonI_2VMisc_Narrow_Pattern<"FCVTN", int_arm_neon_vcvtfp2hf, fround>; -def UMOVwb_pattern : Neon_UMOV_pattern; -def UMOVwh_pattern : Neon_UMOV_pattern; -def UMOVws_pattern : Neon_UMOV_pattern; +multiclass NeonI_2VMisc_D_Narrow opcode> { + def 2d2s : NeonI_2VMisc<0b0, U, 0b01, opcode, + (outs VPR64:$Rd), (ins VPR128:$Rn), + asmop # "\t$Rd.2s, $Rn.2d", + [], NoItinerary>; -def : Pat<(i32 (and - (i32 (vector_extract - (v16i8 VPR128:$Rn), (neon_uimm4_bare:$Imm))), - 255)), - (UMOVwb VPR128:$Rn, neon_uimm4_bare:$Imm)>; + def 2d4s : NeonI_2VMisc<0b1, U, 0b01, opcode, + (outs VPR128:$Rd), (ins VPR128:$src, VPR128:$Rn), + asmop # "2\t$Rd.4s, $Rn.2d", + [], NoItinerary> { + let Constraints = "$src = $Rd"; + } -def : Pat<(i32 (and - (i32 (vector_extract - (v8i16 VPR128:$Rn), (neon_uimm3_bare:$Imm))), - 65535)), - (UMOVwh VPR128:$Rn, neon_uimm3_bare:$Imm)>; + def : Pat<(v2f32 (int_aarch64_neon_fcvtxn (v2f64 VPR128:$Rn))), + (!cast(prefix # "2d2s") VPR128:$Rn)>; -def : Pat<(i64 (zext - (i32 (vector_extract - (v2i64 VPR128:$Rn), (neon_uimm1_bare:$Imm))))), - (UMOVxd VPR128:$Rn, neon_uimm1_bare:$Imm)>; + def : Pat<(v4f32 (concat_vectors + (v2f32 VPR64:$src), + (v2f32 (int_aarch64_neon_fcvtxn (v2f64 VPR128:$Rn))))), + (!cast(prefix # "2d4s") + (v4f32 (SUBREG_TO_REG (i32 0), VPR64:$src, sub_64)), + VPR128:$Rn)>; +} -def : Pat<(i32 (and - (i32 (vector_extract - (v8i8 VPR64:$Rn), (neon_uimm3_bare:$Imm))), - 255)), - (UMOVwb (SUBREG_TO_REG (i64 0), VPR64:$Rn, sub_64), - neon_uimm3_bare:$Imm)>; +defm FCVTXN : NeonI_2VMisc_D_Narrow<"fcvtxn","FCVTXN", 0b1, 0b10110>; -def : Pat<(i32 (and - (i32 (vector_extract - (v4i16 VPR64:$Rn), (neon_uimm2_bare:$Imm))), - 65535)), - (UMOVwh (SUBREG_TO_REG (i64 0), VPR64:$Rn, sub_64), - neon_uimm2_bare:$Imm)>; +def Neon_High4Float : PatFrag<(ops node:$in), + (extract_subvector (v4f32 node:$in), (iPTR 2))>; -def : Pat<(i64 (zext - (i32 (vector_extract - (v1i64 VPR64:$Rn), (neon_uimm0_bare:$Imm))))), - (UMOVxd (SUBREG_TO_REG (i64 0), VPR64:$Rn, sub_64), - neon_uimm0_bare:$Imm)>; +multiclass NeonI_2VMisc_HS_Extend opcode> { + def 4h4s : NeonI_2VMisc<0b0, U, 0b00, opcode, + (outs VPR128:$Rd), (ins VPR64:$Rn), + asmop # "\t$Rd.4s, $Rn.4h", + [], NoItinerary>; -// 
Additional copy patterns for scalar types -def : Pat<(i32 (vector_extract (v1i8 FPR8:$Rn), (i64 0))), - (UMOVwb (v16i8 - (SUBREG_TO_REG (i64 0), FPR8:$Rn, sub_8)), (i64 0))>; + def 2s2d : NeonI_2VMisc<0b0, U, 0b01, opcode, + (outs VPR128:$Rd), (ins VPR64:$Rn), + asmop # "\t$Rd.2d, $Rn.2s", + [], NoItinerary>; -def : Pat<(i32 (vector_extract (v1i16 FPR16:$Rn), (i64 0))), - (UMOVwh (v8i16 - (SUBREG_TO_REG (i64 0), FPR16:$Rn, sub_16)), (i64 0))>; + def 8h4s : NeonI_2VMisc<0b1, U, 0b00, opcode, + (outs VPR128:$Rd), (ins VPR128:$Rn), + asmop # "2\t$Rd.4s, $Rn.8h", + [], NoItinerary>; -def : Pat<(i32 (vector_extract (v1i32 FPR32:$Rn), (i64 0))), - (FMOVws FPR32:$Rn)>; + def 4s2d : NeonI_2VMisc<0b1, U, 0b01, opcode, + (outs VPR128:$Rd), (ins VPR128:$Rn), + asmop # "2\t$Rd.2d, $Rn.4s", + [], NoItinerary>; +} -def : Pat<(i64 (vector_extract (v1i64 FPR64:$Rn), (i64 0))), - (FMOVxd FPR64:$Rn)>; - -def : Pat<(f64 (vector_extract (v1f64 FPR64:$Rn), (i64 0))), - (f64 FPR64:$Rn)>; +defm FCVTL : NeonI_2VMisc_HS_Extend<"fcvtl", 0b0, 0b10111>; + +multiclass NeonI_2VMisc_Extend_Pattern { + def : Pat<(v4f32 (int_arm_neon_vcvthf2fp (v4i16 VPR64:$Rn))), + (!cast(prefix # "4h4s") VPR64:$Rn)>; + + def : Pat<(v4f32 (int_arm_neon_vcvthf2fp + (v4i16 (Neon_High8H + (v8i16 VPR128:$Rn))))), + (!cast(prefix # "8h4s") VPR128:$Rn)>; + + def : Pat<(v2f64 (fextend (v2f32 VPR64:$Rn))), + (!cast(prefix # "2s2d") VPR64:$Rn)>; + + def : Pat<(v2f64 (fextend + (v2f32 (Neon_High4Float + (v4f32 VPR128:$Rn))))), + (!cast(prefix # "4s2d") VPR128:$Rn)>; +} + +defm : NeonI_2VMisc_Extend_Pattern<"FCVTL">; + +multiclass NeonI_2VMisc_SD_Conv opcode, + ValueType ResTy4s, ValueType OpTy4s, + ValueType ResTy2d, ValueType OpTy2d, + ValueType ResTy2s, ValueType OpTy2s, + SDPatternOperator Neon_Op> { + + def 4s : NeonI_2VMisc<0b1, U, {Size, 0b0}, opcode, + (outs VPR128:$Rd), (ins VPR128:$Rn), + asmop # "\t$Rd.4s, $Rn.4s", + [(set (ResTy4s VPR128:$Rd), + (ResTy4s (Neon_Op (OpTy4s VPR128:$Rn))))], + NoItinerary>; + + def 2d : NeonI_2VMisc<0b1, U, {Size, 0b1}, opcode, + (outs VPR128:$Rd), (ins VPR128:$Rn), + asmop # "\t$Rd.2d, $Rn.2d", + [(set (ResTy2d VPR128:$Rd), + (ResTy2d (Neon_Op (OpTy2d VPR128:$Rn))))], + NoItinerary>; + + def 2s : NeonI_2VMisc<0b0, U, {Size, 0b0}, opcode, + (outs VPR64:$Rd), (ins VPR64:$Rn), + asmop # "\t$Rd.2s, $Rn.2s", + [(set (ResTy2s VPR64:$Rd), + (ResTy2s (Neon_Op (OpTy2s VPR64:$Rn))))], + NoItinerary>; +} + +multiclass NeonI_2VMisc_fp_to_int opcode, SDPatternOperator Neon_Op> { + defm _ : NeonI_2VMisc_SD_Conv; +} + +defm FCVTNS : NeonI_2VMisc_fp_to_int<"fcvtns", 0b0, 0b0, 0b11010, + int_aarch64_neon_fcvtns>; +defm FCVTNU : NeonI_2VMisc_fp_to_int<"fcvtnu", 0b0, 0b1, 0b11010, + int_aarch64_neon_fcvtnu>; +defm FCVTPS : NeonI_2VMisc_fp_to_int<"fcvtps", 0b1, 0b0, 0b11010, + int_aarch64_neon_fcvtps>; +defm FCVTPU : NeonI_2VMisc_fp_to_int<"fcvtpu", 0b1, 0b1, 0b11010, + int_aarch64_neon_fcvtpu>; +defm FCVTMS : NeonI_2VMisc_fp_to_int<"fcvtms", 0b0, 0b0, 0b11011, + int_aarch64_neon_fcvtms>; +defm FCVTMU : NeonI_2VMisc_fp_to_int<"fcvtmu", 0b0, 0b1, 0b11011, + int_aarch64_neon_fcvtmu>; +defm FCVTZS : NeonI_2VMisc_fp_to_int<"fcvtzs", 0b1, 0b0, 0b11011, fp_to_sint>; +defm FCVTZU : NeonI_2VMisc_fp_to_int<"fcvtzu", 0b1, 0b1, 0b11011, fp_to_uint>; +defm FCVTAS : NeonI_2VMisc_fp_to_int<"fcvtas", 0b0, 0b0, 0b11100, + int_aarch64_neon_fcvtas>; +defm FCVTAU : NeonI_2VMisc_fp_to_int<"fcvtau", 0b0, 0b1, 0b11100, + int_aarch64_neon_fcvtau>; + +multiclass NeonI_2VMisc_int_to_fp opcode, SDPatternOperator Neon_Op> { + defm _ : 
+
+multiclass NeonI_2VMisc_int_to_fp<string asmop, bit Size, bit U,
+                                  bits<5> opcode, SDPatternOperator Neon_Op> {
+  defm _ : NeonI_2VMisc_SD_Conv<asmop, Size, U, opcode, v4f32, v4i32, v2f64,
+                                v2i64, v2f32, v2i32, Neon_Op>;
+}
+
+defm SCVTF : NeonI_2VMisc_int_to_fp<"scvtf", 0b0, 0b0, 0b11101, sint_to_fp>;
+defm UCVTF : NeonI_2VMisc_int_to_fp<"ucvtf", 0b0, 0b1, 0b11101, uint_to_fp>;
+
+multiclass NeonI_2VMisc_fp_to_fp<string asmop, bit Size, bit U,
+                                 bits<5> opcode, SDPatternOperator Neon_Op> {
+  defm _ : NeonI_2VMisc_SD_Conv<asmop, Size, U, opcode, v4f32, v4f32, v2f64,
+                                v2f64, v2f32, v2f32, Neon_Op>;
+}
+
+defm FRINTN : NeonI_2VMisc_fp_to_fp<"frintn", 0b0, 0b0, 0b11000,
+                                    int_aarch64_neon_frintn>;
+defm FRINTA : NeonI_2VMisc_fp_to_fp<"frinta", 0b0, 0b1, 0b11000, frnd>;
+defm FRINTP : NeonI_2VMisc_fp_to_fp<"frintp", 0b1, 0b0, 0b11000, fceil>;
+defm FRINTM : NeonI_2VMisc_fp_to_fp<"frintm", 0b0, 0b0, 0b11001, ffloor>;
+defm FRINTX : NeonI_2VMisc_fp_to_fp<"frintx", 0b0, 0b1, 0b11001, frint>;
+defm FRINTZ : NeonI_2VMisc_fp_to_fp<"frintz", 0b1, 0b0, 0b11001, ftrunc>;
+defm FRINTI : NeonI_2VMisc_fp_to_fp<"frinti", 0b1, 0b1, 0b11001, fnearbyint>;
+defm FRECPE : NeonI_2VMisc_fp_to_fp<"frecpe", 0b1, 0b0, 0b11101,
+                                    int_arm_neon_vrecpe>;
+defm FRSQRTE : NeonI_2VMisc_fp_to_fp<"frsqrte", 0b1, 0b1, 0b11101,
+                                     int_arm_neon_vrsqrte>;
+defm FSQRT : NeonI_2VMisc_fp_to_fp<"fsqrt", 0b1, 0b1, 0b11111, fsqrt>;
+
+multiclass NeonI_2VMisc_S_Conv<string asmop, bit Size, bit U,
+                               bits<5> opcode, SDPatternOperator Neon_Op> {
+  def 4s : NeonI_2VMisc<0b1, U, {Size, 0b0}, opcode,
+                        (outs VPR128:$Rd), (ins VPR128:$Rn),
+                        asmop # "\t$Rd.4s, $Rn.4s",
+                        [(set (v4i32 VPR128:$Rd),
+                           (v4i32 (Neon_Op (v4i32 VPR128:$Rn))))],
+                        NoItinerary>;
+
+  def 2s : NeonI_2VMisc<0b0, U, {Size, 0b0}, opcode,
+                        (outs VPR64:$Rd), (ins VPR64:$Rn),
+                        asmop # "\t$Rd.2s, $Rn.2s",
+                        [(set (v2i32 VPR64:$Rd),
+                           (v2i32 (Neon_Op (v2i32 VPR64:$Rn))))],
+                        NoItinerary>;
+}
+
+defm URECPE : NeonI_2VMisc_S_Conv<"urecpe", 0b1, 0b0, 0b11100,
+                                  int_arm_neon_vrecpe>;
+defm URSQRTE : NeonI_2VMisc_S_Conv<"ursqrte", 0b1, 0b1, 0b11100,
+                                   int_arm_neon_vrsqrte>;
+
+// Crypto Class
+class NeonI_Cryptoaes_2v<bits<2> size, bits<5> opcode,
+                         string asmop, SDPatternOperator opnode>
+  : NeonI_Crypto_AES<size, opcode,
+                     (outs VPR128:$Rd), (ins VPR128:$src, VPR128:$Rn),
+                     asmop # "\t$Rd.16b, $Rn.16b",
+                     [(set (v16i8 VPR128:$Rd),
+                        (v16i8 (opnode (v16i8 VPR128:$src),
+                                       (v16i8 VPR128:$Rn))))],
+                     NoItinerary> {
+  let Constraints = "$src = $Rd";
+  let Predicates = [HasNEON, HasCrypto];
+}

-def : Pat<(f32 (vector_extract (v1f32 FPR32:$Rn), (i64 0))),
-          (f32 FPR32:$Rn)>;
+def AESE : NeonI_Cryptoaes_2v<0b00, 0b00100, "aese", int_arm_neon_aese>;
+def AESD : NeonI_Cryptoaes_2v<0b00, 0b00101, "aesd", int_arm_neon_aesd>;

-def : Pat<(v1i8 (scalar_to_vector GPR32:$Rn)),
-          (v1i8 (EXTRACT_SUBREG (v16i8
-            (INSbw (v16i8 (IMPLICIT_DEF)), $Rn, (i64 0))),
-            sub_8))>;
+class NeonI_Cryptoaes<bits<2> size, bits<5> opcode,
+                      string asmop, SDPatternOperator opnode>
+  : NeonI_Crypto_AES<size, opcode,
+                     (outs VPR128:$Rd), (ins VPR128:$Rn),
+                     asmop # "\t$Rd.16b, $Rn.16b",
+                     [(set (v16i8 VPR128:$Rd),
+                        (v16i8 (opnode (v16i8 VPR128:$Rn))))],
+                     NoItinerary>;

-def : Pat<(v1i16 (scalar_to_vector GPR32:$Rn)),
-          (v1i16 (EXTRACT_SUBREG (v8i16
-            (INShw (v8i16 (IMPLICIT_DEF)), $Rn, (i64 0))),
-            sub_16))>;
+def AESMC : NeonI_Cryptoaes<0b00, 0b00110, "aesmc", int_arm_neon_aesmc>;
+def AESIMC : NeonI_Cryptoaes<0b00, 0b00111, "aesimc", int_arm_neon_aesimc>;

-def : Pat<(v1i32 (scalar_to_vector GPR32:$src)),
-          (FMOVsw $src)>;
+class NeonI_Cryptosha_vv<bits<2> size, bits<5> opcode,
+                         string asmop, SDPatternOperator opnode>
+  : NeonI_Crypto_SHA<size, opcode,
+                     (outs VPR128:$Rd), (ins VPR128:$src, VPR128:$Rn),
+                     asmop # "\t$Rd.4s, $Rn.4s",
+                     [(set (v4i32 VPR128:$Rd),
+                        (v4i32 (opnode (v4i32 VPR128:$src),
+                                       (v4i32 VPR128:$Rn))))],
+                     NoItinerary> {
+  let Constraints = "$src = $Rd";
+  let Predicates = [HasNEON, HasCrypto];
+}

-def : Pat<(v1i64 (scalar_to_vector GPR64:$src)),
-          (FMOVdx $src)>;
+def SHA1SU1 : NeonI_Cryptosha_vv<0b00, 0b00001, "sha1su1",
+                                 int_arm_neon_sha1su1>;
+def SHA256SU0 : NeonI_Cryptosha_vv<0b00, 0b00010, "sha256su0",
+                                   int_arm_neon_sha256su0>;

-def : Pat<(v1f32 (scalar_to_vector (f32 FPR32:$Rn))),
-          (v1f32 FPR32:$Rn)>;
-def : Pat<(v1f64 (scalar_to_vector (f64 FPR64:$Rn))),
-          (v1f64 FPR64:$Rn)>;
\ No newline at end of file
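+// Editor's sketch, not part of this patch: the crypto defs here map onto the
+// llvm.arm.neon.* intrinsics shared with the ARM backend, reachable from C
+// through ACLE (assuming a target with +crypto), e.g.:
+//
+//   #include <arm_neon.h>
+//   uint32x4_t sha256_sched(uint32x4_t w0_3, uint32x4_t w4_7) {
+//     return vsha256su0q_u32(w0_3, w4_7); // sha256su0 v0.4s, v1.4s
+//   }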
+class NeonI_Cryptosha_ss<bits<2> size, bits<5> opcode,
+                         string asmop, SDPatternOperator opnode>
+  : NeonI_Crypto_SHA<size, opcode,
+                     (outs FPR32:$Rd), (ins FPR32:$Rn),
+                     asmop # "\t$Rd, $Rn",
+                     [(set (v1i32 FPR32:$Rd),
+                        (v1i32 (opnode (v1i32 FPR32:$Rn))))],
+                     NoItinerary> {
+  let Predicates = [HasNEON, HasCrypto];
+}
+
+def SHA1H : NeonI_Cryptosha_ss<0b00, 0b00000, "sha1h", int_arm_neon_sha1h>;
+
+class NeonI_Cryptosha3_vvv<bits<2> size, bits<3> opcode, string asmop,
+                           SDPatternOperator opnode>
+  : NeonI_Crypto_3VSHA<size, opcode,
+                       (outs VPR128:$Rd),
+                       (ins VPR128:$src, VPR128:$Rn, VPR128:$Rm),
+                       asmop # "\t$Rd.4s, $Rn.4s, $Rm.4s",
+                       [(set (v4i32 VPR128:$Rd),
+                          (v4i32 (opnode (v4i32 VPR128:$src),
+                                         (v4i32 VPR128:$Rn),
+                                         (v4i32 VPR128:$Rm))))],
+                       NoItinerary> {
+  let Constraints = "$src = $Rd";
+  let Predicates = [HasNEON, HasCrypto];
+}
+
+def SHA1SU0 : NeonI_Cryptosha3_vvv<0b00, 0b011, "sha1su0",
+                                   int_arm_neon_sha1su0>;
+def SHA256SU1 : NeonI_Cryptosha3_vvv<0b00, 0b110, "sha256su1",
+                                     int_arm_neon_sha256su1>;
+
+class NeonI_Cryptosha3_qqv<bits<2> size, bits<3> opcode, string asmop,
+                           SDPatternOperator opnode>
+  : NeonI_Crypto_3VSHA<size, opcode,
+                       (outs FPR128:$Rd),
+                       (ins FPR128:$src, FPR128:$Rn, VPR128:$Rm),
+                       asmop # "\t$Rd, $Rn, $Rm.4s",
+                       [(set (v4i32 FPR128:$Rd),
+                          (v4i32 (opnode (v4i32 FPR128:$src),
+                                         (v4i32 FPR128:$Rn),
+                                         (v4i32 VPR128:$Rm))))],
+                       NoItinerary> {
+  let Constraints = "$src = $Rd";
+  let Predicates = [HasNEON, HasCrypto];
+}
+
+def SHA256H : NeonI_Cryptosha3_qqv<0b00, 0b100, "sha256h",
+                                   int_arm_neon_sha256h>;
+def SHA256H2 : NeonI_Cryptosha3_qqv<0b00, 0b101, "sha256h2",
+                                    int_arm_neon_sha256h2>;
+
+class NeonI_Cryptosha3_qsv<bits<2> size, bits<3> opcode, string asmop,
+                           SDPatternOperator opnode>
+  : NeonI_Crypto_3VSHA<size, opcode,
+                       (outs FPR128:$Rd),
+                       (ins FPR128:$src, FPR32:$Rn, VPR128:$Rm),
+                       asmop # "\t$Rd, $Rn, $Rm.4s",
+                       [(set (v4i32 FPR128:$Rd),
+                          (v4i32 (opnode (v4i32 FPR128:$src),
+                                         (v1i32 FPR32:$Rn),
+                                         (v4i32 VPR128:$Rm))))],
+                       NoItinerary> {
+  let Constraints = "$src = $Rd";
+  let Predicates = [HasNEON, HasCrypto];
+}
+
+def SHA1C : NeonI_Cryptosha3_qsv<0b00, 0b000, "sha1c", int_aarch64_neon_sha1c>;
+def SHA1P : NeonI_Cryptosha3_qsv<0b00, 0b001, "sha1p", int_aarch64_neon_sha1p>;
+def SHA1M : NeonI_Cryptosha3_qsv<0b00, 0b010, "sha1m", int_aarch64_neon_sha1m>;
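+// Editor's sketch, not part of this patch: the q/s-register SHA1 forms take
+// the rotating hash element as a scalar, which is why SHA1C/P/M get the
+// AArch64-specific intrinsics above. ACLE usage (assuming +crypto):
+//
+//   #include <arm_neon.h>
+//   uint32x4_t sha1_round(uint32x4_t abcd, uint32_t e, uint32x4_t wk) {
+//     return vsha1cq_u32(abcd, e, wk);    // sha1c q0, s1, v2.4s
+//   }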