X-Git-Url: http://plrg.eecs.uci.edu/git/?p=oota-llvm.git;a=blobdiff_plain;f=lib%2FTarget%2FX86%2FX86InstrAVX512.td;h=3dbc3d2abd8fdd62775ee5af007321afb70f48bb;hp=3678255e5f6df493ed6297bd26dd7fbcd9452743;hb=4a524934577d85e5095df8ea62ad6a3261076d0c;hpb=265d201e1931c30a309b4889644d423cea9befa2 diff --git a/lib/Target/X86/X86InstrAVX512.td b/lib/Target/X86/X86InstrAVX512.td index 3678255e5f6..3dbc3d2abd8 100644 --- a/lib/Target/X86/X86InstrAVX512.td +++ b/lib/Target/X86/X86InstrAVX512.td @@ -1,70 +1,231 @@ -// Common base class of AVX512_masking and AVX512_masking_3src. -multiclass AVX512_masking_common O, Format F, dag Outs, dag Ins, - dag MaskingIns, dag ZeroMaskingIns, - string OpcodeStr, - string AttSrcAsm, string IntelSrcAsm, - dag RHS, dag MaskingRHS, ValueType OpVT, - RegisterClass RC, RegisterClass KRC, - string MaskingConstraint = ""> { - def NAME: AVX512; +// Group template arguments that can be derived from the vector type (EltNum x +// EltVT). These are things like the register class for the writemask, etc. +// The idea is to pass one of these as the template argument rather than the +// individual arguments. +class X86VectorVTInfo { + RegisterClass RC = rc; + int NumElts = numelts; + + // Corresponding mask register class. + RegisterClass KRC = !cast("VK" # NumElts); + + // Corresponding write-mask register class. + RegisterClass KRCWM = !cast("VK" # NumElts # "WM"); + + // The GPR register class that can hold the write mask. Use GR8 for fewer + // than 8 elements. Use shift-right and equal to work around the lack of + // !lt in tablegen. + RegisterClass MRC = + !cast("GR" # + !if (!eq (!srl(NumElts, 3), 0), 8, NumElts)); + + // Suffix used in the instruction mnemonic. + string Suffix = suffix; + + string VTName = "v" # NumElts # EltVT; + + // The vector VT. + ValueType VT = !cast(VTName); + + string EltTypeName = !cast(EltVT); + // Size of the element type in bits, e.g. 32 for v16i32. + string EltSizeName = !subst("i", "", !subst("f", "", EltTypeName)); + int EltSize = EltVT.Size; + + // "i" for integer types and "f" for floating-point types + string TypeVariantName = !subst(EltSizeName, "", EltTypeName); + + // Size of RC in bits, e.g. 512 for VR512. + int Size = VT.Size; + + // The corresponding memory operand, e.g. i512mem for VR512. + X86MemOperand MemOp = !cast(TypeVariantName # Size # "mem"); + X86MemOperand ScalarMemOp = !cast(EltVT # "mem"); + + // Load patterns + // Note: For 128/256-bit integer VT we choose loadv2i64/loadv4i64 + // due to load promotion during legalization + PatFrag LdFrag = !cast("load" # + !if (!eq (TypeVariantName, "i"), + !if (!eq (Size, 128), "v2i64", + !if (!eq (Size, 256), "v4i64", + VTName)), VTName)); + PatFrag ScalarLdFrag = !cast("load" # EltVT); + + // Load patterns used for memory operands. We only have this defined in + // case of i64 element types for sub-512 integer vectors. For now, keep + // MemOpFrag undefined in these cases. + PatFrag MemOpFrag = + !if (!eq (TypeVariantName, "f"), !cast("memop" # VTName), + !if (!eq (EltTypeName, "i64"), !cast("memop" # VTName), + !if (!eq (VTName, "v16i32"), !cast("memop" # VTName), ?))); + + // The corresponding float type, e.g. v16f32 for v16i32 + // Note: For EltSize < 32, FloatVT is illegal and TableGen + // fails to compile, so we choose FloatVT = VT + ValueType FloatVT = !cast( + !if (!eq (!srl(EltSize,5),0), + VTName, + !if (!eq(TypeVariantName, "i"), + "v" # NumElts # "f" # EltSize, + VTName))); + + // The string to specify embedded broadcast in assembly. + string BroadcastStr = "{1to" # NumElts # "}"; + + // 8-bit compressed displacement tuple/subvector format. This is only + // defined for NumElts <= 8. + CD8VForm CD8TupleForm = !if (!eq (!srl(NumElts, 4), 0), + !cast("CD8VT" # NumElts), ?); + + SubRegIndex SubRegIdx = !if (!eq (Size, 128), sub_xmm, + !if (!eq (Size, 256), sub_ymm, ?)); + + Domain ExeDomain = !if (!eq (EltTypeName, "f32"), SSEPackedSingle, + !if (!eq (EltTypeName, "f64"), SSEPackedDouble, + SSEPackedInt)); + + // A vector type of the same width with element type i32. This is used to + // create the canonical constant zero node ImmAllZerosV. + ValueType i32VT = !cast("v" # !srl(Size, 5) # "i32"); + dag ImmAllZerosV = (VT (bitconvert (i32VT immAllZerosV))); +} + +def v64i8_info : X86VectorVTInfo<64, i8, VR512, "b">; +def v32i16_info : X86VectorVTInfo<32, i16, VR512, "w">; +def v16i32_info : X86VectorVTInfo<16, i32, VR512, "d">; +def v8i64_info : X86VectorVTInfo<8, i64, VR512, "q">; +def v16f32_info : X86VectorVTInfo<16, f32, VR512, "ps">; +def v8f64_info : X86VectorVTInfo<8, f64, VR512, "pd">; + +// "x" in v32i8x_info means RC = VR256X +def v32i8x_info : X86VectorVTInfo<32, i8, VR256X, "b">; +def v16i16x_info : X86VectorVTInfo<16, i16, VR256X, "w">; +def v8i32x_info : X86VectorVTInfo<8, i32, VR256X, "d">; +def v4i64x_info : X86VectorVTInfo<4, i64, VR256X, "q">; + +def v16i8x_info : X86VectorVTInfo<16, i8, VR128X, "b">; +def v8i16x_info : X86VectorVTInfo<8, i16, VR128X, "w">; +def v4i32x_info : X86VectorVTInfo<4, i32, VR128X, "d">; +def v2i64x_info : X86VectorVTInfo<2, i64, VR128X, "q">; + +class AVX512VLVectorVTInfo { + X86VectorVTInfo info512 = i512; + X86VectorVTInfo info256 = i256; + X86VectorVTInfo info128 = i128; +} + +def avx512vl_i8_info : AVX512VLVectorVTInfo; +def avx512vl_i16_info : AVX512VLVectorVTInfo; +def avx512vl_i32_info : AVX512VLVectorVTInfo; +def avx512vl_i64_info : AVX512VLVectorVTInfo; + +// This multiclass generates the masking variants from the non-masking +// variant. It only provides the assembly pieces for the masking variants. +// It assumes custom ISel patterns for masking which can be provided as +// template arguments. +multiclass AVX512_maskable_custom O, Format F, + dag Outs, + dag Ins, dag MaskingIns, dag ZeroMaskingIns, + string OpcodeStr, + string AttSrcAsm, string IntelSrcAsm, + list Pattern, + list MaskingPattern, + list ZeroMaskingPattern, + string MaskingConstraint = "", + InstrItinClass itin = NoItinerary, + bit IsCommutable = 0> { + let isCommutable = IsCommutable in + def NAME: AVX512; // Prefer over VMOV*rrk Pat<> let AddedComplexity = 20 in def NAME#k: AVX512, + OpcodeStr#"\t{"#AttSrcAsm#", $dst {${mask}}|"# + "$dst {${mask}}, "#IntelSrcAsm#"}", + MaskingPattern, itin>, EVEX_K { // In case of the 3src subclass this is overridden with a let. string Constraints = MaskingConstraint; } let AddedComplexity = 30 in // Prefer over VMOV*rrkz Pat<> def NAME#kz: AVX512, + OpcodeStr#"\t{"#AttSrcAsm#", $dst {${mask}} {z}|"# + "$dst {${mask}} {z}, "#IntelSrcAsm#"}", + ZeroMaskingPattern, + itin>, EVEX_KZ; } + +// Common base class of AVX512_maskable and AVX512_maskable_3src. +multiclass AVX512_maskable_common O, Format F, X86VectorVTInfo _, + dag Outs, + dag Ins, dag MaskingIns, dag ZeroMaskingIns, + string OpcodeStr, + string AttSrcAsm, string IntelSrcAsm, + dag RHS, dag MaskingRHS, + string MaskingConstraint = "", + InstrItinClass itin = NoItinerary, + bit IsCommutable = 0> : + AVX512_maskable_custom; + // This multiclass generates the unconditional/non-masking, the masking and // the zero-masking variant of the instruction. In the masking case, the // perserved vector elements come from a new dummy input operand tied to $dst. -multiclass AVX512_masking O, Format F, dag Outs, dag Ins, - string OpcodeStr, - string AttSrcAsm, string IntelSrcAsm, - dag RHS, ValueType OpVT, RegisterClass RC, - RegisterClass KRC> : - AVX512_masking_common; - -// Similar to AVX512_masking but in this case one of the source operands +multiclass AVX512_maskable O, Format F, X86VectorVTInfo _, + dag Outs, dag Ins, string OpcodeStr, + string AttSrcAsm, string IntelSrcAsm, + dag RHS, InstrItinClass itin = NoItinerary, + bit IsCommutable = 0> : + AVX512_maskable_common; + +// Similar to AVX512_maskable but in this case one of the source operands // ($src1) is already tied to $dst so we just use that for the preserved // vector elements. NOTE that the NonTiedIns (the ins dag) should exclude // $src1. -multiclass AVX512_masking_3src O, Format F, dag Outs, dag NonTiedIns, - string OpcodeStr, - string AttSrcAsm, string IntelSrcAsm, - dag RHS, ValueType OpVT, - RegisterClass RC, RegisterClass KRC> : - AVX512_masking_common; +multiclass AVX512_maskable_3src O, Format F, X86VectorVTInfo _, + dag Outs, dag NonTiedIns, string OpcodeStr, + string AttSrcAsm, string IntelSrcAsm, + dag RHS> : + AVX512_maskable_common; + + +multiclass AVX512_maskable_in_asm O, Format F, X86VectorVTInfo _, + dag Outs, dag Ins, + string OpcodeStr, + string AttSrcAsm, string IntelSrcAsm, + list Pattern> : + AVX512_maskable_custom; // Bitcasts between 512-bit vector types. Return the original type since // no instruction is needed for the conversion @@ -185,119 +346,92 @@ def : Pat<(v8f64 immAllZerosV), (AVX512_512_SET0)>; //===----------------------------------------------------------------------===// // AVX-512 - VECTOR INSERT // -// -- 32x8 form -- -let hasSideEffects = 0, ExeDomain = SSEPackedSingle in { -def VINSERTF32x4rr : AVX512AIi8<0x18, MRMSrcReg, (outs VR512:$dst), - (ins VR512:$src1, VR128X:$src2, i8imm:$src3), - "vinsertf32x4\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}", - []>, EVEX_4V, EVEX_V512; -let mayLoad = 1 in -def VINSERTF32x4rm : AVX512AIi8<0x18, MRMSrcMem, (outs VR512:$dst), - (ins VR512:$src1, f128mem:$src2, i8imm:$src3), - "vinsertf32x4\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}", - []>, EVEX_4V, EVEX_V512, EVEX_CD8<32, CD8VT4>; -} - -// -- 64x4 fp form -- -let hasSideEffects = 0, ExeDomain = SSEPackedDouble in { -def VINSERTF64x4rr : AVX512AIi8<0x1a, MRMSrcReg, (outs VR512:$dst), - (ins VR512:$src1, VR256X:$src2, i8imm:$src3), - "vinsertf64x4\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}", - []>, EVEX_4V, EVEX_V512, VEX_W; -let mayLoad = 1 in -def VINSERTF64x4rm : AVX512AIi8<0x1a, MRMSrcMem, (outs VR512:$dst), - (ins VR512:$src1, i256mem:$src2, i8imm:$src3), - "vinsertf64x4\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}", - []>, EVEX_4V, EVEX_V512, VEX_W, EVEX_CD8<64, CD8VT4>; -} -// -- 32x4 integer form -- -let hasSideEffects = 0 in { -def VINSERTI32x4rr : AVX512AIi8<0x38, MRMSrcReg, (outs VR512:$dst), - (ins VR512:$src1, VR128X:$src2, i8imm:$src3), - "vinserti32x4\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}", - []>, EVEX_4V, EVEX_V512; -let mayLoad = 1 in -def VINSERTI32x4rm : AVX512AIi8<0x38, MRMSrcMem, (outs VR512:$dst), - (ins VR512:$src1, i128mem:$src2, i8imm:$src3), - "vinserti32x4\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}", - []>, EVEX_4V, EVEX_V512, EVEX_CD8<32, CD8VT4>; + +multiclass vinsert_for_size_no_alt { + let hasSideEffects = 0, ExeDomain = To.ExeDomain in { + def rr : AVX512AIi8, + EVEX_4V, EVEX_V512; + + let mayLoad = 1 in + def rm : AVX512AIi8, + EVEX_4V, EVEX_V512, EVEX_CD8; + } } -let hasSideEffects = 0 in { -// -- 64x4 form -- -def VINSERTI64x4rr : AVX512AIi8<0x3a, MRMSrcReg, (outs VR512:$dst), - (ins VR512:$src1, VR256X:$src2, i8imm:$src3), - "vinserti64x4\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}", - []>, EVEX_4V, EVEX_V512, VEX_W; -let mayLoad = 1 in -def VINSERTI64x4rm : AVX512AIi8<0x3a, MRMSrcMem, (outs VR512:$dst), - (ins VR512:$src1, i256mem:$src2, i8imm:$src3), - "vinserti64x4\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}", - []>, EVEX_4V, EVEX_V512, VEX_W, EVEX_CD8<64, CD8VT4>; -} - -def : Pat<(vinsert128_insert:$ins (v16f32 VR512:$src1), (v4f32 VR128X:$src2), - (iPTR imm)), (VINSERTF32x4rr VR512:$src1, VR128X:$src2, - (INSERT_get_vinsert128_imm VR512:$ins))>; -def : Pat<(vinsert128_insert:$ins (v8f64 VR512:$src1), (v2f64 VR128X:$src2), - (iPTR imm)), (VINSERTF32x4rr VR512:$src1, VR128X:$src2, - (INSERT_get_vinsert128_imm VR512:$ins))>; -def : Pat<(vinsert128_insert:$ins (v8i64 VR512:$src1), (v2i64 VR128X:$src2), - (iPTR imm)), (VINSERTI32x4rr VR512:$src1, VR128X:$src2, - (INSERT_get_vinsert128_imm VR512:$ins))>; -def : Pat<(vinsert128_insert:$ins (v16i32 VR512:$src1), (v4i32 VR128X:$src2), - (iPTR imm)), (VINSERTI32x4rr VR512:$src1, VR128X:$src2, - (INSERT_get_vinsert128_imm VR512:$ins))>; - -def : Pat<(vinsert128_insert:$ins (v16f32 VR512:$src1), (loadv4f32 addr:$src2), - (iPTR imm)), (VINSERTF32x4rm VR512:$src1, addr:$src2, - (INSERT_get_vinsert128_imm VR512:$ins))>; -def : Pat<(vinsert128_insert:$ins (v16i32 VR512:$src1), - (bc_v4i32 (loadv2i64 addr:$src2)), - (iPTR imm)), (VINSERTI32x4rm VR512:$src1, addr:$src2, - (INSERT_get_vinsert128_imm VR512:$ins))>; -def : Pat<(vinsert128_insert:$ins (v8f64 VR512:$src1), (loadv2f64 addr:$src2), - (iPTR imm)), (VINSERTF32x4rm VR512:$src1, addr:$src2, - (INSERT_get_vinsert128_imm VR512:$ins))>; -def : Pat<(vinsert128_insert:$ins (v8i64 VR512:$src1), (loadv2i64 addr:$src2), - (iPTR imm)), (VINSERTI32x4rm VR512:$src1, addr:$src2, - (INSERT_get_vinsert128_imm VR512:$ins))>; - -def : Pat<(vinsert256_insert:$ins (v16f32 VR512:$src1), (v8f32 VR256X:$src2), - (iPTR imm)), (VINSERTF64x4rr VR512:$src1, VR256X:$src2, - (INSERT_get_vinsert256_imm VR512:$ins))>; -def : Pat<(vinsert256_insert:$ins (v8f64 VR512:$src1), (v4f64 VR256X:$src2), - (iPTR imm)), (VINSERTF64x4rr VR512:$src1, VR256X:$src2, - (INSERT_get_vinsert256_imm VR512:$ins))>; -def : Pat<(vinsert128_insert:$ins (v8i64 VR512:$src1), (v4i64 VR256X:$src2), - (iPTR imm)), (VINSERTI64x4rr VR512:$src1, VR256X:$src2, - (INSERT_get_vinsert256_imm VR512:$ins))>; -def : Pat<(vinsert128_insert:$ins (v16i32 VR512:$src1), (v8i32 VR256X:$src2), - (iPTR imm)), (VINSERTI64x4rr VR512:$src1, VR256X:$src2, - (INSERT_get_vinsert256_imm VR512:$ins))>; - -def : Pat<(vinsert256_insert:$ins (v16f32 VR512:$src1), (loadv8f32 addr:$src2), - (iPTR imm)), (VINSERTF64x4rm VR512:$src1, addr:$src2, - (INSERT_get_vinsert256_imm VR512:$ins))>; -def : Pat<(vinsert256_insert:$ins (v8f64 VR512:$src1), (loadv4f64 addr:$src2), - (iPTR imm)), (VINSERTF64x4rm VR512:$src1, addr:$src2, - (INSERT_get_vinsert256_imm VR512:$ins))>; -def : Pat<(vinsert256_insert:$ins (v8i64 VR512:$src1), (loadv4i64 addr:$src2), - (iPTR imm)), (VINSERTI64x4rm VR512:$src1, addr:$src2, - (INSERT_get_vinsert256_imm VR512:$ins))>; -def : Pat<(vinsert256_insert:$ins (v16i32 VR512:$src1), - (bc_v8i32 (loadv4i64 addr:$src2)), - (iPTR imm)), (VINSERTI64x4rm VR512:$src1, addr:$src2, - (INSERT_get_vinsert256_imm VR512:$ins))>; +multiclass vinsert_for_size : + vinsert_for_size_no_alt { + // Codegen pattern with the alternative types, e.g. v2i64 -> v8i64 for + // vinserti32x4. Only add this if 64x2 and friends are not supported + // natively via AVX512DQ. + let Predicates = [NoDQI] in + def : Pat<(vinsert_insert:$ins + (AltTo.VT VR512:$src1), (AltFrom.VT From.RC:$src2), (iPTR imm)), + (AltTo.VT (!cast(NAME # From.EltSize # "x4rr") + VR512:$src1, From.RC:$src2, + (INSERT_get_vinsert_imm VR512:$ins)))>; +} + +multiclass vinsert_for_type { + defm NAME # "32x4" : vinsert_for_size, + X86VectorVTInfo<16, EltVT32, VR512>, + X86VectorVTInfo< 2, EltVT64, VR128X>, + X86VectorVTInfo< 8, EltVT64, VR512>, + vinsert128_insert, + INSERT_get_vinsert128_imm>; + let Predicates = [HasDQI] in + defm NAME # "64x2" : vinsert_for_size_no_alt, + X86VectorVTInfo< 8, EltVT64, VR512>, + vinsert128_insert, + INSERT_get_vinsert128_imm>, VEX_W; + defm NAME # "64x4" : vinsert_for_size, + X86VectorVTInfo< 8, EltVT64, VR512>, + X86VectorVTInfo< 8, EltVT32, VR256>, + X86VectorVTInfo<16, EltVT32, VR512>, + vinsert256_insert, + INSERT_get_vinsert256_imm>, VEX_W; + let Predicates = [HasDQI] in + defm NAME # "32x8" : vinsert_for_size_no_alt, + X86VectorVTInfo<16, EltVT32, VR512>, + vinsert256_insert, + INSERT_get_vinsert256_imm>; +} + +defm VINSERTF : vinsert_for_type; +defm VINSERTI : vinsert_for_type; // vinsertps - insert f32 to XMM def VINSERTPSzrr : AVX512AIi8<0x21, MRMSrcReg, (outs VR128X:$dst), - (ins VR128X:$src1, VR128X:$src2, u32u8imm:$src3), + (ins VR128X:$src1, VR128X:$src2, i8imm:$src3), "vinsertps\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}", [(set VR128X:$dst, (X86insertps VR128X:$src1, VR128X:$src2, imm:$src3))]>, EVEX_4V; def VINSERTPSzrm: AVX512AIi8<0x21, MRMSrcMem, (outs VR128X:$dst), - (ins VR128X:$src1, f32mem:$src2, u32u8imm:$src3), + (ins VR128X:$src1, f32mem:$src2, i8imm:$src3), "vinsertps\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}", [(set VR128X:$dst, (X86insertps VR128X:$src1, (v4f32 (scalar_to_vector (loadf32 addr:$src2))), @@ -306,106 +440,90 @@ def VINSERTPSzrm: AVX512AIi8<0x21, MRMSrcMem, (outs VR128X:$dst), //===----------------------------------------------------------------------===// // AVX-512 VECTOR EXTRACT //--- -let hasSideEffects = 0, ExeDomain = SSEPackedSingle in { -// -- 32x4 form -- -def VEXTRACTF32x4rr : AVX512AIi8<0x19, MRMDestReg, (outs VR128X:$dst), - (ins VR512:$src1, i8imm:$src2), - "vextractf32x4\t{$src2, $src1, $dst|$dst, $src1, $src2}", - []>, EVEX, EVEX_V512; -def VEXTRACTF32x4mr : AVX512AIi8<0x19, MRMDestMem, (outs), - (ins f128mem:$dst, VR512:$src1, i8imm:$src2), - "vextractf32x4\t{$src2, $src1, $dst|$dst, $src1, $src2}", - []>, EVEX, EVEX_V512, EVEX_CD8<32, CD8VT4>; - -// -- 64x4 form -- -def VEXTRACTF64x4rr : AVX512AIi8<0x1b, MRMDestReg, (outs VR256X:$dst), - (ins VR512:$src1, i8imm:$src2), - "vextractf64x4\t{$src2, $src1, $dst|$dst, $src1, $src2}", - []>, EVEX, EVEX_V512, VEX_W; -let mayStore = 1 in -def VEXTRACTF64x4mr : AVX512AIi8<0x1b, MRMDestMem, (outs), - (ins f256mem:$dst, VR512:$src1, i8imm:$src2), - "vextractf64x4\t{$src2, $src1, $dst|$dst, $src1, $src2}", - []>, EVEX, EVEX_V512, VEX_W, EVEX_CD8<64, CD8VT4>; -} -let hasSideEffects = 0 in { -// -- 32x4 form -- -def VEXTRACTI32x4rr : AVX512AIi8<0x39, MRMDestReg, (outs VR128X:$dst), - (ins VR512:$src1, i8imm:$src2), - "vextracti32x4\t{$src2, $src1, $dst|$dst, $src1, $src2}", - []>, EVEX, EVEX_V512; -def VEXTRACTI32x4mr : AVX512AIi8<0x39, MRMDestMem, (outs), - (ins i128mem:$dst, VR512:$src1, i8imm:$src2), - "vextracti32x4\t{$src2, $src1, $dst|$dst, $src1, $src2}", - []>, EVEX, EVEX_V512, EVEX_CD8<32, CD8VT4>; - -// -- 64x4 form -- -def VEXTRACTI64x4rr : AVX512AIi8<0x3b, MRMDestReg, (outs VR256X:$dst), - (ins VR512:$src1, i8imm:$src2), - "vextracti64x4\t{$src2, $src1, $dst|$dst, $src1, $src2}", - []>, EVEX, EVEX_V512, VEX_W; -let mayStore = 1 in -def VEXTRACTI64x4mr : AVX512AIi8<0x3b, MRMDestMem, (outs), - (ins i256mem:$dst, VR512:$src1, i8imm:$src2), - "vextracti64x4\t{$src2, $src1, $dst|$dst, $src1, $src2}", - []>, EVEX, EVEX_V512, VEX_W, EVEX_CD8<64, CD8VT4>; -} - -def : Pat<(vextract128_extract:$ext (v16f32 VR512:$src1), (iPTR imm)), - (v4f32 (VEXTRACTF32x4rr VR512:$src1, - (EXTRACT_get_vextract128_imm VR128X:$ext)))>; - -def : Pat<(vextract128_extract:$ext VR512:$src1, (iPTR imm)), - (v4i32 (VEXTRACTF32x4rr VR512:$src1, - (EXTRACT_get_vextract128_imm VR128X:$ext)))>; - -def : Pat<(vextract128_extract:$ext (v8f64 VR512:$src1), (iPTR imm)), - (v2f64 (VEXTRACTF32x4rr VR512:$src1, - (EXTRACT_get_vextract128_imm VR128X:$ext)))>; - -def : Pat<(vextract128_extract:$ext (v8i64 VR512:$src1), (iPTR imm)), - (v2i64 (VEXTRACTI32x4rr VR512:$src1, - (EXTRACT_get_vextract128_imm VR128X:$ext)))>; - - -def : Pat<(vextract256_extract:$ext (v16f32 VR512:$src1), (iPTR imm)), - (v8f32 (VEXTRACTF64x4rr VR512:$src1, - (EXTRACT_get_vextract256_imm VR256X:$ext)))>; - -def : Pat<(vextract256_extract:$ext (v16i32 VR512:$src1), (iPTR imm)), - (v8i32 (VEXTRACTI64x4rr VR512:$src1, - (EXTRACT_get_vextract256_imm VR256X:$ext)))>; - -def : Pat<(vextract256_extract:$ext (v8f64 VR512:$src1), (iPTR imm)), - (v4f64 (VEXTRACTF64x4rr VR512:$src1, - (EXTRACT_get_vextract256_imm VR256X:$ext)))>; - -def : Pat<(vextract256_extract:$ext (v8i64 VR512:$src1), (iPTR imm)), - (v4i64 (VEXTRACTI64x4rr VR512:$src1, - (EXTRACT_get_vextract256_imm VR256X:$ext)))>; - -// A 256-bit subvector extract from the first 512-bit vector position -// is a subregister copy that needs no instruction. -def : Pat<(v8i32 (extract_subvector (v16i32 VR512:$src), (iPTR 0))), - (v8i32 (EXTRACT_SUBREG (v16i32 VR512:$src), sub_ymm))>; -def : Pat<(v8f32 (extract_subvector (v16f32 VR512:$src), (iPTR 0))), - (v8f32 (EXTRACT_SUBREG (v16f32 VR512:$src), sub_ymm))>; -def : Pat<(v4i64 (extract_subvector (v8i64 VR512:$src), (iPTR 0))), - (v4i64 (EXTRACT_SUBREG (v8i64 VR512:$src), sub_ymm))>; -def : Pat<(v4f64 (extract_subvector (v8f64 VR512:$src), (iPTR 0))), - (v4f64 (EXTRACT_SUBREG (v8f64 VR512:$src), sub_ymm))>; - -// zmm -> xmm -def : Pat<(v4i32 (extract_subvector (v16i32 VR512:$src), (iPTR 0))), - (v4i32 (EXTRACT_SUBREG (v16i32 VR512:$src), sub_xmm))>; -def : Pat<(v2i64 (extract_subvector (v8i64 VR512:$src), (iPTR 0))), - (v2i64 (EXTRACT_SUBREG (v8i64 VR512:$src), sub_xmm))>; -def : Pat<(v2f64 (extract_subvector (v8f64 VR512:$src), (iPTR 0))), - (v2f64 (EXTRACT_SUBREG (v8f64 VR512:$src), sub_xmm))>; -def : Pat<(v4f32 (extract_subvector (v16f32 VR512:$src), (iPTR 0))), - (v4f32 (EXTRACT_SUBREG (v16f32 VR512:$src), sub_xmm))>; +multiclass vextract_for_size { + let hasSideEffects = 0, ExeDomain = To.ExeDomain in { + defm rr : AVX512_maskable_in_asm, + AVX512AIi8Base, EVEX, EVEX_V512; + let mayStore = 1 in + def rm : AVX512AIi8, EVEX, EVEX_V512, EVEX_CD8; + } + // Codegen pattern with the alternative types, e.g. v8i64 -> v2i64 for + // vextracti32x4 + def : Pat<(vextract_extract:$ext (AltFrom.VT VR512:$src1), (iPTR imm)), + (AltTo.VT (!cast(NAME # To.EltSize # "x4rr") + VR512:$src1, + (EXTRACT_get_vextract_imm To.RC:$ext)))>; + + // A 128/256-bit subvector extract from the first 512-bit vector position is + // a subregister copy that needs no instruction. + def : Pat<(To.VT (extract_subvector (From.VT VR512:$src), (iPTR 0))), + (To.VT + (EXTRACT_SUBREG (From.VT VR512:$src), To.SubRegIdx))>; + + // And for the alternative types. + def : Pat<(AltTo.VT (extract_subvector (AltFrom.VT VR512:$src), (iPTR 0))), + (AltTo.VT + (EXTRACT_SUBREG (AltFrom.VT VR512:$src), AltTo.SubRegIdx))>; + + // Intrinsic call with masking. + def : Pat<(!cast("int_x86_avx512_mask_vextract" # To.EltTypeName # + "x4_512") + VR512:$src1, (iPTR imm:$idx), To.RC:$src0, GR8:$mask), + (!cast(NAME # To.EltSize # "x4rrk") To.RC:$src0, + (v4i1 (COPY_TO_REGCLASS GR8:$mask, VK4WM)), + VR512:$src1, imm:$idx)>; + + // Intrinsic call with zero-masking. + def : Pat<(!cast("int_x86_avx512_mask_vextract" # To.EltTypeName # + "x4_512") + VR512:$src1, (iPTR imm:$idx), To.ImmAllZerosV, GR8:$mask), + (!cast(NAME # To.EltSize # "x4rrkz") + (v4i1 (COPY_TO_REGCLASS GR8:$mask, VK4WM)), + VR512:$src1, imm:$idx)>; + + // Intrinsic call without masking. + def : Pat<(!cast("int_x86_avx512_mask_vextract" # To.EltTypeName # + "x4_512") + VR512:$src1, (iPTR imm:$idx), To.ImmAllZerosV, (i8 -1)), + (!cast(NAME # To.EltSize # "x4rr") + VR512:$src1, imm:$idx)>; +} + +multiclass vextract_for_type { + defm NAME # "32x4" : vextract_for_size, + X86VectorVTInfo< 4, EltVT32, VR128X>, + X86VectorVTInfo< 8, EltVT64, VR512>, + X86VectorVTInfo< 2, EltVT64, VR128X>, + vextract128_extract, + EXTRACT_get_vextract128_imm>; + defm NAME # "64x4" : vextract_for_size, + X86VectorVTInfo< 4, EltVT64, VR256X>, + X86VectorVTInfo<16, EltVT32, VR512>, + X86VectorVTInfo< 8, EltVT32, VR256>, + vextract256_extract, + EXTRACT_get_vextract256_imm>, VEX_W; +} + +defm VEXTRACTF : vextract_for_type; +defm VEXTRACTI : vextract_for_type; // A 128-bit subvector insert to the first 512-bit vector position // is a subregister copy that needs no instruction. @@ -437,13 +555,13 @@ def : Pat<(insert_subvector undef, (v8f32 VR256X:$src), (iPTR 0)), // vextractps - extract 32 bits from XMM def VEXTRACTPSzrr : AVX512AIi8<0x17, MRMDestReg, (outs GR32:$dst), - (ins VR128X:$src1, u32u8imm:$src2), + (ins VR128X:$src1, i32i8imm:$src2), "vextractps\t{$src2, $src1, $dst|$dst, $src1, $src2}", [(set GR32:$dst, (extractelt (bc_v4i32 (v4f32 VR128X:$src1)), imm:$src2))]>, EVEX; def VEXTRACTPSzmr : AVX512AIi8<0x17, MRMDestMem, (outs), - (ins f32mem:$dst, VR128X:$src1, u32u8imm:$src2), + (ins f32mem:$dst, VR128X:$src1, i32i8imm:$src2), "vextractps\t{$src2, $src1, $dst|$dst, $src1, $src2}", [(store (extractelt (bc_v4i32 (v4f32 VR128X:$src1)), imm:$src2), addr:$dst)]>, EVEX, EVEX_CD8<32, CD8VT1>; @@ -593,6 +711,16 @@ def : Pat<(v16f32 (X86VBroadcast (v4f32 VR128X:$src))), def : Pat<(v8f64 (X86VBroadcast (v2f64 VR128X:$src))), (VBROADCASTSDZrr VR128X:$src)>; +def : Pat<(v16f32 (X86VBroadcast (v16f32 VR512:$src))), + (VBROADCASTSSZrr (EXTRACT_SUBREG (v16f32 VR512:$src), sub_xmm))>; +def : Pat<(v8f64 (X86VBroadcast (v8f64 VR512:$src))), + (VBROADCASTSDZrr (EXTRACT_SUBREG (v8f64 VR512:$src), sub_xmm))>; + +def : Pat<(v16i32 (X86VBroadcast (v16i32 VR512:$src))), + (VPBROADCASTDZrr (EXTRACT_SUBREG (v16i32 VR512:$src), sub_xmm))>; +def : Pat<(v8i64 (X86VBroadcast (v8i64 VR512:$src))), + (VPBROADCASTQZrr (EXTRACT_SUBREG (v8i64 VR512:$src), sub_xmm))>; + def : Pat<(v16f32 (int_x86_avx512_vbroadcast_ss_ps_512 (v4f32 VR128X:$src))), (VBROADCASTSSZrr VR128X:$src)>; def : Pat<(v8f64 (int_x86_avx512_vbroadcast_sd_pd_512 (v2f64 VR128X:$src))), @@ -617,48 +745,91 @@ def : Pat<(v8i32 (X86VBroadcastm (v8i1 VK8WM:$mask), (loadi32 addr:$src))), //--- multiclass avx512_mask_broadcast opc, string OpcodeStr, - RegisterClass DstRC, RegisterClass KRC, - ValueType OpVT, ValueType SrcVT> { -def rr : AVX512XS8I { +let Predicates = [HasCDI] in +def Zrr : AVX512XS8I, EVEX; + []>, EVEX, EVEX_V512; + +let Predicates = [HasCDI, HasVLX] in { +def Z128rr : AVX512XS8I, EVEX, EVEX_V128; +def Z256rr : AVX512XS8I, EVEX, EVEX_V256; +} } let Predicates = [HasCDI] in { -defm VPBROADCASTMW2D : avx512_mask_broadcast<0x3A, "vpbroadcastmw2d", VR512, - VK16, v16i32, v16i1>, EVEX_V512; -defm VPBROADCASTMB2Q : avx512_mask_broadcast<0x2A, "vpbroadcastmb2q", VR512, - VK8, v8i64, v8i1>, EVEX_V512, VEX_W; +defm VPBROADCASTMW2D : avx512_mask_broadcast<0x3A, "vpbroadcastmw2d", + VK16>; +defm VPBROADCASTMB2Q : avx512_mask_broadcast<0x2A, "vpbroadcastmb2q", + VK8>, VEX_W; } //===----------------------------------------------------------------------===// // AVX-512 - VPERM // // -- immediate form -- -multiclass avx512_perm_imm opc, string OpcodeStr, RegisterClass RC, - SDNode OpNode, PatFrag mem_frag, - X86MemOperand x86memop, ValueType OpVT> { - def ri : AVX512AIi8 opc, string OpcodeStr, SDNode OpNode, + X86VectorVTInfo _> { + let ExeDomain = _.ExeDomain in { + def ri : AVX512AIi8, + [(set _.RC:$dst, + (_.VT (OpNode _.RC:$src1, (i8 imm:$src2))))]>, EVEX; - def mi : AVX512AIi8, EVEX; + [(set _.RC:$dst, + (_.VT (OpNode (_.MemOpFrag addr:$src1), + (i8 imm:$src2))))]>, + EVEX, EVEX_CD8<_.EltSize, CD8VF>; +} +} + +multiclass avx512_permil OpcImm, bits<8> OpcVar, X86VectorVTInfo _, + X86VectorVTInfo Ctrl> : + avx512_perm_imm { + let ExeDomain = _.ExeDomain in { + def rr : AVX5128I, + EVEX_4V; + def rm : AVX5128I, + EVEX_4V; + } } -defm VPERMQZ : avx512_perm_imm<0x00, "vpermq", VR512, X86VPermi, memopv8i64, - i512mem, v8i64>, EVEX_V512, VEX_W, EVEX_CD8<64, CD8VF>; -let ExeDomain = SSEPackedDouble in -defm VPERMPDZ : avx512_perm_imm<0x01, "vpermpd", VR512, X86VPermi, memopv8f64, - f512mem, v8f64>, EVEX_V512, VEX_W, EVEX_CD8<64, CD8VF>; +defm VPERMQZ : avx512_perm_imm<0x00, "vpermq", X86VPermi, v8i64_info>, + EVEX_V512, VEX_W; +defm VPERMPDZ : avx512_perm_imm<0x01, "vpermpd", X86VPermi, v8f64_info>, + EVEX_V512, VEX_W; + +defm VPERMILPSZ : avx512_permil<0x04, 0x0C, v16f32_info, v16i32_info>, + EVEX_V512; +defm VPERMILPDZ : avx512_permil<0x05, 0x0D, v8f64_info, v8i64_info>, + EVEX_V512, VEX_W; + +def : Pat<(v16i32 (X86VPermilpi VR512:$src1, (i8 imm:$imm))), + (VPERMILPSZri VR512:$src1, imm:$imm)>; +def : Pat<(v8i64 (X86VPermilpi VR512:$src1, (i8 imm:$imm))), + (VPERMILPDZri VR512:$src1, imm:$imm)>; // -- VPERM - register form -- multiclass avx512_perm opc, string OpcodeStr, RegisterClass RC, @@ -919,98 +1090,295 @@ defm VCMPSDZ : avx512_cmp_scalar opc, string OpcodeStr, RegisterClass KRC, - RegisterClass RC, X86MemOperand x86memop, PatFrag memop_frag, - SDNode OpNode, ValueType vt> { +multiclass avx512_icmp_packed opc, string OpcodeStr, SDNode OpNode, + X86VectorVTInfo _> { def rr : AVX512BI, EVEX_4V; + let mayLoad = 1 in def rm : AVX512BI, EVEX_4V; + def rrk : AVX512BI, EVEX_4V, EVEX_K; + let mayLoad = 1 in + def rmk : AVX512BI, EVEX_4V, EVEX_K; +} + +multiclass avx512_icmp_packed_rmb opc, string OpcodeStr, SDNode OpNode, + X86VectorVTInfo _> : + avx512_icmp_packed { + let mayLoad = 1 in { + def rmb : AVX512BI, EVEX_4V, EVEX_B; + def rmbk : AVX512BI, EVEX_4V, EVEX_K, EVEX_B; + } } -defm VPCMPEQDZ : avx512_icmp_packed<0x76, "vpcmpeqd", VK16, VR512, i512mem, - memopv16i32, X86pcmpeqm, v16i32>, EVEX_V512, - EVEX_CD8<32, CD8VF>; -defm VPCMPEQQZ : avx512_icmp_packed<0x29, "vpcmpeqq", VK8, VR512, i512mem, - memopv8i64, X86pcmpeqm, v8i64>, T8PD, EVEX_V512, - VEX_W, EVEX_CD8<64, CD8VF>; +multiclass avx512_icmp_packed_vl opc, string OpcodeStr, SDNode OpNode, + AVX512VLVectorVTInfo VTInfo, Predicate prd> { + let Predicates = [prd] in + defm Z : avx512_icmp_packed, + EVEX_V512; -defm VPCMPGTDZ : avx512_icmp_packed<0x66, "vpcmpgtd", VK16, VR512, i512mem, - memopv16i32, X86pcmpgtm, v16i32>, EVEX_V512, - EVEX_CD8<32, CD8VF>; -defm VPCMPGTQZ : avx512_icmp_packed<0x37, "vpcmpgtq", VK8, VR512, i512mem, - memopv8i64, X86pcmpgtm, v8i64>, T8PD, EVEX_V512, - VEX_W, EVEX_CD8<64, CD8VF>; + let Predicates = [prd, HasVLX] in { + defm Z256 : avx512_icmp_packed, + EVEX_V256; + defm Z128 : avx512_icmp_packed, + EVEX_V128; + } +} + +multiclass avx512_icmp_packed_rmb_vl opc, string OpcodeStr, + SDNode OpNode, AVX512VLVectorVTInfo VTInfo, + Predicate prd> { + let Predicates = [prd] in + defm Z : avx512_icmp_packed_rmb, + EVEX_V512; + + let Predicates = [prd, HasVLX] in { + defm Z256 : avx512_icmp_packed_rmb, + EVEX_V256; + defm Z128 : avx512_icmp_packed_rmb, + EVEX_V128; + } +} + +defm VPCMPEQB : avx512_icmp_packed_vl<0x74, "vpcmpeqb", X86pcmpeqm, + avx512vl_i8_info, HasBWI>, + EVEX_CD8<8, CD8VF>; + +defm VPCMPEQW : avx512_icmp_packed_vl<0x75, "vpcmpeqw", X86pcmpeqm, + avx512vl_i16_info, HasBWI>, + EVEX_CD8<16, CD8VF>; + +defm VPCMPEQD : avx512_icmp_packed_rmb_vl<0x76, "vpcmpeqd", X86pcmpeqm, + avx512vl_i32_info, HasAVX512>, + EVEX_CD8<32, CD8VF>; + +defm VPCMPEQQ : avx512_icmp_packed_rmb_vl<0x29, "vpcmpeqq", X86pcmpeqm, + avx512vl_i64_info, HasAVX512>, + T8PD, VEX_W, EVEX_CD8<64, CD8VF>; + +defm VPCMPGTB : avx512_icmp_packed_vl<0x64, "vpcmpgtb", X86pcmpgtm, + avx512vl_i8_info, HasBWI>, + EVEX_CD8<8, CD8VF>; + +defm VPCMPGTW : avx512_icmp_packed_vl<0x65, "vpcmpgtw", X86pcmpgtm, + avx512vl_i16_info, HasBWI>, + EVEX_CD8<16, CD8VF>; + +defm VPCMPGTD : avx512_icmp_packed_rmb_vl<0x66, "vpcmpgtd", X86pcmpgtm, + avx512vl_i32_info, HasAVX512>, + EVEX_CD8<32, CD8VF>; + +defm VPCMPGTQ : avx512_icmp_packed_rmb_vl<0x37, "vpcmpgtq", X86pcmpgtm, + avx512vl_i64_info, HasAVX512>, + T8PD, VEX_W, EVEX_CD8<64, CD8VF>; def : Pat<(v8i1 (X86pcmpgtm (v8i32 VR256X:$src1), (v8i32 VR256X:$src2))), - (COPY_TO_REGCLASS (VPCMPGTDZrr + (COPY_TO_REGCLASS (VPCMPGTDZrr (v16i32 (SUBREG_TO_REG (i32 0), VR256X:$src1, sub_ymm)), (v16i32 (SUBREG_TO_REG (i32 0), VR256X:$src2, sub_ymm))), VK8)>; def : Pat<(v8i1 (X86pcmpeqm (v8i32 VR256X:$src1), (v8i32 VR256X:$src2))), - (COPY_TO_REGCLASS (VPCMPEQDZrr + (COPY_TO_REGCLASS (VPCMPEQDZrr (v16i32 (SUBREG_TO_REG (i32 0), VR256X:$src1, sub_ymm)), (v16i32 (SUBREG_TO_REG (i32 0), VR256X:$src2, sub_ymm))), VK8)>; -multiclass avx512_icmp_cc opc, RegisterClass WMRC, RegisterClass KRC, - RegisterClass RC, X86MemOperand x86memop, PatFrag memop_frag, - SDNode OpNode, ValueType vt, Operand CC, string Suffix> { +multiclass avx512_icmp_cc opc, string Suffix, SDNode OpNode, + X86VectorVTInfo _> { def rri : AVX512AIi8, EVEX_4V; + let mayLoad = 1 in def rmi : AVX512AIi8, EVEX_4V; + [(set _.KRC:$dst, (OpNode (_.VT _.RC:$src1), + (_.VT (bitconvert (_.LdFrag addr:$src2))), + imm:$cc))], + IIC_SSE_ALU_F32P_RM>, EVEX_4V; + def rrik : AVX512AIi8, EVEX_4V, EVEX_K; + let mayLoad = 1 in + def rmik : AVX512AIi8, EVEX_4V, EVEX_K; + // Accept explicit immediate argument form instead of comparison code. let isAsmParserOnly = 1, hasSideEffects = 0 in { def rri_alt : AVX512AIi8, EVEX_4V; + def rmi_alt : AVX512AIi8, EVEX_4V; def rrik_alt : AVX512AIi8, EVEX_4V, EVEX_K; - def rmi_alt : AVX512AIi8, EVEX_4V; def rmik_alt : AVX512AIi8, EVEX_4V, EVEX_K; } } -defm VPCMPDZ : avx512_icmp_cc<0x1F, VK16WM, VK16, VR512, i512mem, memopv16i32, - X86cmpm, v16i32, AVXCC, "d">, - EVEX_V512, EVEX_CD8<32, CD8VF>; -defm VPCMPUDZ : avx512_icmp_cc<0x1E, VK16WM, VK16, VR512, i512mem, memopv16i32, - X86cmpmu, v16i32, AVXCC, "ud">, - EVEX_V512, EVEX_CD8<32, CD8VF>; +multiclass avx512_icmp_cc_rmb opc, string Suffix, SDNode OpNode, + X86VectorVTInfo _> : + avx512_icmp_cc { + let mayLoad = 1 in { + def rmib : AVX512AIi8, EVEX_4V, EVEX_B; + def rmibk : AVX512AIi8, EVEX_4V, EVEX_K, EVEX_B; + } + + // Accept explicit immediate argument form instead of comparison code. + let isAsmParserOnly = 1, hasSideEffects = 0 in { + def rmib_alt : AVX512AIi8, EVEX_4V, EVEX_B; + def rmibk_alt : AVX512AIi8, EVEX_4V, EVEX_K, EVEX_B; + } +} + +multiclass avx512_icmp_cc_vl opc, string Suffix, SDNode OpNode, + AVX512VLVectorVTInfo VTInfo, Predicate prd> { + let Predicates = [prd] in + defm Z : avx512_icmp_cc, EVEX_V512; -defm VPCMPQZ : avx512_icmp_cc<0x1F, VK8WM, VK8, VR512, i512mem, memopv8i64, - X86cmpm, v8i64, AVXCC, "q">, - VEX_W, EVEX_V512, EVEX_CD8<64, CD8VF>; -defm VPCMPUQZ : avx512_icmp_cc<0x1E, VK8WM, VK8, VR512, i512mem, memopv8i64, - X86cmpmu, v8i64, AVXCC, "uq">, - VEX_W, EVEX_V512, EVEX_CD8<64, CD8VF>; + let Predicates = [prd, HasVLX] in { + defm Z256 : avx512_icmp_cc, EVEX_V256; + defm Z128 : avx512_icmp_cc, EVEX_V128; + } +} + +multiclass avx512_icmp_cc_rmb_vl opc, string Suffix, SDNode OpNode, + AVX512VLVectorVTInfo VTInfo, Predicate prd> { + let Predicates = [prd] in + defm Z : avx512_icmp_cc_rmb, + EVEX_V512; + + let Predicates = [prd, HasVLX] in { + defm Z256 : avx512_icmp_cc_rmb, + EVEX_V256; + defm Z128 : avx512_icmp_cc_rmb, + EVEX_V128; + } +} + +defm VPCMPB : avx512_icmp_cc_vl<0x3F, "b", X86cmpm, avx512vl_i8_info, + HasBWI>, EVEX_CD8<8, CD8VF>; +defm VPCMPUB : avx512_icmp_cc_vl<0x3E, "ub", X86cmpmu, avx512vl_i8_info, + HasBWI>, EVEX_CD8<8, CD8VF>; + +defm VPCMPW : avx512_icmp_cc_vl<0x3F, "w", X86cmpm, avx512vl_i16_info, + HasBWI>, VEX_W, EVEX_CD8<16, CD8VF>; +defm VPCMPUW : avx512_icmp_cc_vl<0x3E, "uw", X86cmpmu, avx512vl_i16_info, + HasBWI>, VEX_W, EVEX_CD8<16, CD8VF>; + +defm VPCMPD : avx512_icmp_cc_rmb_vl<0x1F, "d", X86cmpm, avx512vl_i32_info, + HasAVX512>, EVEX_CD8<32, CD8VF>; +defm VPCMPUD : avx512_icmp_cc_rmb_vl<0x1E, "ud", X86cmpmu, avx512vl_i32_info, + HasAVX512>, EVEX_CD8<32, CD8VF>; + +defm VPCMPQ : avx512_icmp_cc_rmb_vl<0x1F, "q", X86cmpm, avx512vl_i64_info, + HasAVX512>, VEX_W, EVEX_CD8<64, CD8VF>; +defm VPCMPUQ : avx512_icmp_cc_rmb_vl<0x1E, "uq", X86cmpmu, avx512vl_i64_info, + HasAVX512>, VEX_W, EVEX_CD8<64, CD8VF>; // avx512_cmp_packed - compare packed instructions multiclass avx512_cmp_packed; + def : Pat<(i1 (trunc (i32 GR32:$src))), (COPY_TO_REGCLASS (KMOVWkr (AND32ri $src, (i32 1))), VK1)>; @@ -1493,6 +1865,17 @@ def : Pat<(v16i1 (insert_subvector undef, (v8i1 VK8:$src), (iPTR 0))), def : Pat<(v8i1 (extract_subvector (v16i1 VK16:$src), (iPTR 8))), (v8i1 (COPY_TO_REGCLASS (KSHIFTRWri VK16:$src, (i8 8)), VK8))>; +let Predicates = [HasVLX] in { + def : Pat<(v8i1 (insert_subvector undef, (v4i1 VK4:$src), (iPTR 0))), + (v8i1 (COPY_TO_REGCLASS VK4:$src, VK8))>; + def : Pat<(v8i1 (insert_subvector undef, (v2i1 VK2:$src), (iPTR 0))), + (v8i1 (COPY_TO_REGCLASS VK2:$src, VK8))>; + def : Pat<(v4i1 (extract_subvector (v8i1 VK8:$src), (iPTR 0))), + (v4i1 (COPY_TO_REGCLASS VK8:$src, VK4))>; + def : Pat<(v2i1 (extract_subvector (v8i1 VK8:$src), (iPTR 0))), + (v2i1 (COPY_TO_REGCLASS VK8:$src, VK2))>; +} + def : Pat<(v8i1 (X86vshli VK8:$src, (i8 imm:$imm))), (v8i1 (COPY_TO_REGCLASS (KSHIFTLWri (COPY_TO_REGCLASS VK8:$src, VK16), (I8Imm $imm)), VK8))>; @@ -1886,10 +2269,16 @@ multiclass avx512_move_scalar , EVEX, VEX_LIG; + let mayStore = 1 in { def mr: SI<0x11, MRMDestMem, (outs), (ins x86memop:$dst, RC:$src), !strconcat(asm, " \t{$src, $dst|$dst, $src}"), [(store RC:$src, addr:$dst)], IIC_SSE_MOV_S_MR>, EVEX, VEX_LIG; + def mrk: SI<0x11, MRMDestMem, (outs), (ins x86memop:$dst, VK1WM:$mask, RC:$src), + !strconcat(asm, " \t{$src, $dst {${mask}}|$dst {${mask}}, $src}"), + [], IIC_SSE_MOV_S_MR>, + EVEX, VEX_LIG, EVEX_K; + } // mayStore } //hasSideEffects = 0 } @@ -1909,6 +2298,10 @@ def : Pat<(f64 (X86select VK1WM:$mask, (f64 FR64X:$src1), (f64 FR64X:$src2))), (COPY_TO_REGCLASS (VMOVSDZrrk (COPY_TO_REGCLASS FR64X:$src2, VR128X), VK1WM:$mask, (f64 (IMPLICIT_DEF)), FR64X:$src1), FR64X)>; +def : Pat<(int_x86_avx512_mask_store_ss addr:$dst, VR128X:$src, GR8:$mask), + (VMOVSSZmrk addr:$dst, (i1 (COPY_TO_REGCLASS GR8:$mask, VK1WM)), + (COPY_TO_REGCLASS VR128X:$src, FR32X))>; + // For the disassembler let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0 in { def VMOVSSZrr_REV : SI<0x11, MRMDestReg, (outs VR128X:$dst), @@ -2198,95 +2591,130 @@ defm VMOVNTPS : avx512_movnt_vl<0x2B, "vmovntps", alignednontemporalstore, // AVX-512 - Integer arithmetic // multiclass avx512_binop_rm opc, string OpcodeStr, SDNode OpNode, - ValueType OpVT, RegisterClass KRC, - RegisterClass RC, PatFrag memop_frag, - X86MemOperand x86memop, PatFrag scalar_mfrag, - X86MemOperand x86scalar_mop, string BrdcstStr, - OpndItins itins, bit IsCommutable = 0> { - let isCommutable = IsCommutable in - def rr : AVX512BI, EVEX_4V; - let AddedComplexity = 30 in { - let Constraints = "$src0 = $dst" in - def rrk : AVX512BI, EVEX_4V, EVEX_K; - def rrkz : AVX512BI, EVEX_4V, EVEX_KZ; + X86VectorVTInfo _, OpndItins itins, + bit IsCommutable = 0> { + defm rr : AVX512_maskable, + AVX512BIBase, EVEX_4V; + + let mayLoad = 1 in + defm rm : AVX512_maskable, + AVX512BIBase, EVEX_4V; +} + +multiclass avx512_binop_rmb opc, string OpcodeStr, SDNode OpNode, + X86VectorVTInfo _, OpndItins itins, + bit IsCommutable = 0> : + avx512_binop_rm { + let mayLoad = 1 in + defm rmb : AVX512_maskable, + AVX512BIBase, EVEX_4V, EVEX_B; +} + +multiclass avx512_binop_rm_vl opc, string OpcodeStr, SDNode OpNode, + AVX512VLVectorVTInfo VTInfo, OpndItins itins, + Predicate prd, bit IsCommutable = 0> { + let Predicates = [prd] in + defm Z : avx512_binop_rm, EVEX_V512; + + let Predicates = [prd, HasVLX] in { + defm Z256 : avx512_binop_rm, EVEX_V256; + defm Z128 : avx512_binop_rm, EVEX_V128; } +} - let mayLoad = 1 in { - def rm : AVX512BI, EVEX_4V; - let AddedComplexity = 30 in { - let Constraints = "$src0 = $dst" in - def rmk : AVX512BI, EVEX_4V, EVEX_K; - def rmkz : AVX512BI, EVEX_4V, EVEX_KZ; - } - def rmb : AVX512BI, EVEX_4V, EVEX_B; - let AddedComplexity = 30 in { - let Constraints = "$src0 = $dst" in - def rmbk : AVX512BI, EVEX_4V, EVEX_B, EVEX_K; - def rmbkz : AVX512BI, EVEX_4V, EVEX_B, EVEX_KZ; - } +multiclass avx512_binop_rmb_vl opc, string OpcodeStr, SDNode OpNode, + AVX512VLVectorVTInfo VTInfo, OpndItins itins, + Predicate prd, bit IsCommutable = 0> { + let Predicates = [prd] in + defm Z : avx512_binop_rmb, EVEX_V512; + + let Predicates = [prd, HasVLX] in { + defm Z256 : avx512_binop_rmb, EVEX_V256; + defm Z128 : avx512_binop_rmb, EVEX_V128; } } +multiclass avx512_binop_rm_vl_q opc, string OpcodeStr, SDNode OpNode, + OpndItins itins, Predicate prd, + bit IsCommutable = 0> { + defm NAME : avx512_binop_rmb_vl, + VEX_W, EVEX_CD8<64, CD8VF>; +} + +multiclass avx512_binop_rm_vl_d opc, string OpcodeStr, SDNode OpNode, + OpndItins itins, Predicate prd, + bit IsCommutable = 0> { + defm NAME : avx512_binop_rmb_vl, EVEX_CD8<32, CD8VF>; +} + +multiclass avx512_binop_rm_vl_w opc, string OpcodeStr, SDNode OpNode, + OpndItins itins, Predicate prd, + bit IsCommutable = 0> { + defm NAME : avx512_binop_rm_vl, EVEX_CD8<16, CD8VF>; +} + +multiclass avx512_binop_rm_vl_b opc, string OpcodeStr, SDNode OpNode, + OpndItins itins, Predicate prd, + bit IsCommutable = 0> { + defm NAME : avx512_binop_rm_vl, EVEX_CD8<8, CD8VF>; +} + +multiclass avx512_binop_rm_vl_dq opc_d, bits<8> opc_q, string OpcodeStr, + SDNode OpNode, OpndItins itins, Predicate prd, + bit IsCommutable = 0> { + defm Q : avx512_binop_rm_vl_q; + + defm D : avx512_binop_rm_vl_d; +} + +multiclass avx512_binop_rm_vl_bw opc_b, bits<8> opc_w, string OpcodeStr, + SDNode OpNode, OpndItins itins, Predicate prd, + bit IsCommutable = 0> { + defm W : avx512_binop_rm_vl_w; + + defm B : avx512_binop_rm_vl_b; +} + +multiclass avx512_binop_rm_vl_all opc_b, bits<8> opc_w, + bits<8> opc_d, bits<8> opc_q, + string OpcodeStr, SDNode OpNode, + OpndItins itins, bit IsCommutable = 0> { + defm NAME : avx512_binop_rm_vl_dq, + avx512_binop_rm_vl_bw; +} + multiclass avx512_binop_rm2 opc, string OpcodeStr, ValueType DstVT, ValueType SrcVT, RegisterClass KRC, RegisterClass RC, PatFrag memop_frag, X86MemOperand x86memop, @@ -2344,25 +2772,16 @@ multiclass avx512_binop_rm2 opc, string OpcodeStr, ValueType DstVT, } } -defm VPADDDZ : avx512_binop_rm<0xFE, "vpaddd", add, v16i32, VK16WM, VR512, - memopv16i32, i512mem, loadi32, i32mem, "{1to16}", - SSE_INTALU_ITINS_P, 1>, EVEX_V512, EVEX_CD8<32, CD8VF>; - -defm VPSUBDZ : avx512_binop_rm<0xFA, "vpsubd", sub, v16i32, VK16WM, VR512, - memopv16i32, i512mem, loadi32, i32mem, "{1to16}", - SSE_INTALU_ITINS_P, 0>, EVEX_V512, EVEX_CD8<32, CD8VF>; - -defm VPMULLDZ : avx512_binop_rm<0x40, "vpmulld", mul, v16i32, VK16WM, VR512, - memopv16i32, i512mem, loadi32, i32mem, "{1to16}", - SSE_INTALU_ITINS_P, 1>, T8PD, EVEX_V512, EVEX_CD8<32, CD8VF>; - -defm VPADDQZ : avx512_binop_rm<0xD4, "vpaddq", add, v8i64, VK8WM, VR512, - memopv8i64, i512mem, loadi64, i64mem, "{1to8}", - SSE_INTALU_ITINS_P, 1>, EVEX_CD8<64, CD8VF>, EVEX_V512, VEX_W; - -defm VPSUBQZ : avx512_binop_rm<0xFB, "vpsubq", sub, v8i64, VK8WM, VR512, - memopv8i64, i512mem, loadi64, i64mem, "{1to8}", - SSE_INTALU_ITINS_P, 0>, EVEX_V512, VEX_W, EVEX_CD8<64, CD8VF>; +defm VPADD : avx512_binop_rm_vl_all<0xFC, 0xFD, 0xFE, 0xD4, "vpadd", add, + SSE_INTALU_ITINS_P, 1>; +defm VPSUB : avx512_binop_rm_vl_all<0xF8, 0xF9, 0xFA, 0xFB, "vpsub", sub, + SSE_INTALU_ITINS_P, 0>; +defm VPMULLD : avx512_binop_rm_vl_d<0x40, "vpmull", mul, + SSE_INTALU_ITINS_P, HasAVX512, 1>, T8PD; +defm VPMULLW : avx512_binop_rm_vl_w<0xD5, "vpmull", mul, + SSE_INTALU_ITINS_P, HasBWI, 1>; +defm VPMULLQ : avx512_binop_rm_vl_q<0x40, "vpmull", mul, + SSE_INTALU_ITINS_P, HasDQI, 1>, T8PD; defm VPMULDQZ : avx512_binop_rm2<0x28, "vpmuldq", v8i64, v16i32, VK8WM, VR512, memopv8i64, i512mem, loadi64, i64mem, "{1to8}", @@ -2383,41 +2802,33 @@ def : Pat<(v8i64 (int_x86_avx512_mask_pmul_dq_512 (v16i32 VR512:$src1), (v16i32 VR512:$src2), (bc_v8i64 (v16i32 immAllZerosV)), (i8 -1))), (VPMULDQZrr VR512:$src1, VR512:$src2)>; -defm VPMAXUDZ : avx512_binop_rm<0x3F, "vpmaxud", X86umax, v16i32, VK16WM, VR512, - memopv16i32, i512mem, loadi32, i32mem, "{1to16}", - SSE_INTALU_ITINS_P, 1>, - T8PD, EVEX_V512, EVEX_CD8<32, CD8VF>; -defm VPMAXUQZ : avx512_binop_rm<0x3F, "vpmaxuq", X86umax, v8i64, VK8WM, VR512, - memopv8i64, i512mem, loadi64, i64mem, "{1to8}", - SSE_INTALU_ITINS_P, 0>, - T8PD, EVEX_V512, VEX_W, EVEX_CD8<64, CD8VF>; - -defm VPMAXSDZ : avx512_binop_rm<0x3D, "vpmaxsd", X86smax, v16i32, VK16WM, VR512, - memopv16i32, i512mem, loadi32, i32mem, "{1to16}", - SSE_INTALU_ITINS_P, 1>, - T8PD, EVEX_V512, EVEX_CD8<32, CD8VF>; -defm VPMAXSQZ : avx512_binop_rm<0x3D, "vpmaxsq", X86smax, v8i64, VK8WM, VR512, - memopv8i64, i512mem, loadi64, i64mem, "{1to8}", - SSE_INTALU_ITINS_P, 0>, - T8PD, EVEX_V512, VEX_W, EVEX_CD8<64, CD8VF>; - -defm VPMINUDZ : avx512_binop_rm<0x3B, "vpminud", X86umin, v16i32, VK16WM, VR512, - memopv16i32, i512mem, loadi32, i32mem, "{1to16}", - SSE_INTALU_ITINS_P, 1>, - T8PD, EVEX_V512, EVEX_CD8<32, CD8VF>; -defm VPMINUQZ : avx512_binop_rm<0x3B, "vpminuq", X86umin, v8i64, VK8WM, VR512, - memopv8i64, i512mem, loadi64, i64mem, "{1to8}", - SSE_INTALU_ITINS_P, 0>, - T8PD, EVEX_V512, VEX_W, EVEX_CD8<64, CD8VF>; - -defm VPMINSDZ : avx512_binop_rm<0x39, "vpminsd", X86smin, v16i32, VK16WM, VR512, - memopv16i32, i512mem, loadi32, i32mem, "{1to16}", - SSE_INTALU_ITINS_P, 1>, - T8PD, EVEX_V512, EVEX_CD8<32, CD8VF>; -defm VPMINSQZ : avx512_binop_rm<0x39, "vpminsq", X86smin, v8i64, VK8WM, VR512, - memopv8i64, i512mem, loadi64, i64mem, "{1to8}", - SSE_INTALU_ITINS_P, 0>, - T8PD, EVEX_V512, VEX_W, EVEX_CD8<64, CD8VF>; +defm VPMAXSB : avx512_binop_rm_vl_b<0x3C, "vpmaxs", X86smax, + SSE_INTALU_ITINS_P, HasBWI, 1>, T8PD; +defm VPMAXSW : avx512_binop_rm_vl_w<0xEE, "vpmaxs", X86smax, + SSE_INTALU_ITINS_P, HasBWI, 1>; +defm VPMAXS : avx512_binop_rm_vl_dq<0x3D, 0x3D, "vpmaxs", X86smax, + SSE_INTALU_ITINS_P, HasAVX512, 1>, T8PD; + +defm VPMAXUB : avx512_binop_rm_vl_b<0xDE, "vpmaxu", X86umax, + SSE_INTALU_ITINS_P, HasBWI, 1>; +defm VPMAXUW : avx512_binop_rm_vl_w<0x3E, "vpmaxu", X86umax, + SSE_INTALU_ITINS_P, HasBWI, 1>, T8PD; +defm VPMAXU : avx512_binop_rm_vl_dq<0x3F, 0x3F, "vpmaxu", X86umax, + SSE_INTALU_ITINS_P, HasAVX512, 1>, T8PD; + +defm VPMINSB : avx512_binop_rm_vl_b<0x38, "vpmins", X86smin, + SSE_INTALU_ITINS_P, HasBWI, 1>, T8PD; +defm VPMINSW : avx512_binop_rm_vl_w<0xEA, "vpmins", X86smin, + SSE_INTALU_ITINS_P, HasBWI, 1>; +defm VPMINS : avx512_binop_rm_vl_dq<0x39, 0x39, "vpmins", X86smin, + SSE_INTALU_ITINS_P, HasAVX512, 1>, T8PD; + +defm VPMINUB : avx512_binop_rm_vl_b<0xDA, "vpminu", X86umin, + SSE_INTALU_ITINS_P, HasBWI, 1>; +defm VPMINUW : avx512_binop_rm_vl_w<0x3A, "vpminu", X86umin, + SSE_INTALU_ITINS_P, HasBWI, 1>, T8PD; +defm VPMINU : avx512_binop_rm_vl_dq<0x3B, 0x3B, "vpminu", X86umin, + SSE_INTALU_ITINS_P, HasAVX512, 1>, T8PD; def : Pat <(v16i32 (int_x86_avx512_mask_pmaxs_d_512 (v16i32 VR512:$src1), (v16i32 VR512:$src2), (v16i32 immAllZerosV), (i16 -1))), @@ -2530,48 +2941,18 @@ multiclass avx512_pshuf_imm opc, string OpcodeStr, RegisterClass RC, defm VPSHUFDZ : avx512_pshuf_imm<0x70, "vpshufd", VR512, X86PShufd, memopv16i32, i512mem, v16i32>, PD, EVEX_V512, EVEX_CD8<32, CD8VF>; -let ExeDomain = SSEPackedSingle in -defm VPERMILPSZ : avx512_pshuf_imm<0x04, "vpermilps", VR512, X86VPermilp, - memopv16f32, i512mem, v16f32>, TAPD, EVEX_V512, - EVEX_CD8<32, CD8VF>; -let ExeDomain = SSEPackedDouble in -defm VPERMILPDZ : avx512_pshuf_imm<0x05, "vpermilpd", VR512, X86VPermilp, - memopv8f64, i512mem, v8f64>, TAPD, EVEX_V512, - VEX_W, EVEX_CD8<32, CD8VF>; - -def : Pat<(v16i32 (X86VPermilp VR512:$src1, (i8 imm:$imm))), - (VPERMILPSZri VR512:$src1, imm:$imm)>; -def : Pat<(v8i64 (X86VPermilp VR512:$src1, (i8 imm:$imm))), - (VPERMILPDZri VR512:$src1, imm:$imm)>; - //===----------------------------------------------------------------------===// // AVX-512 Logical Instructions //===----------------------------------------------------------------------===// -defm VPANDDZ : avx512_binop_rm<0xDB, "vpandd", and, v16i32, VK16WM, VR512, memopv16i32, - i512mem, loadi32, i32mem, "{1to16}", SSE_BIT_ITINS_P, 1>, - EVEX_V512, EVEX_CD8<32, CD8VF>; -defm VPANDQZ : avx512_binop_rm<0xDB, "vpandq", and, v8i64, VK8WM, VR512, memopv8i64, - i512mem, loadi64, i64mem, "{1to8}", SSE_BIT_ITINS_P, 1>, - EVEX_V512, VEX_W, EVEX_CD8<64, CD8VF>; -defm VPORDZ : avx512_binop_rm<0xEB, "vpord", or, v16i32, VK16WM, VR512, memopv16i32, - i512mem, loadi32, i32mem, "{1to16}", SSE_BIT_ITINS_P, 1>, - EVEX_V512, EVEX_CD8<32, CD8VF>; -defm VPORQZ : avx512_binop_rm<0xEB, "vporq", or, v8i64, VK8WM, VR512, memopv8i64, - i512mem, loadi64, i64mem, "{1to8}", SSE_BIT_ITINS_P, 1>, - EVEX_V512, VEX_W, EVEX_CD8<64, CD8VF>; -defm VPXORDZ : avx512_binop_rm<0xEF, "vpxord", xor, v16i32, VK16WM, VR512, memopv16i32, - i512mem, loadi32, i32mem, "{1to16}", SSE_BIT_ITINS_P, 1>, - EVEX_V512, EVEX_CD8<32, CD8VF>; -defm VPXORQZ : avx512_binop_rm<0xEF, "vpxorq", xor, v8i64, VK8WM, VR512, memopv8i64, - i512mem, loadi64, i64mem, "{1to8}", SSE_BIT_ITINS_P, 1>, - EVEX_V512, VEX_W, EVEX_CD8<64, CD8VF>; -defm VPANDNDZ : avx512_binop_rm<0xDF, "vpandnd", X86andnp, v16i32, VK16WM, VR512, - memopv16i32, i512mem, loadi32, i32mem, "{1to16}", - SSE_BIT_ITINS_P, 0>, EVEX_V512, EVEX_CD8<32, CD8VF>; -defm VPANDNQZ : avx512_binop_rm<0xDF, "vpandnq", X86andnp, v8i64, VK8WM, VR512, - memopv8i64, i512mem, loadi64, i64mem, "{1to8}", - SSE_BIT_ITINS_P, 0>, EVEX_V512, VEX_W, EVEX_CD8<64, CD8VF>; +defm VPAND : avx512_binop_rm_vl_dq<0xDB, 0xDB, "vpand", and, + SSE_INTALU_ITINS_P, HasAVX512, 1>; +defm VPOR : avx512_binop_rm_vl_dq<0xEB, 0xEB, "vpor", or, + SSE_INTALU_ITINS_P, HasAVX512, 1>; +defm VPXOR : avx512_binop_rm_vl_dq<0xEF, 0xEF, "vpxor", xor, + SSE_INTALU_ITINS_P, HasAVX512, 1>; +defm VPANDN : avx512_binop_rm_vl_dq<0xDF, 0xDF, "vpandn", X86andnp, + SSE_INTALU_ITINS_P, HasAVX512, 1>; //===----------------------------------------------------------------------===// // AVX-512 FP arithmetic @@ -2988,157 +3369,133 @@ let Predicates = [HasAVX512] in { //===----------------------------------------------------------------------===// // FMA - Fused Multiply Operations // + let Constraints = "$src1 = $dst" in { -multiclass avx512_fma3p_rm opc, string OpcodeStr, - RegisterClass RC, X86MemOperand x86memop, - PatFrag mem_frag, X86MemOperand x86scalar_mop, PatFrag scalar_mfrag, - string BrdcstStr, SDNode OpNode, ValueType OpVT, - RegisterClass KRC> { - defm r: AVX512_masking_3src opc, string OpcodeStr, X86VectorVTInfo _, + SDPatternOperator OpNode = null_frag> { + defm r: AVX512_maskable_3src, + (_.VT (OpNode _.RC:$src1, _.RC:$src2, _.RC:$src3))>, AVX512FMA3Base; let mayLoad = 1 in - def m: AVX512FMA3; - def mb: AVX512FMA3, EVEX_B; + [(set _.RC:$dst, (_.VT (OpNode _.RC:$src1, _.RC:$src2, + (_.MemOpFrag addr:$src3))))]>; + def mb: AVX512FMA3, EVEX_B; } } // Constraints = "$src1 = $dst" +multiclass avx512_fma3p_forms opc213, bits<8> opc231, + string OpcodeStr, X86VectorVTInfo VTI, + SDPatternOperator OpNode> { + defm v213 : avx512_fma3p_rm, + EVEX_V512, EVEX_CD8; + + defm v231 : avx512_fma3p_rm, + EVEX_V512, EVEX_CD8; +} + let ExeDomain = SSEPackedSingle in { - defm VFMADD213PSZ : avx512_fma3p_rm<0xA8, "vfmadd213ps", VR512, f512mem, - memopv16f32, f32mem, loadf32, "{1to16}", - X86Fmadd, v16f32, VK16WM>, EVEX_V512, - EVEX_CD8<32, CD8VF>; - defm VFMSUB213PSZ : avx512_fma3p_rm<0xAA, "vfmsub213ps", VR512, f512mem, - memopv16f32, f32mem, loadf32, "{1to16}", - X86Fmsub, v16f32, VK16WM>, EVEX_V512, - EVEX_CD8<32, CD8VF>; - defm VFMADDSUB213PSZ : avx512_fma3p_rm<0xA6, "vfmaddsub213ps", VR512, f512mem, - memopv16f32, f32mem, loadf32, "{1to16}", - X86Fmaddsub, v16f32, VK16WM>, - EVEX_V512, EVEX_CD8<32, CD8VF>; - defm VFMSUBADD213PSZ : avx512_fma3p_rm<0xA7, "vfmsubadd213ps", VR512, f512mem, - memopv16f32, f32mem, loadf32, "{1to16}", - X86Fmsubadd, v16f32, VK16WM>, - EVEX_V512, EVEX_CD8<32, CD8VF>; - defm VFNMADD213PSZ : avx512_fma3p_rm<0xAC, "vfnmadd213ps", VR512, f512mem, - memopv16f32, f32mem, loadf32, "{1to16}", - X86Fnmadd, v16f32, VK16WM>, EVEX_V512, - EVEX_CD8<32, CD8VF>; - defm VFNMSUB213PSZ : avx512_fma3p_rm<0xAE, "vfnmsub213ps", VR512, f512mem, - memopv16f32, f32mem, loadf32, "{1to16}", - X86Fnmsub, v16f32, VK16WM>, EVEX_V512, - EVEX_CD8<32, CD8VF>; + defm VFMADDPSZ : avx512_fma3p_forms<0xA8, 0xB8, "vfmadd", + v16f32_info, X86Fmadd>; + defm VFMSUBPSZ : avx512_fma3p_forms<0xAA, 0xBA, "vfmsub", + v16f32_info, X86Fmsub>; + defm VFMADDSUBPSZ : avx512_fma3p_forms<0xA6, 0xB6, "vfmaddsub", + v16f32_info, X86Fmaddsub>; + defm VFMSUBADDPSZ : avx512_fma3p_forms<0xA7, 0xB7, "vfmsubadd", + v16f32_info, X86Fmsubadd>; + defm VFNMADDPSZ : avx512_fma3p_forms<0xAC, 0xBC, "vfnmadd", + v16f32_info, X86Fnmadd>; + defm VFNMSUBPSZ : avx512_fma3p_forms<0xAE, 0xBE, "vfnmsub", + v16f32_info, X86Fnmsub>; } let ExeDomain = SSEPackedDouble in { - defm VFMADD213PDZ : avx512_fma3p_rm<0xA8, "vfmadd213pd", VR512, f512mem, - memopv8f64, f64mem, loadf64, "{1to8}", - X86Fmadd, v8f64, VK8WM>, EVEX_V512, - VEX_W, EVEX_CD8<64, CD8VF>; - defm VFMSUB213PDZ : avx512_fma3p_rm<0xAA, "vfmsub213pd", VR512, f512mem, - memopv8f64, f64mem, loadf64, "{1to8}", - X86Fmsub, v8f64, VK8WM>, EVEX_V512, VEX_W, - EVEX_CD8<64, CD8VF>; - defm VFMADDSUB213PDZ : avx512_fma3p_rm<0xA6, "vfmaddsub213pd", VR512, f512mem, - memopv8f64, f64mem, loadf64, "{1to8}", - X86Fmaddsub, v8f64, VK8WM>, - EVEX_V512, VEX_W, EVEX_CD8<64, CD8VF>; - defm VFMSUBADD213PDZ : avx512_fma3p_rm<0xA7, "vfmsubadd213pd", VR512, f512mem, - memopv8f64, f64mem, loadf64, "{1to8}", - X86Fmsubadd, v8f64, VK8WM>, - EVEX_V512, VEX_W, EVEX_CD8<64, CD8VF>; - defm VFNMADD213PDZ : avx512_fma3p_rm<0xAC, "vfnmadd213pd", VR512, f512mem, - memopv8f64, f64mem, loadf64, "{1to8}", - X86Fnmadd, v8f64, VK8WM>, EVEX_V512, VEX_W, - EVEX_CD8<64, CD8VF>; - defm VFNMSUB213PDZ : avx512_fma3p_rm<0xAE, "vfnmsub213pd", VR512, f512mem, - memopv8f64, f64mem, loadf64, "{1to8}", - X86Fnmsub, v8f64, VK8WM>, EVEX_V512, VEX_W, - EVEX_CD8<64, CD8VF>; + defm VFMADDPDZ : avx512_fma3p_forms<0xA8, 0xB8, "vfmadd", + v8f64_info, X86Fmadd>, VEX_W; + defm VFMSUBPDZ : avx512_fma3p_forms<0xAA, 0xBA, "vfmsub", + v8f64_info, X86Fmsub>, VEX_W; + defm VFMADDSUBPDZ : avx512_fma3p_forms<0xA6, 0xB6, "vfmaddsub", + v8f64_info, X86Fmaddsub>, VEX_W; + defm VFMSUBADDPDZ : avx512_fma3p_forms<0xA7, 0xB7, "vfmsubadd", + v8f64_info, X86Fmsubadd>, VEX_W; + defm VFNMADDPDZ : avx512_fma3p_forms<0xAC, 0xBC, "vfnmadd", + v8f64_info, X86Fnmadd>, VEX_W; + defm VFNMSUBPDZ : avx512_fma3p_forms<0xAE, 0xBE, "vfnmsub", + v8f64_info, X86Fnmsub>, VEX_W; } let Constraints = "$src1 = $dst" in { -multiclass avx512_fma3p_m132 opc, string OpcodeStr, - RegisterClass RC, X86MemOperand x86memop, - PatFrag mem_frag, X86MemOperand x86scalar_mop, PatFrag scalar_mfrag, - string BrdcstStr, SDNode OpNode, ValueType OpVT> { +multiclass avx512_fma3p_m132 opc, string OpcodeStr, SDNode OpNode, + X86VectorVTInfo _> { let mayLoad = 1 in - def m: AVX512FMA3; - def mb: AVX512FMA3, EVEX_B; + [(set _.RC:$dst, (_.VT (OpNode _.RC:$src1, (_.MemOpFrag addr:$src2), + _.RC:$src3)))]>; + def mb: AVX512FMA3, EVEX_B; } } // Constraints = "$src1 = $dst" let ExeDomain = SSEPackedSingle in { - defm VFMADD132PSZ : avx512_fma3p_m132<0x98, "vfmadd132ps", VR512, f512mem, - memopv16f32, f32mem, loadf32, "{1to16}", - X86Fmadd, v16f32>, EVEX_V512, - EVEX_CD8<32, CD8VF>; - defm VFMSUB132PSZ : avx512_fma3p_m132<0x9A, "vfmsub132ps", VR512, f512mem, - memopv16f32, f32mem, loadf32, "{1to16}", - X86Fmsub, v16f32>, EVEX_V512, - EVEX_CD8<32, CD8VF>; - defm VFMADDSUB132PSZ : avx512_fma3p_m132<0x96, "vfmaddsub132ps", VR512, f512mem, - memopv16f32, f32mem, loadf32, "{1to16}", - X86Fmaddsub, v16f32>, - EVEX_V512, EVEX_CD8<32, CD8VF>; - defm VFMSUBADD132PSZ : avx512_fma3p_m132<0x97, "vfmsubadd132ps", VR512, f512mem, - memopv16f32, f32mem, loadf32, "{1to16}", - X86Fmsubadd, v16f32>, - EVEX_V512, EVEX_CD8<32, CD8VF>; - defm VFNMADD132PSZ : avx512_fma3p_m132<0x9C, "vfnmadd132ps", VR512, f512mem, - memopv16f32, f32mem, loadf32, "{1to16}", - X86Fnmadd, v16f32>, EVEX_V512, - EVEX_CD8<32, CD8VF>; - defm VFNMSUB132PSZ : avx512_fma3p_m132<0x9E, "vfnmsub132ps", VR512, f512mem, - memopv16f32, f32mem, loadf32, "{1to16}", - X86Fnmsub, v16f32>, EVEX_V512, - EVEX_CD8<32, CD8VF>; + defm VFMADD132PSZ : avx512_fma3p_m132<0x98, "vfmadd132ps", X86Fmadd, + v16f32_info>, + EVEX_V512, EVEX_CD8<32, CD8VF>; + defm VFMSUB132PSZ : avx512_fma3p_m132<0x9A, "vfmsub132ps", X86Fmsub, + v16f32_info>, + EVEX_V512, EVEX_CD8<32, CD8VF>; + defm VFMADDSUB132PSZ : avx512_fma3p_m132<0x96, "vfmaddsub132ps", X86Fmaddsub, + v16f32_info>, + EVEX_V512, EVEX_CD8<32, CD8VF>; + defm VFMSUBADD132PSZ : avx512_fma3p_m132<0x97, "vfmsubadd132ps", X86Fmsubadd, + v16f32_info>, + EVEX_V512, EVEX_CD8<32, CD8VF>; + defm VFNMADD132PSZ : avx512_fma3p_m132<0x9C, "vfnmadd132ps", X86Fnmadd, + v16f32_info>, + EVEX_V512, EVEX_CD8<32, CD8VF>; + defm VFNMSUB132PSZ : avx512_fma3p_m132<0x9E, "vfnmsub132ps", X86Fnmsub, + v16f32_info>, + EVEX_V512, EVEX_CD8<32, CD8VF>; } let ExeDomain = SSEPackedDouble in { - defm VFMADD132PDZ : avx512_fma3p_m132<0x98, "vfmadd132pd", VR512, f512mem, - memopv8f64, f64mem, loadf64, "{1to8}", - X86Fmadd, v8f64>, EVEX_V512, - VEX_W, EVEX_CD8<64, CD8VF>; - defm VFMSUB132PDZ : avx512_fma3p_m132<0x9A, "vfmsub132pd", VR512, f512mem, - memopv8f64, f64mem, loadf64, "{1to8}", - X86Fmsub, v8f64>, EVEX_V512, VEX_W, - EVEX_CD8<64, CD8VF>; - defm VFMADDSUB132PDZ : avx512_fma3p_m132<0x96, "vfmaddsub132pd", VR512, f512mem, - memopv8f64, f64mem, loadf64, "{1to8}", - X86Fmaddsub, v8f64>, EVEX_V512, VEX_W, - EVEX_CD8<64, CD8VF>; - defm VFMSUBADD132PDZ : avx512_fma3p_m132<0x97, "vfmsubadd132pd", VR512, f512mem, - memopv8f64, f64mem, loadf64, "{1to8}", - X86Fmsubadd, v8f64>, EVEX_V512, VEX_W, - EVEX_CD8<64, CD8VF>; - defm VFNMADD132PDZ : avx512_fma3p_m132<0x9C, "vfnmadd132pd", VR512, f512mem, - memopv8f64, f64mem, loadf64, "{1to8}", - X86Fnmadd, v8f64>, EVEX_V512, VEX_W, - EVEX_CD8<64, CD8VF>; - defm VFNMSUB132PDZ : avx512_fma3p_m132<0x9E, "vfnmsub132pd", VR512, f512mem, - memopv8f64, f64mem, loadf64, "{1to8}", - X86Fnmsub, v8f64>, EVEX_V512, VEX_W, - EVEX_CD8<64, CD8VF>; + defm VFMADD132PDZ : avx512_fma3p_m132<0x98, "vfmadd132pd", X86Fmadd, + v8f64_info>, + EVEX_V512, VEX_W, EVEX_CD8<64, CD8VF>; + defm VFMSUB132PDZ : avx512_fma3p_m132<0x9A, "vfmsub132pd", X86Fmsub, + v8f64_info>, + EVEX_V512, VEX_W, EVEX_CD8<64, CD8VF>; + defm VFMADDSUB132PDZ : avx512_fma3p_m132<0x96, "vfmaddsub132pd", X86Fmaddsub, + v8f64_info>, + EVEX_V512, VEX_W, EVEX_CD8<64, CD8VF>; + defm VFMSUBADD132PDZ : avx512_fma3p_m132<0x97, "vfmsubadd132pd", X86Fmsubadd, + v8f64_info>, + EVEX_V512, VEX_W, EVEX_CD8<64, CD8VF>; + defm VFNMADD132PDZ : avx512_fma3p_m132<0x9C, "vfnmadd132pd", X86Fnmadd, + v8f64_info>, + EVEX_V512, VEX_W, EVEX_CD8<64, CD8VF>; + defm VFNMSUB132PDZ : avx512_fma3p_m132<0x9E, "vfnmsub132pd", X86Fnmsub, + v8f64_info>, + EVEX_V512, VEX_W, EVEX_CD8<64, CD8VF>; } // Scalar FMA @@ -4562,34 +4919,29 @@ def : Pat<(v8i64 (X86Shufp VR512:$src1, (memopv8i64 addr:$src2), (i8 imm:$imm))), (VSHUFPDZrmi VR512:$src1, addr:$src2, imm:$imm)>; -multiclass avx512_valign { - defm rri : AVX512_masking<0x03, MRMSrcReg, (outs RC:$dst), - (ins RC:$src1, RC:$src2, i8imm:$src3), - "valign"##Suffix, +multiclass avx512_valign { + defm rri : AVX512_maskable<0x03, MRMSrcReg, _, (outs _.RC:$dst), + (ins _.RC:$src1, _.RC:$src2, i8imm:$src3), + "valign"##_.Suffix, "$src3, $src2, $src1", "$src1, $src2, $src3", - (IntVT (X86VAlign RC:$src2, RC:$src1, - (i8 imm:$src3))), - IntVT, RC, KRC>, + (_.VT (X86VAlign _.RC:$src2, _.RC:$src1, + (i8 imm:$src3)))>, AVX512AIi8Base, EVEX_4V; // Also match valign of packed floats. - def : Pat<(FloatVT (X86VAlign RC:$src1, RC:$src2, (i8 imm:$imm))), - (!cast(NAME##rri) RC:$src2, RC:$src1, imm:$imm)>; + def : Pat<(_.FloatVT (X86VAlign _.RC:$src1, _.RC:$src2, (i8 imm:$imm))), + (!cast(NAME##rri) _.RC:$src2, _.RC:$src1, imm:$imm)>; let mayLoad = 1 in - def rmi : AVX512AIi8<0x03, MRMSrcMem, (outs RC:$dst), - (ins RC:$src1, x86memop:$src2, i8imm:$src3), - !strconcat("valign"##Suffix, + def rmi : AVX512AIi8<0x03, MRMSrcMem, (outs _.RC:$dst), + (ins _.RC:$src1, _.MemOp:$src2, i8imm:$src3), + !strconcat("valign"##_.Suffix, " \t{$src3, $src2, $src1, $dst|" "$dst, $src1, $src2, $src3}"), []>, EVEX_4V; } -defm VALIGND : avx512_valign<"d", VR512, VK16WM, GR16, i512mem, v16i32, v16f32>, - EVEX_V512, EVEX_CD8<32, CD8VF>; -defm VALIGNQ : avx512_valign<"q", VR512, VK8WM, GR8, i512mem, v8i64, v8f64>, - VEX_W, EVEX_V512, EVEX_CD8<64, CD8VF>; +defm VALIGND : avx512_valign, EVEX_V512, EVEX_CD8<32, CD8VF>; +defm VALIGNQ : avx512_valign, VEX_W, EVEX_V512, EVEX_CD8<64, CD8VF>; // Helper fragments to match sext vXi1 to vXiY. def v16i1sextv16i32 : PatLeaf<(v16i32 (X86vsrai VR512:$src, (i8 31)))>; @@ -4787,3 +5139,32 @@ def truncstorei1 : PatFrag<(ops node:$val, node:$ptr), def : Pat<(truncstorei1 GR8:$src, addr:$dst), (MOV8mr addr:$dst, GR8:$src)>; +multiclass cvt_by_vec_width opc, X86VectorVTInfo Vec, string OpcodeStr > { +def rr : AVX512XS8I, EVEX; +} + +multiclass cvt_mask_by_elt_width opc, AVX512VLVectorVTInfo VTInfo, + string OpcodeStr, Predicate prd> { +let Predicates = [prd] in + defm Z : cvt_by_vec_width, EVEX_V512; + + let Predicates = [prd, HasVLX] in { + defm Z256 : cvt_by_vec_width, EVEX_V256; + defm Z128 : cvt_by_vec_width, EVEX_V128; + } +} + +multiclass avx512_convert_mask_to_vector { + defm NAME##B : cvt_mask_by_elt_width<0x28, avx512vl_i8_info, OpcodeStr, + HasBWI>; + defm NAME##W : cvt_mask_by_elt_width<0x28, avx512vl_i16_info, OpcodeStr, + HasBWI>, VEX_W; + defm NAME##D : cvt_mask_by_elt_width<0x38, avx512vl_i32_info, OpcodeStr, + HasDQI>; + defm NAME##Q : cvt_mask_by_elt_width<0x38, avx512vl_i64_info, OpcodeStr, + HasDQI>, VEX_W; +} + +defm VPMOVM2 : avx512_convert_mask_to_vector<"vpmovm2">;