X-Git-Url: http://plrg.eecs.uci.edu/git/?a=blobdiff_plain;f=lib%2FTarget%2FX86%2FX86InstrSSE.td;h=d8bb435c29cb6ad8353f0d100a613ffcd6cd171c;hb=9a9d275dc7897dfba7f41ce1b3770ca27ac149e8;hp=afdedd4de4a54e2de0c9fe5f6243fd382ba62f4a;hpb=d52e78efac2da94f9967db343abd46a0fa9cb3b4;p=oota-llvm.git diff --git a/lib/Target/X86/X86InstrSSE.td b/lib/Target/X86/X86InstrSSE.td index afdedd4de4a..d8bb435c29c 100644 --- a/lib/Target/X86/X86InstrSSE.td +++ b/lib/Target/X86/X86InstrSSE.td @@ -14,342 +14,6 @@ //===----------------------------------------------------------------------===// -//===----------------------------------------------------------------------===// -// SSE specific DAG Nodes. -//===----------------------------------------------------------------------===// - -def SDTX86FPShiftOp : SDTypeProfile<1, 2, [ SDTCisSameAs<0, 1>, - SDTCisFP<0>, SDTCisInt<2> ]>; -def SDTX86VFCMP : SDTypeProfile<1, 3, [SDTCisInt<0>, SDTCisSameAs<1, 2>, - SDTCisFP<1>, SDTCisVT<3, i8>]>; - -def X86fmin : SDNode<"X86ISD::FMIN", SDTFPBinOp>; -def X86fmax : SDNode<"X86ISD::FMAX", SDTFPBinOp>; -def X86fand : SDNode<"X86ISD::FAND", SDTFPBinOp, - [SDNPCommutative, SDNPAssociative]>; -def X86for : SDNode<"X86ISD::FOR", SDTFPBinOp, - [SDNPCommutative, SDNPAssociative]>; -def X86fxor : SDNode<"X86ISD::FXOR", SDTFPBinOp, - [SDNPCommutative, SDNPAssociative]>; -def X86frsqrt : SDNode<"X86ISD::FRSQRT", SDTFPUnaryOp>; -def X86frcp : SDNode<"X86ISD::FRCP", SDTFPUnaryOp>; -def X86fsrl : SDNode<"X86ISD::FSRL", SDTX86FPShiftOp>; -def X86comi : SDNode<"X86ISD::COMI", SDTX86CmpTest>; -def X86ucomi : SDNode<"X86ISD::UCOMI", SDTX86CmpTest>; -def X86pshufb : SDNode<"X86ISD::PSHUFB", - SDTypeProfile<1, 2, [SDTCisVT<0, v16i8>, SDTCisSameAs<0,1>, - SDTCisSameAs<0,2>]>>; -def X86pextrb : SDNode<"X86ISD::PEXTRB", - SDTypeProfile<1, 2, [SDTCisVT<0, i32>, SDTCisPtrTy<2>]>>; -def X86pextrw : SDNode<"X86ISD::PEXTRW", - SDTypeProfile<1, 2, [SDTCisVT<0, i32>, SDTCisPtrTy<2>]>>; -def X86pinsrb : SDNode<"X86ISD::PINSRB", - SDTypeProfile<1, 3, [SDTCisVT<0, v16i8>, SDTCisSameAs<0,1>, - SDTCisVT<2, i32>, SDTCisPtrTy<3>]>>; -def X86pinsrw : SDNode<"X86ISD::PINSRW", - SDTypeProfile<1, 3, [SDTCisVT<0, v8i16>, SDTCisSameAs<0,1>, - SDTCisVT<2, i32>, SDTCisPtrTy<3>]>>; -def X86insrtps : SDNode<"X86ISD::INSERTPS", - SDTypeProfile<1, 3, [SDTCisVT<0, v4f32>, SDTCisSameAs<0,1>, - SDTCisVT<2, v4f32>, SDTCisPtrTy<3>]>>; -def X86vzmovl : SDNode<"X86ISD::VZEXT_MOVL", - SDTypeProfile<1, 1, [SDTCisSameAs<0,1>]>>; -def X86vzload : SDNode<"X86ISD::VZEXT_LOAD", SDTLoad, - [SDNPHasChain, SDNPMayLoad]>; -def X86vshl : SDNode<"X86ISD::VSHL", SDTIntShiftOp>; -def X86vshr : SDNode<"X86ISD::VSRL", SDTIntShiftOp>; -def X86cmpps : SDNode<"X86ISD::CMPPS", SDTX86VFCMP>; -def X86cmppd : SDNode<"X86ISD::CMPPD", SDTX86VFCMP>; -def X86pcmpeqb : SDNode<"X86ISD::PCMPEQB", SDTIntBinOp, [SDNPCommutative]>; -def X86pcmpeqw : SDNode<"X86ISD::PCMPEQW", SDTIntBinOp, [SDNPCommutative]>; -def X86pcmpeqd : SDNode<"X86ISD::PCMPEQD", SDTIntBinOp, [SDNPCommutative]>; -def X86pcmpeqq : SDNode<"X86ISD::PCMPEQQ", SDTIntBinOp, [SDNPCommutative]>; -def X86pcmpgtb : SDNode<"X86ISD::PCMPGTB", SDTIntBinOp>; -def X86pcmpgtw : SDNode<"X86ISD::PCMPGTW", SDTIntBinOp>; -def X86pcmpgtd : SDNode<"X86ISD::PCMPGTD", SDTIntBinOp>; -def X86pcmpgtq : SDNode<"X86ISD::PCMPGTQ", SDTIntBinOp>; - -def SDTX86CmpPTest : SDTypeProfile<1, 2, [SDTCisVT<0, i32>, - SDTCisVT<1, v4f32>, - SDTCisVT<2, v4f32>]>; -def X86ptest : SDNode<"X86ISD::PTEST", SDTX86CmpPTest>; - 
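
// [Editor's sketch, not part of this patch: the defs above tie each X86ISD
// opcode to an SDTypeProfile that type-checks it during DAG matching. A
// minimal self-contained example of the mechanism, with hypothetical names
// (SDTDemoBinOp, "X86ISD::DEMO"): one result, two operands, where the result
// and operand 1 share a type, the result is FP, and operand 2 is an i8.]
def SDTDemoBinOp : SDTypeProfile<1, 2, [SDTCisSameAs<0, 1>, SDTCisFP<0>,
                                        SDTCisVT<2, i8>]>;
def X86demo      : SDNode<"X86ISD::DEMO", SDTDemoBinOp>;
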
-//===----------------------------------------------------------------------===//
-// SSE Complex Patterns
-//===----------------------------------------------------------------------===//
-
-// These are 'extloads' from a scalar to the low element of a vector, zeroing
-// the top elements.  These are used for the SSE 'ss' and 'sd' instruction
-// forms.
-def sse_load_f32 : ComplexPattern<v4f32, 5, "SelectScalarSSELoad", [],
-                                  [SDNPHasChain, SDNPMayLoad]>;
-def sse_load_f64 : ComplexPattern<v2f64, 5, "SelectScalarSSELoad", [],
-                                  [SDNPHasChain, SDNPMayLoad]>;
-
-def ssmem : Operand<v4f32> {
-  let PrintMethod = "printf32mem";
-  let MIOperandInfo = (ops ptr_rc, i8imm, ptr_rc_nosp, i32imm, i8imm);
-  let ParserMatchClass = X86MemAsmOperand;
-}
-def sdmem : Operand<v2f64> {
-  let PrintMethod = "printf64mem";
-  let MIOperandInfo = (ops ptr_rc, i8imm, ptr_rc_nosp, i32imm, i8imm);
-  let ParserMatchClass = X86MemAsmOperand;
-}
-
-//===----------------------------------------------------------------------===//
-// SSE pattern fragments
-//===----------------------------------------------------------------------===//
-
-def loadv4f32 : PatFrag<(ops node:$ptr), (v4f32 (load node:$ptr))>;
-def loadv2f64 : PatFrag<(ops node:$ptr), (v2f64 (load node:$ptr))>;
-def loadv4i32 : PatFrag<(ops node:$ptr), (v4i32 (load node:$ptr))>;
-def loadv2i64 : PatFrag<(ops node:$ptr), (v2i64 (load node:$ptr))>;
-
-// FIXME: move this to a more appropriate place after all AVX is done.
-def loadv8f32 : PatFrag<(ops node:$ptr), (v8f32 (load node:$ptr))>;
-def loadv4f64 : PatFrag<(ops node:$ptr), (v4f64 (load node:$ptr))>;
-def loadv8i32 : PatFrag<(ops node:$ptr), (v8i32 (load node:$ptr))>;
-def loadv4i64 : PatFrag<(ops node:$ptr), (v4i64 (load node:$ptr))>;
-
-// Like 'store', but always requires vector alignment.
-def alignedstore : PatFrag<(ops node:$val, node:$ptr),
-                           (store node:$val, node:$ptr), [{
-  return cast<StoreSDNode>(N)->getAlignment() >= 16;
-}]>;
-
-// Like 'load', but always requires vector alignment.
-def alignedload : PatFrag<(ops node:$ptr), (load node:$ptr), [{
-  return cast<LoadSDNode>(N)->getAlignment() >= 16;
-}]>;
-
-def alignedloadfsf32 : PatFrag<(ops node:$ptr),
-                               (f32 (alignedload node:$ptr))>;
-def alignedloadfsf64 : PatFrag<(ops node:$ptr),
-                               (f64 (alignedload node:$ptr))>;
-def alignedloadv4f32 : PatFrag<(ops node:$ptr),
-                               (v4f32 (alignedload node:$ptr))>;
-def alignedloadv2f64 : PatFrag<(ops node:$ptr),
-                               (v2f64 (alignedload node:$ptr))>;
-def alignedloadv4i32 : PatFrag<(ops node:$ptr),
-                               (v4i32 (alignedload node:$ptr))>;
-def alignedloadv2i64 : PatFrag<(ops node:$ptr),
-                               (v2i64 (alignedload node:$ptr))>;
-
-// FIXME: move this to a more appropriate place after all AVX is done.
-def alignedloadv8f32 : PatFrag<(ops node:$ptr),
-                               (v8f32 (alignedload node:$ptr))>;
-def alignedloadv4f64 : PatFrag<(ops node:$ptr),
-                               (v4f64 (alignedload node:$ptr))>;
-def alignedloadv8i32 : PatFrag<(ops node:$ptr),
-                               (v8i32 (alignedload node:$ptr))>;
-def alignedloadv4i64 : PatFrag<(ops node:$ptr),
-                               (v4i64 (alignedload node:$ptr))>;
-
-// Like 'load', but uses special alignment checks suitable for use in
-// memory operands in most SSE instructions, which are required to
-// be naturally aligned on some targets but not on others.  If the subtarget
-// allows unaligned accesses, match any load, though this may require
-// setting a feature bit in the processor (on startup, for example).
-// Opteron 10h and later implement such a feature.
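
// [Editor's sketch, not part of this patch: the 16-byte alignment predicates
// above generalize directly; a hypothetical fragment for 32-byte-aligned
// 256-bit AVX loads would follow the same shape.]
def alignedload32 : PatFrag<(ops node:$ptr), (load node:$ptr), [{
  return cast<LoadSDNode>(N)->getAlignment() >= 32;
}]>;
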
-def memop : PatFrag<(ops node:$ptr), (load node:$ptr), [{
-  return Subtarget->hasVectorUAMem()
-    || cast<LoadSDNode>(N)->getAlignment() >= 16;
-}]>;
-
-def memopfsf32 : PatFrag<(ops node:$ptr), (f32 (memop node:$ptr))>;
-def memopfsf64 : PatFrag<(ops node:$ptr), (f64 (memop node:$ptr))>;
-def memopv4f32 : PatFrag<(ops node:$ptr), (v4f32 (memop node:$ptr))>;
-def memopv2f64 : PatFrag<(ops node:$ptr), (v2f64 (memop node:$ptr))>;
-def memopv4i32 : PatFrag<(ops node:$ptr), (v4i32 (memop node:$ptr))>;
-def memopv2i64 : PatFrag<(ops node:$ptr), (v2i64 (memop node:$ptr))>;
-def memopv16i8 : PatFrag<(ops node:$ptr), (v16i8 (memop node:$ptr))>;
-
-// FIXME: move this to a more appropriate place after all AVX is done.
-def memopv8f32 : PatFrag<(ops node:$ptr), (v8f32 (memop node:$ptr))>;
-def memopv4f64 : PatFrag<(ops node:$ptr), (v4f64 (memop node:$ptr))>;
-
-// SSSE3 uses MMX registers for some instructions. They aren't aligned on a
-// 16-byte boundary.
-// FIXME: 8 byte alignment for mmx reads is not required
-def memop64 : PatFrag<(ops node:$ptr), (load node:$ptr), [{
-  return cast<LoadSDNode>(N)->getAlignment() >= 8;
-}]>;
-
-def memopv8i8  : PatFrag<(ops node:$ptr), (v8i8  (memop64 node:$ptr))>;
-def memopv4i16 : PatFrag<(ops node:$ptr), (v4i16 (memop64 node:$ptr))>;
-def memopv8i16 : PatFrag<(ops node:$ptr), (v8i16 (memop64 node:$ptr))>;
-def memopv2i32 : PatFrag<(ops node:$ptr), (v2i32 (memop64 node:$ptr))>;
-
-// MOVNT Support
-// Like 'store', but requires the non-temporal bit to be set
-def nontemporalstore : PatFrag<(ops node:$val, node:$ptr),
-                               (st node:$val, node:$ptr), [{
-  if (StoreSDNode *ST = dyn_cast<StoreSDNode>(N))
-    return ST->isNonTemporal();
-  return false;
-}]>;
-
-def alignednontemporalstore : PatFrag<(ops node:$val, node:$ptr),
-                                      (st node:$val, node:$ptr), [{
-  if (StoreSDNode *ST = dyn_cast<StoreSDNode>(N))
-    return ST->isNonTemporal() && !ST->isTruncatingStore() &&
-           ST->getAddressingMode() == ISD::UNINDEXED &&
-           ST->getAlignment() >= 16;
-  return false;
-}]>;
-
-def unalignednontemporalstore : PatFrag<(ops node:$val, node:$ptr),
-                                        (st node:$val, node:$ptr), [{
-  if (StoreSDNode *ST = dyn_cast<StoreSDNode>(N))
-    return ST->isNonTemporal() &&
-           ST->getAlignment() < 16;
-  return false;
-}]>;
-
-def bc_v4f32 : PatFrag<(ops node:$in), (v4f32 (bitconvert node:$in))>;
-def bc_v2f64 : PatFrag<(ops node:$in), (v2f64 (bitconvert node:$in))>;
-def bc_v16i8 : PatFrag<(ops node:$in), (v16i8 (bitconvert node:$in))>;
-def bc_v8i16 : PatFrag<(ops node:$in), (v8i16 (bitconvert node:$in))>;
-def bc_v4i32 : PatFrag<(ops node:$in), (v4i32 (bitconvert node:$in))>;
-def bc_v2i64 : PatFrag<(ops node:$in), (v2i64 (bitconvert node:$in))>;
-
-def vzmovl_v2i64 : PatFrag<(ops node:$src),
-                           (bitconvert (v2i64 (X86vzmovl
-                             (v2i64 (scalar_to_vector (loadi64 node:$src))))))>;
-def vzmovl_v4i32 : PatFrag<(ops node:$src),
-                           (bitconvert (v4i32 (X86vzmovl
-                             (v4i32 (scalar_to_vector (loadi32 node:$src))))))>;
-
-def vzload_v2i64 : PatFrag<(ops node:$src),
-                           (bitconvert (v2i64 (X86vzload node:$src)))>;
-
-
-def fp32imm0 : PatLeaf<(f32 fpimm), [{
-  return N->isExactlyValue(+0.0);
-}]>;
-
-// BYTE_imm - Transform bit immediates into byte immediates.
-def BYTE_imm  : SDNodeXForm<imm, [{
-  // Transformation function: imm >> 3
-  return getI32Imm(N->getZExtValue() >> 3);
-}]>;
-
-// SHUFFLE_get_shuf_imm xform function: convert vector_shuffle mask to PSHUF*,
-// SHUFP* etc. imm.
-def SHUFFLE_get_shuf_imm : SDNodeXForm<vector_shuffle, [{
-  return getI8Imm(X86::getShuffleSHUFImmediate(N));
-}]>;
-
-// SHUFFLE_get_pshufhw_imm xform function: convert vector_shuffle mask to
-// PSHUFHW imm.
-def SHUFFLE_get_pshufhw_imm : SDNodeXForm<vector_shuffle, [{
-  return getI8Imm(X86::getShufflePSHUFHWImmediate(N));
-}]>;
-
-// SHUFFLE_get_pshuflw_imm xform function: convert vector_shuffle mask to
-// PSHUFLW imm.
-def SHUFFLE_get_pshuflw_imm : SDNodeXForm<vector_shuffle, [{
-  return getI8Imm(X86::getShufflePSHUFLWImmediate(N));
-}]>;
-
-// SHUFFLE_get_palign_imm xform function: convert vector_shuffle mask to
-// a PALIGNR imm.
-def SHUFFLE_get_palign_imm : SDNodeXForm<vector_shuffle, [{
-  return getI8Imm(X86::getShufflePALIGNRImmediate(N));
-}]>;
-
-def splat_lo : PatFrag<(ops node:$lhs, node:$rhs),
-                       (vector_shuffle node:$lhs, node:$rhs), [{
-  ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N);
-  return SVOp->isSplat() && SVOp->getSplatIndex() == 0;
-}]>;
-
-def movddup : PatFrag<(ops node:$lhs, node:$rhs),
-                      (vector_shuffle node:$lhs, node:$rhs), [{
-  return X86::isMOVDDUPMask(cast<ShuffleVectorSDNode>(N));
-}]>;
-
-def movhlps : PatFrag<(ops node:$lhs, node:$rhs),
-                      (vector_shuffle node:$lhs, node:$rhs), [{
-  return X86::isMOVHLPSMask(cast<ShuffleVectorSDNode>(N));
-}]>;
-
-def movhlps_undef : PatFrag<(ops node:$lhs, node:$rhs),
-                            (vector_shuffle node:$lhs, node:$rhs), [{
-  return X86::isMOVHLPS_v_undef_Mask(cast<ShuffleVectorSDNode>(N));
-}]>;
-
-def movlhps : PatFrag<(ops node:$lhs, node:$rhs),
-                      (vector_shuffle node:$lhs, node:$rhs), [{
-  return X86::isMOVLHPSMask(cast<ShuffleVectorSDNode>(N));
-}]>;
-
-def movlp : PatFrag<(ops node:$lhs, node:$rhs),
-                    (vector_shuffle node:$lhs, node:$rhs), [{
-  return X86::isMOVLPMask(cast<ShuffleVectorSDNode>(N));
-}]>;
-
-def movl : PatFrag<(ops node:$lhs, node:$rhs),
-                   (vector_shuffle node:$lhs, node:$rhs), [{
-  return X86::isMOVLMask(cast<ShuffleVectorSDNode>(N));
-}]>;
-
-def movshdup : PatFrag<(ops node:$lhs, node:$rhs),
-                       (vector_shuffle node:$lhs, node:$rhs), [{
-  return X86::isMOVSHDUPMask(cast<ShuffleVectorSDNode>(N));
-}]>;
-
-def movsldup : PatFrag<(ops node:$lhs, node:$rhs),
-                       (vector_shuffle node:$lhs, node:$rhs), [{
-  return X86::isMOVSLDUPMask(cast<ShuffleVectorSDNode>(N));
-}]>;
-
-def unpckl : PatFrag<(ops node:$lhs, node:$rhs),
-                     (vector_shuffle node:$lhs, node:$rhs), [{
-  return X86::isUNPCKLMask(cast<ShuffleVectorSDNode>(N));
-}]>;
-
-def unpckh : PatFrag<(ops node:$lhs, node:$rhs),
-                     (vector_shuffle node:$lhs, node:$rhs), [{
-  return X86::isUNPCKHMask(cast<ShuffleVectorSDNode>(N));
-}]>;
-
-def unpckl_undef : PatFrag<(ops node:$lhs, node:$rhs),
-                           (vector_shuffle node:$lhs, node:$rhs), [{
-  return X86::isUNPCKL_v_undef_Mask(cast<ShuffleVectorSDNode>(N));
-}]>;
-
-def unpckh_undef : PatFrag<(ops node:$lhs, node:$rhs),
-                           (vector_shuffle node:$lhs, node:$rhs), [{
-  return X86::isUNPCKH_v_undef_Mask(cast<ShuffleVectorSDNode>(N));
-}]>;
-
-def pshufd : PatFrag<(ops node:$lhs, node:$rhs),
-                     (vector_shuffle node:$lhs, node:$rhs), [{
-  return X86::isPSHUFDMask(cast<ShuffleVectorSDNode>(N));
-}], SHUFFLE_get_shuf_imm>;
-
-def shufp : PatFrag<(ops node:$lhs, node:$rhs),
-                    (vector_shuffle node:$lhs, node:$rhs), [{
-  return X86::isSHUFPMask(cast<ShuffleVectorSDNode>(N));
-}], SHUFFLE_get_shuf_imm>;
-
-def pshufhw : PatFrag<(ops node:$lhs, node:$rhs),
-                      (vector_shuffle node:$lhs, node:$rhs), [{
-  return X86::isPSHUFHWMask(cast<ShuffleVectorSDNode>(N));
-}], SHUFFLE_get_pshufhw_imm>;
-
-def pshuflw : PatFrag<(ops node:$lhs, node:$rhs),
-                      (vector_shuffle node:$lhs, node:$rhs), [{
-  return X86::isPSHUFLWMask(cast<ShuffleVectorSDNode>(N));
-}], SHUFFLE_get_pshuflw_imm>;
-
-def palign : PatFrag<(ops node:$lhs, node:$rhs),
-                     (vector_shuffle node:$lhs, node:$rhs), [{
-  return X86::isPALIGNRMask(cast<ShuffleVectorSDNode>(N));
-}], SHUFFLE_get_palign_imm>;
-
 //===----------------------------------------------------------------------===//
 // SSE scalar FP Instructions
 //===----------------------------------------------------------------------===//
@@ -393,75 +57,103 @@ let Uses = [EFLAGS], usesCustomInserter = 1 in {
 /// sse12_fp_scalar - SSE 1 & 2 scalar instructions class
 multiclass sse12_fp_scalar<bits<8> opc, string OpcodeStr, SDNode OpNode,
-                           RegisterClass RC, X86MemOperand x86memop> {
+                           RegisterClass RC, X86MemOperand x86memop,
+                           bit Is2Addr = 1>
{ let isCommutable = 1 in { def rr : SI; + !if(Is2Addr, + !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), + !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), + [(set RC:$dst, (OpNode RC:$src1, RC:$src2))]>; } def rm : SI; + !if(Is2Addr, + !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), + !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), + [(set RC:$dst, (OpNode RC:$src1, (load addr:$src2)))]>; } /// sse12_fp_scalar_int - SSE 1 & 2 scalar instructions intrinsics class multiclass sse12_fp_scalar_int opc, string OpcodeStr, RegisterClass RC, - string asm, string SSEVer, string FPSizeStr, - Operand memopr, ComplexPattern mem_cpat> { + string asm, string SSEVer, string FPSizeStr, + Operand memopr, ComplexPattern mem_cpat, + bit Is2Addr = 1> { def rr_Int : SI("int_x86_sse", - !strconcat(SSEVer, !strconcat("_", - !strconcat(OpcodeStr, FPSizeStr)))) - RC:$src1, RC:$src2))]>; + !if(Is2Addr, + !strconcat(asm, "\t{$src2, $dst|$dst, $src2}"), + !strconcat(asm, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), + [(set RC:$dst, (!nameconcat("int_x86_sse", + !strconcat(SSEVer, !strconcat("_", + !strconcat(OpcodeStr, FPSizeStr)))) + RC:$src1, RC:$src2))]>; def rm_Int : SI("int_x86_sse", - !strconcat(SSEVer, !strconcat("_", - !strconcat(OpcodeStr, FPSizeStr)))) - RC:$src1, mem_cpat:$src2))]>; + !if(Is2Addr, + !strconcat(asm, "\t{$src2, $dst|$dst, $src2}"), + !strconcat(asm, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), + [(set RC:$dst, (!nameconcat("int_x86_sse", + !strconcat(SSEVer, !strconcat("_", + !strconcat(OpcodeStr, FPSizeStr)))) + RC:$src1, mem_cpat:$src2))]>; } /// sse12_fp_packed - SSE 1 & 2 packed instructions class multiclass sse12_fp_packed opc, string OpcodeStr, SDNode OpNode, RegisterClass RC, ValueType vt, X86MemOperand x86memop, PatFrag mem_frag, - Domain d, bit MayLoad = 0> { + Domain d, bit Is2Addr = 1> { let isCommutable = 1 in def rr : PI; - let mayLoad = MayLoad in + !if(Is2Addr, + !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), + !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), + [(set RC:$dst, (vt (OpNode RC:$src1, RC:$src2)))], d>; + let mayLoad = 1 in def rm : PI; + !if(Is2Addr, + !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), + !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), + [(set RC:$dst, (OpNode RC:$src1, (mem_frag addr:$src2)))], d>; } /// sse12_fp_packed_logical_rm - SSE 1 & 2 packed instructions class multiclass sse12_fp_packed_logical_rm opc, RegisterClass RC, Domain d, string OpcodeStr, X86MemOperand x86memop, - list pat_rr, list pat_rm> { + list pat_rr, list pat_rm, + bit Is2Addr = 1> { let isCommutable = 1 in - def rr : PI; - def rm : PI; + def rr : PI; + def rm : PI; } /// sse12_fp_packed_int - SSE 1 & 2 packed instructions intrinsics class multiclass sse12_fp_packed_int opc, string OpcodeStr, RegisterClass RC, - string asm, string SSEVer, string FPSizeStr, - X86MemOperand x86memop, PatFrag mem_frag, - Domain d> { + string asm, string SSEVer, string FPSizeStr, + X86MemOperand x86memop, PatFrag mem_frag, + Domain d, bit Is2Addr = 1> { def rr_Int : PI("int_x86_sse", - !strconcat(SSEVer, !strconcat("_", - !strconcat(OpcodeStr, FPSizeStr)))) - RC:$src1, RC:$src2))], d>; - def rm_Int : PI("int_x86_sse", - !strconcat(SSEVer, !strconcat("_", - !strconcat(OpcodeStr, FPSizeStr)))) - RC:$src1, (mem_frag addr:$src2)))], d>; + !if(Is2Addr, + !strconcat(asm, "\t{$src2, $dst|$dst, $src2}"), + !strconcat(asm, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), + [(set RC:$dst, 
(!nameconcat("int_x86_sse", + !strconcat(SSEVer, !strconcat("_", + !strconcat(OpcodeStr, FPSizeStr)))) + RC:$src1, RC:$src2))], d>; + def rm_Int : PI("int_x86_sse", + !strconcat(SSEVer, !strconcat("_", + !strconcat(OpcodeStr, FPSizeStr)))) + RC:$src1, (mem_frag addr:$src2)))], d>; } //===----------------------------------------------------------------------===// @@ -564,10 +256,10 @@ def MOVSDmr : SDI<0x11, MRMDestMem, (outs), (ins f64mem:$dst, FR64:$src), let isAsmParserOnly = 1 in { def VMOVSSmr : SI<0x11, MRMDestMem, (outs), (ins f32mem:$dst, FR32:$src), "movss\t{$src, $dst|$dst, $src}", - [(store FR32:$src, addr:$dst)]>, XS, VEX_4V; + [(store FR32:$src, addr:$dst)]>, XS, VEX; def VMOVSDmr : SI<0x11, MRMDestMem, (outs), (ins f64mem:$dst, FR64:$src), "movsd\t{$src, $dst|$dst, $src}", - [(store FR64:$src, addr:$dst)]>, XD, VEX_4V; + [(store FR64:$src, addr:$dst)]>, XD, VEX; } // Extract and store. @@ -824,6 +516,14 @@ multiclass sse12_cvt_s opc, RegisterClass SrcRC, RegisterClass DstRC, [(set DstRC:$dst, (OpNode (ld_frag addr:$src)))]>; } +multiclass sse12_cvt_s_np opc, RegisterClass SrcRC, RegisterClass DstRC, + X86MemOperand x86memop, string asm> { + def rr : SI; + def rm : SI; +} + multiclass sse12_cvt_p opc, RegisterClass SrcRC, RegisterClass DstRC, SDNode OpNode, X86MemOperand x86memop, PatFrag ld_frag, string asm, Domain d> { @@ -834,25 +534,40 @@ multiclass sse12_cvt_p opc, RegisterClass SrcRC, RegisterClass DstRC, } multiclass sse12_vcvt_avx opc, RegisterClass SrcRC, RegisterClass DstRC, - SDNode OpNode, X86MemOperand x86memop, PatFrag ld_frag, - string asm> { + X86MemOperand x86memop, string asm> { def rr : SI; + !strconcat(asm,"\t{$src, $src1, $dst|$dst, $src1, $src}"), []>; def rm : SI; + (ins DstRC:$src1, x86memop:$src), + !strconcat(asm,"\t{$src, $src1, $dst|$dst, $src1, $src}"), []>; } let isAsmParserOnly = 1 in { -defm VCVTTSS2SI : sse12_cvt_s<0x2C, FR32, GR32, fp_to_sint, f32mem, loadf32, - "cvttss2si\t{$src, $dst|$dst, $src}">, XS, VEX; -defm VCVTTSD2SI : sse12_cvt_s<0x2C, FR64, GR32, fp_to_sint, f64mem, loadf64, - "cvttsd2si\t{$src, $dst|$dst, $src}">, XD, VEX; -defm VCVTSI2SS : sse12_vcvt_avx<0x2A, GR32, FR32, sint_to_fp, i32mem, loadi32, - "cvtsi2ss\t{$src, $src1, $dst|$dst, $src1, $src}">, XS, - VEX_4V; -defm VCVTSI2SD : sse12_vcvt_avx<0x2A, GR32, FR64, sint_to_fp, i32mem, loadi32, - "cvtsi2sd\t{$src, $src1, $dst|$dst, $src1, $src}">, XD, - VEX_4V; +defm VCVTTSS2SI : sse12_cvt_s<0x2C, FR32, GR32, fp_to_sint, f32mem, loadf32, + "cvttss2si\t{$src, $dst|$dst, $src}">, XS, VEX; +defm VCVTTSS2SIr64 : sse12_cvt_s<0x2C, FR32, GR64, fp_to_sint, f32mem, loadf32, + "cvttss2si\t{$src, $dst|$dst, $src}">, XS, VEX, + VEX_W; +defm VCVTTSD2SI : sse12_cvt_s<0x2C, FR64, GR32, fp_to_sint, f64mem, loadf64, + "cvttsd2si\t{$src, $dst|$dst, $src}">, XD, VEX; +defm VCVTTSD2SIr64 : sse12_cvt_s<0x2C, FR64, GR64, fp_to_sint, f64mem, loadf64, + "cvttsd2si\t{$src, $dst|$dst, $src}">, XD, VEX, + VEX_W; + +// The assembler can recognize rr 64-bit instructions by seeing a rxx +// register, but the same isn't true when only using memory operands, +// provide other assembly "l" and "q" forms to address this explicitly +// where appropriate to do so. 
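
// [Editor's sketch, not part of this patch: the !if(Is2Addr, ...) selection
// threaded through the multiclasses above, reduced to its core. A TableGen
// class (hypothetical name) that yields the two-address SSE asm string when
// Is2Addr is 1 and the three-address AVX string otherwise.]
class TwoOrThreeAddrAsm<string OpcodeStr, bit Is2Addr> {
  string Asm = !if(Is2Addr,
      !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
      !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"));
}
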
+defm VCVTSI2SS : sse12_vcvt_avx<0x2A, GR32, FR32, i32mem, "cvtsi2ss">, XS, + VEX_4V; +defm VCVTSI2SSQ : sse12_vcvt_avx<0x2A, GR64, FR32, i64mem, "cvtsi2ssq">, XS, + VEX_4V, VEX_W; +defm VCVTSI2SD : sse12_vcvt_avx<0x2A, GR32, FR64, i32mem, "cvtsi2sd">, XD, + VEX_4V; +defm VCVTSI2SDL : sse12_vcvt_avx<0x2A, GR32, FR64, i32mem, "cvtsi2sdl">, XD, + VEX_4V; +defm VCVTSI2SDQ : sse12_vcvt_avx<0x2A, GR64, FR64, i64mem, "cvtsi2sdq">, XD, + VEX_4V, VEX_W; } defm CVTTSS2SI : sse12_cvt_s<0x2C, FR32, GR32, fp_to_sint, f32mem, loadf32, @@ -911,6 +626,14 @@ let isAsmParserOnly = 1 in { defm Int_VCVTSD2SI : sse12_cvt_sint<0x2D, VR128, GR32, int_x86_sse2_cvtsd2si, f128mem, load, "cvtsd2si\t{$src, $dst|$dst, $src}">, XD, VEX; + // FIXME: The asm matcher has a hack to ignore instructions with _Int and Int_ + // Get rid of this hack or rename the intrinsics, there are several + // intructions that only match with the intrinsic form, why create duplicates + // to let them be recognized by the assembler? + defm VCVTSD2SI_alt : sse12_cvt_s_np<0x2D, FR64, GR32, f64mem, + "cvtsd2si\t{$src, $dst|$dst, $src}">, XD, VEX; + defm VCVTSD2SI64 : sse12_cvt_s_np<0x2D, FR64, GR64, f64mem, + "cvtsd2si\t{$src, $dst|$dst, $src}">, XD, VEX, VEX_W; } defm Int_CVTSS2SI : sse12_cvt_sint<0x2D, VR128, GR32, int_x86_sse_cvtss2si, f32mem, load, "cvtss2si\t{$src, $dst|$dst, $src}">, XS; @@ -969,11 +692,17 @@ defm Int_CVTTSD2SI : sse12_cvt_sint<0x2C, VR128, GR32, int_x86_sse2_cvttsd2si, XD; let isAsmParserOnly = 1, Pattern = [] in { -defm VCVTSS2SI : sse12_cvt_s<0x2D, FR32, GR32, undef, f32mem, load, - "cvtss2si{l}\t{$src, $dst|$dst, $src}">, XS, VEX; -defm VCVTDQ2PS : sse12_cvt_p<0x5B, VR128, VR128, undef, f128mem, load, - "cvtdq2ps\t{$src, $dst|$dst, $src}", - SSEPackedSingle>, TB, VEX; +defm VCVTSS2SI : sse12_cvt_s<0x2D, FR32, GR32, undef, f32mem, load, + "cvtss2si{l}\t{$src, $dst|$dst, $src}">, XS, VEX; +defm VCVTSS2SI64 : sse12_cvt_s<0x2D, FR32, GR64, undef, f32mem, load, + "cvtss2si\t{$src, $dst|$dst, $src}">, XS, VEX, + VEX_W; +defm VCVTDQ2PS : sse12_cvt_p<0x5B, VR128, VR128, undef, f128mem, load, + "cvtdq2ps\t{$src, $dst|$dst, $src}", + SSEPackedSingle>, TB, VEX; +defm VCVTDQ2PSY : sse12_cvt_p<0x5B, VR256, VR256, undef, f256mem, load, + "cvtdq2ps\t{$src, $dst|$dst, $src}", + SSEPackedSingle>, TB, VEX; } let Pattern = [] in { defm CVTSS2SI : sse12_cvt_s<0x2D, FR32, GR32, undef, f32mem, load /*dummy*/, @@ -994,7 +723,7 @@ def VCVTSD2SSrr : VSDI<0x5A, MRMSrcReg, (outs FR32:$dst), def VCVTSD2SSrm : I<0x5A, MRMSrcMem, (outs FR32:$dst), (ins FR64:$src1, f64mem:$src2), "vcvtsd2ss\t{$src2, $src1, $dst|$dst, $src1, $src2}", - []>, XD, Requires<[HasAVX, HasSSE2, OptForSize]>, VEX_4V; + []>, XD, Requires<[HasAVX, OptForSize]>, VEX_4V; } def CVTSD2SSrr : SDI<0x5A, MRMSrcReg, (outs FR32:$dst), (ins FR64:$src), "cvtsd2ss\t{$src, $dst|$dst, $src}", @@ -1019,11 +748,11 @@ let isAsmParserOnly = 1 in { // SSE2 instructions with XS prefix def VCVTSS2SDrr : I<0x5A, MRMSrcReg, (outs FR64:$dst), (ins FR32:$src1, FR32:$src2), "vcvtss2sd\t{$src2, $src1, $dst|$dst, $src1, $src2}", - []>, XS, Requires<[HasAVX, HasSSE2]>, VEX_4V; + []>, XS, Requires<[HasAVX]>, VEX_4V; def VCVTSS2SDrm : I<0x5A, MRMSrcMem, (outs FR64:$dst), (ins FR32:$src1, f32mem:$src2), "vcvtss2sd\t{$src2, $src1, $dst|$dst, $src1, $src2}", - []>, XS, VEX_4V, Requires<[HasAVX, HasSSE2, OptForSize]>; + []>, XS, VEX_4V, Requires<[HasAVX, OptForSize]>; } def CVTSS2SDrr : I<0x5A, MRMSrcReg, (outs FR64:$dst), (ins FR32:$src), "cvtss2sd\t{$src, $dst|$dst, $src}", @@ -1040,13 +769,13 @@ def 
Int_VCVTSS2SDrr: I<0x5A, MRMSrcReg, "vcvtss2sd\t{$src2, $src1, $dst|$dst, $src1, $src2}", [(set VR128:$dst, (int_x86_sse2_cvtss2sd VR128:$src1, VR128:$src2))]>, XS, VEX_4V, - Requires<[HasAVX, HasSSE2]>; + Requires<[HasAVX]>; def Int_VCVTSS2SDrm: I<0x5A, MRMSrcMem, (outs VR128:$dst), (ins VR128:$src1, f32mem:$src2), "vcvtss2sd\t{$src2, $src1, $dst|$dst, $src1, $src2}", [(set VR128:$dst, (int_x86_sse2_cvtss2sd VR128:$src1, (load addr:$src2)))]>, XS, VEX_4V, - Requires<[HasAVX, HasSSE2]>; + Requires<[HasAVX]>; } let Constraints = "$src1 = $dst" in { // SSE2 instructions with XS prefix def Int_CVTSS2SDrr: I<0x5A, MRMSrcReg, @@ -1072,12 +801,12 @@ let isAsmParserOnly = 1 in { // SSE2 instructions without OpSize prefix def Int_VCVTDQ2PSrr : I<0x5B, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), "vcvtdq2ps\t{$src, $dst|$dst, $src}", [(set VR128:$dst, (int_x86_sse2_cvtdq2ps VR128:$src))]>, - TB, VEX, Requires<[HasAVX, HasSSE2]>; + TB, VEX, Requires<[HasAVX]>; def Int_VCVTDQ2PSrm : I<0x5B, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src), "vcvtdq2ps\t{$src, $dst|$dst, $src}", [(set VR128:$dst, (int_x86_sse2_cvtdq2ps (bitconvert (memopv2i64 addr:$src))))]>, - TB, VEX, Requires<[HasAVX, HasSSE2]>; + TB, VEX, Requires<[HasAVX]>; } def Int_CVTDQ2PSrr : I<0x5B, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), "cvtdq2ps\t{$src, $dst|$dst, $src}", @@ -1094,12 +823,12 @@ let isAsmParserOnly = 1 in { // SSE2 instructions with XS prefix def Int_VCVTDQ2PDrr : I<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), "vcvtdq2pd\t{$src, $dst|$dst, $src}", [(set VR128:$dst, (int_x86_sse2_cvtdq2pd VR128:$src))]>, - XS, VEX, Requires<[HasAVX, HasSSE2]>; + XS, VEX, Requires<[HasAVX]>; def Int_VCVTDQ2PDrm : I<0xE6, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src), "vcvtdq2pd\t{$src, $dst|$dst, $src}", [(set VR128:$dst, (int_x86_sse2_cvtdq2pd (bitconvert (memopv2i64 addr:$src))))]>, - XS, VEX, Requires<[HasAVX, HasSSE2]>; + XS, VEX, Requires<[HasAVX]>; } def Int_CVTDQ2PDrr : I<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), "cvtdq2pd\t{$src, $dst|$dst, $src}", @@ -1114,9 +843,13 @@ def Int_CVTDQ2PDrm : I<0xE6, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src), // Convert packed single/double fp to doubleword let isAsmParserOnly = 1 in { def VCVTPS2DQrr : VPDI<0x5B, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), - "cvtps2dq\t{$src, $dst|$dst, $src}", []>, VEX; + "cvtps2dq\t{$src, $dst|$dst, $src}", []>, VEX; def VCVTPS2DQrm : VPDI<0x5B, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src), - "cvtps2dq\t{$src, $dst|$dst, $src}", []>, VEX; + "cvtps2dq\t{$src, $dst|$dst, $src}", []>, VEX; +def VCVTPS2DQYrr : VPDI<0x5B, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src), + "cvtps2dq\t{$src, $dst|$dst, $src}", []>, VEX; +def VCVTPS2DQYrm : VPDI<0x5B, MRMSrcMem, (outs VR256:$dst), (ins f256mem:$src), + "cvtps2dq\t{$src, $dst|$dst, $src}", []>, VEX; } def CVTPS2DQrr : PDI<0x5B, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), "cvtps2dq\t{$src, $dst|$dst, $src}", []>; @@ -1146,12 +879,12 @@ let isAsmParserOnly = 1 in { // SSE2 packed instructions with XD prefix def Int_VCVTPD2DQrr : I<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), "vcvtpd2dq\t{$src, $dst|$dst, $src}", [(set VR128:$dst, (int_x86_sse2_cvtpd2dq VR128:$src))]>, - XD, VEX, Requires<[HasAVX, HasSSE2]>; + XD, VEX, Requires<[HasAVX]>; def Int_VCVTPD2DQrm : I<0xE6, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src), "vcvtpd2dq\t{$src, $dst|$dst, $src}", [(set VR128:$dst, (int_x86_sse2_cvtpd2dq (memop addr:$src)))]>, - XD, VEX, Requires<[HasAVX, HasSSE2]>; + XD, VEX, 
Requires<[HasAVX]>; } def Int_CVTPD2DQrr : I<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), "cvtpd2dq\t{$src, $dst|$dst, $src}", @@ -1170,6 +903,10 @@ def VCVTTPS2DQrr : VSSI<0x5B, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), "cvttps2dq\t{$src, $dst|$dst, $src}", []>, VEX; def VCVTTPS2DQrm : VSSI<0x5B, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src), "cvttps2dq\t{$src, $dst|$dst, $src}", []>, VEX; +def VCVTTPS2DQYrr : VSSI<0x5B, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src), + "cvttps2dq\t{$src, $dst|$dst, $src}", []>, VEX; +def VCVTTPS2DQYrm : VSSI<0x5B, MRMSrcMem, (outs VR256:$dst), (ins f256mem:$src), + "cvttps2dq\t{$src, $dst|$dst, $src}", []>, VEX; } def CVTTPS2DQrr : SSI<0x5B, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), "cvttps2dq\t{$src, $dst|$dst, $src}", []>; @@ -1182,12 +919,12 @@ def Int_VCVTTPS2DQrr : I<0x5B, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), "vcvttps2dq\t{$src, $dst|$dst, $src}", [(set VR128:$dst, (int_x86_sse2_cvttps2dq VR128:$src))]>, - XS, VEX, Requires<[HasAVX, HasSSE2]>; + XS, VEX, Requires<[HasAVX]>; def Int_VCVTTPS2DQrm : I<0x5B, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src), "vcvttps2dq\t{$src, $dst|$dst, $src}", [(set VR128:$dst, (int_x86_sse2_cvttps2dq (memop addr:$src)))]>, - XS, VEX, Requires<[HasAVX, HasSSE2]>; + XS, VEX, Requires<[HasAVX]>; } def Int_CVTTPS2DQrr : I<0x5B, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), "cvttps2dq\t{$src, $dst|$dst, $src}", @@ -1220,14 +957,39 @@ def Int_CVTTPD2DQrm : PDI<0xE6, MRMSrcMem, (outs VR128:$dst),(ins f128mem:$src), [(set VR128:$dst, (int_x86_sse2_cvttpd2dq (memop addr:$src)))]>; +let isAsmParserOnly = 1 in { +// The assembler can recognize rr 256-bit instructions by seeing a ymm +// register, but the same isn't true when using memory operands instead. +// Provide other assembly rr and rm forms to address this explicitly. 
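
// [Editor's sketch, not part of this patch: the suffixing idiom described
// above, in skeleton form. With register operands the assembler can tell xmm
// from ymm apart, but a bare memory operand cannot encode the width, so
// explicit "x"/"y" spellings are also provided; "vdemo" and opcode 0x00 are
// placeholders.]
def VDEMOXrm : VPDI<0x00, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
                    "vdemox\t{$src, $dst|$dst, $src}", []>, VEX;
def VDEMOYrm : VPDI<0x00, MRMSrcMem, (outs VR128:$dst), (ins f256mem:$src),
                    "vdemoy\t{$src, $dst|$dst, $src}", []>, VEX, VEX_L;
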
+def VCVTTPD2DQrr : VPDI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), + "cvttpd2dq\t{$src, $dst|$dst, $src}", []>, VEX; +def VCVTTPD2DQXrYr : VPDI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR256:$src), + "cvttpd2dq\t{$src, $dst|$dst, $src}", []>, VEX; + +// XMM only +def VCVTTPD2DQXrr : VPDI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), + "cvttpd2dqx\t{$src, $dst|$dst, $src}", []>, VEX; +def VCVTTPD2DQXrm : VPDI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src), + "cvttpd2dqx\t{$src, $dst|$dst, $src}", []>, VEX; + +// YMM only +def VCVTTPD2DQYrr : VPDI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR256:$src), + "cvttpd2dqy\t{$src, $dst|$dst, $src}", []>, VEX; +def VCVTTPD2DQYrm : VPDI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins f256mem:$src), + "cvttpd2dqy\t{$src, $dst|$dst, $src}", []>, VEX, VEX_L; +} + // Convert packed single to packed double -let isAsmParserOnly = 1 in { // SSE2 instructions without OpSize prefix +let isAsmParserOnly = 1, Predicates = [HasAVX] in { + // SSE2 instructions without OpSize prefix def VCVTPS2PDrr : I<0x5A, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), - "vcvtps2pd\t{$src, $dst|$dst, $src}", []>, VEX, - Requires<[HasAVX]>; + "vcvtps2pd\t{$src, $dst|$dst, $src}", []>, VEX; def VCVTPS2PDrm : I<0x5A, MRMSrcMem, (outs VR128:$dst), (ins f64mem:$src), - "vcvtps2pd\t{$src, $dst|$dst, $src}", []>, VEX, - Requires<[HasAVX]>; + "vcvtps2pd\t{$src, $dst|$dst, $src}", []>, VEX; +def VCVTPS2PDYrr : I<0x5A, MRMSrcReg, (outs VR256:$dst), (ins VR128:$src), + "vcvtps2pd\t{$src, $dst|$dst, $src}", []>, VEX; +def VCVTPS2PDYrm : I<0x5A, MRMSrcMem, (outs VR256:$dst), (ins f128mem:$src), + "vcvtps2pd\t{$src, $dst|$dst, $src}", []>, VEX; } def CVTPS2PDrr : I<0x5A, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), "cvtps2pd\t{$src, $dst|$dst, $src}", []>, TB; @@ -1238,12 +1000,12 @@ let isAsmParserOnly = 1 in { def Int_VCVTPS2PDrr : I<0x5A, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), "cvtps2pd\t{$src, $dst|$dst, $src}", [(set VR128:$dst, (int_x86_sse2_cvtps2pd VR128:$src))]>, - VEX, Requires<[HasAVX, HasSSE2]>; + VEX, Requires<[HasAVX]>; def Int_VCVTPS2PDrm : I<0x5A, MRMSrcMem, (outs VR128:$dst), (ins f64mem:$src), "cvtps2pd\t{$src, $dst|$dst, $src}", [(set VR128:$dst, (int_x86_sse2_cvtps2pd (load addr:$src)))]>, - VEX, Requires<[HasAVX, HasSSE2]>; + VEX, Requires<[HasAVX]>; } def Int_CVTPS2PDrr : I<0x5A, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), "cvtps2pd\t{$src, $dst|$dst, $src}", @@ -1257,10 +1019,25 @@ def Int_CVTPS2PDrm : I<0x5A, MRMSrcMem, (outs VR128:$dst), (ins f64mem:$src), // Convert packed double to packed single let isAsmParserOnly = 1 in { +// The assembler can recognize rr 256-bit instructions by seeing a ymm +// register, but the same isn't true when using memory operands instead. +// Provide other assembly rr and rm forms to address this explicitly. 
def VCVTPD2PSrr : VPDI<0x5A, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), - "cvtpd2ps\t{$src, $dst|$dst, $src}", []>, VEX; -// FIXME: the memory form of this instruction should described using -// use extra asm syntax + "cvtpd2ps\t{$src, $dst|$dst, $src}", []>, VEX; +def VCVTPD2PSXrYr : VPDI<0x5A, MRMSrcReg, (outs VR128:$dst), (ins VR256:$src), + "cvtpd2ps\t{$src, $dst|$dst, $src}", []>, VEX; + +// XMM only +def VCVTPD2PSXrr : VPDI<0x5A, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), + "cvtpd2psx\t{$src, $dst|$dst, $src}", []>, VEX; +def VCVTPD2PSXrm : VPDI<0x5A, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src), + "cvtpd2psx\t{$src, $dst|$dst, $src}", []>, VEX; + +// YMM only +def VCVTPD2PSYrr : VPDI<0x5A, MRMSrcReg, (outs VR128:$dst), (ins VR256:$src), + "cvtpd2psy\t{$src, $dst|$dst, $src}", []>, VEX; +def VCVTPD2PSYrm : VPDI<0x5A, MRMSrcMem, (outs VR128:$dst), (ins f256mem:$src), + "cvtpd2psy\t{$src, $dst|$dst, $src}", []>, VEX, VEX_L; } def CVTPD2PSrr : PDI<0x5A, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), "cvtpd2ps\t{$src, $dst|$dst, $src}", []>; @@ -1450,6 +1227,16 @@ let isAsmParserOnly = 1 in { "cmp${cc}pd\t{$src, $src1, $dst|$dst, $src1, $src}", "cmppd\t{$src2, $src, $src1, $dst|$dst, $src1, $src, $src2}", SSEPackedDouble>, OpSize, VEX_4V; + let Pattern = [] in { + defm VCMPPSY : sse12_cmp_packed, VEX_4V; + defm VCMPPDY : sse12_cmp_packed, OpSize, VEX_4V; + } } let Constraints = "$src1 = $dst" in { defm CMPPS : sse12_cmp_packed { - def rmi : PIi8<0xC6, MRMSrcMem, (outs VR128:$dst), - (ins VR128:$src1, f128mem:$src2, i8imm:$src3), asm, - [(set VR128:$dst, (vt (shufp:$src3 - VR128:$src1, (mem_frag addr:$src2))))], d>; + def rmi : PIi8<0xC6, MRMSrcMem, (outs RC:$dst), + (ins RC:$src1, f128mem:$src2, i8imm:$src3), asm, + [(set RC:$dst, (vt (shufp:$src3 + RC:$src1, (mem_frag addr:$src2))))], d>; let isConvertibleToThreeAddress = IsConvertibleToThreeAddress in - def rri : PIi8<0xC6, MRMSrcReg, (outs VR128:$dst), - (ins VR128:$src1, VR128:$src2, i8imm:$src3), asm, - [(set VR128:$dst, - (vt (shufp:$src3 VR128:$src1, VR128:$src2)))], d>; + def rri : PIi8<0xC6, MRMSrcReg, (outs RC:$dst), + (ins RC:$src1, RC:$src2, i8imm:$src3), asm, + [(set RC:$dst, + (vt (shufp:$src3 RC:$src1, RC:$src2)))], d>; } let isAsmParserOnly = 1 in { - defm VSHUFPS : sse12_shuffle, VEX_4V; - defm VSHUFPD : sse12_shuffle, OpSize, VEX_4V; + defm VSHUFPS : sse12_shuffle, VEX_4V; + defm VSHUFPSY : sse12_shuffle, VEX_4V; + defm VSHUFPD : sse12_shuffle, OpSize, VEX_4V; + defm VSHUFPDY : sse12_shuffle, OpSize, VEX_4V; } let Constraints = "$src1 = $dst" in { @@ -1598,6 +1391,12 @@ let isAsmParserOnly = 1 in { defm VMOVMSKPD : sse12_extr_sign_mask, OpSize, VEX; + // FIXME: merge with multiclass above when the intrinsics come. 
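
// [Editor's sketch, not part of this patch: one plausible shape for the merge
// the FIXME above requests once 256-bit intrinsics exist -- parameterize the
// register class so the ymm forms come out of the same multiclass. The name
// and exact operands are assumptions mirroring sse12_extr_sign_mask.]
multiclass sse12_extr_sign_mask_wide<RegisterClass RC, string asm, Domain d> {
  def rr : PI<0x50, MRMSrcReg, (outs GR32:$dst), (ins RC:$src),
              !strconcat(asm, "\t{$src, $dst|$dst, $src}"), [], d>;
}
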
+ def VMOVMSKPSYrr : PI<0x50, MRMSrcReg, (outs GR32:$dst), (ins VR256:$src), + "movmskps\t{$src, $dst|$dst, $src}", [], SSEPackedSingle>, VEX; + def VMOVMSKPDYrr : PI<0x50, MRMSrcReg, (outs GR32:$dst), (ins VR256:$src), + "movmskpd\t{$src, $dst|$dst, $src}", [], SSEPackedDouble>, OpSize, + VEX; } //===----------------------------------------------------------------------===// @@ -1646,69 +1445,64 @@ def FsMOVAPDrm : PDI<0x28, MRMSrcMem, (outs FR64:$dst), (ins f128mem:$src), /// sse12_fp_alias_pack_logical - SSE 1 & 2 aliased packed FP logical ops /// multiclass sse12_fp_alias_pack_logical opc, string OpcodeStr, - SDNode OpNode, bit MayLoad = 0> { + SDNode OpNode> { let isAsmParserOnly = 1 in { - defm V#NAME#PS : sse12_fp_packed, VEX_4V; + defm V#NAME#PS : sse12_fp_packed, VEX_4V; - defm V#NAME#PD : sse12_fp_packed, OpSize, - VEX_4V; + defm V#NAME#PD : sse12_fp_packed, OpSize, VEX_4V; } let Constraints = "$src1 = $dst" in { - defm PS : sse12_fp_packed, TB; + defm PS : sse12_fp_packed, TB; - defm PD : sse12_fp_packed, TB, OpSize; + defm PD : sse12_fp_packed, TB, OpSize; } } // Alias bitwise logical operations using SSE logical ops on packed FP values. -defm FsAND : sse12_fp_alias_pack_logical<0x54, "and", X86fand>; -defm FsOR : sse12_fp_alias_pack_logical<0x56, "or", X86for>; -defm FsXOR : sse12_fp_alias_pack_logical<0x57, "xor", X86fxor>; +let mayLoad = 0 in { + defm FsAND : sse12_fp_alias_pack_logical<0x54, "and", X86fand>; + defm FsOR : sse12_fp_alias_pack_logical<0x56, "or", X86for>; + defm FsXOR : sse12_fp_alias_pack_logical<0x57, "xor", X86fxor>; +} let neverHasSideEffects = 1, Pattern = [], isCommutable = 0 in - defm FsANDN : sse12_fp_alias_pack_logical<0x55, "andn", undef, 1>; + defm FsANDN : sse12_fp_alias_pack_logical<0x55, "andn", undef>; /// sse12_fp_packed_logical - SSE 1 & 2 packed FP logical ops /// multiclass sse12_fp_packed_logical opc, string OpcodeStr, SDNode OpNode, int HasPat = 0, list> Pattern = []> { - let isAsmParserOnly = 1 in { + let isAsmParserOnly = 1, Pattern = [] in { defm V#NAME#PS : sse12_fp_packed_logical_rm, + (memopv2i64 addr:$src2)))]), 0>, VEX_4V; defm V#NAME#PD : sse12_fp_packed_logical_rm, + (memopv2i64 addr:$src2)))]), 0>, OpSize, VEX_4V; } let Constraints = "$src1 = $dst" in { defm PS : sse12_fp_packed_logical_rm opc, string OpcodeStr, (memopv2i64 addr:$src2)))])>, TB; defm PD : sse12_fp_packed_logical_rm opc, string OpcodeStr, } } +/// sse12_fp_packed_logical_y - AVX 256-bit SSE 1 & 2 logical ops forms +/// +let isAsmParserOnly = 1 in { +multiclass sse12_fp_packed_logical_y opc, string OpcodeStr> { + defm PSY : sse12_fp_packed_logical_rm, VEX_4V; + + defm PDY : sse12_fp_packed_logical_rm, OpSize, VEX_4V; +} +} + +// AVX 256-bit packed logical ops forms +defm VAND : sse12_fp_packed_logical_y<0x54, "and">; +defm VOR : sse12_fp_packed_logical_y<0x56, "or">; +defm VXOR : sse12_fp_packed_logical_y<0x57, "xor">; +let isCommutable = 0 in + defm VANDN : sse12_fp_packed_logical_y<0x55, "andn">; + defm AND : sse12_fp_packed_logical<0x54, "and", and>; defm OR : sse12_fp_packed_logical<0x56, "or", or>; defm XOR : sse12_fp_packed_logical<0x57, "xor", xor>; @@ -1753,7 +1566,7 @@ let isCommutable = 0 in // SSE 1 & 2 - Arithmetic Instructions //===----------------------------------------------------------------------===// -/// basic_sse12_fp_binop_rm - SSE 1 & 2 binops come in both scalar and +/// basic_sse12_fp_binop_xxx - SSE 1 & 2 binops come in both scalar and /// vector forms. 
/// /// In addition, we also have a special variant of the scalar form here to @@ -1763,158 +1576,102 @@ let isCommutable = 0 in /// /// These three forms can each be reg+reg or reg+mem. /// -multiclass basic_sse12_fp_binop_rm opc, string OpcodeStr, - SDNode OpNode> { - - let isAsmParserOnly = 1 in { - defm V#NAME#SS : sse12_fp_scalar, XS, VEX_4V; - - defm V#NAME#SD : sse12_fp_scalar, XD, VEX_4V; - - defm V#NAME#PS : sse12_fp_packed, - VEX_4V; - - defm V#NAME#PD : sse12_fp_packed, - OpSize, VEX_4V; - - defm V#NAME#SS : sse12_fp_scalar_int, XS, VEX_4V; - - defm V#NAME#SD : sse12_fp_scalar_int, XD, VEX_4V; +multiclass basic_sse12_fp_binop_s opc, string OpcodeStr, SDNode OpNode, + bit Is2Addr = 1> { + defm SS : sse12_fp_scalar, XS; + defm SD : sse12_fp_scalar, XD; +} + +multiclass basic_sse12_fp_binop_p opc, string OpcodeStr, SDNode OpNode, + bit Is2Addr = 1> { + let mayLoad = 0 in { + defm PS : sse12_fp_packed, TB; + defm PD : sse12_fp_packed, TB, OpSize; } +} - let Constraints = "$src1 = $dst" in { - defm SS : sse12_fp_scalar, XS; - - defm SD : sse12_fp_scalar, XD; - - defm PS : sse12_fp_packed, TB; - - defm PD : sse12_fp_packed, TB, OpSize; - - defm SS : sse12_fp_scalar_int, XS; - - defm SD : sse12_fp_scalar_int, XD; +multiclass basic_sse12_fp_binop_p_y opc, string OpcodeStr, + SDNode OpNode> { + let mayLoad = 0 in { + defm PSY : sse12_fp_packed, TB; + defm PDY : sse12_fp_packed, TB, OpSize; } } -// Arithmetic instructions -defm ADD : basic_sse12_fp_binop_rm<0x58, "add", fadd>; -defm MUL : basic_sse12_fp_binop_rm<0x59, "mul", fmul>; - -let isCommutable = 0 in { - defm SUB : basic_sse12_fp_binop_rm<0x5C, "sub", fsub>; - defm DIV : basic_sse12_fp_binop_rm<0x5E, "div", fdiv>; +multiclass basic_sse12_fp_binop_s_int opc, string OpcodeStr, + bit Is2Addr = 1> { + defm SS : sse12_fp_scalar_int, XS; + defm SD : sse12_fp_scalar_int, XD; } -/// sse12_fp_binop_rm - Other SSE 1 & 2 binops -/// -/// This multiclass is like basic_sse12_fp_binop_rm, with the addition of -/// instructions for a full-vector intrinsic form. Operations that map -/// onto C operators don't use this form since they just use the plain -/// vector form instead of having a separate vector intrinsic form. -/// -multiclass sse12_fp_binop_rm opc, string OpcodeStr, - SDNode OpNode> { +multiclass basic_sse12_fp_binop_p_int opc, string OpcodeStr, + bit Is2Addr = 1> { + defm PS : sse12_fp_packed_int, TB; - let isAsmParserOnly = 1 in { - // Scalar operation, reg+reg. - defm V#NAME#SS : sse12_fp_scalar, XS, VEX_4V; - - defm V#NAME#SD : sse12_fp_scalar, XD, VEX_4V; - - defm V#NAME#PS : sse12_fp_packed, - VEX_4V; - - defm V#NAME#PD : sse12_fp_packed, - OpSize, VEX_4V; - - defm V#NAME#SS : sse12_fp_scalar_int, XS, VEX_4V; - - defm V#NAME#SD : sse12_fp_scalar_int, XD, VEX_4V; - - defm V#NAME#PS : sse12_fp_packed_int, VEX_4V; + defm PD : sse12_fp_packed_int, TB, OpSize; +} - defm V#NAME#PD : sse12_fp_packed_int, OpSize, - VEX_4V; - } +// Binary Arithmetic instructions +let isAsmParserOnly = 1 in { + defm VADD : basic_sse12_fp_binop_s<0x58, "add", fadd, 0>, + basic_sse12_fp_binop_p<0x58, "add", fadd, 0>, + basic_sse12_fp_binop_p_y<0x58, "add", fadd>, VEX_4V; + defm VMUL : basic_sse12_fp_binop_s<0x59, "mul", fmul, 0>, + basic_sse12_fp_binop_p<0x59, "mul", fmul, 0>, + basic_sse12_fp_binop_p_y<0x59, "mul", fmul>, VEX_4V; - let Constraints = "$src1 = $dst" in { - // Scalar operation, reg+reg. 
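
// [Editor's note and sketch, not part of this patch: the VADD/VMUL defms just
// above rely on a defm inheriting from several multiclasses at once; each
// inner def's name is appended to the defm name, so one line instantiates the
// scalar, packed, and 256-bit forms together. Minimal illustration:]
multiclass demo_scalar { def SSrr; }
multiclass demo_packed { def PSrr; }
defm DEMO : demo_scalar, demo_packed;   // creates records DEMOSSrr, DEMOPSrr
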
- defm SS : sse12_fp_scalar, XS; - defm SD : sse12_fp_scalar, XD; - defm PS : sse12_fp_packed, TB; - - defm PD : sse12_fp_packed, TB, OpSize; - - defm SS : sse12_fp_scalar_int, XS; - - defm SD : sse12_fp_scalar_int, XD; - - defm PS : sse12_fp_packed_int, TB; - - defm PD : sse12_fp_packed_int, TB, OpSize; + let isCommutable = 0 in { + defm VSUB : basic_sse12_fp_binop_s<0x5C, "sub", fsub, 0>, + basic_sse12_fp_binop_p<0x5C, "sub", fsub, 0>, + basic_sse12_fp_binop_p_y<0x5C, "sub", fsub>, VEX_4V; + defm VDIV : basic_sse12_fp_binop_s<0x5E, "div", fdiv, 0>, + basic_sse12_fp_binop_p<0x5E, "div", fdiv, 0>, + basic_sse12_fp_binop_p_y<0x5E, "div", fdiv>, VEX_4V; + defm VMAX : basic_sse12_fp_binop_s<0x5F, "max", X86fmax, 0>, + basic_sse12_fp_binop_p<0x5F, "max", X86fmax, 0>, + basic_sse12_fp_binop_p_y<0x5F, "max", X86fmax>, VEX_4V; + defm VMIN : basic_sse12_fp_binop_s<0x5D, "min", X86fmin, 0>, + basic_sse12_fp_binop_p<0x5D, "min", X86fmin, 0>, + basic_sse12_fp_binop_p_y<0x5D, "min", X86fmin>, VEX_4V; } } -let isCommutable = 0 in { - defm MAX : sse12_fp_binop_rm<0x5F, "max", X86fmax>; - defm MIN : sse12_fp_binop_rm<0x5D, "min", X86fmin>; +let Constraints = "$src1 = $dst" in { + defm ADD : basic_sse12_fp_binop_s<0x58, "add", fadd>, + basic_sse12_fp_binop_p<0x58, "add", fadd>, + basic_sse12_fp_binop_s_int<0x58, "add">; + defm MUL : basic_sse12_fp_binop_s<0x59, "mul", fmul>, + basic_sse12_fp_binop_p<0x59, "mul", fmul>, + basic_sse12_fp_binop_s_int<0x59, "mul">; + + let isCommutable = 0 in { + defm SUB : basic_sse12_fp_binop_s<0x5C, "sub", fsub>, + basic_sse12_fp_binop_p<0x5C, "sub", fsub>, + basic_sse12_fp_binop_s_int<0x5C, "sub">; + defm DIV : basic_sse12_fp_binop_s<0x5E, "div", fdiv>, + basic_sse12_fp_binop_p<0x5E, "div", fdiv>, + basic_sse12_fp_binop_s_int<0x5E, "div">; + defm MAX : basic_sse12_fp_binop_s<0x5F, "max", X86fmax>, + basic_sse12_fp_binop_p<0x5F, "max", X86fmax>, + basic_sse12_fp_binop_s_int<0x5F, "max">, + basic_sse12_fp_binop_p_int<0x5F, "max">; + defm MIN : basic_sse12_fp_binop_s<0x5D, "min", X86fmin>, + basic_sse12_fp_binop_p<0x5D, "min", X86fmin>, + basic_sse12_fp_binop_s_int<0x5D, "min">, + basic_sse12_fp_binop_p_int<0x5D, "min">; + } } /// Unop Arithmetic @@ -1931,6 +1688,10 @@ multiclass sse1_fp_unop_s opc, string OpcodeStr, def SSr : SSI; + // For scalar unary operations, fold a load into the operation + // only in OptForSize mode. It eliminates an instruction, but it also + // eliminates a whole-register clobber (the load), so it introduces a + // partial register update condition. def SSm : I, XS, @@ -1943,23 +1704,6 @@ multiclass sse1_fp_unop_s opc, string OpcodeStr, [(set VR128:$dst, (F32Int sse_load_f32:$src))]>; } -/// sse1_fp_unop_p - SSE1 unops in scalar form. -multiclass sse1_fp_unop_p opc, string OpcodeStr, - SDNode OpNode, Intrinsic V4F32Int> { - def PSr : PSI; - def PSm : PSI; - def PSr_Int : PSI; - def PSm_Int : PSI; -} - /// sse1_fp_unop_s_avx - AVX SSE1 unops in scalar form. multiclass sse1_fp_unop_s_avx opc, string OpcodeStr, SDNode OpNode, Intrinsic F32Int> { @@ -1969,7 +1713,7 @@ multiclass sse1_fp_unop_s_avx opc, string OpcodeStr, def SSm : I, XS, Requires<[HasAVX, HasSSE1, OptForSize]>; + []>, XS, Requires<[HasAVX, OptForSize]>; def SSr_Int : SSI opc, string OpcodeStr, "ss\t{$src2, $src1, $dst|$dst, $src1, $src2}"), []>; } +/// sse1_fp_unop_p - SSE1 unops in packed form. +multiclass sse1_fp_unop_p opc, string OpcodeStr, SDNode OpNode> { + def PSr : PSI; + def PSm : PSI; +} + +/// sse1_fp_unop_p_y - AVX 256-bit SSE1 unops in packed form. 
+multiclass sse1_fp_unop_p_y opc, string OpcodeStr, SDNode OpNode> { + def PSYr : PSI; + def PSYm : PSI; +} + +/// sse1_fp_unop_p_int - SSE1 intrinsics unops in packed forms. +multiclass sse1_fp_unop_p_int opc, string OpcodeStr, + Intrinsic V4F32Int> { + def PSr_Int : PSI; + def PSm_Int : PSI; +} + + /// sse2_fp_unop_s - SSE2 unops in scalar form. multiclass sse2_fp_unop_s opc, string OpcodeStr, SDNode OpNode, Intrinsic F64Int> { def SDr : SDI; - def SDm : SDI; + [(set FR64:$dst, (OpNode (load addr:$src)))]>, XD, + Requires<[HasSSE2, OptForSize]>; def SDr_Int : SDI; @@ -1997,23 +1775,6 @@ multiclass sse2_fp_unop_s opc, string OpcodeStr, [(set VR128:$dst, (F64Int sse_load_f64:$src))]>; } -/// sse2_fp_unop_p - SSE2 unops in vector forms. -multiclass sse2_fp_unop_p opc, string OpcodeStr, - SDNode OpNode, Intrinsic V2F64Int> { - def PDr : PDI; - def PDm : PDI; - def PDr_Int : PDI; - def PDm_Int : PDI; -} - /// sse2_fp_unop_s_avx - AVX SSE2 unops in scalar form. multiclass sse2_fp_unop_s_avx opc, string OpcodeStr, SDNode OpNode, Intrinsic F64Int> { @@ -2034,44 +1795,79 @@ multiclass sse2_fp_unop_s_avx opc, string OpcodeStr, []>; } -let isAsmParserOnly = 1 in { - // Square root. - let Predicates = [HasAVX, HasSSE2] in { - defm VSQRT : sse2_fp_unop_s_avx<0x51, "sqrt", fsqrt, int_x86_sse2_sqrt_sd>, - VEX_4V; +/// sse2_fp_unop_p - SSE2 unops in vector forms. +multiclass sse2_fp_unop_p opc, string OpcodeStr, + SDNode OpNode> { + def PDr : PDI; + def PDm : PDI; +} - defm VSQRT : sse2_fp_unop_p<0x51, "vsqrt", fsqrt, int_x86_sse2_sqrt_pd>, VEX; - } +/// sse2_fp_unop_p_y - AVX SSE2 256-bit unops in vector forms. +multiclass sse2_fp_unop_p_y opc, string OpcodeStr, SDNode OpNode> { + def PDYr : PDI; + def PDYm : PDI; +} - let Predicates = [HasAVX, HasSSE1] in { +/// sse2_fp_unop_p_int - SSE2 intrinsic unops in vector forms. +multiclass sse2_fp_unop_p_int opc, string OpcodeStr, + Intrinsic V2F64Int> { + def PDr_Int : PDI; + def PDm_Int : PDI; +} + +let isAsmParserOnly = 1, Predicates = [HasAVX] in { + // Square root. defm VSQRT : sse1_fp_unop_s_avx<0x51, "sqrt", fsqrt, int_x86_sse_sqrt_ss>, - VEX_4V; - defm VSQRT : sse1_fp_unop_p<0x51, "vsqrt", fsqrt, int_x86_sse_sqrt_ps>, VEX; + sse2_fp_unop_s_avx<0x51, "sqrt", fsqrt, int_x86_sse2_sqrt_sd>, + VEX_4V; + + defm VSQRT : sse1_fp_unop_p<0x51, "vsqrt", fsqrt>, + sse2_fp_unop_p<0x51, "vsqrt", fsqrt>, + sse1_fp_unop_p_y<0x51, "vsqrt", fsqrt>, + sse2_fp_unop_p_y<0x51, "vsqrt", fsqrt>, + VEX; + // Reciprocal approximations. Note that these typically require refinement // in order to obtain suitable precision. defm VRSQRT : sse1_fp_unop_s_avx<0x52, "rsqrt", X86frsqrt, int_x86_sse_rsqrt_ss>, VEX_4V; - defm VRSQRT : sse1_fp_unop_p<0x52, "vrsqrt", X86frsqrt, int_x86_sse_rsqrt_ps>, - VEX; + defm VRSQRT : sse1_fp_unop_p<0x52, "vrsqrt", X86frsqrt>, + sse1_fp_unop_p_y<0x52, "vrsqrt", X86frsqrt>, VEX; + defm VRCP : sse1_fp_unop_s_avx<0x53, "rcp", X86frcp, int_x86_sse_rcp_ss>, VEX_4V; - defm VRCP : sse1_fp_unop_p<0x53, "vrcp", X86frcp, int_x86_sse_rcp_ps>, - VEX; - } + defm VRCP : sse1_fp_unop_p<0x53, "vrcp", X86frcp>, + sse1_fp_unop_p_y<0x53, "vrcp", X86frcp>, VEX; } // Square root. 
defm SQRT : sse1_fp_unop_s<0x51, "sqrt", fsqrt, int_x86_sse_sqrt_ss>, - sse1_fp_unop_p<0x51, "sqrt", fsqrt, int_x86_sse_sqrt_ps>, + sse1_fp_unop_p<0x51, "sqrt", fsqrt>, + sse1_fp_unop_p_int<0x51, "sqrt", int_x86_sse_sqrt_ps>, sse2_fp_unop_s<0x51, "sqrt", fsqrt, int_x86_sse2_sqrt_sd>, - sse2_fp_unop_p<0x51, "sqrt", fsqrt, int_x86_sse2_sqrt_pd>; + sse2_fp_unop_p<0x51, "sqrt", fsqrt>, + sse2_fp_unop_p_int<0x51, "sqrt", int_x86_sse2_sqrt_pd>; // Reciprocal approximations. Note that these typically require refinement // in order to obtain suitable precision. defm RSQRT : sse1_fp_unop_s<0x52, "rsqrt", X86frsqrt, int_x86_sse_rsqrt_ss>, - sse1_fp_unop_p<0x52, "rsqrt", X86frsqrt, int_x86_sse_rsqrt_ps>; + sse1_fp_unop_p<0x52, "rsqrt", X86frsqrt>, + sse1_fp_unop_p_int<0x52, "rsqrt", int_x86_sse_rsqrt_ps>; defm RCP : sse1_fp_unop_s<0x53, "rcp", X86frcp, int_x86_sse_rcp_ss>, - sse1_fp_unop_p<0x53, "rcp", X86frcp, int_x86_sse_rcp_ps>; + sse1_fp_unop_p<0x53, "rcp", X86frcp>, + sse1_fp_unop_p_int<0x53, "rcp", int_x86_sse_rcp_ps>; // There is no f64 version of the reciprocal approximation instructions. @@ -2205,6 +2001,7 @@ def PREFETCHNTA : PSI<0x18, MRM0m, (outs), (ins i8mem:$src), // Load, store, and memory fence def SFENCE : I<0xAE, MRM_F8, (outs), (ins), "sfence", [(int_x86_sse_sfence)]>, TB, Requires<[HasSSE1]>; +def : Pat<(X86SFence), (SFENCE)>; // Alias instructions that map zero vector to pxor / xorp* for sse. // We set canFoldAsLoad because this can be converted to a constant-pool @@ -2247,35 +2044,47 @@ def STMXCSR : PSI<0xAE, MRM3m, (outs), (ins i32mem:$dst), //===---------------------------------------------------------------------===// // SSE2 - Move Aligned/Unaligned Packed Integer Instructions //===---------------------------------------------------------------------===// + let ExeDomain = SSEPackedInt in { // SSE integer instructions let isAsmParserOnly = 1 in { - let neverHasSideEffects = 1 in - def VMOVDQArr : VPDI<0x6F, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), - "movdqa\t{$src, $dst|$dst, $src}", []>, VEX; - def VMOVDQUrr : VPDI<0x6F, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), - "movdqu\t{$src, $dst|$dst, $src}", []>, XS, VEX; + let neverHasSideEffects = 1 in { + def VMOVDQArr : VPDI<0x6F, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), + "movdqa\t{$src, $dst|$dst, $src}", []>, VEX; + def VMOVDQAYrr : VPDI<0x6F, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src), + "movdqa\t{$src, $dst|$dst, $src}", []>, VEX; + } + def VMOVDQUrr : VPDI<0x6F, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), + "movdqu\t{$src, $dst|$dst, $src}", []>, XS, VEX; + def VMOVDQUYrr : VPDI<0x6F, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src), + "movdqu\t{$src, $dst|$dst, $src}", []>, XS, VEX; let canFoldAsLoad = 1, mayLoad = 1 in { - def VMOVDQArm : VPDI<0x6F, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src), - "movdqa\t{$src, $dst|$dst, $src}", - [/*(set VR128:$dst, (alignedloadv2i64 addr:$src))*/]>, - VEX; - def VMOVDQUrm : I<0x6F, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src), - "vmovdqu\t{$src, $dst|$dst, $src}", - [/*(set VR128:$dst, (loadv2i64 addr:$src))*/]>, - XS, VEX, Requires<[HasAVX, HasSSE2]>; + def VMOVDQArm : VPDI<0x6F, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src), + "movdqa\t{$src, $dst|$dst, $src}", []>, VEX; + def VMOVDQAYrm : VPDI<0x6F, MRMSrcMem, (outs VR256:$dst), (ins i256mem:$src), + "movdqa\t{$src, $dst|$dst, $src}", []>, VEX; + let Predicates = [HasAVX] in { + def VMOVDQUrm : I<0x6F, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src), + "vmovdqu\t{$src, $dst|$dst, $src}",[]>, 
XS, VEX; + def VMOVDQUYrm : I<0x6F, MRMSrcMem, (outs VR256:$dst), (ins i256mem:$src), + "vmovdqu\t{$src, $dst|$dst, $src}",[]>, XS, VEX; + } } let mayStore = 1 in { - def VMOVDQAmr : VPDI<0x7F, MRMDestMem, (outs), - (ins i128mem:$dst, VR128:$src), - "movdqa\t{$src, $dst|$dst, $src}", - [/*(alignedstore (v2i64 VR128:$src), addr:$dst)*/]>, VEX; - def VMOVDQUmr : I<0x7F, MRMDestMem, (outs), (ins i128mem:$dst, VR128:$src), - "vmovdqu\t{$src, $dst|$dst, $src}", - [/*(store (v2i64 VR128:$src), addr:$dst)*/]>, - XS, VEX, Requires<[HasAVX, HasSSE2]>; + def VMOVDQAmr : VPDI<0x7F, MRMDestMem, (outs), + (ins i128mem:$dst, VR128:$src), + "movdqa\t{$src, $dst|$dst, $src}", []>, VEX; + def VMOVDQAYmr : VPDI<0x7F, MRMDestMem, (outs), + (ins i256mem:$dst, VR256:$src), + "movdqa\t{$src, $dst|$dst, $src}", []>, VEX; + let Predicates = [HasAVX] in { + def VMOVDQUmr : I<0x7F, MRMDestMem, (outs), (ins i128mem:$dst, VR128:$src), + "vmovdqu\t{$src, $dst|$dst, $src}",[]>, XS, VEX; + def VMOVDQUYmr : I<0x7F, MRMDestMem, (outs), (ins i256mem:$dst, VR256:$src), + "vmovdqu\t{$src, $dst|$dst, $src}",[]>, XS, VEX; + } } } @@ -2309,11 +2118,11 @@ let canFoldAsLoad = 1 in def VMOVDQUrm_Int : I<0x6F, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src), "vmovdqu\t{$src, $dst|$dst, $src}", [(set VR128:$dst, (int_x86_sse2_loadu_dq addr:$src))]>, - XS, VEX, Requires<[HasAVX, HasSSE2]>; + XS, VEX, Requires<[HasAVX]>; def VMOVDQUmr_Int : I<0x7F, MRMDestMem, (outs), (ins i128mem:$dst, VR128:$src), "vmovdqu\t{$src, $dst|$dst, $src}", [(int_x86_sse2_storeu_dq addr:$dst, VR128:$src)]>, - XS, VEX, Requires<[HasAVX, HasSSE2]>; + XS, VEX, Requires<[HasAVX]>; } let canFoldAsLoad = 1 in @@ -2421,7 +2230,7 @@ multiclass PDI_binop_rm_v2i64 opc, string OpcodeStr, SDNode OpNode, // 128-bit Integer Arithmetic -let isAsmParserOnly = 1, Predicates = [HasAVX, HasSSE2] in { +let isAsmParserOnly = 1, Predicates = [HasAVX] in { defm VPADDB : PDI_binop_rm<0xFC, "vpaddb", add, v16i8, 1, 0 /*3addr*/>, VEX_4V; defm VPADDW : PDI_binop_rm<0xFD, "vpaddw", add, v8i16, 1, 0>, VEX_4V; defm VPADDD : PDI_binop_rm<0xFE, "vpaddd", add, v4i32, 1, 0>, VEX_4V; @@ -2511,7 +2320,7 @@ defm PSADBW : PDI_binop_rm_int<0xF6, "psadbw", int_x86_sse2_psad_bw, 1>; // SSE2 - Packed Integer Logical Instructions //===---------------------------------------------------------------------===// -let isAsmParserOnly = 1, Predicates = [HasAVX, HasSSE2] in { +let isAsmParserOnly = 1, Predicates = [HasAVX] in { defm VPSLLW : PDI_binop_rmi_int<0xF1, 0x71, MRM6r, "vpsllw", int_x86_sse2_psll_w, int_x86_sse2_pslli_w, 0>, VEX_4V; @@ -2643,7 +2452,7 @@ let Predicates = [HasSSE2] in { // SSE2 - Packed Integer Comparison Instructions //===---------------------------------------------------------------------===// -let isAsmParserOnly = 1, Predicates = [HasAVX, HasSSE2] in { +let isAsmParserOnly = 1, Predicates = [HasAVX] in { defm VPCMPEQB : PDI_binop_rm_int<0x74, "vpcmpeqb", int_x86_sse2_pcmpeq_b, 1, 0>, VEX_4V; defm VPCMPEQW : PDI_binop_rm_int<0x75, "vpcmpeqw", int_x86_sse2_pcmpeq_w, 1, @@ -2697,7 +2506,7 @@ def : Pat<(v4i32 (X86pcmpgtd VR128:$src1, (memop addr:$src2))), // SSE2 - Packed Integer Pack Instructions //===---------------------------------------------------------------------===// -let isAsmParserOnly = 1, Predicates = [HasAVX, HasSSE2] in { +let isAsmParserOnly = 1, Predicates = [HasAVX] in { defm VPACKSSWB : PDI_binop_rm_int<0x63, "vpacksswb", int_x86_sse2_packsswb_128, 0, 0>, VEX_4V; defm VPACKSSDW : PDI_binop_rm_int<0x6B, "vpackssdw", int_x86_sse2_packssdw_128, @@ -2735,7 +2544,7 
@@ def mi : Ii8<0x70, MRMSrcMem,
 }
 } // ExeDomain = SSEPackedInt
 
-let isAsmParserOnly = 1, Predicates = [HasAVX, HasSSE2] in {
+let isAsmParserOnly = 1, Predicates = [HasAVX] in {
 let AddedComplexity = 5 in
  defm VPSHUFD : sse2_pshuffle<"vpshufd", v4i32, pshufd, bc_v4i32>, OpSize,
                               VEX;
@@ -2783,7 +2592,7 @@ multiclass sse2_unpack opc, string OpcodeStr, ValueType vt,
                                       addr:$src2))))]>;
 }
 
-let isAsmParserOnly = 1, Predicates = [HasAVX, HasSSE2] in {
+let isAsmParserOnly = 1, Predicates = [HasAVX] in {
   defm VPUNPCKLBW  : sse2_unpack<0x60, "vpunpcklbw", v16i8, unpckl, bc_v16i8,
                                  0>, VEX_4V;
   defm VPUNPCKLWD  : sse2_unpack<0x61, "vpunpcklwd", v8i16, unpckl, bc_v8i16,
@@ -2893,7 +2702,7 @@ multiclass sse2_pinsrw {
 }
 
 // Extract
-let isAsmParserOnly = 1, Predicates = [HasAVX, HasSSE2] in
+let isAsmParserOnly = 1, Predicates = [HasAVX] in
 def VPEXTRWri : Ii8<0xC5, MRMSrcReg,
                     (outs GR32:$dst), (ins VR128:$src1, i32i8imm:$src2),
                     "vpextrw\t{$src2, $src1, $dst|$dst, $src1, $src2}",
@@ -2906,7 +2715,7 @@ def PEXTRWri : PDIi8<0xC5, MRMSrcReg,
                                                 imm:$src2))]>;
 
 // Insert
-let isAsmParserOnly = 1, Predicates = [HasAVX, HasSSE2] in
+let isAsmParserOnly = 1, Predicates = [HasAVX] in
   defm PINSRW : sse2_pinsrw<0>, OpSize, VEX_4V;
 
 let Constraints = "$src1 = $dst" in
@@ -3098,7 +2907,7 @@ def VMOVQI2PQIrm : I<0x7E, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src),
                      "vmovq\t{$src, $dst|$dst, $src}",
                      [(set VR128:$dst,
                        (v2i64 (scalar_to_vector (loadi64 addr:$src))))]>, XS,
-                     VEX, Requires<[HasAVX, HasSSE2]>;
+                     VEX, Requires<[HasAVX]>;
 def MOVQI2PQIrm : I<0x7E, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src),
                     "movq\t{$src, $dst|$dst, $src}",
                     [(set VR128:$dst,
@@ -3134,7 +2943,7 @@ def VMOVZQI2PQIrm : I<0x7E, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src),
                      [(set VR128:$dst,
                        (v2i64 (X86vzmovl (v2i64 (scalar_to_vector
                                                  (loadi64 addr:$src))))))]>,
-                     XS, VEX, Requires<[HasAVX, HasSSE2]>;
+                     XS, VEX, Requires<[HasAVX]>;
 
 let AddedComplexity = 20 in {
 def MOVZQI2PQIrm : I<0x7E, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src),
@@ -3157,7 +2966,7 @@ let isAsmParserOnly = 1, AddedComplexity = 15 in
 def VMOVZPQILo2PQIrr : I<0x7E, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
                          "vmovq\t{$src, $dst|$dst, $src}",
                          [(set VR128:$dst, (v2i64 (X86vzmovl (v2i64 VR128:$src))))]>,
-                         XS, VEX, Requires<[HasAVX, HasSSE2]>;
+                         XS, VEX, Requires<[HasAVX]>;
 let AddedComplexity = 15 in
 def MOVZPQILo2PQIrr : I<0x7E, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
                         "movq\t{$src, $dst|$dst, $src}",
@@ -3169,7 +2978,7 @@ def VMOVZPQILo2PQIrm : I<0x7E, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src),
                          "vmovq\t{$src, $dst|$dst, $src}",
                          [(set VR128:$dst, (v2i64 (X86vzmovl
                                                    (loadv2i64 addr:$src))))]>,
-                         XS, VEX, Requires<[HasAVX, HasSSE2]>;
+                         XS, VEX, Requires<[HasAVX]>;
 
 let AddedComplexity = 20 in {
 def MOVZPQILo2PQIrm : I<0x7E, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src),
                         "movq\t{$src, $dst|$dst, $src}",
@@ -3183,11 +2992,13 @@ def : Pat<(v2i64 (X86vzmovl (bc_v2i64 (loadv4i32 addr:$src)))),
 
 // Instructions to match in the assembler
 let isAsmParserOnly = 1 in {
-// This instructions is in fact an alias to movd with 64 bit dst
 def VMOVQs64rr : VPDI<0x6E, MRMSrcReg, (outs VR128:$dst), (ins GR64:$src),
                       "movq\t{$src, $dst|$dst, $src}", []>, VEX, VEX_W;
 def VMOVQd64rr : VPDI<0x7E, MRMDestReg, (outs GR64:$dst), (ins VR128:$src),
                       "movq\t{$src, $dst|$dst, $src}", []>, VEX, VEX_W;
+// Recognize "movd" with GR64 destination, but encode as a "movq"
+def VMOVQd64rr_alt : VPDI<0x7E, MRMDestReg, (outs GR64:$dst), (ins VR128:$src),
+                          "movd\t{$src, $dst|$dst, $src}", []>, VEX, VEX_W;
 }
 
 // Instructions for the disassembler
@@ -3214,19 +3025,14 @@ def LFENCE : I<0xAE, MRM_E8, (outs), (ins),
                "lfence", [(int_x86_sse2_lfence)]>, TB, Requires<[HasSSE2]>;
 def MFENCE : I<0xAE, MRM_F0, (outs), (ins),
                "mfence", [(int_x86_sse2_mfence)]>, TB, Requires<[HasSSE2]>;
+def : Pat<(X86LFence), (LFENCE)>;
+def : Pat<(X86MFence), (MFENCE)>;
+
 // Pause. This "instruction" is encoded as "rep; nop", so even though it
 // was introduced with SSE2, it's backward compatible.
 def PAUSE : I<0x90, RawFrm, (outs), (ins), "pause", []>, REP;
 
-//TODO: custom lower this so as to never even generate the noop
-def : Pat<(membarrier (i8 imm), (i8 imm), (i8 imm), (i8 imm),
-          (i8 0)), (NOOP)>;
-def : Pat<(membarrier (i8 0), (i8 0), (i8 0), (i8 1), (i8 1)), (SFENCE)>;
-def : Pat<(membarrier (i8 1), (i8 0), (i8 0), (i8 0), (i8 1)), (LFENCE)>;
-def : Pat<(membarrier (i8 imm), (i8 imm), (i8 imm), (i8 imm),
-          (i8 1)), (MFENCE)>;
-
 // Alias instructions that map zero vector to pxor / xorp* for sse.
 // We set canFoldAsLoad because this can be converted to a constant-pool
 // load of an all-ones value if folding it would be beneficial.
@@ -3240,19 +3046,46 @@ let isReMaterializable = 1, isAsCheapAsAMove = 1, canFoldAsLoad = 1,
 // SSE3 - Conversion Instructions
 //===---------------------------------------------------------------------===//
 
-let isAsmParserOnly = 1, Predicates = [HasAVX, HasSSE3] in {
+// Convert Packed Double FP to Packed DW Integers
+let isAsmParserOnly = 1, Predicates = [HasAVX] in {
+// The assembler can recognize rr 256-bit instructions by seeing a ymm
+// register, but the same isn't true when using memory operands instead.
+// Provide other assembly rr and rm forms to address this explicitly.
 def VCVTPD2DQrr  : S3DI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
                        "vcvtpd2dq\t{$src, $dst|$dst, $src}", []>, VEX;
-def VCVTDQ2PDrm  : S3SI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
-                       "vcvtdq2pd\t{$src, $dst|$dst, $src}", []>, VEX;
-def VCVTDQ2PDrr  : S3SI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
-                       "vcvtdq2pd\t{$src, $dst|$dst, $src}", []>, VEX;
+def VCVTPD2DQXrYr : S3DI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR256:$src),
+                        "vcvtpd2dq\t{$src, $dst|$dst, $src}", []>, VEX;
+
+// XMM only
+def VCVTPD2DQXrr : S3DI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
+                       "vcvtpd2dqx\t{$src, $dst|$dst, $src}", []>, VEX;
+def VCVTPD2DQXrm : S3DI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
+                       "vcvtpd2dqx\t{$src, $dst|$dst, $src}", []>, VEX;
+
+// YMM only
+def VCVTPD2DQYrr : S3DI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR256:$src),
+                       "vcvtpd2dqy\t{$src, $dst|$dst, $src}", []>, VEX;
+def VCVTPD2DQYrm : S3DI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins f256mem:$src),
+                       "vcvtpd2dqy\t{$src, $dst|$dst, $src}", []>, VEX, VEX_L;
 }
 
 def CVTPD2DQrm  : S3DI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
                        "cvtpd2dq\t{$src, $dst|$dst, $src}", []>;
 def CVTPD2DQrr  : S3DI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
                        "cvtpd2dq\t{$src, $dst|$dst, $src}", []>;
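[Editor's note, not part of the patch: the "ymm register" comment above is terse. With a register source the assembler infers the operand width from xmm vs. ymm, but a memory reference names no width, so the patch spells the memory forms vcvtpd2dqx/vcvtpd2dqy explicitly. A minimal, hypothetical TableGen sketch of the idea -- the Mnemonic class and CVT* names are illustrative only, runnable standalone with llvm-tblgen:]

// Hypothetical sketch: suffixed mnemonics disambiguate memory-operand width.
class Mnemonic<string asm> {
  string AsmString = asm;   // assembly spelling the parser matches
}
// rr forms: the xmm/ymm register operand already reveals the width.
def CVTrr128 : Mnemonic<"vcvtpd2dq $dst, $xmmsrc">;
def CVTrr256 : Mnemonic<"vcvtpd2dq $dst, $ymmsrc">;
// rm forms: a bare memory operand is ambiguous, so the mnemonic is suffixed.
def CVTrm128 : Mnemonic<"vcvtpd2dqx $dst, $mem">;
def CVTrm256 : Mnemonic<"vcvtpd2dqy $dst, $mem">;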
+
+// Convert Packed DW Integers to Packed Double FP
+let isAsmParserOnly = 1, Predicates = [HasAVX] in {
+def VCVTDQ2PDrm  : S3SI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
+                       "vcvtdq2pd\t{$src, $dst|$dst, $src}", []>, VEX;
+def VCVTDQ2PDrr  : S3SI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
+                       "vcvtdq2pd\t{$src, $dst|$dst, $src}", []>, VEX;
+def VCVTDQ2PDYrm : S3SI<0xE6, MRMSrcMem, (outs VR256:$dst), (ins f128mem:$src),
+                       "vcvtdq2pd\t{$src, $dst|$dst, $src}", []>, VEX;
+def VCVTDQ2PDYrr : S3SI<0xE6, MRMSrcReg, (outs VR256:$dst), (ins VR128:$src),
+                       "vcvtdq2pd\t{$src, $dst|$dst, $src}", []>, VEX;
+}
+
 def CVTDQ2PDrm  : S3SI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
                        "cvtdq2pd\t{$src, $dst|$dst, $src}", []>;
 def CVTDQ2PDrr  : S3SI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
                        "cvtdq2pd\t{$src, $dst|$dst, $src}", []>;
@@ -3274,9 +3107,20 @@ def rm : S3SI;
 }
 
-let isAsmParserOnly = 1, Predicates = [HasAVX, HasSSE3] in {
-defm VMOVSHDUP : sse3_replicate_sfp<0x16, movshdup, "vmovshdup">, VEX;
-defm VMOVSLDUP : sse3_replicate_sfp<0x12, movsldup, "vmovsldup">, VEX;
+multiclass sse3_replicate_sfp_y op, PatFrag rep_frag,
+                                string OpcodeStr> {
+def rr : S3SI;
+def rm : S3SI;
+}
+
+let isAsmParserOnly = 1, Predicates = [HasAVX] in {
+  // FIXME: Merge above classes when we have patterns for the ymm version
+  defm VMOVSHDUP  : sse3_replicate_sfp<0x16, movshdup, "vmovshdup">, VEX;
+  defm VMOVSLDUP  : sse3_replicate_sfp<0x12, movsldup, "vmovsldup">, VEX;
+  defm VMOVSHDUPY : sse3_replicate_sfp_y<0x16, movshdup, "vmovshdup">, VEX;
+  defm VMOVSLDUPY : sse3_replicate_sfp_y<0x12, movsldup, "vmovsldup">, VEX;
 }
 defm MOVSHDUP : sse3_replicate_sfp<0x16, movshdup, "movshdup">;
 defm MOVSLDUP : sse3_replicate_sfp<0x12, movsldup, "movsldup">;
@@ -3293,15 +3137,30 @@ def rm : S3DI<0x12, MRMSrcMem, (outs VR128:$dst), (ins f64mem:$src),
                                       (undef))))]>;
 }
 
-let isAsmParserOnly = 1, Predicates = [HasAVX, HasSSE3] in
-  defm VMOVDDUP : sse3_replicate_dfp<"vmovddup">, VEX;
+multiclass sse3_replicate_dfp_y {
+def rr  : S3DI<0x12, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src),
+               !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
+               []>;
+def rm  : S3DI<0x12, MRMSrcMem, (outs VR256:$dst), (ins f256mem:$src),
+               !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
+               []>;
+}
+
+let isAsmParserOnly = 1, Predicates = [HasAVX] in {
+  // FIXME: Merge above classes when we have patterns for the ymm version
+  defm VMOVDDUP  : sse3_replicate_dfp<"vmovddup">, VEX;
+  defm VMOVDDUPY : sse3_replicate_dfp_y<"vmovddup">, VEX;
+}
 defm MOVDDUP : sse3_replicate_dfp<"movddup">;
 
 // Move Unaligned Integer
-let isAsmParserOnly = 1 in
+let isAsmParserOnly = 1, Predicates = [HasAVX] in {
   def VLDDQUrm : S3DI<0xF0, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src),
                      "vlddqu\t{$src, $dst|$dst, $src}",
                      [(set VR128:$dst, (int_x86_sse3_ldu_dq addr:$src))]>, VEX;
+  def VLDDQUYrm : S3DI<0xF0, MRMSrcMem, (outs VR256:$dst), (ins i256mem:$src),
+                      "vlddqu\t{$src, $dst|$dst, $src}", []>, VEX;
+}
 def LDDQUrm : S3DI<0xF0, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src),
                    "lddqu\t{$src, $dst|$dst, $src}",
                    [(set VR128:$dst, (int_x86_sse3_ldu_dq addr:$src))]>;
@@ -3342,35 +3201,41 @@ let AddedComplexity = 20 in
 // SSE3 - Arithmetic
 //===---------------------------------------------------------------------===//
 
-multiclass sse3_addsub {
+multiclass sse3_addsub {
   def rr : I<0xD0, MRMSrcReg,
-             (outs VR128:$dst), (ins VR128:$src1, VR128:$src2),
+             (outs RC:$dst), (ins RC:$src1, RC:$src2),
             !if(Is2Addr,
                 !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
                 !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
-             [(set VR128:$dst, (Int VR128:$src1,
-                                VR128:$src2))]>;
+             [(set RC:$dst, (Int RC:$src1, RC:$src2))]>;
   def rm : I<0xD0, MRMSrcMem,
-             (outs VR128:$dst), (ins VR128:$src1, f128mem:$src2),
+             (outs RC:$dst), (ins RC:$src1, x86memop:$src2),
            !if(Is2Addr,
                !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
                !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
-             [(set VR128:$dst, (Int VR128:$src1,
-                                (memop addr:$src2)))]>;
-
+             [(set RC:$dst, (Int RC:$src1, (memop addr:$src2)))]>;
 }
 
-let isAsmParserOnly = 1, Predicates = [HasSSE3, HasAVX],
+let isAsmParserOnly = 1, Predicates = [HasAVX],
     ExeDomain = SSEPackedDouble in {
-  defm VADDSUBPS : sse3_addsub, XD,
-                   VEX_4V;
-  defm VADDSUBPD : sse3_addsub, OpSize,
-                   VEX_4V;
+  defm VADDSUBPS : sse3_addsub, XD, VEX_4V;
+  defm VADDSUBPD : sse3_addsub, OpSize, VEX_4V;
+  let Pattern = [] in {
+    defm VADDSUBPSY : sse3_addsub, XD, VEX_4V;
+    defm VADDSUBPDY : sse3_addsub, OpSize, VEX_4V;
+  }
 }
 let Constraints = "$src1 = $dst", Predicates = [HasSSE3],
     ExeDomain = SSEPackedDouble in {
-  defm ADDSUBPS : sse3_addsub, XD;
-  defm ADDSUBPD : sse3_addsub, TB, OpSize;
+  defm ADDSUBPS : sse3_addsub, XD;
+  defm ADDSUBPD : sse3_addsub, TB, OpSize;
}
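[Editor's note, not part of the patch: the refactoring pattern in this hunk -- a multiclass parameterized over the register class and memory operand, with "let Pattern = [] in" clearing the selection patterns on the 256-bit instantiations that have no ISel support yet -- can be modeled in a self-contained sketch. All names below are hypothetical; runnable with llvm-tblgen:]

// Hypothetical sketch of the RC/x86memop parameterization used above.
def op;                                  // stand-in dag operator
class InstBase {
  string AsmString = "";
  list<dag> Pattern = [];
}
class Inst<string asm, list<dag> pat> : InstBase {
  let AsmString = asm;
  let Pattern = pat;
}
// One multiclass yields both an rr and an rm record per instantiation.
multiclass addsub_sketch<string OpcodeStr> {
  def rr : Inst<!strconcat(OpcodeStr, " $dst, $src1, $src2"), [(op)]>;
  def rm : Inst<!strconcat(OpcodeStr, " $dst, $src1, $mem"),  [(op)]>;
}
// 128-bit forms keep their patterns; the ymm forms are stamped out with
// Pattern cleared, mirroring the "let Pattern = [] in" trick in the hunk.
defm VADDSUBPS : addsub_sketch<"vaddsubps">;      // VADDSUBPSrr, VADDSUBPSrm
let Pattern = [] in
  defm VADDSUBPSY : addsub_sketch<"vaddsubps">;   // asm-only 256-bit forms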
"haddpd", int_x86_sse3_hadd_pd>; - def HSUBPSrr : S3D_Intrr<0x7D, "hsubps", int_x86_sse3_hsub_ps>; - def HSUBPSrm : S3D_Intrm<0x7D, "hsubps", int_x86_sse3_hsub_ps>; - def HSUBPDrr : S3_Intrr <0x7D, "hsubpd", int_x86_sse3_hsub_pd>; - def HSUBPDrm : S3_Intrm <0x7D, "hsubpd", int_x86_sse3_hsub_pd>; + defm HADDPS : S3D_Int<0x7C, "haddps", v4f32, VR128, f128mem, + int_x86_sse3_hadd_ps>; + defm HADDPD : S3_Int<0x7C, "haddpd", v2f64, VR128, f128mem, + int_x86_sse3_hadd_pd>; + defm HSUBPS : S3D_Int<0x7D, "hsubps", v4f32, VR128, f128mem, + int_x86_sse3_hsub_ps>; + defm HSUBPD : S3_Int<0x7D, "hsubpd", v2f64, VR128, f128mem, + int_x86_sse3_hsub_pd>; } //===---------------------------------------------------------------------===// @@ -3456,7 +3335,7 @@ multiclass SS3I_unop_rm_int opc, string OpcodeStr, (bitconvert (mem_frag128 addr:$src))))]>, OpSize; } -let isAsmParserOnly = 1, Predicates = [HasAVX, HasSSE3] in { +let isAsmParserOnly = 1, Predicates = [HasAVX] in { defm VPABSB : SS3I_unop_rm_int<0x1C, "vpabsb", memopv8i8, memopv16i8, int_x86_ssse3_pabs_b, int_x86_ssse3_pabs_b_128>, VEX; @@ -3521,7 +3400,7 @@ multiclass SS3I_binop_rm_int opc, string OpcodeStr, (bitconvert (memopv16i8 addr:$src2))))]>, OpSize; } -let isAsmParserOnly = 1, Predicates = [HasAVX, HasSSE3] in { +let isAsmParserOnly = 1, Predicates = [HasAVX] in { let isCommutable = 0 in { defm VPHADDW : SS3I_binop_rm_int<0x01, "vphaddw", memopv4i16, memopv8i16, int_x86_ssse3_phadd_w, @@ -3645,7 +3524,7 @@ multiclass sse3_palign { []>, OpSize; } -let isAsmParserOnly = 1, Predicates = [HasAVX, HasSSE3] in +let isAsmParserOnly = 1, Predicates = [HasAVX] in defm VPALIGN : sse3_palign<"vpalignr", 0>, VEX_4V; let Constraints = "$src1 = $dst" in defm PALIGN : sse3_palign<"palignr">; @@ -3991,7 +3870,7 @@ multiclass SS41I_binop_rm_int8 opc, string OpcodeStr, Intrinsic IntId> { OpSize; } -let isAsmParserOnly = 1, Predicates = [HasAVX, HasSSE41] in { +let isAsmParserOnly = 1, Predicates = [HasAVX] in { defm VPMOVSXBW : SS41I_binop_rm_int8<0x20, "vpmovsxbw", int_x86_sse41_pmovsxbw>, VEX; defm VPMOVSXWD : SS41I_binop_rm_int8<0x23, "vpmovsxwd", int_x86_sse41_pmovsxwd>, @@ -4057,7 +3936,7 @@ multiclass SS41I_binop_rm_int4 opc, string OpcodeStr, Intrinsic IntId> { OpSize; } -let isAsmParserOnly = 1, Predicates = [HasAVX, HasSSE41] in { +let isAsmParserOnly = 1, Predicates = [HasAVX] in { defm VPMOVSXBD : SS41I_binop_rm_int4<0x21, "vpmovsxbd", int_x86_sse41_pmovsxbd>, VEX; defm VPMOVSXWQ : SS41I_binop_rm_int4<0x24, "vpmovsxwq", int_x86_sse41_pmovsxwq>, @@ -4098,7 +3977,7 @@ multiclass SS41I_binop_rm_int2 opc, string OpcodeStr, Intrinsic IntId> { OpSize; } -let isAsmParserOnly = 1, Predicates = [HasAVX, HasSSE41] in { +let isAsmParserOnly = 1, Predicates = [HasAVX] in { defm VPMOVSXBQ : SS41I_binop_rm_int2<0x22, "vpmovsxbq", int_x86_sse41_pmovsxbq>, VEX; defm VPMOVZXBQ : SS41I_binop_rm_int2<0x32, "vpmovzxbq", int_x86_sse41_pmovzxbq>, @@ -4140,7 +4019,7 @@ multiclass SS41I_extract8 opc, string OpcodeStr> { // (store (i8 (trunc (X86pextrb (v16i8 VR128:$src1), imm:$src2))), addr:$dst) } -let isAsmParserOnly = 1, Predicates = [HasAVX, HasSSE41] in +let isAsmParserOnly = 1, Predicates = [HasAVX] in defm VPEXTRB : SS41I_extract8<0x14, "vpextrb">, VEX; defm PEXTRB : SS41I_extract8<0x14, "pextrb">; @@ -4158,7 +4037,7 @@ multiclass SS41I_extract16 opc, string OpcodeStr> { // (store (i16 (trunc (X86pextrw (v16i8 VR128:$src1), imm:$src2))), addr:$dst) } -let isAsmParserOnly = 1, Predicates = [HasAVX, HasSSE41] in +let isAsmParserOnly = 1, Predicates = [HasAVX] in defm 
VPEXTRW : SS41I_extract16<0x15, "vpextrw">, VEX; defm PEXTRW : SS41I_extract16<0x15, "pextrw">; @@ -4180,7 +4059,7 @@ multiclass SS41I_extract32 opc, string OpcodeStr> { addr:$dst)]>, OpSize; } -let isAsmParserOnly = 1, Predicates = [HasAVX, HasSSE41] in +let isAsmParserOnly = 1, Predicates = [HasAVX] in defm VPEXTRD : SS41I_extract32<0x16, "vpextrd">, VEX; defm PEXTRD : SS41I_extract32<0x16, "pextrd">; @@ -4201,7 +4080,7 @@ multiclass SS41I_extract64 opc, string OpcodeStr> { addr:$dst)]>, OpSize, REX_W; } -let isAsmParserOnly = 1, Predicates = [HasAVX, HasSSE41] in +let isAsmParserOnly = 1, Predicates = [HasAVX] in defm VPEXTRQ : SS41I_extract64<0x16, "vpextrq">, VEX, VEX_W; defm PEXTRQ : SS41I_extract64<0x16, "pextrq">; @@ -4224,8 +4103,13 @@ multiclass SS41I_extractf32 opc, string OpcodeStr> { addr:$dst)]>, OpSize; } -let isAsmParserOnly = 1, Predicates = [HasAVX, HasSSE41] in +let isAsmParserOnly = 1, Predicates = [HasAVX] in { defm VEXTRACTPS : SS41I_extractf32<0x17, "vextractps">, VEX; + def VEXTRACTPSrr64 : SS4AIi8<0x17, MRMDestReg, (outs GR64:$dst), + (ins VR128:$src1, i32i8imm:$src2), + "vextractps \t{$src2, $src1, $dst|$dst, $src1, $src2}", + []>, OpSize, VEX; +} defm EXTRACTPS : SS41I_extractf32<0x17, "extractps">; // Also match an EXTRACTPS store when the store is done as f32 instead of i32. @@ -4259,7 +4143,7 @@ multiclass SS41I_insert8 opc, string asm, bit Is2Addr = 1> { imm:$src3))]>, OpSize; } -let isAsmParserOnly = 1, Predicates = [HasAVX, HasSSE41] in +let isAsmParserOnly = 1, Predicates = [HasAVX] in defm VPINSRB : SS41I_insert8<0x20, "vpinsrb", 0>, VEX_4V; let Constraints = "$src1 = $dst" in defm PINSRB : SS41I_insert8<0x20, "pinsrb">; @@ -4285,7 +4169,7 @@ multiclass SS41I_insert32 opc, string asm, bit Is2Addr = 1> { imm:$src3)))]>, OpSize; } -let isAsmParserOnly = 1, Predicates = [HasAVX, HasSSE41] in +let isAsmParserOnly = 1, Predicates = [HasAVX] in defm VPINSRD : SS41I_insert32<0x22, "vpinsrd", 0>, VEX_4V; let Constraints = "$src1 = $dst" in defm PINSRD : SS41I_insert32<0x22, "pinsrd">; @@ -4311,7 +4195,7 @@ multiclass SS41I_insert64 opc, string asm, bit Is2Addr = 1> { imm:$src3)))]>, OpSize; } -let isAsmParserOnly = 1, Predicates = [HasAVX, HasSSE41] in +let isAsmParserOnly = 1, Predicates = [HasAVX] in defm VPINSRQ : SS41I_insert64<0x22, "vpinsrq", 0>, VEX_4V, VEX_W; let Constraints = "$src1 = $dst" in defm PINSRQ : SS41I_insert64<0x22, "pinsrq">, REX_W; @@ -4344,7 +4228,7 @@ multiclass SS41I_insertf32 opc, string asm, bit Is2Addr = 1> { let Constraints = "$src1 = $dst" in defm INSERTPS : SS41I_insertf32<0x21, "insertps">; -let isAsmParserOnly = 1, Predicates = [HasAVX, HasSSE41] in +let isAsmParserOnly = 1, Predicates = [HasAVX] in defm VINSERTPS : SS41I_insertf32<0x21, "vinsertps", 0>, VEX_4V; def : Pat<(int_x86_sse41_insertps VR128:$src1, VR128:$src2, imm:$src3), @@ -4395,33 +4279,33 @@ multiclass sse41_fp_unop_rm opcps, bits<8> opcpd, OpSize; } -multiclass sse41_fp_unop_rm_avx opcps, bits<8> opcpd, - string OpcodeStr> { +multiclass sse41_fp_unop_rm_avx_p opcps, bits<8> opcpd, + RegisterClass RC, X86MemOperand x86memop, string OpcodeStr> { // Intrinsic operation, reg. 
// Vector intrinsic operation, reg def PSr : SS4AIi8, OpSize; // Vector intrinsic operation, mem def PSm : Ii8, TA, OpSize, Requires<[HasSSE41]>; // Vector intrinsic operation, reg def PDr : SS4AIi8, OpSize; // Vector intrinsic operation, mem def PDm : SS4AIi8, OpSize; @@ -4478,8 +4362,8 @@ multiclass sse41_fp_binop_rm opcss, bits<8> opcsd, OpSize; } -multiclass sse41_fp_binop_rm_avx opcss, bits<8> opcsd, - string OpcodeStr> { +multiclass sse41_fp_binop_rm_avx_s opcss, bits<8> opcsd, + string OpcodeStr> { // Intrinsic operation, reg. def SSr : SS4AIi8 opcss, bits<8> opcsd, } // FP round - roundss, roundps, roundsd, roundpd -let isAsmParserOnly = 1, Predicates = [HasAVX, HasSSE41] in { +let isAsmParserOnly = 1, Predicates = [HasAVX] in { // Intrinsic form defm VROUND : sse41_fp_unop_rm<0x08, 0x09, "vround", int_x86_sse41_round_ps, int_x86_sse41_round_pd>, @@ -4519,8 +4403,11 @@ let isAsmParserOnly = 1, Predicates = [HasAVX, HasSSE41] in { int_x86_sse41_round_ss, int_x86_sse41_round_sd, 0>, VEX_4V; // Instructions for the assembler - defm VROUND : sse41_fp_unop_rm_avx<0x08, 0x09, "vround">, VEX; - defm VROUND : sse41_fp_binop_rm_avx<0x0A, 0x0B, "vround">, VEX_4V; + defm VROUND : sse41_fp_unop_rm_avx_p<0x08, 0x09, VR128, f128mem, "vround">, + VEX; + defm VROUNDY : sse41_fp_unop_rm_avx_p<0x08, 0x09, VR256, f256mem, "vround">, + VEX; + defm VROUND : sse41_fp_binop_rm_avx_s<0x0A, 0x0B, "vround">, VEX_4V; } defm ROUND : sse41_fp_unop_rm<0x08, 0x09, "round", @@ -4529,6 +4416,57 @@ let Constraints = "$src1 = $dst" in defm ROUND : sse41_fp_binop_rm<0x0A, 0x0B, "round", int_x86_sse41_round_ss, int_x86_sse41_round_sd>; +//===----------------------------------------------------------------------===// +// SSE4.1 - Packed Bit Test +//===----------------------------------------------------------------------===// + +// ptest instruction we'll lower to this in X86ISelLowering primarily from +// the intel intrinsic that corresponds to this. 
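[Editor's note, not part of the patch: the hunk above instantiates "defm VROUND" twice, once for the packed forms and once for the scalar forms. That is legal TableGen as long as the concatenated record names stay unique, which the PSr/PDr vs. SSr/SDr suffixes guarantee. A hypothetical, self-contained sketch of why this works -- illustrative names only, runnable with llvm-tblgen:]

// Hypothetical sketch: one defm prefix, two multiclasses, unique records.
class RInst<string asm> { string AsmString = asm; }
multiclass round_packed_sketch<string OpcodeStr> {
  def PSr : RInst<!strconcat(OpcodeStr, "ps $dst, $src, $imm")>;
  def PDr : RInst<!strconcat(OpcodeStr, "pd $dst, $src, $imm")>;
}
multiclass round_scalar_sketch<string OpcodeStr> {
  def SSr : RInst<!strconcat(OpcodeStr, "ss $dst, $src1, $src2, $imm")>;
  def SDr : RInst<!strconcat(OpcodeStr, "sd $dst, $src1, $src2, $imm")>;
}
// Both defms reuse the prefix VROUND; the final record names
// (VROUNDPSr, VROUNDPDr, VROUNDSSr, VROUNDSDr) never collide.
defm VROUND : round_packed_sketch<"vround">;
defm VROUND : round_scalar_sketch<"vround">;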
+//===----------------------------------------------------------------------===//
+// SSE4.1 - Packed Bit Test
+//===----------------------------------------------------------------------===//
+
+// ptest instruction we'll lower to this in X86ISelLowering primarily from
+// the intel intrinsic that corresponds to this.
+let Defs = [EFLAGS], isAsmParserOnly = 1, Predicates = [HasAVX] in {
+def VPTESTrr : SS48I<0x17, MRMSrcReg, (outs), (ins VR128:$src1, VR128:$src2),
+               "vptest\t{$src2, $src1|$src1, $src2}",
+               [(set EFLAGS, (X86ptest VR128:$src1, VR128:$src2))]>,
+               OpSize, VEX;
+def VPTESTYrr : SS48I<0x17, MRMSrcReg, (outs), (ins VR256:$src1, VR256:$src2),
+                "vptest\t{$src2, $src1|$src1, $src2}", []>, OpSize, VEX;
+
+def VPTESTrm : SS48I<0x17, MRMSrcMem, (outs), (ins VR128:$src1, i128mem:$src2),
+               "vptest\t{$src2, $src1|$src1, $src2}",
+               [(set EFLAGS, (X86ptest VR128:$src1, (load addr:$src2)))]>,
+               OpSize, VEX;
+def VPTESTYrm : SS48I<0x17, MRMSrcMem, (outs), (ins VR256:$src1, i256mem:$src2),
+                "vptest\t{$src2, $src1|$src1, $src2}", []>, OpSize, VEX;
+}
+
+let Defs = [EFLAGS] in {
+def PTESTrr : SS48I<0x17, MRMSrcReg, (outs), (ins VR128:$src1, VR128:$src2),
+              "ptest \t{$src2, $src1|$src1, $src2}",
+              [(set EFLAGS, (X86ptest VR128:$src1, VR128:$src2))]>,
+              OpSize;
+def PTESTrm : SS48I<0x17, MRMSrcMem, (outs), (ins VR128:$src1, i128mem:$src2),
+              "ptest \t{$src2, $src1|$src1, $src2}",
+              [(set EFLAGS, (X86ptest VR128:$src1, (load addr:$src2)))]>,
+              OpSize;
+}
+
+// The bit test instructions below are AVX only
+multiclass avx_bittest opc, string OpcodeStr, RegisterClass RC,
+                       X86MemOperand x86memop> {
+  def rr : SS48I, OpSize, VEX;
+  def rm : SS48I, OpSize, VEX;
+}
+
+let Defs = [EFLAGS], isAsmParserOnly = 1, Predicates = [HasAVX] in {
+  defm VTESTPS  : avx_bittest<0x0E, "vtestps", VR128, f128mem>;
+  defm VTESTPSY : avx_bittest<0x0E, "vtestps", VR256, f256mem>;
+  defm VTESTPD  : avx_bittest<0x0F, "vtestpd", VR128, f128mem>;
+  defm VTESTPDY : avx_bittest<0x0F, "vtestpd", VR256, f256mem>;
+}
+
 //===----------------------------------------------------------------------===//
 // SSE4.1 - Misc Instructions
 //===----------------------------------------------------------------------===//
@@ -4548,7 +4486,7 @@ multiclass SS41I_unop_rm_int_v16 opc, string OpcodeStr,
                     (bitconvert (memopv8i16 addr:$src))))]>, OpSize;
 }
 
-let isAsmParserOnly = 1, Predicates = [HasAVX, HasSSE41] in
+let isAsmParserOnly = 1, Predicates = [HasAVX] in
 defm VPHMINPOSUW : SS41I_unop_rm_int_v16 <0x41, "vphminposuw",
                                           int_x86_sse41_phminposuw>, VEX;
 defm PHMINPOSUW : SS41I_unop_rm_int_v16 <0x41, "phminposuw",
@@ -4574,7 +4512,7 @@ multiclass SS41I_binop_rm_int opc, string OpcodeStr,
           (bitconvert (memopv16i8 addr:$src2))))]>, OpSize;
 }
 
-let isAsmParserOnly = 1, Predicates = [HasAVX, HasSSE41] in {
+let isAsmParserOnly = 1, Predicates = [HasAVX] in {
   let isCommutable = 0 in
   defm VPACKUSDW : SS41I_binop_rm_int<0x2B, "vpackusdw", int_x86_sse41_packusdw,
                                       0>, VEX_4V;
@@ -4641,86 +4579,105 @@ multiclass SS48I_binop_rm opc, string OpcodeStr, SDNode OpNode,
           OpSize;
 }
 
-let isAsmParserOnly = 1, Predicates = [HasAVX, HasSSE41] in
+let isAsmParserOnly = 1, Predicates = [HasAVX] in
   defm VPMULLD : SS48I_binop_rm<0x40, "vpmulld", mul, v4i32, 0>, VEX_4V;
 let Constraints = "$src1 = $dst" in
   defm PMULLD : SS48I_binop_rm<0x40, "pmulld", mul, v4i32>;
 
 /// SS41I_binop_rmi_int - SSE 4.1 binary operator with 8-bit immediate
 multiclass SS41I_binop_rmi_int opc, string OpcodeStr,
-                               Intrinsic IntId128, bit Is2Addr = 1> {
+                 Intrinsic IntId, RegisterClass RC, PatFrag memop_frag,
+                 X86MemOperand x86memop, bit Is2Addr = 1> {
   let isCommutable = 1 in
-  def rri : SS4AIi8,
+            [(set RC:$dst, (IntId RC:$src1, RC:$src2, imm:$src3))]>,
           OpSize;
-  def rmi : SS4AIi8,
+            [(set RC:$dst,
+              (IntId RC:$src1,
+               (bitconvert (memop_frag addr:$src2)), imm:$src3))]>,
          OpSize;
 }
 
-let isAsmParserOnly = 1, Predicates = [HasAVX, HasSSE41] in {
+let isAsmParserOnly = 1, Predicates = [HasAVX] in {
  let isCommutable = 0 in {
   defm VBLENDPS : SS41I_binop_rmi_int<0x0C, "vblendps", int_x86_sse41_blendps,
-                                      0>, VEX_4V;
+                                      VR128, memopv16i8, i128mem, 0>, VEX_4V;
   defm VBLENDPD : SS41I_binop_rmi_int<0x0D, "vblendpd", int_x86_sse41_blendpd,
-                                      0>, VEX_4V;
+                                      VR128, memopv16i8, i128mem, 0>, VEX_4V;
+  let Pattern = [] in {
+    defm VBLENDPSY : SS41I_binop_rmi_int<0x0C, "vblendps", int_x86_sse41_blendps,
+                                         VR256, memopv32i8, i256mem, 0>, VEX_4V;
+    defm VBLENDPDY : SS41I_binop_rmi_int<0x0D, "vblendpd", int_x86_sse41_blendpd,
+                                         VR256, memopv32i8, i256mem, 0>, VEX_4V;
+  }
   defm VPBLENDW : SS41I_binop_rmi_int<0x0E, "vpblendw", int_x86_sse41_pblendw,
-                                      0>, VEX_4V;
+                                      VR128, memopv16i8, i128mem, 0>, VEX_4V;
   defm VMPSADBW : SS41I_binop_rmi_int<0x42, "vmpsadbw", int_x86_sse41_mpsadbw,
-                                      0>, VEX_4V;
+                                      VR128, memopv16i8, i128mem, 0>, VEX_4V;
  }
  defm VDPPS : SS41I_binop_rmi_int<0x40, "vdpps", int_x86_sse41_dpps,
-                                  0>, VEX_4V;
+                                  VR128, memopv16i8, i128mem, 0>, VEX_4V;
  defm VDPPD : SS41I_binop_rmi_int<0x41, "vdppd", int_x86_sse41_dppd,
-                                  0>, VEX_4V;
+                                  VR128, memopv16i8, i128mem, 0>, VEX_4V;
+ let Pattern = [] in
+   defm VDPPSY : SS41I_binop_rmi_int<0x40, "vdpps", int_x86_sse41_dpps,
+                                     VR256, memopv32i8, i256mem, 0>, VEX_4V;
 }
 
 let Constraints = "$src1 = $dst" in {
  let isCommutable = 0 in {
-  defm BLENDPS : SS41I_binop_rmi_int<0x0C, "blendps", int_x86_sse41_blendps>;
-  defm BLENDPD : SS41I_binop_rmi_int<0x0D, "blendpd", int_x86_sse41_blendpd>;
-  defm PBLENDW : SS41I_binop_rmi_int<0x0E, "pblendw", int_x86_sse41_pblendw>;
-  defm MPSADBW : SS41I_binop_rmi_int<0x42, "mpsadbw", int_x86_sse41_mpsadbw>;
+  defm BLENDPS : SS41I_binop_rmi_int<0x0C, "blendps", int_x86_sse41_blendps,
+                                     VR128, memopv16i8, i128mem>;
+  defm BLENDPD : SS41I_binop_rmi_int<0x0D, "blendpd", int_x86_sse41_blendpd,
+                                     VR128, memopv16i8, i128mem>;
+  defm PBLENDW : SS41I_binop_rmi_int<0x0E, "pblendw", int_x86_sse41_pblendw,
                                     VR128, memopv16i8, i128mem>;
+  defm MPSADBW : SS41I_binop_rmi_int<0x42, "mpsadbw", int_x86_sse41_mpsadbw,
                                     VR128, memopv16i8, i128mem>;
 }
-  defm DPPS : SS41I_binop_rmi_int<0x40, "dpps", int_x86_sse41_dpps>;
-  defm DPPD : SS41I_binop_rmi_int<0x41, "dppd", int_x86_sse41_dppd>;
+  defm DPPS : SS41I_binop_rmi_int<0x40, "dpps", int_x86_sse41_dpps,
                                   VR128, memopv16i8, i128mem>;
+  defm DPPD : SS41I_binop_rmi_int<0x41, "dppd", int_x86_sse41_dppd,
                                   VR128, memopv16i8, i128mem>;
 }
 
 /// SS41I_quaternary_int_avx - AVX SSE 4.1 with 4 operators
-let isAsmParserOnly = 1, Predicates = [HasAVX, HasSSE41] in {
-  multiclass SS41I_quaternary_int_avx opc, string OpcodeStr> {
-    def rr : I,
-             OpSize, TA, VEX_4V, VEX_I8IMM;
+let isAsmParserOnly = 1, Predicates = [HasAVX] in {
+multiclass SS41I_quaternary_int_avx opc, string OpcodeStr,
+                                    RegisterClass RC, X86MemOperand x86memop> {
+  def rr : I,
+           OpSize, TA, VEX_4V, VEX_I8IMM;
 
-    def rm : I,
-             OpSize, TA, VEX_4V, VEX_I8IMM;
-  }
+  def rm : I,
+           OpSize, TA, VEX_4V, VEX_I8IMM;
+}
 }
 
-defm VBLENDVPD : SS41I_quaternary_int_avx<0x4B, "vblendvpd">;
-defm VBLENDVPS : SS41I_quaternary_int_avx<0x4A, "vblendvps">;
-defm VPBLENDVB : SS41I_quaternary_int_avx<0x4C, "vpblendvb">;
+defm VBLENDVPD  : SS41I_quaternary_int_avx<0x4B, "vblendvpd", VR128, i128mem>;
+defm VBLENDVPS  : SS41I_quaternary_int_avx<0x4A, "vblendvps", VR128, i128mem>;
+defm VBLENDVPDY : SS41I_quaternary_int_avx<0x4B, "vblendvpd", VR256, i256mem>;
+defm VBLENDVPSY : SS41I_quaternary_int_avx<0x4A, "vblendvps", VR256, i256mem>;
+
+defm VPBLENDVB  : SS41I_quaternary_int_avx<0x4C, "vpblendvb", VR128, i128mem>;
 
 /// SS41I_ternary_int - SSE 4.1 ternary operator
 let Uses = [XMM0], Constraints = "$src1 = $dst" in {
@@ -4746,31 +4703,7 @@ defm BLENDVPD : SS41I_ternary_int<0x15, "blendvpd", int_x86_sse41_blendvpd>;
 defm BLENDVPS : SS41I_ternary_int<0x14, "blendvps", int_x86_sse41_blendvps>;
 defm PBLENDVB : SS41I_ternary_int<0x10, "pblendvb", int_x86_sse41_pblendvb>;
 
-// ptest instruction we'll lower to this in X86ISelLowering primarily from
-// the intel intrinsic that corresponds to this.
-let Defs = [EFLAGS], isAsmParserOnly = 1, Predicates = [HasAVX, HasSSE41] in {
-def VPTESTrr : SS48I<0x17, MRMSrcReg, (outs), (ins VR128:$src1, VR128:$src2),
-               "vptest\t{$src2, $src1|$src1, $src2}",
-               [(set EFLAGS, (X86ptest VR128:$src1, VR128:$src2))]>,
-               OpSize, VEX;
-def VPTESTrm : SS48I<0x17, MRMSrcMem, (outs), (ins VR128:$src1, i128mem:$src2),
-               "vptest\t{$src2, $src1|$src1, $src2}",
-               [(set EFLAGS, (X86ptest VR128:$src1, (load addr:$src2)))]>,
-               OpSize, VEX;
-}
-
-let Defs = [EFLAGS] in {
-def PTESTrr : SS48I<0x17, MRMSrcReg, (outs), (ins VR128:$src1, VR128:$src2),
-              "ptest \t{$src2, $src1|$src1, $src2}",
-              [(set EFLAGS, (X86ptest VR128:$src1, VR128:$src2))]>,
-              OpSize;
-def PTESTrm : SS48I<0x17, MRMSrcMem, (outs), (ins VR128:$src1, i128mem:$src2),
-              "ptest \t{$src2, $src1|$src1, $src2}",
-              [(set EFLAGS, (X86ptest VR128:$src1, (load addr:$src2)))]>,
-              OpSize;
-}
-
-let isAsmParserOnly = 1, Predicates = [HasAVX, HasSSE41] in
+let isAsmParserOnly = 1, Predicates = [HasAVX] in
 def VMOVNTDQArm : SS48I<0x2A, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src),
                        "vmovntdqa\t{$src, $dst|$dst, $src}",
                        [(set VR128:$dst, (int_x86_sse41_movntdqa addr:$src))]>,
@@ -4804,7 +4737,7 @@ multiclass SS42I_binop_rm_int opc, string OpcodeStr,
        (bitconvert (memopv16i8 addr:$src2))))]>, OpSize;
 }
 
-let isAsmParserOnly = 1, Predicates = [HasAVX, HasSSE42] in
+let isAsmParserOnly = 1, Predicates = [HasAVX] in
   defm VPCMPGTQ : SS42I_binop_rm_int<0x37, "vpcmpgtq", int_x86_sse42_pcmpgtq,
                                      0>, VEX_4V;
 let Constraints = "$src1 = $dst" in
@@ -4834,7 +4767,7 @@ let Defs = [EFLAGS], usesCustomInserter = 1 in {
 }
 
 let Defs = [XMM0, EFLAGS], isAsmParserOnly = 1,
-    Predicates = [HasAVX, HasSSE42] in {
+    Predicates = [HasAVX] in {
   def VPCMPISTRM128rr : SS42AI<0x62, MRMSrcReg, (outs),
      (ins VR128:$src1, VR128:$src2, i8imm:$src3),
     "vpcmpistrm\t{$src3, $src2, $src1|$src1, $src2, $src3}", []>, OpSize, VEX;
@@ -4869,7 +4802,7 @@ let Defs = [EFLAGS], Uses = [EAX, EDX], usesCustomInserter = 1 in {
     OpSize;
 }
 
-let isAsmParserOnly = 1, Predicates = [HasAVX, HasSSE42],
+let isAsmParserOnly = 1, Predicates = [HasAVX],
     Defs = [XMM0, EFLAGS], Uses = [EAX, EDX] in {
   def VPCMPESTRM128rr : SS42AI<0x60, MRMSrcReg, (outs),
      (ins VR128:$src1, VR128:$src3, i8imm:$src5),
@@ -4904,7 +4837,7 @@ let Defs = [ECX, EFLAGS] in {
   }
 }
 
-let isAsmParserOnly = 1, Predicates = [HasAVX, HasSSE42] in {
+let isAsmParserOnly = 1, Predicates = [HasAVX] in {
 defm VPCMPISTRI  : SS42AI_pcmpistri, VEX;
 defm VPCMPISTRIA : SS42AI_pcmpistri,
@@ -4943,7 +4876,7 @@ let Defs = [ECX, EFLAGS], Uses = [EAX, EDX] in {
   }
 }
 
-let isAsmParserOnly = 1, Predicates = [HasAVX, HasSSE42] in {
+let isAsmParserOnly = 1, Predicates = [HasAVX] in {
 defm VPCMPESTRI  : SS42AI_pcmpestri, VEX;
 defm VPCMPESTRIA : SS42AI_pcmpestri,
@@ -5158,3 +5091,106 @@ def AESKEYGENASSIST128rm : AESAI<0xDF, MRMSrcMem, (outs VR128:$dst),
               (int_x86_aesni_aeskeygenassist (bitconvert (memopv2i64 addr:$src1)),
                                              imm:$src2))]>,
   OpSize;
+
+//===----------------------------------------------------------------------===//
+// AVX Instructions
+//===----------------------------------------------------------------------===//
+
+let isAsmParserOnly = 1 in {
+
+// Load from memory and broadcast to all elements of the destination operand
+class avx_broadcast opc, string OpcodeStr, RegisterClass RC,
+                    X86MemOperand x86memop> :
+  AVX8I, VEX;
+
+def VBROADCASTSS   : avx_broadcast<0x18, "vbroadcastss", VR128, f32mem>;
+def VBROADCASTSSY  : avx_broadcast<0x18, "vbroadcastss", VR256, f32mem>;
+def VBROADCASTSD   : avx_broadcast<0x19, "vbroadcastsd", VR256, f64mem>;
+def VBROADCASTF128 : avx_broadcast<0x1A, "vbroadcastf128", VR256, f128mem>;
+
+// Insert packed floating-point values
+def VINSERTF128rr : AVXAIi8<0x18, MRMSrcReg, (outs VR256:$dst),
+      (ins VR256:$src1, VR128:$src2, i8imm:$src3),
+      "vinsertf128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
+      []>, VEX_4V;
+def VINSERTF128rm : AVXAIi8<0x18, MRMSrcMem, (outs VR256:$dst),
+      (ins VR256:$src1, f128mem:$src2, i8imm:$src3),
+      "vinsertf128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
+      []>, VEX_4V;
+
+// Extract packed floating-point values
+def VEXTRACTF128rr : AVXAIi8<0x19, MRMDestReg, (outs VR128:$dst),
+      (ins VR256:$src1, i8imm:$src2),
+      "vextractf128\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+      []>, VEX;
+def VEXTRACTF128mr : AVXAIi8<0x19, MRMDestMem, (outs),
+      (ins f128mem:$dst, VR256:$src1, i8imm:$src2),
+      "vextractf128\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+      []>, VEX;
+
+// Conditional SIMD Packed Loads and Stores
+multiclass avx_movmask_rm opc_rm, bits<8> opc_mr, string OpcodeStr> {
+  def rm  : AVX8I, VEX_4V;
+  def Yrm : AVX8I, VEX_4V;
+  def mr  : AVX8I, VEX_4V;
+  def Ymr : AVX8I, VEX_4V;
+}
+
+defm VMASKMOVPS : avx_movmask_rm<0x2C, 0x2E, "vmaskmovps">;
+defm VMASKMOVPD : avx_movmask_rm<0x2D, 0x2F, "vmaskmovpd">;
+
+// Permute Floating-Point Values
+multiclass avx_permil opc_rm, bits<8> opc_rmi, string OpcodeStr,
+                      RegisterClass RC, X86MemOperand x86memop> {
+  def rr : AVX8I, VEX_4V;
+  def rm : AVX8I, VEX_4V;
+  def ri : AVXAIi8, VEX;
+  def mi : AVXAIi8, VEX;
+}
+
+defm VPERMILPS  : avx_permil<0x0C, 0x04, "vpermilps", VR128, f128mem>;
+defm VPERMILPSY : avx_permil<0x0C, 0x04, "vpermilps", VR256, f256mem>;
+defm VPERMILPD  : avx_permil<0x0D, 0x05, "vpermilpd", VR128, f128mem>;
+defm VPERMILPDY : avx_permil<0x0D, 0x05, "vpermilpd", VR256, f256mem>;
+
+def VPERM2F128rr : AVXAIi8<0x06, MRMSrcReg, (outs VR256:$dst),
+      (ins VR256:$src1, VR256:$src2, i8imm:$src3),
+      "vperm2f128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
+      []>, VEX_4V;
+def VPERM2F128rm : AVXAIi8<0x06, MRMSrcMem, (outs VR256:$dst),
+      (ins VR256:$src1, f256mem:$src2, i8imm:$src3),
+      "vperm2f128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
+      []>, VEX_4V;
+
+// Zero All YMM registers
+def VZEROALL : I<0x77, RawFrm, (outs), (ins), "vzeroall", []>, VEX, VEX_L;
+
+// Zero Upper bits of YMM registers
+def VZEROUPPER : I<0x77, RawFrm, (outs), (ins), "vzeroupper", []>, VEX;
+
+} // isAsmParserOnly
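[Editor's note, not part of the patch: the AVX section above leans on one recurring idiom -- a single class or multiclass parameterized by register class and memory operand, instantiated once per vector width, as avx_broadcast does for VBROADCASTSS/VBROADCASTSSY/VBROADCASTF128. A hypothetical, self-contained model of that idiom, with illustrative names only, runnable with llvm-tblgen:]

// Hypothetical sketch of the width-parameterized definition style above.
class RCSketch<int bits> { int Width = bits; }
def VR128sketch : RCSketch<128>;
def VR256sketch : RCSketch<256>;
class MemOpSketch<int bits> { int Width = bits; }
def f32memSketch  : MemOpSketch<32>;
def f128memSketch : MemOpSketch<128>;
// One class captures opcode, mnemonic, destination class, and source memory
// operand; each def below is one width of the same broadcast operation.
class BroadcastSketch<int opc, string asm, RCSketch rc, MemOpSketch mem> {
  int Opcode = opc;
  string AsmString = asm;
  RCSketch DstClass = rc;
  MemOpSketch SrcMem = mem;
}
def BCASTSS   : BroadcastSketch<0x18, "vbroadcastss", VR128sketch, f32memSketch>;
def BCASTSSY  : BroadcastSketch<0x18, "vbroadcastss", VR256sketch, f32memSketch>;
def BCASTF128 : BroadcastSketch<0x1A, "vbroadcastf128", VR256sketch, f128memSketch>;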