From 2038252c6a36efd18cc0bef216fa2c5bb9236617 Mon Sep 17 00:00:00 2001 From: Dan Gohman Date: Tue, 10 Jul 2007 00:05:58 +0000 Subject: [PATCH] Define non-intrinsic instructions for vector min, max, sqrt, rsqrt, and rcp, in addition to the intrinsic forms. Add spill-folding entries for these new instructions, and for the scalar min and max intrinsic instructions which were missing. And add some preliminary ISelLowering code for using the new non-intrinsic vector sqrt instruction, and fneg and fabs. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@38478 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Target/X86/X86ISelLowering.cpp | 57 +++- lib/Target/X86/X86ISelLowering.h | 6 + lib/Target/X86/X86InstrSSE.td | 510 +++++++++++++++++++---------- lib/Target/X86/X86RegisterInfo.cpp | 20 ++ 4 files changed, 398 insertions(+), 195 deletions(-) diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp index b8dad13ee0f..3bf2b9f6c5d 100644 --- a/lib/Target/X86/X86ISelLowering.cpp +++ b/lib/Target/X86/X86ISelLowering.cpp @@ -331,6 +331,13 @@ X86TargetLowering::X86TargetLowering(TargetMachine &TM) setOperationAction(ISD::VECTOR_SHUFFLE, (MVT::ValueType)VT, Expand); setOperationAction(ISD::EXTRACT_VECTOR_ELT, (MVT::ValueType)VT, Expand); setOperationAction(ISD::INSERT_VECTOR_ELT, (MVT::ValueType)VT, Expand); + setOperationAction(ISD::FABS, (MVT::ValueType)VT, Expand); + setOperationAction(ISD::FSIN, (MVT::ValueType)VT, Expand); + setOperationAction(ISD::FCOS, (MVT::ValueType)VT, Expand); + setOperationAction(ISD::FREM, (MVT::ValueType)VT, Expand); + setOperationAction(ISD::FPOWI, (MVT::ValueType)VT, Expand); + setOperationAction(ISD::FSQRT, (MVT::ValueType)VT, Expand); + setOperationAction(ISD::FCOPYSIGN, (MVT::ValueType)VT, Expand); } if (Subtarget->hasMMX()) { @@ -408,6 +415,9 @@ X86TargetLowering::X86TargetLowering(TargetMachine &TM) setOperationAction(ISD::FSUB, MVT::v4f32, Legal); setOperationAction(ISD::FMUL, MVT::v4f32, Legal); 
setOperationAction(ISD::FDIV, MVT::v4f32, Legal); + setOperationAction(ISD::FSQRT, MVT::v4f32, Legal); + setOperationAction(ISD::FNEG, MVT::v4f32, Custom); + setOperationAction(ISD::FABS, MVT::v4f32, Custom); setOperationAction(ISD::LOAD, MVT::v4f32, Legal); setOperationAction(ISD::BUILD_VECTOR, MVT::v4f32, Custom); setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v4f32, Custom); @@ -435,6 +445,9 @@ X86TargetLowering::X86TargetLowering(TargetMachine &TM) setOperationAction(ISD::FSUB, MVT::v2f64, Legal); setOperationAction(ISD::FMUL, MVT::v2f64, Legal); setOperationAction(ISD::FDIV, MVT::v2f64, Legal); + setOperationAction(ISD::FSQRT, MVT::v2f64, Legal); + setOperationAction(ISD::FNEG, MVT::v2f64, Custom); + setOperationAction(ISD::FABS, MVT::v2f64, Custom); setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v16i8, Custom); setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v8i16, Custom); @@ -3326,16 +3339,21 @@ SDOperand X86TargetLowering::LowerFP_TO_SINT(SDOperand Op, SelectionDAG &DAG) { SDOperand X86TargetLowering::LowerFABS(SDOperand Op, SelectionDAG &DAG) { MVT::ValueType VT = Op.getValueType(); - const Type *OpNTy = MVT::getTypeForValueType(VT); + MVT::ValueType EltVT = VT; + if (MVT::isVector(VT)) + EltVT = MVT::getVectorElementType(VT); + const Type *OpNTy = MVT::getTypeForValueType(EltVT); std::vector CV; - if (VT == MVT::f64) { - CV.push_back(ConstantFP::get(OpNTy, BitsToDouble(~(1ULL << 63)))); - CV.push_back(ConstantFP::get(OpNTy, 0.0)); + if (EltVT == MVT::f64) { + Constant *C = ConstantFP::get(OpNTy, BitsToDouble(~(1ULL << 63))); + CV.push_back(C); + CV.push_back(C); } else { - CV.push_back(ConstantFP::get(OpNTy, BitsToFloat(~(1U << 31)))); - CV.push_back(ConstantFP::get(OpNTy, 0.0)); - CV.push_back(ConstantFP::get(OpNTy, 0.0)); - CV.push_back(ConstantFP::get(OpNTy, 0.0)); + Constant *C = ConstantFP::get(OpNTy, BitsToFloat(~(1U << 31))); + CV.push_back(C); + CV.push_back(C); + CV.push_back(C); + CV.push_back(C); } Constant *CS = ConstantStruct::get(CV); 
SDOperand CPIdx = DAG.getConstantPool(CS, getPointerTy(), 4); @@ -3350,16 +3368,21 @@ SDOperand X86TargetLowering::LowerFABS(SDOperand Op, SelectionDAG &DAG) { SDOperand X86TargetLowering::LowerFNEG(SDOperand Op, SelectionDAG &DAG) { MVT::ValueType VT = Op.getValueType(); - const Type *OpNTy = MVT::getTypeForValueType(VT); + MVT::ValueType EltVT = VT; + if (MVT::isVector(VT)) + EltVT = MVT::getVectorElementType(VT); + const Type *OpNTy = MVT::getTypeForValueType(EltVT); std::vector CV; - if (VT == MVT::f64) { - CV.push_back(ConstantFP::get(OpNTy, BitsToDouble(1ULL << 63))); - CV.push_back(ConstantFP::get(OpNTy, 0.0)); + if (EltVT == MVT::f64) { + Constant *C = ConstantFP::get(OpNTy, BitsToDouble(1ULL << 63)); + CV.push_back(C); + CV.push_back(C); } else { - CV.push_back(ConstantFP::get(OpNTy, BitsToFloat(1U << 31))); - CV.push_back(ConstantFP::get(OpNTy, 0.0)); - CV.push_back(ConstantFP::get(OpNTy, 0.0)); - CV.push_back(ConstantFP::get(OpNTy, 0.0)); + Constant *C = ConstantFP::get(OpNTy, BitsToFloat(1U << 31)); + CV.push_back(C); + CV.push_back(C); + CV.push_back(C); + CV.push_back(C); } Constant *CS = ConstantStruct::get(CV); SDOperand CPIdx = DAG.getConstantPool(CS, getPointerTy(), 4); @@ -4284,6 +4307,8 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const { case X86ISD::PINSRW: return "X86ISD::PINSRW"; case X86ISD::FMAX: return "X86ISD::FMAX"; case X86ISD::FMIN: return "X86ISD::FMIN"; + case X86ISD::FRSQRT: return "X86ISD::FRSQRT"; + case X86ISD::FRCP: return "X86ISD::FRCP"; case X86ISD::TLSADDR: return "X86ISD::TLSADDR"; case X86ISD::THREAD_POINTER: return "X86ISD::THREAD_POINTER"; } diff --git a/lib/Target/X86/X86ISelLowering.h b/lib/Target/X86/X86ISelLowering.h index 00d93755bd1..b9aaefa5c87 100644 --- a/lib/Target/X86/X86ISelLowering.h +++ b/lib/Target/X86/X86ISelLowering.h @@ -177,6 +177,12 @@ namespace llvm { /// FMAX, FMIN - Floating point max and min. 
/// FMAX, FMIN, + + /// FRSQRT, FRCP - Floating point reciprocal-sqrt and reciprocal + /// approximation. Note that these typically require refinement + /// in order to obtain suitable precision. + FRSQRT, FRCP, + // Thread Local Storage TLSADDR, THREAD_POINTER }; diff --git a/lib/Target/X86/X86InstrSSE.td b/lib/Target/X86/X86InstrSSE.td index 2cbd31e5ea4..5fc7a65a084 100644 --- a/lib/Target/X86/X86InstrSSE.td +++ b/lib/Target/X86/X86InstrSSE.td @@ -31,6 +31,8 @@ def X86for : SDNode<"X86ISD::FOR", SDTFPBinOp, [SDNPCommutative, SDNPAssociative]>; def X86fxor : SDNode<"X86ISD::FXOR", SDTFPBinOp, [SDNPCommutative, SDNPAssociative]>; +def X86frsqrt : SDNode<"X86ISD::FRSQRT", SDTFPUnaryOp>; +def X86frcp : SDNode<"X86ISD::FRCP", SDTFPUnaryOp>; def X86fsrl : SDNode<"X86ISD::FSRL", SDTX86FPShiftOp>; def X86comi : SDNode<"X86ISD::COMI", SDTX86CmpTest, [SDNPHasChain, SDNPOutFlag]>; @@ -247,16 +249,6 @@ class PSI o, Format F, dag ops, string asm, list pattern> class PSIi8 o, Format F, dag ops, string asm, list pattern> : Ii8, TB, Requires<[HasSSE1]>; -// Helpers for defining instructions that directly correspond to intrinsics. -multiclass SS_IntUnary o, string OpcodeStr, Intrinsic IntId> { - def r : SSI; - def m : SSI; -} - // Move Instructions def MOVSSrr : SSI<0x10, MRMSrcReg, (ops FR32:$dst, FR32:$src), "movss {$src, $dst|$dst, $src}", []>; @@ -267,18 +259,6 @@ def MOVSSmr : SSI<0x11, MRMDestMem, (ops f32mem:$dst, FR32:$src), "movss {$src, $dst|$dst, $src}", [(store FR32:$src, addr:$dst)]>; -def SQRTSSr : SSI<0x51, MRMSrcReg, (ops FR32:$dst, FR32:$src), - "sqrtss {$src, $dst|$dst, $src}", - [(set FR32:$dst, (fsqrt FR32:$src))]>; -def SQRTSSm : SSI<0x51, MRMSrcMem, (ops FR32:$dst, f32mem:$src), - "sqrtss {$src, $dst|$dst, $src}", - [(set FR32:$dst, (fsqrt (loadf32 addr:$src)))]>; - -// Aliases to match intrinsics which expect XMM operand(s). 
-defm SQRTSS_Int : SS_IntUnary<0x51, "sqrtss" , int_x86_sse_sqrt_ss>; -defm RSQRTSS_Int : SS_IntUnary<0x52, "rsqrtss", int_x86_sse_rsqrt_ss>; -defm RCPSS_Int : SS_IntUnary<0x53, "rcpss" , int_x86_sse_rcp_ss>; - // Conversion instructions def CVTTSS2SIrr : SSI<0x2C, MRMSrcReg, (ops GR32:$dst, FR32:$src), "cvttss2si {$src, $dst|$dst, $src}", @@ -425,20 +405,20 @@ def FsANDNPSrm : PSI<0x55, MRMSrcMem, "andnps {$src2, $dst|$dst, $src2}", []>; } -/// scalar_sse1_fp_binop_rm - Scalar SSE1 binops come in three basic forms: -/// -/// 1. f32 - This comes in SSE1 form for floats. -/// 2. rr vs rm - They include a reg+reg form and a reg+mem form. +/// basic_sse1_fp_binop_rm - SSE1 binops come in both scalar and vector forms. +/// +/// In addition, we also have a special variant of the scalar form here to +/// represent the associated intrinsic operation. This form is unlike the +/// plain scalar form, in that it takes an entire vector (instead of a scalar) +/// and leaves the top elements undefined. /// -/// In addition, scalar SSE ops have an intrinsic form. This form is unlike the -/// normal form, in that they take an entire vector (instead of a scalar) and -/// leave the top elements undefined. This adds another two variants of the -/// above permutations, giving us 8 forms for 'instruction'. +/// These three forms can each be reg+reg or reg+mem, so there are a total of +/// six "instructions". /// let isTwoAddress = 1 in { -multiclass scalar_sse1_fp_binop_rm opc, string OpcodeStr, - SDNode OpNode, Intrinsic F32Int, - bit Commutable = 0> { +multiclass basic_sse1_fp_binop_rm opc, string OpcodeStr, + SDNode OpNode, Intrinsic F32Int, + bit Commutable = 0> { // Scalar operation, reg+reg. def SSrr : SSI opc, string OpcodeStr, !strconcat(OpcodeStr, "ss {$src2, $dst|$dst, $src2}"), [(set FR32:$dst, (OpNode FR32:$src1, (load addr:$src2)))]>; - // Vector intrinsic operation, reg+reg. + // Vector operation, reg+reg. 
+ def PSrr : PSI { + let isCommutable = Commutable; + } + + // Vector operation, reg+mem. + def PSrm : PSI; + + // Intrinsic operation, reg+reg. def SSrr_Int : SSI { let isCommutable = Commutable; } - // Vector intrinsic operation, reg+mem. + // Intrinsic operation, reg+mem. def SSrm_Int : SSI opc, string OpcodeStr, } // Arithmetic instructions -defm ADD : scalar_sse1_fp_binop_rm<0x58, "add", fadd, int_x86_sse_add_ss, 1>; -defm MUL : scalar_sse1_fp_binop_rm<0x59, "mul", fmul, int_x86_sse_mul_ss, 1>; -defm SUB : scalar_sse1_fp_binop_rm<0x5C, "sub", fsub, int_x86_sse_sub_ss>; -defm DIV : scalar_sse1_fp_binop_rm<0x5E, "div", fdiv, int_x86_sse_div_ss>; +defm ADD : basic_sse1_fp_binop_rm<0x58, "add", fadd, int_x86_sse_add_ss, 1>; +defm MUL : basic_sse1_fp_binop_rm<0x59, "mul", fmul, int_x86_sse_mul_ss, 1>; +defm SUB : basic_sse1_fp_binop_rm<0x5C, "sub", fsub, int_x86_sse_sub_ss>; +defm DIV : basic_sse1_fp_binop_rm<0x5E, "div", fdiv, int_x86_sse_div_ss>; + +/// sse1_fp_binop_rm - Other SSE1 binops +/// +/// This multiclass is like basic_sse1_fp_binop_rm, with the addition of +/// instructions for a full-vector intrinsic form. Operations that map +/// onto C operators don't use this form since they just use the plain +/// vector form instead of having a separate vector intrinsic form. +/// +/// This provides a total of eight "instructions". +/// +let isTwoAddress = 1 in { +multiclass sse1_fp_binop_rm opc, string OpcodeStr, + SDNode OpNode, + Intrinsic F32Int, + Intrinsic V4F32Int, + bit Commutable = 0> { + + // Scalar operation, reg+reg. + def SSrr : SSI { + let isCommutable = Commutable; + } + + // Scalar operation, reg+mem. + def SSrm : SSI; + + // Vector operation, reg+reg. + def PSrr : PSI { + let isCommutable = Commutable; + } + + // Vector operation, reg+mem. + def PSrm : PSI; + + // Intrinsic operation, reg+reg. + def SSrr_Int : SSI { + let isCommutable = Commutable; + } + + // Intrinsic operation, reg+mem. 
+ def SSrm_Int : SSI; + + // Vector intrinsic operation, reg+reg. + def PSrr_Int : PSI { + let isCommutable = Commutable; + } + + // Vector intrinsic operation, reg+mem. + def PSrm_Int : PSI; +} +} -defm MAX : scalar_sse1_fp_binop_rm<0x5F, "max", X86fmax, int_x86_sse_max_ss>; -defm MIN : scalar_sse1_fp_binop_rm<0x5D, "min", X86fmin, int_x86_sse_min_ss>; +defm MAX : sse1_fp_binop_rm<0x5F, "max", X86fmax, + int_x86_sse_max_ss, int_x86_sse_max_ps>; +defm MIN : sse1_fp_binop_rm<0x5D, "min", X86fmin, + int_x86_sse_min_ss, int_x86_sse_min_ps>; //===----------------------------------------------------------------------===// // SSE packed FP Instructions @@ -550,70 +611,85 @@ def MOVHLPSrr : PSI<0x12, MRMSrcReg, (ops VR128:$dst, VR128:$src1, VR128:$src2), -/// packed_sse1_fp_binop_rm - Packed SSE binops come in three basic forms: -/// 1. v4f32 - This comes in SSE1 form for float. -/// 2. rr vs rm - They include a reg+reg form and a ref+mem form. +// Arithmetic + +/// sse1_fp_unop_rm - SSE1 unops come in both scalar and vector forms. /// -let isTwoAddress = 1 in { -multiclass packed_sse1_fp_binop_rm opc, string OpcodeStr, - SDNode OpNode, bit Commutable = 0> { - // Packed operation, reg+reg. - def PSrr : PSI { +/// In addition, we also have a special variant of the scalar form here to +/// represent the associated intrinsic operation. This form is unlike the +/// plain scalar form, in that it takes an entire vector (instead of a +/// scalar) and leaves the top elements undefined. +/// +/// And, we have a special variant form for a full-vector intrinsic form. +/// +/// These four forms can each have a reg or a mem operand, so there are a +/// total of eight "instructions". +/// +multiclass sse1_fp_unop_rm opc, string OpcodeStr, + SDNode OpNode, + Intrinsic F32Int, + Intrinsic V4F32Int, + bit Commutable = 0> { + // Scalar operation, reg. + def SSr : SSI { let isCommutable = Commutable; } - // Packed operation, reg+mem. 
- def PSrm : PSI; -} -} - -defm ADD : packed_sse1_fp_binop_rm<0x58, "add", fadd, 1>; -defm MUL : packed_sse1_fp_binop_rm<0x59, "mul", fmul, 1>; -defm DIV : packed_sse1_fp_binop_rm<0x5E, "div", fdiv>; -defm SUB : packed_sse1_fp_binop_rm<0x5C, "sub", fsub>; - -// Arithmetic + // Scalar operation, mem. + def SSm : SSI; + + // Vector operation, reg. + def PSr : PSI { + let isCommutable = Commutable; + } -class PS_Intr o, string OpcodeStr, Intrinsic IntId> - : PSI; -class PS_Intm o, string OpcodeStr, Intrinsic IntId> - : PSI; - -class PS_Intrr o, string OpcodeStr, Intrinsic IntId> - : PSI; -class PS_Intrm o, string OpcodeStr, Intrinsic IntId> - : PSI; + // Vector operation, mem. + def PSm : PSI; -def SQRTPSr : PS_Intr<0x51, "sqrtps", int_x86_sse_sqrt_ps>; -def SQRTPSm : PS_Intm<0x51, "sqrtps", int_x86_sse_sqrt_ps>; + // Intrinsic operation, reg. + def SSr_Int : SSI { + let isCommutable = Commutable; + } -def RSQRTPSr : PS_Intr<0x52, "rsqrtps", int_x86_sse_rsqrt_ps>; -def RSQRTPSm : PS_Intm<0x52, "rsqrtps", int_x86_sse_rsqrt_ps>; -def RCPPSr : PS_Intr<0x53, "rcpps", int_x86_sse_rcp_ps>; -def RCPPSm : PS_Intm<0x53, "rcpps", int_x86_sse_rcp_ps>; + // Intrinsic operation, mem. + def SSm_Int : SSI; -let isTwoAddress = 1 in { - let isCommutable = 1 in { - def MAXPSrr : PS_Intrr<0x5F, "maxps", int_x86_sse_max_ps>; - def MINPSrr : PS_Intrr<0x5D, "minps", int_x86_sse_min_ps>; + // Vector intrinsic operation, reg + def PSr_Int : PSI { + let isCommutable = Commutable; } - def MAXPSrm : PS_Intrm<0x5F, "maxps", int_x86_sse_max_ps>; - def MINPSrm : PS_Intrm<0x5D, "minps", int_x86_sse_min_ps>; + // Vector intrinsic operation, mem + def PSm_Int : PSI; } +// Square root. +defm SQRT : sse1_fp_unop_rm<0x51, "sqrt", fsqrt, + int_x86_sse_sqrt_ss, int_x86_sse_sqrt_ps>; + +// Reciprocal approximations. Note that these typically require refinement +// in order to obtain suitable precision. 
+defm RSQRT : sse1_fp_unop_rm<0x52, "rsqrt", X86frsqrt, + int_x86_sse_rsqrt_ss, int_x86_sse_rsqrt_ps>; +defm RCP : sse1_fp_unop_rm<0x53, "rcp", X86frcp, + int_x86_sse_rcp_ss, int_x86_sse_rcp_ps>; + // Logical let isTwoAddress = 1 in { let isCommutable = 1 in { @@ -835,16 +911,6 @@ class PDI o, Format F, dag ops, string asm, list pattern> class PDIi8 o, Format F, dag ops, string asm, list pattern> : Ii8, TB, OpSize, Requires<[HasSSE2]>; -// Helpers for defining instructions that directly correspond to intrinsics. -multiclass SD_IntUnary o, string OpcodeStr, Intrinsic IntId> { - def r : SDI; - def m : SDI; -} - // Move Instructions def MOVSDrr : SDI<0x10, MRMSrcReg, (ops FR64:$dst, FR64:$src), "movsd {$src, $dst|$dst, $src}", []>; @@ -855,16 +921,6 @@ def MOVSDmr : SDI<0x11, MRMDestMem, (ops f64mem:$dst, FR64:$src), "movsd {$src, $dst|$dst, $src}", [(store FR64:$src, addr:$dst)]>; -def SQRTSDr : SDI<0x51, MRMSrcReg, (ops FR64:$dst, FR64:$src), - "sqrtsd {$src, $dst|$dst, $src}", - [(set FR64:$dst, (fsqrt FR64:$src))]>; -def SQRTSDm : SDI<0x51, MRMSrcMem, (ops FR64:$dst, f64mem:$src), - "sqrtsd {$src, $dst|$dst, $src}", - [(set FR64:$dst, (fsqrt (loadf64 addr:$src)))]>; - -// Aliases to match intrinsics which expect XMM operand(s). -defm SQRTSD_Int : SD_IntUnary<0x51, "sqrtsd" , int_x86_sse2_sqrt_sd>; - // Conversion instructions def CVTTSD2SIrr : SDI<0x2C, MRMSrcReg, (ops GR32:$dst, FR64:$src), "cvttsd2si {$src, $dst|$dst, $src}", @@ -1013,20 +1069,20 @@ def FsANDNPDrm : PDI<0x55, MRMSrcMem, "andnpd {$src2, $dst|$dst, $src2}", []>; } -/// scalar_sse2_fp_binop_rm - Scalar SSE2 binops come in three basic forms: -/// -/// 1. f64 - This comes in SSE2 form for doubles. -/// 2. rr vs rm - They include a reg+reg form and a reg+mem form. +/// basic_sse2_fp_binop_rm - SSE2 binops come in both scalar and vector forms. +/// +/// In addition, we also have a special variant of the scalar form here to +/// represent the associated intrinsic operation. 
This form is unlike the +/// plain scalar form, in that it takes an entire vector (instead of a scalar) +/// and leaves the top elements undefined. /// -/// In addition, scalar SSE ops have an intrinsic form. This form is unlike the -/// normal form, in that they take an entire vector (instead of a scalar) and -/// leave the top elements undefined. This adds another two variants of the -/// above permutations, giving us 8 forms for 'instruction'. +/// These three forms can each be reg+reg or reg+mem, so there are a total of +/// six "instructions". /// let isTwoAddress = 1 in { -multiclass scalar_sse2_fp_binop_rm opc, string OpcodeStr, - SDNode OpNode, Intrinsic F64Int, - bit Commutable = 0> { +multiclass basic_sse2_fp_binop_rm opc, string OpcodeStr, + SDNode OpNode, Intrinsic F64Int, + bit Commutable = 0> { // Scalar operation, reg+reg. def SDrr : SDI opc, string OpcodeStr, !strconcat(OpcodeStr, "sd {$src2, $dst|$dst, $src2}"), [(set FR64:$dst, (OpNode FR64:$src1, (load addr:$src2)))]>; - // Vector intrinsic operation, reg+reg. + // Vector operation, reg+reg. + def PDrr : PDI { + let isCommutable = Commutable; + } + + // Vector operation, reg+mem. + def PDrm : PDI; + + // Intrinsic operation, reg+reg. def SDrr_Int : SDI { let isCommutable = Commutable; } - // Vector intrinsic operation, reg+mem. + // Intrinsic operation, reg+mem. 
def SDrm_Int : SDI opc, string OpcodeStr, } // Arithmetic instructions -defm ADD : scalar_sse2_fp_binop_rm<0x58, "add", fadd, int_x86_sse2_add_sd, 1>; -defm MUL : scalar_sse2_fp_binop_rm<0x59, "mul", fmul, int_x86_sse2_mul_sd, 1>; -defm SUB : scalar_sse2_fp_binop_rm<0x5C, "sub", fsub, int_x86_sse2_sub_sd>; -defm DIV : scalar_sse2_fp_binop_rm<0x5E, "div", fdiv, int_x86_sse2_div_sd>; +defm ADD : basic_sse2_fp_binop_rm<0x58, "add", fadd, int_x86_sse2_add_sd, 1>; +defm MUL : basic_sse2_fp_binop_rm<0x59, "mul", fmul, int_x86_sse2_mul_sd, 1>; +defm SUB : basic_sse2_fp_binop_rm<0x5C, "sub", fsub, int_x86_sse2_sub_sd>; +defm DIV : basic_sse2_fp_binop_rm<0x5E, "div", fdiv, int_x86_sse2_div_sd>; -defm MAX : scalar_sse2_fp_binop_rm<0x5F, "max", X86fmax, int_x86_sse2_max_sd>; -defm MIN : scalar_sse2_fp_binop_rm<0x5D, "min", X86fmin, int_x86_sse2_min_sd>; +/// sse2_fp_binop_rm - Other SSE2 binops +/// +/// This multiclass is like basic_sse2_fp_binop_rm, with the addition of +/// instructions for a full-vector intrinsic form. Operations that map +/// onto C operators don't use this form since they just use the plain +/// vector form instead of having a separate vector intrinsic form. +/// +/// This provides a total of eight "instructions". +/// +let isTwoAddress = 1 in { +multiclass sse2_fp_binop_rm opc, string OpcodeStr, + SDNode OpNode, + Intrinsic F64Int, + Intrinsic V2F64Int, + bit Commutable = 0> { + + // Scalar operation, reg+reg. + def SDrr : SDI { + let isCommutable = Commutable; + } + + // Scalar operation, reg+mem. + def SDrm : SDI; + + // Vector operation, reg+reg. + def PDrr : PDI { + let isCommutable = Commutable; + } + + // Vector operation, reg+mem. + def PDrm : PDI; + + // Intrinsic operation, reg+reg. + def SDrr_Int : SDI { + let isCommutable = Commutable; + } + + // Intrinsic operation, reg+mem. + def SDrm_Int : SDI; + + // Vector intrinsic operation, reg+reg. 
+ def PDrr_Int : PDI { + let isCommutable = Commutable; + } + + // Vector intrinsic operation, reg+mem. + def PDrm_Int : PDI; +} +} + +defm MAX : sse2_fp_binop_rm<0x5F, "max", X86fmax, + int_x86_sse2_max_sd, int_x86_sse2_max_pd>; +defm MIN : sse2_fp_binop_rm<0x5D, "min", X86fmin, + int_x86_sse2_min_sd, int_x86_sse2_min_pd>; //===----------------------------------------------------------------------===// // SSE packed FP Instructions @@ -1234,65 +1371,80 @@ def Int_CVTSS2SDrm: I<0x5A, MRMSrcMem, Requires<[HasSSE2]>; } -/// packed_sse2_fp_binop_rm - Packed SSE binops come in three basic forms: -/// 1. v2f64 - This comes in SSE2 form for doubles. -/// 2. rr vs rm - They include a reg+reg form and a ref+mem form. +// Arithmetic + +/// sse2_fp_unop_rm - SSE2 unops come in both scalar and vector forms. /// -let isTwoAddress = 1 in { -multiclass packed_sse2_fp_binop_rm opc, string OpcodeStr, - SDNode OpNode, bit Commutable = 0> { - // Packed operation, reg+reg. - def PDrr : PDI { +/// In addition, we also have a special variant of the scalar form here to +/// represent the associated intrinsic operation. This form is unlike the +/// plain scalar form, in that it takes an entire vector (instead of a +/// scalar) and leaves the top elements undefined. +/// +/// And, we have a special variant form for a full-vector intrinsic form. +/// +/// These four forms can each have a reg or a mem operand, so there are a +/// total of eight "instructions". +/// +multiclass sse2_fp_unop_rm opc, string OpcodeStr, + SDNode OpNode, + Intrinsic F64Int, + Intrinsic V2F64Int, + bit Commutable = 0> { + // Scalar operation, reg. + def SDr : SDI { let isCommutable = Commutable; } - // Packed operation, reg+mem. - def PDrm : PDI; -} -} - -defm ADD : packed_sse2_fp_binop_rm<0x58, "add", fadd, 1>; -defm MUL : packed_sse2_fp_binop_rm<0x59, "mul", fmul, 1>; -defm DIV : packed_sse2_fp_binop_rm<0x5E, "div", fdiv>; -defm SUB : packed_sse2_fp_binop_rm<0x5C, "sub", fsub>; + // Scalar operation, mem. 
+ def SDm : SDI; + + // Vector operation, reg. + def PDr : PDI { + let isCommutable = Commutable; + } -// Arithmetic + // Vector operation, mem. + def PDm : PDI; -class PD_Intr o, string OpcodeStr, Intrinsic IntId> - : PDI; -class PD_Intm o, string OpcodeStr, Intrinsic IntId> - : PDI; - -class PD_Intrr o, string OpcodeStr, Intrinsic IntId> - : PDI; -class PD_Intrm o, string OpcodeStr, Intrinsic IntId> - : PDI; + // Intrinsic operation, reg. + def SDr_Int : SDI { + let isCommutable = Commutable; + } -def SQRTPDr : PD_Intr<0x51, "sqrtpd", int_x86_sse2_sqrt_pd>; -def SQRTPDm : PD_Intm<0x51, "sqrtpd", int_x86_sse2_sqrt_pd>; + // Intrinsic operation, mem. + def SDm_Int : SDI; -let isTwoAddress = 1 in { - let isCommutable = 1 in { - def MAXPDrr : PD_Intrr<0x5F, "maxpd", int_x86_sse2_max_pd>; - def MINPDrr : PD_Intrr<0x5D, "minpd", int_x86_sse2_min_pd>; + // Vector intrinsic operation, reg + def PDr_Int : PDI { + let isCommutable = Commutable; } - def MAXPDrm : PD_Intrm<0x5F, "maxpd", int_x86_sse2_max_pd>; - def MINPDrm : PD_Intrm<0x5D, "minpd", int_x86_sse2_min_pd>; + // Vector intrinsic operation, mem + def PDm_Int : PDI; } +// Square root. +defm SQRT : sse2_fp_unop_rm<0x51, "sqrt", fsqrt, + int_x86_sse2_sqrt_sd, int_x86_sse2_sqrt_pd>; + +// There is no f64 version of the reciprocal approximation instructions. 
+ // Logical let isTwoAddress = 1 in { let isCommutable = 1 in { diff --git a/lib/Target/X86/X86RegisterInfo.cpp b/lib/Target/X86/X86RegisterInfo.cpp index 56ff6a7a82c..3ea437fe1e8 100644 --- a/lib/Target/X86/X86RegisterInfo.cpp +++ b/lib/Target/X86/X86RegisterInfo.cpp @@ -758,9 +758,21 @@ MachineInstr* X86RegisterInfo::foldMemoryOperand(MachineInstr *MI, { X86::IMUL16rr, X86::IMUL16rm }, { X86::IMUL32rr, X86::IMUL32rm }, { X86::MAXPDrr, X86::MAXPDrm }, + { X86::MAXPDrr_Int, X86::MAXPDrm_Int }, { X86::MAXPSrr, X86::MAXPSrm }, + { X86::MAXPSrr_Int, X86::MAXPSrm_Int }, + { X86::MAXSDrr, X86::MAXSDrm }, + { X86::MAXSDrr_Int, X86::MAXSDrm_Int }, + { X86::MAXSSrr, X86::MAXSSrm }, + { X86::MAXSSrr_Int, X86::MAXSSrm_Int }, { X86::MINPDrr, X86::MINPDrm }, + { X86::MINPDrr_Int, X86::MINPDrm_Int }, { X86::MINPSrr, X86::MINPSrm }, + { X86::MINPSrr_Int, X86::MINPSrm_Int }, + { X86::MINSDrr, X86::MINSDrm }, + { X86::MINSDrr_Int, X86::MINSDrm_Int }, + { X86::MINSSrr, X86::MINSSrm }, + { X86::MINSSrr_Int, X86::MINSSrm_Int }, { X86::MULPDrr, X86::MULPDrm }, { X86::MULPSrr, X86::MULPSrm }, { X86::MULSDrr, X86::MULSDrm }, @@ -825,15 +837,23 @@ MachineInstr* X86RegisterInfo::foldMemoryOperand(MachineInstr *MI, { X86::PUNPCKLWDrr, X86::PUNPCKLWDrm }, { X86::PXORrr, X86::PXORrm }, { X86::RCPPSr, X86::RCPPSm }, + { X86::RCPPSr_Int, X86::RCPPSm_Int }, { X86::RSQRTPSr, X86::RSQRTPSm }, + { X86::RSQRTPSr_Int, X86::RSQRTPSm_Int }, + { X86::RSQRTSSr, X86::RSQRTSSm }, + { X86::RSQRTSSr_Int, X86::RSQRTSSm_Int }, { X86::SBB32rr, X86::SBB32rm }, { X86::SBB64rr, X86::SBB64rm }, { X86::SHUFPDrri, X86::SHUFPDrmi }, { X86::SHUFPSrri, X86::SHUFPSrmi }, { X86::SQRTPDr, X86::SQRTPDm }, + { X86::SQRTPDr_Int, X86::SQRTPDm_Int }, { X86::SQRTPSr, X86::SQRTPSm }, + { X86::SQRTPSr_Int, X86::SQRTPSm_Int }, { X86::SQRTSDr, X86::SQRTSDm }, + { X86::SQRTSDr_Int, X86::SQRTSDm_Int }, { X86::SQRTSSr, X86::SQRTSSm }, + { X86::SQRTSSr_Int, X86::SQRTSSm_Int }, { X86::SUB16rr, X86::SUB16rm }, { X86::SUB32rr, 
X86::SUB32rm }, { X86::SUB64rr, X86::SUB64rm }, -- 2.34.1