From: Elena Demikhovsky Date: Wed, 1 Aug 2012 12:06:00 +0000 (+0000) Subject: Added FMA functionality to X86 target. X-Git-Url: http://plrg.eecs.uci.edu/git/?a=commitdiff_plain;h=1503aba4a036f5394c7983417bc1e64613b2fc77;p=oota-llvm.git Added FMA functionality to X86 target. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@161110 91177308-0d34-0410-b5e6-96231b3b80d8 --- diff --git a/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/lib/CodeGen/SelectionDAG/DAGCombiner.cpp index 747bc446a71..0f019ef716c 100644 --- a/lib/CodeGen/SelectionDAG/DAGCombiner.cpp +++ b/lib/CodeGen/SelectionDAG/DAGCombiner.cpp @@ -5679,7 +5679,7 @@ SDValue DAGCombiner::visitFADD(SDNode *N) { if ((DAG.getTarget().Options.AllowFPOpFusion == FPOpFusion::Fast || DAG.getTarget().Options.UnsafeFPMath) && DAG.getTarget().getTargetLowering()->isFMAFasterThanMulAndAdd(VT) && - TLI.isOperationLegal(ISD::FMA, VT)) { + TLI.isOperationLegalOrCustom(ISD::FMA, VT)) { // fold (fadd (fmul x, y), z) -> (fma x, y, z) if (N0.getOpcode() == ISD::FMUL && N0->hasOneUse()) { @@ -5704,6 +5704,7 @@ SDValue DAGCombiner::visitFSUB(SDNode *N) { ConstantFPSDNode *N0CFP = dyn_cast(N0); ConstantFPSDNode *N1CFP = dyn_cast(N1); EVT VT = N->getValueType(0); + DebugLoc dl = N->getDebugLoc(); // fold vector ops if (VT.isVector()) { @@ -5724,11 +5725,11 @@ SDValue DAGCombiner::visitFSUB(SDNode *N) { if (isNegatibleForFree(N1, LegalOperations, TLI, &DAG.getTarget().Options)) return GetNegatedExpression(N1, DAG, LegalOperations); if (!LegalOperations || TLI.isOperationLegal(ISD::FNEG, VT)) - return DAG.getNode(ISD::FNEG, N->getDebugLoc(), VT, N1); + return DAG.getNode(ISD::FNEG, dl, VT, N1); } // fold (fsub A, (fneg B)) -> (fadd A, B) if (isNegatibleForFree(N1, LegalOperations, TLI, &DAG.getTarget().Options)) - return DAG.getNode(ISD::FADD, N->getDebugLoc(), VT, N0, + return DAG.getNode(ISD::FADD, dl, VT, N0, GetNegatedExpression(N1, DAG, LegalOperations)); // If 'unsafe math' is enabled, fold @@ -5756,23 +5757,34 @@ SDValue DAGCombiner::visitFSUB(SDNode *N) { if ((DAG.getTarget().Options.AllowFPOpFusion == FPOpFusion::Fast || DAG.getTarget().Options.UnsafeFPMath) && DAG.getTarget().getTargetLowering()->isFMAFasterThanMulAndAdd(VT) && - TLI.isOperationLegal(ISD::FMA, VT)) { + TLI.isOperationLegalOrCustom(ISD::FMA, VT)) { // fold (fsub (fmul x, y), z) -> (fma x, y, (fneg z)) if (N0.getOpcode() == ISD::FMUL && N0->hasOneUse()) { - return DAG.getNode(ISD::FMA, N->getDebugLoc(), VT, + return DAG.getNode(ISD::FMA, dl, VT, N0.getOperand(0), N0.getOperand(1), - DAG.getNode(ISD::FNEG, N1->getDebugLoc(), VT, N1)); + DAG.getNode(ISD::FNEG, dl, VT, N1)); } // fold (fsub x, (fmul y, z)) -> (fma (fneg y), z, x) // Note: Commutes FSUB operands. if (N1.getOpcode() == ISD::FMUL && N1->hasOneUse()) { - return DAG.getNode(ISD::FMA, N->getDebugLoc(), VT, - DAG.getNode(ISD::FNEG, N1->getDebugLoc(), VT, + return DAG.getNode(ISD::FMA, dl, VT, + DAG.getNode(ISD::FNEG, dl, VT, N1.getOperand(0)), N1.getOperand(1), N0); } + + // fold (fsub (-(fmul, x, y)), z) -> (fma (fneg x), y, (fneg z)) + if (N0.getOpcode() == ISD::FNEG && + N0.getOperand(0).getOpcode() == ISD::FMUL && + N0->hasOneUse() && N0.getOperand(0).hasOneUse()) { + SDValue N00 = N0.getOperand(0).getOperand(0); + SDValue N01 = N0.getOperand(0).getOperand(1); + return DAG.getNode(ISD::FMA, dl, VT, + DAG.getNode(ISD::FNEG, dl, VT, N00), N01, + DAG.getNode(ISD::FNEG, dl, VT, N1)); + } } return SDValue(); diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp index 40283d87f87..fa19ad13886 100644 --- a/lib/Target/X86/X86ISelLowering.cpp +++ b/lib/Target/X86/X86ISelLowering.cpp @@ -730,6 +730,7 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM) setOperationAction(ISD::FSIN, (MVT::SimpleValueType)VT, Expand); setOperationAction(ISD::FCOS, (MVT::SimpleValueType)VT, Expand); setOperationAction(ISD::FREM, (MVT::SimpleValueType)VT, Expand); + setOperationAction(ISD::FMA, (MVT::SimpleValueType)VT, Expand); setOperationAction(ISD::FPOWI, (MVT::SimpleValueType)VT, Expand); setOperationAction(ISD::FSQRT, (MVT::SimpleValueType)VT, Expand); setOperationAction(ISD::FCOPYSIGN, (MVT::SimpleValueType)VT, Expand); @@ -1071,6 +1072,14 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM) setOperationAction(ISD::VSELECT, MVT::v8i32, Legal); setOperationAction(ISD::VSELECT, MVT::v8f32, Legal); + if (Subtarget->hasFMA()) { + setOperationAction(ISD::FMA, MVT::v8f32, Custom); + setOperationAction(ISD::FMA, MVT::v4f64, Custom); + setOperationAction(ISD::FMA, MVT::v4f32, Custom); + setOperationAction(ISD::FMA, MVT::v2f64, Custom); + setOperationAction(ISD::FMA, MVT::f32, Custom); + setOperationAction(ISD::FMA, MVT::f64, Custom); + } if (Subtarget->hasAVX2()) { setOperationAction(ISD::ADD, MVT::v4i64, Legal); setOperationAction(ISD::ADD, MVT::v8i32, Legal); @@ -1220,6 +1229,7 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM) setTargetDAGCombine(ISD::ADD); setTargetDAGCombine(ISD::FADD); setTargetDAGCombine(ISD::FSUB); + setTargetDAGCombine(ISD::FMA); setTargetDAGCombine(ISD::SUB); setTargetDAGCombine(ISD::LOAD); setTargetDAGCombine(ISD::STORE); @@ -11289,6 +11299,12 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const { case X86ISD::WIN_FTOL: return "X86ISD::WIN_FTOL"; case X86ISD::SAHF: return "X86ISD::SAHF"; case X86ISD::RDRAND: return "X86ISD::RDRAND"; + case X86ISD::FMADD: return "X86ISD::FMADD"; + case X86ISD::FMSUB: return "X86ISD::FMSUB"; + case X86ISD::FNMADD: return "X86ISD::FNMADD"; + case X86ISD::FNMSUB: return "X86ISD::FNMSUB"; + case X86ISD::FMADDSUB: return "X86ISD::FMADDSUB"; + case X86ISD::FMSUBADD: return "X86ISD::FMSUBADD"; } } @@ -15108,6 +15124,40 @@ static SDValue PerformSExtCombine(SDNode *N, SelectionDAG &DAG, return SDValue(); } +static SDValue PerformFMACombine(SDNode *N, SelectionDAG &DAG, + const X86Subtarget* Subtarget) { + DebugLoc dl = N->getDebugLoc(); + EVT VT = N->getValueType(0); + + EVT ScalarVT = VT.getScalarType(); + if ((ScalarVT != MVT::f32 && ScalarVT != MVT::f64) || !Subtarget->hasFMA()) + return SDValue(); + + SDValue A = N->getOperand(0); + SDValue B = N->getOperand(1); + SDValue C = N->getOperand(2); + + bool NegA = (A.getOpcode() == ISD::FNEG); + bool NegB = (B.getOpcode() == ISD::FNEG); + bool NegC = (C.getOpcode() == ISD::FNEG); + + // Negative multiplication when NegA xor NegB + bool NegMul = (NegA != NegB); + if (NegA) + A = A.getOperand(0); + if (NegB) + B = B.getOperand(0); + if (NegC) + C = C.getOperand(0); + + unsigned Opcode; + if (!NegMul) + Opcode = (!NegC)? X86ISD::FMADD : X86ISD::FMSUB; + else + Opcode = (!NegC)? X86ISD::FNMADD : X86ISD::FNMSUB; + return DAG.getNode(Opcode, dl, VT, A, B, C); +} + static SDValue PerformZExtCombine(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget *Subtarget) { @@ -15447,6 +15497,7 @@ SDValue X86TargetLowering::PerformDAGCombine(SDNode *N, case X86ISD::VPERMILP: case X86ISD::VPERM2X128: case ISD::VECTOR_SHUFFLE: return PerformShuffleCombine(N, DAG, DCI,Subtarget); + case ISD::FMA: return PerformFMACombine(N, DAG, Subtarget); } return SDValue(); diff --git a/lib/Target/X86/X86ISelLowering.h b/lib/Target/X86/X86ISelLowering.h index 78e4d757c2d..3bc2e31ce5c 100644 --- a/lib/Target/X86/X86ISelLowering.h +++ b/lib/Target/X86/X86ISelLowering.h @@ -294,6 +294,14 @@ namespace llvm { // PMULUDQ - Vector multiply packed unsigned doubleword integers PMULUDQ, + // FMA nodes + FMADD, + FNMADD, + FMSUB, + FNMSUB, + FMADDSUB, + FMSUBADD, + // VASTART_SAVE_XMM_REGS - Save xmm argument registers to the stack, // according to %al. An operator is needed so that this can be expanded // with control flow. @@ -597,6 +605,12 @@ namespace llvm { virtual bool isZExtFree(Type *Ty1, Type *Ty2) const; virtual bool isZExtFree(EVT VT1, EVT VT2) const; + /// isFMAFasterThanMulAndAdd - Return true if an FMA operation is faster than + /// a pair of mul and add instructions. fmuladd intrinsics will be expanded to + /// FMAs when this method returns true (and FMAs are legal), otherwise fmuladd + /// is expanded to mul + add. + virtual bool isFMAFasterThanMulAndAdd(EVT) const { return true; } + /// isNarrowingProfitable - Return true if it's profitable to narrow /// operations of type VT1 to VT2. e.g. on x86, it's profitable to narrow /// from i32 to i8 but not from i32 to i16. diff --git a/lib/Target/X86/X86InstrFMA.td b/lib/Target/X86/X86InstrFMA.td index 8802a2ecdeb..265b4bb997f 100644 --- a/lib/Target/X86/X86InstrFMA.td +++ b/lib/Target/X86/X86InstrFMA.td @@ -19,156 +19,240 @@ let Constraints = "$src1 = $dst" in { multiclass fma3p_rm opc, string OpcodeStr> { let neverHasSideEffects = 1 in { def r : FMA3; + (ins VR128:$src1, VR128:$src2, VR128:$src3), + !strconcat(OpcodeStr, + "\t{$src3, $src2, $dst|$dst, $src2, $src3}"), []>; let mayLoad = 1 in def m : FMA3; + (ins VR128:$src1, VR128:$src2, f128mem:$src3), + !strconcat(OpcodeStr, + "\t{$src3, $src2, $dst|$dst, $src2, $src3}"), []>; def rY : FMA3; + (ins VR256:$src1, VR256:$src2, VR256:$src3), + !strconcat(OpcodeStr, + "\t{$src3, $src2, $dst|$dst, $src2, $src3}"), []>; let mayLoad = 1 in def mY : FMA3; + (ins VR256:$src1, VR256:$src2, f256mem:$src3), + !strconcat(OpcodeStr, + "\t{$src3, $src2, $dst|$dst, $src2, $src3}"), []>; } // neverHasSideEffects = 1 } -// Intrinsic for 132 pattern +// Intrinsic for 213 pattern multiclass fma3p_rm_int opc, string OpcodeStr, PatFrag MemFrag128, PatFrag MemFrag256, - Intrinsic Int128, Intrinsic Int256> { + Intrinsic Int128, Intrinsic Int256, SDNode Op213, + ValueType OpVT128, ValueType OpVT256> { def r_Int : FMA3; + (ins VR128:$src1, VR128:$src2, VR128:$src3), + !strconcat(OpcodeStr, + "\t{$src3, $src2, $dst|$dst, $src2, $src3}"), + [(set VR128:$dst, (Int128 VR128:$src2, VR128:$src1, + VR128:$src3))]>; + + def r : FMA3; + def m_Int : FMA3; + (ins VR128:$src1, VR128:$src2, f128mem:$src3), + !strconcat(OpcodeStr, + "\t{$src3, $src2, $dst|$dst, $src2, $src3}"), + [(set VR128:$dst, (Int128 VR128:$src2, VR128:$src1, + (MemFrag128 addr:$src3)))]>; + + def m : FMA3; + + def rY_Int : FMA3; + (ins VR256:$src1, VR256:$src2, VR256:$src3), + !strconcat(OpcodeStr, + "\t{$src3, $src2, $dst|$dst, $src2, $src3}"), + [(set VR256:$dst, (Int256 VR256:$src2, VR256:$src1, + VR256:$src3))]>; + + def rY : FMA3; + def mY_Int : FMA3; + (ins VR256:$src1, VR256:$src2, f256mem:$src3), + !strconcat(OpcodeStr, + "\t{$src3, $src2, $dst|$dst, $src2, $src3}"), + [(set VR256:$dst, (Int256 VR256:$src2, VR256:$src1, + (MemFrag256 addr:$src3)))]>; + + def mY : FMA3; } } // Constraints = "$src1 = $dst" multiclass fma3p_forms opc132, bits<8> opc213, bits<8> opc231, string OpcodeStr, string PackTy, PatFrag MemFrag128, PatFrag MemFrag256, - Intrinsic Int128, Intrinsic Int256> { - defm r132 : fma3p_rm_int ; - defm r132 : fma3p_rm ; - defm r213 : fma3p_rm ; - defm r231 : fma3p_rm ; + Intrinsic Int128, Intrinsic Int256, SDNode Op, + ValueType OpTy128, ValueType OpTy256> { + defm r213 : fma3p_rm_int ; + defm r132 : fma3p_rm ; + defm r231 : fma3p_rm ; } // Fused Multiply-Add let ExeDomain = SSEPackedSingle in { defm VFMADDPS : fma3p_forms<0x98, 0xA8, 0xB8, "vfmadd", "ps", memopv4f32, - memopv8f32, int_x86_fma_vfmadd_ps, int_x86_fma_vfmadd_ps_256>; - defm VFMSUBPS : fma3p_forms<0x9A, 0xAA, 0xBA, "vfmsub", "ps", memopv4f32, - memopv8f32, int_x86_fma_vfmsub_ps, int_x86_fma_vfmsub_ps_256>; + memopv8f32, int_x86_fma_vfmadd_ps, + int_x86_fma_vfmadd_ps_256, X86Fmadd, + v4f32, v8f32>; + defm VFMSUBPS : fma3p_forms<0x9A, 0xAA, 0xBA, "vfmsub", "ps", memopv4f32, + memopv8f32, int_x86_fma_vfmsub_ps, + int_x86_fma_vfmsub_ps_256, X86Fmsub, + v4f32, v8f32>; defm VFMADDSUBPS : fma3p_forms<0x96, 0xA6, 0xB6, "vfmaddsub", "ps", - memopv4f32, memopv8f32, int_x86_fma_vfmaddsub_ps, - int_x86_fma_vfmaddsub_ps_256>; + memopv4f32, memopv8f32, + int_x86_fma_vfmaddsub_ps, + int_x86_fma_vfmaddsub_ps_256, X86Fmaddsub, + v4f32, v8f32>; defm VFMSUBADDPS : fma3p_forms<0x97, 0xA7, 0xB7, "vfmsubadd", "ps", - memopv4f32, memopv8f32, int_x86_fma_vfmsubadd_ps, - int_x86_fma_vfmaddsub_ps_256>; + memopv4f32, memopv8f32, + int_x86_fma_vfmsubadd_ps, + int_x86_fma_vfmaddsub_ps_256, X86Fmsubadd, + v4f32, v8f32>; } let ExeDomain = SSEPackedDouble in { defm VFMADDPD : fma3p_forms<0x98, 0xA8, 0xB8, "vfmadd", "pd", memopv2f64, - memopv4f64, int_x86_fma_vfmadd_pd, int_x86_fma_vfmadd_pd_256>, VEX_W; + memopv4f64, int_x86_fma_vfmadd_pd, + int_x86_fma_vfmadd_pd_256, X86Fmadd, v2f64, + v4f64>, VEX_W; defm VFMSUBPD : fma3p_forms<0x9A, 0xAA, 0xBA, "vfmsub", "pd", memopv2f64, - memopv4f64, int_x86_fma_vfmsub_pd, int_x86_fma_vfmsub_pd_256>, VEX_W; - defm VFMADDSUBPD : fma3p_forms<0x96, 0xA6, 0xB6, "vfmaddsub", "pd", memopv2f64, - memopv4f64, int_x86_fma_vfmaddsub_pd, int_x86_fma_vfmaddsub_pd_256>, VEX_W; - defm VFMSUBADDPD : fma3p_forms<0x97, 0xA7, 0xB7, "vfmsubadd", "pd", memopv2f64, - memopv4f64, int_x86_fma_vfmsubadd_pd, int_x86_fma_vfmsubadd_pd_256>, VEX_W; + memopv4f64, int_x86_fma_vfmsub_pd, + int_x86_fma_vfmsub_pd_256, X86Fmsub, v2f64, + v4f64>, VEX_W; + defm VFMADDSUBPD : fma3p_forms<0x96, 0xA6, 0xB6, "vfmaddsub", "pd", + memopv2f64, memopv4f64, + int_x86_fma_vfmaddsub_pd, + int_x86_fma_vfmaddsub_pd_256, X86Fmaddsub, + v2f64, v4f64>, VEX_W; + defm VFMSUBADDPD : fma3p_forms<0x97, 0xA7, 0xB7, "vfmsubadd", "pd", + memopv2f64, memopv4f64, + int_x86_fma_vfmsubadd_pd, + int_x86_fma_vfmsubadd_pd_256, X86Fmsubadd, + v2f64, v4f64>, VEX_W; } // Fused Negative Multiply-Add let ExeDomain = SSEPackedSingle in { defm VFNMADDPS : fma3p_forms<0x9C, 0xAC, 0xBC, "vfnmadd", "ps", memopv4f32, - memopv8f32, int_x86_fma_vfnmadd_ps, int_x86_fma_vfnmadd_ps_256>; + memopv8f32, int_x86_fma_vfnmadd_ps, + int_x86_fma_vfnmadd_ps_256, X86Fnmadd, v4f32, + v8f32>; defm VFNMSUBPS : fma3p_forms<0x9E, 0xAE, 0xBE, "vfnmsub", "ps", memopv4f32, - memopv8f32, int_x86_fma_vfnmsub_ps, int_x86_fma_vfnmsub_ps_256>; + memopv8f32, int_x86_fma_vfnmsub_ps, + int_x86_fma_vfnmsub_ps_256, X86Fnmsub, v4f32, + v8f32>; } let ExeDomain = SSEPackedDouble in { defm VFNMADDPD : fma3p_forms<0x9C, 0xAC, 0xBC, "vfnmadd", "pd", memopv2f64, - memopv4f64, int_x86_fma_vfnmadd_pd, int_x86_fma_vfnmadd_pd_256>, VEX_W; - defm VFNMSUBPD : fma3p_forms<0x9E, 0xAE, 0xBE, "vfnmsub", "pd", memopv2f64, - memopv4f64, int_x86_fma_vfnmsub_pd, int_x86_fma_vfnmsub_pd_256>, VEX_W; + memopv4f64, int_x86_fma_vfnmadd_pd, + int_x86_fma_vfnmadd_pd_256, X86Fnmadd, v2f64, + v4f64>, VEX_W; + defm VFNMSUBPD : fma3p_forms<0x9E, 0xAE, 0xBE, "vfnmsub", "pd", + memopv2f64, + memopv4f64, int_x86_fma_vfnmsub_pd, + int_x86_fma_vfnmsub_pd_256, X86Fnmsub, v2f64, + v4f64>, VEX_W; } - let Constraints = "$src1 = $dst" in { multiclass fma3s_rm opc, string OpcodeStr, X86MemOperand x86memop, RegisterClass RC> { let neverHasSideEffects = 1 in { def r : FMA3; + (ins RC:$src1, RC:$src2, RC:$src3), + !strconcat(OpcodeStr, + "\t{$src3, $src2, $dst|$dst, $src2, $src3}"), []>; let mayLoad = 1 in def m : FMA3; + (ins RC:$src1, RC:$src2, x86memop:$src3), + !strconcat(OpcodeStr, + "\t{$src3, $src2, $dst|$dst, $src2, $src3}"), []>; } // neverHasSideEffects = 1 } multiclass fma3s_rm_int opc, string OpcodeStr, Operand memop, - ComplexPattern mem_cpat, Intrinsic IntId> { + ComplexPattern mem_cpat, Intrinsic IntId, + RegisterClass RC, SDNode OpNode, ValueType OpVT> { def r_Int : FMA3; + (ins VR128:$src1, VR128:$src2, VR128:$src3), + !strconcat(OpcodeStr, + "\t{$src3, $src2, $dst|$dst, $src2, $src3}"), + [(set VR128:$dst, (IntId VR128:$src2, VR128:$src1, + VR128:$src3))]>; def m_Int : FMA3; + (ins VR128:$src1, VR128:$src2, memop:$src3), + !strconcat(OpcodeStr, + "\t{$src3, $src2, $dst|$dst, $src2, $src3}"), + [(set VR128:$dst, + (IntId VR128:$src2, VR128:$src1, mem_cpat:$src3))]>; + def r : FMA3; + let mayLoad = 1 in + def m : FMA3; } } // Constraints = "$src1 = $dst" multiclass fma3s_forms opc132, bits<8> opc213, bits<8> opc231, - string OpStr, Intrinsic IntF32, Intrinsic IntF64> { + string OpStr, Intrinsic IntF32, Intrinsic IntF64, + SDNode OpNode> { defm SSr132 : fma3s_rm; - defm SSr213 : fma3s_rm; defm SSr231 : fma3s_rm; - defm SDr132 : fma3s_rm, VEX_W; - defm SDr213 : fma3s_rm, VEX_W; - defm SDr231 : fma3s_rm, VEX_W; - defm SSr132 : fma3s_rm_int ; - defm SDr132 : fma3s_rm_int ; + defm SDr132 : fma3s_rm, + VEX_W; + defm SDr231 : fma3s_rm, + VEX_W; + defm SSr213 : fma3s_rm_int ; + defm SDr213 : fma3s_rm_int , VEX_W; } defm VFMADD : fma3s_forms<0x99, 0xA9, 0xB9, "vfmadd", int_x86_fma_vfmadd_ss, - int_x86_fma_vfmadd_sd>, VEX_LIG; + int_x86_fma_vfmadd_sd, X86Fmadd>, VEX_LIG; defm VFMSUB : fma3s_forms<0x9B, 0xAB, 0xBB, "vfmsub", int_x86_fma_vfmsub_ss, - int_x86_fma_vfmsub_sd>, VEX_LIG; + int_x86_fma_vfmsub_sd, X86Fmsub>, VEX_LIG; defm VFNMADD : fma3s_forms<0x9D, 0xAD, 0xBD, "vfnmadd", int_x86_fma_vfnmadd_ss, - int_x86_fma_vfnmadd_sd>, VEX_LIG; + int_x86_fma_vfnmadd_sd, X86Fnmadd>, VEX_LIG; defm VFNMSUB : fma3s_forms<0x9F, 0xAF, 0xBF, "vfnmsub", int_x86_fma_vfnmsub_ss, - int_x86_fma_vfnmsub_sd>, VEX_LIG; + int_x86_fma_vfnmsub_sd, X86Fnmsub>, VEX_LIG; //===----------------------------------------------------------------------===// diff --git a/lib/Target/X86/X86InstrFragmentsSIMD.td b/lib/Target/X86/X86InstrFragmentsSIMD.td index ec030dd5467..19de855ac5e 100644 --- a/lib/Target/X86/X86InstrFragmentsSIMD.td +++ b/lib/Target/X86/X86InstrFragmentsSIMD.td @@ -125,7 +125,10 @@ def SDTShuff3OpI : SDTypeProfile<1, 3, [SDTCisVec<0>, SDTCisSameAs<0,1>, def SDTVBroadcast : SDTypeProfile<1, 1, [SDTCisVec<0>]>; def SDTBlend : SDTypeProfile<1, 3, [SDTCisVec<0>, SDTCisSameAs<0,1>, -SDTCisSameAs<1,2>, SDTCisVT<3, i32>]>; + SDTCisSameAs<1,2>, SDTCisVT<3, i32>]>; + +def SDTFma : SDTypeProfile<1, 3, [SDTCisSameAs<0,1>, + SDTCisSameAs<1,2>, SDTCisSameAs<1,3>]>; def X86PAlign : SDNode<"X86ISD::PALIGN", SDTShuff3OpI>; @@ -160,9 +163,15 @@ def X86VPerm2x128 : SDNode<"X86ISD::VPERM2X128", SDTShuff3OpI>; def X86VBroadcast : SDNode<"X86ISD::VBROADCAST", SDTVBroadcast>; -def X86Blendpw : SDNode<"X86ISD::BLENDPW", SDTBlend>; -def X86Blendps : SDNode<"X86ISD::BLENDPS", SDTBlend>; -def X86Blendpd : SDNode<"X86ISD::BLENDPD", SDTBlend>; +def X86Blendpw : SDNode<"X86ISD::BLENDPW", SDTBlend>; +def X86Blendps : SDNode<"X86ISD::BLENDPS", SDTBlend>; +def X86Blendpd : SDNode<"X86ISD::BLENDPD", SDTBlend>; +def X86Fmadd : SDNode<"X86ISD::FMADD", SDTFma>; +def X86Fnmadd : SDNode<"X86ISD::FNMADD", SDTFma>; +def X86Fmsub : SDNode<"X86ISD::FMSUB", SDTFma>; +def X86Fnmsub : SDNode<"X86ISD::FNMSUB", SDTFma>; +def X86Fmaddsub : SDNode<"X86ISD::FMSUBADD", SDTFma>; +def X86Fmsubadd : SDNode<"X86ISD::FMADDSUB", SDTFma>; //===----------------------------------------------------------------------===// // SSE Complex Patterns diff --git a/lib/Target/X86/X86InstrInfo.cpp b/lib/Target/X86/X86InstrInfo.cpp index 89a57b53f3c..3f16372497c 100644 --- a/lib/Target/X86/X86InstrInfo.cpp +++ b/lib/Target/X86/X86InstrInfo.cpp @@ -1130,8 +1130,8 @@ X86InstrInfo::X86InstrInfo(X86TargetMachine &tm) { X86::VFMADDSDr132r, X86::VFMADDSDr132m, 0 }, { X86::VFMADDSSr213r, X86::VFMADDSSr213m, 0 }, { X86::VFMADDSDr213r, X86::VFMADDSDr213m, 0 }, - { X86::VFMADDSSr132r_Int, X86::VFMADDSSr132m_Int, 0 }, - { X86::VFMADDSDr132r_Int, X86::VFMADDSDr132m_Int, 0 }, + { X86::VFMADDSSr213r_Int, X86::VFMADDSSr213m_Int, 0 }, + { X86::VFMADDSDr213r_Int, X86::VFMADDSDr213m_Int, 0 }, { X86::VFMADDPSr231r, X86::VFMADDPSr231m, TB_ALIGN_16 }, { X86::VFMADDPDr231r, X86::VFMADDPDr231m, TB_ALIGN_16 }, @@ -1145,10 +1145,10 @@ X86InstrInfo::X86InstrInfo(X86TargetMachine &tm) { X86::VFMADDPDr132rY, X86::VFMADDPDr132mY, TB_ALIGN_32 }, { X86::VFMADDPSr213rY, X86::VFMADDPSr213mY, TB_ALIGN_32 }, { X86::VFMADDPDr213rY, X86::VFMADDPDr213mY, TB_ALIGN_32 }, - { X86::VFMADDPSr132r_Int, X86::VFMADDPSr132m_Int, TB_ALIGN_16 }, - { X86::VFMADDPDr132r_Int, X86::VFMADDPDr132m_Int, TB_ALIGN_16 }, - { X86::VFMADDPSr132rY_Int, X86::VFMADDPSr132mY_Int, TB_ALIGN_32 }, - { X86::VFMADDPDr132rY_Int, X86::VFMADDPDr132mY_Int, TB_ALIGN_32 }, + { X86::VFMADDPSr213r_Int, X86::VFMADDPSr213m_Int, TB_ALIGN_16 }, + { X86::VFMADDPDr213r_Int, X86::VFMADDPDr213m_Int, TB_ALIGN_16 }, + { X86::VFMADDPSr213rY_Int, X86::VFMADDPSr213mY_Int, TB_ALIGN_32 }, + { X86::VFMADDPDr213rY_Int, X86::VFMADDPDr213mY_Int, TB_ALIGN_32 }, { X86::VFNMADDSSr231r, X86::VFNMADDSSr231m, 0 }, { X86::VFNMADDSDr231r, X86::VFNMADDSDr231m, 0 }, @@ -1156,8 +1156,8 @@ X86InstrInfo::X86InstrInfo(X86TargetMachine &tm) { X86::VFNMADDSDr132r, X86::VFNMADDSDr132m, 0 }, { X86::VFNMADDSSr213r, X86::VFNMADDSSr213m, 0 }, { X86::VFNMADDSDr213r, X86::VFNMADDSDr213m, 0 }, - { X86::VFNMADDSSr132r_Int, X86::VFNMADDSSr132m_Int, 0 }, - { X86::VFNMADDSDr132r_Int, X86::VFNMADDSDr132m_Int, 0 }, + { X86::VFNMADDSSr213r_Int, X86::VFNMADDSSr213m_Int, 0 }, + { X86::VFNMADDSDr213r_Int, X86::VFNMADDSDr213m_Int, 0 }, { X86::VFNMADDPSr231r, X86::VFNMADDPSr231m, TB_ALIGN_16 }, { X86::VFNMADDPDr231r, X86::VFNMADDPDr231m, TB_ALIGN_16 }, @@ -1171,10 +1171,10 @@ X86InstrInfo::X86InstrInfo(X86TargetMachine &tm) { X86::VFNMADDPDr132rY, X86::VFNMADDPDr132mY, TB_ALIGN_32 }, { X86::VFNMADDPSr213rY, X86::VFNMADDPSr213mY, TB_ALIGN_32 }, { X86::VFNMADDPDr213rY, X86::VFNMADDPDr213mY, TB_ALIGN_32 }, - { X86::VFNMADDPSr132r_Int, X86::VFNMADDPSr132m_Int, TB_ALIGN_16 }, - { X86::VFNMADDPDr132r_Int, X86::VFNMADDPDr132m_Int, TB_ALIGN_16 }, - { X86::VFNMADDPSr132rY_Int, X86::VFNMADDPSr132mY_Int, TB_ALIGN_32 }, - { X86::VFNMADDPDr132rY_Int, X86::VFNMADDPDr132mY_Int, TB_ALIGN_32 }, + { X86::VFNMADDPSr213r_Int, X86::VFNMADDPSr213m_Int, TB_ALIGN_16 }, + { X86::VFNMADDPDr213r_Int, X86::VFNMADDPDr213m_Int, TB_ALIGN_16 }, + { X86::VFNMADDPSr213rY_Int, X86::VFNMADDPSr213mY_Int, TB_ALIGN_32 }, + { X86::VFNMADDPDr213rY_Int, X86::VFNMADDPDr213mY_Int, TB_ALIGN_32 }, { X86::VFMSUBSSr231r, X86::VFMSUBSSr231m, 0 }, { X86::VFMSUBSDr231r, X86::VFMSUBSDr231m, 0 }, @@ -1182,8 +1182,8 @@ X86InstrInfo::X86InstrInfo(X86TargetMachine &tm) { X86::VFMSUBSDr132r, X86::VFMSUBSDr132m, 0 }, { X86::VFMSUBSSr213r, X86::VFMSUBSSr213m, 0 }, { X86::VFMSUBSDr213r, X86::VFMSUBSDr213m, 0 }, - { X86::VFMSUBSSr132r_Int, X86::VFMSUBSSr132m_Int, 0 }, - { X86::VFMSUBSDr132r_Int, X86::VFMSUBSDr132m_Int, 0 }, + { X86::VFMSUBSSr213r_Int, X86::VFMSUBSSr213m_Int, 0 }, + { X86::VFMSUBSDr213r_Int, X86::VFMSUBSDr213m_Int, 0 }, { X86::VFMSUBPSr231r, X86::VFMSUBPSr231m, TB_ALIGN_16 }, { X86::VFMSUBPDr231r, X86::VFMSUBPDr231m, TB_ALIGN_16 }, @@ -1197,10 +1197,10 @@ X86InstrInfo::X86InstrInfo(X86TargetMachine &tm) { X86::VFMSUBPDr132rY, X86::VFMSUBPDr132mY, TB_ALIGN_32 }, { X86::VFMSUBPSr213rY, X86::VFMSUBPSr213mY, TB_ALIGN_32 }, { X86::VFMSUBPDr213rY, X86::VFMSUBPDr213mY, TB_ALIGN_32 }, - { X86::VFMSUBPSr132r_Int, X86::VFMSUBPSr132m_Int, TB_ALIGN_16 }, - { X86::VFMSUBPDr132r_Int, X86::VFMSUBPDr132m_Int, TB_ALIGN_16 }, - { X86::VFMSUBPSr132rY_Int, X86::VFMSUBPSr132mY_Int, TB_ALIGN_32 }, - { X86::VFMSUBPDr132rY_Int, X86::VFMSUBPDr132mY_Int, TB_ALIGN_32 }, + { X86::VFMSUBPSr213r_Int, X86::VFMSUBPSr213m_Int, TB_ALIGN_16 }, + { X86::VFMSUBPDr213r_Int, X86::VFMSUBPDr213m_Int, TB_ALIGN_16 }, + { X86::VFMSUBPSr213rY_Int, X86::VFMSUBPSr213mY_Int, TB_ALIGN_32 }, + { X86::VFMSUBPDr213rY_Int, X86::VFMSUBPDr213mY_Int, TB_ALIGN_32 }, { X86::VFNMSUBSSr231r, X86::VFNMSUBSSr231m, 0 }, { X86::VFNMSUBSDr231r, X86::VFNMSUBSDr231m, 0 }, @@ -1208,8 +1208,8 @@ X86InstrInfo::X86InstrInfo(X86TargetMachine &tm) { X86::VFNMSUBSDr132r, X86::VFNMSUBSDr132m, 0 }, { X86::VFNMSUBSSr213r, X86::VFNMSUBSSr213m, 0 }, { X86::VFNMSUBSDr213r, X86::VFNMSUBSDr213m, 0 }, - { X86::VFNMSUBSSr132r_Int, X86::VFNMSUBSSr132m_Int, 0 }, - { X86::VFNMSUBSDr132r_Int, X86::VFNMSUBSDr132m_Int, 0 }, + { X86::VFNMSUBSSr213r_Int, X86::VFNMSUBSSr213m_Int, 0 }, + { X86::VFNMSUBSDr213r_Int, X86::VFNMSUBSDr213m_Int, 0 }, { X86::VFNMSUBPSr231r, X86::VFNMSUBPSr231m, TB_ALIGN_16 }, { X86::VFNMSUBPDr231r, X86::VFNMSUBPDr231m, TB_ALIGN_16 }, @@ -1223,10 +1223,10 @@ X86InstrInfo::X86InstrInfo(X86TargetMachine &tm) { X86::VFNMSUBPDr132rY, X86::VFNMSUBPDr132mY, TB_ALIGN_32 }, { X86::VFNMSUBPSr213rY, X86::VFNMSUBPSr213mY, TB_ALIGN_32 }, { X86::VFNMSUBPDr213rY, X86::VFNMSUBPDr213mY, TB_ALIGN_32 }, - { X86::VFNMSUBPSr132r_Int, X86::VFNMSUBPSr132m_Int, TB_ALIGN_16 }, - { X86::VFNMSUBPDr132r_Int, X86::VFNMSUBPDr132m_Int, TB_ALIGN_16 }, - { X86::VFNMSUBPSr132rY_Int, X86::VFNMSUBPSr132mY_Int, TB_ALIGN_32 }, - { X86::VFNMSUBPDr132rY_Int, X86::VFNMSUBPDr132mY_Int, TB_ALIGN_32 }, + { X86::VFNMSUBPSr213r_Int, X86::VFNMSUBPSr213m_Int, TB_ALIGN_16 }, + { X86::VFNMSUBPDr213r_Int, X86::VFNMSUBPDr213m_Int, TB_ALIGN_16 }, + { X86::VFNMSUBPSr213rY_Int, X86::VFNMSUBPSr213mY_Int, TB_ALIGN_32 }, + { X86::VFNMSUBPDr213rY_Int, X86::VFNMSUBPDr213mY_Int, TB_ALIGN_32 }, { X86::VFMADDSUBPSr231r, X86::VFMADDSUBPSr231m, TB_ALIGN_16 }, { X86::VFMADDSUBPDr231r, X86::VFMADDSUBPDr231m, TB_ALIGN_16 }, @@ -1240,10 +1240,10 @@ X86InstrInfo::X86InstrInfo(X86TargetMachine &tm) { X86::VFMADDSUBPDr132rY, X86::VFMADDSUBPDr132mY, TB_ALIGN_32 }, { X86::VFMADDSUBPSr213rY, X86::VFMADDSUBPSr213mY, TB_ALIGN_32 }, { X86::VFMADDSUBPDr213rY, X86::VFMADDSUBPDr213mY, TB_ALIGN_32 }, - { X86::VFMADDSUBPSr132r_Int, X86::VFMADDSUBPSr132m_Int, TB_ALIGN_16 }, - { X86::VFMADDSUBPDr132r_Int, X86::VFMADDSUBPDr132m_Int, TB_ALIGN_16 }, - { X86::VFMADDSUBPSr132rY_Int, X86::VFMADDSUBPSr132mY_Int, TB_ALIGN_32 }, - { X86::VFMADDSUBPDr132rY_Int, X86::VFMADDSUBPDr132mY_Int, TB_ALIGN_32 }, + { X86::VFMADDSUBPSr213r_Int, X86::VFMADDSUBPSr213m_Int, TB_ALIGN_16 }, + { X86::VFMADDSUBPDr213r_Int, X86::VFMADDSUBPDr213m_Int, TB_ALIGN_16 }, + { X86::VFMADDSUBPSr213rY_Int, X86::VFMADDSUBPSr213mY_Int, TB_ALIGN_32 }, + { X86::VFMADDSUBPDr213rY_Int, X86::VFMADDSUBPDr213mY_Int, TB_ALIGN_32 }, { X86::VFMSUBADDPSr231r, X86::VFMSUBADDPSr231m, TB_ALIGN_16 }, { X86::VFMSUBADDPDr231r, X86::VFMSUBADDPDr231m, TB_ALIGN_16 }, @@ -1257,10 +1257,10 @@ X86InstrInfo::X86InstrInfo(X86TargetMachine &tm) { X86::VFMSUBADDPDr132rY, X86::VFMSUBADDPDr132mY, TB_ALIGN_32 }, { X86::VFMSUBADDPSr213rY, X86::VFMSUBADDPSr213mY, TB_ALIGN_32 }, { X86::VFMSUBADDPDr213rY, X86::VFMSUBADDPDr213mY, TB_ALIGN_32 }, - { X86::VFMSUBADDPSr132r_Int, X86::VFMSUBADDPSr132m_Int, TB_ALIGN_16 }, - { X86::VFMSUBADDPDr132r_Int, X86::VFMSUBADDPDr132m_Int, TB_ALIGN_16 }, - { X86::VFMSUBADDPSr132rY_Int, X86::VFMSUBADDPSr132mY_Int, TB_ALIGN_32 }, - { X86::VFMSUBADDPDr132rY_Int, X86::VFMSUBADDPDr132mY_Int, TB_ALIGN_32 }, + { X86::VFMSUBADDPSr213r_Int, X86::VFMSUBADDPSr213m_Int, TB_ALIGN_16 }, + { X86::VFMSUBADDPDr213r_Int, X86::VFMSUBADDPDr213m_Int, TB_ALIGN_16 }, + { X86::VFMSUBADDPSr213rY_Int, X86::VFMSUBADDPSr213mY_Int, TB_ALIGN_32 }, + { X86::VFMSUBADDPDr213rY_Int, X86::VFMSUBADDPDr213mY_Int, TB_ALIGN_32 }, }; for (unsigned i = 0, e = array_lengthof(OpTbl3); i != e; ++i) { @@ -3481,6 +3481,8 @@ X86InstrInfo::foldMemoryOperandImpl(MachineFunction &MF, OpcodeTablePtr = &RegOp2MemOpTable1; } else if (i == 2) { OpcodeTablePtr = &RegOp2MemOpTable2; + } else if (i == 3) { + OpcodeTablePtr = &RegOp2MemOpTable3; } // If table selected... diff --git a/test/CodeGen/X86/fma.ll b/test/CodeGen/X86/fma.ll index 5deedb9dd9b..b0c1d0a0dd1 100644 --- a/test/CodeGen/X86/fma.ll +++ b/test/CodeGen/X86/fma.ll @@ -1,8 +1,11 @@ -; RUN: llc < %s -mtriple=i386-apple-darwin10 | FileCheck %s -; RUN: llc < %s -mtriple=x86_64-apple-darwin10 | FileCheck %s +; RUN: llc < %s -mtriple=i386-apple-darwin10 -mattr=+fma | FileCheck %s --check-prefix=CHECK-FMA-INST +; RUN: llc < %s -mtriple=i386-apple-darwin10 | FileCheck %s --check-prefix=CHECK-FMA-CALL +; RUN: llc < %s -mtriple=x86_64-apple-darwin10 -mattr=+fma | FileCheck %s --check-prefix=CHECK-FMA-INST +; RUN: llc < %s -mtriple=x86_64-apple-darwin10 | FileCheck %s --check-prefix=CHECK-FMA-CALL ; CHECK: test_f32 -; CHECK: _fmaf +; CHECK-FMA-INST: vfmadd213ss +; CHECK-FMA-CALL: _fmaf define float @test_f32(float %a, float %b, float %c) nounwind readnone ssp { entry: @@ -11,7 +14,8 @@ entry: } ; CHECK: test_f64 -; CHECK: _fma +; CHECK-FMA-INST: vfmadd213sd +; CHECK-FMA-CALL: _fma define double @test_f64(double %a, double %b, double %c) nounwind readnone ssp { entry: diff --git a/test/CodeGen/X86/fma3-intrinsics.ll b/test/CodeGen/X86/fma3-intrinsics.ll index 8659dfe3266..90529e09d75 100755 --- a/test/CodeGen/X86/fma3-intrinsics.ll +++ b/test/CodeGen/X86/fma3-intrinsics.ll @@ -1,42 +1,42 @@ ; RUN: llc < %s -mtriple=x86_64-pc-win32 -mcpu=core-avx2 -mattr=avx2,+fma | FileCheck %s define <4 x float> @test_x86_fmadd_ss(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) { - ; CHECK: fmadd132ss %xmm + ; CHECK: fmadd213ss %xmm %res = call <4 x float> @llvm.x86.fma.vfmadd.ss(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) nounwind ret <4 x float> %res } declare <4 x float> @llvm.x86.fma.vfmadd.ss(<4 x float>, <4 x float>, <4 x float>) nounwind readnone define <4 x float> @test_x86_fmadd_ps(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) { - ; CHECK: fmadd132ps + ; CHECK: fmadd213ps %res = call <4 x float> @llvm.x86.fma.vfmadd.ps(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) nounwind ret <4 x float> %res } declare <4 x float> @llvm.x86.fma.vfmadd.ps(<4 x float>, <4 x float>, <4 x float>) nounwind readnone define <8 x float> @test_x86_fmadd_ps_y(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2) { - ; CHECK: fmadd132ps {{.*\(%r.*}}, %ymm + ; CHECK: fmadd213ps {{.*\(%r.*}}, %ymm %res = call <8 x float> @llvm.x86.fma.vfmadd.ps.256(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2) nounwind ret <8 x float> %res } declare <8 x float> @llvm.x86.fma.vfmadd.ps.256(<8 x float>, <8 x float>, <8 x float>) nounwind readnone define <4 x float> @test_x86_fnmadd_ss(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) { - ; CHECK: fnmadd132ss %xmm + ; CHECK: fnmadd213ss %xmm %res = call <4 x float> @llvm.x86.fma.vfnmadd.ss(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) nounwind ret <4 x float> %res } declare <4 x float> @llvm.x86.fma.vfnmadd.ss(<4 x float>, <4 x float>, <4 x float>) nounwind readnone define <4 x float> @test_x86_fnmadd_ps(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) { - ; CHECK: fnmadd132ps + ; CHECK: fnmadd213ps %res = call <4 x float> @llvm.x86.fma.vfnmadd.ps(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) nounwind ret <4 x float> %res } declare <4 x float> @llvm.x86.fma.vfnmadd.ps(<4 x float>, <4 x float>, <4 x float>) nounwind readnone define <8 x float> @test_x86_fnmadd_ps_y(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2) { - ; CHECK: fnmadd132ps {{.*\(%r.*}}, %ymm + ; CHECK: fnmadd213ps {{.*\(%r.*}}, %ymm %res = call <8 x float> @llvm.x86.fma.vfnmadd.ps.256(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2) nounwind ret <8 x float> %res } @@ -44,28 +44,28 @@ declare <8 x float> @llvm.x86.fma.vfnmadd.ps.256(<8 x float>, <8 x float>, <8 x define <4 x float> @test_x86_fmsub_ss(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) { - ; CHECK: fmsub132ss + ; CHECK: fmsub213ss %res = call <4 x float> @llvm.x86.fma.vfmsub.ss(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) nounwind ret <4 x float> %res } declare <4 x float> @llvm.x86.fma.vfmsub.ss(<4 x float>, <4 x float>, <4 x float>) nounwind readnone define <4 x float> @test_x86_fmsub_ps(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) { - ; CHECK: fmsub132ps + ; CHECK: fmsub213ps %res = call <4 x float> @llvm.x86.fma.vfmsub.ps(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) nounwind ret <4 x float> %res } declare <4 x float> @llvm.x86.fma.vfmsub.ps(<4 x float>, <4 x float>, <4 x float>) nounwind readnone define <4 x float> @test_x86_fnmsub_ss(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) { - ; CHECK: fnmsub132ss + ; CHECK: fnmsub213ss %res = call <4 x float> @llvm.x86.fma.vfnmsub.ss(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) nounwind ret <4 x float> %res } declare <4 x float> @llvm.x86.fma.vfnmsub.ss(<4 x float>, <4 x float>, <4 x float>) nounwind readnone define <4 x float> @test_x86_fnmsub_ps(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) { - ; CHECK: fnmsub132ps + ; CHECK: fnmsub213ps %res = call <4 x float> @llvm.x86.fma.vfnmsub.ps(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) nounwind ret <4 x float> %res } @@ -74,28 +74,28 @@ declare <4 x float> @llvm.x86.fma.vfnmsub.ps(<4 x float>, <4 x float>, <4 x floa ;;;; define <2 x double> @test_x86_fmadd_sd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2) { - ; CHECK: fmadd132sd + ; CHECK: fmadd213sd %res = call <2 x double> @llvm.x86.fma.vfmadd.sd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2) nounwind ret <2 x double> %res } declare <2 x double> @llvm.x86.fma.vfmadd.sd(<2 x double>, <2 x double>, <2 x double>) nounwind readnone define <2 x double> @test_x86_fmadd_pd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2) { - ; CHECK: fmadd132pd + ; CHECK: fmadd213pd %res = call <2 x double> @llvm.x86.fma.vfmadd.pd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2) nounwind ret <2 x double> %res } declare <2 x double> @llvm.x86.fma.vfmadd.pd(<2 x double>, <2 x double>, <2 x double>) nounwind readnone define <2 x double> @test_x86_fnmadd_sd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2) { - ; CHECK: fnmadd132sd + ; CHECK: fnmadd213sd %res = call <2 x double> @llvm.x86.fma.vfnmadd.sd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2) nounwind ret <2 x double> %res } declare <2 x double> @llvm.x86.fma.vfnmadd.sd(<2 x double>, <2 x double>, <2 x double>) nounwind readnone define <2 x double> @test_x86_fnmadd_pd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2) { - ; CHECK: fnmadd132pd + ; CHECK: fnmadd213pd %res = call <2 x double> @llvm.x86.fma.vfnmadd.pd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2) nounwind ret <2 x double> %res } @@ -104,28 +104,28 @@ declare <2 x double> @llvm.x86.fma.vfnmadd.pd(<2 x double>, <2 x double>, <2 x d define <2 x double> @test_x86_fmsub_sd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2) { - ; CHECK: fmsub132sd + ; CHECK: fmsub213sd %res = call <2 x double> @llvm.x86.fma.vfmsub.sd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2) nounwind ret <2 x double> %res } declare <2 x double> @llvm.x86.fma.vfmsub.sd(<2 x double>, <2 x double>, <2 x double>) nounwind readnone define <2 x double> @test_x86_fmsub_pd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2) { - ; CHECK: fmsub132pd + ; CHECK: fmsub213pd %res = call <2 x double> @llvm.x86.fma.vfmsub.pd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2) nounwind ret <2 x double> %res } declare <2 x double> @llvm.x86.fma.vfmsub.pd(<2 x double>, <2 x double>, <2 x double>) nounwind readnone define <2 x double> @test_x86_fnmsub_sd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2) { - ; CHECK: fnmsub132sd + ; CHECK: fnmsub213sd %res = call <2 x double> @llvm.x86.fma.vfnmsub.sd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2) nounwind ret <2 x double> %res } declare <2 x double> @llvm.x86.fma.vfnmsub.sd(<2 x double>, <2 x double>, <2 x double>) nounwind readnone define <2 x double> @test_x86_fnmsub_pd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2) { - ; CHECK: fnmsub132pd + ; CHECK: fnmsub213pd %res = call <2 x double> @llvm.x86.fma.vfnmsub.pd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2) nounwind ret <2 x double> %res } diff --git a/test/CodeGen/X86/fma_patterns.ll b/test/CodeGen/X86/fma_patterns.ll new file mode 100644 index 00000000000..b04663a6024 --- /dev/null +++ b/test/CodeGen/X86/fma_patterns.ll @@ -0,0 +1,139 @@ +; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=core-avx2 -mattr=avx2,+fma -fp-contract=fast | FileCheck %s + +; CHECK: test_x86_fmadd_ps +; CHECK: vfmadd213ps %xmm2, %xmm0, %xmm1 +; CHECK: ret +define <4 x float> @test_x86_fmadd_ps(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) { + %x = fmul <4 x float> %a0, %a1 + %res = fadd <4 x float> %x, %a2 + ret <4 x float> %res +} + +; CHECK: test_x86_fmsub_ps +; CHECK: fmsub213ps %xmm2, %xmm0, %xmm1 +; CHECK: ret +define <4 x float> @test_x86_fmsub_ps(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) { + %x = fmul <4 x float> %a0, %a1 + %res = fsub <4 x float> %x, %a2 + ret <4 x float> %res +} + +; CHECK: test_x86_fnmadd_ps +; CHECK: fnmadd213ps %xmm2, %xmm0, %xmm1 +; CHECK: ret +define <4 x float> @test_x86_fnmadd_ps(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) { + %x = fmul <4 x float> %a0, %a1 + %res = fsub <4 x float> %a2, %x + ret <4 x float> %res +} + +; CHECK: test_x86_fnmsub_ps +; CHECK: fnmsub213ps %xmm2, %xmm0, %xmm1 +; CHECK: ret +define <4 x float> @test_x86_fnmsub_ps(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) { + %x = fmul <4 x float> %a0, %a1 + %y = fsub <4 x float> , %x + %res = fsub <4 x float> %y, %a2 + ret <4 x float> %res +} + +; CHECK: test_x86_fmadd_ps_y +; CHECK: vfmadd213ps %ymm2, %ymm0, %ymm1 +; CHECK: ret +define <8 x float> @test_x86_fmadd_ps_y(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2) { + %x = fmul <8 x float> %a0, %a1 + %res = fadd <8 x float> %x, %a2 + ret <8 x float> %res +} + +; CHECK: test_x86_fmsub_ps_y +; CHECK: vfmsub213ps %ymm2, %ymm0, %ymm1 +; CHECK: ret +define <8 x float> @test_x86_fmsub_ps_y(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2) { + %x = fmul <8 x float> %a0, %a1 + %res = fsub <8 x float> %x, %a2 + ret <8 x float> %res +} + +; CHECK: test_x86_fnmadd_ps_y +; CHECK: vfnmadd213ps %ymm2, %ymm0, %ymm1 +; CHECK: ret +define <8 x float> @test_x86_fnmadd_ps_y(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2) { + %x = fmul <8 x float> %a0, %a1 + %res = fsub <8 x float> %a2, %x + ret <8 x float> %res +} + +; CHECK: test_x86_fnmsub_ps_y +; CHECK: vfnmsub213ps %ymm2, %ymm0, %ymm1 +; CHECK: ret +define <8 x float> @test_x86_fnmsub_ps_y(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2) { + %x = fmul <8 x float> %a0, %a1 + %y = fsub <8 x float> , %x + %res = fsub <8 x float> %y, %a2 + ret <8 x float> %res +} + +; CHECK: test_x86_fmadd_pd_y +; CHECK: vfmadd213pd %ymm2, %ymm0, %ymm1 +; CHECK: ret +define <4 x double> @test_x86_fmadd_pd_y(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2) { + %x = fmul <4 x double> %a0, %a1 + %res = fadd <4 x double> %x, %a2 + ret <4 x double> %res +} + +; CHECK: test_x86_fmsub_pd_y +; CHECK: vfmsub213pd %ymm2, %ymm0, %ymm1 +; CHECK: ret +define <4 x double> @test_x86_fmsub_pd_y(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2) { + %x = fmul <4 x double> %a0, %a1 + %res = fsub <4 x double> %x, %a2 + ret <4 x double> %res +} + +; CHECK: test_x86_fmsub_pd +; CHECK: vfmsub213pd %xmm2, %xmm0, %xmm1 +; CHECK: ret +define <2 x double> @test_x86_fmsub_pd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2) { + %x = fmul <2 x double> %a0, %a1 + %res = fsub <2 x double> %x, %a2 + ret <2 x double> %res +} + +; CHECK: test_x86_fnmadd_ss +; CHECK: vfnmadd213ss %xmm2, %xmm0, %xmm1 +; CHECK: ret +define float @test_x86_fnmadd_ss(float %a0, float %a1, float %a2) { + %x = fmul float %a0, %a1 + %res = fsub float %a2, %x + ret float %res +} + +; CHECK: test_x86_fnmadd_sd +; CHECK: vfnmadd213sd %xmm2, %xmm0, %xmm1 +; CHECK: ret +define double @test_x86_fnmadd_sd(double %a0, double %a1, double %a2) { + %x = fmul double %a0, %a1 + %res = fsub double %a2, %x + ret double %res +} + +; CHECK: test_x86_fmsub_sd +; CHECK: vfmsub213sd %xmm2, %xmm0, %xmm1 +; CHECK: ret +define double @test_x86_fmsub_sd(double %a0, double %a1, double %a2) { + %x = fmul double %a0, %a1 + %res = fsub double %x, %a2 + ret double %res +} + +; CHECK: test_x86_fnmsub_ss +; CHECK: vfnmsub213ss %xmm2, %xmm0, %xmm1 +; CHECK: ret +define float @test_x86_fnmsub_ss(float %a0, float %a1, float %a2) { + %x = fsub float -0.000000e+00, %a0 + %y = fmul float %x, %a1 + %res = fsub float %y, %a2 + ret float %res +}