From: Craig Topper Date: Fri, 31 Aug 2012 15:40:30 +0000 (+0000) Subject: Add support for converting llvm.fma to fma4 instructions. X-Git-Url: http://plrg.eecs.uci.edu/git/?a=commitdiff_plain;h=bf4043768c6726db523f99460645842e5024fc7f;p=oota-llvm.git Add support for converting llvm.fma to fma4 instructions. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@162999 91177308-0d34-0410-b5e6-96231b3b80d8 --- diff --git a/lib/Target/X86/Disassembler/X86Disassembler.cpp b/lib/Target/X86/Disassembler/X86Disassembler.cpp index 5039887e1a2..a54d35dda2a 100644 --- a/lib/Target/X86/Disassembler/X86Disassembler.cpp +++ b/lib/Target/X86/Disassembler/X86Disassembler.cpp @@ -379,6 +379,8 @@ static void translateImmediate(MCInst &mcInst, uint64_t immediate, } switch (type) { + case TYPE_XMM32: + case TYPE_XMM64: case TYPE_XMM128: mcInst.addOperand(MCOperand::CreateReg(X86::XMM0 + (immediate >> 4))); return; diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp index b2b30fe387c..f9184f693db 100644 --- a/lib/Target/X86/X86ISelLowering.cpp +++ b/lib/Target/X86/X86ISelLowering.cpp @@ -1052,7 +1052,7 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM) setOperationAction(ISD::VSELECT, MVT::v8i32, Legal); setOperationAction(ISD::VSELECT, MVT::v8f32, Legal); - if (Subtarget->hasFMA()) { + if (Subtarget->hasFMA() || Subtarget->hasFMA4()) { setOperationAction(ISD::FMA, MVT::v8f32, Custom); setOperationAction(ISD::FMA, MVT::v4f64, Custom); setOperationAction(ISD::FMA, MVT::v4f32, Custom); @@ -15606,7 +15606,8 @@ static SDValue PerformFMACombine(SDNode *N, SelectionDAG &DAG, return SDValue(); EVT ScalarVT = VT.getScalarType(); - if ((ScalarVT != MVT::f32 && ScalarVT != MVT::f64) || !Subtarget->hasFMA()) + if ((ScalarVT != MVT::f32 && ScalarVT != MVT::f64) || + (!Subtarget->hasFMA() && !Subtarget->hasFMA4())) return SDValue(); SDValue A = N->getOperand(0); @@ -15628,9 +15629,10 @@ static SDValue PerformFMACombine(SDNode *N, SelectionDAG &DAG, 
unsigned Opcode; if (!NegMul) - Opcode = (!NegC)? X86ISD::FMADD : X86ISD::FMSUB; + Opcode = (!NegC) ? X86ISD::FMADD : X86ISD::FMSUB; else - Opcode = (!NegC)? X86ISD::FNMADD : X86ISD::FNMSUB; + Opcode = (!NegC) ? X86ISD::FNMADD : X86ISD::FNMSUB; + return DAG.getNode(Opcode, dl, VT, A, B, C); } diff --git a/lib/Target/X86/X86InstrFMA.td b/lib/Target/X86/X86InstrFMA.td index 445bbaa4c17..f99440d973f 100644 --- a/lib/Target/X86/X86InstrFMA.td +++ b/lib/Target/X86/X86InstrFMA.td @@ -193,34 +193,57 @@ defm VFNMSUB : fma3s<0x9F, 0xAF, 0xBF, "vfnmsub", int_x86_fma_vfnmsub_ss, //===----------------------------------------------------------------------===// -multiclass fma4s opc, string OpcodeStr, Operand memop, - ComplexPattern mem_cpat, Intrinsic Int> { - def rr : FMA4 opc, string OpcodeStr, RegisterClass RC, + X86MemOperand x86memop, ValueType OpVT, SDNode OpNode, + PatFrag mem_frag> { + def rr : FMA4, VEX_W, MemOp4; - def rm : FMA4, VEX_W, MemOp4; + def rm : FMA4, VEX_W, MemOp4; - def mr : FMA4, VEX_W, MemOp4; + def mr : FMA4; + [(set RC:$dst, + (OpNode RC:$src1, (mem_frag addr:$src2), RC:$src3))]>; // For disassembler let isCodeGenOnly = 1 in - def rr_REV : FMA4; } +multiclass fma4s_int opc, string OpcodeStr, Operand memop, + ComplexPattern mem_cpat, Intrinsic Int> { + def rr_Int : FMA4, VEX_W, MemOp4; + def rm_Int : FMA4, VEX_W, MemOp4; + def mr_Int : FMA4; +} + multiclass fma4p opc, string OpcodeStr, SDNode OpNode, ValueType OpVT128, ValueType OpVT256, PatFrag ld_frag128, PatFrag ld_frag256> { @@ -277,34 +300,47 @@ let isCodeGenOnly = 1 in { let Predicates = [HasFMA4] in { -defm VFMADDSS4 : fma4s<0x6A, "vfmaddss", ssmem, sse_load_f32, - int_x86_fma_vfmadd_ss>; -defm VFMADDSD4 : fma4s<0x6B, "vfmaddsd", sdmem, sse_load_f64, - int_x86_fma_vfmadd_sd>; +defm VFMADDSS4 : fma4s<0x6A, "vfmaddss", FR32, f32mem, f32, X86Fmadd, loadf32>, + fma4s_int<0x6A, "vfmaddss", ssmem, sse_load_f32, + int_x86_fma_vfmadd_ss>; +defm VFMADDSD4 : fma4s<0x6B, "vfmaddsd", FR64, f64mem, f64, 
X86Fmadd, loadf64>, + fma4s_int<0x6B, "vfmaddsd", sdmem, sse_load_f64, + int_x86_fma_vfmadd_sd>; +defm VFMSUBSS4 : fma4s<0x6E, "vfmsubss", FR32, f32mem, f32, X86Fmsub, loadf32>, + fma4s_int<0x6E, "vfmsubss", ssmem, sse_load_f32, + int_x86_fma_vfmsub_ss>; +defm VFMSUBSD4 : fma4s<0x6F, "vfmsubsd", FR64, f64mem, f64, X86Fmsub, loadf64>, + fma4s_int<0x6F, "vfmsubsd", sdmem, sse_load_f64, + int_x86_fma_vfmsub_sd>; +defm VFNMADDSS4 : fma4s<0x7A, "vfnmaddss", FR32, f32mem, f32, + X86Fnmadd, loadf32>, + fma4s_int<0x7A, "vfnmaddss", ssmem, sse_load_f32, + int_x86_fma_vfnmadd_ss>; +defm VFNMADDSD4 : fma4s<0x7B, "vfnmaddsd", FR64, f64mem, f64, + X86Fnmadd, loadf64>, + fma4s_int<0x7B, "vfnmaddsd", sdmem, sse_load_f64, + int_x86_fma_vfnmadd_sd>; +defm VFNMSUBSS4 : fma4s<0x7E, "vfnmsubss", FR32, f32mem, f32, + X86Fnmsub, loadf32>, + fma4s_int<0x7E, "vfnmsubss", ssmem, sse_load_f32, + int_x86_fma_vfnmsub_ss>; +defm VFNMSUBSD4 : fma4s<0x7F, "vfnmsubsd", FR64, f64mem, f64, + X86Fnmsub, loadf64>, + fma4s_int<0x7F, "vfnmsubsd", sdmem, sse_load_f64, + int_x86_fma_vfnmsub_sd>; + defm VFMADDPS4 : fma4p<0x68, "vfmaddps", X86Fmadd, v4f32, v8f32, memopv4f32, memopv8f32>; defm VFMADDPD4 : fma4p<0x69, "vfmaddpd", X86Fmadd, v2f64, v4f64, memopv2f64, memopv4f64>; -defm VFMSUBSS4 : fma4s<0x6E, "vfmsubss", ssmem, sse_load_f32, - int_x86_fma_vfmsub_ss>; -defm VFMSUBSD4 : fma4s<0x6F, "vfmsubsd", sdmem, sse_load_f64, - int_x86_fma_vfmsub_sd>; defm VFMSUBPS4 : fma4p<0x6C, "vfmsubps", X86Fmsub, v4f32, v8f32, memopv4f32, memopv8f32>; defm VFMSUBPD4 : fma4p<0x6D, "vfmsubpd", X86Fmsub, v2f64, v4f64, memopv2f64, memopv4f64>; -defm VFNMADDSS4 : fma4s<0x7A, "vfnmaddss", ssmem, sse_load_f32, - int_x86_fma_vfnmadd_ss>; -defm VFNMADDSD4 : fma4s<0x7B, "vfnmaddsd", sdmem, sse_load_f64, - int_x86_fma_vfnmadd_sd>; defm VFNMADDPS4 : fma4p<0x78, "vfnmaddps", X86Fnmadd, v4f32, v8f32, memopv4f32, memopv8f32>; defm VFNMADDPD4 : fma4p<0x79, "vfnmaddpd", X86Fnmadd, v2f64, v4f64, memopv2f64, memopv4f64>; -defm VFNMSUBSS4 
: fma4s<0x7E, "vfnmsubss", ssmem, sse_load_f32, - int_x86_fma_vfnmsub_ss>; -defm VFNMSUBSD4 : fma4s<0x7F, "vfnmsubsd", sdmem, sse_load_f64, - int_x86_fma_vfnmsub_sd>; defm VFNMSUBPS4 : fma4p<0x7C, "vfnmsubps", X86Fnmsub, v4f32, v8f32, memopv4f32, memopv8f32>; defm VFNMSUBPD4 : fma4p<0x7D, "vfnmsubpd", X86Fnmsub, v2f64, v4f64, diff --git a/test/CodeGen/X86/fma_patterns.ll b/test/CodeGen/X86/fma_patterns.ll index b22d029a093..cc8a80ca3a6 100644 --- a/test/CodeGen/X86/fma_patterns.ll +++ b/test/CodeGen/X86/fma_patterns.ll @@ -1,9 +1,13 @@ ; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=core-avx2 -mattr=avx2,+fma -fp-contract=fast | FileCheck %s ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=bdver2 -mattr=-fma4 -fp-contract=fast | FileCheck %s +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=bdver1 -fp-contract=fast | FileCheck %s --check-prefix=CHECK_FMA4 ; CHECK: test_x86_fmadd_ps ; CHECK: vfmadd213ps %xmm2, %xmm0, %xmm1 ; CHECK: ret +; CHECK_FMA4: test_x86_fmadd_ps +; CHECK_FMA4: vfmaddps %xmm2, %xmm1, %xmm0, %xmm0 +; CHECK_FMA4: ret define <4 x float> @test_x86_fmadd_ps(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) { %x = fmul <4 x float> %a0, %a1 %res = fadd <4 x float> %x, %a2 @@ -13,6 +17,9 @@ define <4 x float> @test_x86_fmadd_ps(<4 x float> %a0, <4 x float> %a1, <4 x flo ; CHECK: test_x86_fmsub_ps ; CHECK: fmsub213ps %xmm2, %xmm0, %xmm1 ; CHECK: ret +; CHECK_FMA4: test_x86_fmsub_ps +; CHECK_FMA4: vfmsubps %xmm2, %xmm1, %xmm0, %xmm0 +; CHECK_FMA4: ret define <4 x float> @test_x86_fmsub_ps(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) { %x = fmul <4 x float> %a0, %a1 %res = fsub <4 x float> %x, %a2 @@ -22,6 +29,9 @@ define <4 x float> @test_x86_fmsub_ps(<4 x float> %a0, <4 x float> %a1, <4 x flo ; CHECK: test_x86_fnmadd_ps ; CHECK: fnmadd213ps %xmm2, %xmm0, %xmm1 ; CHECK: ret +; CHECK_FMA4: test_x86_fnmadd_ps +; CHECK_FMA4: vfnmaddps %xmm2, %xmm1, %xmm0, %xmm0 +; CHECK_FMA4: ret define <4 x float> @test_x86_fnmadd_ps(<4 x float> %a0, <4 
x float> %a1, <4 x float> %a2) { %x = fmul <4 x float> %a0, %a1 %res = fsub <4 x float> %a2, %x @@ -31,6 +41,9 @@ define <4 x float> @test_x86_fnmadd_ps(<4 x float> %a0, <4 x float> %a1, <4 x fl ; CHECK: test_x86_fnmsub_ps ; CHECK: fnmsub213ps %xmm2, %xmm0, %xmm1 ; CHECK: ret +; CHECK_FMA4: test_x86_fnmsub_ps +; CHECK_FMA4: fnmsubps %xmm2, %xmm1, %xmm0, %xmm0 +; CHECK_FMA4: ret define <4 x float> @test_x86_fnmsub_ps(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) { %x = fmul <4 x float> %a0, %a1 %y = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %x @@ -41,6 +54,9 @@ define <4 x float> @test_x86_fnmsub_ps(<4 x float> %a0, <4 x float> %a1, <4 x fl ; CHECK: test_x86_fmadd_ps_y ; CHECK: vfmadd213ps %ymm2, %ymm0, %ymm1 ; CHECK: ret +; CHECK_FMA4: test_x86_fmadd_ps_y +; CHECK_FMA4: vfmaddps %ymm2, %ymm1, %ymm0, %ymm0 +; CHECK_FMA4: ret define <8 x float> @test_x86_fmadd_ps_y(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2) { %x = fmul <8 x float> %a0, %a1 %res = fadd <8 x float> %x, %a2 @@ -50,6 +66,9 @@ define <8 x float> @test_x86_fmadd_ps_y(<8 x float> %a0, <8 x float> %a1, <8 x f ; CHECK: test_x86_fmsub_ps_y ; CHECK: vfmsub213ps %ymm2, %ymm0, %ymm1 ; CHECK: ret +; CHECK_FMA4: test_x86_fmsub_ps_y +; CHECK_FMA4: vfmsubps %ymm2, %ymm1, %ymm0, %ymm0 +; CHECK_FMA4: ret define <8 x float> @test_x86_fmsub_ps_y(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2) { %x = fmul <8 x float> %a0, %a1 %res = fsub <8 x float> %x, %a2 @@ -59,6 +78,9 @@ define <8 x float> @test_x86_fmsub_ps_y(<8 x float> %a0, <8 x float> %a1, <8 x f ; CHECK: test_x86_fnmadd_ps_y ; CHECK: vfnmadd213ps %ymm2, %ymm0, %ymm1 ; CHECK: ret +; CHECK_FMA4: test_x86_fnmadd_ps_y +; CHECK_FMA4: vfnmaddps %ymm2, %ymm1, %ymm0, %ymm0 +; CHECK_FMA4: ret define <8 x float> @test_x86_fnmadd_ps_y(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2) { %x = fmul <8 x float> %a0, %a1 %res = fsub <8 x float> %a2, %x @@ -78,6 +100,9 @@ define <8 x float> @test_x86_fnmsub_ps_y(<8 x float> %a0, <8 x float> %a1, <8 x ; 
CHECK: vfmadd213pd %ymm2, %ymm0, %ymm1 ; CHECK: ret +; CHECK_FMA4: test_x86_fmadd_pd_y +; CHECK_FMA4: vfmaddpd %ymm2, %ymm1, %ymm0, %ymm0 +; CHECK_FMA4: ret define <4 x double> @test_x86_fmadd_pd_y(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2) { %x = fmul <4 x double> %a0, %a1 %res = fadd <4 x double> %x, %a2 @@ -87,6 +112,9 @@ define <4 x double> @test_x86_fmadd_pd_y(<4 x double> %a0, <4 x double> %a1, <4 ; CHECK: test_x86_fmsub_pd_y ; CHECK: vfmsub213pd %ymm2, %ymm0, %ymm1 ; CHECK: ret +; CHECK_FMA4: test_x86_fmsub_pd_y +; CHECK_FMA4: vfmsubpd %ymm2, %ymm1, %ymm0, %ymm0 +; CHECK_FMA4: ret define <4 x double> @test_x86_fmsub_pd_y(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2) { %x = fmul <4 x double> %a0, %a1 %res = fsub <4 x double> %x, %a2 @@ -96,6 +124,9 @@ define <4 x double> @test_x86_fmsub_pd_y(<4 x double> %a0, <4 x double> %a1, <4 ; CHECK: test_x86_fmsub_pd ; CHECK: vfmsub213pd %xmm2, %xmm0, %xmm1 ; CHECK: ret +; CHECK_FMA4: test_x86_fmsub_pd +; CHECK_FMA4: vfmsubpd %xmm2, %xmm1, %xmm0, %xmm0 +; CHECK_FMA4: ret define <2 x double> @test_x86_fmsub_pd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2) { %x = fmul <2 x double> %a0, %a1 %res = fsub <2 x double> %x, %a2 @@ -105,6 +136,9 @@ define <2 x double> @test_x86_fmsub_pd(<2 x double> %a0, <2 x double> %a1, <2 x ; CHECK: test_x86_fnmadd_ss ; CHECK: vfnmadd213ss %xmm2, %xmm0, %xmm1 ; CHECK: ret +; CHECK_FMA4: test_x86_fnmadd_ss +; CHECK_FMA4: vfnmaddss %xmm2, %xmm1, %xmm0, %xmm0 +; CHECK_FMA4: ret define float @test_x86_fnmadd_ss(float %a0, float %a1, float %a2) { %x = fmul float %a0, %a1 %res = fsub float %a2, %x @@ -114,6 +148,9 @@ define float @test_x86_fnmadd_ss(float %a0, float %a1, float %a2) { ; CHECK: test_x86_fnmadd_sd ; CHECK: vfnmadd213sd %xmm2, %xmm0, %xmm1 ; CHECK: ret +; CHECK_FMA4: test_x86_fnmadd_sd +; CHECK_FMA4: vfnmaddsd %xmm2, %xmm1, %xmm0, %xmm0 +; CHECK_FMA4: ret define double @test_x86_fnmadd_sd(double %a0, double %a1, double %a2) { %x = fmul double %a0, %a1 %res = 
fsub double %a2, %x @@ -123,6 +160,9 @@ define double @test_x86_fnmadd_sd(double %a0, double %a1, double %a2) { ; CHECK: test_x86_fmsub_sd ; CHECK: vfmsub213sd %xmm2, %xmm0, %xmm1 ; CHECK: ret +; CHECK_FMA4: test_x86_fmsub_sd +; CHECK_FMA4: vfmsubsd %xmm2, %xmm1, %xmm0, %xmm0 +; CHECK_FMA4: ret define double @test_x86_fmsub_sd(double %a0, double %a1, double %a2) { %x = fmul double %a0, %a1 %res = fsub double %x, %a2 @@ -132,6 +172,9 @@ define double @test_x86_fmsub_sd(double %a0, double %a1, double %a2) { ; CHECK: test_x86_fnmsub_ss ; CHECK: vfnmsub213ss %xmm2, %xmm0, %xmm1 ; CHECK: ret +; CHECK_FMA4: test_x86_fnmsub_ss +; CHECK_FMA4: vfnmsubss %xmm2, %xmm1, %xmm0, %xmm0 +; CHECK_FMA4: ret define float @test_x86_fnmsub_ss(float %a0, float %a1, float %a2) { %x = fsub float -0.000000e+00, %a0 %y = fmul float %x, %a1 diff --git a/utils/TableGen/X86RecognizableInstr.cpp b/utils/TableGen/X86RecognizableInstr.cpp index 7ac2336d732..4b12279cdd0 100644 --- a/utils/TableGen/X86RecognizableInstr.cpp +++ b/utils/TableGen/X86RecognizableInstr.cpp @@ -1145,6 +1145,8 @@ OperandEncoding RecognizableInstr::immediateEncodingFromString // register IDs in 8-bit immediates nowadays. ENCODING("VR256", ENCODING_IB) ENCODING("VR128", ENCODING_IB) + ENCODING("FR32", ENCODING_IB) + ENCODING("FR64", ENCODING_IB) errs() << "Unhandled immediate encoding " << s << "\n"; llvm_unreachable("Unhandled immediate encoding"); }