From: Sebastian Pop Date: Mon, 5 Mar 2012 17:39:52 +0000 (+0000) Subject: updated patch for the ARM fused multiply add/sub X-Git-Url: http://plrg.eecs.uci.edu/git/?a=commitdiff_plain;h=74bebde7c4e2d1cfd4a16c19ce3c87521df67639;p=oota-llvm.git updated patch for the ARM fused multiply add/sub In this update: - I assumed neon2 does not imply vfpv4, but neon and vfpv4 imply neon2. - I kept setting .fpu=neon-vfpv4 code attribute because that is what the assembler understands. Patch by Ana Pazos git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@152036 91177308-0d34-0410-b5e6-96231b3b80d8 --- diff --git a/lib/Target/ARM/ARM.td b/lib/Target/ARM/ARM.td index 16af8cfbe2c..b05fe629b74 100644 --- a/lib/Target/ARM/ARM.td +++ b/lib/Target/ARM/ARM.td @@ -38,9 +38,9 @@ def FeatureVFP4 : SubtargetFeature<"vfp4", "HasVFPv4", "true", def FeatureNEON : SubtargetFeature<"neon", "HasNEON", "true", "Enable NEON instructions", [FeatureVFP3]>; -def FeatureNEONVFP4 : SubtargetFeature<"neon-vfpv4", "HasNEONVFPv4", "true", - "Enable NEON-VFP4 instructions", - [FeatureVFP4, FeatureNEON]>; +def FeatureNEON2 : SubtargetFeature<"neon2", "HasNEON2", "true", + "Enable Advanced SIMD2 instructions", + [FeatureNEON]>; def FeatureThumb2 : SubtargetFeature<"thumb2", "HasThumb2", "true", "Enable Thumb2 instructions">; def FeatureNoARM : SubtargetFeature<"noarm", "NoARM", "true", @@ -76,6 +76,8 @@ def FeatureVMLxForwarding : SubtargetFeature<"vmlx-forwarding", def FeatureNEONForFP : SubtargetFeature<"neonfp", "UseNEONForSinglePrecisionFP", "true", "Use NEON for single precision FP">; +// Allow more precision in FP computation +def FPContractions : Predicate<"!TM.Options.NoExcessFPPrecision">; // Disable 32-bit to 16-bit narrowing for experimentation. def FeaturePref32BitThumb : SubtargetFeature<"32bit", "Pref32BitThumb", "true", diff --git a/lib/Target/ARM/ARMAsmPrinter.cpp b/lib/Target/ARM/ARMAsmPrinter.cpp index 9d8c97a8323..4ec19ccb42e 100644 --- a/lib/Target/ARM/ARMAsmPrinter.cpp +++ b/lib/Target/ARM/ARMAsmPrinter.cpp @@ -732,10 +732,10 @@ void ARMAsmPrinter::emitAttributes() { if (Subtarget->hasNEON() && emitFPU) { /* NEON is not exactly a VFP architecture, but GAS emit one of * neon/neon-vfpv4/vfpv3/vfpv2 for .fpu parameters */ - if (Subtarget->hasNEONVFP4()) + if (Subtarget->hasNEON2()) AttrEmitter->EmitTextAttribute(ARMBuildAttrs::Advanced_SIMD_arch, "neon-vfpv4"); else - AttrEmitter->EmitTextAttribute(ARMBuildAttrs::Advanced_SIMD_arch, "neon"); + AttrEmitter->EmitTextAttribute(ARMBuildAttrs::Advanced_SIMD_arch, "neon"); /* If emitted for NEON, omit from VFP below, since you can have both * NEON and VFP in build attributes but only one .fpu */ emitFPU = false; diff --git a/lib/Target/ARM/ARMInstrInfo.td b/lib/Target/ARM/ARMInstrInfo.td index 6f510ba665d..0b1406e657a 100644 --- a/lib/Target/ARM/ARMInstrInfo.td +++ b/lib/Target/ARM/ARMInstrInfo.td @@ -184,9 +184,9 @@ def HasVFP4 : Predicate<"Subtarget->hasVFP4()">, def NoVFP4 : Predicate<"!Subtarget->hasVFP4()">; def HasNEON : Predicate<"Subtarget->hasNEON()">, AssemblerPredicate<"FeatureNEON">; -def HasNEONVFP4 : Predicate<"Subtarget->hasNEONVFP4()">, - AssemblerPredicate<"FeatureNEONVFP4">; -def NoNEONVFP4 : Predicate<"!Subtarget->hasNEONVFP4()">; +def HasNEON2 : Predicate<"Subtarget->hasNEON2()">, + AssemblerPredicate<"FeatureNEON2">; +def NoNEON2 : Predicate<"!Subtarget->hasNEON2()">; def HasFP16 : Predicate<"Subtarget->hasFP16()">, AssemblerPredicate<"FeatureFP16">; def HasDivide : Predicate<"Subtarget->hasDivide()">, diff --git a/lib/Target/ARM/ARMInstrNEON.td b/lib/Target/ARM/ARMInstrNEON.td index 387e16d1462..17fe80851c0 100644 --- a/lib/Target/ARM/ARMInstrNEON.td +++ b/lib/Target/ARM/ARMInstrNEON.td @@ -4060,10 +4060,10 @@ defm VMLA : N3VMulOp_QHS<0, 0, 0b1001, 0, IIC_VMACi16D, IIC_VMACi32D, IIC_VMACi16Q, IIC_VMACi32Q, "vmla", "i", add>; def VMLAfd : N3VDMulOp<0, 0, 0b00, 0b1101, 1, IIC_VMACD, "vmla", "f32", v2f32, fmul_su, fadd_mlx>, - Requires<[HasNEON, UseFPVMLx, NoNEONVFP4]>; + Requires<[HasNEON, UseFPVMLx, NoNEON2]>; def VMLAfq : N3VQMulOp<0, 0, 0b00, 0b1101, 1, IIC_VMACQ, "vmla", "f32", v4f32, fmul_su, fadd_mlx>, - Requires<[HasNEON, UseFPVMLx, NoNEONVFP4]>; + Requires<[HasNEON, UseFPVMLx, NoNEON2]>; defm VMLAsl : N3VMulOpSL_HS<0b0000, IIC_VMACi16D, IIC_VMACi32D, IIC_VMACi16Q, IIC_VMACi32Q, "vmla", "i", add>; def VMLAslfd : N3VDMulOpSL<0b10, 0b0001, IIC_VMACD, "vmla", "f32", @@ -4118,10 +4118,10 @@ defm VMLS : N3VMulOp_QHS<1, 0, 0b1001, 0, IIC_VMACi16D, IIC_VMACi32D, IIC_VMACi16Q, IIC_VMACi32Q, "vmls", "i", sub>; def VMLSfd : N3VDMulOp<0, 0, 0b10, 0b1101, 1, IIC_VMACD, "vmls", "f32", v2f32, fmul_su, fsub_mlx>, - Requires<[HasNEON, UseFPVMLx, NoNEONVFP4]>; + Requires<[HasNEON, UseFPVMLx, NoNEON2]>; def VMLSfq : N3VQMulOp<0, 0, 0b10, 0b1101, 1, IIC_VMACQ, "vmls", "f32", v4f32, fmul_su, fsub_mlx>, - Requires<[HasNEON, UseFPVMLx, NoNEONVFP4]>; + Requires<[HasNEON, UseFPVMLx, NoNEON2]>; defm VMLSsl : N3VMulOpSL_HS<0b0100, IIC_VMACi16D, IIC_VMACi32D, IIC_VMACi16Q, IIC_VMACi32Q, "vmls", "i", sub>; def VMLSslfd : N3VDMulOpSL<0b10, 0b0101, IIC_VMACD, "vmls", "f32", @@ -4174,19 +4174,19 @@ defm VQDMLSLsl: N3VLInt3SL_HS<0, 0b111, "vqdmlsl", "s", int_arm_neon_vqdmlsl>; // Fused Vector Multiply-Accumulate and Fused Multiply-Subtract Operations. def VFMAfd : N3VDMulOp<0, 0, 0b00, 0b1100, 1, IIC_VFMACD, "vfma", "f32", v2f32, fmul_su, fadd_mlx>, - Requires<[HasNEONVFP4]>; + Requires<[HasNEON2,FPContractions]>; def VFMAfq : N3VQMulOp<0, 0, 0b00, 0b1100, 1, IIC_VFMACQ, "vfma", "f32", v4f32, fmul_su, fadd_mlx>, - Requires<[HasNEONVFP4]>; + Requires<[HasNEON2,FPContractions]>; // Fused Vector Multiply Subtract (floating-point) def VFMSfd : N3VDMulOp<0, 0, 0b10, 0b1100, 1, IIC_VFMACD, "vfms", "f32", v2f32, fmul_su, fsub_mlx>, - Requires<[HasNEONVFP4]>; + Requires<[HasNEON2,FPContractions]>; def VFMSfq : N3VQMulOp<0, 0, 0b10, 0b1100, 1, IIC_VFMACQ, "vfms", "f32", v4f32, fmul_su, fsub_mlx>, - Requires<[HasNEONVFP4]>; + Requires<[HasNEON2,FPContractions]>; // Vector Subtract Operations. @@ -5541,13 +5541,13 @@ def : N3VSPat; def : N3VSPat; def : N3VSPat; def : N3VSMulOpPat, - Requires<[HasNEON, UseNEONForFP, UseFPVMLx, NoNEONVFP4]>; + Requires<[HasNEON, UseNEONForFP, UseFPVMLx, NoNEON2]>; def : N3VSMulOpPat, - Requires<[HasNEON, UseNEONForFP, UseFPVMLx, NoNEONVFP4]>; + Requires<[HasNEON, UseNEONForFP, UseFPVMLx, NoNEON2]>; def : N3VSMulOpPat, - Requires<[HasNEONVFP4, UseNEONForFP]>; + Requires<[HasNEON2, UseNEONForFP,FPContractions]>; def : N3VSMulOpPat, - Requires<[HasNEONVFP4, UseNEONForFP]>; + Requires<[HasNEON2, UseNEONForFP,FPContractions]>; def : N2VSPat; def : N2VSPat; def : N3VSPat; diff --git a/lib/Target/ARM/ARMInstrVFP.td b/lib/Target/ARM/ARMInstrVFP.td index bf32b49640f..aa10af756d7 100644 --- a/lib/Target/ARM/ARMInstrVFP.td +++ b/lib/Target/ARM/ARMInstrVFP.td @@ -1030,7 +1030,7 @@ def VFMAD : ADbI<0b11101, 0b10, 0, 0, [(set DPR:$Dd, (fadd_mlx (fmul_su DPR:$Dn, DPR:$Dm), (f64 DPR:$Ddin)))]>, RegConstraint<"$Ddin = $Dd">, - Requires<[HasVFP4]>; + Requires<[HasVFP4,FPContractions]>; def VFMAS : ASbIn<0b11101, 0b10, 0, 0, (outs SPR:$Sd), (ins SPR:$Sdin, SPR:$Sn, SPR:$Sm), @@ -1038,17 +1038,17 @@ def VFMAS : ASbIn<0b11101, 0b10, 0, 0, [(set SPR:$Sd, (fadd_mlx (fmul_su SPR:$Sn, SPR:$Sm), SPR:$Sdin))]>, RegConstraint<"$Sdin = $Sd">, - Requires<[HasVFP4,DontUseNEONForFP]> { + Requires<[HasVFP4,DontUseNEONForFP,FPContractions]> { // Some single precision VFP instructions may be executed on both NEON and // VFP pipelines. } def : Pat<(fadd_mlx DPR:$dstin, (fmul_su DPR:$a, (f64 DPR:$b))), (VFMAD DPR:$dstin, DPR:$a, DPR:$b)>, - Requires<[HasVFP4]>; + Requires<[HasVFP4,FPContractions]>; def : Pat<(fadd_mlx SPR:$dstin, (fmul_su SPR:$a, SPR:$b)), (VFMAS SPR:$dstin, SPR:$a, SPR:$b)>, - Requires<[HasVFP4,DontUseNEONForFP]>; + Requires<[HasVFP4,DontUseNEONForFP,FPContractions]>; def VFMSD : ADbI<0b11101, 0b10, 1, 0, (outs DPR:$Dd), (ins DPR:$Ddin, DPR:$Dn, DPR:$Dm), @@ -1056,7 +1056,7 @@ def VFMSD : ADbI<0b11101, 0b10, 1, 0, [(set DPR:$Dd, (fadd_mlx (fneg (fmul_su DPR:$Dn,DPR:$Dm)), (f64 DPR:$Ddin)))]>, RegConstraint<"$Ddin = $Dd">, - Requires<[HasVFP4]>; + Requires<[HasVFP4,FPContractions]>; def VFMSS : ASbIn<0b11101, 0b10, 1, 0, (outs SPR:$Sd), (ins SPR:$Sdin, SPR:$Sn, SPR:$Sm), @@ -1064,17 +1064,17 @@ def VFMSS : ASbIn<0b11101, 0b10, 1, 0, [(set SPR:$Sd, (fadd_mlx (fneg (fmul_su SPR:$Sn, SPR:$Sm)), SPR:$Sdin))]>, RegConstraint<"$Sdin = $Sd">, - Requires<[HasVFP4,DontUseNEONForFP]> { + Requires<[HasVFP4,DontUseNEONForFP,FPContractions]> { // Some single precision VFP instructions may be executed on both NEON and // VFP pipelines. } def : Pat<(fsub_mlx DPR:$dstin, (fmul_su DPR:$a, (f64 DPR:$b))), (VFMSD DPR:$dstin, DPR:$a, DPR:$b)>, - Requires<[HasVFP4]>; + Requires<[HasVFP4,FPContractions]>; def : Pat<(fsub_mlx SPR:$dstin, (fmul_su SPR:$a, SPR:$b)), (VFMSS SPR:$dstin, SPR:$a, SPR:$b)>, - Requires<[HasVFP4,DontUseNEONForFP]>; + Requires<[HasVFP4,DontUseNEONForFP,FPContractions]>; def VFNMAD : ADbI<0b11101, 0b01, 1, 0, (outs DPR:$Dd), (ins DPR:$Ddin, DPR:$Dn, DPR:$Dm), @@ -1082,7 +1082,7 @@ def VFNMAD : ADbI<0b11101, 0b01, 1, 0, [(set DPR:$Dd,(fsub_mlx (fneg (fmul_su DPR:$Dn,DPR:$Dm)), (f64 DPR:$Ddin)))]>, RegConstraint<"$Ddin = $Dd">, - Requires<[HasVFP4]>; + Requires<[HasVFP4,FPContractions]>; def VFNMAS : ASbI<0b11101, 0b01, 1, 0, (outs SPR:$Sd), (ins SPR:$Sdin, SPR:$Sn, SPR:$Sm), @@ -1090,17 +1090,17 @@ def VFNMAS : ASbI<0b11101, 0b01, 1, 0, [(set SPR:$Sd, (fsub_mlx (fneg (fmul_su SPR:$Sn, SPR:$Sm)), SPR:$Sdin))]>, RegConstraint<"$Sdin = $Sd">, - Requires<[HasVFP4,DontUseNEONForFP]> { + Requires<[HasVFP4,DontUseNEONForFP,FPContractions]> { // Some single precision VFP instructions may be executed on both NEON and // VFP pipelines. } def : Pat<(fsub_mlx (fneg (fmul_su DPR:$a, (f64 DPR:$b))), DPR:$dstin), (VFNMAD DPR:$dstin, DPR:$a, DPR:$b)>, - Requires<[HasVFP4]>; + Requires<[HasVFP4,FPContractions]>; def : Pat<(fsub_mlx (fneg (fmul_su SPR:$a, SPR:$b)), SPR:$dstin), (VFNMAS SPR:$dstin, SPR:$a, SPR:$b)>, - Requires<[HasVFP4,DontUseNEONForFP]>; + Requires<[HasVFP4,DontUseNEONForFP,FPContractions]>; def VFNMSD : ADbI<0b11101, 0b01, 0, 0, (outs DPR:$Dd), (ins DPR:$Ddin, DPR:$Dn, DPR:$Dm), @@ -1108,24 +1108,24 @@ def VFNMSD : ADbI<0b11101, 0b01, 0, 0, [(set DPR:$Dd, (fsub_mlx (fmul_su DPR:$Dn, DPR:$Dm), (f64 DPR:$Ddin)))]>, RegConstraint<"$Ddin = $Dd">, - Requires<[HasVFP4]>; + Requires<[HasVFP4,FPContractions]>; def VFNMSS : ASbI<0b11101, 0b01, 0, 0, (outs SPR:$Sd), (ins SPR:$Sdin, SPR:$Sn, SPR:$Sm), IIC_fpFMAC32, "vfnms", ".f32\t$Sd, $Sn, $Sm", [(set SPR:$Sd, (fsub_mlx (fmul_su SPR:$Sn, SPR:$Sm), SPR:$Sdin))]>, RegConstraint<"$Sdin = $Sd">, - Requires<[HasVFP4,DontUseNEONForFP]> { + Requires<[HasVFP4,DontUseNEONForFP,FPContractions]> { // Some single precision VFP instructions may be executed on both NEON and // VFP pipelines. } def : Pat<(fsub_mlx (fmul_su DPR:$a, (f64 DPR:$b)), DPR:$dstin), (VFNMSD DPR:$dstin, DPR:$a, DPR:$b)>, - Requires<[HasVFP4]>; + Requires<[HasVFP4,FPContractions]>; def : Pat<(fsub_mlx (fmul_su SPR:$a, SPR:$b), SPR:$dstin), (VFNMSS SPR:$dstin, SPR:$a, SPR:$b)>, - Requires<[HasVFP4,DontUseNEONForFP]>; + Requires<[HasVFP4,DontUseNEONForFP,FPContractions]>; //===----------------------------------------------------------------------===// // FP Conditional moves. diff --git a/lib/Target/ARM/ARMSubtarget.cpp b/lib/Target/ARM/ARMSubtarget.cpp index 64b2b620eb4..1bd6f1cbdf5 100644 --- a/lib/Target/ARM/ARMSubtarget.cpp +++ b/lib/Target/ARM/ARMSubtarget.cpp @@ -49,7 +49,7 @@ ARMSubtarget::ARMSubtarget(const std::string &TT, const std::string &CPU, , HasVFPv3(false) , HasVFPv4(false) , HasNEON(false) - , HasNEONVFPv4(false) + , HasNEON2(false) , UseNEONForSinglePrecisionFP(false) , SlowFPVMLx(false) , HasVMLxForwarding(false) diff --git a/lib/Target/ARM/ARMSubtarget.h b/lib/Target/ARM/ARMSubtarget.h index c94795f753f..3d9c03d5dd2 100644 --- a/lib/Target/ARM/ARMSubtarget.h +++ b/lib/Target/ARM/ARMSubtarget.h @@ -51,7 +51,7 @@ protected: bool HasVFPv3; bool HasVFPv4; bool HasNEON; - bool HasNEONVFPv4; + bool HasNEON2; /// UseNEONForSinglePrecisionFP - if the NEONFP attribute has been /// specified. Use the method useNEONForSinglePrecisionFP() to @@ -205,7 +205,7 @@ protected: bool hasVFP3() const { return HasVFPv3; } bool hasVFP4() const { return HasVFPv4; } bool hasNEON() const { return HasNEON; } - bool hasNEONVFP4() const { return HasNEONVFPv4; } + bool hasNEON2() const { return HasNEON2 || (HasNEON && HasVFPv4); } bool useNEONForSinglePrecisionFP() const { return hasNEON() && UseNEONForSinglePrecisionFP; } diff --git a/test/CodeGen/ARM/fusedMAC.ll b/test/CodeGen/ARM/fusedMAC.ll index d7b8ed165b4..40e8bb2f00f 100644 --- a/test/CodeGen/ARM/fusedMAC.ll +++ b/test/CodeGen/ARM/fusedMAC.ll @@ -1,7 +1,7 @@ -; RUN: llc < %s -march=arm -mattr=+neon-vfpv4 | FileCheck %s +; RUN: llc < %s -march=arm -mattr=+neon,+vfp4 | FileCheck %s ; Check generated fused MAC and MLS. -define double @fusedMACTest1(double %d1, double %d2, double %d3) nounwind readnone noinline { +define double @fusedMACTest1(double %d1, double %d2, double %d3) { ;CHECK: fusedMACTest1: ;CHECK: vfma.f64 %1 = fmul double %d1, %d2 @@ -9,7 +9,7 @@ define double @fusedMACTest1(double %d1, double %d2, double %d3) nounwind readno ret double %2 } -define float @fusedMACTest2(float %f1, float %f2, float %f3) nounwind readnone noinline { +define float @fusedMACTest2(float %f1, float %f2, float %f3) { ;CHECK: fusedMACTest2: ;CHECK: vfma.f32 %1 = fmul float %f1, %f2 @@ -17,7 +17,7 @@ define float @fusedMACTest2(float %f1, float %f2, float %f3) nounwind readnone n ret float %2 } -define double @fusedMACTest3(double %d1, double %d2, double %d3) nounwind readnone noinline { +define double @fusedMACTest3(double %d1, double %d2, double %d3) { ;CHECK: fusedMACTest3: ;CHECK: vfms.f64 %1 = fmul double %d2, %d3 @@ -25,7 +25,7 @@ define double @fusedMACTest3(double %d1, double %d2, double %d3) nounwind readno ret double %2 } -define float @fusedMACTest4(float %f1, float %f2, float %f3) nounwind readnone noinline { +define float @fusedMACTest4(float %f1, float %f2, float %f3) { ;CHECK: fusedMACTest4: ;CHECK: vfms.f32 %1 = fmul float %f2, %f3 @@ -33,7 +33,7 @@ define float @fusedMACTest4(float %f1, float %f2, float %f3) nounwind readnone n ret float %2 } -define double @fusedMACTest5(double %d1, double %d2, double %d3) nounwind readnone noinline { +define double @fusedMACTest5(double %d1, double %d2, double %d3) { ;CHECK: fusedMACTest5: ;CHECK: vfnma.f64 %1 = fmul double %d1, %d2 @@ -42,7 +42,7 @@ define double @fusedMACTest5(double %d1, double %d2, double %d3) nounwind readno ret double %3 } -define float @fusedMACTest6(float %f1, float %f2, float %f3) nounwind { +define float @fusedMACTest6(float %f1, float %f2, float %f3) { ;CHECK: fusedMACTest6: ;CHECK: vfnma.f32 %1 = fmul float %f1, %f2 @@ -51,7 +51,7 @@ define float @fusedMACTest6(float %f1, float %f2, float %f3) nounwind { ret float %3 } -define double @fusedMACTest7(double %d1, double %d2, double %d3) nounwind { +define double @fusedMACTest7(double %d1, double %d2, double %d3) { ;CHECK: fusedMACTest7: ;CHECK: vfnms.f64 %1 = fmul double %d1, %d2 @@ -59,10 +59,42 @@ define double @fusedMACTest7(double %d1, double %d2, double %d3) nounwind { ret double %2 } -define float @fusedMACTest8(float %f1, float %f2, float %f3) nounwind { +define float @fusedMACTest8(float %f1, float %f2, float %f3) { ;CHECK: fusedMACTest8: ;CHECK: vfnms.f32 %1 = fmul float %f1, %f2 %2 = fsub float %1, %f3 ret float %2 } + +define <2 x float> @fusedMACTest9(<2 x float> %a, <2 x float> %b) { +;CHECK: fusedMACTest9: +;CHECK: vfma.f32 + %mul = fmul <2 x float> %a, %b + %add = fadd <2 x float> %mul, %a + ret <2 x float> %add +} + +define <2 x float> @fusedMACTest10(<2 x float> %a, <2 x float> %b) { +;CHECK: fusedMACTest10: +;CHECK: vfms.f32 + %mul = fmul <2 x float> %a, %b + %sub = fsub <2 x float> %a, %mul + ret <2 x float> %sub +} + +define <4 x float> @fusedMACTest11(<4 x float> %a, <4 x float> %b) { +;CHECK: fusedMACTest11: +;CHECK: vfma.f32 + %mul = fmul <4 x float> %a, %b + %add = fadd <4 x float> %mul, %a + ret <4 x float> %add +} + +define <4 x float> @fusedMACTest12(<4 x float> %a, <4 x float> %b) { +;CHECK: fusedMACTest12: +;CHECK: vfms.f32 + %mul = fmul <4 x float> %a, %b + %sub = fsub <4 x float> %a, %mul + ret <4 x float> %sub +}