Fix a number of problems with ARM fused multiply add/subtract instructions.

author Evan Cheng <evan.cheng@apple.com>

Wed, 11 Apr 2012 00:13:00 +0000 (00:13 +0000)

committer Evan Cheng <evan.cheng@apple.com>

Wed, 11 Apr 2012 00:13:00 +0000 (00:13 +0000)
author Evan Cheng <evan.cheng@apple.com>
Wed, 11 Apr 2012 00:13:00 +0000 (00:13 +0000)
committer Evan Cheng <evan.cheng@apple.com>
Wed, 11 Apr 2012 00:13:00 +0000 (00:13 +0000)
diff --git a/lib/Target/ARM/ARM.td b/lib/Target/ARM/ARM.td

index b05fe629b746004abf49d185391c84621dca1f88..85c41fc75d492c4e7515a48a16ef8bb5ea70abd9 100644 (file)
--- a/lib/Target/ARM/ARM.td
+++ b/lib/Target/ARM/ARM.td
@@ -76,8 +76,6 @@ def FeatureVMLxForwarding : SubtargetFeature<"vmlx-forwarding",
  def FeatureNEONForFP : SubtargetFeature<"neonfp", "UseNEONForSinglePrecisionFP",
                                          "true",
                                          "Use NEON for single precision FP">;
-// Allow more precision in FP computation
-def FPContractions : Predicate<"!TM.Options.NoExcessFPPrecision">;
  
  // Disable 32-bit to 16-bit narrowing for experimentation.
  def FeaturePref32BitThumb : SubtargetFeature<"32bit", "Pref32BitThumb", "true",
diff --git a/lib/Target/ARM/ARMInstrInfo.td b/lib/Target/ARM/ARMInstrInfo.td

index 6b8f4cc432781237d538a38f0a90a5b255ab9a2b..37284f979d4c13ae1cbf16b904b89ccb8e78956f 100644 (file)
--- a/lib/Target/ARM/ARMInstrInfo.td
+++ b/lib/Target/ARM/ARMInstrInfo.td
@@ -181,11 +181,11 @@ def HasVFP3          : Predicate<"Subtarget->hasVFP3()">,
                                   AssemblerPredicate<"FeatureVFP3">;
  def HasVFP4          : Predicate<"Subtarget->hasVFP4()">,
                                   AssemblerPredicate<"FeatureVFP4">;
-def NoVFP4            : Predicate<"!Subtarget->hasVFP4()">;
+def NoVFP4           : Predicate<"!Subtarget->hasVFP4()">;
  def HasNEON          : Predicate<"Subtarget->hasNEON()">,
                                   AssemblerPredicate<"FeatureNEON">;
  def HasNEON2         : Predicate<"Subtarget->hasNEON2()">,
-                                 AssemblerPredicate<"FeatureNEON2">;
+                                 AssemblerPredicate<"FeatureNEON,FeatureVFP4">;
  def NoNEON2          : Predicate<"!Subtarget->hasNEON2()">;
  def HasFP16          : Predicate<"Subtarget->hasFP16()">,
                                   AssemblerPredicate<"FeatureFP16">;
@@ -221,6 +221,9 @@ def UseMovt          : Predicate<"Subtarget->useMovt()">;
  def DontUseMovt      : Predicate<"!Subtarget->useMovt()">;
  def UseFPVMLx        : Predicate<"Subtarget->useFPVMLx()">;
  
+// Allow more precision in FP computation
+def FPContractions : Predicate<"!TM.Options.NoExcessFPPrecision">;
+
  //===----------------------------------------------------------------------===//
  // ARM Flag Definitions.
  
diff --git a/lib/Target/ARM/ARMInstrNEON.td b/lib/Target/ARM/ARMInstrNEON.td

index 99dbb95431ae9299b0c4bf225438250fbb16a3ae..501cc8f4db912ab2976258eb42a77d8c14df6439 100644 (file)
--- a/lib/Target/ARM/ARMInstrNEON.td
+++ b/lib/Target/ARM/ARMInstrNEON.td
@@ -4115,7 +4115,6 @@ defm VQDMLSL  : N3VLInt3_HS<0, 1, 0b1011, 0, IIC_VMACi16D, IIC_VMACi32D,
                              "vqdmlsl", "s", int_arm_neon_vqdmlsl>;
  defm VQDMLSLsl: N3VLInt3SL_HS<0, 0b111, "vqdmlsl", "s", int_arm_neon_vqdmlsl>;
  
-
  // Fused Vector Multiply-Accumulate and Fused Multiply-Subtract Operations.
  def  VFMAfd   : N3VDMulOp<0, 0, 0b00, 0b1100, 1, IIC_VFMACD, "vfma", "f32",
                            v2f32, fmul_su, fadd_mlx>,
@@ -4136,10 +4135,10 @@ def  VFMSfq   : N3VQMulOp<0, 0, 0b10, 0b1100, 1, IIC_VFMACQ, "vfms", "f32",
  // Match @llvm.fma.* intrinsics
  def : Pat<(fma (v2f32 DPR:$src1), (v2f32 DPR:$Vn), (v2f32 DPR:$Vm)),
            (VFMAfd DPR:$src1, DPR:$Vn, DPR:$Vm)>,
-          Requires<[HasNEON, HasVFP4]>;
+          Requires<[HasNEON2]>;
  def : Pat<(fma (v4f32 QPR:$src1), (v4f32 QPR:$Vn), (v4f32 QPR:$Vm)),
            (VFMAfq QPR:$src1, QPR:$Vn, QPR:$Vm)>,
-          Requires<[HasNEON, HasVFP4]>;
+          Requires<[HasNEON2]>;
  
  // Vector Subtract Operations.
  
@@ -5497,9 +5496,9 @@ def : N3VSMulOpPat<fmul, fadd, VMLAfd>,
  def : N3VSMulOpPat<fmul, fsub, VMLSfd>,
        Requires<[HasNEON, UseNEONForFP, UseFPVMLx, NoNEON2]>;
  def : N3VSMulOpPat<fmul, fadd, VFMAfd>,
-      Requires<[HasNEON2, UseNEONForFP,FPContractions]>;
+      Requires<[HasNEON2, UseNEONForFP, FPContractions]>;
  def : N3VSMulOpPat<fmul, fsub, VFMSfd>,
-      Requires<[HasNEON2, UseNEONForFP,FPContractions]>;
+      Requires<[HasNEON2, UseNEONForFP, FPContractions]>;
  def : N2VSPat<fabs, VABSfd>;
  def : N2VSPat<fneg, VNEGfd>;
  def : N3VSPat<NEONfmax, VMAXfd>;
diff --git a/lib/Target/ARM/ARMScheduleA8.td b/lib/Target/ARM/ARMScheduleA8.td

index 8d86c01dc7411042b107686458d214c2334272bb..8b1fb9386ad53c097835dbdd750153c3867c21cd 100644 (file)
--- a/lib/Target/ARM/ARMScheduleA8.td
+++ b/lib/Target/ARM/ARMScheduleA8.td
@@ -324,6 +324,15 @@ def CortexA8Itineraries : ProcessorItineraries<
                                 InstrStage<19, [A8_NPipe], 0>,
                                 InstrStage<19, [A8_NLSPipe]>], [19, 2, 1, 1]>,
    //
+  // Single-precision Fused FP MAC
+  InstrItinData<IIC_fpFMAC32, [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                               InstrStage<1, [A8_NPipe]>], [7, 2, 1, 1]>,
+  //
+  // Double-precision Fused FP MAC
+  InstrItinData<IIC_fpFMAC64, [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                               InstrStage<19, [A8_NPipe], 0>,
+                               InstrStage<19, [A8_NLSPipe]>], [19, 2, 1, 1]>,
+  //
    // Single-precision FP DIV
    InstrItinData<IIC_fpDIV32 , [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
                                 InstrStage<20, [A8_NPipe], 0>,
@@ -860,6 +869,16 @@ def CortexA8Itineraries : ProcessorItineraries<
    InstrItinData<IIC_VMACQ,    [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
                                 InstrStage<2, [A8_NPipe]>], [10, 3, 2, 2]>,
    //
+  // Double-register Fused FP Multiply-Accumulate
+  InstrItinData<IIC_VFMACD,   [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                               InstrStage<1, [A8_NPipe]>], [9, 3, 2, 2]>,
+  //
+  // Quad-register Fused FP Multiply-Accumulate
+  // Result written in N9, but that is relative to the last cycle of multicycle,
+  // so we use 10 for those cases
+  InstrItinData<IIC_VFMACQ,   [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
+                               InstrStage<2, [A8_NPipe]>], [10, 3, 2, 2]>,
+  //
    // Double-register Reciprical Step
    InstrItinData<IIC_VRECSD,   [InstrStage<1, [A8_Pipe0, A8_Pipe1], 0>,
                                 InstrStage<1, [A8_NPipe]>], [9, 2, 2]>,
diff --git a/lib/Target/ARM/ARMScheduleA9.td b/lib/Target/ARM/ARMScheduleA9.td

index 49fedf63f8bca209b925b0ac6d908deb31dad507..0d710cc1acee63d6bc9e092eee70b8a889ae5285 100644 (file)
--- a/lib/Target/ARM/ARMScheduleA9.td
+++ b/lib/Target/ARM/ARMScheduleA9.td
@@ -604,6 +604,22 @@ def CortexA9Itineraries : ProcessorItineraries<
                                 InstrStage<2,  [A9_NPipe]>],
                                [9, 1, 1, 1]>,
    //
+  // Single-precision Fused FP MAC
+  InstrItinData<IIC_fpFMAC32, [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsVFP], 0, Required>,
+                               InstrStage<9, [A9_DRegsN],   0, Reserved>,
+                               InstrStage<1, [A9_NPipe]>],
+                              [8, 1, 1, 1]>,
+  //
+  // Double-precision Fused FP MAC
+  InstrItinData<IIC_fpFMAC64, [InstrStage<1,  [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1,  [A9_MUX0], 0>,
+                               InstrStage<1,  [A9_DRegsVFP], 0, Required>,
+                               InstrStage<10, [A9_DRegsN],  0, Reserved>,
+                               InstrStage<2,  [A9_NPipe]>],
+                              [9, 1, 1, 1]>,
+  //
    // Single-precision FP DIV
    InstrItinData<IIC_fpDIV32 , [InstrStage<1,  [A9_Issue0, A9_Issue1], 0>,
                                 InstrStage<1,  [A9_MUX0], 0>,
@@ -1697,6 +1713,26 @@ def CortexA9Itineraries : ProcessorItineraries<
                                 InstrStage<4, [A9_NPipe]>],
                                [8, 4, 2, 1]>,
    //
+  // Double-register Fused FP Multiply-Accumulate
+  InstrItinData<IIC_VFMACD,   [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsN],   0, Required>,
+                               // Extra latency cycles since wbck is 7 cycles
+                               InstrStage<8, [A9_DRegsVFP], 0, Reserved>,
+                               InstrStage<2, [A9_NPipe]>],
+                              [6, 3, 2, 1]>,
+  //
+  // Quad-register Fused FP Multiply-Accumulate
+  // Result written in N9, but that is relative to the last cycle of multicycle,
+  // so we use 10 for those cases
+  InstrItinData<IIC_VFMACQ,   [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
+                               InstrStage<1, [A9_MUX0], 0>,
+                               InstrStage<1, [A9_DRegsN],   0, Required>,
+                               // Extra latency cycles since wbck is 9 cycles
+                               InstrStage<10, [A9_DRegsVFP], 0, Reserved>,
+                               InstrStage<4, [A9_NPipe]>],
+                              [8, 4, 2, 1]>,
+  //
    // Double-register Reciprical Step
    InstrItinData<IIC_VRECSD,   [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
                                 InstrStage<1, [A9_MUX0], 0>,
diff --git a/lib/Target/ARM/ARMScheduleV6.td b/lib/Target/ARM/ARMScheduleV6.td

index 4d959f565e0170d693b401c45fa25df69455efa4..0ace9bc1796d02ac3ad10147d53b33f8533c2d24 100644 (file)
--- a/lib/Target/ARM/ARMScheduleV6.td
+++ b/lib/Target/ARM/ARMScheduleV6.td
@@ -243,6 +243,12 @@ def ARMV6Itineraries : ProcessorItineraries<
    // Double-precision FP MAC
    InstrItinData<IIC_fpMAC64 , [InstrStage<2, [V6_Pipe]>], [9, 2, 2, 2]>,
    //
+  // Single-precision Fused FP MAC
+  InstrItinData<IIC_fpFMAC32, [InstrStage<1, [V6_Pipe]>], [9, 2, 2, 2]>,
+  //
+  // Double-precision Fused FP MAC
+  InstrItinData<IIC_fpFMAC64, [InstrStage<2, [V6_Pipe]>], [9, 2, 2, 2]>,
+  //
    // Single-precision FP DIV
    InstrItinData<IIC_fpDIV32 , [InstrStage<15, [V6_Pipe]>], [20, 2, 2]>,
    //
diff --git a/lib/Target/ARM/ARMSubtarget.h b/lib/Target/ARM/ARMSubtarget.h

index 3d9c03d5dd2275c8feeda75047c9d2086cfd6a13..5cf54b94a8a9564ca0ae495403b3d7daeb9128d7 100644 (file)
--- a/lib/Target/ARM/ARMSubtarget.h
+++ b/lib/Target/ARM/ARMSubtarget.h
@@ -45,7 +45,7 @@ protected:
    bool HasV6T2Ops;
    bool HasV7Ops;
  
-  /// HasVFPv2, HasVFPv3, HasVFPv4, HasNEON, HasNEONVFPv4 - Specify what
+  /// HasVFPv2, HasVFPv3, HasVFPv4, HasNEON, HasNEON2 - Specify what
    /// floating point ISAs are supported.
    bool HasVFPv2;
    bool HasVFPv3;
diff --git a/lib/Target/ARM/AsmParser/ARMAsmParser.cpp b/lib/Target/ARM/AsmParser/ARMAsmParser.cpp

index 34dadf88238ca23a421383665d1fe6c30dcfabe5..8fa7378ffff6fc974343b739f49c5b5a4d42eaec 100644 (file)
--- a/lib/Target/ARM/AsmParser/ARMAsmParser.cpp
+++ b/lib/Target/ARM/AsmParser/ARMAsmParser.cpp
@@ -4659,6 +4659,7 @@ StringRef ARMAsmParser::splitMnemonic(StringRef Mnemonic,
          Mnemonic == "fmrs" || Mnemonic == "fsqrts" || Mnemonic == "fsubs" ||
          Mnemonic == "fsts" || Mnemonic == "fcpys" || Mnemonic == "fdivs" ||
          Mnemonic == "fmuls" || Mnemonic == "fcmps" || Mnemonic == "fcmpzs" ||
+        Mnemonic == "vfms" || Mnemonic == "vfnms" ||
          (Mnemonic == "movs" && isThumb()))) {
      Mnemonic = Mnemonic.slice(0, Mnemonic.size() - 1);
      CarrySetting = true;
@@ -4702,6 +4703,7 @@ getMnemonicAcceptInfo(StringRef Mnemonic, bool &CanAcceptCarrySet,
        Mnemonic == "orr" || Mnemonic == "mvn" ||
        Mnemonic == "rsb" || Mnemonic == "rsc" || Mnemonic == "orn" ||
        Mnemonic == "sbc" || Mnemonic == "eor" || Mnemonic == "neg" ||
+      Mnemonic == "vfm" || Mnemonic == "vfnm" ||
        (!isThumb() && (Mnemonic == "smull" || Mnemonic == "mov" ||
                        Mnemonic == "mla" || Mnemonic == "smlal" ||
                        Mnemonic == "umlal" || Mnemonic == "umull"))) {
diff --git a/test/MC/ARM/vfp4.s b/test/MC/ARM/vfp4.s

new file mode 100644 (file)

index 0000000..009d31d
--- /dev/null
+++ b/test/MC/ARM/vfp4.s
@@ -0,0 +1,50 @@
+@ RUN: llvm-mc < %s -triple armv7-unknown-unknown -show-encoding -mattr=+neon,+vfp4   | FileCheck %s --check-prefix=ARM
+@ RUN: llvm-mc < %s -triple thumbv7-unknown-unknown -show-encoding -mattr=+neon,+vfp4 | FileCheck %s --check-prefix=THUMB
+
+@ ARM: vfma.f64 d16, d18, d17 @ encoding: [0xa1,0x0b,0xe2,0xee]
+@ THUMB: vfma.f64 d16, d18, d17 @ encoding: [0xe2,0xee,0xa1,0x0b]
+vfma.f64 d16, d18, d17
+
+@ ARM: vfma.f32 s2, s4, s0 @ encoding: [0x00,0x1a,0xa2,0xee]
+@ THUMB: vfma.f32 s2, s4, s0 @ encoding: [0xa2,0xee,0x00,0x1a]
+vfma.f32 s2, s4, s0
+
+@ ARM: vfma.f32 d16, d18, d17 @ encoding: [0xb1,0x0c,0x42,0xf2]
+@ THUMB: vfma.f32 d16, d18, d17 @ encoding: [0x42,0xef,0xb1,0x0c]
+vfma.f32 d16, d18, d17
+
+@ ARM: vfma.f32 q2, q4, q0 @ encoding: [0x50,0x4c,0x08,0xf2]
+@ THUMB: vfma.f32      q2, q4, q0 @ encoding: [0x08,0xef,0x50,0x4c]
+vfma.f32 q2, q4, q0
+
+@ ARM: vfnma.f64 d16, d18, d17 @ encoding: [0xe1,0x0b,0xd2,0xee]
+@ THUMB: vfnma.f64 d16, d18, d17 @ encoding: [0xd2,0xee,0xe1,0x0b]
+vfnma.f64 d16, d18, d17
+
+@ ARM: vfnma.f32 s2, s4, s0 @ encoding: [0x40,0x1a,0x92,0xee]
+@ THUMB: vfnma.f32 s2, s4, s0 @ encoding: [0x92,0xee,0x40,0x1a]
+vfnma.f32 s2, s4, s0
+
+@ ARM: vfms.f64 d16, d18, d17 @ encoding: [0xe1,0x0b,0xe2,0xee]
+@ THUMB: vfms.f64 d16, d18, d17 @ encoding: [0xe2,0xee,0xe1,0x0b]
+vfms.f64 d16, d18, d17
+
+@ ARM: vfms.f32 s2, s4, s0 @ encoding: [0x40,0x1a,0xa2,0xee]
+@ THUMB: vfms.f32 s2, s4, s0 @ encoding: [0xa2,0xee,0x40,0x1a]
+vfms.f32 s2, s4, s0
+
+@ ARM: vfms.f32 d16, d18, d17 @ encoding: [0xb1,0x0c,0x62,0xf2]
+@ THUMB: vfms.f32 d16, d18, d17 @ encoding: [0x62,0xef,0xb1,0x0c]
+vfms.f32 d16, d18, d17
+
+@ ARM: vfms.f32 q2, q4, q0 @ encoding: [0x50,0x4c,0x28,0xf2]
+@ THUMB: vfms.f32      q2, q4, q0 @ encoding: [0x28,0xef,0x50,0x4c]
+vfms.f32 q2, q4, q0
+
+@ ARM: vfnms.f64 d16, d18, d17 @ encoding: [0xa1,0x0b,0xd2,0xee]
+@ THUMB: vfnms.f64 d16, d18, d17 @ encoding: [0xd2,0xee,0xa1,0x0b]
+vfnms.f64 d16, d18, d17
+
+@ ARM: vfnms.f32 s2, s4, s0 @ encoding: [0x00,0x1a,0x92,0xee]
+@ THUMB: vfnms.f32 s2, s4, s0 @ encoding: [0x92,0xee,0x00,0x1a]
+vfnms.f32 s2, s4, s0
diff --git a/test/MC/Disassembler/ARM/vfp4.txt b/test/MC/Disassembler/ARM/vfp4.txt

new file mode 100644 (file)

index 0000000..4f2c732
--- /dev/null
+++ b/test/MC/Disassembler/ARM/vfp4.txt
@@ -0,0 +1,37 @@
+# RUN: llvm-mc < %s -triple thumbv7-unknown-unknown --disassemble -mattr=+neon,+vfp4 | FileCheck %s
+
+# CHECK: vfma.f64 d16, d18, d17
+0xe2 0xee 0xa1 0x0b
+
+# CHECK: vfma.f32 s2, s4, s0
+0xa2 0xee 0x00 0x1a
+
+# CHECK: vfma.f32 d16, d18, d17
+0x42 0xef 0xb1 0x0c
+
+# CHECK: vfma.f32 q2, q4, q0
+0x08 0xef 0x50 0x4c
+
+# CHECK: vfnms.f64 d16, d18, d17
+0xd2 0xee 0xa1 0x0b
+
+# CHECK: vfnms.f32 s2, s4, s0
+0x92 0xee 0x00 0x1a
+
+# CHECK: vfms.f64 d16, d18, d17
+0xe2 0xee 0xe1 0x0b
+
+# CHECK: vfms.f32 s2, s4, s0
+0xa2 0xee 0x40 0x1a
+
+# CHECK: vfms.f32 d16, d18, d17
+0x62 0xef 0xb1 0x0c
+
+# CHECK: vfms.f32 q2, q4, q0
+0x28 0xef 0x50 0x4c
+
+# CHECK: vfnma.f64 d16, d18, d17
+0xd2 0xee 0xe1 0x0b
+
+# CHECK: vfnma.f32 s2, s4, s0
+0x92 0xee 0x40 0x1a
author	Evan Cheng <evan.cheng@apple.com>
	Wed, 11 Apr 2012 00:13:00 +0000 (00:13 +0000)
committer	Evan Cheng <evan.cheng@apple.com>
	Wed, 11 Apr 2012 00:13:00 +0000 (00:13 +0000)
lib/Target/ARM/ARM.td		patch \| blob \| history
lib/Target/ARM/ARMInstrInfo.td		patch \| blob \| history
lib/Target/ARM/ARMInstrNEON.td		patch \| blob \| history
lib/Target/ARM/ARMScheduleA8.td		patch \| blob \| history
lib/Target/ARM/ARMScheduleA9.td		patch \| blob \| history
lib/Target/ARM/ARMScheduleV6.td		patch \| blob \| history
lib/Target/ARM/ARMSubtarget.h		patch \| blob \| history
lib/Target/ARM/AsmParser/ARMAsmParser.cpp		patch \| blob \| history
test/MC/ARM/vfp4.s	[new file with mode: 0644]	patch \| blob
test/MC/Disassembler/ARM/vfp4.txt	[new file with mode: 0644]	patch \| blob