Distribute (A + B) * C to (A * C) + (B * C) to make use of NEON multiplier accumulator forwarding

author Evan Cheng <evan.cheng@apple.com>

Thu, 31 Mar 2011 19:38:48 +0000 (19:38 +0000)

committer Evan Cheng <evan.cheng@apple.com>

Thu, 31 Mar 2011 19:38:48 +0000 (19:38 +0000)
author Evan Cheng <evan.cheng@apple.com>
Thu, 31 Mar 2011 19:38:48 +0000 (19:38 +0000)
committer Evan Cheng <evan.cheng@apple.com>
Thu, 31 Mar 2011 19:38:48 +0000 (19:38 +0000)
diff --git a/lib/Target/ARM/ARM.td b/lib/Target/ARM/ARM.td

index bf4315fc6c3e93ba606eb3f9081f4b8de4c12fb5..e690e18672987ce2ece66ecc6c980c571422addd 100644 (file)
--- a/lib/Target/ARM/ARM.td
+++ b/lib/Target/ARM/ARM.td
@@ -51,6 +51,12 @@ def FeatureVFPOnlySP : SubtargetFeature<"fp-only-sp", "FPOnlySP", "true",
  // to just not use them.
  def FeatureHasSlowFPVMLx : SubtargetFeature<"slowfpvmlx", "SlowFPVMLx", "true",
                                           "Disable VFP / NEON MAC instructions">;
+
+// Cortex-A8 / A9 Advanced SIMD has multiplier accumulator forwarding.
+def FeatureVMLxForwarding : SubtargetFeature<"vmlx-forwarding",
+                                       "HasVMLxForwarding", "true",
+                                       "Has multiplier accumulator forwarding">;
+
  // Some processors benefit from using NEON instructions for scalar
  // single-precision FP operations.
  def FeatureNEONForFP : SubtargetFeature<"neonfp", "UseNEONForSinglePrecisionFP",
@@ -100,11 +106,12 @@ def ProcOthers  : SubtargetFeature<"others", "ARMProcFamily", "Others",
  def ProcA8      : SubtargetFeature<"a8", "ARMProcFamily", "CortexA8",
                                     "Cortex-A8 ARM processors",
                                     [FeatureSlowFPBrcc, FeatureNEONForFP,
-                                    FeatureHasSlowFPVMLx, FeatureT2XtPk]>;
+                                    FeatureHasSlowFPVMLx, FeatureVMLxForwarding,
+                                    FeatureT2XtPk]>;
  def ProcA9      : SubtargetFeature<"a9", "ARMProcFamily", "CortexA9",
                                     "Cortex-A9 ARM processors",
-                                   [FeatureHasSlowFPVMLx, FeatureT2XtPk,
-                                    FeatureFP16]>;
+                                   [FeatureHasSlowFPVMLx, FeatureVMLxForwarding,
+                                    FeatureT2XtPk, FeatureFP16]>;
  
  class ProcNoItin<string Name, list<SubtargetFeature> Features>
   : Processor<Name, GenericItineraries, Features>;
diff --git a/lib/Target/ARM/ARMISelLowering.cpp b/lib/Target/ARM/ARMISelLowering.cpp

index 16b110f39b0dfbbc30278db04e244270840a3132..5838181497a643586fee9b803f91802800ab28ac 100644 (file)
--- a/lib/Target/ARM/ARMISelLowering.cpp
+++ b/lib/Target/ARM/ARMISelLowering.cpp
@@ -5224,6 +5224,42 @@ static SDValue PerformSUBCombine(SDNode *N,
    return SDValue();
  }
  
+/// PerformVMULCombine - Distribute (A +/- B) * C, in either operand order,
+/// to (A * C) +/- (B * C) to use the multiplier accumulator forwarding.
+/// Returns an empty SDValue when the subtarget lacks it or nothing matches.
+///   vmul d3, d0, d2
+///   vmla d3, d1, d2
+/// is faster than
+///   vadd d3, d0, d1
+///   vmul d3, d3, d2
+static SDValue PerformVMULCombine(SDNode *N,
+                                  TargetLowering::DAGCombinerInfo &DCI,
+                                  const ARMSubtarget *Subtarget) {
+  if (!Subtarget->hasVMLxForwarding())
+    return SDValue();
+
+  SelectionDAG &DAG = DCI.DAG;
+  SDValue N0 = N->getOperand(0);
+  SDValue N1 = N->getOperand(1);
+  unsigned Opcode = N0.getOpcode();
+  if (Opcode != ISD::ADD && Opcode != ISD::SUB &&
+      Opcode != ISD::FADD && Opcode != ISD::FSUB) {
+    Opcode = N1.getOpcode(); // N0 is not an add/sub; try the other operand.
+    if (Opcode != ISD::ADD && Opcode != ISD::SUB &&
+        Opcode != ISD::FADD && Opcode != ISD::FSUB)
+      return SDValue();
+    std::swap(N0, N1); // From here on N0 is the add/sub and N1 the factor.
+  }
+
+  EVT VT = N->getValueType(0);
+  DebugLoc DL = N->getDebugLoc();
+  SDValue N00 = N0->getOperand(0);
+  SDValue N01 = N0->getOperand(1);
+  return DAG.getNode(Opcode, DL, VT,
+                     DAG.getNode(ISD::MUL, DL, VT, N00, N1),
+                     DAG.getNode(ISD::MUL, DL, VT, N01, N1));
+}
+
  static SDValue PerformMULCombine(SDNode *N,
                                   TargetLowering::DAGCombinerInfo &DCI,
                                   const ARMSubtarget *Subtarget) {
@@ -5236,6 +5272,8 @@ static SDValue PerformMULCombine(SDNode *N,
      return SDValue();
  
    EVT VT = N->getValueType(0);
+  if (VT.is64BitVector() || VT.is128BitVector())
+    return PerformVMULCombine(N, DCI, Subtarget);
    if (VT != MVT::i32)
      return SDValue();
  
diff --git a/lib/Target/ARM/ARMSubtarget.h b/lib/Target/ARM/ARMSubtarget.h

index 76c1c3fb41b129e81a5a378282a20adceef06a28..e024182c4748ba0c056bc7f23750b93f5790f34c 100644 (file)
--- a/lib/Target/ARM/ARMSubtarget.h
+++ b/lib/Target/ARM/ARMSubtarget.h
@@ -61,6 +61,10 @@ protected:
    /// whether the FP VML[AS] instructions are slow (if so, don't use them).
    bool SlowFPVMLx;
  
+  /// HasVMLxForwarding - If true, NEON has special multiplier accumulator
+  /// forwarding to allow mul + mla being issued back to back.
+  bool HasVMLxForwarding;
+
    /// SlowFPBrcc - True if floating point compare + branch is slow.
    bool SlowFPBrcc;
  
@@ -182,6 +186,7 @@ protected:
    bool hasT2ExtractPack() const { return HasT2ExtractPack; }
    bool hasDataBarrier() const { return HasDataBarrier; }
    bool useFPVMLx() const { return !SlowFPVMLx; }
+  bool hasVMLxForwarding() const { return HasVMLxForwarding; }
    bool isFPBrccSlow() const { return SlowFPBrcc; }
    bool isFPOnlySP() const { return FPOnlySP; }
    bool prefers32BitThumb() const { return Pref32BitThumb; }
diff --git a/test/CodeGen/ARM/vmul.ll b/test/CodeGen/ARM/vmul.ll

index 80ba9be3bd2d3dd09732699701e7a7098397b969..1fd6581ae0816b1bf5a0e426fd8ffa645fdc11f6 100644 (file)
--- a/test/CodeGen/ARM/vmul.ll
+++ b/test/CodeGen/ARM/vmul.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=arm -mattr=+neon | FileCheck %s
+; RUN: llc < %s -march=arm -mcpu=cortex-a8 | FileCheck %s
  
  define <8 x i8> @vmuli8(<8 x i8>* %A, <8 x i8>* %B) nounwind {
  ;CHECK: vmuli8:
@@ -466,3 +466,29 @@ entry:
  declare <16 x i8> @llvm.arm.neon.vld1.v16i8(i8*, i32) nounwind readonly
  
  declare void @llvm.arm.neon.vst1.v8i16(i8*, <8 x i16>, i32) nounwind
+
+; Take advantage of the Cortex-A8 multiplier accumulator forwarding.
+
+%struct.uint8x8_t = type { <8 x i8> }
+
+define void @distribue2(%struct.uint8x8_t* nocapture %dst, i8* %src, i32 %mul) nounwind {
+entry:
+; CHECK: distribue2
+; CHECK-NOT: vadd.i8
+; CHECK: vmul.i8
+; CHECK: vmla.i8
+  %0 = trunc i32 %mul to i8
+  %1 = insertelement <8 x i8> undef, i8 %0, i32 0
+  %2 = shufflevector <8 x i8> %1, <8 x i8> undef, <8 x i32> zeroinitializer
+  %3 = tail call <16 x i8> @llvm.arm.neon.vld1.v16i8(i8* %src, i32 1)
+  %4 = bitcast <16 x i8> %3 to <2 x double>
+  %5 = extractelement <2 x double> %4, i32 1
+  %6 = bitcast double %5 to <8 x i8>
+  %7 = extractelement <2 x double> %4, i32 0
+  %8 = bitcast double %7 to <8 x i8>
+  %9 = add <8 x i8> %6, %8
+  %10 = mul <8 x i8> %9, %2
+  %11 = getelementptr inbounds %struct.uint8x8_t* %dst, i32 0, i32 0
+  store <8 x i8> %10, <8 x i8>* %11, align 8
+  ret void
+}
author	Evan Cheng <evan.cheng@apple.com>
	Thu, 31 Mar 2011 19:38:48 +0000 (19:38 +0000)
committer	Evan Cheng <evan.cheng@apple.com>
	Thu, 31 Mar 2011 19:38:48 +0000 (19:38 +0000)
lib/Target/ARM/ARM.td		patch \| blob \| history
lib/Target/ARM/ARMISelLowering.cpp		patch \| blob \| history
lib/Target/ARM/ARMSubtarget.h		patch \| blob \| history
test/CodeGen/ARM/vmul.ll		patch \| blob \| history