From: Chris Bieneman
Date: Tue, 15 Jul 2014 17:18:41 +0000 (+0000)
Subject: [RegisterCoalescer] Add new subtarget hook allowing targets to opt-out of coalescing.
X-Git-Url: http://plrg.eecs.uci.edu/git/?p=oota-llvm.git;a=commitdiff_plain;h=01d8611240d26dbf124d216983d2396234f62b95

[RegisterCoalescer] Add new subtarget hook allowing targets to opt-out of coalescing.

The coalescer is very aggressive at propagating constraints on the
register classes, and the register allocator doesn't know how to split
sub-registers later to recover. This patch provides an escape valve for
targets that encounter this problem to limit coalescing.

This patch also implements the hook for ARM to lower register pressure
when using lots of large register classes. This works around PR18825.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@213078 91177308-0d34-0410-b5e6-96231b3b80d8
---

diff --git a/include/llvm/Target/TargetSubtargetInfo.h b/include/llvm/Target/TargetSubtargetInfo.h
index bbb83efc780..e2aea45f4c1 100644
--- a/include/llvm/Target/TargetSubtargetInfo.h
+++ b/include/llvm/Target/TargetSubtargetInfo.h
@@ -115,6 +115,16 @@ public:
   /// \brief Reset the features for the subtarget.
   virtual void resetSubtargetFeatures(const MachineFunction *MF) { }
+
+  /// \brief SrcRC and DstRC will be morphed into NewRC if this returns true.
+  virtual bool shouldCoalesce(MachineInstr *MI,
+                              const TargetRegisterClass *SrcRC,
+                              unsigned SubReg,
+                              const TargetRegisterClass *DstRC,
+                              unsigned DstSubReg,
+                              const TargetRegisterClass *NewRC) const
+    { return true; }
+
 };
 
 } // End llvm namespace
diff --git a/lib/CodeGen/RegisterCoalescer.cpp b/lib/CodeGen/RegisterCoalescer.cpp
index 5aaeb874d68..0bda4c79987 100644
--- a/lib/CodeGen/RegisterCoalescer.cpp
+++ b/lib/CodeGen/RegisterCoalescer.cpp
@@ -1037,6 +1037,23 @@ bool RegisterCoalescer::joinCopy(MachineInstr *CopyMI, bool &Again) {
     return false;
   }
 
+  if (CP.getNewRC()) {
+    const TargetSubtargetInfo &ST = TM->getSubtarget<TargetSubtargetInfo>();
+    auto SrcRC = MRI->getRegClass(CP.getSrcReg());
+    auto DstRC = MRI->getRegClass(CP.getDstReg());
+    unsigned SrcIdx = CP.getSrcIdx();
+    unsigned DstIdx = CP.getDstIdx();
+    if (CP.isFlipped()) {
+      std::swap(SrcIdx, DstIdx);
+      std::swap(SrcRC, DstRC);
+    }
+    if (!ST.shouldCoalesce(CopyMI, SrcRC, SrcIdx, DstRC, DstIdx,
+                           CP.getNewRC())) {
+      DEBUG(dbgs() << "\tSubtarget bailed on coalescing.\n");
+      return false;
+    }
+  }
+
   // Dead code elimination. This really should be handled by MachineDCE, but
   // sometimes dead copies slip through, and we can't generate invalid live
   // ranges.
diff --git a/lib/Target/ARM/ARMMachineFunctionInfo.h b/lib/Target/ARM/ARMMachineFunctionInfo.h
index 44a9e3495b9..d3fabc3ebb0 100644
--- a/lib/Target/ARM/ARMMachineFunctionInfo.h
+++ b/lib/Target/ARM/ARMMachineFunctionInfo.h
@@ -19,6 +19,7 @@
 #include "llvm/CodeGen/MachineFunction.h"
 #include "llvm/Target/TargetMachine.h"
 #include "llvm/Target/TargetRegisterInfo.h"
+#include "llvm/ADT/DenseMap.h"
 
 namespace llvm {
 
@@ -118,6 +119,10 @@ class ARMFunctionInfo : public MachineFunctionInfo {
   /// being passed on the stack
   unsigned ArgumentStackSize;
 
+  /// CoalescedWeights - mapping of basic blocks to the rolling counter of
+  /// coalesced weights.
+  DenseMap<const MachineBasicBlock*, unsigned> CoalescedWeights;
+
 public:
   ARMFunctionInfo() :
     isThumb(false),
@@ -221,6 +226,15 @@ public:
     else
       return -1U;
   }
+
+  DenseMap<const MachineBasicBlock*, unsigned>::iterator getCoalescedWeight(
+      MachineBasicBlock* MBB) {
+    auto It = CoalescedWeights.find(MBB);
+    if (It == CoalescedWeights.end()) {
+      It = CoalescedWeights.insert(std::make_pair(MBB, 0)).first;
+    }
+    return It;
+  }
 };
 
 } // End llvm namespace
diff --git a/lib/Target/ARM/ARMSubtarget.cpp b/lib/Target/ARM/ARMSubtarget.cpp
index f21413b33ef..0c6ff529653 100644
--- a/lib/Target/ARM/ARMSubtarget.cpp
+++ b/lib/Target/ARM/ARMSubtarget.cpp
@@ -18,6 +18,7 @@
 #include "ARMJITInfo.h"
 #include "ARMSelectionDAGInfo.h"
 #include "ARMSubtarget.h"
+#include "ARMMachineFunctionInfo.h"
 #include "Thumb1FrameLowering.h"
 #include "Thumb1InstrInfo.h"
 #include "Thumb2InstrInfo.h"
@@ -27,6 +28,8 @@
 #include "llvm/Support/CommandLine.h"
 #include "llvm/Target/TargetInstrInfo.h"
 #include "llvm/Target/TargetOptions.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
 
 using namespace llvm;
 
@@ -449,3 +452,51 @@ bool ARMSubtarget::useMovt(const MachineFunction &MF) const {
          !MF.getFunction()->getAttributes().hasAttribute(
              AttributeSet::FunctionIndex, Attribute::MinSize));
 }
+
+bool ARMSubtarget::shouldCoalesce(MachineInstr *MI,
+                                  const TargetRegisterClass *SrcRC,
+                                  unsigned SubReg,
+                                  const TargetRegisterClass *DstRC,
+                                  unsigned DstSubReg,
+                                  const TargetRegisterClass *NewRC) const {
+  auto MBB = MI->getParent();
+  auto MF = MBB->getParent();
+  const MachineRegisterInfo &MRI = MF->getRegInfo();
+  // If we are not copying into a sub-register, this should be OK because we
+  // shouldn't need to split the register.
+  if (!DstSubReg)
+    return true;
+  // Small registers don't frequently cause a problem, so we can coalesce them.
+  if (NewRC->getSize() < 32 && DstRC->getSize() < 32 && SrcRC->getSize() < 32)
+    return true;
+
+  auto NewRCWeight =
+      MRI.getTargetRegisterInfo()->getRegClassWeight(NewRC);
+  auto SrcRCWeight =
+      MRI.getTargetRegisterInfo()->getRegClassWeight(SrcRC);
+  auto DstRCWeight =
+      MRI.getTargetRegisterInfo()->getRegClassWeight(DstRC);
+  // If the source or destination register class is more expensive than the
+  // new register class, coalescing is probably profitable.
+  if (SrcRCWeight.RegWeight > NewRCWeight.RegWeight)
+    return true;
+  if (DstRCWeight.RegWeight > NewRCWeight.RegWeight)
+    return true;
+
+  // If the register allocator isn't constrained, we can always allow
+  // coalescing; unfortunately we don't know yet whether we will be
+  // constrained. The goal of this heuristic is to restrict how many
+  // expensive registers we allow to coalesce in a given basic block.
+  auto AFI = MF->getInfo<ARMFunctionInfo>();
+  auto It = AFI->getCoalescedWeight(MBB);
+
+  DEBUG(dbgs() << "\tARM::shouldCoalesce - Coalesced Weight: " << It->second << "\n");
+  DEBUG(dbgs() << "\tARM::shouldCoalesce - Reg Weight: " << NewRCWeight.RegWeight << "\n");
+  unsigned SizeMultiplier = MBB->size()/100;
+  SizeMultiplier = SizeMultiplier ? SizeMultiplier : 1;
+  if (It->second < NewRCWeight.WeightLimit * SizeMultiplier) {
+    It->second += NewRCWeight.RegWeight;
+    return true;
+  }
+  return false;
+}
diff --git a/lib/Target/ARM/ARMSubtarget.h b/lib/Target/ARM/ARMSubtarget.h
index 44d2159cb54..626bb0e7860 100644
--- a/lib/Target/ARM/ARMSubtarget.h
+++ b/lib/Target/ARM/ARMSubtarget.h
@@ -451,6 +451,14 @@ public:
   /// GVIsIndirectSymbol - true if the GV will be accessed via an indirect
   /// symbol.
   bool GVIsIndirectSymbol(const GlobalValue *GV, Reloc::Model RelocM) const;
+
+  /// \brief SrcRC and DstRC will be morphed into NewRC if this returns true.
+  bool shouldCoalesce(MachineInstr *MI,
+                      const TargetRegisterClass *SrcRC,
+                      unsigned SubReg,
+                      const TargetRegisterClass *DstRC,
+                      unsigned DstSubReg,
+                      const TargetRegisterClass *NewRC) const override;
 };
 
 } // End llvm namespace
diff --git a/test/CodeGen/ARM/out-of-registers.ll b/test/CodeGen/ARM/out-of-registers.ll
new file mode 100644
index 00000000000..790e4165d4c
--- /dev/null
+++ b/test/CodeGen/ARM/out-of-registers.ll
@@ -0,0 +1,42 @@
+; RUN: llc -O3 %s -o - | FileCheck %s
+; ModuleID = 'fo.c'
+target datalayout = "e-p:32:32:32-i1:8:32-i8:8:32-i16:16:32-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:64:128-a0:0:32-n8:16:32-S64"
+target triple = "thumbv7-none-linux-gnueabi"
+
+; CHECK: vpush
+; CHECK: vpop
+
+define void @foo(float* nocapture %A) #0 {
+  %1 = bitcast float* %A to i8*
+  %2 = tail call { <4 x float>, <4 x float>, <4 x float>, <4 x float> } @llvm.arm.neon.vld4.v4f32(i8* %1, i32 4)
+  %3 = extractvalue { <4 x float>, <4 x float>, <4 x float>, <4 x float> } %2, 0
+  %divp_vec = fdiv <4 x float> <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>, %3
+  %4 = extractvalue { <4 x float>, <4 x float>, <4 x float>, <4 x float> } %2, 1
+  %div3p_vec = fdiv <4 x float> <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>, %4
+  %5 = extractvalue { <4 x float>, <4 x float>, <4 x float>, <4 x float> } %2, 2
+  %div8p_vec = fdiv <4 x float> <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>, %5
+  %6 = extractvalue { <4 x float>, <4 x float>, <4 x float>, <4 x float> } %2, 3
+  %div13p_vec = fdiv <4 x float> <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>, %6
+  tail call void @llvm.arm.neon.vst4.v4f32(i8* %1, <4 x float> %divp_vec, <4 x float> %div3p_vec, <4 x float> %div8p_vec, <4 x float> %div13p_vec, i32 4)
+  ret void
+}
+
+; Function Attrs: nounwind
+declare i32 @llvm.annotation.i32(i32, i8*, i8*, i32) #1
+
+; Function Attrs: nounwind
+declare void @llvm.arm.neon.vst4.v4f32(i8*, <4 x float>, <4 x float>, <4 x float>, <4 x float>, i32) #1
+
+; Function Attrs: nounwind readonly
+declare { <4 x float>, <4 x float>, <4 x float>, <4 x float> } @llvm.arm.neon.vld4.v4f32(i8*, i32) #2
+
+attributes #0 = { nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="true" "no-nans-fp-math"="true" "stack-protector-buffer-size"="8" "unsafe-fp-math"="true" "use-soft-float"="false" }
+attributes #1 = { nounwind }
+attributes #2 = { nounwind readonly }
+
+!llvm.ident = !{!0}
+
+!0 = metadata !{metadata !"Snapdragon LLVM ARM Compiler 3.4"}
+!1 = metadata !{metadata !1}
diff --git a/test/CodeGen/ARM/vector-spilling.ll b/test/CodeGen/ARM/vector-spilling.ll
new file mode 100644
index 00000000000..746c6dfcd11
--- /dev/null
+++ b/test/CodeGen/ARM/vector-spilling.ll
@@ -0,0 +1,34 @@
+; RUN: llc < %s -march=arm -mtriple=armv7-linux-gnueabihf -arm-atomic-cfg-tidy=0 -float-abi=hard -mcpu=cortex-a9 -O3 | FileCheck %s
+
+target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-n32-S64"
+
+; This test will generate spills/fills using vldmia instructions that access 24 bytes of memory.
+; Check that we don't crash when we generate these instructions on Cortex-A9.
+
+; CHECK: test:
+; CHECK: vstmia
+; CHECK: vldmia
+define void @test(<8 x i64>* %src) #0 {
+entry:
+  %0 = getelementptr inbounds <8 x i64>* %src, i32 0
+  %1 = load <8 x i64>* %0, align 8
+
+  %2 = getelementptr inbounds <8 x i64>* %src, i32 1
+  %3 = load <8 x i64>* %2, align 8
+
+  %4 = getelementptr inbounds <8 x i64>* %src, i32 2
+  %5 = load <8 x i64>* %4, align 8
+
+  %6 = getelementptr inbounds <8 x i64>* %src, i32 3
+  %7 = load <8 x i64>* %6, align 8
+
+  %8 = shufflevector <8 x i64> %1, <8 x i64> %3, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  %9 = shufflevector <8 x i64> %1, <8 x i64> %3, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+
+  tail call void(<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>)* @foo(<8 x i64> %1, <8 x i64> %3, <8 x i64> %5, <8 x i64> %7, <8 x i64> %8, <8 x i64> %9)
+  ret void
+}
+
+declare void @foo(<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>)
+
+attributes #0 = { noredzone "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-frame-pointer-elim-non-leaf"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="false" }
diff --git a/test/CodeGen/ARM/vldm-sched-a9.ll b/test/CodeGen/ARM/vldm-sched-a9.ll
index f2e5eb9b7e0..64f3770e3d2 100644
--- a/test/CodeGen/ARM/vldm-sched-a9.ll
+++ b/test/CodeGen/ARM/vldm-sched-a9.ll
@@ -2,12 +2,12 @@
 
 target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-n32-S64"
 
-; This test will generate spills/fills using vldmia instructions that access 64 bytes of memory.
-; Check that we don't crash when we generate these instructions on Cortex-A9.
+; This test used to check vector spilling with vstmia/vldmia instructions, but
+; the changes for PR18825 prevent that spilling.
 
 ; CHECK: test:
-; CHECK: vstmia
-; CHECK: vldmia
+; CHECK-NOT: vstmia
+; CHECK-NOT: vldmia
 define void @test(i64* %src) #0 {
 entry:
   %arrayidx39 = getelementptr inbounds i64* %src, i32 13
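
For targets considering the new hook, the sketch below shows the shape of an
out-of-tree override, mirroring the ARM implementation above. It is a minimal
example under stated assumptions: "MyTargetSubtarget" and the 32-byte cutoff
are hypothetical and not part of this patch; only the shouldCoalesce signature
comes from the TargetSubtargetInfo.h change.

    // Hypothetical target override (illustrative only, not from this patch).
    bool MyTargetSubtarget::shouldCoalesce(
        MachineInstr *MI, const TargetRegisterClass *SrcRC, unsigned SubReg,
        const TargetRegisterClass *DstRC, unsigned DstSubReg,
        const TargetRegisterClass *NewRC) const {
      // Copies that don't write a sub-register leave the destination class
      // alone, so there is nothing for the allocator to split later.
      if (!DstSubReg)
        return true;
      // Refuse to morph registers into classes wider than an assumed
      // 32-byte limit that this hypothetical allocator cannot split.
      return NewRC->getSize() <= 32;
    }

As a worked example of the ARM heuristic: in a 250-instruction basic block,
SizeMultiplier is 250/100 = 2, so expensive copies keep coalescing until the
block's rolling counter reaches 2 * WeightLimit for the new register class;
each copy that is allowed adds that class's RegWeight to the counter.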