[RegisterCoalescer] Add new subtarget hook allowing targets to opt-out of coalescing.

author Chris Bieneman <beanz@apple.com>

Tue, 15 Jul 2014 17:18:41 +0000 (17:18 +0000)

committer Chris Bieneman <beanz@apple.com>

Tue, 15 Jul 2014 17:18:41 +0000 (17:18 +0000)
author Chris Bieneman <beanz@apple.com>
Tue, 15 Jul 2014 17:18:41 +0000 (17:18 +0000)
committer Chris Bieneman <beanz@apple.com>
Tue, 15 Jul 2014 17:18:41 +0000 (17:18 +0000)
diff --git a/include/llvm/Target/TargetSubtargetInfo.h b/include/llvm/Target/TargetSubtargetInfo.h

index bbb83efc780bc25049f82334af2051d9ec220df8..e2aea45f4c1d8d2a7c050945138960f398543250 100644 (file)
--- a/include/llvm/Target/TargetSubtargetInfo.h
+++ b/include/llvm/Target/TargetSubtargetInfo.h
@@ -115,6 +115,16 @@ public:
  
    /// \brief Reset the features for the subtarget.
    virtual void resetSubtargetFeatures(const MachineFunction *MF) { }
+
+  /// \brief Return true to let SrcRC and DstRC be morphed into NewRC.
+  virtual bool shouldCoalesce(MachineInstr *MI,
+                              const TargetRegisterClass *SrcRC,
+                              unsigned SubReg,
+                              const TargetRegisterClass *DstRC,
+                              unsigned DstSubReg,
+                              const TargetRegisterClass *NewRC) const
+  { return true; }  // Default implementation allows every coalesce.
+
  };
  
  } // End llvm namespace
diff --git a/lib/CodeGen/RegisterCoalescer.cpp b/lib/CodeGen/RegisterCoalescer.cpp

index 5aaeb874d68cbf8df74113d6488a86ab8ce6e684..0bda4c7998717a419428987128081d6fd4ea88cf 100644 (file)
--- a/lib/CodeGen/RegisterCoalescer.cpp
+++ b/lib/CodeGen/RegisterCoalescer.cpp
@@ -1037,6 +1037,23 @@ bool RegisterCoalescer::joinCopy(MachineInstr *CopyMI, bool &Again) {
      return false;
    }
  
+  if (CP.getNewRC()) {
+    const TargetSubtargetInfo &ST = TM->getSubtarget<TargetSubtargetInfo>();
+    auto SrcRC = MRI->getRegClass(CP.getSrcReg());
+    auto DstRC = MRI->getRegClass(CP.getDstReg());
+    unsigned SrcIdx = CP.getSrcIdx();
+    unsigned DstIdx = CP.getDstIdx();
+    if (CP.isFlipped()) {
+      std::swap(SrcIdx, DstIdx);
+      std::swap(SrcRC, DstRC);
+    }
+    if (!ST.shouldCoalesce(CopyMI, SrcRC, SrcIdx, DstRC, DstIdx,
+                            CP.getNewRC())) {
+      DEBUG(dbgs() << "\tSubtarget bailed on coalescing.\n");
+      return false;
+    }
+  }
+
    // Dead code elimination. This really should be handled by MachineDCE, but
    // sometimes dead copies slip through, and we can't generate invalid live
    // ranges.
diff --git a/lib/Target/ARM/ARMMachineFunctionInfo.h b/lib/Target/ARM/ARMMachineFunctionInfo.h

index 44a9e3495b90169304c5b603bb967d8997b76bd4..d3fabc3ebb0401e4e0648eed19a3fff9fdc8bee6 100644 (file)
--- a/lib/Target/ARM/ARMMachineFunctionInfo.h
+++ b/lib/Target/ARM/ARMMachineFunctionInfo.h
@@ -19,6 +19,7 @@
  #include "llvm/CodeGen/MachineFunction.h"
  #include "llvm/Target/TargetMachine.h"
  #include "llvm/Target/TargetRegisterInfo.h"
+#include "llvm/ADT/DenseMap.h"
  
  namespace llvm {
  
@@ -118,6 +119,10 @@ class ARMFunctionInfo : public MachineFunctionInfo {
    /// being passed on the stack
    unsigned ArgumentStackSize;
  
+  /// CoalescedWeights - mapping of basic blocks to the rolling counter of
+  /// coalesced weights.
+  DenseMap<const MachineBasicBlock*, unsigned> CoalescedWeights;
+
  public:
    ARMFunctionInfo() :
      isThumb(false),
@@ -221,6 +226,15 @@ public:
      else
        return -1U;
    }
+
+  DenseMap<const MachineBasicBlock*, unsigned>::iterator getCoalescedWeight(
+                                                  MachineBasicBlock* MBB) {
+    auto It = CoalescedWeights.find(MBB);  // Existing counter for MBB, if any.
+    if (It == CoalescedWeights.end()) {  // First query: start MBB at zero.
+      It = CoalescedWeights.insert(std::make_pair(MBB, 0)).first;
+    }
+    return It;  // Iterator, so the caller can update the counter in place.
+  }
  };
  } // End llvm namespace
  
diff --git a/lib/Target/ARM/ARMSubtarget.cpp b/lib/Target/ARM/ARMSubtarget.cpp

index f21413b33ef3190549fbc96635e5f943a033aff8..0c6ff529653f2161bc29ad9b5f5d280c0398a36b 100644 (file)
--- a/lib/Target/ARM/ARMSubtarget.cpp
+++ b/lib/Target/ARM/ARMSubtarget.cpp
@@ -18,6 +18,7 @@
  #include "ARMJITInfo.h"
  #include "ARMSelectionDAGInfo.h"
  #include "ARMSubtarget.h"
+#include "ARMMachineFunctionInfo.h"
  #include "Thumb1FrameLowering.h"
  #include "Thumb1InstrInfo.h"
  #include "Thumb2InstrInfo.h"
@@ -27,6 +28,8 @@
  #include "llvm/Support/CommandLine.h"
  #include "llvm/Target/TargetInstrInfo.h"
  #include "llvm/Target/TargetOptions.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
  
  using namespace llvm;
  
@@ -449,3 +452,51 @@ bool ARMSubtarget::useMovt(const MachineFunction &MF) const {
                       !MF.getFunction()->getAttributes().hasAttribute(
                           AttributeSet::FunctionIndex, Attribute::MinSize));
  }
+
+bool ARMSubtarget::shouldCoalesce(MachineInstr *MI,
+                                  const TargetRegisterClass *SrcRC,
+                                  unsigned SubReg,
+                                  const TargetRegisterClass *DstRC,
+                                  unsigned DstSubReg,
+                                  const TargetRegisterClass *NewRC) const {
+  auto MBB = MI->getParent();  // Block holding MI; the budget is per block.
+  auto MF = MBB->getParent();
+  const MachineRegisterInfo &MRI = MF->getRegInfo();
+  // If not copying into a sub-register, coalescing should be ok because we
+  // shouldn't need to split the reg.
+  if (!DstSubReg)
+    return true;
+  // Small registers don't frequently cause a problem, so we can coalesce them.
+  if (NewRC->getSize() < 32 && DstRC->getSize() < 32 && SrcRC->getSize() < 32)
+    return true;
+
+  auto NewRCWeight =
+              MRI.getTargetRegisterInfo()->getRegClassWeight(NewRC);
+  auto SrcRCWeight =
+              MRI.getTargetRegisterInfo()->getRegClassWeight(SrcRC);
+  auto DstRCWeight =
+              MRI.getTargetRegisterInfo()->getRegClassWeight(DstRC);
+  // If either the source or the destination register class is more expensive
+  // (heavier) than the new class, the coalescing is probably profitable.
+  if (SrcRCWeight.RegWeight > NewRCWeight.RegWeight)
+    return true;
+  if (DstRCWeight.RegWeight > NewRCWeight.RegWeight)
+    return true;
+
+  // If the register allocator isn't constrained we could always allow
+  // coalescing; unfortunately we don't know yet whether it will be constrained.
+  // The goal of this heuristic is to restrict how much expensive register
+  // weight we allow to be coalesced in a given basic block.
+  auto AFI = MF->getInfo<ARMFunctionInfo>();
+  auto It = AFI->getCoalescedWeight(MBB);
+
+  DEBUG(dbgs() << "\tARM::shouldCoalesce - Coalesced Weight: " << It->second << "\n");
+  DEBUG(dbgs() << "\tARM::shouldCoalesce - Reg Weight: " << NewRCWeight.RegWeight << "\n");
+  unsigned SizeMultiplier = MBB->size()/100;  // Scale budget with block size,
+  SizeMultiplier = SizeMultiplier ? SizeMultiplier : 1;  // but never below 1x.
+  if (It->second < NewRCWeight.WeightLimit * SizeMultiplier) {  // Under budget:
+    It->second += NewRCWeight.RegWeight;  // charge this coalesce to the block.
+    return true;
+  }
+  return false;  // Budget for this block is used up: veto the coalesce.
+}
diff --git a/lib/Target/ARM/ARMSubtarget.h b/lib/Target/ARM/ARMSubtarget.h

index 44d2159cb548c54bf6be436ccde773e127db9fc8..626bb0e7860946af3602526d2861a766537ca4d3 100644 (file)
--- a/lib/Target/ARM/ARMSubtarget.h
+++ b/lib/Target/ARM/ARMSubtarget.h
@@ -451,6 +451,14 @@ public:
    /// GVIsIndirectSymbol - true if the GV will be accessed via an indirect
    /// symbol.
    bool GVIsIndirectSymbol(const GlobalValue *GV, Reloc::Model RelocM) const;
+
+  /// \brief Return true to let SrcRC and DstRC be morphed into NewRC.
+  bool shouldCoalesce(MachineInstr *MI,
+                      const TargetRegisterClass *SrcRC,
+                      unsigned SubReg,
+                      const TargetRegisterClass *DstRC,
+                      unsigned DstSubReg,
+                      const TargetRegisterClass *NewRC) const override;
  };
  } // End llvm namespace
  
diff --git a/test/CodeGen/ARM/out-of-registers.ll b/test/CodeGen/ARM/out-of-registers.ll

new file mode 100644 (file)

index 0000000..790e416
--- /dev/null
+++ b/test/CodeGen/ARM/out-of-registers.ll
@@ -0,0 +1,42 @@
+; RUN: llc -O3 %s -o - | FileCheck %s
+; ModuleID = 'fo.c'
+target datalayout = "e-p:32:32:32-i1:8:32-i8:8:32-i16:16:32-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:64:128-a0:0:32-n8:16:32-S64"
+target triple = "thumbv7-none-linux-gnueabi"
+
+; CHECK: vpush
+; CHECK: vpop
+
+define void @foo(float* nocapture %A) #0 {
+  %1= bitcast float* %A to i8*
+  %2 = tail call { <4 x float>, <4 x float>, <4 x float>, <4 x float> } @llvm.arm.neon.vld4.v4f32(i8* %1, i32 4)
+  %3 = extractvalue { <4 x float>, <4 x float>, <4 x float>, <4 x float> } %2, 0
+  %divp_vec = fdiv <4 x float> <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>, %3
+  %4 = extractvalue { <4 x float>, <4 x float>, <4 x float>, <4 x float> } %2, 1
+  %div3p_vec = fdiv <4 x float> <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>, %4
+  %5 = extractvalue { <4 x float>, <4 x float>, <4 x float>, <4 x float> } %2, 2
+  %div8p_vec = fdiv <4 x float> <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>, %5
+  %6 = extractvalue { <4 x float>, <4 x float>, <4 x float>, <4 x float> } %2, 3
+  %div13p_vec = fdiv <4 x float> <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>, %6
+  tail call void @llvm.arm.neon.vst4.v4f32(i8* %1, <4 x float> %divp_vec, <4 x float> %div3p_vec, <4 x float> %div8p_vec, <4 x float> %div13p_vec, i32 4)
+ ret void
+}
+
+; Function Attrs: nounwind
+declare i32 @llvm.annotation.i32(i32, i8*, i8*, i32) #1
+
+; Function Attrs: nounwind readonly
+
+; Function Attrs: nounwind
+declare void @llvm.arm.neon.vst4.v4f32(i8*, <4 x float>, <4 x float>, <4 x float>, <4 x float>, i32) #1
+declare { <4 x float>, <4 x float>, <4 x float>, <4 x float> } @llvm.arm.neon.vld4.v4f32(i8*, i32) #2
+
+; Function Attrs: nounwind
+
+attributes #0 = { nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="true" "no-nans-fp-math"="true" "stack-protector-buffer-size"="8" "unsafe-fp-math"="true" "use-soft-float"="false" }
+attributes #1 = { nounwind }
+attributes #2 = { nounwind readonly }
+
+!llvm.ident = !{!0}
+
+!0 = metadata !{metadata !"Snapdragon LLVM ARM Compiler 3.4"}
+!1 = metadata !{metadata !1}
diff --git a/test/CodeGen/ARM/vector-spilling.ll b/test/CodeGen/ARM/vector-spilling.ll

new file mode 100644 (file)

index 0000000..746c6df
--- /dev/null
+++ b/test/CodeGen/ARM/vector-spilling.ll
@@ -0,0 +1,34 @@
+; RUN: llc < %s -march=arm -mtriple=armv7-linux-gnueabihf -arm-atomic-cfg-tidy=0 -float-abi=hard -mcpu=cortex-a9 -O3 | FileCheck %s
+
+target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-n32-S64"
+
+; This test will generate spills/fills using vldmia instructions that access 24 bytes of memory.
+; Check that we don't crash when we generate these instructions on Cortex-A9.
+
+; CHECK: test:
+; CHECK: vstmia
+; CHECK: vldmia
+define void @test(<8 x i64>* %src) #0 {
+entry:
+  %0 = getelementptr inbounds <8 x i64>* %src, i32 0
+  %1 = load <8 x i64>* %0, align 8
+
+  %2 = getelementptr inbounds <8 x i64>* %src, i32 1
+  %3 = load <8 x i64>* %2, align 8
+
+  %4 = getelementptr inbounds <8 x i64>* %src, i32 2
+  %5 = load <8 x i64>* %4, align 8
+
+  %6 = getelementptr inbounds <8 x i64>* %src, i32 3
+  %7 = load <8 x i64>* %6, align 8
+
+  %8 = shufflevector <8 x i64> %1, <8 x i64> %3, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11>
+  %9 = shufflevector <8 x i64> %1, <8 x i64> %3, <8 x i32> <i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
+
+  tail call void(<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>)* @foo(<8 x i64> %1, <8 x i64> %3, <8 x i64> %5, <8 x i64> %7, <8 x i64> %8, <8 x i64> %9)
+  ret void
+}
+
+declare void @foo(<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>)
+
+attributes #0 = { noredzone "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-frame-pointer-elim-non-leaf"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="false" }
diff --git a/test/CodeGen/ARM/vldm-sched-a9.ll b/test/CodeGen/ARM/vldm-sched-a9.ll

index f2e5eb9b7e035086f36ea12a2f670527b4dcbef3..64f3770e3d21272fba75df6c19419198f976b356 100644 (file)
--- a/test/CodeGen/ARM/vldm-sched-a9.ll
+++ b/test/CodeGen/ARM/vldm-sched-a9.ll
@@ -2,12 +2,12 @@
  
  target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-n32-S64"
  
-; This test will generate spills/fills using vldmia instructions that access 64 bytes of memory.
-; Check that we don't crash when we generate these instructions on Cortex-A9.
+; This test used to check vector spilling using vstmia/vldmia instructions, but
+; the changes for PR18825 prevent that spilling.
  
  ; CHECK: test:
-; CHECK: vstmia
-; CHECK: vldmia
+; CHECK-NOT: vstmia
+; CHECK-NOT: vldmia
  define void @test(i64* %src) #0 {
  entry:
    %arrayidx39 = getelementptr inbounds i64* %src, i32 13
author	Chris Bieneman <beanz@apple.com>
	Tue, 15 Jul 2014 17:18:41 +0000 (17:18 +0000)
committer	Chris Bieneman <beanz@apple.com>
	Tue, 15 Jul 2014 17:18:41 +0000 (17:18 +0000)
include/llvm/Target/TargetSubtargetInfo.h		patch \| blob \| history
lib/CodeGen/RegisterCoalescer.cpp		patch \| blob \| history
lib/Target/ARM/ARMMachineFunctionInfo.h		patch \| blob \| history
lib/Target/ARM/ARMSubtarget.cpp		patch \| blob \| history
lib/Target/ARM/ARMSubtarget.h		patch \| blob \| history
test/CodeGen/ARM/out-of-registers.ll	[new file with mode: 0644]	patch \| blob
test/CodeGen/ARM/vector-spilling.ll	[new file with mode: 0644]	patch \| blob
test/CodeGen/ARM/vldm-sched-a9.ll		patch \| blob \| history