ARM: prefer allocating VFP regs at stride 4 on Darwin.

author Tim Northover <tnorthover@apple.com>

Mon, 3 Aug 2015 17:20:10 +0000 (17:20 +0000)

committer Tim Northover <tnorthover@apple.com>

Mon, 3 Aug 2015 17:20:10 +0000 (17:20 +0000)
author Tim Northover <tnorthover@apple.com>
Mon, 3 Aug 2015 17:20:10 +0000 (17:20 +0000)
committer Tim Northover <tnorthover@apple.com>
Mon, 3 Aug 2015 17:20:10 +0000 (17:20 +0000)
diff --git a/lib/Target/ARM/ARMRegisterInfo.td b/lib/Target/ARM/ARMRegisterInfo.td

index 45cc9ea91f3762e2560ad2961ea8193854b898c6..7c0319a01ee71d58aadbc82ff618a78f26751e23 100644 (file)
--- a/lib/Target/ARM/ARMRegisterInfo.td
+++ b/lib/Target/ARM/ARMRegisterInfo.td
@@ -266,12 +266,19 @@ def CCR : RegisterClass<"ARM", [i32], 32, (add CPSR)> {
  }
  
  // Scalar single precision floating point register class..
-// FIXME: Allocation order changed to s0, s2, s4, ... as a quick hack to
-// avoid partial-write dependencies on D registers (S registers are
-// renamed as portions of D registers).
-def SPR : RegisterClass<"ARM", [f32], 32, (add (decimate
-                                                (sequence "S%u", 0, 31), 2),
-                                               (sequence "S%u", 0, 31))>;
+// FIXME: Allocation order changed to s0, s2, ... or s0, s4, ... as a quick hack
+// to avoid partial-write dependencies on D or Q (depending on platform)
+// registers (S registers are renamed as portions of D/Q registers).
+def SPR : RegisterClass<"ARM", [f32], 32, (sequence "S%u", 0, 31)> {
+  let AltOrders = [(add (decimate SPR, 2), SPR),
+                   (add (decimate SPR, 4),
+                        (decimate SPR, 2),
+                        (decimate (rotl SPR, 1), 4),
+                        (decimate (rotl SPR, 1), 2))];
+  let AltOrderSelect = [{
+    return 1 + MF.getSubtarget<ARMSubtarget>().useStride4VFPs(MF);
+  }];
+}
  
  // Subset of SPR which can be used as a source of NEON scalars for 16-bit
  // operations
@@ -283,9 +290,13 @@ def SPR_8 : RegisterClass<"ARM", [f32], 32, (sequence "S%u", 0, 15)>;
  // is double-word alignment though.
  def DPR : RegisterClass<"ARM", [f64, v8i8, v4i16, v2i32, v1i64, v2f32], 64,
                          (sequence "D%u", 0, 31)> {
-  // Allocate non-VFP2 registers D16-D31 first.
-  let AltOrders = [(rotl DPR, 16)];
-  let AltOrderSelect = [{ return 1; }];
+  // Allocate non-VFP2 registers D16-D31 first, and additionally prefer even
+  // registers when the subtarget's useStride4VFPs(MF) returns true.
+  let AltOrders = [(rotl DPR, 16),
+                   (add (decimate (rotl DPR, 16), 2), (rotl DPR, 16))];
+  let AltOrderSelect = [{
+    return 1 + MF.getSubtarget<ARMSubtarget>().useStride4VFPs(MF);
+  }];
  }
  
  // Subset of DPR that are accessible with VFP2 (and so that also have
diff --git a/lib/Target/ARM/ARMSubtarget.cpp b/lib/Target/ARM/ARMSubtarget.cpp

index 4c6e69654d516471581f1d19a9d521bd714c6ebb..b91e9ae650c62655fc4580ba54fdec414135dbcb 100644 (file)
--- a/lib/Target/ARM/ARMSubtarget.cpp
+++ b/lib/Target/ARM/ARMSubtarget.cpp
@@ -285,6 +285,13 @@ bool ARMSubtarget::enableAtomicExpand() const {
    return hasAnyDataBarrier() && !isThumb1Only();
  }
  
+/// True when VFP registers should use the stride-based alternative allocation
+/// order (see AltOrderSelect in ARMRegisterInfo.td): only when isSwift() holds,
+/// and never for functions marked minsize.
+bool ARMSubtarget::useStride4VFPs(const MachineFunction &MF) const {
+  return isSwift() && !MF.getFunction()->hasFnAttribute(Attribute::MinSize);
+}
+
  bool ARMSubtarget::useMovt(const MachineFunction &MF) const {
    // NOTE Windows on ARM needs to use mov.w/mov.t pairs to materialise 32-bit
    // immediates as it is inherently position independent, and may be out of
diff --git a/lib/Target/ARM/ARMSubtarget.h b/lib/Target/ARM/ARMSubtarget.h

index d6d3d83c87ec47f2eda22440254a888ceecd12f7..e95096146b5b06fa000b6d1a1bb51ab87145ce54 100644 (file)
--- a/lib/Target/ARM/ARMSubtarget.h
+++ b/lib/Target/ARM/ARMSubtarget.h
@@ -413,6 +413,8 @@ public:
      return isTargetMachO() ? (ReserveR9 || !HasV6Ops) : ReserveR9;
    }
  
+  bool useStride4VFPs(const MachineFunction &MF) const;
+
    bool useMovt(const MachineFunction &MF) const;
  
    bool supportsTailCall() const { return SupportsTailCall; }
diff --git a/test/CodeGen/ARM/fold-stack-adjust.ll b/test/CodeGen/ARM/fold-stack-adjust.ll

index b62d0dfee07d88a83c6e100684e9f4ac0a03010e..991742135623ff0efe7acb2a8e030bd854304d34 100644 (file)
--- a/test/CodeGen/ARM/fold-stack-adjust.ll
+++ b/test/CodeGen/ARM/fold-stack-adjust.ll
@@ -60,8 +60,6 @@ define void @check_vfp_fold() minsize {
  ; CHECK: vpush {d6, d7, d8, d9}
  ; CHECK-NOT: sub sp,
  ; ...
-; CHECK: vldmia r[[GLOBREG]], {d8, d9}
-; ...
  ; CHECK-NOT: add sp,
  ; CHECK: vpop {d6, d7, d8, d9}
  ; CHECKL pop {r[[GLOBREG]], pc}
@@ -82,9 +80,8 @@ define void @check_vfp_fold() minsize {
  
    %var = alloca i8, i32 16
  
-  %tmp = load %bigVec, %bigVec* @var
+  call void asm "", "r,~{d8},~{d9}"(i8* %var)
    call void @bar(i8* %var)
-  store %bigVec %tmp, %bigVec* @var
  
    ret void
  }
diff --git a/test/CodeGen/ARM/vfp-reg-stride.ll b/test/CodeGen/ARM/vfp-reg-stride.ll

new file mode 100644 (file)

index 0000000..5484cc8
--- /dev/null
+++ b/test/CodeGen/ARM/vfp-reg-stride.ll
@@ -0,0 +1,33 @@
+; RUN: llc -mcpu=swift -mtriple=thumbv7s-apple-ios -o - %s | FileCheck %s --check-prefix=CHECK-STRIDE4
+; RUN: llc -mcpu=cortex-a57 -mtriple=thumbv7-linux-gnueabihf -o - %s | FileCheck %s --check-prefix=CHECK-GENERIC
+
+define void @test_reg_stride(double %a, double %b) {
+; CHECK-STRIDE4-LABEL: test_reg_stride:
+; CHECK-STRIDE4-DAG: vmov d16, r
+; CHECK-STRIDE4-DAG: vmov d18, r
+
+; CHECK-GENERIC-LABEL: test_reg_stride:
+; CHECK-GENERIC-DAG: vmov.f64 d16, {{d[01]}}
+; CHECK-GENERIC-DAG: vmov.f64 d17, {{d[01]}}
+
+  call void asm "", "~{r0},~{r1},~{d0},~{d1}"()
+  call arm_aapcs_vfpcc void @eat_doubles(double %a, double %b)
+  ret void
+}
+
+define void @test_stride_minsize(float %a, float %b) minsize {
+; CHECK-STRIDE4-LABEL: test_stride_minsize:
+; CHECK-STRIDE4: vmov d2, {{r[01]}}
+; CHECK-STRIDE4: vmov d3, {{r[01]}}
+
+; CHECK-GENERIC-LABEL: test_stride_minsize:
+; CHECK-GENERIC-DAG: vmov.f32 s4, {{s[01]}}
+; CHECK-GENERIC-DAG: vmov.f32 s6, {{s[01]}}
+  call void asm "", "~{r0},~{r1},~{s0},~{s1},~{d0},~{d1}"()
+  call arm_aapcs_vfpcc void @eat_floats(float %a, float %b)
+  ret void
+}
+
+
+declare arm_aapcs_vfpcc void @eat_doubles(double, double)
+declare arm_aapcs_vfpcc void @eat_floats(float, float)
author	Tim Northover <tnorthover@apple.com>
	Mon, 3 Aug 2015 17:20:10 +0000 (17:20 +0000)
committer	Tim Northover <tnorthover@apple.com>
	Mon, 3 Aug 2015 17:20:10 +0000 (17:20 +0000)
lib/Target/ARM/ARMRegisterInfo.td		patch \| blob \| history
lib/Target/ARM/ARMSubtarget.cpp		patch \| blob \| history
lib/Target/ARM/ARMSubtarget.h		patch \| blob \| history
test/CodeGen/ARM/fold-stack-adjust.ll		patch \| blob \| history
test/CodeGen/ARM/vfp-reg-stride.ll	[new file with mode: 0644]	patch \| blob