AArch64: treat [N x Ty] as a block during procedure calls.

author Tim Northover <tnorthover@apple.com>

Thu, 27 Nov 2014 21:02:42 +0000 (21:02 +0000)

committer Tim Northover <tnorthover@apple.com>

Thu, 27 Nov 2014 21:02:42 +0000 (21:02 +0000)
author Tim Northover <tnorthover@apple.com>
Thu, 27 Nov 2014 21:02:42 +0000 (21:02 +0000)
committer Tim Northover <tnorthover@apple.com>
Thu, 27 Nov 2014 21:02:42 +0000 (21:02 +0000)
diff --git a/include/llvm/CodeGen/CallingConvLower.h b/include/llvm/CodeGen/CallingConvLower.h

index 0b2ccc6ee6d7effe64c06dcdfbad34c25d2194de..903a395496061d49292e536172448e279373d56c 100644 (file)
--- a/include/llvm/CodeGen/CallingConvLower.h
+++ b/include/llvm/CodeGen/CallingConvLower.h
@@ -345,8 +345,13 @@ public:
    /// AllocateRegBlock - Attempt to allocate a block of RegsRequired consecutive
    /// registers. If this is not possible, return zero. Otherwise, return the first
    /// register of the block that were allocated, marking the entire block as allocated.
-  unsigned AllocateRegBlock(const uint16_t *Regs, unsigned NumRegs, unsigned RegsRequired) {
-    for (unsigned StartIdx = 0; StartIdx <= NumRegs - RegsRequired; ++StartIdx) {
+  unsigned AllocateRegBlock(ArrayRef<const uint16_t> Regs,
+                            unsigned RegsRequired) {
+    if (RegsRequired > Regs.size())
+      return 0;
+
+    for (unsigned StartIdx = 0; StartIdx <= Regs.size() - RegsRequired;
+         ++StartIdx) {
        bool BlockAvailable = true;
        // Check for already-allocated regs in this block
        for (unsigned BlockIdx = 0; BlockIdx < RegsRequired; ++BlockIdx) {
diff --git a/include/llvm/IR/DataLayout.h b/include/llvm/IR/DataLayout.h

index 4580a4f56a0774ccc70e5f11b5064ba529b99d72..a9e75955ce74cfe42b0ffdaf22bba18a1c287bb6 100644 (file)
--- a/include/llvm/IR/DataLayout.h
+++ b/include/llvm/IR/DataLayout.h
@@ -228,6 +228,8 @@ public:
      return (StackNaturalAlign != 0) && (Align > StackNaturalAlign);
    }
  
+  unsigned getStackAlignment() const { return StackNaturalAlign; }
+
    bool hasMicrosoftFastStdCallMangling() const {
      return ManglingMode == MM_WINCOFF;
    }
diff --git a/lib/Target/AArch64/AArch64CallingConvention.h b/lib/Target/AArch64/AArch64CallingConvention.h

new file mode 100644 (file)

index 0000000..b2be99e
--- /dev/null
+++ b/lib/Target/AArch64/AArch64CallingConvention.h
@@ -0,0 +1,136 @@
+//=== AArch64CallingConvention.h - Custom Calling Conv Routines -*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the custom routines for the AArch64 Calling Convention
+// that aren't done by tablegen.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_AARCH64_AARCH64CALLINGCONVENTION_H
+#define LLVM_LIB_TARGET_AARCH64_AARCH64CALLINGCONVENTION_H
+
+#include "AArch64.h"
+#include "AArch64InstrInfo.h"
+#include "AArch64Subtarget.h"
+#include "llvm/CodeGen/CallingConvLower.h"
+#include "llvm/IR/CallingConv.h"
+#include "llvm/Target/TargetInstrInfo.h"
+
+namespace {
+using namespace llvm;
+
+static const uint16_t XRegList[] = {AArch64::X0, AArch64::X1, AArch64::X2,
+                                    AArch64::X3, AArch64::X4, AArch64::X5,
+                                    AArch64::X6, AArch64::X7};
+static const uint16_t SRegList[] = {AArch64::S0, AArch64::S1, AArch64::S2,
+                                    AArch64::S3, AArch64::S4, AArch64::S5,
+                                    AArch64::S6, AArch64::S7};
+static const uint16_t DRegList[] = {AArch64::D0, AArch64::D1, AArch64::D2,
+                                    AArch64::D3, AArch64::D4, AArch64::D5,
+                                    AArch64::D6, AArch64::D7};
+static const uint16_t QRegList[] = {AArch64::Q0, AArch64::Q1, AArch64::Q2,
+                                    AArch64::Q3, AArch64::Q4, AArch64::Q5,
+                                    AArch64::Q6, AArch64::Q7};
+
+static bool finishStackBlock(SmallVectorImpl<CCValAssign> &PendingMembers,
+                             MVT LocVT, ISD::ArgFlagsTy &ArgFlags,
+                             CCState &State, unsigned SlotAlign) {
+  unsigned Size = LocVT.getSizeInBits() / 8; // Bytes per block member.
+  unsigned StackAlign = State.getMachineFunction()
+                            .getSubtarget()
+                            .getDataLayout()
+                            ->getStackAlignment();
+  unsigned Align = std::min(ArgFlags.getOrigAlign(), StackAlign);
+
+  for (auto &It : PendingMembers) {
+    It.convertToMem(State.AllocateStack(Size, std::max(Align, SlotAlign)));
+    State.addLoc(It);
+    SlotAlign = 1; // SlotAlign only constrains the first member.
+  }
+
+  // Every pending member has been given a stack location; reset the list.
+  PendingMembers.clear();
+  return true;
+}
+
+/// The Darwin variadic PCS places anonymous arguments in 8-byte stack slots,
+/// but the members of an [N x Ty] argument must stay contiguous in memory.
+static bool CC_AArch64_Custom_Stack_Block(
+      unsigned &ValNo, MVT &ValVT, MVT &LocVT, CCValAssign::LocInfo &LocInfo,
+      ISD::ArgFlagsTy &ArgFlags, CCState &State) {
+  SmallVectorImpl<CCValAssign> &PendingMembers = State.getPendingLocs();
+
+  // Queue this member; locations are only assigned once the member flagged
+  // InConsecutiveRegsLast arrives and the size of the block is known.
+  PendingMembers.push_back(
+      CCValAssign::getPending(ValNo, ValVT, LocVT, LocInfo));
+
+  if (!ArgFlags.isInConsecutiveRegsLast())
+    return true; // Not the last member yet.
+
+  return finishStackBlock(PendingMembers, LocVT, ArgFlags, State, 8);
+}
+
+/// Given an [N x Ty] block, it should be passed in a consecutive sequence of
+/// registers. If no such sequence is available, mark the rest of the registers
+/// of that type as used and place the argument on the stack.
+static bool CC_AArch64_Custom_Block(unsigned &ValNo, MVT &ValVT, MVT &LocVT,
+                                    CCValAssign::LocInfo &LocInfo,
+                                    ISD::ArgFlagsTy &ArgFlags, CCState &State) {
+  // Pick the register list matching the member type; a block is allocated
+  // from it with one register per member.
+  ArrayRef<const uint16_t> RegList;
+  if (LocVT.SimpleTy == MVT::i64)
+    RegList = XRegList;
+  else if (LocVT.SimpleTy == MVT::f32)
+    RegList = SRegList;
+  else if (LocVT.SimpleTy == MVT::f64)
+    RegList = DRegList;
+  else if (LocVT.SimpleTy == MVT::v2f64)
+    RegList = QRegList;
+  else {
+    // Not a member type handled here; nothing has been queued at this point.
+    return false;
+  }
+
+  SmallVectorImpl<CCValAssign> &PendingMembers = State.getPendingLocs();
+
+  // Queue this member; locations are only assigned once the member flagged
+  // InConsecutiveRegsLast arrives and the size of the block is known.
+  PendingMembers.push_back(
+      CCValAssign::getPending(ValNo, ValVT, LocVT, LocInfo));
+
+  if (!ArgFlags.isInConsecutiveRegsLast())
+    return true; // Not the last member yet.
+
+  unsigned RegResult = State.AllocateRegBlock(RegList, PendingMembers.size());
+  if (RegResult) { // Zero means no consecutive run of registers was free.
+    for (auto &It : PendingMembers) {
+      It.convertToReg(RegResult);
+      State.addLoc(It);
+      ++RegResult; // Relies on consecutive register numbering.
+    }
+    PendingMembers.clear();
+    return true;
+  }
+
+  // No block was free: mark all regs in the class as unavailable.
+  for (auto Reg : RegList)
+    State.AllocateReg(Reg);
+
+  const AArch64Subtarget &Subtarget = static_cast<const AArch64Subtarget &>(
+      State.getMachineFunction().getSubtarget());
+  unsigned SlotAlign = Subtarget.isTargetDarwin() ? 1 : 8;
+
+  return finishStackBlock(PendingMembers, LocVT, ArgFlags, State, SlotAlign);
+}
+
+}
+
+#endif
diff --git a/lib/Target/AArch64/AArch64CallingConvention.td b/lib/Target/AArch64/AArch64CallingConvention.td

index 9e707e4083c1417bcc9385f7ec2324db3e8de399..1a8040275ca8c91e7d2d2fde50d8178708f1b471 100644 (file)
--- a/lib/Target/AArch64/AArch64CallingConvention.td
+++ b/lib/Target/AArch64/AArch64CallingConvention.td
@@ -40,6 +40,8 @@ def CC_AArch64_AAPCS : CallingConv<[
    // slot is 64-bit.
    CCIfByVal<CCPassByVal<8, 8>>,
  
+  CCIfConsecutiveRegs<CCCustom<"CC_AArch64_Custom_Block">>,
+
    // Handle i1, i8, i16, i32, i64, f32, f64 and v2f64 by passing in registers,
    // up to eight each of GPR and FPR.
    CCIfType<[i1, i8, i16], CCPromoteToType<i32>>,
@@ -119,6 +121,8 @@ def CC_AArch64_DarwinPCS : CallingConv<[
    // slot is 64-bit.
    CCIfByVal<CCPassByVal<8, 8>>,
  
+  CCIfConsecutiveRegs<CCCustom<"CC_AArch64_Custom_Block">>,
+
    // Handle i1, i8, i16, i32, i64, f32, f64 and v2f64 by passing in registers,
    // up to eight each of GPR and FPR.
    CCIfType<[i1, i8, i16], CCPromoteToType<i32>>,
@@ -159,6 +163,8 @@ def CC_AArch64_DarwinPCS_VarArg : CallingConv<[
    CCIfType<[v2f32], CCBitConvertToType<v2i32>>,
    CCIfType<[v2f64, v4f32, f128], CCBitConvertToType<v2i64>>,
  
+  CCIfConsecutiveRegs<CCCustom<"CC_AArch64_Custom_Stack_Block">>,
+
    // Handle all scalar types as either i64 or f64.
    CCIfType<[i8, i16, i32], CCPromoteToType<i64>>,
    CCIfType<[f16, f32],     CCPromoteToType<f64>>,
diff --git a/lib/Target/AArch64/AArch64FastISel.cpp b/lib/Target/AArch64/AArch64FastISel.cpp

index fb0326bf0bf4f19955e50b9134247ea852fcdcce..d942ace427bd17d050b6fa947a151f6ac899f9be 100644 (file)
--- a/lib/Target/AArch64/AArch64FastISel.cpp
+++ b/lib/Target/AArch64/AArch64FastISel.cpp
@@ -14,6 +14,7 @@
  //===----------------------------------------------------------------------===//
  
  #include "AArch64.h"
+#include "AArch64CallingConvention.h"
  #include "AArch64Subtarget.h"
  #include "AArch64TargetMachine.h"
  #include "MCTargetDesc/AArch64AddressingModes.h"
diff --git a/lib/Target/AArch64/AArch64ISelLowering.cpp b/lib/Target/AArch64/AArch64ISelLowering.cpp

index 70cf7599a395d1cc737e9d7544c5adab4dc8124f..6da468ed6b14d650bb6e3d88eafeaa889dfbed2a 100644 (file)
--- a/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -12,6 +12,7 @@
  //===----------------------------------------------------------------------===//
  
  #include "AArch64ISelLowering.h"
+#include "AArch64CallingConvention.h"
  #include "AArch64MachineFunctionInfo.h"
  #include "AArch64PerfectShuffle.h"
  #include "AArch64Subtarget.h"
@@ -8842,3 +8843,8 @@ Value *AArch64TargetLowering::emitStoreConditional(IRBuilder<> &Builder,
                  Val, Stxr->getFunctionType()->getParamType(0)),
        Addr);
  }
+
+bool AArch64TargetLowering::functionArgumentNeedsConsecutiveRegisters(
+    Type *Ty, CallingConv::ID CallConv, bool isVarArg) const {
+  return Ty->isArrayTy(); // CallConv and isVarArg do not affect the answer.
+}
diff --git a/lib/Target/AArch64/AArch64ISelLowering.h b/lib/Target/AArch64/AArch64ISelLowering.h

index c76f6a865dce9027965614f6be44a3f2e55817b6..458489bd6c6516e798fda0fe65d27e60a5377b6e 100644 (file)
--- a/lib/Target/AArch64/AArch64ISelLowering.h
+++ b/lib/Target/AArch64/AArch64ISelLowering.h
@@ -473,6 +473,10 @@ private:
  
    void ReplaceNodeResults(SDNode *N, SmallVectorImpl<SDValue> &Results,
                            SelectionDAG &DAG) const override;
+
+  bool functionArgumentNeedsConsecutiveRegisters(Type *Ty,
+                                                 CallingConv::ID CallConv,
+                                                 bool isVarArg) const;
  };
  
  namespace AArch64 {
diff --git a/lib/Target/ARM/ARMCallingConv.h b/lib/Target/ARM/ARMCallingConv.h

index bd07236d0ead29fdfc521c7a05a6058efd77aa08..4567a2a1afd8b495c783398f10a9823f87be9c1a 100644 (file)
--- a/lib/Target/ARM/ARMCallingConv.h
+++ b/lib/Target/ARM/ARMCallingConv.h
@@ -194,20 +194,16 @@ static bool CC_ARM_AAPCS_Custom_HA(unsigned &ValNo, MVT &ValVT, MVT &LocVT,
  
      // Try to allocate a contiguous block of registers, each of the correct
      // size to hold one member.
-    const uint16_t *RegList;
-    unsigned NumRegs;
+    ArrayRef<const uint16_t> RegList;
      switch (LocVT.SimpleTy) {
      case MVT::f32:
        RegList = SRegList;
-      NumRegs = 16;
        break;
      case MVT::f64:
        RegList = DRegList;
-      NumRegs = 8;
        break;
      case MVT::v2f64:
        RegList = QRegList;
-      NumRegs = 4;
        break;
      default:
        llvm_unreachable("Unexpected member type for HA");
@@ -215,7 +211,7 @@ static bool CC_ARM_AAPCS_Custom_HA(unsigned &ValNo, MVT &ValVT, MVT &LocVT,
      }
  
      unsigned RegResult =
-        State.AllocateRegBlock(RegList, NumRegs, PendingHAMembers.size());
+        State.AllocateRegBlock(RegList, PendingHAMembers.size());
  
      if (RegResult) {
        for (SmallVectorImpl<CCValAssign>::iterator It = PendingHAMembers.begin();
diff --git a/test/CodeGen/AArch64/argument-blocks.ll b/test/CodeGen/AArch64/argument-blocks.ll

new file mode 100644 (file)

index 0000000..cc65541
--- /dev/null
+++ b/test/CodeGen/AArch64/argument-blocks.ll
@@ -0,0 +1,92 @@
+; RUN: llc -mtriple=aarch64-apple-ios7.0 -o - %s | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-DARWINPCS
+; RUN: llc -mtriple=aarch64-linux-gnu -o - %s | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-AAPCS
+
+declare void @callee(...)
+
+define float @test_hfa_regs(float, [2 x float] %in) {
+; CHECK-LABEL: test_hfa_regs:
+; CHECK: fadd s0, s1, s2
+
+  %lhs = extractvalue [2 x float] %in, 0
+  %rhs = extractvalue [2 x float] %in, 1
+  %sum = fadd float %lhs, %rhs
+  ret float %sum
+}
+
+; Check that the array gets allocated to a contiguous block on the stack (rather
+; than the default of 2 8-byte slots).
+define float @test_hfa_block([7 x float], [2 x float] %in) {
+; CHECK-LABEL: test_hfa_block:
+; CHECK: ldp [[LHS:s[0-9]+]], [[RHS:s[0-9]+]], [sp]
+; CHECK: fadd s0, [[LHS]], [[RHS]]
+
+  %lhs = extractvalue [2 x float] %in, 0
+  %rhs = extractvalue [2 x float] %in, 1
+  %sum = fadd float %lhs, %rhs
+  ret float %sum
+}
+
+; Check that an HFA prevents backfilling of VFP registers (i.e. %rhs must go on
+; the stack rather than in s7).
+define float @test_hfa_block_consume([7 x float], [2 x float] %in, float %rhs) {
+; CHECK-LABEL: test_hfa_block_consume:
+; CHECK-DAG: ldr [[LHS:s[0-9]+]], [sp]
+; CHECK-DAG: ldr [[RHS:s[0-9]+]], [sp, #8]
+; CHECK: fadd s0, [[LHS]], [[RHS]]
+
+  %lhs = extractvalue [2 x float] %in, 0
+  %sum = fadd float %lhs, %rhs
+  ret float %sum
+}
+
+define float @test_hfa_stackalign([8 x float], [1 x float], [2 x float] %in) {
+; CHECK-LABEL: test_hfa_stackalign:
+; CHECK-AAPCS: ldp [[LHS:s[0-9]+]], [[RHS:s[0-9]+]], [sp, #8]
+; CHECK-DARWINPCS: ldp [[LHS:s[0-9]+]], [[RHS:s[0-9]+]], [sp, #4]
+; CHECK: fadd s0, [[LHS]], [[RHS]]
+  %lhs = extractvalue [2 x float] %in, 0
+  %rhs = extractvalue [2 x float] %in, 1
+  %sum = fadd float %lhs, %rhs
+  ret float %sum
+}
+
+; An HFA that ends up on the stack should not have any effect on where
+; integer-based arguments go.
+define i64 @test_hfa_ignores_gprs([7 x float], [2 x float] %in, i64, i64 %res) {
+; CHECK-LABEL: test_hfa_ignores_gprs:
+; CHECK: mov x0, x1
+  ret i64 %res
+}
+
+; [2 x float] should not be promoted to double by the Darwin varargs handling,
+; but should go in an 8-byte aligned slot.
+define void @test_varargs_stackalign() {
+; CHECK-LABEL: test_varargs_stackalign:
+; CHECK-DARWINPCS: stp {{w[0-9]+}}, {{w[0-9]+}}, [sp, #16]
+
+  call void(...)* @callee([3 x float] undef, [2 x float] [float 1.0, float 2.0])
+  ret void
+}
+
+define i64 @test_smallstruct_block([7 x i64], [2 x i64] %in) {
+; CHECK-LABEL: test_smallstruct_block:
+; CHECK: ldp [[LHS:x[0-9]+]], [[RHS:x[0-9]+]], [sp]
+; CHECK: add x0, [[LHS]], [[RHS]]
+  %lhs = extractvalue [2 x i64] %in, 0
+  %rhs = extractvalue [2 x i64] %in, 1
+  %sum = add i64 %lhs, %rhs
+  ret i64 %sum
+}
+
+; Check that a small struct prevents backfilling of registers (i.e. %rhs
+; must go on the stack rather than in x7).
+define i64 @test_smallstruct_block_consume([7 x i64], [2 x i64] %in, i64 %rhs) {
+; CHECK-LABEL: test_smallstruct_block_consume:
+; CHECK-DAG: ldr [[LHS:x[0-9]+]], [sp]
+; CHECK-DAG: ldr [[RHS:x[0-9]+]], [sp, #16]
+; CHECK: add x0, [[LHS]], [[RHS]]
+
+  %lhs = extractvalue [2 x i64] %in, 0
+  %sum = add i64 %lhs, %rhs
+  ret i64 %sum
+}
diff --git a/test/CodeGen/AArch64/arm64-variadic-aapcs.ll b/test/CodeGen/AArch64/arm64-variadic-aapcs.ll

index 36a7bfd92520a50f6ab676f865cec4b474dbfb72..8e41304ffad47d6fd92700cf4c4a01194072e0b5 100644 (file)
--- a/test/CodeGen/AArch64/arm64-variadic-aapcs.ll
+++ b/test/CodeGen/AArch64/arm64-variadic-aapcs.ll
@@ -96,7 +96,7 @@ define void @test_nospare([8 x i64], [8 x float], ...) {
  
  ; If there are non-variadic arguments on the stack (here two i64s) then the
  ; __stack field should point just past them.
-define void @test_offsetstack([10 x i64], [3 x float], ...) {
+define void @test_offsetstack([8 x i64], [2 x i64], [3 x float], ...) {
  ; CHECK-LABEL: test_offsetstack:
  ; CHECK: sub sp, sp, #80
  ; CHECK: add [[STACK_TOP:x[0-9]+]], sp, #96
author	Tim Northover <tnorthover@apple.com>
	Thu, 27 Nov 2014 21:02:42 +0000 (21:02 +0000)
committer	Tim Northover <tnorthover@apple.com>
	Thu, 27 Nov 2014 21:02:42 +0000 (21:02 +0000)
include/llvm/CodeGen/CallingConvLower.h		patch \| blob \| history
include/llvm/IR/DataLayout.h		patch \| blob \| history
lib/Target/AArch64/AArch64CallingConvention.h	[new file with mode: 0644]	patch \| blob
lib/Target/AArch64/AArch64CallingConvention.td		patch \| blob \| history
lib/Target/AArch64/AArch64FastISel.cpp		patch \| blob \| history
lib/Target/AArch64/AArch64ISelLowering.cpp		patch \| blob \| history
lib/Target/AArch64/AArch64ISelLowering.h		patch \| blob \| history
lib/Target/ARM/ARMCallingConv.h		patch \| blob \| history
test/CodeGen/AArch64/argument-blocks.ll	[new file with mode: 0644]	patch \| blob
test/CodeGen/AArch64/arm64-variadic-aapcs.ll		patch \| blob \| history