[AArch64] Add experimental PBQP support
authorArnaud A. de Grandmaison <arnaud.degrandmaison@arm.com>
Wed, 10 Sep 2014 14:06:10 +0000 (14:06 +0000)
committerArnaud A. de Grandmaison <arnaud.degrandmaison@arm.com>
Wed, 10 Sep 2014 14:06:10 +0000 (14:06 +0000)
This adds target specific support for using the PBQP register allocator on the
AArch64, for the A57 cpu.

By default, the PBQP allocator is not used, unless explicitely required
on the command line with "-aarch64-pbqp".

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@217504 91177308-0d34-0410-b5e6-96231b3b80d8

lib/Target/AArch64/AArch64.h
lib/Target/AArch64/AArch64PBQPRegAlloc.cpp [new file with mode: 0644]
lib/Target/AArch64/AArch64TargetMachine.cpp
lib/Target/AArch64/AArch64TargetMachine.h
lib/Target/AArch64/CMakeLists.txt
test/CodeGen/AArch64/PBQP.ll [new file with mode: 0644]

index 3e85bacbda27efa5c7c7773e4d84b7778c8ca780..425a9028488b275cd04df6a154f26371cf16c3cb 100644 (file)
@@ -39,6 +39,7 @@ ModulePass *createAArch64PromoteConstantPass();
 FunctionPass *createAArch64ConditionOptimizerPass();
 FunctionPass *createAArch64AddressTypePromotionPass();
 FunctionPass *createAArch64A57FPLoadBalancing();
+FunctionPass *createAArch64A57PBQPRegAlloc();
 /// \brief Creates an ARM-specific Target Transformation Info pass.
 ImmutablePass *
 createAArch64TargetTransformInfoPass(const AArch64TargetMachine *TM);
diff --git a/lib/Target/AArch64/AArch64PBQPRegAlloc.cpp b/lib/Target/AArch64/AArch64PBQPRegAlloc.cpp
new file mode 100644 (file)
index 0000000..c4787f4
--- /dev/null
@@ -0,0 +1,413 @@
+//===-- AArch64PBQPRegAlloc.cpp - AArch64 specific PBQP constraints -------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+// This file contains the AArch64 / Cortex-A57 specific register allocation
+// constraints for use by the PBQP register allocator.
+//
+// It is essentially a transcription of what is contained in
+// AArch64A57FPLoadBalancing, which tries to use a balanced
+// mix of odd and even D-registers when performing a critical sequence of
+// independent, non-quadword FP/ASIMD floating-point multiply-accumulates.
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "aarch64-pbqp"
+
+#include "AArch64.h"
+#include "AArch64RegisterInfo.h"
+
+#include "llvm/ADT/SetVector.h"
+#include "llvm/CodeGen/LiveIntervalAnalysis.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/RegAllocPBQP.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/raw_ostream.h"
+
+#define PBQP_BUILDER PBQPBuilderWithCoalescing
+//#define PBQP_BUILDER PBQPBuilder
+
+using namespace llvm;
+
+namespace {
+
+bool isFPReg(unsigned reg) {
+  return AArch64::FPR32RegClass.contains(reg) ||
+         AArch64::FPR64RegClass.contains(reg) ||
+         AArch64::FPR128RegClass.contains(reg);
+};
+
+bool isOdd(unsigned reg) {
+  switch (reg) {
+  default:
+    llvm_unreachable("Register is not from the expected class !");
+  case AArch64::S1:
+  case AArch64::S3:
+  case AArch64::S5:
+  case AArch64::S7:
+  case AArch64::S9:
+  case AArch64::S11:
+  case AArch64::S13:
+  case AArch64::S15:
+  case AArch64::S17:
+  case AArch64::S19:
+  case AArch64::S21:
+  case AArch64::S23:
+  case AArch64::S25:
+  case AArch64::S27:
+  case AArch64::S29:
+  case AArch64::S31:
+  case AArch64::D1:
+  case AArch64::D3:
+  case AArch64::D5:
+  case AArch64::D7:
+  case AArch64::D9:
+  case AArch64::D11:
+  case AArch64::D13:
+  case AArch64::D15:
+  case AArch64::D17:
+  case AArch64::D19:
+  case AArch64::D21:
+  case AArch64::D23:
+  case AArch64::D25:
+  case AArch64::D27:
+  case AArch64::D29:
+  case AArch64::D31:
+  case AArch64::Q1:
+  case AArch64::Q3:
+  case AArch64::Q5:
+  case AArch64::Q7:
+  case AArch64::Q9:
+  case AArch64::Q11:
+  case AArch64::Q13:
+  case AArch64::Q15:
+  case AArch64::Q17:
+  case AArch64::Q19:
+  case AArch64::Q21:
+  case AArch64::Q23:
+  case AArch64::Q25:
+  case AArch64::Q27:
+  case AArch64::Q29:
+  case AArch64::Q31:
+    return true;
+  case AArch64::S0:
+  case AArch64::S2:
+  case AArch64::S4:
+  case AArch64::S6:
+  case AArch64::S8:
+  case AArch64::S10:
+  case AArch64::S12:
+  case AArch64::S14:
+  case AArch64::S16:
+  case AArch64::S18:
+  case AArch64::S20:
+  case AArch64::S22:
+  case AArch64::S24:
+  case AArch64::S26:
+  case AArch64::S28:
+  case AArch64::S30:
+  case AArch64::D0:
+  case AArch64::D2:
+  case AArch64::D4:
+  case AArch64::D6:
+  case AArch64::D8:
+  case AArch64::D10:
+  case AArch64::D12:
+  case AArch64::D14:
+  case AArch64::D16:
+  case AArch64::D18:
+  case AArch64::D20:
+  case AArch64::D22:
+  case AArch64::D24:
+  case AArch64::D26:
+  case AArch64::D28:
+  case AArch64::D30:
+  case AArch64::Q0:
+  case AArch64::Q2:
+  case AArch64::Q4:
+  case AArch64::Q6:
+  case AArch64::Q8:
+  case AArch64::Q10:
+  case AArch64::Q12:
+  case AArch64::Q14:
+  case AArch64::Q16:
+  case AArch64::Q18:
+  case AArch64::Q20:
+  case AArch64::Q22:
+  case AArch64::Q24:
+  case AArch64::Q26:
+  case AArch64::Q28:
+  case AArch64::Q30:
+    return false;
+
+  }
+}
+
+bool haveSameParity(unsigned reg1, unsigned reg2) {
+  assert(isFPReg(reg1) && "Expecting an FP register for reg1");
+  assert(isFPReg(reg2) && "Expecting an FP register for reg2");
+
+  return isOdd(reg1) == isOdd(reg2);
+}
+
+class A57PBQPBuilder : public PBQP_BUILDER {
+public:
+  A57PBQPBuilder() : PBQP_BUILDER(), TRI(nullptr), LIs(nullptr), Chains() {}
+
+  // Build a PBQP instance to represent the register allocation problem for
+  // the given MachineFunction.
+  std::unique_ptr<PBQPRAProblem>
+  build(MachineFunction *MF, const LiveIntervals *LI,
+        const MachineBlockFrequencyInfo *blockInfo,
+        const RegSet &VRegs) override;
+
+private:
+  const AArch64RegisterInfo *TRI;
+  const LiveIntervals *LIs;
+  SmallSetVector<unsigned, 32> Chains;
+
+  // Return true if reg is a physical register
+  bool isPhysicalReg(unsigned reg) const {
+    return TRI->isPhysicalRegister(reg);
+  }
+
+  // Add the accumulator chaining constraint, inside the chain, i.e. so that
+  // parity(Rd) == parity(Ra).
+  // \return true if a constraint was added
+  bool addIntraChainConstraint(PBQPRAProblem *p, unsigned Rd, unsigned Ra);
+
+  // Add constraints between existing chains
+  void addInterChainConstraint(PBQPRAProblem *p, unsigned Rd, unsigned Ra);
+};
+} // Anonymous namespace
+
+bool A57PBQPBuilder::addIntraChainConstraint(PBQPRAProblem *p, unsigned Rd,
+                                             unsigned Ra) {
+  if (Rd == Ra)
+    return false;
+
+  if (isPhysicalReg(Rd) || isPhysicalReg(Ra)) {
+    dbgs() << "Rd is a physical reg:" << isPhysicalReg(Rd) << '\n';
+    dbgs() << "Ra is a physical reg:" << isPhysicalReg(Ra) << '\n';
+    return false;
+  }
+
+  const PBQPRAProblem::AllowedSet *vRdAllowed = &p->getAllowedSet(Rd);
+  const PBQPRAProblem::AllowedSet *vRaAllowed = &p->getAllowedSet(Ra);
+
+  PBQPRAGraph &g = p->getGraph();
+  PBQPRAGraph::NodeId node1 = p->getNodeForVReg(Rd);
+  PBQPRAGraph::NodeId node2 = p->getNodeForVReg(Ra);
+  PBQPRAGraph::EdgeId edge = g.findEdge(node1, node2);
+
+  // The edge does not exist. Create one with the appropriate interference
+  // costs.
+  if (edge == g.invalidEdgeId()) {
+    const LiveInterval &ld = LIs->getInterval(Rd);
+    const LiveInterval &la = LIs->getInterval(Ra);
+    bool livesOverlap = ld.overlaps(la);
+
+    PBQP::Matrix costs(vRdAllowed->size() + 1, vRaAllowed->size() + 1, 0);
+    for (unsigned i = 0; i != vRdAllowed->size(); ++i) {
+      unsigned pRd = (*vRdAllowed)[i];
+      for (unsigned j = 0; j != vRaAllowed->size(); ++j) {
+        unsigned pRa = (*vRaAllowed)[j];
+        if (livesOverlap && TRI->regsOverlap(pRd, pRa))
+          costs[i + 1][j + 1] = std::numeric_limits<PBQP::PBQPNum>::infinity();
+        else
+          costs[i + 1][j + 1] = haveSameParity(pRd, pRa) ? 0.0 : 1.0;
+      }
+    }
+    g.addEdge(node1, node2, std::move(costs));
+    return true;
+  }
+
+  if (g.getEdgeNode1Id(edge) == node2) {
+    std::swap(node1, node2);
+    std::swap(vRdAllowed, vRaAllowed);
+  }
+
+  // Enforce minCost(sameParity(RaClass)) > maxCost(otherParity(RdClass))
+  PBQP::Matrix costs(g.getEdgeCosts(edge));
+  for (unsigned i = 0; i != vRdAllowed->size(); ++i) {
+    unsigned pRd = (*vRdAllowed)[i];
+
+    // Get the maximum cost (excluding unallocatable reg) for same parity
+    // registers
+    PBQP::PBQPNum sameParityMax = std::numeric_limits<PBQP::PBQPNum>::min();
+    for (unsigned j = 0; j != vRaAllowed->size(); ++j) {
+      unsigned pRa = (*vRaAllowed)[j];
+      if (haveSameParity(pRd, pRa))
+        if (costs[i + 1][j + 1] !=
+                std::numeric_limits<PBQP::PBQPNum>::infinity() &&
+            costs[i + 1][j + 1] > sameParityMax)
+          sameParityMax = costs[i + 1][j + 1];
+    }
+
+    // Ensure all registers with a different parity have a higher cost
+    // than sameParityMax
+    for (unsigned j = 0; j != vRaAllowed->size(); ++j) {
+      unsigned pRa = (*vRaAllowed)[j];
+      if (!haveSameParity(pRd, pRa))
+        if (sameParityMax > costs[i + 1][j + 1])
+          costs[i + 1][j + 1] = sameParityMax + 1.0;
+    }
+  }
+  g.setEdgeCosts(edge, costs);
+
+  return true;
+}
+
+void
+A57PBQPBuilder::addInterChainConstraint(PBQPRAProblem *p, unsigned Rd,
+                                        unsigned Ra) {
+  // Do some Chain management
+  if (Chains.count(Ra)) {
+    if (Rd != Ra) {
+      DEBUG(dbgs() << "Moving acc chain from " << PrintReg(Ra, TRI) << " to "
+                   << PrintReg(Rd, TRI) << '\n';);
+      Chains.remove(Ra);
+      Chains.insert(Rd);
+    }
+  } else {
+    DEBUG(dbgs() << "Creating new acc chain for " << PrintReg(Rd, TRI)
+                 << '\n';);
+    Chains.insert(Rd);
+  }
+
+  const LiveInterval &ld = LIs->getInterval(Rd);
+  for (auto r : Chains) {
+    // Skip self
+    if (r == Rd)
+      continue;
+
+    const LiveInterval &lr = LIs->getInterval(r);
+    if (ld.overlaps(lr)) {
+      const PBQPRAProblem::AllowedSet *vRdAllowed = &p->getAllowedSet(Rd);
+      const PBQPRAProblem::AllowedSet *vRrAllowed = &p->getAllowedSet(r);
+
+      PBQPRAGraph &g = p->getGraph();
+      PBQPRAGraph::NodeId node1 = p->getNodeForVReg(Rd);
+      PBQPRAGraph::NodeId node2 = p->getNodeForVReg(r);
+      PBQPRAGraph::EdgeId edge = g.findEdge(node1, node2);
+      assert(edge != g.invalidEdgeId() &&
+             "PBQP error ! The edge should exist !");
+
+      DEBUG(dbgs() << "Refining constraint !\n";);
+
+      if (g.getEdgeNode1Id(edge) == node2) {
+        std::swap(node1, node2);
+        std::swap(vRdAllowed, vRrAllowed);
+      }
+
+      // Enforce that cost is higher with all other Chains of the same parity
+      PBQP::Matrix costs(g.getEdgeCosts(edge));
+      for (unsigned i = 0; i != vRdAllowed->size(); ++i) {
+        unsigned pRd = (*vRdAllowed)[i];
+
+        // Get the maximum cost (excluding unallocatable reg) for all other
+        // parity registers
+        PBQP::PBQPNum sameParityMax = std::numeric_limits<PBQP::PBQPNum>::min();
+        for (unsigned j = 0; j != vRrAllowed->size(); ++j) {
+          unsigned pRa = (*vRrAllowed)[j];
+          if (!haveSameParity(pRd, pRa))
+            if (costs[i + 1][j + 1] !=
+                    std::numeric_limits<PBQP::PBQPNum>::infinity() &&
+                costs[i + 1][j + 1] > sameParityMax)
+              sameParityMax = costs[i + 1][j + 1];
+        }
+
+        // Ensure all registers with same parity have a higher cost
+        // than sameParityMax
+        for (unsigned j = 0; j != vRrAllowed->size(); ++j) {
+          unsigned pRa = (*vRrAllowed)[j];
+          if (haveSameParity(pRd, pRa))
+            if (sameParityMax > costs[i + 1][j + 1])
+              costs[i + 1][j + 1] = sameParityMax + 1.0;
+        }
+      }
+      g.setEdgeCosts(edge, costs);
+    }
+  }
+}
+
+std::unique_ptr<PBQPRAProblem>
+A57PBQPBuilder::build(MachineFunction *MF, const LiveIntervals *LI,
+                      const MachineBlockFrequencyInfo *blockInfo,
+                      const RegSet &VRegs) {
+  std::unique_ptr<PBQPRAProblem> p =
+      PBQP_BUILDER::build(MF, LI, blockInfo, VRegs);
+
+  TRI = static_cast<const AArch64RegisterInfo *>(
+      MF->getTarget().getSubtargetImpl()->getRegisterInfo());
+  LIs = LI;
+
+  DEBUG(MF->dump(););
+
+  for (MachineFunction::const_iterator mbbItr = MF->begin(), mbbEnd = MF->end();
+       mbbItr != mbbEnd; ++mbbItr) {
+    const MachineBasicBlock *MBB = &*mbbItr;
+    Chains.clear(); // FIXME: really needed ? Could not work at MF level ?
+
+    for (MachineBasicBlock::const_iterator miItr = MBB->begin(),
+                                           miEnd = MBB->end();
+         miItr != miEnd; ++miItr) {
+      const MachineInstr *MI = &*miItr;
+      switch (MI->getOpcode()) {
+      case AArch64::FMSUBSrrr:
+      case AArch64::FMADDSrrr:
+      case AArch64::FNMSUBSrrr:
+      case AArch64::FNMADDSrrr:
+      case AArch64::FMSUBDrrr:
+      case AArch64::FMADDDrrr:
+      case AArch64::FNMSUBDrrr:
+      case AArch64::FNMADDDrrr: {
+        unsigned Rd = MI->getOperand(0).getReg();
+        unsigned Ra = MI->getOperand(3).getReg();
+
+        if (addIntraChainConstraint(p.get(), Rd, Ra))
+          addInterChainConstraint(p.get(), Rd, Ra);
+        break;
+      }
+
+      case AArch64::FMLAv2f32:
+      case AArch64::FMLSv2f32: {
+        unsigned Rd = MI->getOperand(0).getReg();
+        addInterChainConstraint(p.get(), Rd, Rd);
+        break;
+      }
+
+      default:
+        // Forget Chains which have been killed
+        for (auto r : Chains) {
+          SmallVector<unsigned, 8> toDel;
+          if (MI->killsRegister(r)) {
+            DEBUG(dbgs() << "Killing chain " << PrintReg(r, TRI) << " at ";
+                  MI->print(dbgs()););
+            toDel.push_back(r);
+          }
+
+          while (!toDel.empty()) {
+            Chains.remove(toDel.back());
+            toDel.pop_back();
+          }
+        }
+      }
+    }
+  }
+
+  return p;
+}
+
+// Factory function used by AArch64TargetMachine to add the pass to the
+// passmanager.
+FunctionPass *llvm::createAArch64A57PBQPRegAlloc() {
+  std::unique_ptr<PBQP_BUILDER> builder = llvm::make_unique<A57PBQPBuilder>();
+  return createPBQPRegisterAllocator(std::move(builder), nullptr);
+}
index e867524226bd85a9a00325dacf5fb7b21d6f9888..1f5978198e4b4e6bc8a89ec38bb91ca2e01e0d15 100644 (file)
@@ -13,6 +13,7 @@
 #include "AArch64.h"
 #include "AArch64TargetMachine.h"
 #include "llvm/CodeGen/Passes.h"
+#include "llvm/CodeGen/RegAllocRegistry.h"
 #include "llvm/PassManager.h"
 #include "llvm/Support/CommandLine.h"
 #include "llvm/Support/TargetRegistry.h"
@@ -73,6 +74,10 @@ EnableCondOpt("aarch64-condopt",
               cl::desc("Enable the condition optimizer pass"),
               cl::init(true), cl::Hidden);
 
+static cl::opt<bool>
+EnablePBQP("aarch64-pbqp", cl::Hidden,
+           cl::desc("Use PBQP register allocator (experimental)"),
+           cl::init(false));
 
 extern "C" void LLVMInitializeAArch64Target() {
   // Register the target.
@@ -90,8 +95,14 @@ AArch64TargetMachine::AArch64TargetMachine(const Target &T, StringRef TT,
                                            CodeGenOpt::Level OL,
                                            bool LittleEndian)
     : LLVMTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL),
-      Subtarget(TT, CPU, FS, *this, LittleEndian) {
+      Subtarget(TT, CPU, FS, *this, LittleEndian),
+      usingPBQP(false) {
   initAsmInfo();
+
+  if (EnablePBQP && Subtarget.isCortexA57() && OL != CodeGenOpt::None) {
+    usingPBQP = true;
+    RegisterRegAlloc::setDefault(createAArch64A57PBQPRegAlloc);
+  }
 }
 
 void AArch64leTargetMachine::anchor() { }
@@ -216,7 +227,8 @@ bool AArch64PassConfig::addPostRegAlloc() {
   if (TM->getOptLevel() != CodeGenOpt::None && EnableDeadRegisterElimination)
     addPass(createAArch64DeadRegisterDefinitions());
   if (TM->getOptLevel() != CodeGenOpt::None &&
-      TM->getSubtarget<AArch64Subtarget>().isCortexA57())
+      TM->getSubtarget<AArch64Subtarget>().isCortexA57() &&
+      !static_cast<const AArch64TargetMachine *>(TM)->isPBQPUsed())
     // Improve performance for some FP/SIMD code for A57.
     addPass(createAArch64A57FPLoadBalancing());
   return true;
index 101d10c29a812cca2102d82080137bb5662f0a7d..42d7dc573284d5bc868c2d4ce839df9393c01e54 100644 (file)
@@ -40,6 +40,12 @@ public:
 
   /// \brief Register AArch64 analysis passes with a pass manager.
   void addAnalysisPasses(PassManagerBase &PM) override;
+
+  /// \brief Query if the PBQP register allocator is being used
+  bool isPBQPUsed() const { return usingPBQP; }
+
+private:
+  bool usingPBQP;
 };
 
 // AArch64leTargetMachine - AArch64 little endian target machine.
index ebf7ae0d347246ab71eee8b6149a559b6911accb..d68a2de94b6f449b5d3625971746430cb89b4a5b 100644 (file)
@@ -34,6 +34,7 @@ add_llvm_target(AArch64CodeGen
   AArch64LoadStoreOptimizer.cpp
   AArch64MCInstLower.cpp
   AArch64PromoteConstant.cpp
+  AArch64PBQPRegAlloc.cpp
   AArch64RegisterInfo.cpp
   AArch64SelectionDAGInfo.cpp
   AArch64StorePairSuppress.cpp
diff --git a/test/CodeGen/AArch64/PBQP.ll b/test/CodeGen/AArch64/PBQP.ll
new file mode 100644 (file)
index 0000000..491998e
--- /dev/null
@@ -0,0 +1,14 @@
+; RUN: llc -mtriple=aarch64-linux-gnu -mcpu=cortex-a57 -aarch64-pbqp -o - %s | FileCheck %s
+
+define i32 @foo(i32 %a) {
+; CHECK-LABEL: foo:
+; CHECK: bl bar
+; CHECK-NEXT: bl baz
+  %call = call i32 @bar(i32 %a)
+  %call1 = call i32 @baz(i32 %call)
+  ret i32 %call1
+}
+
+declare i32 @bar(i32)
+declare i32 @baz(i32)
+