From 438669ca81f193ce2c75a2337850dc0400dc3411 Mon Sep 17 00:00:00 2001 From: "Arnaud A. de Grandmaison" Date: Wed, 10 Sep 2014 14:06:10 +0000 Subject: [PATCH] [AArch64] Add experimental PBQP support This adds target specific support for using the PBQP register allocator on the AArch64, for the A57 cpu. By default, the PBQP allocator is not used, unless explicitely required on the command line with "-aarch64-pbqp". git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@217504 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Target/AArch64/AArch64.h | 1 + lib/Target/AArch64/AArch64PBQPRegAlloc.cpp | 413 ++++++++++++++++++++ lib/Target/AArch64/AArch64TargetMachine.cpp | 16 +- lib/Target/AArch64/AArch64TargetMachine.h | 6 + lib/Target/AArch64/CMakeLists.txt | 1 + test/CodeGen/AArch64/PBQP.ll | 14 + 6 files changed, 449 insertions(+), 2 deletions(-) create mode 100644 lib/Target/AArch64/AArch64PBQPRegAlloc.cpp create mode 100644 test/CodeGen/AArch64/PBQP.ll diff --git a/lib/Target/AArch64/AArch64.h b/lib/Target/AArch64/AArch64.h index 3e85bacbda2..425a9028488 100644 --- a/lib/Target/AArch64/AArch64.h +++ b/lib/Target/AArch64/AArch64.h @@ -39,6 +39,7 @@ ModulePass *createAArch64PromoteConstantPass(); FunctionPass *createAArch64ConditionOptimizerPass(); FunctionPass *createAArch64AddressTypePromotionPass(); FunctionPass *createAArch64A57FPLoadBalancing(); +FunctionPass *createAArch64A57PBQPRegAlloc(); /// \brief Creates an ARM-specific Target Transformation Info pass. ImmutablePass * createAArch64TargetTransformInfoPass(const AArch64TargetMachine *TM); diff --git a/lib/Target/AArch64/AArch64PBQPRegAlloc.cpp b/lib/Target/AArch64/AArch64PBQPRegAlloc.cpp new file mode 100644 index 00000000000..c4787f4afd6 --- /dev/null +++ b/lib/Target/AArch64/AArch64PBQPRegAlloc.cpp @@ -0,0 +1,413 @@ +//===-- AArch64PBQPRegAlloc.cpp - AArch64 specific PBQP constraints -------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// This file contains the AArch64 / Cortex-A57 specific register allocation +// constraints for use by the PBQP register allocator. +// +// It is essentially a transcription of what is contained in +// AArch64A57FPLoadBalancing, which tries to use a balanced +// mix of odd and even D-registers when performing a critical sequence of +// independent, non-quadword FP/ASIMD floating-point multiply-accumulates. +//===----------------------------------------------------------------------===// + +#define DEBUG_TYPE "aarch64-pbqp" + +#include "AArch64.h" +#include "AArch64RegisterInfo.h" + +#include "llvm/ADT/SetVector.h" +#include "llvm/CodeGen/LiveIntervalAnalysis.h" +#include "llvm/CodeGen/MachineBasicBlock.h" +#include "llvm/CodeGen/MachineFunction.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/CodeGen/RegAllocPBQP.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/ErrorHandling.h" +#include "llvm/Support/raw_ostream.h" + +#define PBQP_BUILDER PBQPBuilderWithCoalescing +//#define PBQP_BUILDER PBQPBuilder + +using namespace llvm; + +namespace { + +bool isFPReg(unsigned reg) { + return AArch64::FPR32RegClass.contains(reg) || + AArch64::FPR64RegClass.contains(reg) || + AArch64::FPR128RegClass.contains(reg); +}; + +bool isOdd(unsigned reg) { + switch (reg) { + default: + llvm_unreachable("Register is not from the expected class !"); + case AArch64::S1: + case AArch64::S3: + case AArch64::S5: + case AArch64::S7: + case AArch64::S9: + case AArch64::S11: + case AArch64::S13: + case AArch64::S15: + case AArch64::S17: + case AArch64::S19: + case AArch64::S21: + case AArch64::S23: + case AArch64::S25: + case AArch64::S27: + case AArch64::S29: + case AArch64::S31: + case AArch64::D1: + case AArch64::D3: + case AArch64::D5: + case AArch64::D7: + case AArch64::D9: + case AArch64::D11: + case AArch64::D13: + case AArch64::D15: + case AArch64::D17: + case AArch64::D19: + case AArch64::D21: + case AArch64::D23: + case AArch64::D25: + case AArch64::D27: + case AArch64::D29: + case AArch64::D31: + case AArch64::Q1: + case AArch64::Q3: + case AArch64::Q5: + case AArch64::Q7: + case AArch64::Q9: + case AArch64::Q11: + case AArch64::Q13: + case AArch64::Q15: + case AArch64::Q17: + case AArch64::Q19: + case AArch64::Q21: + case AArch64::Q23: + case AArch64::Q25: + case AArch64::Q27: + case AArch64::Q29: + case AArch64::Q31: + return true; + case AArch64::S0: + case AArch64::S2: + case AArch64::S4: + case AArch64::S6: + case AArch64::S8: + case AArch64::S10: + case AArch64::S12: + case AArch64::S14: + case AArch64::S16: + case AArch64::S18: + case AArch64::S20: + case AArch64::S22: + case AArch64::S24: + case AArch64::S26: + case AArch64::S28: + case AArch64::S30: + case AArch64::D0: + case AArch64::D2: + case AArch64::D4: + case AArch64::D6: + case AArch64::D8: + case AArch64::D10: + case AArch64::D12: + case AArch64::D14: + case AArch64::D16: + case AArch64::D18: + case AArch64::D20: + case AArch64::D22: + case AArch64::D24: + case AArch64::D26: + case AArch64::D28: + case AArch64::D30: + case AArch64::Q0: + case AArch64::Q2: + case AArch64::Q4: + case AArch64::Q6: + case AArch64::Q8: + case AArch64::Q10: + case AArch64::Q12: + case AArch64::Q14: + case AArch64::Q16: + case AArch64::Q18: + case AArch64::Q20: + case AArch64::Q22: + case AArch64::Q24: + case AArch64::Q26: + case AArch64::Q28: + case AArch64::Q30: + return false; + + } +} + +bool haveSameParity(unsigned reg1, unsigned reg2) { + assert(isFPReg(reg1) && "Expecting an FP register for reg1"); + assert(isFPReg(reg2) && "Expecting an FP register for reg2"); + + return isOdd(reg1) == isOdd(reg2); +} + +class A57PBQPBuilder : public PBQP_BUILDER { +public: + A57PBQPBuilder() : PBQP_BUILDER(), TRI(nullptr), LIs(nullptr), Chains() {} + + // Build a PBQP instance to represent the register allocation problem for + // the given MachineFunction. + std::unique_ptr + build(MachineFunction *MF, const LiveIntervals *LI, + const MachineBlockFrequencyInfo *blockInfo, + const RegSet &VRegs) override; + +private: + const AArch64RegisterInfo *TRI; + const LiveIntervals *LIs; + SmallSetVector Chains; + + // Return true if reg is a physical register + bool isPhysicalReg(unsigned reg) const { + return TRI->isPhysicalRegister(reg); + } + + // Add the accumulator chaining constraint, inside the chain, i.e. so that + // parity(Rd) == parity(Ra). + // \return true if a constraint was added + bool addIntraChainConstraint(PBQPRAProblem *p, unsigned Rd, unsigned Ra); + + // Add constraints between existing chains + void addInterChainConstraint(PBQPRAProblem *p, unsigned Rd, unsigned Ra); +}; +} // Anonymous namespace + +bool A57PBQPBuilder::addIntraChainConstraint(PBQPRAProblem *p, unsigned Rd, + unsigned Ra) { + if (Rd == Ra) + return false; + + if (isPhysicalReg(Rd) || isPhysicalReg(Ra)) { + dbgs() << "Rd is a physical reg:" << isPhysicalReg(Rd) << '\n'; + dbgs() << "Ra is a physical reg:" << isPhysicalReg(Ra) << '\n'; + return false; + } + + const PBQPRAProblem::AllowedSet *vRdAllowed = &p->getAllowedSet(Rd); + const PBQPRAProblem::AllowedSet *vRaAllowed = &p->getAllowedSet(Ra); + + PBQPRAGraph &g = p->getGraph(); + PBQPRAGraph::NodeId node1 = p->getNodeForVReg(Rd); + PBQPRAGraph::NodeId node2 = p->getNodeForVReg(Ra); + PBQPRAGraph::EdgeId edge = g.findEdge(node1, node2); + + // The edge does not exist. Create one with the appropriate interference + // costs. + if (edge == g.invalidEdgeId()) { + const LiveInterval &ld = LIs->getInterval(Rd); + const LiveInterval &la = LIs->getInterval(Ra); + bool livesOverlap = ld.overlaps(la); + + PBQP::Matrix costs(vRdAllowed->size() + 1, vRaAllowed->size() + 1, 0); + for (unsigned i = 0; i != vRdAllowed->size(); ++i) { + unsigned pRd = (*vRdAllowed)[i]; + for (unsigned j = 0; j != vRaAllowed->size(); ++j) { + unsigned pRa = (*vRaAllowed)[j]; + if (livesOverlap && TRI->regsOverlap(pRd, pRa)) + costs[i + 1][j + 1] = std::numeric_limits::infinity(); + else + costs[i + 1][j + 1] = haveSameParity(pRd, pRa) ? 0.0 : 1.0; + } + } + g.addEdge(node1, node2, std::move(costs)); + return true; + } + + if (g.getEdgeNode1Id(edge) == node2) { + std::swap(node1, node2); + std::swap(vRdAllowed, vRaAllowed); + } + + // Enforce minCost(sameParity(RaClass)) > maxCost(otherParity(RdClass)) + PBQP::Matrix costs(g.getEdgeCosts(edge)); + for (unsigned i = 0; i != vRdAllowed->size(); ++i) { + unsigned pRd = (*vRdAllowed)[i]; + + // Get the maximum cost (excluding unallocatable reg) for same parity + // registers + PBQP::PBQPNum sameParityMax = std::numeric_limits::min(); + for (unsigned j = 0; j != vRaAllowed->size(); ++j) { + unsigned pRa = (*vRaAllowed)[j]; + if (haveSameParity(pRd, pRa)) + if (costs[i + 1][j + 1] != + std::numeric_limits::infinity() && + costs[i + 1][j + 1] > sameParityMax) + sameParityMax = costs[i + 1][j + 1]; + } + + // Ensure all registers with a different parity have a higher cost + // than sameParityMax + for (unsigned j = 0; j != vRaAllowed->size(); ++j) { + unsigned pRa = (*vRaAllowed)[j]; + if (!haveSameParity(pRd, pRa)) + if (sameParityMax > costs[i + 1][j + 1]) + costs[i + 1][j + 1] = sameParityMax + 1.0; + } + } + g.setEdgeCosts(edge, costs); + + return true; +} + +void +A57PBQPBuilder::addInterChainConstraint(PBQPRAProblem *p, unsigned Rd, + unsigned Ra) { + // Do some Chain management + if (Chains.count(Ra)) { + if (Rd != Ra) { + DEBUG(dbgs() << "Moving acc chain from " << PrintReg(Ra, TRI) << " to " + << PrintReg(Rd, TRI) << '\n';); + Chains.remove(Ra); + Chains.insert(Rd); + } + } else { + DEBUG(dbgs() << "Creating new acc chain for " << PrintReg(Rd, TRI) + << '\n';); + Chains.insert(Rd); + } + + const LiveInterval &ld = LIs->getInterval(Rd); + for (auto r : Chains) { + // Skip self + if (r == Rd) + continue; + + const LiveInterval &lr = LIs->getInterval(r); + if (ld.overlaps(lr)) { + const PBQPRAProblem::AllowedSet *vRdAllowed = &p->getAllowedSet(Rd); + const PBQPRAProblem::AllowedSet *vRrAllowed = &p->getAllowedSet(r); + + PBQPRAGraph &g = p->getGraph(); + PBQPRAGraph::NodeId node1 = p->getNodeForVReg(Rd); + PBQPRAGraph::NodeId node2 = p->getNodeForVReg(r); + PBQPRAGraph::EdgeId edge = g.findEdge(node1, node2); + assert(edge != g.invalidEdgeId() && + "PBQP error ! The edge should exist !"); + + DEBUG(dbgs() << "Refining constraint !\n";); + + if (g.getEdgeNode1Id(edge) == node2) { + std::swap(node1, node2); + std::swap(vRdAllowed, vRrAllowed); + } + + // Enforce that cost is higher with all other Chains of the same parity + PBQP::Matrix costs(g.getEdgeCosts(edge)); + for (unsigned i = 0; i != vRdAllowed->size(); ++i) { + unsigned pRd = (*vRdAllowed)[i]; + + // Get the maximum cost (excluding unallocatable reg) for all other + // parity registers + PBQP::PBQPNum sameParityMax = std::numeric_limits::min(); + for (unsigned j = 0; j != vRrAllowed->size(); ++j) { + unsigned pRa = (*vRrAllowed)[j]; + if (!haveSameParity(pRd, pRa)) + if (costs[i + 1][j + 1] != + std::numeric_limits::infinity() && + costs[i + 1][j + 1] > sameParityMax) + sameParityMax = costs[i + 1][j + 1]; + } + + // Ensure all registers with same parity have a higher cost + // than sameParityMax + for (unsigned j = 0; j != vRrAllowed->size(); ++j) { + unsigned pRa = (*vRrAllowed)[j]; + if (haveSameParity(pRd, pRa)) + if (sameParityMax > costs[i + 1][j + 1]) + costs[i + 1][j + 1] = sameParityMax + 1.0; + } + } + g.setEdgeCosts(edge, costs); + } + } +} + +std::unique_ptr +A57PBQPBuilder::build(MachineFunction *MF, const LiveIntervals *LI, + const MachineBlockFrequencyInfo *blockInfo, + const RegSet &VRegs) { + std::unique_ptr p = + PBQP_BUILDER::build(MF, LI, blockInfo, VRegs); + + TRI = static_cast( + MF->getTarget().getSubtargetImpl()->getRegisterInfo()); + LIs = LI; + + DEBUG(MF->dump();); + + for (MachineFunction::const_iterator mbbItr = MF->begin(), mbbEnd = MF->end(); + mbbItr != mbbEnd; ++mbbItr) { + const MachineBasicBlock *MBB = &*mbbItr; + Chains.clear(); // FIXME: really needed ? Could not work at MF level ? + + for (MachineBasicBlock::const_iterator miItr = MBB->begin(), + miEnd = MBB->end(); + miItr != miEnd; ++miItr) { + const MachineInstr *MI = &*miItr; + switch (MI->getOpcode()) { + case AArch64::FMSUBSrrr: + case AArch64::FMADDSrrr: + case AArch64::FNMSUBSrrr: + case AArch64::FNMADDSrrr: + case AArch64::FMSUBDrrr: + case AArch64::FMADDDrrr: + case AArch64::FNMSUBDrrr: + case AArch64::FNMADDDrrr: { + unsigned Rd = MI->getOperand(0).getReg(); + unsigned Ra = MI->getOperand(3).getReg(); + + if (addIntraChainConstraint(p.get(), Rd, Ra)) + addInterChainConstraint(p.get(), Rd, Ra); + break; + } + + case AArch64::FMLAv2f32: + case AArch64::FMLSv2f32: { + unsigned Rd = MI->getOperand(0).getReg(); + addInterChainConstraint(p.get(), Rd, Rd); + break; + } + + default: + // Forget Chains which have been killed + for (auto r : Chains) { + SmallVector toDel; + if (MI->killsRegister(r)) { + DEBUG(dbgs() << "Killing chain " << PrintReg(r, TRI) << " at "; + MI->print(dbgs());); + toDel.push_back(r); + } + + while (!toDel.empty()) { + Chains.remove(toDel.back()); + toDel.pop_back(); + } + } + } + } + } + + return p; +} + +// Factory function used by AArch64TargetMachine to add the pass to the +// passmanager. +FunctionPass *llvm::createAArch64A57PBQPRegAlloc() { + std::unique_ptr builder = llvm::make_unique(); + return createPBQPRegisterAllocator(std::move(builder), nullptr); +} diff --git a/lib/Target/AArch64/AArch64TargetMachine.cpp b/lib/Target/AArch64/AArch64TargetMachine.cpp index e867524226b..1f5978198e4 100644 --- a/lib/Target/AArch64/AArch64TargetMachine.cpp +++ b/lib/Target/AArch64/AArch64TargetMachine.cpp @@ -13,6 +13,7 @@ #include "AArch64.h" #include "AArch64TargetMachine.h" #include "llvm/CodeGen/Passes.h" +#include "llvm/CodeGen/RegAllocRegistry.h" #include "llvm/PassManager.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/TargetRegistry.h" @@ -73,6 +74,10 @@ EnableCondOpt("aarch64-condopt", cl::desc("Enable the condition optimizer pass"), cl::init(true), cl::Hidden); +static cl::opt +EnablePBQP("aarch64-pbqp", cl::Hidden, + cl::desc("Use PBQP register allocator (experimental)"), + cl::init(false)); extern "C" void LLVMInitializeAArch64Target() { // Register the target. @@ -90,8 +95,14 @@ AArch64TargetMachine::AArch64TargetMachine(const Target &T, StringRef TT, CodeGenOpt::Level OL, bool LittleEndian) : LLVMTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL), - Subtarget(TT, CPU, FS, *this, LittleEndian) { + Subtarget(TT, CPU, FS, *this, LittleEndian), + usingPBQP(false) { initAsmInfo(); + + if (EnablePBQP && Subtarget.isCortexA57() && OL != CodeGenOpt::None) { + usingPBQP = true; + RegisterRegAlloc::setDefault(createAArch64A57PBQPRegAlloc); + } } void AArch64leTargetMachine::anchor() { } @@ -216,7 +227,8 @@ bool AArch64PassConfig::addPostRegAlloc() { if (TM->getOptLevel() != CodeGenOpt::None && EnableDeadRegisterElimination) addPass(createAArch64DeadRegisterDefinitions()); if (TM->getOptLevel() != CodeGenOpt::None && - TM->getSubtarget().isCortexA57()) + TM->getSubtarget().isCortexA57() && + !static_cast(TM)->isPBQPUsed()) // Improve performance for some FP/SIMD code for A57. addPass(createAArch64A57FPLoadBalancing()); return true; diff --git a/lib/Target/AArch64/AArch64TargetMachine.h b/lib/Target/AArch64/AArch64TargetMachine.h index 101d10c29a8..42d7dc57328 100644 --- a/lib/Target/AArch64/AArch64TargetMachine.h +++ b/lib/Target/AArch64/AArch64TargetMachine.h @@ -40,6 +40,12 @@ public: /// \brief Register AArch64 analysis passes with a pass manager. void addAnalysisPasses(PassManagerBase &PM) override; + + /// \brief Query if the PBQP register allocator is being used + bool isPBQPUsed() const { return usingPBQP; } + +private: + bool usingPBQP; }; // AArch64leTargetMachine - AArch64 little endian target machine. diff --git a/lib/Target/AArch64/CMakeLists.txt b/lib/Target/AArch64/CMakeLists.txt index ebf7ae0d347..d68a2de94b6 100644 --- a/lib/Target/AArch64/CMakeLists.txt +++ b/lib/Target/AArch64/CMakeLists.txt @@ -34,6 +34,7 @@ add_llvm_target(AArch64CodeGen AArch64LoadStoreOptimizer.cpp AArch64MCInstLower.cpp AArch64PromoteConstant.cpp + AArch64PBQPRegAlloc.cpp AArch64RegisterInfo.cpp AArch64SelectionDAGInfo.cpp AArch64StorePairSuppress.cpp diff --git a/test/CodeGen/AArch64/PBQP.ll b/test/CodeGen/AArch64/PBQP.ll new file mode 100644 index 00000000000..491998e1e30 --- /dev/null +++ b/test/CodeGen/AArch64/PBQP.ll @@ -0,0 +1,14 @@ +; RUN: llc -mtriple=aarch64-linux-gnu -mcpu=cortex-a57 -aarch64-pbqp -o - %s | FileCheck %s + +define i32 @foo(i32 %a) { +; CHECK-LABEL: foo: +; CHECK: bl bar +; CHECK-NEXT: bl baz + %call = call i32 @bar(i32 %a) + %call1 = call i32 @baz(i32 %call) + ret i32 %call1 +} + +declare i32 @bar(i32) +declare i32 @baz(i32) + -- 2.34.1